update submodule and patches to 0.8.2

author Thomas Lamprecht <t.lamprecht@proxmox.com>

Sat, 28 Sep 2019 06:12:00 +0000 (08:12 +0200)

committer Thomas Lamprecht <t.lamprecht@proxmox.com>

Mon, 30 Sep 2019 04:59:38 +0000 (06:59 +0200)
author Thomas Lamprecht <t.lamprecht@proxmox.com>
Sat, 28 Sep 2019 06:12:00 +0000 (08:12 +0200)
committer Thomas Lamprecht <t.lamprecht@proxmox.com>
Mon, 30 Sep 2019 04:59:38 +0000 (06:59 +0200)
diff --git a/debian/patches/0001-Check-for-META-and-DCH-consistency-in-autoconf.patch b/debian/patches/0001-Check-for-META-and-DCH-consistency-in-autoconf.patch

index 6a89fe38271eb883db7c1ea5674bb18fe680d154..0fdbe9ddaaf7862eb5aeee0425d75ac4d6ebfc18 100644 (file)
--- a/debian/patches/0001-Check-for-META-and-DCH-consistency-in-autoconf.patch
+++ b/debian/patches/0001-Check-for-META-and-DCH-consistency-in-autoconf.patch
@@ -4,6 +4,7 @@ From: Debian ZFS on Linux maintainers
  Date: Wed, 30 Jan 2019 15:12:04 +0100
  Subject: [PATCH] Check-for-META-and-DCH-consistency-in-autoconf
  
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
  ---
   config/zfs-meta.m4 | 34 +++++++++++++++++++++++++++++-----
   1 file changed, 29 insertions(+), 5 deletions(-)
diff --git a/debian/patches/0002-always-load-ZFS-module-on-boot.patch b/debian/patches/0002-always-load-ZFS-module-on-boot.patch

index 46a24baccf8d826de3bba3777aabd6b35afae73e..2ff27f4fd1d534d0b2abe4f9eca70888073d0894 100644 (file)
--- a/debian/patches/0002-always-load-ZFS-module-on-boot.patch
+++ b/debian/patches/0002-always-load-ZFS-module-on-boot.patch
@@ -13,6 +13,7 @@ not actually used.
  
  Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
  Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
  ---
   etc/modules-load.d/zfs.conf.in | 2 +-
   1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/debian/patches/0003-Fix-the-path-to-the-zed-binary-on-the-systemd-unit.patch b/debian/patches/0003-Fix-the-path-to-the-zed-binary-on-the-systemd-unit.patch

index 4714094846f6602c161634e83d6c36bd1d28e92b..bd794ac3a815f2eb680a52ba104fbaf31a9688d4 100644 (file)
--- a/debian/patches/0003-Fix-the-path-to-the-zed-binary-on-the-systemd-unit.patch
+++ b/debian/patches/0003-Fix-the-path-to-the-zed-binary-on-the-systemd-unit.patch
@@ -6,6 +6,8 @@ Subject: [PATCH] Fix the path to the zed binary on the systemd unit.
  We install zed into /usr/sbin manually meanwhile the upstream default is
  installing it into /sbin. Ubuntu packages also install zed to /usr/sbin, but
  they ship their own zfs-zed unit.
+
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
  ---
   etc/systemd/system/zfs-zed.service.in | 2 +-
   1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/debian/patches/0004-import-with-d-dev-disk-by-id-in-scan-service.patch b/debian/patches/0004-import-with-d-dev-disk-by-id-in-scan-service.patch

new file mode 100644 (file)

index 0000000..e1f3360
--- /dev/null
+++ b/debian/patches/0004-import-with-d-dev-disk-by-id-in-scan-service.patch
@@ -0,0 +1,28 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Fabian=20Gr=C3=BCnbichler?= <f.gruenbichler@proxmox.com>
+Date: Mon, 24 Oct 2016 13:47:06 +0200
+Subject: [PATCH] import with -d /dev/disk/by-id in scan service
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ etc/systemd/system/zfs-import-scan.service.in | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/etc/systemd/system/zfs-import-scan.service.in b/etc/systemd/system/zfs-import-scan.service.in
+index 4aae9f06e..ec5c45e63 100644
+--- a/etc/systemd/system/zfs-import-scan.service.in
++++ b/etc/systemd/system/zfs-import-scan.service.in
+@@ -11,7 +11,7 @@ ConditionPathExists=!@sysconfdir@/zfs/zpool.cache
+ [Service]
+ Type=oneshot
+ RemainAfterExit=yes
+-ExecStart=@sbindir@/zpool import -aN -o cachefile=none
++ExecStart=@sbindir@/zpool import -aN -d /dev/disk/by-id -o cachefile=none
+ 
+ [Install]
+ WantedBy=zfs-import.target
diff --git a/debian/patches/0004-increase-default-zcmd-allocation-to-256K.patch b/debian/patches/0004-increase-default-zcmd-allocation-to-256K.patch

deleted file mode 100644 (file)

index a260280..0000000
--- a/debian/patches/0004-increase-default-zcmd-allocation-to-256K.patch
+++ /dev/null
@@ -1,75 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Colin Ian King <colin.king@canonical.com>
-Date: Tue, 31 Oct 2017 19:12:42 +0800
-Subject: [PATCH] increase-default-zcmd-allocation-to-256K
-
-Increase default zcmd allocation to 256K (LP: #567557)
-
-When creating hundreds of clones (for example using containers with
-LXD) cloning slows down as the number of clones increases over time.
-The reason for this is that the fetching of the clone information
-using a small zcmd buffer requires two ioctl calls, one to determine
-the size and a second to return the data. However, this requires
-gathering the data twice, once to determine the size and again to
-populate the zcmd buffer to return it to userspace.
-
-These are expensive ioctl() calls, so instead, make the default buffer
-size much larger: 256K. This may sound large, but on 64 bit systems
-running ZFS this is not a huge chunk of memory for the speed
-improvement we gains for large sets of clones:
-
-        16K zcmd        256K zcmd
-Clones Time    Clones  Time    Clone   % improvement
-        (secs)         per sec (secs)  per sec
-100    7       14.29   5       20.00   28.57
-200    10      20.00   9       22.22   10.00
-300    19      15.79   18      16.67   5.26
-400    22      18.18   22      18.18   0.00
-500    29      17.24   29      17.24   0.00
-600    39      15.38   39      15.38   0.00
-700    46      15.22   45      15.56   2.17
-800    58      13.79   51      15.69   12.07
-900    74      12.16   61      14.75   17.57
-1000   90      11.11   74      13.51   17.78
-1100   98      11.22   87      12.64   11.22
-1200   102     11.76   95      12.63   6.86
-1300   113     11.50   104     12.50   7.96
-1400   143     9.79    109     12.84   23.78
-1500   145     10.34   132     11.36   8.97
-1600   165     9.70    145     11.03   12.12
-1700   187     9.09    156     10.90   16.58
-1800   210     8.57    166     10.84   20.95
-1900   226     8.41    183     10.38   19.03
-2000   256     7.81    198     10.10   22.66
-2200   311     7.07    238     9.24    23.47
-2400   373     6.43    271     8.86    27.35
-2600   487     5.34    316     8.23    35.11
-3000   619     4.85    426     7.04    31.18
-3400   915     3.72    549     6.19    40.00
-4000   1332    3.00    923     4.33    30.71
-
-As one can see, with > 2000 clones we get 25-40% speed
-improvement.
-
-This patch was originally suggested by Brian Behlendorf
-(see https://github.com/zfsonlinux/zfs/issues/6372), however
-this fix is a more generic fix to cover all zcmd cases.
-
-Signed-off-by: Colin Ian King <colin.king@canonical.com>
----
- lib/libzfs/libzfs_util.c | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c
-index 19bb57ad4..2a069ffe6 100644
---- a/lib/libzfs/libzfs_util.c
-+++ b/lib/libzfs/libzfs_util.c
-@@ -1134,7 +1134,7 @@ int
- zcmd_alloc_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, size_t len)
- {
-       if (len == 0)
--              len = 16 * 1024;
-+              len = 256 * 1024;
-       zc->zc_nvlist_dst_size = len;
-       zc->zc_nvlist_dst =
-           (uint64_t)(uintptr_t)zfs_alloc(hdl, zc->zc_nvlist_dst_size);
diff --git a/debian/patches/0005-Enable-zed-emails.patch b/debian/patches/0005-Enable-zed-emails.patch

new file mode 100644 (file)

index 0000000..ffbd21d
--- /dev/null
+++ b/debian/patches/0005-Enable-zed-emails.patch
@@ -0,0 +1,36 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Richard Laager <rlaager@wiktel.com>
+Date: Wed, 30 Jan 2019 15:12:04 +0100
+Subject: [PATCH] Enable zed emails
+
+The OpenZFS event daemon monitors pools. This patch enables the email sending
+function by default (if zed is installed). This is consistent with the default
+behavior of mdadm.
+
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ cmd/zed/zed.d/zed.rc | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/cmd/zed/zed.d/zed.rc b/cmd/zed/zed.d/zed.rc
+index 0ef706849..50632bfcb 100644
+--- a/cmd/zed/zed.d/zed.rc
++++ b/cmd/zed/zed.d/zed.rc
+@@ -15,7 +15,7 @@
+ # Email will only be sent if ZED_EMAIL_ADDR is defined.
+ # Disabled by default; uncomment to enable.
+ #
+-#ZED_EMAIL_ADDR="root"
++ZED_EMAIL_ADDR="root"
+ 
+ ##
+ # Name or path of executable responsible for sending notifications via email;
+@@ -41,7 +41,7 @@
+ ##
+ # Minimum number of seconds between notifications for a similar event.
+ #
+-#ZED_NOTIFY_INTERVAL_SECS=3600
++ZED_NOTIFY_INTERVAL_SECS=3600
+ 
+ ##
+ # Notification verbosity.
diff --git a/debian/patches/0005-import-with-d-dev-disk-by-id-in-scan-service.patch b/debian/patches/0005-import-with-d-dev-disk-by-id-in-scan-service.patch

deleted file mode 100644 (file)

index 5075b8b..0000000
--- a/debian/patches/0005-import-with-d-dev-disk-by-id-in-scan-service.patch
+++ /dev/null
@@ -1,27 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: =?UTF-8?q?Fabian=20Gr=C3=BCnbichler?= <f.gruenbichler@proxmox.com>
-Date: Mon, 24 Oct 2016 13:47:06 +0200
-Subject: [PATCH] import with -d /dev/disk/by-id in scan service
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
-Signed-off-by: Stoiko Ivanov <s.ivanov@proxmox.com>
----
- etc/systemd/system/zfs-import-scan.service.in | 2 +-
- 1 file changed, 1 insertion(+), 1 deletion(-)
-
-diff --git a/etc/systemd/system/zfs-import-scan.service.in b/etc/systemd/system/zfs-import-scan.service.in
-index 4aae9f06e..ec5c45e63 100644
---- a/etc/systemd/system/zfs-import-scan.service.in
-+++ b/etc/systemd/system/zfs-import-scan.service.in
-@@ -11,7 +11,7 @@ ConditionPathExists=!@sysconfdir@/zfs/zpool.cache
- [Service]
- Type=oneshot
- RemainAfterExit=yes
--ExecStart=@sbindir@/zpool import -aN -o cachefile=none
-+ExecStart=@sbindir@/zpool import -aN -d /dev/disk/by-id -o cachefile=none
- 
- [Install]
- WantedBy=zfs-import.target
diff --git a/debian/patches/0006-Enable-zed-emails.patch b/debian/patches/0006-Enable-zed-emails.patch

deleted file mode 100644 (file)

index ed40ff1..0000000
--- a/debian/patches/0006-Enable-zed-emails.patch
+++ /dev/null
@@ -1,34 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Richard Laager <rlaager@wiktel.com>
-Date: Wed, 30 Jan 2019 15:12:04 +0100
-Subject: [PATCH] Enable zed emails
-
-The OpenZFS event daemon monitors pools. This patch enables the email sending
-function by default (if zed is installed). This is consistent with the default
-behavior of mdadm.
----
- cmd/zed/zed.d/zed.rc | 4 ++--
- 1 file changed, 2 insertions(+), 2 deletions(-)
-
-diff --git a/cmd/zed/zed.d/zed.rc b/cmd/zed/zed.d/zed.rc
-index 0ef706849..50632bfcb 100644
---- a/cmd/zed/zed.d/zed.rc
-+++ b/cmd/zed/zed.d/zed.rc
-@@ -15,7 +15,7 @@
- # Email will only be sent if ZED_EMAIL_ADDR is defined.
- # Disabled by default; uncomment to enable.
- #
--#ZED_EMAIL_ADDR="root"
-+ZED_EMAIL_ADDR="root"
- 
- ##
- # Name or path of executable responsible for sending notifications via email;
-@@ -41,7 +41,7 @@
- ##
- # Minimum number of seconds between notifications for a similar event.
- #
--#ZED_NOTIFY_INTERVAL_SECS=3600
-+ZED_NOTIFY_INTERVAL_SECS=3600
- 
- ##
- # Notification verbosity.
diff --git a/debian/patches/0006-Linux-5.0-compat-SIMD-compatibility.patch b/debian/patches/0006-Linux-5.0-compat-SIMD-compatibility.patch

new file mode 100644 (file)

index 0000000..d85991d
--- /dev/null
+++ b/debian/patches/0006-Linux-5.0-compat-SIMD-compatibility.patch
@@ -0,0 +1,1616 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Brian Behlendorf <behlendorf1@llnl.gov>
+Date: Fri, 12 Jul 2019 09:31:20 -0700
+Subject: [PATCH] Linux 5.0 compat: SIMD compatibility
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+Restore the SIMD optimization for 4.19.38 LTS, 4.14.120 LTS,
+and 5.0 and newer kernels.  This is accomplished by leveraging
+the fact that by definition dedicated kernel threads never need
+to concern themselves with saving and restoring the user FPU state.
+Therefore, they may use the FPU as long as we can guarantee user
+tasks always restore their FPU state before context switching back
+to user space.
+
+For the 5.0 and 5.1 kernels disabling preemption and local
+interrupts is sufficient to allow the FPU to be used.  All non-kernel
+threads will restore the preserved user FPU state.
+
+For 5.2 and later kernels the user FPU state restoration will be
+skipped if the kernel determines the registers have not changed.
+Therefore, for these kernels we need to perform the additional
+step of saving and restoring the FPU registers.  Invalidating the
+per-cpu global tracking the FPU state would force a restore but
+that functionality is private to the core x86 FPU implementation
+and unavailable.
+
+In practice, restricting SIMD to kernel threads is not a major
+restriction for ZFS.  The vast majority of SIMD operations are
+already performed by the IO pipeline.  The remaining cases are
+relatively infrequent and can be handled by the generic code
+without significant impact.  The two most noteworthy cases are:
+
+  1) Decrypting the wrapping key for an encrypted dataset,
+     i.e. `zfs load-key`.  All other encryption and decryption
+     operations will use the SIMD optimized implementations.
+
+  2) Generating the payload checksums for a `zfs send` stream.
+
+In order to avoid making any changes to the higher layers of ZFS
+all of the `*_get_ops()` functions were updated to take into
+consideration the calling context.  This allows for the fastest
+implementation to be used as appropriate (see kfpu_allowed()).
+
+The only other notable instance of SIMD operations being used
+outside a kernel thread was at module load time.  This code
+was moved into a taskq in order to accommodate the new kernel
+thread restriction.
+
+Finally, a few other modifications were made in order to further
+harden this code and facilitate testing.  They include updating
+each implementation's operations structure to be declared as a
+constant, and allowing "cycle" to be set when selecting the
+preferred ops in the kernel as well as user space.
+
+Reviewed-by: Tony Hutter <hutter2@llnl.gov>
+Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Closes #8754
+Closes #8793
+Closes #8965
+(cherry picked from commit e5db31349484e5e859c7a942eb15b98d68ce5b4d)
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ cmd/ztest/ztest.c                           |   3 +
+ config/kernel-fpu.m4                        |  46 ++++-
+ include/linux/Makefile.am                   |   1 +
+ include/linux/simd.h                        |  41 +++++
+ include/linux/simd_aarch64.h                |  18 +-
+ include/linux/simd_x86.h                    | 192 +++++++++++++-------
+ include/sys/vdev_raidz.h                    |   2 +-
+ include/sys/vdev_raidz_impl.h               |   2 +-
+ module/icp/algs/aes/aes_impl.c              |  34 ++--
+ module/icp/algs/aes/aes_impl_aesni.c        |   2 +-
+ module/icp/algs/modes/gcm.c                 |  41 +++--
+ module/icp/algs/modes/gcm_pclmulqdq.c       |   2 +-
+ module/icp/include/aes/aes_impl.h           |   6 +-
+ module/icp/include/modes/gcm_impl.h         |   6 +-
+ module/icp/io/aes.c                         |  32 +++-
+ module/spl/spl-taskq.c                      |   2 +
+ module/spl/spl-thread.c                     |   2 +
+ module/zcommon/zfs_fletcher.c               |  88 ++++++---
+ module/zcommon/zfs_fletcher_aarch64_neon.c  |   2 +-
+ module/zcommon/zfs_fletcher_avx512.c        |   2 +-
+ module/zcommon/zfs_fletcher_intel.c         |   2 +-
+ module/zcommon/zfs_fletcher_sse.c           |   5 +-
+ module/zfs/vdev_raidz_math.c                | 105 +++++++----
+ module/zfs/vdev_raidz_math_aarch64_neon.c   |   2 +-
+ module/zfs/vdev_raidz_math_aarch64_neonx2.c |   2 +-
+ module/zfs/vdev_raidz_math_avx2.c           |   2 +-
+ module/zfs/vdev_raidz_math_avx512bw.c       |   5 +-
+ module/zfs/vdev_raidz_math_avx512f.c        |   5 +-
+ module/zfs/vdev_raidz_math_sse2.c           |   2 +-
+ module/zfs/vdev_raidz_math_ssse3.c          |   4 +-
+ 30 files changed, 454 insertions(+), 204 deletions(-)
+ create mode 100644 include/linux/simd.h
+
+diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
+index 3bf840d88..dc22faef7 100644
+--- a/cmd/ztest/ztest.c
++++ b/cmd/ztest/ztest.c
+@@ -107,6 +107,7 @@
+ #include <sys/vdev_impl.h>
+ #include <sys/vdev_file.h>
+ #include <sys/vdev_initialize.h>
++#include <sys/vdev_raidz.h>
+ #include <sys/vdev_trim.h>
+ #include <sys/spa_impl.h>
+ #include <sys/metaslab_impl.h>
+@@ -7110,6 +7111,8 @@ ztest_run(ztest_shared_t *zs)
+       metaslab_preload_limit = ztest_random(20) + 1;
+       ztest_spa = spa;
+ 
++      VERIFY0(vdev_raidz_impl_set("cycle"));
++
+       dmu_objset_stats_t dds;
+       VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool,
+           DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os));
+diff --git a/config/kernel-fpu.m4 b/config/kernel-fpu.m4
+index ebb02fb09..0e622e859 100644
+--- a/config/kernel-fpu.m4
++++ b/config/kernel-fpu.m4
+@@ -2,8 +2,15 @@ dnl #
+ dnl # Handle differences in kernel FPU code.
+ dnl #
+ dnl # Kernel
+-dnl # 5.0:    All kernel fpu functions are GPL only, so we can't use them.
+-dnl #         (nothing defined)
++dnl # 5.2:    The fpu->initialized flag was replaced by TIF_NEED_FPU_LOAD.
++dnl #         HAVE_KERNEL_TIF_NEED_FPU_LOAD
++dnl #
++dnl # 5.0:    As an optimization SIMD operations performed by kernel
++dnl #         threads can skip saving and restoring their FPU context.
++dnl #         Wrappers have been introduced to determine the running
++dnl #         context and use either the SIMD or generic implementation.
++dnl #         This change was made to the 4.19.38 and 4.14.120 LTS kernels.
++dnl #         HAVE_KERNEL_FPU_INITIALIZED
+ dnl #
+ dnl # 4.2:    Use __kernel_fpu_{begin,end}()
+ dnl #         HAVE_UNDERSCORE_KERNEL_FPU & KERNEL_EXPORTS_X86_FPU
+@@ -59,10 +66,39 @@ AC_DEFUN([ZFS_AC_KERNEL_FPU], [
+                       __kernel_fpu_end();
+               ], [__kernel_fpu_begin], [arch/x86/kernel/fpu/core.c arch/x86/kernel/i387.c], [
+                       AC_MSG_RESULT(__kernel_fpu_*)
+-                      AC_DEFINE(HAVE_UNDERSCORE_KERNEL_FPU, 1, [kernel has __kernel_fpu_* functions])
+-                      AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1, [kernel exports FPU functions])
++                      AC_DEFINE(HAVE_UNDERSCORE_KERNEL_FPU, 1,
++                          [kernel has __kernel_fpu_* functions])
++                      AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1,
++                          [kernel exports FPU functions])
+               ],[
+-                      AC_MSG_RESULT(not exported)
++                      ZFS_LINUX_TRY_COMPILE([
++                              #include <linux/module.h>
++                              #include <linux/sched.h>
++                      ],[
++                              struct fpu *fpu = &current->thread.fpu;
++                              if (fpu->initialized) { return (0); };
++                      ],[
++                              AC_MSG_RESULT(fpu.initialized)
++                              AC_DEFINE(HAVE_KERNEL_FPU_INITIALIZED, 1,
++                                  [kernel fpu.initialized exists])
++                      ],[
++                              ZFS_LINUX_TRY_COMPILE([
++                                      #include <linux/module.h>
++                                      #include <asm/thread_info.h>
++
++                                      #if !defined(TIF_NEED_FPU_LOAD)
++                                      #error "TIF_NEED_FPU_LOAD undefined"
++                                      #endif
++                              ],[
++                              ],[
++                                      AC_MSG_RESULT(TIF_NEED_FPU_LOAD)
++                                      AC_DEFINE(
++                                          HAVE_KERNEL_TIF_NEED_FPU_LOAD, 1,
++                                          [kernel TIF_NEED_FPU_LOAD exists])
++                              ],[
++                                      AC_MSG_RESULT(unavailable)
++                              ])
++                      ])
+               ])
+       ])
+ ])
+diff --git a/include/linux/Makefile.am b/include/linux/Makefile.am
+index efb49520e..2455759e8 100644
+--- a/include/linux/Makefile.am
++++ b/include/linux/Makefile.am
+@@ -7,6 +7,7 @@ KERNEL_H = \
+       $(top_srcdir)/include/linux/blkdev_compat.h \
+       $(top_srcdir)/include/linux/utsname_compat.h \
+       $(top_srcdir)/include/linux/kmap_compat.h \
++      $(top_srcdir)/include/linux/simd.h \
+       $(top_srcdir)/include/linux/simd_x86.h \
+       $(top_srcdir)/include/linux/simd_aarch64.h \
+       $(top_srcdir)/include/linux/mod_compat.h \
+diff --git a/include/linux/simd.h b/include/linux/simd.h
+new file mode 100644
+index 000000000..d2b60996a
+--- /dev/null
++++ b/include/linux/simd.h
+@@ -0,0 +1,41 @@
++/*
++ * CDDL HEADER START
++ *
++ * The contents of this file are subject to the terms of the
++ * Common Development and Distribution License (the "License").
++ * You may not use this file except in compliance with the License.
++ *
++ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
++ * or http://www.opensolaris.org/os/licensing.
++ * See the License for the specific language governing permissions
++ * and limitations under the License.
++ *
++ * When distributing Covered Code, include this CDDL HEADER in each
++ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
++ * If applicable, add the following below this CDDL HEADER, with the
++ * fields enclosed by brackets "[]" replaced with your own identifying
++ * information: Portions Copyright [yyyy] [name of copyright owner]
++ *
++ * CDDL HEADER END
++ */
++/*
++ * Copyright (C) 2019 Lawrence Livermore National Security, LLC.
++ */
++
++#ifndef _SIMD_H
++#define       _SIMD_H
++
++#if defined(__x86)
++#include <linux/simd_x86.h>
++
++#elif defined(__aarch64__)
++#include <linux/simd_aarch64.h>
++#else
++
++#define       kfpu_allowed()          1
++#define       kfpu_initialize(tsk)    do {} while (0)
++#define       kfpu_begin()            do {} while (0)
++#define       kfpu_end()              do {} while (0)
++
++#endif
++#endif /* _SIMD_H */
+diff --git a/include/linux/simd_aarch64.h b/include/linux/simd_aarch64.h
+index 56153a160..b45d31c48 100644
+--- a/include/linux/simd_aarch64.h
++++ b/include/linux/simd_aarch64.h
+@@ -43,20 +43,18 @@
+ 
+ #if defined(_KERNEL)
+ #include <asm/neon.h>
+-#define       kfpu_begin()            \
+-{                                     \
+-      kernel_neon_begin();            \
+-}
+-#define       kfpu_end()                      \
+-{                                     \
+-      kernel_neon_end();              \
+-}
++#define       kfpu_allowed()          1
++#define       kfpu_initialize(tsk)    do {} while (0)
++#define       kfpu_begin()            kernel_neon_begin()
++#define       kfpu_end()              kernel_neon_end()
+ #else
+ /*
+  * fpu dummy methods for userspace
+  */
+-#define       kfpu_begin()    do {} while (0)
+-#define       kfpu_end()              do {} while (0)
++#define       kfpu_allowed()          1
++#define       kfpu_initialize(tsk)    do {} while (0)
++#define       kfpu_begin()            do {} while (0)
++#define       kfpu_end()              do {} while (0)
+ #endif /* defined(_KERNEL) */
+ 
+ #endif /* __aarch64__ */
+diff --git a/include/linux/simd_x86.h b/include/linux/simd_x86.h
+index 0489bfaa3..641f43955 100644
+--- a/include/linux/simd_x86.h
++++ b/include/linux/simd_x86.h
+@@ -92,33 +92,135 @@
+ #include <asm/xcr.h>
+ #endif
+ 
++/*
++ * The following cases are for kernels which export either the
++ * kernel_fpu_* or __kernel_fpu_* functions.
++ */
++#if defined(KERNEL_EXPORTS_X86_FPU)
++
++#define       kfpu_allowed()          1
++#define       kfpu_initialize(tsk)    do {} while (0)
++
+ #if defined(HAVE_UNDERSCORE_KERNEL_FPU)
+ #define       kfpu_begin()            \
+-{                                                     \
+-      preempt_disable();              \
++{                             \
++      preempt_disable();      \
+       __kernel_fpu_begin();   \
+ }
+-#define       kfpu_end()                      \
+-{                                                     \
+-      __kernel_fpu_end();             \
+-      preempt_enable();               \
++#define       kfpu_end()              \
++{                             \
++      __kernel_fpu_end();     \
++      preempt_enable();       \
+ }
++
+ #elif defined(HAVE_KERNEL_FPU)
+-#define       kfpu_begin()    kernel_fpu_begin()
++#define       kfpu_begin()            kernel_fpu_begin()
+ #define       kfpu_end()              kernel_fpu_end()
++
+ #else
+-/* Kernel doesn't export any kernel_fpu_* functions */
+-#include <asm/fpu/internal.h> /* For kernel xgetbv() */
+-#define       kfpu_begin()    panic("This code should never run")
+-#define       kfpu_end()      panic("This code should never run")
+-#endif /* defined(HAVE_KERNEL_FPU) */
++/*
++ * This case is unreachable.  When KERNEL_EXPORTS_X86_FPU is defined then
++ * either HAVE_UNDERSCORE_KERNEL_FPU or HAVE_KERNEL_FPU must be defined.
++ */
++#error "Unreachable kernel configuration"
++#endif
++
++#else /* defined(KERNEL_EXPORTS_X86_FPU) */
++/*
++ * When the kernel_fpu_* symbols are unavailable then provide our own
++ * versions which allow the FPU to be safely used in kernel threads.
++ * In practice, this is not a significant restriction for ZFS since the
++ * vast majority of SIMD operations are performed by the IO pipeline.
++ */
+ 
++/*
++ * Returns non-zero if FPU operations are allowed in the current context.
++ */
++#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
++#define       kfpu_allowed()          ((current->flags & PF_KTHREAD) && \
++                              test_thread_flag(TIF_NEED_FPU_LOAD))
++#elif defined(HAVE_KERNEL_FPU_INITIALIZED)
++#define       kfpu_allowed()          ((current->flags & PF_KTHREAD) && \
++                              current->thread.fpu.initialized)
+ #else
++#define       kfpu_allowed()          0
++#endif
++
++static inline void
++kfpu_initialize(void)
++{
++      WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
++
++#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
++      __fpu_invalidate_fpregs_state(&current->thread.fpu);
++      set_thread_flag(TIF_NEED_FPU_LOAD);
++#elif defined(HAVE_KERNEL_FPU_INITIALIZED)
++      __fpu_invalidate_fpregs_state(&current->thread.fpu);
++      current->thread.fpu.initialized = 1;
++#endif
++}
++
++static inline void
++kfpu_begin(void)
++{
++      WARN_ON_ONCE(!kfpu_allowed());
++
++      /*
++       * Preemption and interrupts must be disabled for the critical
++       * region where the FPU state is being modified.
++       */
++      preempt_disable();
++      local_irq_disable();
++
++#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
++      /*
++       * The current FPU registers need to be preserved by kfpu_begin()
++       * and restored by kfpu_end().  This is required because we can
++       * not call __cpu_invalidate_fpregs_state() to invalidate the
++       * per-cpu FPU state and force them to be restored during a
++       * context switch.
++       */
++      copy_fpregs_to_fpstate(&current->thread.fpu);
++#elif defined(HAVE_KERNEL_FPU_INITIALIZED)
++      /*
++       * There is no need to preserve and restore the FPU registers.
++       * They will always be restored from the task's stored FPU state
++       * when switching contexts.
++       */
++      WARN_ON_ONCE(current->thread.fpu.initialized == 0);
++#endif
++}
++
++static inline void
++kfpu_end(void)
++{
++#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
++      union fpregs_state *state = &current->thread.fpu.state;
++      int error;
++
++      if (use_xsave()) {
++              error = copy_kernel_to_xregs_err(&state->xsave, -1);
++      } else if (use_fxsr()) {
++              error = copy_kernel_to_fxregs_err(&state->fxsave);
++      } else {
++              error = copy_kernel_to_fregs_err(&state->fsave);
++      }
++      WARN_ON_ONCE(error);
++#endif
++
++      local_irq_enable();
++      preempt_enable();
++}
++#endif /* defined(KERNEL_EXPORTS_X86_FPU) */
++
++#else /* defined(_KERNEL) */
+ /*
+- * fpu dummy methods for userspace
++ * FPU dummy methods for user space.
+  */
+-#define       kfpu_begin()    do {} while (0)
+-#define       kfpu_end()              do {} while (0)
++#define       kfpu_allowed()          1
++#define       kfpu_initialize(tsk)    do {} while (0)
++#define       kfpu_begin()            do {} while (0)
++#define       kfpu_end()              do {} while (0)
+ #endif /* defined(_KERNEL) */
+ 
+ /*
+@@ -300,7 +402,7 @@ __simd_state_enabled(const uint64_t state)
+       uint64_t xcr0;
+ 
+ #if defined(_KERNEL)
+-#if defined(X86_FEATURE_OSXSAVE) && defined(KERNEL_EXPORTS_X86_FPU)
++#if defined(X86_FEATURE_OSXSAVE)
+       has_osxsave = !!boot_cpu_has(X86_FEATURE_OSXSAVE);
+ #else
+       has_osxsave = B_FALSE;
+@@ -330,11 +432,7 @@ static inline boolean_t
+ zfs_sse_available(void)
+ {
+ #if defined(_KERNEL)
+-#if defined(KERNEL_EXPORTS_X86_FPU)
+       return (!!boot_cpu_has(X86_FEATURE_XMM));
+-#else
+-      return (B_FALSE);
+-#endif
+ #elif !defined(_KERNEL)
+       return (__cpuid_has_sse());
+ #endif
+@@ -347,11 +445,7 @@ static inline boolean_t
+ zfs_sse2_available(void)
+ {
+ #if defined(_KERNEL)
+-#if defined(KERNEL_EXPORTS_X86_FPU)
+       return (!!boot_cpu_has(X86_FEATURE_XMM2));
+-#else
+-      return (B_FALSE);
+-#endif
+ #elif !defined(_KERNEL)
+       return (__cpuid_has_sse2());
+ #endif
+@@ -364,11 +458,7 @@ static inline boolean_t
+ zfs_sse3_available(void)
+ {
+ #if defined(_KERNEL)
+-#if defined(KERNEL_EXPORTS_X86_FPU)
+       return (!!boot_cpu_has(X86_FEATURE_XMM3));
+-#else
+-      return (B_FALSE);
+-#endif
+ #elif !defined(_KERNEL)
+       return (__cpuid_has_sse3());
+ #endif
+@@ -381,11 +471,7 @@ static inline boolean_t
+ zfs_ssse3_available(void)
+ {
+ #if defined(_KERNEL)
+-#if defined(KERNEL_EXPORTS_X86_FPU)
+       return (!!boot_cpu_has(X86_FEATURE_SSSE3));
+-#else
+-      return (B_FALSE);
+-#endif
+ #elif !defined(_KERNEL)
+       return (__cpuid_has_ssse3());
+ #endif
+@@ -398,11 +484,7 @@ static inline boolean_t
+ zfs_sse4_1_available(void)
+ {
+ #if defined(_KERNEL)
+-#if defined(KERNEL_EXPORTS_X86_FPU)
+       return (!!boot_cpu_has(X86_FEATURE_XMM4_1));
+-#else
+-      return (B_FALSE);
+-#endif
+ #elif !defined(_KERNEL)
+       return (__cpuid_has_sse4_1());
+ #endif
+@@ -415,11 +497,7 @@ static inline boolean_t
+ zfs_sse4_2_available(void)
+ {
+ #if defined(_KERNEL)
+-#if defined(KERNEL_EXPORTS_X86_FPU)
+       return (!!boot_cpu_has(X86_FEATURE_XMM4_2));
+-#else
+-      return (B_FALSE);
+-#endif
+ #elif !defined(_KERNEL)
+       return (__cpuid_has_sse4_2());
+ #endif
+@@ -433,11 +511,7 @@ zfs_avx_available(void)
+ {
+       boolean_t has_avx;
+ #if defined(_KERNEL)
+-#if defined(KERNEL_EXPORTS_X86_FPU)
+       has_avx = !!boot_cpu_has(X86_FEATURE_AVX);
+-#else
+-      has_avx = B_FALSE;
+-#endif
+ #elif !defined(_KERNEL)
+       has_avx = __cpuid_has_avx();
+ #endif
+@@ -453,11 +527,7 @@ zfs_avx2_available(void)
+ {
+       boolean_t has_avx2;
+ #if defined(_KERNEL)
+-#if defined(X86_FEATURE_AVX2) && defined(KERNEL_EXPORTS_X86_FPU)
+       has_avx2 = !!boot_cpu_has(X86_FEATURE_AVX2);
+-#else
+-      has_avx2 = B_FALSE;
+-#endif
+ #elif !defined(_KERNEL)
+       has_avx2 = __cpuid_has_avx2();
+ #endif
+@@ -472,7 +542,7 @@ static inline boolean_t
+ zfs_bmi1_available(void)
+ {
+ #if defined(_KERNEL)
+-#if defined(X86_FEATURE_BMI1) && defined(KERNEL_EXPORTS_X86_FPU)
++#if defined(X86_FEATURE_BMI1)
+       return (!!boot_cpu_has(X86_FEATURE_BMI1));
+ #else
+       return (B_FALSE);
+@@ -489,7 +559,7 @@ static inline boolean_t
+ zfs_bmi2_available(void)
+ {
+ #if defined(_KERNEL)
+-#if defined(X86_FEATURE_BMI2) && defined(KERNEL_EXPORTS_X86_FPU)
++#if defined(X86_FEATURE_BMI2)
+       return (!!boot_cpu_has(X86_FEATURE_BMI2));
+ #else
+       return (B_FALSE);
+@@ -506,7 +576,7 @@ static inline boolean_t
+ zfs_aes_available(void)
+ {
+ #if defined(_KERNEL)
+-#if defined(X86_FEATURE_AES) && defined(KERNEL_EXPORTS_X86_FPU)
++#if defined(X86_FEATURE_AES)
+       return (!!boot_cpu_has(X86_FEATURE_AES));
+ #else
+       return (B_FALSE);
+@@ -523,7 +593,7 @@ static inline boolean_t
+ zfs_pclmulqdq_available(void)
+ {
+ #if defined(_KERNEL)
+-#if defined(X86_FEATURE_PCLMULQDQ) && defined(KERNEL_EXPORTS_X86_FPU)
++#if defined(X86_FEATURE_PCLMULQDQ)
+       return (!!boot_cpu_has(X86_FEATURE_PCLMULQDQ));
+ #else
+       return (B_FALSE);
+@@ -557,7 +627,7 @@ zfs_avx512f_available(void)
+       boolean_t has_avx512 = B_FALSE;
+ 
+ #if defined(_KERNEL)
+-#if defined(X86_FEATURE_AVX512F) && defined(KERNEL_EXPORTS_X86_FPU)
++#if defined(X86_FEATURE_AVX512F)
+       has_avx512 = !!boot_cpu_has(X86_FEATURE_AVX512F);
+ #else
+       has_avx512 = B_FALSE;
+@@ -576,7 +646,7 @@ zfs_avx512cd_available(void)
+       boolean_t has_avx512 = B_FALSE;
+ 
+ #if defined(_KERNEL)
+-#if defined(X86_FEATURE_AVX512CD) && defined(KERNEL_EXPORTS_X86_FPU)
++#if defined(X86_FEATURE_AVX512CD)
+       has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
+           boot_cpu_has(X86_FEATURE_AVX512CD);
+ #else
+@@ -596,7 +666,7 @@ zfs_avx512er_available(void)
+       boolean_t has_avx512 = B_FALSE;
+ 
+ #if defined(_KERNEL)
+-#if defined(X86_FEATURE_AVX512ER) && defined(KERNEL_EXPORTS_X86_FPU)
++#if defined(X86_FEATURE_AVX512ER)
+       has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
+           boot_cpu_has(X86_FEATURE_AVX512ER);
+ #else
+@@ -616,7 +686,7 @@ zfs_avx512pf_available(void)
+       boolean_t has_avx512 = B_FALSE;
+ 
+ #if defined(_KERNEL)
+-#if defined(X86_FEATURE_AVX512PF) && defined(KERNEL_EXPORTS_X86_FPU)
++#if defined(X86_FEATURE_AVX512PF)
+       has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
+           boot_cpu_has(X86_FEATURE_AVX512PF);
+ #else
+@@ -636,7 +706,7 @@ zfs_avx512bw_available(void)
+       boolean_t has_avx512 = B_FALSE;
+ 
+ #if defined(_KERNEL)
+-#if defined(X86_FEATURE_AVX512BW) && defined(KERNEL_EXPORTS_X86_FPU)
++#if defined(X86_FEATURE_AVX512BW)
+       has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
+           boot_cpu_has(X86_FEATURE_AVX512BW);
+ #else
+@@ -656,7 +726,7 @@ zfs_avx512dq_available(void)
+       boolean_t has_avx512 = B_FALSE;
+ 
+ #if defined(_KERNEL)
+-#if defined(X86_FEATURE_AVX512DQ) && defined(KERNEL_EXPORTS_X86_FPU)
++#if defined(X86_FEATURE_AVX512DQ)
+       has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
+           boot_cpu_has(X86_FEATURE_AVX512DQ);
+ #else
+@@ -676,7 +746,7 @@ zfs_avx512vl_available(void)
+       boolean_t has_avx512 = B_FALSE;
+ 
+ #if defined(_KERNEL)
+-#if defined(X86_FEATURE_AVX512VL) && defined(KERNEL_EXPORTS_X86_FPU)
++#if defined(X86_FEATURE_AVX512VL)
+       has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
+           boot_cpu_has(X86_FEATURE_AVX512VL);
+ #else
+@@ -696,7 +766,7 @@ zfs_avx512ifma_available(void)
+       boolean_t has_avx512 = B_FALSE;
+ 
+ #if defined(_KERNEL)
+-#if defined(X86_FEATURE_AVX512IFMA) && defined(KERNEL_EXPORTS_X86_FPU)
++#if defined(X86_FEATURE_AVX512IFMA)
+       has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
+           boot_cpu_has(X86_FEATURE_AVX512IFMA);
+ #else
+@@ -716,7 +786,7 @@ zfs_avx512vbmi_available(void)
+       boolean_t has_avx512 = B_FALSE;
+ 
+ #if defined(_KERNEL)
+-#if defined(X86_FEATURE_AVX512VBMI) && defined(KERNEL_EXPORTS_X86_FPU)
++#if defined(X86_FEATURE_AVX512VBMI)
+       has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
+           boot_cpu_has(X86_FEATURE_AVX512VBMI);
+ #else
+diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h
+index 2ce32469d..0ce2b5ea1 100644
+--- a/include/sys/vdev_raidz.h
++++ b/include/sys/vdev_raidz.h
+@@ -51,7 +51,7 @@ int vdev_raidz_reconstruct(struct raidz_map *, const int *, int);
+  */
+ void vdev_raidz_math_init(void);
+ void vdev_raidz_math_fini(void);
+-struct raidz_impl_ops *vdev_raidz_math_get_ops(void);
++const struct raidz_impl_ops *vdev_raidz_math_get_ops(void);
+ int vdev_raidz_math_generate(struct raidz_map *);
+ int vdev_raidz_math_reconstruct(struct raidz_map *, const int *, const int *,
+     const int);
+diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h
+index 0799ed19d..4969d110b 100644
+--- a/include/sys/vdev_raidz_impl.h
++++ b/include/sys/vdev_raidz_impl.h
+@@ -126,7 +126,7 @@ typedef struct raidz_map {
+       uintptr_t rm_reports;           /* # of referencing checksum reports */
+       uint8_t rm_freed;               /* map no longer has referencing ZIO */
+       uint8_t rm_ecksuminjected;      /* checksum error was injected */
+-      raidz_impl_ops_t *rm_ops;       /* RAIDZ math operations */
++      const raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */
+       raidz_col_t rm_col[1];          /* Flexible array of I/O columns */
+ } raidz_map_t;
+ 
+diff --git a/module/icp/algs/aes/aes_impl.c b/module/icp/algs/aes/aes_impl.c
+index 36e0686a5..0f11f9999 100644
+--- a/module/icp/algs/aes/aes_impl.c
++++ b/module/icp/algs/aes/aes_impl.c
+@@ -27,6 +27,7 @@
+ #include <sys/crypto/spi.h>
+ #include <modes/modes.h>
+ #include <aes/aes_impl.h>
++#include <linux/simd.h>
+ 
+ /*
+  * Initialize AES encryption and decryption key schedules.
+@@ -40,9 +41,9 @@
+ void
+ aes_init_keysched(const uint8_t *cipherKey, uint_t keyBits, void *keysched)
+ {
+-      aes_impl_ops_t  *ops = aes_impl_get_ops();
+-      aes_key_t       *newbie = keysched;
+-      uint_t          keysize, i, j;
++      const aes_impl_ops_t *ops = aes_impl_get_ops();
++      aes_key_t *newbie = keysched;
++      uint_t keysize, i, j;
+       union {
+               uint64_t        ka64[4];
+               uint32_t        ka32[8];
+@@ -252,12 +253,17 @@ static size_t aes_supp_impl_cnt = 0;
+ static aes_impl_ops_t *aes_supp_impl[ARRAY_SIZE(aes_all_impl)];
+ 
+ /*
+- * Selects the aes operations for encrypt/decrypt/key setup
++ * Returns the AES operations for encrypt/decrypt/key setup.  When a
++ * SIMD implementation is not allowed in the current context, then
++ * fallback to the fastest generic implementation.
+  */
+-aes_impl_ops_t *
+-aes_impl_get_ops()
++const aes_impl_ops_t *
++aes_impl_get_ops(void)
+ {
+-      aes_impl_ops_t *ops = NULL;
++      if (!kfpu_allowed())
++              return (&aes_generic_impl);
++
++      const aes_impl_ops_t *ops = NULL;
+       const uint32_t impl = AES_IMPL_READ(icp_aes_impl);
+ 
+       switch (impl) {
+@@ -266,15 +272,13 @@ aes_impl_get_ops()
+               ops = &aes_fastest_impl;
+               break;
+       case IMPL_CYCLE:
+-      {
++              /* Cycle through supported implementations */
+               ASSERT(aes_impl_initialized);
+               ASSERT3U(aes_supp_impl_cnt, >, 0);
+-              /* Cycle through supported implementations */
+               static size_t cycle_impl_idx = 0;
+               size_t idx = (++cycle_impl_idx) % aes_supp_impl_cnt;
+               ops = aes_supp_impl[idx];
+-      }
+-      break;
++              break;
+       default:
+               ASSERT3U(impl, <, aes_supp_impl_cnt);
+               ASSERT3U(aes_supp_impl_cnt, >, 0);
+@@ -288,13 +292,17 @@ aes_impl_get_ops()
+       return (ops);
+ }
+ 
++/*
++ * Initialize all supported implementations.
++ */
++/* ARGSUSED */
+ void
+-aes_impl_init(void)
++aes_impl_init(void *arg)
+ {
+       aes_impl_ops_t *curr_impl;
+       int i, c;
+ 
+-      /* move supported impl into aes_supp_impls */
++      /* Move supported implementations into aes_supp_impls */
+       for (i = 0, c = 0; i < ARRAY_SIZE(aes_all_impl); i++) {
+               curr_impl = (aes_impl_ops_t *)aes_all_impl[i];
+ 
+diff --git a/module/icp/algs/aes/aes_impl_aesni.c b/module/icp/algs/aes/aes_impl_aesni.c
+index 97f7c3a47..222c176aa 100644
+--- a/module/icp/algs/aes/aes_impl_aesni.c
++++ b/module/icp/algs/aes/aes_impl_aesni.c
+@@ -108,7 +108,7 @@ aes_aesni_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4],
+ static boolean_t
+ aes_aesni_will_work(void)
+ {
+-      return (zfs_aes_available());
++      return (kfpu_allowed() && zfs_aes_available());
+ }
+ 
+ const aes_impl_ops_t aes_aesni_impl = {
+diff --git a/module/icp/algs/modes/gcm.c b/module/icp/algs/modes/gcm.c
+index 0afd957f0..423b70e2c 100644
+--- a/module/icp/algs/modes/gcm.c
++++ b/module/icp/algs/modes/gcm.c
+@@ -29,6 +29,7 @@
+ #include <sys/crypto/impl.h>
+ #include <sys/byteorder.h>
+ #include <modes/gcm_impl.h>
++#include <linux/simd.h>
+ 
+ #define       GHASH(c, d, t, o) \
+       xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
+@@ -46,7 +47,7 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
+     void (*copy_block)(uint8_t *, uint8_t *),
+     void (*xor_block)(uint8_t *, uint8_t *))
+ {
+-      gcm_impl_ops_t *gops;
++      const gcm_impl_ops_t *gops;
+       size_t remainder = length;
+       size_t need = 0;
+       uint8_t *datap = (uint8_t *)data;
+@@ -168,7 +169,7 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
+     void (*copy_block)(uint8_t *, uint8_t *),
+     void (*xor_block)(uint8_t *, uint8_t *))
+ {
+-      gcm_impl_ops_t *gops;
++      const gcm_impl_ops_t *gops;
+       uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
+       uint8_t *ghash, *macp = NULL;
+       int i, rv;
+@@ -320,7 +321,7 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
+     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+     void (*xor_block)(uint8_t *, uint8_t *))
+ {
+-      gcm_impl_ops_t *gops;
++      const gcm_impl_ops_t *gops;
+       size_t pt_len;
+       size_t remainder;
+       uint8_t *ghash;
+@@ -427,7 +428,7 @@ gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len,
+     void (*copy_block)(uint8_t *, uint8_t *),
+     void (*xor_block)(uint8_t *, uint8_t *))
+ {
+-      gcm_impl_ops_t *gops;
++      const gcm_impl_ops_t *gops;
+       uint8_t *cb;
+       ulong_t remainder = iv_len;
+       ulong_t processed = 0;
+@@ -481,7 +482,7 @@ gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
+     void (*copy_block)(uint8_t *, uint8_t *),
+     void (*xor_block)(uint8_t *, uint8_t *))
+ {
+-      gcm_impl_ops_t *gops;
++      const gcm_impl_ops_t *gops;
+       uint8_t *ghash, *datap, *authp;
+       size_t remainder, processed;
+ 
+@@ -660,12 +661,17 @@ static size_t gcm_supp_impl_cnt = 0;
+ static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];
+ 
+ /*
+- * Selects the gcm operation
++ * Returns the GCM operations for encrypt/decrypt/key setup.  When a
++ * SIMD implementation is not allowed in the current context, then
++ * fallback to the fastest generic implementation.
+  */
+-gcm_impl_ops_t *
++const gcm_impl_ops_t *
+ gcm_impl_get_ops()
+ {
+-      gcm_impl_ops_t *ops = NULL;
++      if (!kfpu_allowed())
++              return (&gcm_generic_impl);
++
++      const gcm_impl_ops_t *ops = NULL;
+       const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
+ 
+       switch (impl) {
+@@ -674,15 +680,13 @@ gcm_impl_get_ops()
+               ops = &gcm_fastest_impl;
+               break;
+       case IMPL_CYCLE:
+-      {
++              /* Cycle through supported implementations */
+               ASSERT(gcm_impl_initialized);
+               ASSERT3U(gcm_supp_impl_cnt, >, 0);
+-              /* Cycle through supported implementations */
+               static size_t cycle_impl_idx = 0;
+               size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
+               ops = gcm_supp_impl[idx];
+-      }
+-      break;
++              break;
+       default:
+               ASSERT3U(impl, <, gcm_supp_impl_cnt);
+               ASSERT3U(gcm_supp_impl_cnt, >, 0);
+@@ -696,13 +700,17 @@ gcm_impl_get_ops()
+       return (ops);
+ }
+ 
++/*
++ * Initialize all supported implementations.
++ */
++/* ARGSUSED */
+ void
+-gcm_impl_init(void)
++gcm_impl_init(void *arg)
+ {
+       gcm_impl_ops_t *curr_impl;
+       int i, c;
+ 
+-      /* move supported impl into aes_supp_impls */
++      /* Move supported implementations into gcm_supp_impls */
+       for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
+               curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];
+ 
+@@ -711,7 +719,10 @@ gcm_impl_init(void)
+       }
+       gcm_supp_impl_cnt = c;
+ 
+-      /* set fastest implementation. assume hardware accelerated is fastest */
++      /*
++       * Set the fastest implementation given the assumption that the
++       * hardware accelerated version is the fastest.
++       */
+ #if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
+       if (gcm_pclmulqdq_impl.is_supported()) {
+               memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
+diff --git a/module/icp/algs/modes/gcm_pclmulqdq.c b/module/icp/algs/modes/gcm_pclmulqdq.c
+index be00ba37b..8a43ba33a 100644
+--- a/module/icp/algs/modes/gcm_pclmulqdq.c
++++ b/module/icp/algs/modes/gcm_pclmulqdq.c
+@@ -52,7 +52,7 @@ gcm_pclmulqdq_mul(uint64_t *x_in, uint64_t *y, uint64_t *res)
+ static boolean_t
+ gcm_pclmulqdq_will_work(void)
+ {
+-      return (zfs_pclmulqdq_available());
++      return (kfpu_allowed() && zfs_pclmulqdq_available());
+ }
+ 
+ const gcm_impl_ops_t gcm_pclmulqdq_impl = {
+diff --git a/module/icp/include/aes/aes_impl.h b/module/icp/include/aes/aes_impl.h
+index 3a3de91cf..329e32a8e 100644
+--- a/module/icp/include/aes/aes_impl.h
++++ b/module/icp/include/aes/aes_impl.h
+@@ -198,12 +198,12 @@ extern const aes_impl_ops_t aes_aesni_impl;
+ /*
+  * Initializes fastest implementation
+  */
+-void aes_impl_init(void);
++void aes_impl_init(void *arg);
+ 
+ /*
+- * Get selected aes implementation
++ * Returns optimal allowed AES implementation
+  */
+-struct aes_impl_ops *aes_impl_get_ops(void);
++const struct aes_impl_ops *aes_impl_get_ops(void);
+ 
+ #ifdef        __cplusplus
+ }
+diff --git a/module/icp/include/modes/gcm_impl.h b/module/icp/include/modes/gcm_impl.h
+index b78cc8aab..dff372ef8 100644
+--- a/module/icp/include/modes/gcm_impl.h
++++ b/module/icp/include/modes/gcm_impl.h
+@@ -61,12 +61,12 @@ extern const gcm_impl_ops_t gcm_pclmulqdq_impl;
+ /*
+  * Initializes fastest implementation
+  */
+-void gcm_impl_init(void);
++void gcm_impl_init(void *arg);
+ 
+ /*
+- * Get selected aes implementation
++ * Returns optimal allowed GCM implementation
+  */
+-struct gcm_impl_ops *gcm_impl_get_ops(void);
++const struct gcm_impl_ops *gcm_impl_get_ops(void);
+ 
+ #ifdef        __cplusplus
+ }
+diff --git a/module/icp/io/aes.c b/module/icp/io/aes.c
+index 53b193693..51538bc60 100644
+--- a/module/icp/io/aes.c
++++ b/module/icp/io/aes.c
+@@ -206,9 +206,35 @@ aes_mod_init(void)
+ {
+       int ret;
+ 
+-      /* find fastest implementations and set any requested implementations */
+-      aes_impl_init();
+-      gcm_impl_init();
++#if defined(_KERNEL)
++      /*
++       * Determine the fastest available implementation.  The benchmarks
++       * are run in dedicated kernel threads to allow Linux 5.0+ kernels
++       * to use SIMD operations.  If for some reason this isn't possible,
++       * fallback to the generic implementations.  See the comment in
++       * include/linux/simd_x86.h for additional details.  Additionally,
++       * this has the benefit of allowing them to be run in parallel.
++       */
++      taskqid_t aes_id = taskq_dispatch(system_taskq, aes_impl_init,
++          NULL, TQ_SLEEP);
++      taskqid_t gcm_id = taskq_dispatch(system_taskq, gcm_impl_init,
++          NULL, TQ_SLEEP);
++
++      if (aes_id != TASKQID_INVALID) {
++              taskq_wait_id(system_taskq, aes_id);
++      } else {
++              aes_impl_init(NULL);
++      }
++
++      if (gcm_id != TASKQID_INVALID) {
++              taskq_wait_id(system_taskq, gcm_id);
++      } else {
++              gcm_impl_init(NULL);
++      }
++#else
++      aes_impl_init(NULL);
++      gcm_impl_init(NULL);
++#endif
+ 
+       if ((ret = mod_install(&modlinkage)) != 0)
+               return (ret);
+diff --git a/module/spl/spl-taskq.c b/module/spl/spl-taskq.c
+index a39f94e4c..69d591ff7 100644
+--- a/module/spl/spl-taskq.c
++++ b/module/spl/spl-taskq.c
+@@ -28,6 +28,7 @@
+ #include <sys/taskq.h>
+ #include <sys/kmem.h>
+ #include <sys/tsd.h>
++#include <linux/simd.h>
+ 
+ int spl_taskq_thread_bind = 0;
+ module_param(spl_taskq_thread_bind, int, 0644);
+@@ -853,6 +854,7 @@ taskq_thread(void *args)
+       sigfillset(&blocked);
+       sigprocmask(SIG_BLOCK, &blocked, NULL);
+       flush_signals(current);
++      kfpu_initialize();
+ 
+       tsd_set(taskq_tsd, tq);
+       spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+diff --git a/module/spl/spl-thread.c b/module/spl/spl-thread.c
+index 0352a31ea..07e3a1bff 100644
+--- a/module/spl/spl-thread.c
++++ b/module/spl/spl-thread.c
+@@ -27,6 +27,7 @@
+ #include <sys/thread.h>
+ #include <sys/kmem.h>
+ #include <sys/tsd.h>
++#include <linux/simd.h>
+ 
+ /*
+  * Thread interfaces
+@@ -54,6 +55,7 @@ thread_generic_wrapper(void *arg)
+       args = tp->tp_args;
+       set_current_state(tp->tp_state);
+       set_user_nice((kthread_t *)current, PRIO_TO_NICE(tp->tp_pri));
++      kfpu_initialize();
+       kmem_free(tp->tp_name, tp->tp_name_size);
+       kmem_free(tp, sizeof (thread_priv_t));
+ 
+diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c
+index f712ce40c..9187a7c1e 100644
+--- a/module/zcommon/zfs_fletcher.c
++++ b/module/zcommon/zfs_fletcher.c
+@@ -140,6 +140,7 @@
+ #include <sys/zio_checksum.h>
+ #include <sys/zfs_context.h>
+ #include <zfs_fletcher.h>
++#include <linux/simd.h>
+ 
+ #define       FLETCHER_MIN_SIMD_SIZE  64
+ 
+@@ -205,21 +206,19 @@ static struct fletcher_4_impl_selector {
+       const char      *fis_name;
+       uint32_t        fis_sel;
+ } fletcher_4_impl_selectors[] = {
+-#if !defined(_KERNEL)
+       { "cycle",      IMPL_CYCLE },
+-#endif
+       { "fastest",    IMPL_FASTEST },
+       { "scalar",     IMPL_SCALAR }
+ };
+ 
+ #if defined(_KERNEL)
+ static kstat_t *fletcher_4_kstat;
+-#endif
+ 
+ static struct fletcher_4_kstat {
+       uint64_t native;
+       uint64_t byteswap;
+ } fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1];
++#endif
+ 
+ /* Indicate that benchmark has been completed */
+ static boolean_t fletcher_4_initialized = B_FALSE;
+@@ -408,32 +407,36 @@ fletcher_4_impl_set(const char *val)
+       return (err);
+ }
+ 
++/*
++ * Returns the Fletcher 4 operations for checksums.   When a SIMD
++ * implementation is not allowed in the current context, then fallback
++ * to the fastest generic implementation.
++ */
+ static inline const fletcher_4_ops_t *
+ fletcher_4_impl_get(void)
+ {
+-      fletcher_4_ops_t *ops = NULL;
+-      const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
++      if (!kfpu_allowed())
++              return (&fletcher_4_superscalar4_ops);
++
++      const fletcher_4_ops_t *ops = NULL;
++      uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
+ 
+       switch (impl) {
+       case IMPL_FASTEST:
+               ASSERT(fletcher_4_initialized);
+               ops = &fletcher_4_fastest_impl;
+               break;
+-#if !defined(_KERNEL)
+-      case IMPL_CYCLE: {
++      case IMPL_CYCLE:
++              /* Cycle through supported implementations */
+               ASSERT(fletcher_4_initialized);
+               ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
+-
+               static uint32_t cycle_count = 0;
+               uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt;
+               ops = fletcher_4_supp_impls[idx];
+-      }
+-      break;
+-#endif
++              break;
+       default:
+               ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
+               ASSERT3U(impl, <, fletcher_4_supp_impls_cnt);
+-
+               ops = fletcher_4_supp_impls[impl];
+               break;
+       }
+@@ -659,6 +662,7 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
+ typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *,
+                                       zio_cksum_t *);
+ 
++#if defined(_KERNEL)
+ static void
+ fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
+ {
+@@ -716,16 +720,18 @@ fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
+       /* restore original selection */
+       atomic_swap_32(&fletcher_4_impl_chosen, sel_save);
+ }
++#endif /* _KERNEL */
+ 
+-void
+-fletcher_4_init(void)
++/*
++ * Initialize and benchmark all supported implementations.
++ */
++static void
++fletcher_4_benchmark(void *arg)
+ {
+-      static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
+       fletcher_4_ops_t *curr_impl;
+-      char *databuf;
+       int i, c;
+ 
+-      /* move supported impl into fletcher_4_supp_impls */
++      /* Move supported implementations into fletcher_4_supp_impls */
+       for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) {
+               curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i];
+ 
+@@ -735,19 +741,10 @@ fletcher_4_init(void)
+       membar_producer();      /* complete fletcher_4_supp_impls[] init */
+       fletcher_4_supp_impls_cnt = c;  /* number of supported impl */
+ 
+-#if !defined(_KERNEL)
+-      /* Skip benchmarking and use last implementation as fastest */
+-      memcpy(&fletcher_4_fastest_impl,
+-          fletcher_4_supp_impls[fletcher_4_supp_impls_cnt-1],
+-          sizeof (fletcher_4_fastest_impl));
+-      fletcher_4_fastest_impl.name = "fastest";
+-      membar_producer();
++#if defined(_KERNEL)
++      static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
++      char *databuf = vmem_alloc(data_size, KM_SLEEP);
+ 
+-      fletcher_4_initialized = B_TRUE;
+-      return;
+-#endif
+-      /* Benchmark all supported implementations */
+-      databuf = vmem_alloc(data_size, KM_SLEEP);
+       for (i = 0; i < data_size / sizeof (uint64_t); i++)
+               ((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */
+ 
+@@ -755,9 +752,38 @@ fletcher_4_init(void)
+       fletcher_4_benchmark_impl(B_TRUE, databuf, data_size);
+ 
+       vmem_free(databuf, data_size);
++#else
++      /*
++       * Skip the benchmark in user space to avoid impacting libzpool
++       * consumers (zdb, zhack, zinject, ztest).  The last implementation
++       * is assumed to be the fastest and used by default.
++       */
++      memcpy(&fletcher_4_fastest_impl,
++          fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1],
++          sizeof (fletcher_4_fastest_impl));
++      fletcher_4_fastest_impl.name = "fastest";
++      membar_producer();
++#endif /* _KERNEL */
++}
+ 
++void
++fletcher_4_init(void)
++{
+ #if defined(_KERNEL)
+-      /* install kstats for all implementations */
++      /*
++       * For 5.0 and later Linux kernels the fletcher 4 benchmarks are
++       * run in a kernel thread.  This is needed to take advantage of the
++       * SIMD functionality, see include/linux/simd_x86.h for details.
++       */
++      taskqid_t id = taskq_dispatch(system_taskq, fletcher_4_benchmark,
++          NULL, TQ_SLEEP);
++      if (id != TASKQID_INVALID) {
++              taskq_wait_id(system_taskq, id);
++      } else {
++              fletcher_4_benchmark(NULL);
++      }
++
++      /* Install kstats for all implementations */
+       fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",
+           KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+       if (fletcher_4_kstat != NULL) {
+@@ -769,6 +795,8 @@ fletcher_4_init(void)
+                   fletcher_4_kstat_addr);
+               kstat_install(fletcher_4_kstat);
+       }
++#else
++      fletcher_4_benchmark(NULL);
+ #endif
+ 
+       /* Finish initialization */
+diff --git a/module/zcommon/zfs_fletcher_aarch64_neon.c b/module/zcommon/zfs_fletcher_aarch64_neon.c
+index bd2db2b20..3b3c1b52b 100644
+--- a/module/zcommon/zfs_fletcher_aarch64_neon.c
++++ b/module/zcommon/zfs_fletcher_aarch64_neon.c
+@@ -198,7 +198,7 @@ unsigned char SRC __attribute__((vector_size(16)));
+ 
+ static boolean_t fletcher_4_aarch64_neon_valid(void)
+ {
+-      return (B_TRUE);
++      return (kfpu_allowed());
+ }
+ 
+ const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = {
+diff --git a/module/zcommon/zfs_fletcher_avx512.c b/module/zcommon/zfs_fletcher_avx512.c
+index 7260a9864..0d4cff21a 100644
+--- a/module/zcommon/zfs_fletcher_avx512.c
++++ b/module/zcommon/zfs_fletcher_avx512.c
+@@ -157,7 +157,7 @@ STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_byteswap);
+ static boolean_t
+ fletcher_4_avx512f_valid(void)
+ {
+-      return (zfs_avx512f_available());
++      return (kfpu_allowed() && zfs_avx512f_available());
+ }
+ 
+ const fletcher_4_ops_t fletcher_4_avx512f_ops = {
+diff --git a/module/zcommon/zfs_fletcher_intel.c b/module/zcommon/zfs_fletcher_intel.c
+index 6dac047da..7f12efe6d 100644
+--- a/module/zcommon/zfs_fletcher_intel.c
++++ b/module/zcommon/zfs_fletcher_intel.c
+@@ -156,7 +156,7 @@ fletcher_4_avx2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
+ 
+ static boolean_t fletcher_4_avx2_valid(void)
+ {
+-      return (zfs_avx_available() && zfs_avx2_available());
++      return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available());
+ }
+ 
+ const fletcher_4_ops_t fletcher_4_avx2_ops = {
+diff --git a/module/zcommon/zfs_fletcher_sse.c b/module/zcommon/zfs_fletcher_sse.c
+index a0b42e5f5..e6389d6e5 100644
+--- a/module/zcommon/zfs_fletcher_sse.c
++++ b/module/zcommon/zfs_fletcher_sse.c
+@@ -157,7 +157,7 @@ fletcher_4_sse2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
+ 
+ static boolean_t fletcher_4_sse2_valid(void)
+ {
+-      return (zfs_sse2_available());
++      return (kfpu_allowed() && zfs_sse2_available());
+ }
+ 
+ const fletcher_4_ops_t fletcher_4_sse2_ops = {
+@@ -214,7 +214,8 @@ fletcher_4_ssse3_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
+ 
+ static boolean_t fletcher_4_ssse3_valid(void)
+ {
+-      return (zfs_sse2_available() && zfs_ssse3_available());
++      return (kfpu_allowed() && zfs_sse2_available() &&
++          zfs_ssse3_available());
+ }
+ 
+ const fletcher_4_ops_t fletcher_4_ssse3_ops = {
+diff --git a/module/zfs/vdev_raidz_math.c b/module/zfs/vdev_raidz_math.c
+index 3ef67768f..ef514e9e1 100644
+--- a/module/zfs/vdev_raidz_math.c
++++ b/module/zfs/vdev_raidz_math.c
+@@ -27,9 +27,9 @@
+ #include <sys/zio.h>
+ #include <sys/debug.h>
+ #include <sys/zfs_debug.h>
+-
+ #include <sys/vdev_raidz.h>
+ #include <sys/vdev_raidz_impl.h>
++#include <linux/simd.h>
+ 
+ extern boolean_t raidz_will_scalar_work(void);
+ 
+@@ -87,6 +87,7 @@ static uint32_t user_sel_impl = IMPL_FASTEST;
+ static size_t raidz_supp_impl_cnt = 0;
+ static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)];
+ 
++#if defined(_KERNEL)
+ /*
+  * kstats values for supported implementations
+  * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s]
+@@ -95,14 +96,19 @@ static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1];
+ 
+ /* kstat for benchmarked implementations */
+ static kstat_t *raidz_math_kstat = NULL;
++#endif
+ 
+ /*
+- * Selects the raidz operation for raidz_map
+- * If rm_ops is set to NULL original raidz implementation will be used
++ * Returns the RAIDZ operations for raidz_map() parity calculations.   When
++ * a SIMD implementation is not allowed in the current context, then fallback
++ * to the fastest generic implementation.
+  */
+-raidz_impl_ops_t *
+-vdev_raidz_math_get_ops()
++const raidz_impl_ops_t *
++vdev_raidz_math_get_ops(void)
+ {
++      if (!kfpu_allowed())
++              return (&vdev_raidz_scalar_impl);
++
+       raidz_impl_ops_t *ops = NULL;
+       const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);
+ 
+@@ -111,18 +117,14 @@ vdev_raidz_math_get_ops()
+               ASSERT(raidz_math_initialized);
+               ops = &vdev_raidz_fastest_impl;
+               break;
+-#if !defined(_KERNEL)
+       case IMPL_CYCLE:
+-      {
++              /* Cycle through all supported implementations */
+               ASSERT(raidz_math_initialized);
+               ASSERT3U(raidz_supp_impl_cnt, >, 0);
+-              /* Cycle through all supported implementations */
+               static size_t cycle_impl_idx = 0;
+               size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt;
+               ops = raidz_supp_impl[idx];
+-      }
+-      break;
+-#endif
++              break;
+       case IMPL_ORIGINAL:
+               ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl;
+               break;
+@@ -273,6 +275,8 @@ const char *raidz_rec_name[] = {
+       "rec_pq", "rec_pr", "rec_qr", "rec_pqr"
+ };
+ 
++#if defined(_KERNEL)
++
+ #define       RAIDZ_KSTAT_LINE_LEN    (17 + 10*12 + 1)
+ 
+ static int
+@@ -435,21 +439,21 @@ benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn)
+               }
+       }
+ }
++#endif
+ 
+-void
+-vdev_raidz_math_init(void)
++/*
++ * Initialize and benchmark all supported implementations.
++ */
++static void
++benchmark_raidz(void *arg)
+ {
+       raidz_impl_ops_t *curr_impl;
+-      zio_t *bench_zio = NULL;
+-      raidz_map_t *bench_rm = NULL;
+-      uint64_t bench_parity;
+-      int i, c, fn;
++      int i, c;
+ 
+-      /* move supported impl into raidz_supp_impl */
++      /* Move supported impl into raidz_supp_impl */
+       for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
+               curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i];
+ 
+-              /* initialize impl */
+               if (curr_impl->init)
+                       curr_impl->init();
+ 
+@@ -459,18 +463,10 @@ vdev_raidz_math_init(void)
+       membar_producer();              /* complete raidz_supp_impl[] init */
+       raidz_supp_impl_cnt = c;        /* number of supported impl */
+ 
+-#if !defined(_KERNEL)
+-      /* Skip benchmarking and use last implementation as fastest */
+-      memcpy(&vdev_raidz_fastest_impl, raidz_supp_impl[raidz_supp_impl_cnt-1],
+-          sizeof (vdev_raidz_fastest_impl));
+-      strcpy(vdev_raidz_fastest_impl.name, "fastest");
+-
+-      raidz_math_initialized = B_TRUE;
+-
+-      /* Use 'cycle' math selection method for userspace */
+-      VERIFY0(vdev_raidz_impl_set("cycle"));
+-      return;
+-#endif
++#if defined(_KERNEL)
++      zio_t *bench_zio = NULL;
++      raidz_map_t *bench_rm = NULL;
++      uint64_t bench_parity;
+ 
+       /* Fake a zio and run the benchmark on a warmed up buffer */
+       bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
+@@ -480,7 +476,7 @@ vdev_raidz_math_init(void)
+       memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE);
+ 
+       /* Benchmark parity generation methods */
+-      for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
++      for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
+               bench_parity = fn + 1;
+               /* New raidz_map is needed for each generate_p/q/r */
+               bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
+@@ -495,7 +491,7 @@ vdev_raidz_math_init(void)
+       bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
+           BENCH_COLS, PARITY_PQR);
+ 
+-      for (fn = 0; fn < RAIDZ_REC_NUM; fn++)
++      for (int fn = 0; fn < RAIDZ_REC_NUM; fn++)
+               benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl);
+ 
+       vdev_raidz_map_free(bench_rm);
+@@ -503,11 +499,39 @@ vdev_raidz_math_init(void)
+       /* cleanup the bench zio */
+       abd_free(bench_zio->io_abd);
+       kmem_free(bench_zio, sizeof (zio_t));
++#else
++      /*
++       * Skip the benchmark in user space to avoid impacting libzpool
++       * consumers (zdb, zhack, zinject, ztest).  The last implementation
++       * is assumed to be the fastest and used by default.
++       */
++      memcpy(&vdev_raidz_fastest_impl,
++          raidz_supp_impl[raidz_supp_impl_cnt - 1],
++          sizeof (vdev_raidz_fastest_impl));
++      strcpy(vdev_raidz_fastest_impl.name, "fastest");
++#endif /* _KERNEL */
++}
+ 
+-      /* install kstats for all impl */
++void
++vdev_raidz_math_init(void)
++{
++#if defined(_KERNEL)
++      /*
++       * For 5.0 and later Linux kernels the raidz benchmarks are run in
++       * a kernel thread.  This is needed to take advantage of the
++       * SIMD functionality, see include/linux/simd_x86.h for details.
++       */
++      taskqid_t id = taskq_dispatch(system_taskq, benchmark_raidz,
++          NULL, TQ_SLEEP);
++      if (id != TASKQID_INVALID) {
++              taskq_wait_id(system_taskq, id);
++      } else {
++              benchmark_raidz(NULL);
++      }
++
++      /* Install kstats for all implementations */
+       raidz_math_kstat = kstat_create("zfs", 0, "vdev_raidz_bench", "misc",
+           KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+-
+       if (raidz_math_kstat != NULL) {
+               raidz_math_kstat->ks_data = NULL;
+               raidz_math_kstat->ks_ndata = UINT32_MAX;
+@@ -517,6 +541,9 @@ vdev_raidz_math_init(void)
+                   raidz_math_kstat_addr);
+               kstat_install(raidz_math_kstat);
+       }
++#else
++      benchmark_raidz(NULL);
++#endif
+ 
+       /* Finish initialization */
+       atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl);
+@@ -527,15 +554,15 @@ void
+ vdev_raidz_math_fini(void)
+ {
+       raidz_impl_ops_t const *curr_impl;
+-      int i;
+ 
++#if defined(_KERNEL)
+       if (raidz_math_kstat != NULL) {
+               kstat_delete(raidz_math_kstat);
+               raidz_math_kstat = NULL;
+       }
++#endif
+ 
+-      /* fini impl */
+-      for (i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
++      for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
+               curr_impl = raidz_all_maths[i];
+               if (curr_impl->fini)
+                       curr_impl->fini();
+@@ -546,9 +573,7 @@ static const struct {
+       char *name;
+       uint32_t sel;
+ } math_impl_opts[] = {
+-#if !defined(_KERNEL)
+               { "cycle",      IMPL_CYCLE },
+-#endif
+               { "fastest",    IMPL_FASTEST },
+               { "original",   IMPL_ORIGINAL },
+               { "scalar",     IMPL_SCALAR }
+diff --git a/module/zfs/vdev_raidz_math_aarch64_neon.c b/module/zfs/vdev_raidz_math_aarch64_neon.c
+index e3ad06776..0a67ceb84 100644
+--- a/module/zfs/vdev_raidz_math_aarch64_neon.c
++++ b/module/zfs/vdev_raidz_math_aarch64_neon.c
+@@ -207,7 +207,7 @@ DEFINE_REC_METHODS(aarch64_neon);
+ static boolean_t
+ raidz_will_aarch64_neon_work(void)
+ {
+-      return (B_TRUE); // __arch64__ requires NEON
++      return (kfpu_allowed());
+ }
+ 
+ const raidz_impl_ops_t vdev_raidz_aarch64_neon_impl = {
+diff --git a/module/zfs/vdev_raidz_math_aarch64_neonx2.c b/module/zfs/vdev_raidz_math_aarch64_neonx2.c
+index f8688a06a..e072f51cd 100644
+--- a/module/zfs/vdev_raidz_math_aarch64_neonx2.c
++++ b/module/zfs/vdev_raidz_math_aarch64_neonx2.c
+@@ -217,7 +217,7 @@ DEFINE_REC_METHODS(aarch64_neonx2);
+ static boolean_t
+ raidz_will_aarch64_neonx2_work(void)
+ {
+-      return (B_TRUE); // __arch64__ requires NEON
++      return (kfpu_allowed());
+ }
+ 
+ const raidz_impl_ops_t vdev_raidz_aarch64_neonx2_impl = {
+diff --git a/module/zfs/vdev_raidz_math_avx2.c b/module/zfs/vdev_raidz_math_avx2.c
+index 063d29bcd..a12eb6720 100644
+--- a/module/zfs/vdev_raidz_math_avx2.c
++++ b/module/zfs/vdev_raidz_math_avx2.c
+@@ -396,7 +396,7 @@ DEFINE_REC_METHODS(avx2);
+ static boolean_t
+ raidz_will_avx2_work(void)
+ {
+-      return (zfs_avx_available() && zfs_avx2_available());
++      return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available());
+ }
+ 
+ const raidz_impl_ops_t vdev_raidz_avx2_impl = {
+diff --git a/module/zfs/vdev_raidz_math_avx512bw.c b/module/zfs/vdev_raidz_math_avx512bw.c
+index d605653db..2f545c9ec 100644
+--- a/module/zfs/vdev_raidz_math_avx512bw.c
++++ b/module/zfs/vdev_raidz_math_avx512bw.c
+@@ -393,9 +393,8 @@ DEFINE_REC_METHODS(avx512bw);
+ static boolean_t
+ raidz_will_avx512bw_work(void)
+ {
+-      return (zfs_avx_available() &&
+-          zfs_avx512f_available() &&
+-          zfs_avx512bw_available());
++      return (kfpu_allowed() && zfs_avx_available() &&
++          zfs_avx512f_available() && zfs_avx512bw_available());
+ }
+ 
+ const raidz_impl_ops_t vdev_raidz_avx512bw_impl = {
+diff --git a/module/zfs/vdev_raidz_math_avx512f.c b/module/zfs/vdev_raidz_math_avx512f.c
+index f4e4560ce..75af7a8ee 100644
+--- a/module/zfs/vdev_raidz_math_avx512f.c
++++ b/module/zfs/vdev_raidz_math_avx512f.c
+@@ -470,9 +470,8 @@ DEFINE_REC_METHODS(avx512f);
+ static boolean_t
+ raidz_will_avx512f_work(void)
+ {
+-      return (zfs_avx_available() &&
+-          zfs_avx2_available() &&
+-          zfs_avx512f_available());
++      return (kfpu_allowed() && zfs_avx_available() &&
++          zfs_avx2_available() && zfs_avx512f_available());
+ }
+ 
+ const raidz_impl_ops_t vdev_raidz_avx512f_impl = {
+diff --git a/module/zfs/vdev_raidz_math_sse2.c b/module/zfs/vdev_raidz_math_sse2.c
+index 9985da273..5b3a9385c 100644
+--- a/module/zfs/vdev_raidz_math_sse2.c
++++ b/module/zfs/vdev_raidz_math_sse2.c
+@@ -607,7 +607,7 @@ DEFINE_REC_METHODS(sse2);
+ static boolean_t
+ raidz_will_sse2_work(void)
+ {
+-      return (zfs_sse_available() && zfs_sse2_available());
++      return (kfpu_allowed() && zfs_sse_available() && zfs_sse2_available());
+ }
+ 
+ const raidz_impl_ops_t vdev_raidz_sse2_impl = {
+diff --git a/module/zfs/vdev_raidz_math_ssse3.c b/module/zfs/vdev_raidz_math_ssse3.c
+index 047a48d54..62247cf8e 100644
+--- a/module/zfs/vdev_raidz_math_ssse3.c
++++ b/module/zfs/vdev_raidz_math_ssse3.c
+@@ -399,8 +399,8 @@ DEFINE_REC_METHODS(ssse3);
+ static boolean_t
+ raidz_will_ssse3_work(void)
+ {
+-      return (zfs_sse_available() && zfs_sse2_available() &&
+-          zfs_ssse3_available());
++      return (kfpu_allowed() && zfs_sse_available() &&
++          zfs_sse2_available() && zfs_ssse3_available());
+ }
+ 
+ const raidz_impl_ops_t vdev_raidz_ssse3_impl = {
diff --git a/debian/patches/0007-Fix-CONFIG_X86_DEBUG_FPU-build-failure.patch b/debian/patches/0007-Fix-CONFIG_X86_DEBUG_FPU-build-failure.patch

new file mode 100644 (file)

index 0000000..c4b2d2b
--- /dev/null
+++ b/debian/patches/0007-Fix-CONFIG_X86_DEBUG_FPU-build-failure.patch
@@ -0,0 +1,45 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Brian Behlendorf <behlendorf1@llnl.gov>
+Date: Wed, 17 Jul 2019 09:14:36 -0700
+Subject: [PATCH] Fix CONFIG_X86_DEBUG_FPU build failure
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+When CONFIG_X86_DEBUG_FPU is defined, the alternatives_patched symbol
+is pulled in as a dependency, which results in a build failure.  To
+prevent this, undefine CONFIG_X86_DEBUG_FPU to disable the WARN_ON_FPU()
+macro and rely on WARN_ON_ONCE debugging checks which were previously
+added.
+
+Reviewed-by: Tony Hutter <hutter2@llnl.gov>
+Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
+Closes #9041
+Closes #9049
+(cherry picked from commit 095b5412b31c07cad5cec74a4eb5ace011c92b27)
+Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ include/linux/simd_x86.h | 9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+diff --git a/include/linux/simd_x86.h b/include/linux/simd_x86.h
+index 641f43955..edd456098 100644
+--- a/include/linux/simd_x86.h
++++ b/include/linux/simd_x86.h
+@@ -84,6 +84,15 @@
+ 
+ #if defined(_KERNEL)
+ 
++/*
++ * Disable the WARN_ON_FPU() macro to prevent additional dependencies
++ * when providing the kfpu_* functions.  Relevant warnings are included
++ * as appropriate and are unconditionally enabled.
++ */
++#if defined(CONFIG_X86_DEBUG_FPU) && !defined(KERNEL_EXPORTS_X86_FPU)
++#undef CONFIG_X86_DEBUG_FPU
++#endif
++
+ #if defined(HAVE_KERNEL_FPU_API_HEADER)
+ #include <asm/fpu/api.h>
+ #include <asm/fpu/internal.h>
diff --git a/debian/patches/0007-Fix-race-in-parallel-mount-s-thread-dispatching-algo.patch b/debian/patches/0007-Fix-race-in-parallel-mount-s-thread-dispatching-algo.patch

deleted file mode 100644 (file)

index 571d62d..0000000
--- a/debian/patches/0007-Fix-race-in-parallel-mount-s-thread-dispatching-algo.patch
+++ /dev/null
@@ -1,244 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
-Date: Wed, 10 Jul 2019 01:31:46 +0900
-Subject: [PATCH] Fix race in parallel mount's thread dispatching algorithm
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-Strategy of parallel mount is as follows.
-
-1) Initial thread dispatching is to select sets of mount points that
- don't have dependencies on other sets, hence threads can/should run
- lock-less and shouldn't race with other threads for other sets. Each
- thread dispatched corresponds to top level directory which may or may
- not have datasets to be mounted on sub directories.
-
-2) Subsequent recursive thread dispatching for each thread from 1)
- is to mount datasets for each set of mount points. The mount points
- within each set have dependencies (i.e. child directories), so child
- directories are processed only after parent directory completes.
-
-The problem is that the initial thread dispatching in
-zfs_foreach_mountpoint() can be multi-threaded when it needs to be
-single-threaded, and this puts threads under race condition. This race
-appeared as mount/unmount issues on ZoL for ZoL having different
-timing regarding mount(2) execution due to fork(2)/exec(2) of mount(8).
-`zfs unmount -a` which expects proper mount order can't unmount if the
-mounts were reordered by the race condition.
-
-There are currently two known patterns of input list `handles` in
-`zfs_foreach_mountpoint(..,handles,..)` which cause the race condition.
-
-1) #8833 case where input is `/a /a /a/b` after sorting.
- The problem is that libzfs_path_contains() can't correctly handle an
- input list with two same top level directories.
- There is a race between two POSIX threads A and B,
-  * ThreadA for "/a" for test1 and "/a/b"
-  * ThreadB for "/a" for test0/a
- and in case of #8833, ThreadA won the race. Two threads were created
- because "/a" wasn't considered as `"/a" contains "/a"`.
-
-2) #8450 case where input is `/ /var/data /var/data/test` after sorting.
- The problem is that libzfs_path_contains() can't correctly handle an
- input list containing "/".
- There is a race between two POSIX threads A and B,
-  * ThreadA for "/" and "/var/data/test"
-  * ThreadB for "/var/data"
- and in case of #8450, ThreadA won the race. Two threads were created
- because "/var/data" wasn't considered as `"/" contains "/var/data"`.
- In other words, if there is (at least one) "/" in the input list,
- the initial thread dispatching must be single-threaded since every
- directory is a child of "/", meaning they all directly or indirectly
- depend on "/".
-
-In both cases, the first non_descendant_idx() call fails to correctly
-determine "path1-contains-path2", and as a result the initial thread
-dispatching creates another thread when it needs to be single-threaded.
-Fix a conditional in libzfs_path_contains() to consider above two.
-
-Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
-Reviewed by: Sebastien Roy <sebastien.roy@delphix.com>
-Signed-off-by: Tomohiro Kusumi <kusumi.tomohiro@gmail.com>
-Closes #8450
-Closes #8833
-Closes #8878
-(cherry picked from commit ab5036df1ccbe1b18c1ce6160b5829e8039d94ce)
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- .../functional/cli_root/zfs_mount/Makefile.am |   1 +
- lib/libzfs/libzfs_mount.c                     |   6 +-
- tests/runfiles/linux.run                      |   3 +-
- .../cli_root/zfs_mount/zfs_mount_test_race.sh | 116 ++++++++++++++++++
- 4 files changed, 123 insertions(+), 3 deletions(-)
- create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_test_race.sh
-
-diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile.am
-index b2de98934..c208a1c37 100644
---- a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile.am
-+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile.am
-@@ -19,6 +19,7 @@ dist_pkgdata_SCRIPTS = \
-       zfs_mount_all_mountpoints.ksh \
-       zfs_mount_encrypted.ksh \
-       zfs_mount_remount.ksh \
-+      zfs_mount_test_race.sh \
-       zfs_multi_mount.ksh
- 
- dist_pkgdata_DATA = \
-diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c
-index 649c232aa..d62801cfd 100644
---- a/lib/libzfs/libzfs_mount.c
-+++ b/lib/libzfs/libzfs_mount.c
-@@ -1302,12 +1302,14 @@ mountpoint_cmp(const void *arga, const void *argb)
- }
- 
- /*
-- * Return true if path2 is a child of path1.
-+ * Return true if path2 is a child of path1 or path2 equals path1 or
-+ * path1 is "/" (path2 is always a child of "/").
-  */
- static boolean_t
- libzfs_path_contains(const char *path1, const char *path2)
- {
--      return (strstr(path2, path1) == path2 && path2[strlen(path1)] == '/');
-+      return (strcmp(path1, path2) == 0 || strcmp(path1, "/") == 0 ||
-+          (strstr(path2, path1) == path2 && path2[strlen(path1)] == '/'));
- }
- 
- /*
-diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run
-index 22fc26212..4d673cc95 100644
---- a/tests/runfiles/linux.run
-+++ b/tests/runfiles/linux.run
-@@ -182,7 +182,8 @@ tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos',
-     'zfs_mount_007_pos', 'zfs_mount_008_pos', 'zfs_mount_009_neg',
-     'zfs_mount_010_neg', 'zfs_mount_011_neg', 'zfs_mount_012_neg',
-     'zfs_mount_all_001_pos', 'zfs_mount_encrypted', 'zfs_mount_remount',
--    'zfs_multi_mount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints']
-+    'zfs_multi_mount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints',
-+    'zfs_mount_test_race']
- tags = ['functional', 'cli_root', 'zfs_mount']
- 
- [tests/functional/cli_root/zfs_program]
-diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_test_race.sh b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_test_race.sh
-new file mode 100755
-index 000000000..404770b27
---- /dev/null
-+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_test_race.sh
-@@ -0,0 +1,116 @@
-+#!/bin/ksh
-+
-+#
-+# This file and its contents are supplied under the terms of the
-+# Common Development and Distribution License ("CDDL"), version 1.0.
-+# You may only use this file in accordance with the terms of version
-+# 1.0 of the CDDL.
-+#
-+# A full copy of the text of the CDDL should have accompanied this
-+# source.  A copy of the CDDL is also available via the Internet at
-+# http://www.illumos.org/license/CDDL.
-+#
-+
-+#
-+# Copyright (c) 2019 by Tomohiro Kusumi. All rights reserved.
-+#
-+
-+. $STF_SUITE/include/libtest.shlib
-+. $STF_SUITE/tests/functional/cli_root/zfs_mount/zfs_mount.cfg
-+
-+#
-+# DESCRIPTION:
-+# Verify parallel mount ordering is consistent.
-+#
-+# There was a bug in initial thread dispatching algorithm which put threads
-+# under race condition which resulted in undefined mount order.  The purpose
-+# of this test is to verify `zfs unmount -a` succeeds (not `zfs mount -a`
-+# succeeds, it always does) after `zfs mount -a`, which could fail if threads
-+# race.  See github.com/zfsonlinux/zfs/issues/{8450,8833,8878} for details.
-+#
-+# STRATEGY:
-+# 1. Create pools and filesystems.
-+# 2. Set same mount point for >1 datasets.
-+# 3. Unmount all datasets.
-+# 4. Mount all datasets.
-+# 5. Unmount all datasets (verify this succeeds).
-+#
-+
-+verify_runnable "both"
-+
-+TMPDIR=${TMPDIR:-$TEST_BASE_DIR}
-+MNTPT=$TMPDIR/zfs_mount_test_race_mntpt
-+DISK1="$TMPDIR/zfs_mount_test_race_disk1"
-+DISK2="$TMPDIR/zfs_mount_test_race_disk2"
-+
-+TESTPOOL1=zfs_mount_test_race_tp1
-+TESTPOOL2=zfs_mount_test_race_tp2
-+
-+export __ZFS_POOL_RESTRICT="$TESTPOOL1 $TESTPOOL2"
-+log_must zfs $unmountall
-+unset __ZFS_POOL_RESTRICT
-+
-+function cleanup
-+{
-+      zpool destroy $TESTPOOL1
-+      zpool destroy $TESTPOOL2
-+      rm -rf $MNTPT
-+      rm -rf /$TESTPOOL1
-+      rm -rf /$TESTPOOL2
-+      rm -f $DISK1
-+      rm -f $DISK2
-+      export __ZFS_POOL_RESTRICT="$TESTPOOL1 $TESTPOOL2"
-+      log_must zfs $mountall
-+      unset __ZFS_POOL_RESTRICT
-+}
-+log_onexit cleanup
-+
-+log_note "Verify parallel mount ordering is consistent"
-+
-+log_must truncate -s $MINVDEVSIZE $DISK1
-+log_must truncate -s $MINVDEVSIZE $DISK2
-+
-+log_must zpool create -f $TESTPOOL1 $DISK1
-+log_must zpool create -f $TESTPOOL2 $DISK2
-+
-+log_must zfs create $TESTPOOL1/$TESTFS1
-+log_must zfs create $TESTPOOL2/$TESTFS2
-+
-+log_must zfs set mountpoint=none $TESTPOOL1
-+log_must zfs set mountpoint=$MNTPT $TESTPOOL1/$TESTFS1
-+
-+# Note that unmount can fail (due to race condition on `zfs mount -a`) with or
-+# without `canmount=off`.  The race has nothing to do with canmount property,
-+# but turn it off for convenience of mount layout used in this test case.
-+log_must zfs set canmount=off $TESTPOOL2
-+log_must zfs set mountpoint=$MNTPT $TESTPOOL2
-+
-+# At this point, layout of datasets in two pools will look like below.
-+# Previously, on next `zfs mount -a`, pthreads assigned to TESTFS1 and TESTFS2
-+# could race, and TESTFS2 usually (actually always) won in ZoL.  Note that the
-+# problem is how two or more threads could initially be assigned to the same
-+# top level directory, not this specific layout.  This layout is just an example
-+# that can reproduce race, and is also the layout reported in #8833.
-+#
-+# NAME                  MOUNTED  MOUNTPOINT
-+# ----------------------------------------------
-+# /$TESTPOOL1           no       none
-+# /$TESTPOOL1/$TESTFS1  yes      $MNTPT
-+# /$TESTPOOL2           no       $MNTPT
-+# /$TESTPOOL2/$TESTFS2  yes      $MNTPT/$TESTFS2
-+
-+# Apparently two datasets must be mounted.
-+log_must ismounted $TESTPOOL1/$TESTFS1
-+log_must ismounted $TESTPOOL2/$TESTFS2
-+# This unmount always succeeds, because potential race hasn't happened yet.
-+log_must zfs unmount -a
-+# This mount always succeeds, whether threads are under race condition or not.
-+log_must zfs mount -a
-+
-+# Verify datasets are mounted (TESTFS2 fails if the race broke mount order).
-+log_must ismounted $TESTPOOL1/$TESTFS1
-+log_must ismounted $TESTPOOL2/$TESTFS2
-+# Verify unmount succeeds (fails if the race broke mount order).
-+log_must zfs unmount -a
-+
-+log_pass "Verify parallel mount ordering is consistent passed"
diff --git a/debian/patches/0008-Linux-5.0-compat-SIMD-compatibility.patch b/debian/patches/0008-Linux-5.0-compat-SIMD-compatibility.patch

deleted file mode 100644 (file)

index 9b25e0c..0000000
--- a/debian/patches/0008-Linux-5.0-compat-SIMD-compatibility.patch
+++ /dev/null
@@ -1,1615 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Brian Behlendorf <behlendorf1@llnl.gov>
-Date: Fri, 12 Jul 2019 09:31:20 -0700
-Subject: [PATCH] Linux 5.0 compat: SIMD compatibility
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-Restore the SIMD optimization for 4.19.38 LTS, 4.14.120 LTS,
-and 5.0 and newer kernels.  This is accomplished by leveraging
-the fact that by definition dedicated kernel threads never need
-to concern themselves with saving and restoring the user FPU state.
-Therefore, they may use the FPU as long as we can guarantee user
-tasks always restore their FPU state before context switching back
-to user space.
-
-For the 5.0 and 5.1 kernels disabling preemption and local
-interrupts is sufficient to allow the FPU to be used.  All non-kernel
-threads will restore the preserved user FPU state.
-
-For 5.2 and latter kernels the user FPU state restoration will be
-skipped if the kernel determines the registers have not changed.
-Therefore, for these kernels we need to perform the additional
-step of saving and restoring the FPU registers.  Invalidating the
-per-cpu global tracking the FPU state would force a restore but
-that functionality is private to the core x86 FPU implementation
-and unavailable.
-
-In practice, restricting SIMD to kernel threads is not a major
-restriction for ZFS.  The vast majority of SIMD operations are
-already performed by the IO pipeline.  The remaining cases are
-relatively infrequent and can be handled by the generic code
-without significant impact.  The two most noteworthy cases are:
-
-  1) Decrypting the wrapping key for an encrypted dataset,
-     i.e. `zfs load-key`.  All other encryption and decryption
-     operations will use the SIMD optimized implementations.
-
-  2) Generating the payload checksums for a `zfs send` stream.
-
-In order to avoid making any changes to the higher layers of ZFS
-all of the `*_get_ops()` functions were updated to take in to
-consideration the calling context.  This allows for the fastest
-implementation to be used as appropriate (see kfpu_allowed()).
-
-The only other notable instance of SIMD operations being used
-outside a kernel thread was at module load time.  This code
-was moved in to a taskq in order to accommodate the new kernel
-thread restriction.
-
-Finally, a few other modifications were made in order to further
-harden this code and facilitate testing.  They include updating
-each implementations operations structure to be declared as a
-constant.  And allowing "cycle" to be set when selecting the
-preferred ops in the kernel as well as user space.
-
-Reviewed-by: Tony Hutter <hutter2@llnl.gov>
-Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
-Closes #8754
-Closes #8793
-Closes #8965
-(cherry picked from commit e5db31349484e5e859c7a942eb15b98d68ce5b4d)
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- include/linux/Makefile.am                   |   1 +
- include/linux/simd.h                        |  41 +++++
- include/linux/simd_aarch64.h                |  18 +-
- include/linux/simd_x86.h                    | 192 +++++++++++++-------
- include/sys/vdev_raidz.h                    |   2 +-
- include/sys/vdev_raidz_impl.h               |   2 +-
- module/icp/include/aes/aes_impl.h           |   6 +-
- module/icp/include/modes/gcm_impl.h         |   6 +-
- cmd/ztest/ztest.c                           |   3 +
- module/icp/algs/aes/aes_impl.c              |  34 ++--
- module/icp/algs/aes/aes_impl_aesni.c        |   2 +-
- module/icp/algs/modes/gcm.c                 |  41 +++--
- module/icp/algs/modes/gcm_pclmulqdq.c       |   2 +-
- module/icp/io/aes.c                         |  32 +++-
- module/spl/spl-taskq.c                      |   2 +
- module/spl/spl-thread.c                     |   2 +
- module/zcommon/zfs_fletcher.c               |  88 ++++++---
- module/zcommon/zfs_fletcher_aarch64_neon.c  |   2 +-
- module/zcommon/zfs_fletcher_avx512.c        |   2 +-
- module/zcommon/zfs_fletcher_intel.c         |   2 +-
- module/zcommon/zfs_fletcher_sse.c           |   5 +-
- module/zfs/vdev_raidz_math.c                | 105 +++++++----
- module/zfs/vdev_raidz_math_aarch64_neon.c   |   2 +-
- module/zfs/vdev_raidz_math_aarch64_neonx2.c |   2 +-
- module/zfs/vdev_raidz_math_avx2.c           |   2 +-
- module/zfs/vdev_raidz_math_avx512bw.c       |   5 +-
- module/zfs/vdev_raidz_math_avx512f.c        |   5 +-
- module/zfs/vdev_raidz_math_sse2.c           |   2 +-
- module/zfs/vdev_raidz_math_ssse3.c          |   4 +-
- config/kernel-fpu.m4                        |  46 ++++-
- 30 files changed, 454 insertions(+), 204 deletions(-)
- create mode 100644 include/linux/simd.h
-
-diff --git a/include/linux/Makefile.am b/include/linux/Makefile.am
-index efb49520e..2455759e8 100644
---- a/include/linux/Makefile.am
-+++ b/include/linux/Makefile.am
-@@ -7,6 +7,7 @@ KERNEL_H = \
-       $(top_srcdir)/include/linux/blkdev_compat.h \
-       $(top_srcdir)/include/linux/utsname_compat.h \
-       $(top_srcdir)/include/linux/kmap_compat.h \
-+      $(top_srcdir)/include/linux/simd.h \
-       $(top_srcdir)/include/linux/simd_x86.h \
-       $(top_srcdir)/include/linux/simd_aarch64.h \
-       $(top_srcdir)/include/linux/mod_compat.h \
-diff --git a/include/linux/simd.h b/include/linux/simd.h
-new file mode 100644
-index 000000000..d2b60996a
---- /dev/null
-+++ b/include/linux/simd.h
-@@ -0,0 +1,41 @@
-+/*
-+ * CDDL HEADER START
-+ *
-+ * The contents of this file are subject to the terms of the
-+ * Common Development and Distribution License (the "License").
-+ * You may not use this file except in compliance with the License.
-+ *
-+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-+ * or http://www.opensolaris.org/os/licensing.
-+ * See the License for the specific language governing permissions
-+ * and limitations under the License.
-+ *
-+ * When distributing Covered Code, include this CDDL HEADER in each
-+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-+ * If applicable, add the following below this CDDL HEADER, with the
-+ * fields enclosed by brackets "[]" replaced with your own identifying
-+ * information: Portions Copyright [yyyy] [name of copyright owner]
-+ *
-+ * CDDL HEADER END
-+ */
-+/*
-+ * Copyright (C) 2019 Lawrence Livermore National Security, LLC.
-+ */
-+
-+#ifndef _SIMD_H
-+#define       _SIMD_H
-+
-+#if defined(__x86)
-+#include <linux/simd_x86.h>
-+
-+#elif defined(__aarch64__)
-+#include <linux/simd_aarch64.h>
-+#else
-+
-+#define       kfpu_allowed()          1
-+#define       kfpu_initialize(tsk)    do {} while (0)
-+#define       kfpu_begin()            do {} while (0)
-+#define       kfpu_end()              do {} while (0)
-+
-+#endif
-+#endif /* _SIMD_H */
-diff --git a/include/linux/simd_aarch64.h b/include/linux/simd_aarch64.h
-index 155ef6205..1cfcd01e4 100644
---- a/include/linux/simd_aarch64.h
-+++ b/include/linux/simd_aarch64.h
-@@ -41,20 +41,18 @@
- 
- #if defined(_KERNEL)
- #include <asm/neon.h>
--#define       kfpu_begin()            \
--{                                     \
--      kernel_neon_begin();            \
--}
--#define       kfpu_end()                      \
--{                                     \
--      kernel_neon_end();              \
--}
-+#define       kfpu_allowed()          1
-+#define       kfpu_initialize(tsk)    do {} while (0)
-+#define       kfpu_begin()            kernel_neon_begin()
-+#define       kfpu_end()              kernel_neon_end()
- #else
- /*
-  * fpu dummy methods for userspace
-  */
--#define       kfpu_begin()    do {} while (0)
--#define       kfpu_end()              do {} while (0)
-+#define       kfpu_allowed()          1
-+#define       kfpu_initialize(tsk)    do {} while (0)
-+#define       kfpu_begin()            do {} while (0)
-+#define       kfpu_end()              do {} while (0)
- #endif /* defined(_KERNEL) */
- 
- #endif /* __aarch64__ */
-diff --git a/include/linux/simd_x86.h b/include/linux/simd_x86.h
-index 12cd74677..2d7a1c3a5 100644
---- a/include/linux/simd_x86.h
-+++ b/include/linux/simd_x86.h
-@@ -90,33 +90,135 @@
- #include <asm/xcr.h>
- #endif
- 
-+/*
-+ * The following cases are for kernels which export either the
-+ * kernel_fpu_* or __kernel_fpu_* functions.
-+ */
-+#if defined(KERNEL_EXPORTS_X86_FPU)
-+
-+#define       kfpu_allowed()          1
-+#define       kfpu_initialize(tsk)    do {} while (0)
-+
- #if defined(HAVE_UNDERSCORE_KERNEL_FPU)
- #define       kfpu_begin()            \
--{                                                     \
--      preempt_disable();              \
-+{                             \
-+      preempt_disable();      \
-       __kernel_fpu_begin();   \
- }
--#define       kfpu_end()                      \
--{                                                     \
--      __kernel_fpu_end();             \
--      preempt_enable();               \
-+#define       kfpu_end()              \
-+{                             \
-+      __kernel_fpu_end();     \
-+      preempt_enable();       \
- }
-+
- #elif defined(HAVE_KERNEL_FPU)
--#define       kfpu_begin()    kernel_fpu_begin()
-+#define       kfpu_begin()            kernel_fpu_begin()
- #define       kfpu_end()              kernel_fpu_end()
-+
- #else
--/* Kernel doesn't export any kernel_fpu_* functions */
--#include <asm/fpu/internal.h> /* For kernel xgetbv() */
--#define       kfpu_begin()    panic("This code should never run")
--#define       kfpu_end()      panic("This code should never run")
--#endif /* defined(HAVE_KERNEL_FPU) */
-+/*
-+ * This case is unreachable.  When KERNEL_EXPORTS_X86_FPU is defined then
-+ * either HAVE_UNDERSCORE_KERNEL_FPU or HAVE_KERNEL_FPU must be defined.
-+ */
-+#error "Unreachable kernel configuration"
-+#endif
-+
-+#else /* defined(KERNEL_EXPORTS_X86_FPU) */
-+/*
-+ * When the kernel_fpu_* symbols are unavailable then provide our own
-+ * versions which allow the FPU to be safely used in kernel threads.
-+ * In practice, this is not a significant restriction for ZFS since the
-+ * vast majority of SIMD operations are performed by the IO pipeline.
-+ */
- 
-+/*
-+ * Returns non-zero if FPU operations are allowed in the current context.
-+ */
-+#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
-+#define       kfpu_allowed()          ((current->flags & PF_KTHREAD) && \
-+                              test_thread_flag(TIF_NEED_FPU_LOAD))
-+#elif defined(HAVE_KERNEL_FPU_INITIALIZED)
-+#define       kfpu_allowed()          ((current->flags & PF_KTHREAD) && \
-+                              current->thread.fpu.initialized)
- #else
-+#define       kfpu_allowed()          0
-+#endif
-+
-+static inline void
-+kfpu_initialize(void)
-+{
-+      WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
-+
-+#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
-+      __fpu_invalidate_fpregs_state(&current->thread.fpu);
-+      set_thread_flag(TIF_NEED_FPU_LOAD);
-+#elif defined(HAVE_KERNEL_FPU_INITIALIZED)
-+      __fpu_invalidate_fpregs_state(&current->thread.fpu);
-+      current->thread.fpu.initialized = 1;
-+#endif
-+}
-+
-+static inline void
-+kfpu_begin(void)
-+{
-+      WARN_ON_ONCE(!kfpu_allowed());
-+
-+      /*
-+       * Preemption and interrupts must be disabled for the critical
-+       * region where the FPU state is being modified.
-+       */
-+      preempt_disable();
-+      local_irq_disable();
-+
-+#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
-+      /*
-+       * The current FPU registers need to be preserved by kfpu_begin()
-+       * and restored by kfpu_end().  This is required because we can
-+       * not call __cpu_invalidate_fpregs_state() to invalidate the
-+       * per-cpu FPU state and force them to be restored during a
-+       * context switch.
-+       */
-+      copy_fpregs_to_fpstate(&current->thread.fpu);
-+#elif defined(HAVE_KERNEL_FPU_INITIALIZED)
-+      /*
-+       * There is no need to preserve and restore the FPU registers.
-+       * They will always be restored from the task's stored FPU state
-+       * when switching contexts.
-+       */
-+      WARN_ON_ONCE(current->thread.fpu.initialized == 0);
-+#endif
-+}
-+
-+static inline void
-+kfpu_end(void)
-+{
-+#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
-+      union fpregs_state *state = &current->thread.fpu.state;
-+      int error;
-+
-+      if (use_xsave()) {
-+              error = copy_kernel_to_xregs_err(&state->xsave, -1);
-+      } else if (use_fxsr()) {
-+              error = copy_kernel_to_fxregs_err(&state->fxsave);
-+      } else {
-+              error = copy_kernel_to_fregs_err(&state->fsave);
-+      }
-+      WARN_ON_ONCE(error);
-+#endif
-+
-+      local_irq_enable();
-+      preempt_enable();
-+}
-+#endif /* defined(HAVE_KERNEL_FPU) */
-+
-+#else /* defined(_KERNEL) */
- /*
-- * fpu dummy methods for userspace
-+ * FPU dummy methods for user space.
-  */
--#define       kfpu_begin()    do {} while (0)
--#define       kfpu_end()              do {} while (0)
-+#define       kfpu_allowed()          1
-+#define       kfpu_initialize(tsk)    do {} while (0)
-+#define       kfpu_begin()            do {} while (0)
-+#define       kfpu_end()              do {} while (0)
- #endif /* defined(_KERNEL) */
- 
- /*
-@@ -298,7 +400,7 @@ __simd_state_enabled(const uint64_t state)
-       uint64_t xcr0;
- 
- #if defined(_KERNEL)
--#if defined(X86_FEATURE_OSXSAVE) && defined(KERNEL_EXPORTS_X86_FPU)
-+#if defined(X86_FEATURE_OSXSAVE)
-       has_osxsave = !!boot_cpu_has(X86_FEATURE_OSXSAVE);
- #else
-       has_osxsave = B_FALSE;
-@@ -328,11 +430,7 @@ static inline boolean_t
- zfs_sse_available(void)
- {
- #if defined(_KERNEL)
--#if defined(KERNEL_EXPORTS_X86_FPU)
-       return (!!boot_cpu_has(X86_FEATURE_XMM));
--#else
--      return (B_FALSE);
--#endif
- #elif !defined(_KERNEL)
-       return (__cpuid_has_sse());
- #endif
-@@ -345,11 +443,7 @@ static inline boolean_t
- zfs_sse2_available(void)
- {
- #if defined(_KERNEL)
--#if defined(KERNEL_EXPORTS_X86_FPU)
-       return (!!boot_cpu_has(X86_FEATURE_XMM2));
--#else
--      return (B_FALSE);
--#endif
- #elif !defined(_KERNEL)
-       return (__cpuid_has_sse2());
- #endif
-@@ -362,11 +456,7 @@ static inline boolean_t
- zfs_sse3_available(void)
- {
- #if defined(_KERNEL)
--#if defined(KERNEL_EXPORTS_X86_FPU)
-       return (!!boot_cpu_has(X86_FEATURE_XMM3));
--#else
--      return (B_FALSE);
--#endif
- #elif !defined(_KERNEL)
-       return (__cpuid_has_sse3());
- #endif
-@@ -379,11 +469,7 @@ static inline boolean_t
- zfs_ssse3_available(void)
- {
- #if defined(_KERNEL)
--#if defined(KERNEL_EXPORTS_X86_FPU)
-       return (!!boot_cpu_has(X86_FEATURE_SSSE3));
--#else
--      return (B_FALSE);
--#endif
- #elif !defined(_KERNEL)
-       return (__cpuid_has_ssse3());
- #endif
-@@ -396,11 +482,7 @@ static inline boolean_t
- zfs_sse4_1_available(void)
- {
- #if defined(_KERNEL)
--#if defined(KERNEL_EXPORTS_X86_FPU)
-       return (!!boot_cpu_has(X86_FEATURE_XMM4_1));
--#else
--      return (B_FALSE);
--#endif
- #elif !defined(_KERNEL)
-       return (__cpuid_has_sse4_1());
- #endif
-@@ -413,11 +495,7 @@ static inline boolean_t
- zfs_sse4_2_available(void)
- {
- #if defined(_KERNEL)
--#if defined(KERNEL_EXPORTS_X86_FPU)
-       return (!!boot_cpu_has(X86_FEATURE_XMM4_2));
--#else
--      return (B_FALSE);
--#endif
- #elif !defined(_KERNEL)
-       return (__cpuid_has_sse4_2());
- #endif
-@@ -431,11 +509,7 @@ zfs_avx_available(void)
- {
-       boolean_t has_avx;
- #if defined(_KERNEL)
--#if defined(KERNEL_EXPORTS_X86_FPU)
-       has_avx = !!boot_cpu_has(X86_FEATURE_AVX);
--#else
--      has_avx = B_FALSE;
--#endif
- #elif !defined(_KERNEL)
-       has_avx = __cpuid_has_avx();
- #endif
-@@ -451,11 +525,7 @@ zfs_avx2_available(void)
- {
-       boolean_t has_avx2;
- #if defined(_KERNEL)
--#if defined(X86_FEATURE_AVX2) && defined(KERNEL_EXPORTS_X86_FPU)
-       has_avx2 = !!boot_cpu_has(X86_FEATURE_AVX2);
--#else
--      has_avx2 = B_FALSE;
--#endif
- #elif !defined(_KERNEL)
-       has_avx2 = __cpuid_has_avx2();
- #endif
-@@ -470,7 +540,7 @@ static inline boolean_t
- zfs_bmi1_available(void)
- {
- #if defined(_KERNEL)
--#if defined(X86_FEATURE_BMI1) && defined(KERNEL_EXPORTS_X86_FPU)
-+#if defined(X86_FEATURE_BMI1)
-       return (!!boot_cpu_has(X86_FEATURE_BMI1));
- #else
-       return (B_FALSE);
-@@ -487,7 +557,7 @@ static inline boolean_t
- zfs_bmi2_available(void)
- {
- #if defined(_KERNEL)
--#if defined(X86_FEATURE_BMI2) && defined(KERNEL_EXPORTS_X86_FPU)
-+#if defined(X86_FEATURE_BMI2)
-       return (!!boot_cpu_has(X86_FEATURE_BMI2));
- #else
-       return (B_FALSE);
-@@ -504,7 +574,7 @@ static inline boolean_t
- zfs_aes_available(void)
- {
- #if defined(_KERNEL)
--#if defined(X86_FEATURE_AES) && defined(KERNEL_EXPORTS_X86_FPU)
-+#if defined(X86_FEATURE_AES)
-       return (!!boot_cpu_has(X86_FEATURE_AES));
- #else
-       return (B_FALSE);
-@@ -521,7 +591,7 @@ static inline boolean_t
- zfs_pclmulqdq_available(void)
- {
- #if defined(_KERNEL)
--#if defined(X86_FEATURE_PCLMULQDQ) && defined(KERNEL_EXPORTS_X86_FPU)
-+#if defined(X86_FEATURE_PCLMULQDQ)
-       return (!!boot_cpu_has(X86_FEATURE_PCLMULQDQ));
- #else
-       return (B_FALSE);
-@@ -555,7 +625,7 @@ zfs_avx512f_available(void)
-       boolean_t has_avx512 = B_FALSE;
- 
- #if defined(_KERNEL)
--#if defined(X86_FEATURE_AVX512F) && defined(KERNEL_EXPORTS_X86_FPU)
-+#if defined(X86_FEATURE_AVX512F)
-       has_avx512 = !!boot_cpu_has(X86_FEATURE_AVX512F);
- #else
-       has_avx512 = B_FALSE;
-@@ -574,7 +644,7 @@ zfs_avx512cd_available(void)
-       boolean_t has_avx512 = B_FALSE;
- 
- #if defined(_KERNEL)
--#if defined(X86_FEATURE_AVX512CD) && defined(KERNEL_EXPORTS_X86_FPU)
-+#if defined(X86_FEATURE_AVX512CD)
-       has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
-           boot_cpu_has(X86_FEATURE_AVX512CD);
- #else
-@@ -594,7 +664,7 @@ zfs_avx512er_available(void)
-       boolean_t has_avx512 = B_FALSE;
- 
- #if defined(_KERNEL)
--#if defined(X86_FEATURE_AVX512ER) && defined(KERNEL_EXPORTS_X86_FPU)
-+#if defined(X86_FEATURE_AVX512ER)
-       has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
-           boot_cpu_has(X86_FEATURE_AVX512ER);
- #else
-@@ -614,7 +684,7 @@ zfs_avx512pf_available(void)
-       boolean_t has_avx512 = B_FALSE;
- 
- #if defined(_KERNEL)
--#if defined(X86_FEATURE_AVX512PF) && defined(KERNEL_EXPORTS_X86_FPU)
-+#if defined(X86_FEATURE_AVX512PF)
-       has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
-           boot_cpu_has(X86_FEATURE_AVX512PF);
- #else
-@@ -634,7 +704,7 @@ zfs_avx512bw_available(void)
-       boolean_t has_avx512 = B_FALSE;
- 
- #if defined(_KERNEL)
--#if defined(X86_FEATURE_AVX512BW) && defined(KERNEL_EXPORTS_X86_FPU)
-+#if defined(X86_FEATURE_AVX512BW)
-       has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
-           boot_cpu_has(X86_FEATURE_AVX512BW);
- #else
-@@ -654,7 +724,7 @@ zfs_avx512dq_available(void)
-       boolean_t has_avx512 = B_FALSE;
- 
- #if defined(_KERNEL)
--#if defined(X86_FEATURE_AVX512DQ) && defined(KERNEL_EXPORTS_X86_FPU)
-+#if defined(X86_FEATURE_AVX512DQ)
-       has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
-           boot_cpu_has(X86_FEATURE_AVX512DQ);
- #else
-@@ -674,7 +744,7 @@ zfs_avx512vl_available(void)
-       boolean_t has_avx512 = B_FALSE;
- 
- #if defined(_KERNEL)
--#if defined(X86_FEATURE_AVX512VL) && defined(KERNEL_EXPORTS_X86_FPU)
-+#if defined(X86_FEATURE_AVX512VL)
-       has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
-           boot_cpu_has(X86_FEATURE_AVX512VL);
- #else
-@@ -694,7 +764,7 @@ zfs_avx512ifma_available(void)
-       boolean_t has_avx512 = B_FALSE;
- 
- #if defined(_KERNEL)
--#if defined(X86_FEATURE_AVX512IFMA) && defined(KERNEL_EXPORTS_X86_FPU)
-+#if defined(X86_FEATURE_AVX512IFMA)
-       has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
-           boot_cpu_has(X86_FEATURE_AVX512IFMA);
- #else
-@@ -714,7 +784,7 @@ zfs_avx512vbmi_available(void)
-       boolean_t has_avx512 = B_FALSE;
- 
- #if defined(_KERNEL)
--#if defined(X86_FEATURE_AVX512VBMI) && defined(KERNEL_EXPORTS_X86_FPU)
-+#if defined(X86_FEATURE_AVX512VBMI)
-       has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
-           boot_cpu_has(X86_FEATURE_AVX512VBMI);
- #else
-diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h
-index 2ce32469d..0ce2b5ea1 100644
---- a/include/sys/vdev_raidz.h
-+++ b/include/sys/vdev_raidz.h
-@@ -51,7 +51,7 @@ int vdev_raidz_reconstruct(struct raidz_map *, const int *, int);
-  */
- void vdev_raidz_math_init(void);
- void vdev_raidz_math_fini(void);
--struct raidz_impl_ops *vdev_raidz_math_get_ops(void);
-+const struct raidz_impl_ops *vdev_raidz_math_get_ops(void);
- int vdev_raidz_math_generate(struct raidz_map *);
- int vdev_raidz_math_reconstruct(struct raidz_map *, const int *, const int *,
-     const int);
-diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h
-index 0799ed19d..4969d110b 100644
---- a/include/sys/vdev_raidz_impl.h
-+++ b/include/sys/vdev_raidz_impl.h
-@@ -126,7 +126,7 @@ typedef struct raidz_map {
-       uintptr_t rm_reports;           /* # of referencing checksum reports */
-       uint8_t rm_freed;               /* map no longer has referencing ZIO */
-       uint8_t rm_ecksuminjected;      /* checksum error was injected */
--      raidz_impl_ops_t *rm_ops;       /* RAIDZ math operations */
-+      const raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */
-       raidz_col_t rm_col[1];          /* Flexible array of I/O columns */
- } raidz_map_t;
- 
-diff --git a/module/icp/include/aes/aes_impl.h b/module/icp/include/aes/aes_impl.h
-index 95cfddf9e..9fd9c1bd1 100644
---- a/module/icp/include/aes/aes_impl.h
-+++ b/module/icp/include/aes/aes_impl.h
-@@ -198,12 +198,12 @@ extern const aes_impl_ops_t aes_aesni_impl;
- /*
-  * Initializes fastest implementation
-  */
--void aes_impl_init(void);
-+void aes_impl_init(void *arg);
- 
- /*
-- * Get selected aes implementation
-+ * Returns optimal allowed AES implementation
-  */
--struct aes_impl_ops *aes_impl_get_ops(void);
-+const struct aes_impl_ops *aes_impl_get_ops(void);
- 
- #ifdef        __cplusplus
- }
-diff --git a/module/icp/include/modes/gcm_impl.h b/module/icp/include/modes/gcm_impl.h
-index cbb904c05..138090487 100644
---- a/module/icp/include/modes/gcm_impl.h
-+++ b/module/icp/include/modes/gcm_impl.h
-@@ -61,12 +61,12 @@ extern const gcm_impl_ops_t gcm_pclmulqdq_impl;
- /*
-  * Initializes fastest implementation
-  */
--void gcm_impl_init(void);
-+void gcm_impl_init(void *arg);
- 
- /*
-- * Get selected aes implementation
-+ * Returns optimal allowed GCM implementation
-  */
--struct gcm_impl_ops *gcm_impl_get_ops(void);
-+const struct gcm_impl_ops *gcm_impl_get_ops(void);
- 
- #ifdef        __cplusplus
- }
-diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
-index 9c2cf9501..815462443 100644
---- a/cmd/ztest/ztest.c
-+++ b/cmd/ztest/ztest.c
-@@ -107,6 +107,7 @@
- #include <sys/vdev_impl.h>
- #include <sys/vdev_file.h>
- #include <sys/vdev_initialize.h>
-+#include <sys/vdev_raidz.h>
- #include <sys/vdev_trim.h>
- #include <sys/spa_impl.h>
- #include <sys/metaslab_impl.h>
-@@ -7094,6 +7095,8 @@ ztest_run(ztest_shared_t *zs)
-       metaslab_preload_limit = ztest_random(20) + 1;
-       ztest_spa = spa;
- 
-+      VERIFY0(vdev_raidz_impl_set("cycle"));
-+
-       dmu_objset_stats_t dds;
-       VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool,
-           DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os));
-diff --git a/module/icp/algs/aes/aes_impl.c b/module/icp/algs/aes/aes_impl.c
-index e15050635..457b9e45c 100644
---- a/module/icp/algs/aes/aes_impl.c
-+++ b/module/icp/algs/aes/aes_impl.c
-@@ -27,6 +27,7 @@
- #include <sys/crypto/spi.h>
- #include <modes/modes.h>
- #include <aes/aes_impl.h>
-+#include <linux/simd.h>
- 
- /*
-  * Initialize AES encryption and decryption key schedules.
-@@ -40,9 +41,9 @@
- void
- aes_init_keysched(const uint8_t *cipherKey, uint_t keyBits, void *keysched)
- {
--      aes_impl_ops_t  *ops = aes_impl_get_ops();
--      aes_key_t       *newbie = keysched;
--      uint_t          keysize, i, j;
-+      const aes_impl_ops_t *ops = aes_impl_get_ops();
-+      aes_key_t *newbie = keysched;
-+      uint_t keysize, i, j;
-       union {
-               uint64_t        ka64[4];
-               uint32_t        ka32[8];
-@@ -252,12 +253,17 @@ static size_t aes_supp_impl_cnt = 0;
- static aes_impl_ops_t *aes_supp_impl[ARRAY_SIZE(aes_all_impl)];
- 
- /*
-- * Selects the aes operations for encrypt/decrypt/key setup
-+ * Returns the AES operations for encrypt/decrypt/key setup.  When a
-+ * SIMD implementation is not allowed in the current context, then
-+ * fallback to the fastest generic implementation.
-  */
--aes_impl_ops_t *
--aes_impl_get_ops()
-+const aes_impl_ops_t *
-+aes_impl_get_ops(void)
- {
--      aes_impl_ops_t *ops = NULL;
-+      if (!kfpu_allowed())
-+              return (&aes_generic_impl);
-+
-+      const aes_impl_ops_t *ops = NULL;
-       const uint32_t impl = AES_IMPL_READ(icp_aes_impl);
- 
-       switch (impl) {
-@@ -266,15 +272,13 @@ aes_impl_get_ops()
-               ops = &aes_fastest_impl;
-               break;
-       case IMPL_CYCLE:
--      {
-+              /* Cycle through supported implementations */
-               ASSERT(aes_impl_initialized);
-               ASSERT3U(aes_supp_impl_cnt, >, 0);
--              /* Cycle through supported implementations */
-               static size_t cycle_impl_idx = 0;
-               size_t idx = (++cycle_impl_idx) % aes_supp_impl_cnt;
-               ops = aes_supp_impl[idx];
--      }
--      break;
-+              break;
-       default:
-               ASSERT3U(impl, <, aes_supp_impl_cnt);
-               ASSERT3U(aes_supp_impl_cnt, >, 0);
-@@ -288,13 +292,17 @@ aes_impl_get_ops()
-       return (ops);
- }
- 
-+/*
-+ * Initialize all supported implementations.
-+ */
-+/* ARGSUSED */
- void
--aes_impl_init(void)
-+aes_impl_init(void *arg)
- {
-       aes_impl_ops_t *curr_impl;
-       int i, c;
- 
--      /* move supported impl into aes_supp_impls */
-+      /* Move supported implementations into aes_supp_impls */
-       for (i = 0, c = 0; i < ARRAY_SIZE(aes_all_impl); i++) {
-               curr_impl = (aes_impl_ops_t *)aes_all_impl[i];
- 
-diff --git a/module/icp/algs/aes/aes_impl_aesni.c b/module/icp/algs/aes/aes_impl_aesni.c
-index 97f7c3a47..222c176aa 100644
---- a/module/icp/algs/aes/aes_impl_aesni.c
-+++ b/module/icp/algs/aes/aes_impl_aesni.c
-@@ -108,7 +108,7 @@ aes_aesni_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4],
- static boolean_t
- aes_aesni_will_work(void)
- {
--      return (zfs_aes_available());
-+      return (kfpu_allowed() && zfs_aes_available());
- }
- 
- const aes_impl_ops_t aes_aesni_impl = {
-diff --git a/module/icp/algs/modes/gcm.c b/module/icp/algs/modes/gcm.c
-index 13bceef0f..f6f8434de 100644
---- a/module/icp/algs/modes/gcm.c
-+++ b/module/icp/algs/modes/gcm.c
-@@ -29,6 +29,7 @@
- #include <sys/crypto/impl.h>
- #include <sys/byteorder.h>
- #include <modes/gcm_impl.h>
-+#include <linux/simd.h>
- 
- #define       GHASH(c, d, t, o) \
-       xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
-@@ -46,7 +47,7 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
-     void (*copy_block)(uint8_t *, uint8_t *),
-     void (*xor_block)(uint8_t *, uint8_t *))
- {
--      gcm_impl_ops_t *gops;
-+      const gcm_impl_ops_t *gops;
-       size_t remainder = length;
-       size_t need = 0;
-       uint8_t *datap = (uint8_t *)data;
-@@ -168,7 +169,7 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
-     void (*copy_block)(uint8_t *, uint8_t *),
-     void (*xor_block)(uint8_t *, uint8_t *))
- {
--      gcm_impl_ops_t *gops;
-+      const gcm_impl_ops_t *gops;
-       uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
-       uint8_t *ghash, *macp = NULL;
-       int i, rv;
-@@ -320,7 +321,7 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
-     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
-     void (*xor_block)(uint8_t *, uint8_t *))
- {
--      gcm_impl_ops_t *gops;
-+      const gcm_impl_ops_t *gops;
-       size_t pt_len;
-       size_t remainder;
-       uint8_t *ghash;
-@@ -427,7 +428,7 @@ gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len,
-     void (*copy_block)(uint8_t *, uint8_t *),
-     void (*xor_block)(uint8_t *, uint8_t *))
- {
--      gcm_impl_ops_t *gops;
-+      const gcm_impl_ops_t *gops;
-       uint8_t *cb;
-       ulong_t remainder = iv_len;
-       ulong_t processed = 0;
-@@ -481,7 +482,7 @@ gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
-     void (*copy_block)(uint8_t *, uint8_t *),
-     void (*xor_block)(uint8_t *, uint8_t *))
- {
--      gcm_impl_ops_t *gops;
-+      const gcm_impl_ops_t *gops;
-       uint8_t *ghash, *datap, *authp;
-       size_t remainder, processed;
- 
-@@ -660,12 +661,17 @@ static size_t gcm_supp_impl_cnt = 0;
- static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];
- 
- /*
-- * Selects the gcm operation
-+ * Returns the GCM operations for encrypt/decrypt/key setup.  When a
-+ * SIMD implementation is not allowed in the current context, then
-+ * fallback to the fastest generic implementation.
-  */
--gcm_impl_ops_t *
-+const gcm_impl_ops_t *
- gcm_impl_get_ops()
- {
--      gcm_impl_ops_t *ops = NULL;
-+      if (!kfpu_allowed())
-+              return (&gcm_generic_impl);
-+
-+      const gcm_impl_ops_t *ops = NULL;
-       const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
- 
-       switch (impl) {
-@@ -674,15 +680,13 @@ gcm_impl_get_ops()
-               ops = &gcm_fastest_impl;
-               break;
-       case IMPL_CYCLE:
--      {
-+              /* Cycle through supported implementations */
-               ASSERT(gcm_impl_initialized);
-               ASSERT3U(gcm_supp_impl_cnt, >, 0);
--              /* Cycle through supported implementations */
-               static size_t cycle_impl_idx = 0;
-               size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
-               ops = gcm_supp_impl[idx];
--      }
--      break;
-+              break;
-       default:
-               ASSERT3U(impl, <, gcm_supp_impl_cnt);
-               ASSERT3U(gcm_supp_impl_cnt, >, 0);
-@@ -696,13 +700,17 @@ gcm_impl_get_ops()
-       return (ops);
- }
- 
-+/*
-+ * Initialize all supported implementations.
-+ */
-+/* ARGSUSED */
- void
--gcm_impl_init(void)
-+gcm_impl_init(void *arg)
- {
-       gcm_impl_ops_t *curr_impl;
-       int i, c;
- 
--      /* move supported impl into aes_supp_impls */
-+      /* Move supported implementations into gcm_supp_impls */
-       for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
-               curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];
- 
-@@ -711,7 +719,10 @@ gcm_impl_init(void)
-       }
-       gcm_supp_impl_cnt = c;
- 
--      /* set fastest implementation. assume hardware accelerated is fastest */
-+      /*
-+       * Set the fastest implementation given the assumption that the
-+       * hardware accelerated version is the fastest.
-+       */
- #if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
-       if (gcm_pclmulqdq_impl.is_supported())
-               memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
-diff --git a/module/icp/algs/modes/gcm_pclmulqdq.c b/module/icp/algs/modes/gcm_pclmulqdq.c
-index be00ba37b..8a43ba33a 100644
---- a/module/icp/algs/modes/gcm_pclmulqdq.c
-+++ b/module/icp/algs/modes/gcm_pclmulqdq.c
-@@ -52,7 +52,7 @@ gcm_pclmulqdq_mul(uint64_t *x_in, uint64_t *y, uint64_t *res)
- static boolean_t
- gcm_pclmulqdq_will_work(void)
- {
--      return (zfs_pclmulqdq_available());
-+      return (kfpu_allowed() && zfs_pclmulqdq_available());
- }
- 
- const gcm_impl_ops_t gcm_pclmulqdq_impl = {
-diff --git a/module/icp/io/aes.c b/module/icp/io/aes.c
-index 53b193693..51538bc60 100644
---- a/module/icp/io/aes.c
-+++ b/module/icp/io/aes.c
-@@ -206,9 +206,35 @@ aes_mod_init(void)
- {
-       int ret;
- 
--      /* find fastest implementations and set any requested implementations */
--      aes_impl_init();
--      gcm_impl_init();
-+#if defined(_KERNEL)
-+      /*
-+       * Determine the fastest available implementation.  The benchmarks
-+       * are run in dedicated kernel threads to allow Linux 5.0+ kernels
-+       * to use SIMD operations.  If for some reason this isn't possible,
-+       * fallback to the generic implementations.  See the comment in
-+       * include/linux/simd_x86.h for additional details.  Additionally,
-+       * this has the benefit of allowing them to be run in parallel.
-+       */
-+      taskqid_t aes_id = taskq_dispatch(system_taskq, aes_impl_init,
-+          NULL, TQ_SLEEP);
-+      taskqid_t gcm_id = taskq_dispatch(system_taskq, gcm_impl_init,
-+          NULL, TQ_SLEEP);
-+
-+      if (aes_id != TASKQID_INVALID) {
-+              taskq_wait_id(system_taskq, aes_id);
-+      } else {
-+              aes_impl_init(NULL);
-+      }
-+
-+      if (gcm_id != TASKQID_INVALID) {
-+              taskq_wait_id(system_taskq, gcm_id);
-+      } else {
-+              gcm_impl_init(NULL);
-+      }
-+#else
-+      aes_impl_init(NULL);
-+      gcm_impl_init(NULL);
-+#endif
- 
-       if ((ret = mod_install(&modlinkage)) != 0)
-               return (ret);
-diff --git a/module/spl/spl-taskq.c b/module/spl/spl-taskq.c
-index 7684257be..de0e45190 100644
---- a/module/spl/spl-taskq.c
-+++ b/module/spl/spl-taskq.c
-@@ -27,6 +27,7 @@
- #include <sys/taskq.h>
- #include <sys/kmem.h>
- #include <sys/tsd.h>
-+#include <linux/simd.h>
- 
- int spl_taskq_thread_bind = 0;
- module_param(spl_taskq_thread_bind, int, 0644);
-@@ -869,6 +870,7 @@ taskq_thread(void *args)
-       sigfillset(&blocked);
-       sigprocmask(SIG_BLOCK, &blocked, NULL);
-       flush_signals(current);
-+      kfpu_initialize();
- 
-       tsd_set(taskq_tsd, tq);
-       spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
-diff --git a/module/spl/spl-thread.c b/module/spl/spl-thread.c
-index d441ad65f..c4977bcf2 100644
---- a/module/spl/spl-thread.c
-+++ b/module/spl/spl-thread.c
-@@ -27,6 +27,7 @@
- #include <sys/thread.h>
- #include <sys/kmem.h>
- #include <sys/tsd.h>
-+#include <linux/simd.h>
- 
- /*
-  * Thread interfaces
-@@ -54,6 +55,7 @@ thread_generic_wrapper(void *arg)
-       args = tp->tp_args;
-       set_current_state(tp->tp_state);
-       set_user_nice((kthread_t *)current, PRIO_TO_NICE(tp->tp_pri));
-+      kfpu_initialize();
-       kmem_free(tp->tp_name, tp->tp_name_size);
-       kmem_free(tp, sizeof (thread_priv_t));
- 
-diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c
-index 5a991ba60..b75d8ab00 100644
---- a/module/zcommon/zfs_fletcher.c
-+++ b/module/zcommon/zfs_fletcher.c
-@@ -140,6 +140,7 @@
- #include <sys/zio_checksum.h>
- #include <sys/zfs_context.h>
- #include <zfs_fletcher.h>
-+#include <linux/simd.h>
- 
- #define       FLETCHER_MIN_SIMD_SIZE  64
- 
-@@ -205,21 +206,19 @@ static struct fletcher_4_impl_selector {
-       const char      *fis_name;
-       uint32_t        fis_sel;
- } fletcher_4_impl_selectors[] = {
--#if !defined(_KERNEL)
-       { "cycle",      IMPL_CYCLE },
--#endif
-       { "fastest",    IMPL_FASTEST },
-       { "scalar",     IMPL_SCALAR }
- };
- 
- #if defined(_KERNEL)
- static kstat_t *fletcher_4_kstat;
--#endif
- 
- static struct fletcher_4_kstat {
-       uint64_t native;
-       uint64_t byteswap;
- } fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1];
-+#endif
- 
- /* Indicate that benchmark has been completed */
- static boolean_t fletcher_4_initialized = B_FALSE;
-@@ -408,32 +407,36 @@ fletcher_4_impl_set(const char *val)
-       return (err);
- }
- 
-+/*
-+ * Returns the Fletcher 4 operations for checksums.   When a SIMD
-+ * implementation is not allowed in the current context, then fallback
-+ * to the fastest generic implementation.
-+ */
- static inline const fletcher_4_ops_t *
- fletcher_4_impl_get(void)
- {
--      fletcher_4_ops_t *ops = NULL;
--      const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
-+      if (!kfpu_allowed())
-+              return (&fletcher_4_superscalar4_ops);
-+
-+      const fletcher_4_ops_t *ops = NULL;
-+      uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
- 
-       switch (impl) {
-       case IMPL_FASTEST:
-               ASSERT(fletcher_4_initialized);
-               ops = &fletcher_4_fastest_impl;
-               break;
--#if !defined(_KERNEL)
--      case IMPL_CYCLE: {
-+      case IMPL_CYCLE:
-+              /* Cycle through supported implementations */
-               ASSERT(fletcher_4_initialized);
-               ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
--
-               static uint32_t cycle_count = 0;
-               uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt;
-               ops = fletcher_4_supp_impls[idx];
--      }
--      break;
--#endif
-+              break;
-       default:
-               ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
-               ASSERT3U(impl, <, fletcher_4_supp_impls_cnt);
--
-               ops = fletcher_4_supp_impls[impl];
-               break;
-       }
-@@ -658,6 +661,7 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
- typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *,
-                                       zio_cksum_t *);
- 
-+#if defined(_KERNEL)
- static void
- fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
- {
-@@ -716,16 +720,18 @@ fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
-       /* restore original selection */
-       atomic_swap_32(&fletcher_4_impl_chosen, sel_save);
- }
-+#endif /* _KERNEL */
- 
--void
--fletcher_4_init(void)
-+/*
-+ * Initialize and benchmark all supported implementations.
-+ */
-+static void
-+fletcher_4_benchmark(void *arg)
- {
--      static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
-       fletcher_4_ops_t *curr_impl;
--      char *databuf;
-       int i, c;
- 
--      /* move supported impl into fletcher_4_supp_impls */
-+      /* Move supported implementations into fletcher_4_supp_impls */
-       for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) {
-               curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i];
- 
-@@ -735,19 +741,10 @@ fletcher_4_init(void)
-       membar_producer();      /* complete fletcher_4_supp_impls[] init */
-       fletcher_4_supp_impls_cnt = c;  /* number of supported impl */
- 
--#if !defined(_KERNEL)
--      /* Skip benchmarking and use last implementation as fastest */
--      memcpy(&fletcher_4_fastest_impl,
--          fletcher_4_supp_impls[fletcher_4_supp_impls_cnt-1],
--          sizeof (fletcher_4_fastest_impl));
--      fletcher_4_fastest_impl.name = "fastest";
--      membar_producer();
-+#if defined(_KERNEL)
-+      static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
-+      char *databuf = vmem_alloc(data_size, KM_SLEEP);
- 
--      fletcher_4_initialized = B_TRUE;
--      return;
--#endif
--      /* Benchmark all supported implementations */
--      databuf = vmem_alloc(data_size, KM_SLEEP);
-       for (i = 0; i < data_size / sizeof (uint64_t); i++)
-               ((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */
- 
-@@ -755,9 +752,38 @@ fletcher_4_init(void)
-       fletcher_4_benchmark_impl(B_TRUE, databuf, data_size);
- 
-       vmem_free(databuf, data_size);
-+#else
-+      /*
-+       * Skip the benchmark in user space to avoid impacting libzpool
-+       * consumers (zdb, zhack, zinject, ztest).  The last implementation
-+       * is assumed to be the fastest and used by default.
-+       */
-+      memcpy(&fletcher_4_fastest_impl,
-+          fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1],
-+          sizeof (fletcher_4_fastest_impl));
-+      fletcher_4_fastest_impl.name = "fastest";
-+      membar_producer();
-+#endif /* _KERNEL */
-+}
- 
-+void
-+fletcher_4_init(void)
-+{
- #if defined(_KERNEL)
--      /* install kstats for all implementations */
-+      /*
-+       * For 5.0 and latter Linux kernels the fletcher 4 benchmarks are
-+       * run in a kernel threads.  This is needed to take advantage of the
-+       * SIMD functionality, see include/linux/simd_x86.h for details.
-+       */
-+      taskqid_t id = taskq_dispatch(system_taskq, fletcher_4_benchmark,
-+          NULL, TQ_SLEEP);
-+      if (id != TASKQID_INVALID) {
-+              taskq_wait_id(system_taskq, id);
-+      } else {
-+              fletcher_4_benchmark(NULL);
-+      }
-+
-+      /* Install kstats for all implementations */
-       fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",
-           KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
-       if (fletcher_4_kstat != NULL) {
-@@ -769,6 +795,8 @@ fletcher_4_init(void)
-                   fletcher_4_kstat_addr);
-               kstat_install(fletcher_4_kstat);
-       }
-+#else
-+      fletcher_4_benchmark(NULL);
- #endif
- 
-       /* Finish initialization */
-diff --git a/module/zcommon/zfs_fletcher_aarch64_neon.c b/module/zcommon/zfs_fletcher_aarch64_neon.c
-index bd2db2b20..3b3c1b52b 100644
---- a/module/zcommon/zfs_fletcher_aarch64_neon.c
-+++ b/module/zcommon/zfs_fletcher_aarch64_neon.c
-@@ -198,7 +198,7 @@ unsigned char SRC __attribute__((vector_size(16)));
- 
- static boolean_t fletcher_4_aarch64_neon_valid(void)
- {
--      return (B_TRUE);
-+      return (kfpu_allowed());
- }
- 
- const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = {
-diff --git a/module/zcommon/zfs_fletcher_avx512.c b/module/zcommon/zfs_fletcher_avx512.c
-index 7260a9864..0d4cff21a 100644
---- a/module/zcommon/zfs_fletcher_avx512.c
-+++ b/module/zcommon/zfs_fletcher_avx512.c
-@@ -157,7 +157,7 @@ STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_byteswap);
- static boolean_t
- fletcher_4_avx512f_valid(void)
- {
--      return (zfs_avx512f_available());
-+      return (kfpu_allowed() && zfs_avx512f_available());
- }
- 
- const fletcher_4_ops_t fletcher_4_avx512f_ops = {
-diff --git a/module/zcommon/zfs_fletcher_intel.c b/module/zcommon/zfs_fletcher_intel.c
-index 6dac047da..7f12efe6d 100644
---- a/module/zcommon/zfs_fletcher_intel.c
-+++ b/module/zcommon/zfs_fletcher_intel.c
-@@ -156,7 +156,7 @@ fletcher_4_avx2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
- 
- static boolean_t fletcher_4_avx2_valid(void)
- {
--      return (zfs_avx_available() && zfs_avx2_available());
-+      return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available());
- }
- 
- const fletcher_4_ops_t fletcher_4_avx2_ops = {
-diff --git a/module/zcommon/zfs_fletcher_sse.c b/module/zcommon/zfs_fletcher_sse.c
-index a0b42e5f5..e6389d6e5 100644
---- a/module/zcommon/zfs_fletcher_sse.c
-+++ b/module/zcommon/zfs_fletcher_sse.c
-@@ -157,7 +157,7 @@ fletcher_4_sse2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
- 
- static boolean_t fletcher_4_sse2_valid(void)
- {
--      return (zfs_sse2_available());
-+      return (kfpu_allowed() && zfs_sse2_available());
- }
- 
- const fletcher_4_ops_t fletcher_4_sse2_ops = {
-@@ -214,7 +214,8 @@ fletcher_4_ssse3_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
- 
- static boolean_t fletcher_4_ssse3_valid(void)
- {
--      return (zfs_sse2_available() && zfs_ssse3_available());
-+      return (kfpu_allowed() && zfs_sse2_available() &&
-+          zfs_ssse3_available());
- }
- 
- const fletcher_4_ops_t fletcher_4_ssse3_ops = {
-diff --git a/module/zfs/vdev_raidz_math.c b/module/zfs/vdev_raidz_math.c
-index e6112bc02..e7a39015c 100644
---- a/module/zfs/vdev_raidz_math.c
-+++ b/module/zfs/vdev_raidz_math.c
-@@ -27,9 +27,9 @@
- #include <sys/zio.h>
- #include <sys/debug.h>
- #include <sys/zfs_debug.h>
--
- #include <sys/vdev_raidz.h>
- #include <sys/vdev_raidz_impl.h>
-+#include <linux/simd.h>
- 
- extern boolean_t raidz_will_scalar_work(void);
- 
-@@ -87,6 +87,7 @@ static uint32_t user_sel_impl = IMPL_FASTEST;
- static size_t raidz_supp_impl_cnt = 0;
- static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)];
- 
-+#if defined(_KERNEL)
- /*
-  * kstats values for supported implementations
-  * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s]
-@@ -95,14 +96,19 @@ static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1];
- 
- /* kstat for benchmarked implementations */
- static kstat_t *raidz_math_kstat = NULL;
-+#endif
- 
- /*
-- * Selects the raidz operation for raidz_map
-- * If rm_ops is set to NULL original raidz implementation will be used
-+ * Returns the RAIDZ operations for raidz_map() parity calculations.   When
-+ * a SIMD implementation is not allowed in the current context, then fallback
-+ * to the fastest generic implementation.
-  */
--raidz_impl_ops_t *
--vdev_raidz_math_get_ops()
-+const raidz_impl_ops_t *
-+vdev_raidz_math_get_ops(void)
- {
-+      if (!kfpu_allowed())
-+              return (&vdev_raidz_scalar_impl);
-+
-       raidz_impl_ops_t *ops = NULL;
-       const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);
- 
-@@ -111,18 +117,14 @@ vdev_raidz_math_get_ops()
-               ASSERT(raidz_math_initialized);
-               ops = &vdev_raidz_fastest_impl;
-               break;
--#if !defined(_KERNEL)
-       case IMPL_CYCLE:
--      {
-+              /* Cycle through all supported implementations */
-               ASSERT(raidz_math_initialized);
-               ASSERT3U(raidz_supp_impl_cnt, >, 0);
--              /* Cycle through all supported implementations */
-               static size_t cycle_impl_idx = 0;
-               size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt;
-               ops = raidz_supp_impl[idx];
--      }
--      break;
--#endif
-+              break;
-       case IMPL_ORIGINAL:
-               ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl;
-               break;
-@@ -273,6 +275,8 @@ const char *raidz_rec_name[] = {
-       "rec_pq", "rec_pr", "rec_qr", "rec_pqr"
- };
- 
-+#if defined(_KERNEL)
-+
- #define       RAIDZ_KSTAT_LINE_LEN    (17 + 10*12 + 1)
- 
- static int
-@@ -435,21 +439,21 @@ benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn)
-               }
-       }
- }
-+#endif
- 
--void
--vdev_raidz_math_init(void)
-+/*
-+ * Initialize and benchmark all supported implementations.
-+ */
-+static void
-+benchmark_raidz(void *arg)
- {
-       raidz_impl_ops_t *curr_impl;
--      zio_t *bench_zio = NULL;
--      raidz_map_t *bench_rm = NULL;
--      uint64_t bench_parity;
--      int i, c, fn;
-+      int i, c;
- 
--      /* move supported impl into raidz_supp_impl */
-+      /* Move supported impl into raidz_supp_impl */
-       for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
-               curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i];
- 
--              /* initialize impl */
-               if (curr_impl->init)
-                       curr_impl->init();
- 
-@@ -459,18 +463,10 @@ vdev_raidz_math_init(void)
-       membar_producer();              /* complete raidz_supp_impl[] init */
-       raidz_supp_impl_cnt = c;        /* number of supported impl */
- 
--#if !defined(_KERNEL)
--      /* Skip benchmarking and use last implementation as fastest */
--      memcpy(&vdev_raidz_fastest_impl, raidz_supp_impl[raidz_supp_impl_cnt-1],
--          sizeof (vdev_raidz_fastest_impl));
--      strcpy(vdev_raidz_fastest_impl.name, "fastest");
--
--      raidz_math_initialized = B_TRUE;
--
--      /* Use 'cycle' math selection method for userspace */
--      VERIFY0(vdev_raidz_impl_set("cycle"));
--      return;
--#endif
-+#if defined(_KERNEL)
-+      zio_t *bench_zio = NULL;
-+      raidz_map_t *bench_rm = NULL;
-+      uint64_t bench_parity;
- 
-       /* Fake an zio and run the benchmark on a warmed up buffer */
-       bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
-@@ -480,7 +476,7 @@ vdev_raidz_math_init(void)
-       memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE);
- 
-       /* Benchmark parity generation methods */
--      for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
-+      for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
-               bench_parity = fn + 1;
-               /* New raidz_map is needed for each generate_p/q/r */
-               bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
-@@ -495,7 +491,7 @@ vdev_raidz_math_init(void)
-       bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
-           BENCH_COLS, PARITY_PQR);
- 
--      for (fn = 0; fn < RAIDZ_REC_NUM; fn++)
-+      for (int fn = 0; fn < RAIDZ_REC_NUM; fn++)
-               benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl);
- 
-       vdev_raidz_map_free(bench_rm);
-@@ -503,11 +499,39 @@ vdev_raidz_math_init(void)
-       /* cleanup the bench zio */
-       abd_free(bench_zio->io_abd);
-       kmem_free(bench_zio, sizeof (zio_t));
-+#else
-+      /*
-+       * Skip the benchmark in user space to avoid impacting libzpool
-+       * consumers (zdb, zhack, zinject, ztest).  The last implementation
-+       * is assumed to be the fastest and used by default.
-+       */
-+      memcpy(&vdev_raidz_fastest_impl,
-+          raidz_supp_impl[raidz_supp_impl_cnt - 1],
-+          sizeof (vdev_raidz_fastest_impl));
-+      strcpy(vdev_raidz_fastest_impl.name, "fastest");
-+#endif /* _KERNEL */
-+}
- 
--      /* install kstats for all impl */
-+void
-+vdev_raidz_math_init(void)
-+{
-+#if defined(_KERNEL)
-+      /*
-+       * For 5.0 and latter Linux kernels the fletcher 4 benchmarks are
-+       * run in a kernel threads.  This is needed to take advantage of the
-+       * SIMD functionality, see include/linux/simd_x86.h for details.
-+       */
-+      taskqid_t id = taskq_dispatch(system_taskq, benchmark_raidz,
-+          NULL, TQ_SLEEP);
-+      if (id != TASKQID_INVALID) {
-+              taskq_wait_id(system_taskq, id);
-+      } else {
-+              benchmark_raidz(NULL);
-+      }
-+
-+      /* Install kstats for all implementations */
-       raidz_math_kstat = kstat_create("zfs", 0, "vdev_raidz_bench", "misc",
-           KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
--
-       if (raidz_math_kstat != NULL) {
-               raidz_math_kstat->ks_data = NULL;
-               raidz_math_kstat->ks_ndata = UINT32_MAX;
-@@ -517,6 +541,9 @@ vdev_raidz_math_init(void)
-                   raidz_math_kstat_addr);
-               kstat_install(raidz_math_kstat);
-       }
-+#else
-+      benchmark_raidz(NULL);
-+#endif
- 
-       /* Finish initialization */
-       atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl);
-@@ -527,15 +554,15 @@ void
- vdev_raidz_math_fini(void)
- {
-       raidz_impl_ops_t const *curr_impl;
--      int i;
- 
-+#if defined(_KERNEL)
-       if (raidz_math_kstat != NULL) {
-               kstat_delete(raidz_math_kstat);
-               raidz_math_kstat = NULL;
-       }
-+#endif
- 
--      /* fini impl */
--      for (i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
-+      for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
-               curr_impl = raidz_all_maths[i];
-               if (curr_impl->fini)
-                       curr_impl->fini();
-@@ -546,9 +573,7 @@ static const struct {
-       char *name;
-       uint32_t sel;
- } math_impl_opts[] = {
--#if !defined(_KERNEL)
-               { "cycle",      IMPL_CYCLE },
--#endif
-               { "fastest",    IMPL_FASTEST },
-               { "original",   IMPL_ORIGINAL },
-               { "scalar",     IMPL_SCALAR }
-diff --git a/module/zfs/vdev_raidz_math_aarch64_neon.c b/module/zfs/vdev_raidz_math_aarch64_neon.c
-index e3ad06776..0a67ceb84 100644
---- a/module/zfs/vdev_raidz_math_aarch64_neon.c
-+++ b/module/zfs/vdev_raidz_math_aarch64_neon.c
-@@ -207,7 +207,7 @@ DEFINE_REC_METHODS(aarch64_neon);
- static boolean_t
- raidz_will_aarch64_neon_work(void)
- {
--      return (B_TRUE); // __arch64__ requires NEON
-+      return (kfpu_allowed());
- }
- 
- const raidz_impl_ops_t vdev_raidz_aarch64_neon_impl = {
-diff --git a/module/zfs/vdev_raidz_math_aarch64_neonx2.c b/module/zfs/vdev_raidz_math_aarch64_neonx2.c
-index f8688a06a..e072f51cd 100644
---- a/module/zfs/vdev_raidz_math_aarch64_neonx2.c
-+++ b/module/zfs/vdev_raidz_math_aarch64_neonx2.c
-@@ -217,7 +217,7 @@ DEFINE_REC_METHODS(aarch64_neonx2);
- static boolean_t
- raidz_will_aarch64_neonx2_work(void)
- {
--      return (B_TRUE); // __arch64__ requires NEON
-+      return (kfpu_allowed());
- }
- 
- const raidz_impl_ops_t vdev_raidz_aarch64_neonx2_impl = {
-diff --git a/module/zfs/vdev_raidz_math_avx2.c b/module/zfs/vdev_raidz_math_avx2.c
-index 063d29bcd..a12eb6720 100644
---- a/module/zfs/vdev_raidz_math_avx2.c
-+++ b/module/zfs/vdev_raidz_math_avx2.c
-@@ -396,7 +396,7 @@ DEFINE_REC_METHODS(avx2);
- static boolean_t
- raidz_will_avx2_work(void)
- {
--      return (zfs_avx_available() && zfs_avx2_available());
-+      return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available());
- }
- 
- const raidz_impl_ops_t vdev_raidz_avx2_impl = {
-diff --git a/module/zfs/vdev_raidz_math_avx512bw.c b/module/zfs/vdev_raidz_math_avx512bw.c
-index d605653db..2f545c9ec 100644
---- a/module/zfs/vdev_raidz_math_avx512bw.c
-+++ b/module/zfs/vdev_raidz_math_avx512bw.c
-@@ -393,9 +393,8 @@ DEFINE_REC_METHODS(avx512bw);
- static boolean_t
- raidz_will_avx512bw_work(void)
- {
--      return (zfs_avx_available() &&
--          zfs_avx512f_available() &&
--          zfs_avx512bw_available());
-+      return (kfpu_allowed() && zfs_avx_available() &&
-+          zfs_avx512f_available() && zfs_avx512bw_available());
- }
- 
- const raidz_impl_ops_t vdev_raidz_avx512bw_impl = {
-diff --git a/module/zfs/vdev_raidz_math_avx512f.c b/module/zfs/vdev_raidz_math_avx512f.c
-index f4e4560ce..75af7a8ee 100644
---- a/module/zfs/vdev_raidz_math_avx512f.c
-+++ b/module/zfs/vdev_raidz_math_avx512f.c
-@@ -470,9 +470,8 @@ DEFINE_REC_METHODS(avx512f);
- static boolean_t
- raidz_will_avx512f_work(void)
- {
--      return (zfs_avx_available() &&
--          zfs_avx2_available() &&
--          zfs_avx512f_available());
-+      return (kfpu_allowed() && zfs_avx_available() &&
-+          zfs_avx2_available() && zfs_avx512f_available());
- }
- 
- const raidz_impl_ops_t vdev_raidz_avx512f_impl = {
-diff --git a/module/zfs/vdev_raidz_math_sse2.c b/module/zfs/vdev_raidz_math_sse2.c
-index 9985da273..5b3a9385c 100644
---- a/module/zfs/vdev_raidz_math_sse2.c
-+++ b/module/zfs/vdev_raidz_math_sse2.c
-@@ -607,7 +607,7 @@ DEFINE_REC_METHODS(sse2);
- static boolean_t
- raidz_will_sse2_work(void)
- {
--      return (zfs_sse_available() && zfs_sse2_available());
-+      return (kfpu_allowed() && zfs_sse_available() && zfs_sse2_available());
- }
- 
- const raidz_impl_ops_t vdev_raidz_sse2_impl = {
-diff --git a/module/zfs/vdev_raidz_math_ssse3.c b/module/zfs/vdev_raidz_math_ssse3.c
-index 047a48d54..62247cf8e 100644
---- a/module/zfs/vdev_raidz_math_ssse3.c
-+++ b/module/zfs/vdev_raidz_math_ssse3.c
-@@ -399,8 +399,8 @@ DEFINE_REC_METHODS(ssse3);
- static boolean_t
- raidz_will_ssse3_work(void)
- {
--      return (zfs_sse_available() && zfs_sse2_available() &&
--          zfs_ssse3_available());
-+      return (kfpu_allowed() && zfs_sse_available() &&
-+          zfs_sse2_available() && zfs_ssse3_available());
- }
- 
- const raidz_impl_ops_t vdev_raidz_ssse3_impl = {
-diff --git a/config/kernel-fpu.m4 b/config/kernel-fpu.m4
-index 5fff79a74..31bf35f83 100644
---- a/config/kernel-fpu.m4
-+++ b/config/kernel-fpu.m4
-@@ -2,8 +2,15 @@ dnl #
- dnl # Handle differences in kernel FPU code.
- dnl #
- dnl # Kernel
--dnl # 5.0:    All kernel fpu functions are GPL only, so we can't use them.
--dnl #         (nothing defined)
-+dnl # 5.2:    The fpu->initialized flag was replaced by TIF_NEED_FPU_LOAD.
-+dnl #         HAVE_KERNEL_TIF_NEED_FPU_LOAD
-+dnl #
-+dnl # 5.0:    As an optimization SIMD operations performed by kernel
-+dnl #         threads can skip saving and restoring their FPU context.
-+dnl #         Wrappers have been introduced to determine the running
-+dnl #         context and use either the SIMD or generic implementation.
-+dnl #         This change was made to the 4.19.38 and 4.14.120 LTS kernels.
-+dnl #         HAVE_KERNEL_FPU_INITIALIZED
- dnl #
- dnl # 4.2:    Use __kernel_fpu_{begin,end}()
- dnl #         HAVE_UNDERSCORE_KERNEL_FPU & KERNEL_EXPORTS_X86_FPU
-@@ -56,10 +63,39 @@ AC_DEFUN([ZFS_AC_KERNEL_FPU], [
-                       __kernel_fpu_end();
-               ], [__kernel_fpu_begin], [arch/x86/kernel/fpu/core.c arch/x86/kernel/i387.c], [
-                       AC_MSG_RESULT(__kernel_fpu_*)
--                      AC_DEFINE(HAVE_UNDERSCORE_KERNEL_FPU, 1, [kernel has __kernel_fpu_* functions])
--                      AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1, [kernel exports FPU functions])
-+                      AC_DEFINE(HAVE_UNDERSCORE_KERNEL_FPU, 1,
-+                          [kernel has __kernel_fpu_* functions])
-+                      AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1,
-+                          [kernel exports FPU functions])
-               ],[
--                      AC_MSG_RESULT(not exported)
-+                      ZFS_LINUX_TRY_COMPILE([
-+                              #include <linux/module.h>
-+                              #include <linux/sched.h>
-+                      ],[
-+                              struct fpu *fpu = &current->thread.fpu;
-+                              if (fpu->initialized) { return (0); };
-+                      ],[
-+                              AC_MSG_RESULT(fpu.initialized)
-+                              AC_DEFINE(HAVE_KERNEL_FPU_INITIALIZED, 1,
-+                                  [kernel fpu.initialized exists])
-+                      ],[
-+                              ZFS_LINUX_TRY_COMPILE([
-+                                      #include <linux/module.h>
-+                                      #include <asm/thread_info.h>
-+
-+                                      #if !defined(TIF_NEED_FPU_LOAD)
-+                                      #error "TIF_NEED_FPU_LOAD undefined"
-+                                      #endif
-+                              ],[
-+                              ],[
-+                                      AC_MSG_RESULT(TIF_NEED_FPU_LOAD)
-+                                      AC_DEFINE(
-+                                          HAVE_KERNEL_TIF_NEED_FPU_LOAD, 1,
-+                                          [kernel TIF_NEED_FPU_LOAD exists])
-+                              ],[
-+                                      AC_MSG_RESULT(unavailable)
-+                              ])
-+                      ])
-               ])
-       ])
- ])
diff --git a/debian/patches/0008-SIMD-FPU-register-save-restore-is-also-required-on-5.patch b/debian/patches/0008-SIMD-FPU-register-save-restore-is-also-required-on-5.patch

new file mode 100644 (file)

index 0000000..db0c8a3
--- /dev/null
+++ b/debian/patches/0008-SIMD-FPU-register-save-restore-is-also-required-on-5.patch
@@ -0,0 +1,57 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Thomas Lamprecht <t.lamprecht@proxmox.com>
+Date: Wed, 25 Sep 2019 10:48:48 +0200
+Subject: [PATCH] [SIMD]: FPU register save/restore is also required on 5.0
+ kernels
+
+NOTE: the kernel needs to have the copy_kernel_to_xregs_err,
+copy_kernel_to_fxregs_err and copy_kernel_to_fregs_err functions
+backported for this to work.
+
+Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
+---
+ include/linux/simd_x86.h | 11 +----------
+ 1 file changed, 1 insertion(+), 10 deletions(-)
+
+diff --git a/include/linux/simd_x86.h b/include/linux/simd_x86.h
+index edd456098..13aa77345 100644
+--- a/include/linux/simd_x86.h
++++ b/include/linux/simd_x86.h
+@@ -181,7 +181,6 @@ kfpu_begin(void)
+       preempt_disable();
+       local_irq_disable();
+ 
+-#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
+       /*
+        * The current FPU registers need to be preserved by kfpu_begin()
+        * and restored by kfpu_end().  This is required because we can
+@@ -190,20 +189,13 @@ kfpu_begin(void)
+        * context switch.
+        */
+       copy_fpregs_to_fpstate(&current->thread.fpu);
+-#elif defined(HAVE_KERNEL_FPU_INITIALIZED)
+-      /*
+-       * There is no need to preserve and restore the FPU registers.
+-       * They will always be restored from the task's stored FPU state
+-       * when switching contexts.
+-       */
++
+       WARN_ON_ONCE(current->thread.fpu.initialized == 0);
+-#endif
+ }
+ 
+ static inline void
+ kfpu_end(void)
+ {
+-#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
+       union fpregs_state *state = &current->thread.fpu.state;
+       int error;
+ 
+@@ -215,7 +207,6 @@ kfpu_end(void)
+               error = copy_kernel_to_fregs_err(&state->fsave);
+       }
+       WARN_ON_ONCE(error);
+-#endif
+ 
+       local_irq_enable();
+       preempt_enable();
diff --git a/debian/patches/0009-Fix-CONFIG_X86_DEBUG_FPU-build-failure.patch b/debian/patches/0009-Fix-CONFIG_X86_DEBUG_FPU-build-failure.patch

deleted file mode 100644 (file)

index c8c8267..0000000
--- a/debian/patches/0009-Fix-CONFIG_X86_DEBUG_FPU-build-failure.patch
+++ /dev/null
@@ -1,44 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Brian Behlendorf <behlendorf1@llnl.gov>
-Date: Wed, 17 Jul 2019 09:14:36 -0700
-Subject: [PATCH] Fix CONFIG_X86_DEBUG_FPU build failure
-MIME-Version: 1.0
-Content-Type: text/plain; charset=UTF-8
-Content-Transfer-Encoding: 8bit
-
-When CONFIG_X86_DEBUG_FPU is defined the alternatives_patched symbol
-is pulled in as a dependency which results in a build failure.  To
-prevent this undefine CONFIG_X86_DEBUG_FPU to disable the WARN_ON_FPU()
-macro and rely on WARN_ON_ONCE debugging checks which were previously
-added.
-
-Reviewed-by: Tony Hutter <hutter2@llnl.gov>
-Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
-Closes #9041
-Closes #9049
-(cherry picked from commit 095b5412b31c07cad5cec74a4eb5ace011c92b27)
-Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
----
- include/linux/simd_x86.h | 9 +++++++++
- 1 file changed, 9 insertions(+)
-
-diff --git a/include/linux/simd_x86.h b/include/linux/simd_x86.h
-index 2d7a1c3a5..5f243e0cc 100644
---- a/include/linux/simd_x86.h
-+++ b/include/linux/simd_x86.h
-@@ -82,6 +82,15 @@
- 
- #if defined(_KERNEL)
- 
-+/*
-+ * Disable the WARN_ON_FPU() macro to prevent additional dependencies
-+ * when providing the kfpu_* functions.  Relevant warnings are included
-+ * as appropriate and are unconditionally enabled.
-+ */
-+#if defined(CONFIG_X86_DEBUG_FPU) && !defined(KERNEL_EXPORTS_X86_FPU)
-+#undef CONFIG_X86_DEBUG_FPU
-+#endif
-+
- #if defined(HAVE_KERNEL_FPU_API_HEADER)
- #include <asm/fpu/api.h>
- #include <asm/fpu/internal.h>
diff --git a/debian/patches/0010-SIMD-FPU-register-save-restore-is-also-required-on-5.patch b/debian/patches/0010-SIMD-FPU-register-save-restore-is-also-required-on-5.patch

deleted file mode 100644 (file)

index 87c8af1..0000000
--- a/debian/patches/0010-SIMD-FPU-register-save-restore-is-also-required-on-5.patch
+++ /dev/null
@@ -1,57 +0,0 @@
-From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
-From: Thomas Lamprecht <t.lamprecht@proxmox.com>
-Date: Wed, 25 Sep 2019 10:48:48 +0200
-Subject: [PATCH] [SIMD]: FPU register save/restore is also required on 5.0
- kernels
-
-NOTE: the kernel needs to have the copy_kernel_to_xregs_err,
-copy_kernel_to_fxregs_err and copy_kernel_to_fregs_err funcitons
-backported for this to work.
-
-Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
----
- include/linux/simd_x86.h | 11 +----------
- 1 file changed, 1 insertion(+), 10 deletions(-)
-
-diff --git a/include/linux/simd_x86.h b/include/linux/simd_x86.h
-index 5f243e0cc..aac63d964 100644
---- a/include/linux/simd_x86.h
-+++ b/include/linux/simd_x86.h
-@@ -179,7 +179,6 @@ kfpu_begin(void)
-       preempt_disable();
-       local_irq_disable();
- 
--#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
-       /*
-        * The current FPU registers need to be preserved by kfpu_begin()
-        * and restored by kfpu_end().  This is required because we can
-@@ -188,20 +187,13 @@ kfpu_begin(void)
-        * context switch.
-        */
-       copy_fpregs_to_fpstate(&current->thread.fpu);
--#elif defined(HAVE_KERNEL_FPU_INITIALIZED)
--      /*
--       * There is no need to preserve and restore the FPU registers.
--       * They will always be restored from the task's stored FPU state
--       * when switching contexts.
--       */
-+
-       WARN_ON_ONCE(current->thread.fpu.initialized == 0);
--#endif
- }
- 
- static inline void
- kfpu_end(void)
- {
--#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
-       union fpregs_state *state = &current->thread.fpu.state;
-       int error;
- 
-@@ -213,7 +205,6 @@ kfpu_end(void)
-               error = copy_kernel_to_fregs_err(&state->fsave);
-       }
-       WARN_ON_ONCE(error);
--#endif
- 
-       local_irq_enable();
-       preempt_enable();
diff --git a/debian/patches/series b/debian/patches/series

index c65e4104bdb96118b18dc5b018cfa5d5addec217..fb4276e089daf32efe0740329ed420de585a2726 100644 (file)
--- a/debian/patches/series
+++ b/debian/patches/series
@@ -1,10 +1,8 @@
  0001-Check-for-META-and-DCH-consistency-in-autoconf.patch
  0002-always-load-ZFS-module-on-boot.patch
  0003-Fix-the-path-to-the-zed-binary-on-the-systemd-unit.patch
-0004-increase-default-zcmd-allocation-to-256K.patch
-0005-import-with-d-dev-disk-by-id-in-scan-service.patch
-0006-Enable-zed-emails.patch
-0007-Fix-race-in-parallel-mount-s-thread-dispatching-algo.patch
-0008-Linux-5.0-compat-SIMD-compatibility.patch
-0009-Fix-CONFIG_X86_DEBUG_FPU-build-failure.patch
-0010-SIMD-FPU-register-save-restore-is-also-required-on-5.patch
+0004-import-with-d-dev-disk-by-id-in-scan-service.patch
+0005-Enable-zed-emails.patch
+0006-Linux-5.0-compat-SIMD-compatibility.patch
+0007-Fix-CONFIG_X86_DEBUG_FPU-build-failure.patch
+0008-SIMD-FPU-register-save-restore-is-also-required-on-5.patch
diff --git a/upstream b/upstream

index 63b88f7e223c1061c522762803b1431e7faba5b3..1222e921c9e3d8f5c693f196435be4604a1187c0 160000 (submodule)
--- a/upstream
+++ b/upstream
@@ -1 +1 @@
-Subproject commit 63b88f7e223c1061c522762803b1431e7faba5b3
+Subproject commit 1222e921c9e3d8f5c693f196435be4604a1187c0
author	Thomas Lamprecht <t.lamprecht@proxmox.com>
	Sat, 28 Sep 2019 06:12:00 +0000 (08:12 +0200)
committer	Thomas Lamprecht <t.lamprecht@proxmox.com>
	Mon, 30 Sep 2019 04:59:38 +0000 (06:59 +0200)
debian/patches/0001-Check-for-META-and-DCH-consistency-in-autoconf.patch		patch \| blob \| blame \| history
debian/patches/0002-always-load-ZFS-module-on-boot.patch		patch \| blob \| blame \| history
debian/patches/0003-Fix-the-path-to-the-zed-binary-on-the-systemd-unit.patch		patch \| blob \| blame \| history
debian/patches/0004-import-with-d-dev-disk-by-id-in-scan-service.patch	[new file with mode: 0644]	patch \| blob
debian/patches/0004-increase-default-zcmd-allocation-to-256K.patch	[deleted file]	patch \| blob \| blame \| history
debian/patches/0005-Enable-zed-emails.patch	[new file with mode: 0644]	patch \| blob
debian/patches/0005-import-with-d-dev-disk-by-id-in-scan-service.patch	[deleted file]	patch \| blob \| blame \| history
debian/patches/0006-Enable-zed-emails.patch	[deleted file]	patch \| blob \| blame \| history
debian/patches/0006-Linux-5.0-compat-SIMD-compatibility.patch	[new file with mode: 0644]	patch \| blob
debian/patches/0007-Fix-CONFIG_X86_DEBUG_FPU-build-failure.patch	[new file with mode: 0644]	patch \| blob
debian/patches/0007-Fix-race-in-parallel-mount-s-thread-dispatching-algo.patch	[deleted file]	patch \| blob \| blame \| history
debian/patches/0008-Linux-5.0-compat-SIMD-compatibility.patch	[deleted file]	patch \| blob \| blame \| history
debian/patches/0008-SIMD-FPU-register-save-restore-is-also-required-on-5.patch	[new file with mode: 0644]	patch \| blob
debian/patches/0009-Fix-CONFIG_X86_DEBUG_FPU-build-failure.patch	[deleted file]	patch \| blob \| blame \| history
debian/patches/0010-SIMD-FPU-register-save-restore-is-also-required-on-5.patch	[deleted file]	patch \| blob \| blame \| history
debian/patches/series		patch \| blob \| blame \| history
upstream		patch \| blob \| blame \| history