]> git.proxmox.com Git - zfsonlinux.git/blame - debian/patches/0006-Linux-5.0-compat-SIMD-compatibility.patch
update submodule and patches to 0.8.2
[zfsonlinux.git] / debian / patches / 0006-Linux-5.0-compat-SIMD-compatibility.patch
CommitLineData
f43dbfa7
FG
1From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
2From: Brian Behlendorf <behlendorf1@llnl.gov>
3Date: Fri, 12 Jul 2019 09:31:20 -0700
4Subject: [PATCH] Linux 5.0 compat: SIMD compatibility
5MIME-Version: 1.0
6Content-Type: text/plain; charset=UTF-8
7Content-Transfer-Encoding: 8bit
8
9Restore the SIMD optimization for 4.19.38 LTS, 4.14.120 LTS,
10and 5.0 and newer kernels. This is accomplished by leveraging
11the fact that by definition dedicated kernel threads never need
12to concern themselves with saving and restoring the user FPU state.
13Therefore, they may use the FPU as long as we can guarantee user
14tasks always restore their FPU state before context switching back
15to user space.
16
17For the 5.0 and 5.1 kernels disabling preemption and local
18interrupts is sufficient to allow the FPU to be used. All non-kernel
19threads will restore the preserved user FPU state.
20
21For 5.2 and later kernels the user FPU state restoration will be
22skipped if the kernel determines the registers have not changed.
23Therefore, for these kernels we need to perform the additional
24step of saving and restoring the FPU registers. Invalidating the
25per-cpu global tracking the FPU state would force a restore but
26that functionality is private to the core x86 FPU implementation
27and unavailable.
28
29In practice, restricting SIMD to kernel threads is not a major
30restriction for ZFS. The vast majority of SIMD operations are
31already performed by the IO pipeline. The remaining cases are
32relatively infrequent and can be handled by the generic code
33without significant impact. The two most noteworthy cases are:
34
35 1) Decrypting the wrapping key for an encrypted dataset,
36 i.e. `zfs load-key`. All other encryption and decryption
37 operations will use the SIMD optimized implementations.
38
39 2) Generating the payload checksums for a `zfs send` stream.
40
41In order to avoid making any changes to the higher layers of ZFS
42all of the `*_get_ops()` functions were updated to take into
43consideration the calling context. This allows for the fastest
44implementation to be used as appropriate (see kfpu_allowed()).
45
46The only other notable instance of SIMD operations being used
47outside a kernel thread was at module load time. This code
48was moved into a taskq in order to accommodate the new kernel
49thread restriction.
50
51Finally, a few other modifications were made in order to further
52harden this code and facilitate testing. They include updating
53each implementation's operations structure to be declared as a
54constant, and allowing "cycle" to be set when selecting the
55preferred ops in the kernel as well as user space.
56
57Reviewed-by: Tony Hutter <hutter2@llnl.gov>
58Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
59Closes #8754
60Closes #8793
61Closes #8965
62(cherry picked from commit e5db31349484e5e859c7a942eb15b98d68ce5b4d)
63Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
08743f90 64Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
f43dbfa7 65---
08743f90
TL
66 cmd/ztest/ztest.c | 3 +
67 config/kernel-fpu.m4 | 46 ++++-
f43dbfa7
FG
68 include/linux/Makefile.am | 1 +
69 include/linux/simd.h | 41 +++++
70 include/linux/simd_aarch64.h | 18 +-
71 include/linux/simd_x86.h | 192 +++++++++++++-------
72 include/sys/vdev_raidz.h | 2 +-
73 include/sys/vdev_raidz_impl.h | 2 +-
f43dbfa7
FG
74 module/icp/algs/aes/aes_impl.c | 34 ++--
75 module/icp/algs/aes/aes_impl_aesni.c | 2 +-
76 module/icp/algs/modes/gcm.c | 41 +++--
77 module/icp/algs/modes/gcm_pclmulqdq.c | 2 +-
08743f90
TL
78 module/icp/include/aes/aes_impl.h | 6 +-
79 module/icp/include/modes/gcm_impl.h | 6 +-
f43dbfa7
FG
80 module/icp/io/aes.c | 32 +++-
81 module/spl/spl-taskq.c | 2 +
82 module/spl/spl-thread.c | 2 +
83 module/zcommon/zfs_fletcher.c | 88 ++++++---
84 module/zcommon/zfs_fletcher_aarch64_neon.c | 2 +-
85 module/zcommon/zfs_fletcher_avx512.c | 2 +-
86 module/zcommon/zfs_fletcher_intel.c | 2 +-
87 module/zcommon/zfs_fletcher_sse.c | 5 +-
88 module/zfs/vdev_raidz_math.c | 105 +++++++----
89 module/zfs/vdev_raidz_math_aarch64_neon.c | 2 +-
90 module/zfs/vdev_raidz_math_aarch64_neonx2.c | 2 +-
91 module/zfs/vdev_raidz_math_avx2.c | 2 +-
92 module/zfs/vdev_raidz_math_avx512bw.c | 5 +-
93 module/zfs/vdev_raidz_math_avx512f.c | 5 +-
94 module/zfs/vdev_raidz_math_sse2.c | 2 +-
95 module/zfs/vdev_raidz_math_ssse3.c | 4 +-
f43dbfa7
FG
96 30 files changed, 454 insertions(+), 204 deletions(-)
97 create mode 100644 include/linux/simd.h
98
08743f90
TL
99diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
100index 3bf840d88..dc22faef7 100644
101--- a/cmd/ztest/ztest.c
102+++ b/cmd/ztest/ztest.c
103@@ -107,6 +107,7 @@
104 #include <sys/vdev_impl.h>
105 #include <sys/vdev_file.h>
106 #include <sys/vdev_initialize.h>
107+#include <sys/vdev_raidz.h>
108 #include <sys/vdev_trim.h>
109 #include <sys/spa_impl.h>
110 #include <sys/metaslab_impl.h>
111@@ -7110,6 +7111,8 @@ ztest_run(ztest_shared_t *zs)
112 metaslab_preload_limit = ztest_random(20) + 1;
113 ztest_spa = spa;
114
115+ VERIFY0(vdev_raidz_impl_set("cycle"));
116+
117 dmu_objset_stats_t dds;
118 VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool,
119 DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os));
120diff --git a/config/kernel-fpu.m4 b/config/kernel-fpu.m4
121index ebb02fb09..0e622e859 100644
122--- a/config/kernel-fpu.m4
123+++ b/config/kernel-fpu.m4
124@@ -2,8 +2,15 @@ dnl #
125 dnl # Handle differences in kernel FPU code.
126 dnl #
127 dnl # Kernel
128-dnl # 5.0: All kernel fpu functions are GPL only, so we can't use them.
129-dnl # (nothing defined)
130+dnl # 5.2: The fpu->initialized flag was replaced by TIF_NEED_FPU_LOAD.
131+dnl # HAVE_KERNEL_TIF_NEED_FPU_LOAD
132+dnl #
133+dnl # 5.0: As an optimization SIMD operations performed by kernel
134+dnl # threads can skip saving and restoring their FPU context.
135+dnl # Wrappers have been introduced to determine the running
136+dnl # context and use either the SIMD or generic implementation.
137+dnl # This change was made to the 4.19.38 and 4.14.120 LTS kernels.
138+dnl # HAVE_KERNEL_FPU_INITIALIZED
139 dnl #
140 dnl # 4.2: Use __kernel_fpu_{begin,end}()
141 dnl # HAVE_UNDERSCORE_KERNEL_FPU & KERNEL_EXPORTS_X86_FPU
142@@ -59,10 +66,39 @@ AC_DEFUN([ZFS_AC_KERNEL_FPU], [
143 __kernel_fpu_end();
144 ], [__kernel_fpu_begin], [arch/x86/kernel/fpu/core.c arch/x86/kernel/i387.c], [
145 AC_MSG_RESULT(__kernel_fpu_*)
146- AC_DEFINE(HAVE_UNDERSCORE_KERNEL_FPU, 1, [kernel has __kernel_fpu_* functions])
147- AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1, [kernel exports FPU functions])
148+ AC_DEFINE(HAVE_UNDERSCORE_KERNEL_FPU, 1,
149+ [kernel has __kernel_fpu_* functions])
150+ AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1,
151+ [kernel exports FPU functions])
152 ],[
153- AC_MSG_RESULT(not exported)
154+ ZFS_LINUX_TRY_COMPILE([
155+ #include <linux/module.h>
156+ #include <linux/sched.h>
157+ ],[
158+ struct fpu *fpu = &current->thread.fpu;
159+ if (fpu->initialized) { return (0); };
160+ ],[
161+ AC_MSG_RESULT(fpu.initialized)
162+ AC_DEFINE(HAVE_KERNEL_FPU_INITIALIZED, 1,
163+ [kernel fpu.initialized exists])
164+ ],[
165+ ZFS_LINUX_TRY_COMPILE([
166+ #include <linux/module.h>
167+ #include <asm/thread_info.h>
168+
169+ #if !defined(TIF_NEED_FPU_LOAD)
170+ #error "TIF_NEED_FPU_LOAD undefined"
171+ #endif
172+ ],[
173+ ],[
174+ AC_MSG_RESULT(TIF_NEED_FPU_LOAD)
175+ AC_DEFINE(
176+ HAVE_KERNEL_TIF_NEED_FPU_LOAD, 1,
177+ [kernel TIF_NEED_FPU_LOAD exists])
178+ ],[
179+ AC_MSG_RESULT(unavailable)
180+ ])
181+ ])
182 ])
183 ])
184 ])
f43dbfa7
FG
185diff --git a/include/linux/Makefile.am b/include/linux/Makefile.am
186index efb49520e..2455759e8 100644
187--- a/include/linux/Makefile.am
188+++ b/include/linux/Makefile.am
189@@ -7,6 +7,7 @@ KERNEL_H = \
190 $(top_srcdir)/include/linux/blkdev_compat.h \
191 $(top_srcdir)/include/linux/utsname_compat.h \
192 $(top_srcdir)/include/linux/kmap_compat.h \
193+ $(top_srcdir)/include/linux/simd.h \
194 $(top_srcdir)/include/linux/simd_x86.h \
195 $(top_srcdir)/include/linux/simd_aarch64.h \
196 $(top_srcdir)/include/linux/mod_compat.h \
197diff --git a/include/linux/simd.h b/include/linux/simd.h
198new file mode 100644
199index 000000000..d2b60996a
200--- /dev/null
201+++ b/include/linux/simd.h
202@@ -0,0 +1,41 @@
203+/*
204+ * CDDL HEADER START
205+ *
206+ * The contents of this file are subject to the terms of the
207+ * Common Development and Distribution License (the "License").
208+ * You may not use this file except in compliance with the License.
209+ *
210+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
211+ * or http://www.opensolaris.org/os/licensing.
212+ * See the License for the specific language governing permissions
213+ * and limitations under the License.
214+ *
215+ * When distributing Covered Code, include this CDDL HEADER in each
216+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
217+ * If applicable, add the following below this CDDL HEADER, with the
218+ * fields enclosed by brackets "[]" replaced with your own identifying
219+ * information: Portions Copyright [yyyy] [name of copyright owner]
220+ *
221+ * CDDL HEADER END
222+ */
223+/*
224+ * Copyright (C) 2019 Lawrence Livermore National Security, LLC.
225+ */
226+
227+#ifndef _SIMD_H
228+#define _SIMD_H
229+
230+#if defined(__x86)
231+#include <linux/simd_x86.h>
232+
233+#elif defined(__aarch64__)
234+#include <linux/simd_aarch64.h>
235+#else
236+
237+#define kfpu_allowed() 1
238+#define kfpu_initialize(tsk) do {} while (0)
239+#define kfpu_begin() do {} while (0)
240+#define kfpu_end() do {} while (0)
241+
242+#endif
243+#endif /* _SIMD_H */
244diff --git a/include/linux/simd_aarch64.h b/include/linux/simd_aarch64.h
08743f90 245index 56153a160..b45d31c48 100644
f43dbfa7
FG
246--- a/include/linux/simd_aarch64.h
247+++ b/include/linux/simd_aarch64.h
08743f90 248@@ -43,20 +43,18 @@
f43dbfa7
FG
249
250 #if defined(_KERNEL)
251 #include <asm/neon.h>
252-#define kfpu_begin() \
253-{ \
254- kernel_neon_begin(); \
255-}
256-#define kfpu_end() \
257-{ \
258- kernel_neon_end(); \
259-}
260+#define kfpu_allowed() 1
261+#define kfpu_initialize(tsk) do {} while (0)
262+#define kfpu_begin() kernel_neon_begin()
263+#define kfpu_end() kernel_neon_end()
264 #else
265 /*
266 * fpu dummy methods for userspace
267 */
268-#define kfpu_begin() do {} while (0)
269-#define kfpu_end() do {} while (0)
270+#define kfpu_allowed() 1
271+#define kfpu_initialize(tsk) do {} while (0)
272+#define kfpu_begin() do {} while (0)
273+#define kfpu_end() do {} while (0)
274 #endif /* defined(_KERNEL) */
275
276 #endif /* __aarch64__ */
277diff --git a/include/linux/simd_x86.h b/include/linux/simd_x86.h
08743f90 278index 0489bfaa3..641f43955 100644
f43dbfa7
FG
279--- a/include/linux/simd_x86.h
280+++ b/include/linux/simd_x86.h
08743f90 281@@ -92,33 +92,135 @@
f43dbfa7
FG
282 #include <asm/xcr.h>
283 #endif
284
285+/*
286+ * The following cases are for kernels which export either the
287+ * kernel_fpu_* or __kernel_fpu_* functions.
288+ */
289+#if defined(KERNEL_EXPORTS_X86_FPU)
290+
291+#define kfpu_allowed() 1
292+#define kfpu_initialize(tsk) do {} while (0)
293+
294 #if defined(HAVE_UNDERSCORE_KERNEL_FPU)
295 #define kfpu_begin() \
296-{ \
297- preempt_disable(); \
298+{ \
299+ preempt_disable(); \
300 __kernel_fpu_begin(); \
301 }
302-#define kfpu_end() \
303-{ \
304- __kernel_fpu_end(); \
305- preempt_enable(); \
306+#define kfpu_end() \
307+{ \
308+ __kernel_fpu_end(); \
309+ preempt_enable(); \
310 }
311+
312 #elif defined(HAVE_KERNEL_FPU)
313-#define kfpu_begin() kernel_fpu_begin()
314+#define kfpu_begin() kernel_fpu_begin()
315 #define kfpu_end() kernel_fpu_end()
316+
317 #else
318-/* Kernel doesn't export any kernel_fpu_* functions */
319-#include <asm/fpu/internal.h> /* For kernel xgetbv() */
320-#define kfpu_begin() panic("This code should never run")
321-#define kfpu_end() panic("This code should never run")
322-#endif /* defined(HAVE_KERNEL_FPU) */
323+/*
324+ * This case is unreachable. When KERNEL_EXPORTS_X86_FPU is defined then
325+ * either HAVE_UNDERSCORE_KERNEL_FPU or HAVE_KERNEL_FPU must be defined.
326+ */
327+#error "Unreachable kernel configuration"
328+#endif
329+
330+#else /* defined(KERNEL_EXPORTS_X86_FPU) */
331+/*
332+ * When the kernel_fpu_* symbols are unavailable then provide our own
333+ * versions which allow the FPU to be safely used in kernel threads.
334+ * In practice, this is not a significant restriction for ZFS since the
335+ * vast majority of SIMD operations are performed by the IO pipeline.
336+ */
337
338+/*
339+ * Returns non-zero if FPU operations are allowed in the current context.
340+ */
341+#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
342+#define kfpu_allowed() ((current->flags & PF_KTHREAD) && \
343+ test_thread_flag(TIF_NEED_FPU_LOAD))
344+#elif defined(HAVE_KERNEL_FPU_INITIALIZED)
345+#define kfpu_allowed() ((current->flags & PF_KTHREAD) && \
346+ current->thread.fpu.initialized)
347 #else
348+#define kfpu_allowed() 0
349+#endif
350+
351+static inline void
352+kfpu_initialize(void)
353+{
354+ WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
355+
356+#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
357+ __fpu_invalidate_fpregs_state(&current->thread.fpu);
358+ set_thread_flag(TIF_NEED_FPU_LOAD);
359+#elif defined(HAVE_KERNEL_FPU_INITIALIZED)
360+ __fpu_invalidate_fpregs_state(&current->thread.fpu);
361+ current->thread.fpu.initialized = 1;
362+#endif
363+}
364+
365+static inline void
366+kfpu_begin(void)
367+{
368+ WARN_ON_ONCE(!kfpu_allowed());
369+
370+ /*
371+ * Preemption and interrupts must be disabled for the critical
372+ * region where the FPU state is being modified.
373+ */
374+ preempt_disable();
375+ local_irq_disable();
376+
377+#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
378+ /*
379+ * The current FPU registers need to be preserved by kfpu_begin()
380+ * and restored by kfpu_end(). This is required because we can
381+ * not call __cpu_invalidate_fpregs_state() to invalidate the
382+ * per-cpu FPU state and force them to be restored during a
383+ * context switch.
384+ */
385+ copy_fpregs_to_fpstate(&current->thread.fpu);
386+#elif defined(HAVE_KERNEL_FPU_INITIALIZED)
387+ /*
388+ * There is no need to preserve and restore the FPU registers.
389+ * They will always be restored from the task's stored FPU state
390+ * when switching contexts.
391+ */
392+ WARN_ON_ONCE(current->thread.fpu.initialized == 0);
393+#endif
394+}
395+
396+static inline void
397+kfpu_end(void)
398+{
399+#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
400+ union fpregs_state *state = &current->thread.fpu.state;
401+ int error;
402+
403+ if (use_xsave()) {
404+ error = copy_kernel_to_xregs_err(&state->xsave, -1);
405+ } else if (use_fxsr()) {
406+ error = copy_kernel_to_fxregs_err(&state->fxsave);
407+ } else {
408+ error = copy_kernel_to_fregs_err(&state->fsave);
409+ }
410+ WARN_ON_ONCE(error);
411+#endif
412+
413+ local_irq_enable();
414+ preempt_enable();
415+}
416+#endif /* defined(HAVE_KERNEL_FPU) */
417+
418+#else /* defined(_KERNEL) */
419 /*
420- * fpu dummy methods for userspace
421+ * FPU dummy methods for user space.
422 */
423-#define kfpu_begin() do {} while (0)
424-#define kfpu_end() do {} while (0)
425+#define kfpu_allowed() 1
426+#define kfpu_initialize(tsk) do {} while (0)
427+#define kfpu_begin() do {} while (0)
428+#define kfpu_end() do {} while (0)
429 #endif /* defined(_KERNEL) */
430
431 /*
08743f90 432@@ -300,7 +402,7 @@ __simd_state_enabled(const uint64_t state)
f43dbfa7
FG
433 uint64_t xcr0;
434
435 #if defined(_KERNEL)
436-#if defined(X86_FEATURE_OSXSAVE) && defined(KERNEL_EXPORTS_X86_FPU)
437+#if defined(X86_FEATURE_OSXSAVE)
438 has_osxsave = !!boot_cpu_has(X86_FEATURE_OSXSAVE);
439 #else
440 has_osxsave = B_FALSE;
08743f90 441@@ -330,11 +432,7 @@ static inline boolean_t
f43dbfa7
FG
442 zfs_sse_available(void)
443 {
444 #if defined(_KERNEL)
445-#if defined(KERNEL_EXPORTS_X86_FPU)
446 return (!!boot_cpu_has(X86_FEATURE_XMM));
447-#else
448- return (B_FALSE);
449-#endif
450 #elif !defined(_KERNEL)
451 return (__cpuid_has_sse());
452 #endif
08743f90 453@@ -347,11 +445,7 @@ static inline boolean_t
f43dbfa7
FG
454 zfs_sse2_available(void)
455 {
456 #if defined(_KERNEL)
457-#if defined(KERNEL_EXPORTS_X86_FPU)
458 return (!!boot_cpu_has(X86_FEATURE_XMM2));
459-#else
460- return (B_FALSE);
461-#endif
462 #elif !defined(_KERNEL)
463 return (__cpuid_has_sse2());
464 #endif
08743f90 465@@ -364,11 +458,7 @@ static inline boolean_t
f43dbfa7
FG
466 zfs_sse3_available(void)
467 {
468 #if defined(_KERNEL)
469-#if defined(KERNEL_EXPORTS_X86_FPU)
470 return (!!boot_cpu_has(X86_FEATURE_XMM3));
471-#else
472- return (B_FALSE);
473-#endif
474 #elif !defined(_KERNEL)
475 return (__cpuid_has_sse3());
476 #endif
08743f90 477@@ -381,11 +471,7 @@ static inline boolean_t
f43dbfa7
FG
478 zfs_ssse3_available(void)
479 {
480 #if defined(_KERNEL)
481-#if defined(KERNEL_EXPORTS_X86_FPU)
482 return (!!boot_cpu_has(X86_FEATURE_SSSE3));
483-#else
484- return (B_FALSE);
485-#endif
486 #elif !defined(_KERNEL)
487 return (__cpuid_has_ssse3());
488 #endif
08743f90 489@@ -398,11 +484,7 @@ static inline boolean_t
f43dbfa7
FG
490 zfs_sse4_1_available(void)
491 {
492 #if defined(_KERNEL)
493-#if defined(KERNEL_EXPORTS_X86_FPU)
494 return (!!boot_cpu_has(X86_FEATURE_XMM4_1));
495-#else
496- return (B_FALSE);
497-#endif
498 #elif !defined(_KERNEL)
499 return (__cpuid_has_sse4_1());
500 #endif
08743f90 501@@ -415,11 +497,7 @@ static inline boolean_t
f43dbfa7
FG
502 zfs_sse4_2_available(void)
503 {
504 #if defined(_KERNEL)
505-#if defined(KERNEL_EXPORTS_X86_FPU)
506 return (!!boot_cpu_has(X86_FEATURE_XMM4_2));
507-#else
508- return (B_FALSE);
509-#endif
510 #elif !defined(_KERNEL)
511 return (__cpuid_has_sse4_2());
512 #endif
08743f90 513@@ -433,11 +511,7 @@ zfs_avx_available(void)
f43dbfa7
FG
514 {
515 boolean_t has_avx;
516 #if defined(_KERNEL)
517-#if defined(KERNEL_EXPORTS_X86_FPU)
518 has_avx = !!boot_cpu_has(X86_FEATURE_AVX);
519-#else
520- has_avx = B_FALSE;
521-#endif
522 #elif !defined(_KERNEL)
523 has_avx = __cpuid_has_avx();
524 #endif
08743f90 525@@ -453,11 +527,7 @@ zfs_avx2_available(void)
f43dbfa7
FG
526 {
527 boolean_t has_avx2;
528 #if defined(_KERNEL)
529-#if defined(X86_FEATURE_AVX2) && defined(KERNEL_EXPORTS_X86_FPU)
530 has_avx2 = !!boot_cpu_has(X86_FEATURE_AVX2);
531-#else
532- has_avx2 = B_FALSE;
533-#endif
534 #elif !defined(_KERNEL)
535 has_avx2 = __cpuid_has_avx2();
536 #endif
08743f90 537@@ -472,7 +542,7 @@ static inline boolean_t
f43dbfa7
FG
538 zfs_bmi1_available(void)
539 {
540 #if defined(_KERNEL)
541-#if defined(X86_FEATURE_BMI1) && defined(KERNEL_EXPORTS_X86_FPU)
542+#if defined(X86_FEATURE_BMI1)
543 return (!!boot_cpu_has(X86_FEATURE_BMI1));
544 #else
545 return (B_FALSE);
08743f90 546@@ -489,7 +559,7 @@ static inline boolean_t
f43dbfa7
FG
547 zfs_bmi2_available(void)
548 {
549 #if defined(_KERNEL)
550-#if defined(X86_FEATURE_BMI2) && defined(KERNEL_EXPORTS_X86_FPU)
551+#if defined(X86_FEATURE_BMI2)
552 return (!!boot_cpu_has(X86_FEATURE_BMI2));
553 #else
554 return (B_FALSE);
08743f90 555@@ -506,7 +576,7 @@ static inline boolean_t
f43dbfa7
FG
556 zfs_aes_available(void)
557 {
558 #if defined(_KERNEL)
559-#if defined(X86_FEATURE_AES) && defined(KERNEL_EXPORTS_X86_FPU)
560+#if defined(X86_FEATURE_AES)
561 return (!!boot_cpu_has(X86_FEATURE_AES));
562 #else
563 return (B_FALSE);
08743f90 564@@ -523,7 +593,7 @@ static inline boolean_t
f43dbfa7
FG
565 zfs_pclmulqdq_available(void)
566 {
567 #if defined(_KERNEL)
568-#if defined(X86_FEATURE_PCLMULQDQ) && defined(KERNEL_EXPORTS_X86_FPU)
569+#if defined(X86_FEATURE_PCLMULQDQ)
570 return (!!boot_cpu_has(X86_FEATURE_PCLMULQDQ));
571 #else
572 return (B_FALSE);
08743f90 573@@ -557,7 +627,7 @@ zfs_avx512f_available(void)
f43dbfa7
FG
574 boolean_t has_avx512 = B_FALSE;
575
576 #if defined(_KERNEL)
577-#if defined(X86_FEATURE_AVX512F) && defined(KERNEL_EXPORTS_X86_FPU)
578+#if defined(X86_FEATURE_AVX512F)
579 has_avx512 = !!boot_cpu_has(X86_FEATURE_AVX512F);
580 #else
581 has_avx512 = B_FALSE;
08743f90 582@@ -576,7 +646,7 @@ zfs_avx512cd_available(void)
f43dbfa7
FG
583 boolean_t has_avx512 = B_FALSE;
584
585 #if defined(_KERNEL)
586-#if defined(X86_FEATURE_AVX512CD) && defined(KERNEL_EXPORTS_X86_FPU)
587+#if defined(X86_FEATURE_AVX512CD)
588 has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
589 boot_cpu_has(X86_FEATURE_AVX512CD);
590 #else
08743f90 591@@ -596,7 +666,7 @@ zfs_avx512er_available(void)
f43dbfa7
FG
592 boolean_t has_avx512 = B_FALSE;
593
594 #if defined(_KERNEL)
595-#if defined(X86_FEATURE_AVX512ER) && defined(KERNEL_EXPORTS_X86_FPU)
596+#if defined(X86_FEATURE_AVX512ER)
597 has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
598 boot_cpu_has(X86_FEATURE_AVX512ER);
599 #else
08743f90 600@@ -616,7 +686,7 @@ zfs_avx512pf_available(void)
f43dbfa7
FG
601 boolean_t has_avx512 = B_FALSE;
602
603 #if defined(_KERNEL)
604-#if defined(X86_FEATURE_AVX512PF) && defined(KERNEL_EXPORTS_X86_FPU)
605+#if defined(X86_FEATURE_AVX512PF)
606 has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
607 boot_cpu_has(X86_FEATURE_AVX512PF);
608 #else
08743f90 609@@ -636,7 +706,7 @@ zfs_avx512bw_available(void)
f43dbfa7
FG
610 boolean_t has_avx512 = B_FALSE;
611
612 #if defined(_KERNEL)
613-#if defined(X86_FEATURE_AVX512BW) && defined(KERNEL_EXPORTS_X86_FPU)
614+#if defined(X86_FEATURE_AVX512BW)
615 has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
616 boot_cpu_has(X86_FEATURE_AVX512BW);
617 #else
08743f90 618@@ -656,7 +726,7 @@ zfs_avx512dq_available(void)
f43dbfa7
FG
619 boolean_t has_avx512 = B_FALSE;
620
621 #if defined(_KERNEL)
622-#if defined(X86_FEATURE_AVX512DQ) && defined(KERNEL_EXPORTS_X86_FPU)
623+#if defined(X86_FEATURE_AVX512DQ)
624 has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
625 boot_cpu_has(X86_FEATURE_AVX512DQ);
626 #else
08743f90 627@@ -676,7 +746,7 @@ zfs_avx512vl_available(void)
f43dbfa7
FG
628 boolean_t has_avx512 = B_FALSE;
629
630 #if defined(_KERNEL)
631-#if defined(X86_FEATURE_AVX512VL) && defined(KERNEL_EXPORTS_X86_FPU)
632+#if defined(X86_FEATURE_AVX512VL)
633 has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
634 boot_cpu_has(X86_FEATURE_AVX512VL);
635 #else
08743f90 636@@ -696,7 +766,7 @@ zfs_avx512ifma_available(void)
f43dbfa7
FG
637 boolean_t has_avx512 = B_FALSE;
638
639 #if defined(_KERNEL)
640-#if defined(X86_FEATURE_AVX512IFMA) && defined(KERNEL_EXPORTS_X86_FPU)
641+#if defined(X86_FEATURE_AVX512IFMA)
642 has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
643 boot_cpu_has(X86_FEATURE_AVX512IFMA);
644 #else
08743f90 645@@ -716,7 +786,7 @@ zfs_avx512vbmi_available(void)
f43dbfa7
FG
646 boolean_t has_avx512 = B_FALSE;
647
648 #if defined(_KERNEL)
649-#if defined(X86_FEATURE_AVX512VBMI) && defined(KERNEL_EXPORTS_X86_FPU)
650+#if defined(X86_FEATURE_AVX512VBMI)
651 has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
652 boot_cpu_has(X86_FEATURE_AVX512VBMI);
653 #else
654diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h
655index 2ce32469d..0ce2b5ea1 100644
656--- a/include/sys/vdev_raidz.h
657+++ b/include/sys/vdev_raidz.h
658@@ -51,7 +51,7 @@ int vdev_raidz_reconstruct(struct raidz_map *, const int *, int);
659 */
660 void vdev_raidz_math_init(void);
661 void vdev_raidz_math_fini(void);
662-struct raidz_impl_ops *vdev_raidz_math_get_ops(void);
663+const struct raidz_impl_ops *vdev_raidz_math_get_ops(void);
664 int vdev_raidz_math_generate(struct raidz_map *);
665 int vdev_raidz_math_reconstruct(struct raidz_map *, const int *, const int *,
666 const int);
667diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h
668index 0799ed19d..4969d110b 100644
669--- a/include/sys/vdev_raidz_impl.h
670+++ b/include/sys/vdev_raidz_impl.h
671@@ -126,7 +126,7 @@ typedef struct raidz_map {
672 uintptr_t rm_reports; /* # of referencing checksum reports */
673 uint8_t rm_freed; /* map no longer has referencing ZIO */
674 uint8_t rm_ecksuminjected; /* checksum error was injected */
675- raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */
676+ const raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */
677 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
678 } raidz_map_t;
679
f43dbfa7 680diff --git a/module/icp/algs/aes/aes_impl.c b/module/icp/algs/aes/aes_impl.c
08743f90 681index 36e0686a5..0f11f9999 100644
f43dbfa7
FG
682--- a/module/icp/algs/aes/aes_impl.c
683+++ b/module/icp/algs/aes/aes_impl.c
684@@ -27,6 +27,7 @@
685 #include <sys/crypto/spi.h>
686 #include <modes/modes.h>
687 #include <aes/aes_impl.h>
688+#include <linux/simd.h>
689
690 /*
691 * Initialize AES encryption and decryption key schedules.
692@@ -40,9 +41,9 @@
693 void
694 aes_init_keysched(const uint8_t *cipherKey, uint_t keyBits, void *keysched)
695 {
696- aes_impl_ops_t *ops = aes_impl_get_ops();
697- aes_key_t *newbie = keysched;
698- uint_t keysize, i, j;
699+ const aes_impl_ops_t *ops = aes_impl_get_ops();
700+ aes_key_t *newbie = keysched;
701+ uint_t keysize, i, j;
702 union {
703 uint64_t ka64[4];
704 uint32_t ka32[8];
705@@ -252,12 +253,17 @@ static size_t aes_supp_impl_cnt = 0;
706 static aes_impl_ops_t *aes_supp_impl[ARRAY_SIZE(aes_all_impl)];
707
708 /*
709- * Selects the aes operations for encrypt/decrypt/key setup
710+ * Returns the AES operations for encrypt/decrypt/key setup. When a
711+ * SIMD implementation is not allowed in the current context, then
712+ * fallback to the fastest generic implementation.
713 */
714-aes_impl_ops_t *
715-aes_impl_get_ops()
716+const aes_impl_ops_t *
717+aes_impl_get_ops(void)
718 {
719- aes_impl_ops_t *ops = NULL;
720+ if (!kfpu_allowed())
721+ return (&aes_generic_impl);
722+
723+ const aes_impl_ops_t *ops = NULL;
724 const uint32_t impl = AES_IMPL_READ(icp_aes_impl);
725
726 switch (impl) {
727@@ -266,15 +272,13 @@ aes_impl_get_ops()
728 ops = &aes_fastest_impl;
729 break;
730 case IMPL_CYCLE:
731- {
732+ /* Cycle through supported implementations */
733 ASSERT(aes_impl_initialized);
734 ASSERT3U(aes_supp_impl_cnt, >, 0);
735- /* Cycle through supported implementations */
736 static size_t cycle_impl_idx = 0;
737 size_t idx = (++cycle_impl_idx) % aes_supp_impl_cnt;
738 ops = aes_supp_impl[idx];
739- }
740- break;
741+ break;
742 default:
743 ASSERT3U(impl, <, aes_supp_impl_cnt);
744 ASSERT3U(aes_supp_impl_cnt, >, 0);
745@@ -288,13 +292,17 @@ aes_impl_get_ops()
746 return (ops);
747 }
748
749+/*
750+ * Initialize all supported implementations.
751+ */
752+/* ARGSUSED */
753 void
754-aes_impl_init(void)
755+aes_impl_init(void *arg)
756 {
757 aes_impl_ops_t *curr_impl;
758 int i, c;
759
760- /* move supported impl into aes_supp_impls */
761+ /* Move supported implementations into aes_supp_impls */
762 for (i = 0, c = 0; i < ARRAY_SIZE(aes_all_impl); i++) {
763 curr_impl = (aes_impl_ops_t *)aes_all_impl[i];
764
765diff --git a/module/icp/algs/aes/aes_impl_aesni.c b/module/icp/algs/aes/aes_impl_aesni.c
766index 97f7c3a47..222c176aa 100644
767--- a/module/icp/algs/aes/aes_impl_aesni.c
768+++ b/module/icp/algs/aes/aes_impl_aesni.c
769@@ -108,7 +108,7 @@ aes_aesni_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4],
770 static boolean_t
771 aes_aesni_will_work(void)
772 {
773- return (zfs_aes_available());
774+ return (kfpu_allowed() && zfs_aes_available());
775 }
776
777 const aes_impl_ops_t aes_aesni_impl = {
778diff --git a/module/icp/algs/modes/gcm.c b/module/icp/algs/modes/gcm.c
08743f90 779index 0afd957f0..423b70e2c 100644
f43dbfa7
FG
780--- a/module/icp/algs/modes/gcm.c
781+++ b/module/icp/algs/modes/gcm.c
782@@ -29,6 +29,7 @@
783 #include <sys/crypto/impl.h>
784 #include <sys/byteorder.h>
785 #include <modes/gcm_impl.h>
786+#include <linux/simd.h>
787
788 #define GHASH(c, d, t, o) \
789 xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
790@@ -46,7 +47,7 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
791 void (*copy_block)(uint8_t *, uint8_t *),
792 void (*xor_block)(uint8_t *, uint8_t *))
793 {
794- gcm_impl_ops_t *gops;
795+ const gcm_impl_ops_t *gops;
796 size_t remainder = length;
797 size_t need = 0;
798 uint8_t *datap = (uint8_t *)data;
799@@ -168,7 +169,7 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
800 void (*copy_block)(uint8_t *, uint8_t *),
801 void (*xor_block)(uint8_t *, uint8_t *))
802 {
803- gcm_impl_ops_t *gops;
804+ const gcm_impl_ops_t *gops;
805 uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
806 uint8_t *ghash, *macp = NULL;
807 int i, rv;
808@@ -320,7 +321,7 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
809 int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
810 void (*xor_block)(uint8_t *, uint8_t *))
811 {
812- gcm_impl_ops_t *gops;
813+ const gcm_impl_ops_t *gops;
814 size_t pt_len;
815 size_t remainder;
816 uint8_t *ghash;
817@@ -427,7 +428,7 @@ gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len,
818 void (*copy_block)(uint8_t *, uint8_t *),
819 void (*xor_block)(uint8_t *, uint8_t *))
820 {
821- gcm_impl_ops_t *gops;
822+ const gcm_impl_ops_t *gops;
823 uint8_t *cb;
824 ulong_t remainder = iv_len;
825 ulong_t processed = 0;
826@@ -481,7 +482,7 @@ gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
827 void (*copy_block)(uint8_t *, uint8_t *),
828 void (*xor_block)(uint8_t *, uint8_t *))
829 {
830- gcm_impl_ops_t *gops;
831+ const gcm_impl_ops_t *gops;
832 uint8_t *ghash, *datap, *authp;
833 size_t remainder, processed;
834
835@@ -660,12 +661,17 @@ static size_t gcm_supp_impl_cnt = 0;
836 static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];
837
838 /*
839- * Selects the gcm operation
840+ * Returns the GCM operations for encrypt/decrypt/key setup. When a
841+ * SIMD implementation is not allowed in the current context, then
842+ * fallback to the fastest generic implementation.
843 */
844-gcm_impl_ops_t *
845+const gcm_impl_ops_t *
846 gcm_impl_get_ops()
847 {
848- gcm_impl_ops_t *ops = NULL;
849+ if (!kfpu_allowed())
850+ return (&gcm_generic_impl);
851+
852+ const gcm_impl_ops_t *ops = NULL;
853 const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
854
855 switch (impl) {
856@@ -674,15 +680,13 @@ gcm_impl_get_ops()
857 ops = &gcm_fastest_impl;
858 break;
859 case IMPL_CYCLE:
860- {
861+ /* Cycle through supported implementations */
862 ASSERT(gcm_impl_initialized);
863 ASSERT3U(gcm_supp_impl_cnt, >, 0);
864- /* Cycle through supported implementations */
865 static size_t cycle_impl_idx = 0;
866 size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
867 ops = gcm_supp_impl[idx];
868- }
869- break;
870+ break;
871 default:
872 ASSERT3U(impl, <, gcm_supp_impl_cnt);
873 ASSERT3U(gcm_supp_impl_cnt, >, 0);
874@@ -696,13 +700,17 @@ gcm_impl_get_ops()
875 return (ops);
876 }
877
878+/*
879+ * Initialize all supported implementations.
880+ */
881+/* ARGSUSED */
882 void
883-gcm_impl_init(void)
884+gcm_impl_init(void *arg)
885 {
886 gcm_impl_ops_t *curr_impl;
887 int i, c;
888
889- /* move supported impl into aes_supp_impls */
890+ /* Move supported implementations into gcm_supp_impls */
891 for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
892 curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];
893
894@@ -711,7 +719,10 @@ gcm_impl_init(void)
895 }
896 gcm_supp_impl_cnt = c;
897
898- /* set fastest implementation. assume hardware accelerated is fastest */
899+ /*
900+ * Set the fastest implementation given the assumption that the
901+ * hardware accelerated version is the fastest.
902+ */
903 #if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
08743f90 904 if (gcm_pclmulqdq_impl.is_supported()) {
f43dbfa7
FG
905 memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
906diff --git a/module/icp/algs/modes/gcm_pclmulqdq.c b/module/icp/algs/modes/gcm_pclmulqdq.c
907index be00ba37b..8a43ba33a 100644
908--- a/module/icp/algs/modes/gcm_pclmulqdq.c
909+++ b/module/icp/algs/modes/gcm_pclmulqdq.c
910@@ -52,7 +52,7 @@ gcm_pclmulqdq_mul(uint64_t *x_in, uint64_t *y, uint64_t *res)
911 static boolean_t
912 gcm_pclmulqdq_will_work(void)
913 {
914- return (zfs_pclmulqdq_available());
915+ return (kfpu_allowed() && zfs_pclmulqdq_available());
916 }
917
918 const gcm_impl_ops_t gcm_pclmulqdq_impl = {
08743f90
TL
919diff --git a/module/icp/include/aes/aes_impl.h b/module/icp/include/aes/aes_impl.h
920index 3a3de91cf..329e32a8e 100644
921--- a/module/icp/include/aes/aes_impl.h
922+++ b/module/icp/include/aes/aes_impl.h
923@@ -198,12 +198,12 @@ extern const aes_impl_ops_t aes_aesni_impl;
924 /*
925 * Initializes fastest implementation
926 */
927-void aes_impl_init(void);
928+void aes_impl_init(void *arg);
929
930 /*
931- * Get selected aes implementation
932+ * Returns optimal allowed AES implementation
933 */
934-struct aes_impl_ops *aes_impl_get_ops(void);
935+const struct aes_impl_ops *aes_impl_get_ops(void);
936
937 #ifdef __cplusplus
938 }
939diff --git a/module/icp/include/modes/gcm_impl.h b/module/icp/include/modes/gcm_impl.h
940index b78cc8aab..dff372ef8 100644
941--- a/module/icp/include/modes/gcm_impl.h
942+++ b/module/icp/include/modes/gcm_impl.h
943@@ -61,12 +61,12 @@ extern const gcm_impl_ops_t gcm_pclmulqdq_impl;
944 /*
945 * Initializes fastest implementation
946 */
947-void gcm_impl_init(void);
948+void gcm_impl_init(void *arg);
949
950 /*
951- * Get selected aes implementation
952+ * Returns optimal allowed GCM implementation
953 */
954-struct gcm_impl_ops *gcm_impl_get_ops(void);
955+const struct gcm_impl_ops *gcm_impl_get_ops(void);
956
957 #ifdef __cplusplus
958 }
f43dbfa7
FG
959diff --git a/module/icp/io/aes.c b/module/icp/io/aes.c
960index 53b193693..51538bc60 100644
961--- a/module/icp/io/aes.c
962+++ b/module/icp/io/aes.c
963@@ -206,9 +206,35 @@ aes_mod_init(void)
964 {
965 int ret;
966
967- /* find fastest implementations and set any requested implementations */
968- aes_impl_init();
969- gcm_impl_init();
970+#if defined(_KERNEL)
971+ /*
972+ * Determine the fastest available implementation. The benchmarks
973+ * are run in dedicated kernel threads to allow Linux 5.0+ kernels
974+ * to use SIMD operations. If for some reason this isn't possible,
975+ * fall back to the generic implementations. See the comment in
976+ * include/linux/simd_x86.h for additional details. Additionally,
977+ * this has the benefit of allowing them to be run in parallel.
978+ */
979+ taskqid_t aes_id = taskq_dispatch(system_taskq, aes_impl_init,
980+ NULL, TQ_SLEEP);
981+ taskqid_t gcm_id = taskq_dispatch(system_taskq, gcm_impl_init,
982+ NULL, TQ_SLEEP);
983+
984+ if (aes_id != TASKQID_INVALID) {
985+ taskq_wait_id(system_taskq, aes_id);
986+ } else {
987+ aes_impl_init(NULL);
988+ }
989+
990+ if (gcm_id != TASKQID_INVALID) {
991+ taskq_wait_id(system_taskq, gcm_id);
992+ } else {
993+ gcm_impl_init(NULL);
994+ }
995+#else
996+ aes_impl_init(NULL);
997+ gcm_impl_init(NULL);
998+#endif
999
1000 if ((ret = mod_install(&modlinkage)) != 0)
1001 return (ret);
1002diff --git a/module/spl/spl-taskq.c b/module/spl/spl-taskq.c
08743f90 1003index a39f94e4c..69d591ff7 100644
f43dbfa7
FG
1004--- a/module/spl/spl-taskq.c
1005+++ b/module/spl/spl-taskq.c
08743f90 1006@@ -28,6 +28,7 @@
f43dbfa7
FG
1007 #include <sys/taskq.h>
1008 #include <sys/kmem.h>
1009 #include <sys/tsd.h>
1010+#include <linux/simd.h>
1011
1012 int spl_taskq_thread_bind = 0;
1013 module_param(spl_taskq_thread_bind, int, 0644);
08743f90 1014@@ -853,6 +854,7 @@ taskq_thread(void *args)
f43dbfa7
FG
1015 sigfillset(&blocked);
1016 sigprocmask(SIG_BLOCK, &blocked, NULL);
1017 flush_signals(current);
1018+ kfpu_initialize();
1019
1020 tsd_set(taskq_tsd, tq);
1021 spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
1022diff --git a/module/spl/spl-thread.c b/module/spl/spl-thread.c
08743f90 1023index 0352a31ea..07e3a1bff 100644
f43dbfa7
FG
1024--- a/module/spl/spl-thread.c
1025+++ b/module/spl/spl-thread.c
1026@@ -27,6 +27,7 @@
1027 #include <sys/thread.h>
1028 #include <sys/kmem.h>
1029 #include <sys/tsd.h>
1030+#include <linux/simd.h>
1031
1032 /*
1033 * Thread interfaces
1034@@ -54,6 +55,7 @@ thread_generic_wrapper(void *arg)
1035 args = tp->tp_args;
1036 set_current_state(tp->tp_state);
1037 set_user_nice((kthread_t *)current, PRIO_TO_NICE(tp->tp_pri));
1038+ kfpu_initialize();
1039 kmem_free(tp->tp_name, tp->tp_name_size);
1040 kmem_free(tp, sizeof (thread_priv_t));
1041
1042diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c
08743f90 1043index f712ce40c..9187a7c1e 100644
f43dbfa7
FG
1044--- a/module/zcommon/zfs_fletcher.c
1045+++ b/module/zcommon/zfs_fletcher.c
1046@@ -140,6 +140,7 @@
1047 #include <sys/zio_checksum.h>
1048 #include <sys/zfs_context.h>
1049 #include <zfs_fletcher.h>
1050+#include <linux/simd.h>
1051
1052 #define FLETCHER_MIN_SIMD_SIZE 64
1053
1054@@ -205,21 +206,19 @@ static struct fletcher_4_impl_selector {
1055 const char *fis_name;
1056 uint32_t fis_sel;
1057 } fletcher_4_impl_selectors[] = {
1058-#if !defined(_KERNEL)
1059 { "cycle", IMPL_CYCLE },
1060-#endif
1061 { "fastest", IMPL_FASTEST },
1062 { "scalar", IMPL_SCALAR }
1063 };
1064
1065 #if defined(_KERNEL)
1066 static kstat_t *fletcher_4_kstat;
1067-#endif
1068
1069 static struct fletcher_4_kstat {
1070 uint64_t native;
1071 uint64_t byteswap;
1072 } fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1];
1073+#endif
1074
1075 /* Indicate that benchmark has been completed */
1076 static boolean_t fletcher_4_initialized = B_FALSE;
1077@@ -408,32 +407,36 @@ fletcher_4_impl_set(const char *val)
1078 return (err);
1079 }
1080
1081+/*
1082+ * Returns the Fletcher 4 operations for checksums. When a SIMD
1083+ * implementation is not allowed in the current context, then fall back
1084+ * to the fastest generic implementation.
1085+ */
1086 static inline const fletcher_4_ops_t *
1087 fletcher_4_impl_get(void)
1088 {
1089- fletcher_4_ops_t *ops = NULL;
1090- const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
1091+ if (!kfpu_allowed())
1092+ return (&fletcher_4_superscalar4_ops);
1093+
1094+ const fletcher_4_ops_t *ops = NULL;
1095+ uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
1096
1097 switch (impl) {
1098 case IMPL_FASTEST:
1099 ASSERT(fletcher_4_initialized);
1100 ops = &fletcher_4_fastest_impl;
1101 break;
1102-#if !defined(_KERNEL)
1103- case IMPL_CYCLE: {
1104+ case IMPL_CYCLE:
1105+ /* Cycle through supported implementations */
1106 ASSERT(fletcher_4_initialized);
1107 ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
1108-
1109 static uint32_t cycle_count = 0;
1110 uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt;
1111 ops = fletcher_4_supp_impls[idx];
1112- }
1113- break;
1114-#endif
1115+ break;
1116 default:
1117 ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
1118 ASSERT3U(impl, <, fletcher_4_supp_impls_cnt);
1119-
1120 ops = fletcher_4_supp_impls[impl];
1121 break;
1122 }
08743f90 1123@@ -659,6 +662,7 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
f43dbfa7
FG
1124 typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *,
1125 zio_cksum_t *);
1126
1127+#if defined(_KERNEL)
1128 static void
1129 fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
1130 {
1131@@ -716,16 +720,18 @@ fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
1132 /* restore original selection */
1133 atomic_swap_32(&fletcher_4_impl_chosen, sel_save);
1134 }
1135+#endif /* _KERNEL */
1136
1137-void
1138-fletcher_4_init(void)
1139+/*
1140+ * Initialize and benchmark all supported implementations.
1141+ */
1142+static void
1143+fletcher_4_benchmark(void *arg)
1144 {
1145- static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
1146 fletcher_4_ops_t *curr_impl;
1147- char *databuf;
1148 int i, c;
1149
1150- /* move supported impl into fletcher_4_supp_impls */
1151+ /* Move supported implementations into fletcher_4_supp_impls */
1152 for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) {
1153 curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i];
1154
1155@@ -735,19 +741,10 @@ fletcher_4_init(void)
1156 membar_producer(); /* complete fletcher_4_supp_impls[] init */
1157 fletcher_4_supp_impls_cnt = c; /* number of supported impl */
1158
1159-#if !defined(_KERNEL)
1160- /* Skip benchmarking and use last implementation as fastest */
1161- memcpy(&fletcher_4_fastest_impl,
1162- fletcher_4_supp_impls[fletcher_4_supp_impls_cnt-1],
1163- sizeof (fletcher_4_fastest_impl));
1164- fletcher_4_fastest_impl.name = "fastest";
1165- membar_producer();
1166+#if defined(_KERNEL)
1167+ static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
1168+ char *databuf = vmem_alloc(data_size, KM_SLEEP);
1169
1170- fletcher_4_initialized = B_TRUE;
1171- return;
1172-#endif
1173- /* Benchmark all supported implementations */
1174- databuf = vmem_alloc(data_size, KM_SLEEP);
1175 for (i = 0; i < data_size / sizeof (uint64_t); i++)
1176 ((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */
1177
1178@@ -755,9 +752,38 @@ fletcher_4_init(void)
1179 fletcher_4_benchmark_impl(B_TRUE, databuf, data_size);
1180
1181 vmem_free(databuf, data_size);
1182+#else
1183+ /*
1184+ * Skip the benchmark in user space to avoid impacting libzpool
1185+ * consumers (zdb, zhack, zinject, ztest). The last implementation
1186+ * is assumed to be the fastest and used by default.
1187+ */
1188+ memcpy(&fletcher_4_fastest_impl,
1189+ fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1],
1190+ sizeof (fletcher_4_fastest_impl));
1191+ fletcher_4_fastest_impl.name = "fastest";
1192+ membar_producer();
1193+#endif /* _KERNEL */
1194+}
1195
1196+void
1197+fletcher_4_init(void)
1198+{
1199 #if defined(_KERNEL)
1200- /* install kstats for all implementations */
1201+ /*
1202+ * For 5.0 and later Linux kernels the fletcher 4 benchmarks are
1203+ * run in a kernel thread. This is needed to take advantage of the
1204+ * SIMD functionality, see include/linux/simd_x86.h for details.
1205+ */
1206+ taskqid_t id = taskq_dispatch(system_taskq, fletcher_4_benchmark,
1207+ NULL, TQ_SLEEP);
1208+ if (id != TASKQID_INVALID) {
1209+ taskq_wait_id(system_taskq, id);
1210+ } else {
1211+ fletcher_4_benchmark(NULL);
1212+ }
1213+
1214+ /* Install kstats for all implementations */
1215 fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",
1216 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
1217 if (fletcher_4_kstat != NULL) {
1218@@ -769,6 +795,8 @@ fletcher_4_init(void)
1219 fletcher_4_kstat_addr);
1220 kstat_install(fletcher_4_kstat);
1221 }
1222+#else
1223+ fletcher_4_benchmark(NULL);
1224 #endif
1225
1226 /* Finish initialization */
1227diff --git a/module/zcommon/zfs_fletcher_aarch64_neon.c b/module/zcommon/zfs_fletcher_aarch64_neon.c
1228index bd2db2b20..3b3c1b52b 100644
1229--- a/module/zcommon/zfs_fletcher_aarch64_neon.c
1230+++ b/module/zcommon/zfs_fletcher_aarch64_neon.c
1231@@ -198,7 +198,7 @@ unsigned char SRC __attribute__((vector_size(16)));
1232
1233 static boolean_t fletcher_4_aarch64_neon_valid(void)
1234 {
1235- return (B_TRUE);
1236+ return (kfpu_allowed());
1237 }
1238
1239 const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = {
1240diff --git a/module/zcommon/zfs_fletcher_avx512.c b/module/zcommon/zfs_fletcher_avx512.c
1241index 7260a9864..0d4cff21a 100644
1242--- a/module/zcommon/zfs_fletcher_avx512.c
1243+++ b/module/zcommon/zfs_fletcher_avx512.c
1244@@ -157,7 +157,7 @@ STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_byteswap);
1245 static boolean_t
1246 fletcher_4_avx512f_valid(void)
1247 {
1248- return (zfs_avx512f_available());
1249+ return (kfpu_allowed() && zfs_avx512f_available());
1250 }
1251
1252 const fletcher_4_ops_t fletcher_4_avx512f_ops = {
1253diff --git a/module/zcommon/zfs_fletcher_intel.c b/module/zcommon/zfs_fletcher_intel.c
1254index 6dac047da..7f12efe6d 100644
1255--- a/module/zcommon/zfs_fletcher_intel.c
1256+++ b/module/zcommon/zfs_fletcher_intel.c
1257@@ -156,7 +156,7 @@ fletcher_4_avx2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
1258
1259 static boolean_t fletcher_4_avx2_valid(void)
1260 {
1261- return (zfs_avx_available() && zfs_avx2_available());
1262+ return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available());
1263 }
1264
1265 const fletcher_4_ops_t fletcher_4_avx2_ops = {
1266diff --git a/module/zcommon/zfs_fletcher_sse.c b/module/zcommon/zfs_fletcher_sse.c
1267index a0b42e5f5..e6389d6e5 100644
1268--- a/module/zcommon/zfs_fletcher_sse.c
1269+++ b/module/zcommon/zfs_fletcher_sse.c
1270@@ -157,7 +157,7 @@ fletcher_4_sse2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
1271
1272 static boolean_t fletcher_4_sse2_valid(void)
1273 {
1274- return (zfs_sse2_available());
1275+ return (kfpu_allowed() && zfs_sse2_available());
1276 }
1277
1278 const fletcher_4_ops_t fletcher_4_sse2_ops = {
1279@@ -214,7 +214,8 @@ fletcher_4_ssse3_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
1280
1281 static boolean_t fletcher_4_ssse3_valid(void)
1282 {
1283- return (zfs_sse2_available() && zfs_ssse3_available());
1284+ return (kfpu_allowed() && zfs_sse2_available() &&
1285+ zfs_ssse3_available());
1286 }
1287
1288 const fletcher_4_ops_t fletcher_4_ssse3_ops = {
1289diff --git a/module/zfs/vdev_raidz_math.c b/module/zfs/vdev_raidz_math.c
08743f90 1290index 3ef67768f..ef514e9e1 100644
f43dbfa7
FG
1291--- a/module/zfs/vdev_raidz_math.c
1292+++ b/module/zfs/vdev_raidz_math.c
1293@@ -27,9 +27,9 @@
1294 #include <sys/zio.h>
1295 #include <sys/debug.h>
1296 #include <sys/zfs_debug.h>
1297-
1298 #include <sys/vdev_raidz.h>
1299 #include <sys/vdev_raidz_impl.h>
1300+#include <linux/simd.h>
1301
1302 extern boolean_t raidz_will_scalar_work(void);
1303
1304@@ -87,6 +87,7 @@ static uint32_t user_sel_impl = IMPL_FASTEST;
1305 static size_t raidz_supp_impl_cnt = 0;
1306 static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)];
1307
1308+#if defined(_KERNEL)
1309 /*
1310 * kstats values for supported implementations
1311 * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s]
1312@@ -95,14 +96,19 @@ static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1];
1313
1314 /* kstat for benchmarked implementations */
1315 static kstat_t *raidz_math_kstat = NULL;
1316+#endif
1317
1318 /*
1319- * Selects the raidz operation for raidz_map
1320- * If rm_ops is set to NULL original raidz implementation will be used
1321+ * Returns the RAIDZ operations for raidz_map() parity calculations. When
1322+ * a SIMD implementation is not allowed in the current context, then fall back
1323+ * to the fastest generic implementation.
1324 */
1325-raidz_impl_ops_t *
1326-vdev_raidz_math_get_ops()
1327+const raidz_impl_ops_t *
1328+vdev_raidz_math_get_ops(void)
1329 {
1330+ if (!kfpu_allowed())
1331+ return (&vdev_raidz_scalar_impl);
1332+
1333 raidz_impl_ops_t *ops = NULL;
1334 const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl);
1335
1336@@ -111,18 +117,14 @@ vdev_raidz_math_get_ops()
1337 ASSERT(raidz_math_initialized);
1338 ops = &vdev_raidz_fastest_impl;
1339 break;
1340-#if !defined(_KERNEL)
1341 case IMPL_CYCLE:
1342- {
1343+ /* Cycle through all supported implementations */
1344 ASSERT(raidz_math_initialized);
1345 ASSERT3U(raidz_supp_impl_cnt, >, 0);
1346- /* Cycle through all supported implementations */
1347 static size_t cycle_impl_idx = 0;
1348 size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt;
1349 ops = raidz_supp_impl[idx];
1350- }
1351- break;
1352-#endif
1353+ break;
1354 case IMPL_ORIGINAL:
1355 ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl;
1356 break;
1357@@ -273,6 +275,8 @@ const char *raidz_rec_name[] = {
1358 "rec_pq", "rec_pr", "rec_qr", "rec_pqr"
1359 };
1360
1361+#if defined(_KERNEL)
1362+
1363 #define RAIDZ_KSTAT_LINE_LEN (17 + 10*12 + 1)
1364
1365 static int
1366@@ -435,21 +439,21 @@ benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn)
1367 }
1368 }
1369 }
1370+#endif
1371
1372-void
1373-vdev_raidz_math_init(void)
1374+/*
1375+ * Initialize and benchmark all supported implementations.
1376+ */
1377+static void
1378+benchmark_raidz(void *arg)
1379 {
1380 raidz_impl_ops_t *curr_impl;
1381- zio_t *bench_zio = NULL;
1382- raidz_map_t *bench_rm = NULL;
1383- uint64_t bench_parity;
1384- int i, c, fn;
1385+ int i, c;
1386
1387- /* move supported impl into raidz_supp_impl */
1388+ /* Move supported impl into raidz_supp_impl */
1389 for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
1390 curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i];
1391
1392- /* initialize impl */
1393 if (curr_impl->init)
1394 curr_impl->init();
1395
1396@@ -459,18 +463,10 @@ vdev_raidz_math_init(void)
1397 membar_producer(); /* complete raidz_supp_impl[] init */
1398 raidz_supp_impl_cnt = c; /* number of supported impl */
1399
1400-#if !defined(_KERNEL)
1401- /* Skip benchmarking and use last implementation as fastest */
1402- memcpy(&vdev_raidz_fastest_impl, raidz_supp_impl[raidz_supp_impl_cnt-1],
1403- sizeof (vdev_raidz_fastest_impl));
1404- strcpy(vdev_raidz_fastest_impl.name, "fastest");
1405-
1406- raidz_math_initialized = B_TRUE;
1407-
1408- /* Use 'cycle' math selection method for userspace */
1409- VERIFY0(vdev_raidz_impl_set("cycle"));
1410- return;
1411-#endif
1412+#if defined(_KERNEL)
1413+ zio_t *bench_zio = NULL;
1414+ raidz_map_t *bench_rm = NULL;
1415+ uint64_t bench_parity;
1416
08743f90 1417 /* Fake a zio and run the benchmark on a warmed up buffer */
f43dbfa7
FG
1418 bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
1419@@ -480,7 +476,7 @@ vdev_raidz_math_init(void)
1420 memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE);
1421
1422 /* Benchmark parity generation methods */
1423- for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
1424+ for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
1425 bench_parity = fn + 1;
1426 /* New raidz_map is needed for each generate_p/q/r */
1427 bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
1428@@ -495,7 +491,7 @@ vdev_raidz_math_init(void)
1429 bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
1430 BENCH_COLS, PARITY_PQR);
1431
1432- for (fn = 0; fn < RAIDZ_REC_NUM; fn++)
1433+ for (int fn = 0; fn < RAIDZ_REC_NUM; fn++)
1434 benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl);
1435
1436 vdev_raidz_map_free(bench_rm);
1437@@ -503,11 +499,39 @@ vdev_raidz_math_init(void)
1438 /* cleanup the bench zio */
1439 abd_free(bench_zio->io_abd);
1440 kmem_free(bench_zio, sizeof (zio_t));
1441+#else
1442+ /*
1443+ * Skip the benchmark in user space to avoid impacting libzpool
1444+ * consumers (zdb, zhack, zinject, ztest). The last implementation
1445+ * is assumed to be the fastest and used by default.
1446+ */
1447+ memcpy(&vdev_raidz_fastest_impl,
1448+ raidz_supp_impl[raidz_supp_impl_cnt - 1],
1449+ sizeof (vdev_raidz_fastest_impl));
1450+ strcpy(vdev_raidz_fastest_impl.name, "fastest");
1451+#endif /* _KERNEL */
1452+}
1453
1454- /* install kstats for all impl */
1455+void
1456+vdev_raidz_math_init(void)
1457+{
1458+#if defined(_KERNEL)
1459+ /*
1460+ * For 5.0 and later Linux kernels the RAIDZ benchmarks are
1461+ * run in a kernel thread. This is needed to take advantage of the
1462+ * SIMD functionality, see include/linux/simd_x86.h for details.
1463+ */
1464+ taskqid_t id = taskq_dispatch(system_taskq, benchmark_raidz,
1465+ NULL, TQ_SLEEP);
1466+ if (id != TASKQID_INVALID) {
1467+ taskq_wait_id(system_taskq, id);
1468+ } else {
1469+ benchmark_raidz(NULL);
1470+ }
1471+
1472+ /* Install kstats for all implementations */
1473 raidz_math_kstat = kstat_create("zfs", 0, "vdev_raidz_bench", "misc",
1474 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
1475-
1476 if (raidz_math_kstat != NULL) {
1477 raidz_math_kstat->ks_data = NULL;
1478 raidz_math_kstat->ks_ndata = UINT32_MAX;
1479@@ -517,6 +541,9 @@ vdev_raidz_math_init(void)
1480 raidz_math_kstat_addr);
1481 kstat_install(raidz_math_kstat);
1482 }
1483+#else
1484+ benchmark_raidz(NULL);
1485+#endif
1486
1487 /* Finish initialization */
1488 atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl);
1489@@ -527,15 +554,15 @@ void
1490 vdev_raidz_math_fini(void)
1491 {
1492 raidz_impl_ops_t const *curr_impl;
1493- int i;
1494
1495+#if defined(_KERNEL)
1496 if (raidz_math_kstat != NULL) {
1497 kstat_delete(raidz_math_kstat);
1498 raidz_math_kstat = NULL;
1499 }
1500+#endif
1501
1502- /* fini impl */
1503- for (i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
1504+ for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
1505 curr_impl = raidz_all_maths[i];
1506 if (curr_impl->fini)
1507 curr_impl->fini();
1508@@ -546,9 +573,7 @@ static const struct {
1509 char *name;
1510 uint32_t sel;
1511 } math_impl_opts[] = {
1512-#if !defined(_KERNEL)
1513 { "cycle", IMPL_CYCLE },
1514-#endif
1515 { "fastest", IMPL_FASTEST },
1516 { "original", IMPL_ORIGINAL },
1517 { "scalar", IMPL_SCALAR }
1518diff --git a/module/zfs/vdev_raidz_math_aarch64_neon.c b/module/zfs/vdev_raidz_math_aarch64_neon.c
1519index e3ad06776..0a67ceb84 100644
1520--- a/module/zfs/vdev_raidz_math_aarch64_neon.c
1521+++ b/module/zfs/vdev_raidz_math_aarch64_neon.c
1522@@ -207,7 +207,7 @@ DEFINE_REC_METHODS(aarch64_neon);
1523 static boolean_t
1524 raidz_will_aarch64_neon_work(void)
1525 {
1526- return (B_TRUE); // __arch64__ requires NEON
1527+ return (kfpu_allowed());
1528 }
1529
1530 const raidz_impl_ops_t vdev_raidz_aarch64_neon_impl = {
1531diff --git a/module/zfs/vdev_raidz_math_aarch64_neonx2.c b/module/zfs/vdev_raidz_math_aarch64_neonx2.c
1532index f8688a06a..e072f51cd 100644
1533--- a/module/zfs/vdev_raidz_math_aarch64_neonx2.c
1534+++ b/module/zfs/vdev_raidz_math_aarch64_neonx2.c
1535@@ -217,7 +217,7 @@ DEFINE_REC_METHODS(aarch64_neonx2);
1536 static boolean_t
1537 raidz_will_aarch64_neonx2_work(void)
1538 {
1539- return (B_TRUE); // __arch64__ requires NEON
1540+ return (kfpu_allowed());
1541 }
1542
1543 const raidz_impl_ops_t vdev_raidz_aarch64_neonx2_impl = {
1544diff --git a/module/zfs/vdev_raidz_math_avx2.c b/module/zfs/vdev_raidz_math_avx2.c
1545index 063d29bcd..a12eb6720 100644
1546--- a/module/zfs/vdev_raidz_math_avx2.c
1547+++ b/module/zfs/vdev_raidz_math_avx2.c
1548@@ -396,7 +396,7 @@ DEFINE_REC_METHODS(avx2);
1549 static boolean_t
1550 raidz_will_avx2_work(void)
1551 {
1552- return (zfs_avx_available() && zfs_avx2_available());
1553+ return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available());
1554 }
1555
1556 const raidz_impl_ops_t vdev_raidz_avx2_impl = {
1557diff --git a/module/zfs/vdev_raidz_math_avx512bw.c b/module/zfs/vdev_raidz_math_avx512bw.c
1558index d605653db..2f545c9ec 100644
1559--- a/module/zfs/vdev_raidz_math_avx512bw.c
1560+++ b/module/zfs/vdev_raidz_math_avx512bw.c
1561@@ -393,9 +393,8 @@ DEFINE_REC_METHODS(avx512bw);
1562 static boolean_t
1563 raidz_will_avx512bw_work(void)
1564 {
1565- return (zfs_avx_available() &&
1566- zfs_avx512f_available() &&
1567- zfs_avx512bw_available());
1568+ return (kfpu_allowed() && zfs_avx_available() &&
1569+ zfs_avx512f_available() && zfs_avx512bw_available());
1570 }
1571
1572 const raidz_impl_ops_t vdev_raidz_avx512bw_impl = {
1573diff --git a/module/zfs/vdev_raidz_math_avx512f.c b/module/zfs/vdev_raidz_math_avx512f.c
1574index f4e4560ce..75af7a8ee 100644
1575--- a/module/zfs/vdev_raidz_math_avx512f.c
1576+++ b/module/zfs/vdev_raidz_math_avx512f.c
1577@@ -470,9 +470,8 @@ DEFINE_REC_METHODS(avx512f);
1578 static boolean_t
1579 raidz_will_avx512f_work(void)
1580 {
1581- return (zfs_avx_available() &&
1582- zfs_avx2_available() &&
1583- zfs_avx512f_available());
1584+ return (kfpu_allowed() && zfs_avx_available() &&
1585+ zfs_avx2_available() && zfs_avx512f_available());
1586 }
1587
1588 const raidz_impl_ops_t vdev_raidz_avx512f_impl = {
1589diff --git a/module/zfs/vdev_raidz_math_sse2.c b/module/zfs/vdev_raidz_math_sse2.c
1590index 9985da273..5b3a9385c 100644
1591--- a/module/zfs/vdev_raidz_math_sse2.c
1592+++ b/module/zfs/vdev_raidz_math_sse2.c
1593@@ -607,7 +607,7 @@ DEFINE_REC_METHODS(sse2);
1594 static boolean_t
1595 raidz_will_sse2_work(void)
1596 {
1597- return (zfs_sse_available() && zfs_sse2_available());
1598+ return (kfpu_allowed() && zfs_sse_available() && zfs_sse2_available());
1599 }
1600
1601 const raidz_impl_ops_t vdev_raidz_sse2_impl = {
1602diff --git a/module/zfs/vdev_raidz_math_ssse3.c b/module/zfs/vdev_raidz_math_ssse3.c
1603index 047a48d54..62247cf8e 100644
1604--- a/module/zfs/vdev_raidz_math_ssse3.c
1605+++ b/module/zfs/vdev_raidz_math_ssse3.c
1606@@ -399,8 +399,8 @@ DEFINE_REC_METHODS(ssse3);
1607 static boolean_t
1608 raidz_will_ssse3_work(void)
1609 {
1610- return (zfs_sse_available() && zfs_sse2_available() &&
1611- zfs_ssse3_available());
1612+ return (kfpu_allowed() && zfs_sse_available() &&
1613+ zfs_sse2_available() && zfs_ssse3_available());
1614 }
1615
1616 const raidz_impl_ops_t vdev_raidz_ssse3_impl = {