From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Brian Behlendorf <behlendorf1@llnl.gov>
Date: Fri, 12 Jul 2019 09:31:20 -0700
Subject: [PATCH] Linux 5.0 compat: SIMD compatibility
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Restore the SIMD optimization for 4.19.38 LTS, 4.14.120 LTS,
and 5.0 and newer kernels. This is accomplished by leveraging
the fact that by definition dedicated kernel threads never need
to concern themselves with saving and restoring the user FPU state.
Therefore, they may use the FPU as long as we can guarantee user
tasks always restore their FPU state before context switching back
to user space.

For the 5.0 and 5.1 kernels disabling preemption and local
interrupts is sufficient to allow the FPU to be used. All non-kernel
threads will restore the preserved user FPU state.

For 5.2 and later kernels the user FPU state restoration will be
skipped if the kernel determines the registers have not changed.
Therefore, for these kernels we need to perform the additional
step of saving and restoring the FPU registers. Invalidating the
per-cpu global tracking the FPU state would force a restore but
that functionality is private to the core x86 FPU implementation
and unavailable.
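A simplified sketch of the resulting begin/end pairing (the complete
version is in the include/linux/simd_x86.h changes below; the 5.2+
case is the TIF_NEED_FPU_LOAD branch):

    static inline void
    kfpu_begin(void)
    {
            /* No preemption or interrupts while the FPU is in use. */
            preempt_disable();
            local_irq_disable();
    #if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
            /* 5.2+ kernels: explicitly preserve the current registers. */
            copy_fpregs_to_fpstate(&current->thread.fpu);
    #endif
    }

kfpu_end() performs the inverse: on 5.2+ kernels it restores the saved
registers before re-enabling interrupts and preemption.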

In practice, restricting SIMD to kernel threads is not a major
restriction for ZFS. The vast majority of SIMD operations are
already performed by the IO pipeline. The remaining cases are
relatively infrequent and can be handled by the generic code
without significant impact. The two most noteworthy cases are:

1) Decrypting the wrapping key for an encrypted dataset,
   i.e. `zfs load-key`. All other encryption and decryption
   operations will use the SIMD optimized implementations.

2) Generating the payload checksums for a `zfs send` stream.

In order to avoid making any changes to the higher layers of ZFS
all of the `*_get_ops()` functions were updated to take into
consideration the calling context. This allows for the fastest
implementation to be used as appropriate (see kfpu_allowed()).
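
As an illustration, the shared dispatch pattern is roughly the
following simplified sketch (the real functions in the diff below
additionally handle the user selected and "cycle" implementations):

    const aes_impl_ops_t *
    aes_impl_get_ops(void)
    {
            /* SIMD is not allowed in this context, use generic code. */
            if (!kfpu_allowed())
                    return (&aes_generic_impl);

            /* Otherwise use the fastest benchmarked implementation. */
            return (&aes_fastest_impl);
    }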

The only other notable instance of SIMD operations being used
outside a kernel thread was at module load time. This code
was moved into a taskq in order to accommodate the new kernel
thread restriction.

Finally, a few other modifications were made in order to further
harden this code and facilitate testing. They include updating
each implementation's operations structure to be declared as a
constant, and allowing "cycle" to be set when selecting the
preferred ops in the kernel as well as user space.

Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #8754
Closes #8793
Closes #8965
(cherry picked from commit e5db31349484e5e859c7a942eb15b98d68ce5b4d)
Signed-off-by: Fabian Grünbichler <f.gruenbichler@proxmox.com>
Signed-off-by: Thomas Lamprecht <t.lamprecht@proxmox.com>
---
 cmd/ztest/ztest.c                           |   3 +
 config/kernel-fpu.m4                        |  46 ++++-
 include/linux/Makefile.am                   |   1 +
 include/linux/simd.h                        |  41 +++++
 include/linux/simd_aarch64.h                |  18 +-
 include/linux/simd_x86.h                    | 192 +++++++++++++-------
 include/sys/vdev_raidz.h                    |   2 +-
 include/sys/vdev_raidz_impl.h               |   2 +-
 module/icp/algs/aes/aes_impl.c              |  34 ++--
 module/icp/algs/aes/aes_impl_aesni.c        |   2 +-
 module/icp/algs/modes/gcm.c                 |  41 +++--
 module/icp/algs/modes/gcm_pclmulqdq.c       |   2 +-
 module/icp/include/aes/aes_impl.h           |   6 +-
 module/icp/include/modes/gcm_impl.h         |   6 +-
 module/icp/io/aes.c                         |  32 +++-
 module/spl/spl-taskq.c                      |   2 +
 module/spl/spl-thread.c                     |   2 +
 module/zcommon/zfs_fletcher.c               |  88 ++++++---
 module/zcommon/zfs_fletcher_aarch64_neon.c  |   2 +-
 module/zcommon/zfs_fletcher_avx512.c        |   2 +-
 module/zcommon/zfs_fletcher_intel.c         |   2 +-
 module/zcommon/zfs_fletcher_sse.c           |   5 +-
 module/zfs/vdev_raidz_math.c                | 105 +++++++----
 module/zfs/vdev_raidz_math_aarch64_neon.c   |   2 +-
 module/zfs/vdev_raidz_math_aarch64_neonx2.c |   2 +-
 module/zfs/vdev_raidz_math_avx2.c           |   2 +-
 module/zfs/vdev_raidz_math_avx512bw.c       |   5 +-
 module/zfs/vdev_raidz_math_avx512f.c        |   5 +-
 module/zfs/vdev_raidz_math_sse2.c           |   2 +-
 module/zfs/vdev_raidz_math_ssse3.c          |   4 +-
 30 files changed, 454 insertions(+), 204 deletions(-)
 create mode 100644 include/linux/simd.h

diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index 3bf840d88..dc22faef7 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -107,6 +107,7 @@
 #include <sys/vdev_impl.h>
 #include <sys/vdev_file.h>
 #include <sys/vdev_initialize.h>
+#include <sys/vdev_raidz.h>
 #include <sys/vdev_trim.h>
 #include <sys/spa_impl.h>
 #include <sys/metaslab_impl.h>
@@ -7110,6 +7111,8 @@ ztest_run(ztest_shared_t *zs)
 	metaslab_preload_limit = ztest_random(20) + 1;
 	ztest_spa = spa;
 
+	VERIFY0(vdev_raidz_impl_set("cycle"));
+
 	dmu_objset_stats_t dds;
 	VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool,
 	    DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os));
diff --git a/config/kernel-fpu.m4 b/config/kernel-fpu.m4
index ebb02fb09..0e622e859 100644
--- a/config/kernel-fpu.m4
+++ b/config/kernel-fpu.m4
@@ -2,8 +2,15 @@ dnl #
 dnl # Handle differences in kernel FPU code.
 dnl #
 dnl # Kernel
-dnl # 5.0: All kernel fpu functions are GPL only, so we can't use them.
-dnl #      (nothing defined)
+dnl # 5.2: The fpu->initialized flag was replaced by TIF_NEED_FPU_LOAD.
+dnl #      HAVE_KERNEL_TIF_NEED_FPU_LOAD
+dnl #
+dnl # 5.0: As an optimization SIMD operations performed by kernel
+dnl #      threads can skip saving and restoring their FPU context.
+dnl #      Wrappers have been introduced to determine the running
+dnl #      context and use either the SIMD or generic implementation.
+dnl #      This change was made to the 4.19.38 and 4.14.120 LTS kernels.
+dnl #      HAVE_KERNEL_FPU_INITIALIZED
 dnl #
 dnl # 4.2: Use __kernel_fpu_{begin,end}()
 dnl #      HAVE_UNDERSCORE_KERNEL_FPU & KERNEL_EXPORTS_X86_FPU
@@ -59,10 +66,39 @@ AC_DEFUN([ZFS_AC_KERNEL_FPU], [
 		__kernel_fpu_end();
 	], [__kernel_fpu_begin], [arch/x86/kernel/fpu/core.c arch/x86/kernel/i387.c], [
 		AC_MSG_RESULT(__kernel_fpu_*)
-		AC_DEFINE(HAVE_UNDERSCORE_KERNEL_FPU, 1, [kernel has __kernel_fpu_* functions])
-		AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1, [kernel exports FPU functions])
+		AC_DEFINE(HAVE_UNDERSCORE_KERNEL_FPU, 1,
+		    [kernel has __kernel_fpu_* functions])
+		AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1,
+		    [kernel exports FPU functions])
 	],[
-		AC_MSG_RESULT(not exported)
+		ZFS_LINUX_TRY_COMPILE([
+			#include <linux/module.h>
+			#include <linux/sched.h>
+		],[
+			struct fpu *fpu = &current->thread.fpu;
+			if (fpu->initialized) { return (0); };
+		],[
+			AC_MSG_RESULT(fpu.initialized)
+			AC_DEFINE(HAVE_KERNEL_FPU_INITIALIZED, 1,
+			    [kernel fpu.initialized exists])
+		],[
+			ZFS_LINUX_TRY_COMPILE([
+				#include <linux/module.h>
+				#include <asm/thread_info.h>
+
+				#if !defined(TIF_NEED_FPU_LOAD)
+				#error "TIF_NEED_FPU_LOAD undefined"
+				#endif
+			],[
+			],[
+				AC_MSG_RESULT(TIF_NEED_FPU_LOAD)
+				AC_DEFINE(
+				    HAVE_KERNEL_TIF_NEED_FPU_LOAD, 1,
+				    [kernel TIF_NEED_FPU_LOAD exists])
+			],[
+				AC_MSG_RESULT(unavailable)
+			])
+		])
 	])
 ])
 ])
diff --git a/include/linux/Makefile.am b/include/linux/Makefile.am
index efb49520e..2455759e8 100644
--- a/include/linux/Makefile.am
+++ b/include/linux/Makefile.am
@@ -7,6 +7,7 @@ KERNEL_H = \
 	$(top_srcdir)/include/linux/blkdev_compat.h \
 	$(top_srcdir)/include/linux/utsname_compat.h \
 	$(top_srcdir)/include/linux/kmap_compat.h \
+	$(top_srcdir)/include/linux/simd.h \
 	$(top_srcdir)/include/linux/simd_x86.h \
 	$(top_srcdir)/include/linux/simd_aarch64.h \
 	$(top_srcdir)/include/linux/mod_compat.h \
diff --git a/include/linux/simd.h b/include/linux/simd.h
new file mode 100644
index 000000000..d2b60996a
--- /dev/null
+++ b/include/linux/simd.h
@@ -0,0 +1,41 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2019 Lawrence Livermore National Security, LLC.
+ */
+
+#ifndef _SIMD_H
+#define	_SIMD_H
+
+#if defined(__x86)
+#include <linux/simd_x86.h>
+
+#elif defined(__aarch64__)
+#include <linux/simd_aarch64.h>
+#else
+
+#define	kfpu_allowed()		1
+#define	kfpu_initialize(tsk)	do {} while (0)
+#define	kfpu_begin()		do {} while (0)
+#define	kfpu_end()		do {} while (0)
+
+#endif
+#endif /* _SIMD_H */
diff --git a/include/linux/simd_aarch64.h b/include/linux/simd_aarch64.h
index 56153a160..b45d31c48 100644
--- a/include/linux/simd_aarch64.h
+++ b/include/linux/simd_aarch64.h
@@ -43,20 +43,18 @@
 
 #if defined(_KERNEL)
 #include <asm/neon.h>
-#define	kfpu_begin()		\
-{				\
-	kernel_neon_begin();	\
-}
-#define	kfpu_end()		\
-{				\
-	kernel_neon_end();	\
-}
+#define	kfpu_allowed()		1
+#define	kfpu_initialize(tsk)	do {} while (0)
+#define	kfpu_begin()		kernel_neon_begin()
+#define	kfpu_end()		kernel_neon_end()
 #else
 /*
  * fpu dummy methods for userspace
 */
-#define	kfpu_begin()	do {} while (0)
-#define	kfpu_end()	do {} while (0)
+#define	kfpu_allowed()		1
+#define	kfpu_initialize(tsk)	do {} while (0)
+#define	kfpu_begin()		do {} while (0)
+#define	kfpu_end()		do {} while (0)
 #endif /* defined(_KERNEL) */
 
 #endif /* __aarch64__ */
diff --git a/include/linux/simd_x86.h b/include/linux/simd_x86.h
index 0489bfaa3..641f43955 100644
--- a/include/linux/simd_x86.h
+++ b/include/linux/simd_x86.h
@@ -92,33 +92,135 @@
 #include <asm/xcr.h>
 #endif
 
+/*
+ * The following cases are for kernels which export either the
+ * kernel_fpu_* or __kernel_fpu_* functions.
+ */
+#if defined(KERNEL_EXPORTS_X86_FPU)
+
+#define	kfpu_allowed()		1
+#define	kfpu_initialize(tsk)	do {} while (0)
+
 #if defined(HAVE_UNDERSCORE_KERNEL_FPU)
 #define	kfpu_begin()		\
-{	\
-	preempt_disable(); \
+{				\
+	preempt_disable();	\
 	__kernel_fpu_begin(); \
 }
-#define	kfpu_end() \
-{ \
-	__kernel_fpu_end(); \
-	preempt_enable(); \
+#define	kfpu_end()		\
+{				\
+	__kernel_fpu_end();	\
+	preempt_enable();	\
 }
+
 #elif defined(HAVE_KERNEL_FPU)
-#define	kfpu_begin()	kernel_fpu_begin()
+#define	kfpu_begin()		kernel_fpu_begin()
 #define	kfpu_end()		kernel_fpu_end()
+
 #else
-/* Kernel doesn't export any kernel_fpu_* functions */
-#include <asm/fpu/internal.h>	/* For kernel xgetbv() */
-#define	kfpu_begin()	panic("This code should never run")
-#define	kfpu_end()	panic("This code should never run")
-#endif /* defined(HAVE_KERNEL_FPU) */
+/*
+ * This case is unreachable. When KERNEL_EXPORTS_X86_FPU is defined then
+ * either HAVE_UNDERSCORE_KERNEL_FPU or HAVE_KERNEL_FPU must be defined.
+ */
+#error "Unreachable kernel configuration"
+#endif
+
+#else /* defined(KERNEL_EXPORTS_X86_FPU) */
+/*
+ * When the kernel_fpu_* symbols are unavailable then provide our own
+ * versions which allow the FPU to be safely used in kernel threads.
+ * In practice, this is not a significant restriction for ZFS since the
+ * vast majority of SIMD operations are performed by the IO pipeline.
+ */
 
+/*
+ * Returns non-zero if FPU operations are allowed in the current context.
+ */
+#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
+#define	kfpu_allowed()		((current->flags & PF_KTHREAD) && \
+				test_thread_flag(TIF_NEED_FPU_LOAD))
+#elif defined(HAVE_KERNEL_FPU_INITIALIZED)
+#define	kfpu_allowed()		((current->flags & PF_KTHREAD) && \
+				current->thread.fpu.initialized)
 #else
+#define	kfpu_allowed()		0
+#endif
+
+static inline void
+kfpu_initialize(void)
+{
+	WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
+
+#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
+	__fpu_invalidate_fpregs_state(&current->thread.fpu);
+	set_thread_flag(TIF_NEED_FPU_LOAD);
+#elif defined(HAVE_KERNEL_FPU_INITIALIZED)
+	__fpu_invalidate_fpregs_state(&current->thread.fpu);
+	current->thread.fpu.initialized = 1;
+#endif
+}
+
+static inline void
+kfpu_begin(void)
+{
+	WARN_ON_ONCE(!kfpu_allowed());
+
+	/*
+	 * Preemption and interrupts must be disabled for the critical
+	 * region where the FPU state is being modified.
+	 */
+	preempt_disable();
+	local_irq_disable();
+
+#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
+	/*
+	 * The current FPU registers need to be preserved by kfpu_begin()
+	 * and restored by kfpu_end(). This is required because we can
+	 * not call __cpu_invalidate_fpregs_state() to invalidate the
+	 * per-cpu FPU state and force them to be restored during a
+	 * context switch.
+	 */
+	copy_fpregs_to_fpstate(&current->thread.fpu);
+#elif defined(HAVE_KERNEL_FPU_INITIALIZED)
+	/*
+	 * There is no need to preserve and restore the FPU registers.
+	 * They will always be restored from the task's stored FPU state
+	 * when switching contexts.
+	 */
+	WARN_ON_ONCE(current->thread.fpu.initialized == 0);
+#endif
+}
+
+static inline void
+kfpu_end(void)
+{
+#if defined(HAVE_KERNEL_TIF_NEED_FPU_LOAD)
+	union fpregs_state *state = &current->thread.fpu.state;
+	int error;
+
+	if (use_xsave()) {
+		error = copy_kernel_to_xregs_err(&state->xsave, -1);
+	} else if (use_fxsr()) {
+		error = copy_kernel_to_fxregs_err(&state->fxsave);
+	} else {
+		error = copy_kernel_to_fregs_err(&state->fsave);
+	}
+	WARN_ON_ONCE(error);
+#endif
+
+	local_irq_enable();
+	preempt_enable();
+}
+#endif /* defined(HAVE_KERNEL_FPU) */
+
+#else /* defined(_KERNEL) */
 /*
- * fpu dummy methods for userspace
+ * FPU dummy methods for user space.
 */
-#define	kfpu_begin()	do {} while (0)
-#define	kfpu_end()	do {} while (0)
+#define	kfpu_allowed()		1
+#define	kfpu_initialize(tsk)	do {} while (0)
+#define	kfpu_begin()		do {} while (0)
+#define	kfpu_end()		do {} while (0)
 #endif /* defined(_KERNEL) */
 
 /*
@@ -300,7 +402,7 @@ __simd_state_enabled(const uint64_t state)
 	uint64_t xcr0;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_OSXSAVE) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_OSXSAVE)
 	has_osxsave = !!boot_cpu_has(X86_FEATURE_OSXSAVE);
 #else
 	has_osxsave = B_FALSE;
@@ -330,11 +432,7 @@ static inline boolean_t
 zfs_sse_available(void)
 {
 #if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
 	return (!!boot_cpu_has(X86_FEATURE_XMM));
-#else
-	return (B_FALSE);
-#endif
 #elif !defined(_KERNEL)
 	return (__cpuid_has_sse());
 #endif
@@ -347,11 +445,7 @@ static inline boolean_t
 zfs_sse2_available(void)
 {
 #if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
 	return (!!boot_cpu_has(X86_FEATURE_XMM2));
-#else
-	return (B_FALSE);
-#endif
 #elif !defined(_KERNEL)
 	return (__cpuid_has_sse2());
 #endif
@@ -364,11 +458,7 @@ static inline boolean_t
 zfs_sse3_available(void)
 {
 #if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
 	return (!!boot_cpu_has(X86_FEATURE_XMM3));
-#else
-	return (B_FALSE);
-#endif
 #elif !defined(_KERNEL)
 	return (__cpuid_has_sse3());
 #endif
@@ -381,11 +471,7 @@ static inline boolean_t
 zfs_ssse3_available(void)
 {
 #if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
 	return (!!boot_cpu_has(X86_FEATURE_SSSE3));
-#else
-	return (B_FALSE);
-#endif
 #elif !defined(_KERNEL)
 	return (__cpuid_has_ssse3());
 #endif
@@ -398,11 +484,7 @@ static inline boolean_t
 zfs_sse4_1_available(void)
 {
 #if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
 	return (!!boot_cpu_has(X86_FEATURE_XMM4_1));
-#else
-	return (B_FALSE);
-#endif
 #elif !defined(_KERNEL)
 	return (__cpuid_has_sse4_1());
 #endif
@@ -415,11 +497,7 @@ static inline boolean_t
 zfs_sse4_2_available(void)
 {
 #if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
 	return (!!boot_cpu_has(X86_FEATURE_XMM4_2));
-#else
-	return (B_FALSE);
-#endif
 #elif !defined(_KERNEL)
 	return (__cpuid_has_sse4_2());
 #endif
@@ -433,11 +511,7 @@ zfs_avx_available(void)
 {
 	boolean_t has_avx;
 #if defined(_KERNEL)
-#if defined(KERNEL_EXPORTS_X86_FPU)
 	has_avx = !!boot_cpu_has(X86_FEATURE_AVX);
-#else
-	has_avx = B_FALSE;
-#endif
 #elif !defined(_KERNEL)
 	has_avx = __cpuid_has_avx();
 #endif
@@ -453,11 +527,7 @@ zfs_avx2_available(void)
 {
 	boolean_t has_avx2;
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX2) && defined(KERNEL_EXPORTS_X86_FPU)
 	has_avx2 = !!boot_cpu_has(X86_FEATURE_AVX2);
-#else
-	has_avx2 = B_FALSE;
-#endif
 #elif !defined(_KERNEL)
 	has_avx2 = __cpuid_has_avx2();
 #endif
@@ -472,7 +542,7 @@ static inline boolean_t
 zfs_bmi1_available(void)
 {
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_BMI1) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_BMI1)
 	return (!!boot_cpu_has(X86_FEATURE_BMI1));
 #else
 	return (B_FALSE);
@@ -489,7 +559,7 @@ static inline boolean_t
 zfs_bmi2_available(void)
 {
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_BMI2) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_BMI2)
 	return (!!boot_cpu_has(X86_FEATURE_BMI2));
 #else
 	return (B_FALSE);
@@ -506,7 +576,7 @@ static inline boolean_t
 zfs_aes_available(void)
 {
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AES) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AES)
 	return (!!boot_cpu_has(X86_FEATURE_AES));
 #else
 	return (B_FALSE);
@@ -523,7 +593,7 @@ static inline boolean_t
 zfs_pclmulqdq_available(void)
 {
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_PCLMULQDQ) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_PCLMULQDQ)
 	return (!!boot_cpu_has(X86_FEATURE_PCLMULQDQ));
 #else
 	return (B_FALSE);
@@ -557,7 +627,7 @@ zfs_avx512f_available(void)
 	boolean_t has_avx512 = B_FALSE;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512F) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512F)
 	has_avx512 = !!boot_cpu_has(X86_FEATURE_AVX512F);
 #else
 	has_avx512 = B_FALSE;
@@ -576,7 +646,7 @@ zfs_avx512cd_available(void)
 	boolean_t has_avx512 = B_FALSE;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512CD) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512CD)
 	has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
 	    boot_cpu_has(X86_FEATURE_AVX512CD);
 #else
@@ -596,7 +666,7 @@ zfs_avx512er_available(void)
 	boolean_t has_avx512 = B_FALSE;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512ER) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512ER)
 	has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
 	    boot_cpu_has(X86_FEATURE_AVX512ER);
 #else
@@ -616,7 +686,7 @@ zfs_avx512pf_available(void)
 	boolean_t has_avx512 = B_FALSE;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512PF) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512PF)
 	has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
 	    boot_cpu_has(X86_FEATURE_AVX512PF);
 #else
@@ -636,7 +706,7 @@ zfs_avx512bw_available(void)
 	boolean_t has_avx512 = B_FALSE;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512BW) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512BW)
 	has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
 	    boot_cpu_has(X86_FEATURE_AVX512BW);
 #else
@@ -656,7 +726,7 @@ zfs_avx512dq_available(void)
 	boolean_t has_avx512 = B_FALSE;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512DQ) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512DQ)
 	has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
 	    boot_cpu_has(X86_FEATURE_AVX512DQ);
 #else
@@ -676,7 +746,7 @@ zfs_avx512vl_available(void)
 	boolean_t has_avx512 = B_FALSE;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512VL) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512VL)
 	has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
 	    boot_cpu_has(X86_FEATURE_AVX512VL);
 #else
@@ -696,7 +766,7 @@ zfs_avx512ifma_available(void)
 	boolean_t has_avx512 = B_FALSE;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512IFMA) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512IFMA)
 	has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
 	    boot_cpu_has(X86_FEATURE_AVX512IFMA);
 #else
@@ -716,7 +786,7 @@ zfs_avx512vbmi_available(void)
 	boolean_t has_avx512 = B_FALSE;
 
 #if defined(_KERNEL)
-#if defined(X86_FEATURE_AVX512VBMI) && defined(KERNEL_EXPORTS_X86_FPU)
+#if defined(X86_FEATURE_AVX512VBMI)
 	has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) &&
 	    boot_cpu_has(X86_FEATURE_AVX512VBMI);
 #else
diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h
index 2ce32469d..0ce2b5ea1 100644
--- a/include/sys/vdev_raidz.h
+++ b/include/sys/vdev_raidz.h
@@ -51,7 +51,7 @@ int vdev_raidz_reconstruct(struct raidz_map *, const int *, int);
  */
 void vdev_raidz_math_init(void);
 void vdev_raidz_math_fini(void);
-struct raidz_impl_ops *vdev_raidz_math_get_ops(void);
+const struct raidz_impl_ops *vdev_raidz_math_get_ops(void);
 int vdev_raidz_math_generate(struct raidz_map *);
 int vdev_raidz_math_reconstruct(struct raidz_map *, const int *, const int *,
     const int);
diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h
index 0799ed19d..4969d110b 100644
--- a/include/sys/vdev_raidz_impl.h
+++ b/include/sys/vdev_raidz_impl.h
@@ -126,7 +126,7 @@ typedef struct raidz_map {
 	uintptr_t rm_reports;		/* # of referencing checksum reports */
 	uint8_t	rm_freed;		/* map no longer has referencing ZIO */
 	uint8_t	rm_ecksuminjected;	/* checksum error was injected */
-	raidz_impl_ops_t *rm_ops;	/* RAIDZ math operations */
+	const raidz_impl_ops_t *rm_ops;	/* RAIDZ math operations */
 	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
 } raidz_map_t;
 
diff --git a/module/icp/algs/aes/aes_impl.c b/module/icp/algs/aes/aes_impl.c
index 36e0686a5..0f11f9999 100644
--- a/module/icp/algs/aes/aes_impl.c
+++ b/module/icp/algs/aes/aes_impl.c
@@ -27,6 +27,7 @@
 #include <sys/crypto/spi.h>
 #include <modes/modes.h>
 #include <aes/aes_impl.h>
+#include <linux/simd.h>
 
 /*
  * Initialize AES encryption and decryption key schedules.
@@ -40,9 +41,9 @@
 void
 aes_init_keysched(const uint8_t *cipherKey, uint_t keyBits, void *keysched)
 {
-	aes_impl_ops_t *ops = aes_impl_get_ops();
-	aes_key_t *newbie = keysched;
-	uint_t keysize, i, j;
+	const aes_impl_ops_t *ops = aes_impl_get_ops();
+	aes_key_t *newbie = keysched;
+	uint_t keysize, i, j;
 	union {
 		uint64_t ka64[4];
 		uint32_t ka32[8];
@@ -252,12 +253,17 @@ static size_t aes_supp_impl_cnt = 0;
 static aes_impl_ops_t *aes_supp_impl[ARRAY_SIZE(aes_all_impl)];
 
 /*
- * Selects the aes operations for encrypt/decrypt/key setup
+ * Returns the AES operations for encrypt/decrypt/key setup. When a
+ * SIMD implementation is not allowed in the current context, then
+ * fall back to the fastest generic implementation.
 */
-aes_impl_ops_t *
-aes_impl_get_ops()
+const aes_impl_ops_t *
+aes_impl_get_ops(void)
 {
-	aes_impl_ops_t *ops = NULL;
+	if (!kfpu_allowed())
+		return (&aes_generic_impl);
+
+	const aes_impl_ops_t *ops = NULL;
 	const uint32_t impl = AES_IMPL_READ(icp_aes_impl);
 
 	switch (impl) {
@@ -266,15 +272,13 @@ aes_impl_get_ops()
 		ops = &aes_fastest_impl;
 		break;
 	case IMPL_CYCLE:
-	{
+		/* Cycle through supported implementations */
 		ASSERT(aes_impl_initialized);
 		ASSERT3U(aes_supp_impl_cnt, >, 0);
-		/* Cycle through supported implementations */
 		static size_t cycle_impl_idx = 0;
 		size_t idx = (++cycle_impl_idx) % aes_supp_impl_cnt;
 		ops = aes_supp_impl[idx];
-	}
-	break;
+		break;
 	default:
 		ASSERT3U(impl, <, aes_supp_impl_cnt);
 		ASSERT3U(aes_supp_impl_cnt, >, 0);
@@ -288,13 +292,17 @@ aes_impl_get_ops()
 	return (ops);
 }
 
+/*
+ * Initialize all supported implementations.
+ */
+/* ARGSUSED */
 void
-aes_impl_init(void)
+aes_impl_init(void *arg)
 {
 	aes_impl_ops_t *curr_impl;
 	int i, c;
 
-	/* move supported impl into aes_supp_impls */
+	/* Move supported implementations into aes_supp_impls */
 	for (i = 0, c = 0; i < ARRAY_SIZE(aes_all_impl); i++) {
 		curr_impl = (aes_impl_ops_t *)aes_all_impl[i];
 
diff --git a/module/icp/algs/aes/aes_impl_aesni.c b/module/icp/algs/aes/aes_impl_aesni.c
index 97f7c3a47..222c176aa 100644
--- a/module/icp/algs/aes/aes_impl_aesni.c
+++ b/module/icp/algs/aes/aes_impl_aesni.c
@@ -108,7 +108,7 @@ aes_aesni_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4],
 static boolean_t
 aes_aesni_will_work(void)
 {
-	return (zfs_aes_available());
+	return (kfpu_allowed() && zfs_aes_available());
 }
 
 const aes_impl_ops_t aes_aesni_impl = {
diff --git a/module/icp/algs/modes/gcm.c b/module/icp/algs/modes/gcm.c
index 0afd957f0..423b70e2c 100644
--- a/module/icp/algs/modes/gcm.c
+++ b/module/icp/algs/modes/gcm.c
@@ -29,6 +29,7 @@
 #include <sys/crypto/impl.h>
 #include <sys/byteorder.h>
 #include <modes/gcm_impl.h>
+#include <linux/simd.h>
 
 #define	GHASH(c, d, t, o) \
 	xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \
@@ -46,7 +47,7 @@ gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length,
     void (*copy_block)(uint8_t *, uint8_t *),
     void (*xor_block)(uint8_t *, uint8_t *))
 {
-	gcm_impl_ops_t *gops;
+	const gcm_impl_ops_t *gops;
 	size_t remainder = length;
 	size_t need = 0;
 	uint8_t *datap = (uint8_t *)data;
@@ -168,7 +169,7 @@ gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
     void (*copy_block)(uint8_t *, uint8_t *),
     void (*xor_block)(uint8_t *, uint8_t *))
 {
-	gcm_impl_ops_t *gops;
+	const gcm_impl_ops_t *gops;
 	uint64_t counter_mask = ntohll(0x00000000ffffffffULL);
 	uint8_t *ghash, *macp = NULL;
 	int i, rv;
@@ -320,7 +321,7 @@ gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size,
     int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
     void (*xor_block)(uint8_t *, uint8_t *))
 {
-	gcm_impl_ops_t *gops;
+	const gcm_impl_ops_t *gops;
 	size_t pt_len;
 	size_t remainder;
 	uint8_t *ghash;
@@ -427,7 +428,7 @@ gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len,
     void (*copy_block)(uint8_t *, uint8_t *),
     void (*xor_block)(uint8_t *, uint8_t *))
 {
-	gcm_impl_ops_t *gops;
+	const gcm_impl_ops_t *gops;
 	uint8_t *cb;
 	ulong_t remainder = iv_len;
 	ulong_t processed = 0;
@@ -481,7 +482,7 @@ gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len,
     void (*copy_block)(uint8_t *, uint8_t *),
     void (*xor_block)(uint8_t *, uint8_t *))
 {
-	gcm_impl_ops_t *gops;
+	const gcm_impl_ops_t *gops;
 	uint8_t *ghash, *datap, *authp;
 	size_t remainder, processed;
 
@@ -660,12 +661,17 @@ static size_t gcm_supp_impl_cnt = 0;
 static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)];
 
 /*
- * Selects the gcm operation
+ * Returns the GCM operations for encrypt/decrypt/key setup. When a
+ * SIMD implementation is not allowed in the current context, then
+ * fall back to the fastest generic implementation.
 */
-gcm_impl_ops_t *
+const gcm_impl_ops_t *
 gcm_impl_get_ops()
 {
-	gcm_impl_ops_t *ops = NULL;
+	if (!kfpu_allowed())
+		return (&gcm_generic_impl);
+
+	const gcm_impl_ops_t *ops = NULL;
 	const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl);
 
 	switch (impl) {
@@ -674,15 +680,13 @@ gcm_impl_get_ops()
 		ops = &gcm_fastest_impl;
 		break;
 	case IMPL_CYCLE:
-	{
+		/* Cycle through supported implementations */
 		ASSERT(gcm_impl_initialized);
 		ASSERT3U(gcm_supp_impl_cnt, >, 0);
-		/* Cycle through supported implementations */
 		static size_t cycle_impl_idx = 0;
 		size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt;
 		ops = gcm_supp_impl[idx];
-	}
-	break;
+		break;
 	default:
 		ASSERT3U(impl, <, gcm_supp_impl_cnt);
 		ASSERT3U(gcm_supp_impl_cnt, >, 0);
@@ -696,13 +700,17 @@ gcm_impl_get_ops()
 	return (ops);
 }
 
+/*
+ * Initialize all supported implementations.
+ */
+/* ARGSUSED */
 void
-gcm_impl_init(void)
+gcm_impl_init(void *arg)
 {
 	gcm_impl_ops_t *curr_impl;
 	int i, c;
 
-	/* move supported impl into aes_supp_impls */
+	/* Move supported implementations into gcm_supp_impls */
 	for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) {
 		curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i];
 
@@ -711,7 +719,10 @@ gcm_impl_init(void)
 	}
 	gcm_supp_impl_cnt = c;
 
-	/* set fastest implementation. assume hardware accelerated is fastest */
+	/*
+	 * Set the fastest implementation given the assumption that the
+	 * hardware accelerated version is the fastest.
+	 */
 #if defined(__x86_64) && defined(HAVE_PCLMULQDQ)
 	if (gcm_pclmulqdq_impl.is_supported()) {
 		memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl,
diff --git a/module/icp/algs/modes/gcm_pclmulqdq.c b/module/icp/algs/modes/gcm_pclmulqdq.c
index be00ba37b..8a43ba33a 100644
--- a/module/icp/algs/modes/gcm_pclmulqdq.c
+++ b/module/icp/algs/modes/gcm_pclmulqdq.c
@@ -52,7 +52,7 @@ gcm_pclmulqdq_mul(uint64_t *x_in, uint64_t *y, uint64_t *res)
 static boolean_t
 gcm_pclmulqdq_will_work(void)
 {
-	return (zfs_pclmulqdq_available());
+	return (kfpu_allowed() && zfs_pclmulqdq_available());
 }
 
 const gcm_impl_ops_t gcm_pclmulqdq_impl = {
diff --git a/module/icp/include/aes/aes_impl.h b/module/icp/include/aes/aes_impl.h
index 3a3de91cf..329e32a8e 100644
--- a/module/icp/include/aes/aes_impl.h
+++ b/module/icp/include/aes/aes_impl.h
@@ -198,12 +198,12 @@ extern const aes_impl_ops_t aes_aesni_impl;
 /*
  * Initializes fastest implementation
 */
-void aes_impl_init(void);
+void aes_impl_init(void *arg);
 
 /*
- * Get selected aes implementation
+ * Returns optimal allowed AES implementation
 */
-struct aes_impl_ops *aes_impl_get_ops(void);
+const struct aes_impl_ops *aes_impl_get_ops(void);
 
 #ifdef __cplusplus
 }
diff --git a/module/icp/include/modes/gcm_impl.h b/module/icp/include/modes/gcm_impl.h
index b78cc8aab..dff372ef8 100644
--- a/module/icp/include/modes/gcm_impl.h
+++ b/module/icp/include/modes/gcm_impl.h
@@ -61,12 +61,12 @@ extern const gcm_impl_ops_t gcm_pclmulqdq_impl;
 /*
  * Initializes fastest implementation
 */
-void gcm_impl_init(void);
+void gcm_impl_init(void *arg);
 
 /*
- * Get selected aes implementation
+ * Returns optimal allowed GCM implementation
 */
-struct gcm_impl_ops *gcm_impl_get_ops(void);
+const struct gcm_impl_ops *gcm_impl_get_ops(void);
 
 #ifdef __cplusplus
 }
diff --git a/module/icp/io/aes.c b/module/icp/io/aes.c
index 53b193693..51538bc60 100644
--- a/module/icp/io/aes.c
+++ b/module/icp/io/aes.c
@@ -206,9 +206,35 @@ aes_mod_init(void)
 {
 	int ret;
 
-	/* find fastest implementations and set any requested implementations */
-	aes_impl_init();
-	gcm_impl_init();
+#if defined(_KERNEL)
+	/*
+	 * Determine the fastest available implementation. The benchmarks
+	 * are run in dedicated kernel threads to allow Linux 5.0+ kernels
+	 * to use SIMD operations. If for some reason this isn't possible,
+	 * fall back to the generic implementations. See the comment in
+	 * include/linux/simd_x86.h for additional details. Additionally,
+	 * this has the benefit of allowing them to be run in parallel.
+	 */
+	taskqid_t aes_id = taskq_dispatch(system_taskq, aes_impl_init,
+	    NULL, TQ_SLEEP);
+	taskqid_t gcm_id = taskq_dispatch(system_taskq, gcm_impl_init,
+	    NULL, TQ_SLEEP);
+
+	if (aes_id != TASKQID_INVALID) {
+		taskq_wait_id(system_taskq, aes_id);
+	} else {
+		aes_impl_init(NULL);
+	}
+
+	if (gcm_id != TASKQID_INVALID) {
+		taskq_wait_id(system_taskq, gcm_id);
+	} else {
+		gcm_impl_init(NULL);
+	}
+#else
+	aes_impl_init(NULL);
+	gcm_impl_init(NULL);
+#endif
 
 	if ((ret = mod_install(&modlinkage)) != 0)
 		return (ret);
diff --git a/module/spl/spl-taskq.c b/module/spl/spl-taskq.c
index a39f94e4c..69d591ff7 100644
--- a/module/spl/spl-taskq.c
+++ b/module/spl/spl-taskq.c
@@ -28,6 +28,7 @@
 #include <sys/taskq.h>
 #include <sys/kmem.h>
 #include <sys/tsd.h>
+#include <linux/simd.h>
 
 int spl_taskq_thread_bind = 0;
 module_param(spl_taskq_thread_bind, int, 0644);
@@ -853,6 +854,7 @@ taskq_thread(void *args)
 	sigfillset(&blocked);
 	sigprocmask(SIG_BLOCK, &blocked, NULL);
 	flush_signals(current);
+	kfpu_initialize();
 
 	tsd_set(taskq_tsd, tq);
 	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
diff --git a/module/spl/spl-thread.c b/module/spl/spl-thread.c
index 0352a31ea..07e3a1bff 100644
--- a/module/spl/spl-thread.c
+++ b/module/spl/spl-thread.c
@@ -27,6 +27,7 @@
 #include <sys/thread.h>
 #include <sys/kmem.h>
 #include <sys/tsd.h>
+#include <linux/simd.h>
 
 /*
  * Thread interfaces
@@ -54,6 +55,7 @@ thread_generic_wrapper(void *arg)
 	args = tp->tp_args;
 	set_current_state(tp->tp_state);
 	set_user_nice((kthread_t *)current, PRIO_TO_NICE(tp->tp_pri));
+	kfpu_initialize();
 	kmem_free(tp->tp_name, tp->tp_name_size);
 	kmem_free(tp, sizeof (thread_priv_t));
 
diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c
index f712ce40c..9187a7c1e 100644
--- a/module/zcommon/zfs_fletcher.c
+++ b/module/zcommon/zfs_fletcher.c
@@ -140,6 +140,7 @@
 #include <sys/zio_checksum.h>
 #include <sys/zfs_context.h>
 #include <zfs_fletcher.h>
+#include <linux/simd.h>
 
 #define	FLETCHER_MIN_SIMD_SIZE	64
 
@@ -205,21 +206,19 @@ static struct fletcher_4_impl_selector {
 	const char *fis_name;
 	uint32_t fis_sel;
 } fletcher_4_impl_selectors[] = {
-#if !defined(_KERNEL)
 	{ "cycle",	IMPL_CYCLE },
-#endif
 	{ "fastest",	IMPL_FASTEST },
 	{ "scalar",	IMPL_SCALAR }
 };
 
 #if defined(_KERNEL)
 static kstat_t *fletcher_4_kstat;
-#endif
 
 static struct fletcher_4_kstat {
 	uint64_t native;
 	uint64_t byteswap;
 } fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1];
+#endif
 
 /* Indicate that benchmark has been completed */
 static boolean_t fletcher_4_initialized = B_FALSE;
@@ -408,32 +407,36 @@ fletcher_4_impl_set(const char *val)
 	return (err);
 }
 
+/*
+ * Returns the Fletcher 4 operations for checksums. When a SIMD
+ * implementation is not allowed in the current context, then fall back
+ * to the fastest generic implementation.
+ */
 static inline const fletcher_4_ops_t *
 fletcher_4_impl_get(void)
 {
-	fletcher_4_ops_t *ops = NULL;
-	const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
+	if (!kfpu_allowed())
+		return (&fletcher_4_superscalar4_ops);
+
+	const fletcher_4_ops_t *ops = NULL;
+	uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
 
 	switch (impl) {
 	case IMPL_FASTEST:
 		ASSERT(fletcher_4_initialized);
 		ops = &fletcher_4_fastest_impl;
 		break;
-#if !defined(_KERNEL)
-	case IMPL_CYCLE: {
+	case IMPL_CYCLE:
+		/* Cycle through supported implementations */
 		ASSERT(fletcher_4_initialized);
 		ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
-
 		static uint32_t cycle_count = 0;
 		uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt;
 		ops = fletcher_4_supp_impls[idx];
-	}
-	break;
-#endif
+		break;
 	default:
 		ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
 		ASSERT3U(impl, <, fletcher_4_supp_impls_cnt);
-
 		ops = fletcher_4_supp_impls[impl];
 		break;
 	}
@@ -659,6 +662,7 @@ fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
 typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *,
     zio_cksum_t *);
 
+#if defined(_KERNEL)
 static void
 fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
 {
@@ -716,16 +720,18 @@ fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
 	/* restore original selection */
 	atomic_swap_32(&fletcher_4_impl_chosen, sel_save);
 }
+#endif /* _KERNEL */
 
-void
-fletcher_4_init(void)
+/*
+ * Initialize and benchmark all supported implementations.
+ */
+static void
+fletcher_4_benchmark(void *arg)
 {
-	static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
 	fletcher_4_ops_t *curr_impl;
-	char *databuf;
 	int i, c;
 
-	/* move supported impl into fletcher_4_supp_impls */
+	/* Move supported implementations into fletcher_4_supp_impls */
 	for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) {
 		curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i];
 
@@ -735,19 +741,10 @@ fletcher_4_init(void)
 	membar_producer();	/* complete fletcher_4_supp_impls[] init */
 	fletcher_4_supp_impls_cnt = c;	/* number of supported impl */
 
-#if !defined(_KERNEL)
-	/* Skip benchmarking and use last implementation as fastest */
-	memcpy(&fletcher_4_fastest_impl,
-	    fletcher_4_supp_impls[fletcher_4_supp_impls_cnt-1],
-	    sizeof (fletcher_4_fastest_impl));
-	fletcher_4_fastest_impl.name = "fastest";
-	membar_producer();
+#if defined(_KERNEL)
+	static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
+	char *databuf = vmem_alloc(data_size, KM_SLEEP);
 
-	fletcher_4_initialized = B_TRUE;
-	return;
-#endif
-	/* Benchmark all supported implementations */
-	databuf = vmem_alloc(data_size, KM_SLEEP);
 	for (i = 0; i < data_size / sizeof (uint64_t); i++)
 		((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */
 
@@ -755,9 +752,38 @@ fletcher_4_init(void)
 	fletcher_4_benchmark_impl(B_TRUE, databuf, data_size);
 
 	vmem_free(databuf, data_size);
+#else
+	/*
+	 * Skip the benchmark in user space to avoid impacting libzpool
+	 * consumers (zdb, zhack, zinject, ztest). The last implementation
+	 * is assumed to be the fastest and used by default.
+	 */
+	memcpy(&fletcher_4_fastest_impl,
+	    fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1],
+	    sizeof (fletcher_4_fastest_impl));
+	fletcher_4_fastest_impl.name = "fastest";
+	membar_producer();
+#endif /* _KERNEL */
+}
 
+void
+fletcher_4_init(void)
+{
 #if defined(_KERNEL)
-	/* install kstats for all implementations */
+	/*
+	 * For 5.0 and later Linux kernels the fletcher 4 benchmarks are
+	 * run in a kernel thread. This is needed to take advantage of the
+	 * SIMD functionality, see include/linux/simd_x86.h for details.
+	 */
+	taskqid_t id = taskq_dispatch(system_taskq, fletcher_4_benchmark,
+	    NULL, TQ_SLEEP);
+	if (id != TASKQID_INVALID) {
+		taskq_wait_id(system_taskq, id);
+	} else {
+		fletcher_4_benchmark(NULL);
+	}
+
+	/* Install kstats for all implementations */
 	fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",
 	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
 	if (fletcher_4_kstat != NULL) {
@@ -769,6 +795,8 @@ fletcher_4_init(void)
 		    fletcher_4_kstat_addr);
 		kstat_install(fletcher_4_kstat);
 	}
+#else
+	fletcher_4_benchmark(NULL);
 #endif
 
 	/* Finish initialization */
diff --git a/module/zcommon/zfs_fletcher_aarch64_neon.c b/module/zcommon/zfs_fletcher_aarch64_neon.c
index bd2db2b20..3b3c1b52b 100644
--- a/module/zcommon/zfs_fletcher_aarch64_neon.c
+++ b/module/zcommon/zfs_fletcher_aarch64_neon.c
@@ -198,7 +198,7 @@ unsigned char SRC __attribute__((vector_size(16)));
 
 static boolean_t fletcher_4_aarch64_neon_valid(void)
 {
-	return (B_TRUE);
+	return (kfpu_allowed());
 }
 
 const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = {
diff --git a/module/zcommon/zfs_fletcher_avx512.c b/module/zcommon/zfs_fletcher_avx512.c
index 7260a9864..0d4cff21a 100644
--- a/module/zcommon/zfs_fletcher_avx512.c
+++ b/module/zcommon/zfs_fletcher_avx512.c
@@ -157,7 +157,7 @@ STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_byteswap);
 static boolean_t
 fletcher_4_avx512f_valid(void)
 {
-	return (zfs_avx512f_available());
+	return (kfpu_allowed() && zfs_avx512f_available());
 }
 
 const fletcher_4_ops_t fletcher_4_avx512f_ops = {
diff --git a/module/zcommon/zfs_fletcher_intel.c b/module/zcommon/zfs_fletcher_intel.c
index 6dac047da..7f12efe6d 100644
--- a/module/zcommon/zfs_fletcher_intel.c
+++ b/module/zcommon/zfs_fletcher_intel.c
@@ -156,7 +156,7 @@ fletcher_4_avx2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
 
 static boolean_t fletcher_4_avx2_valid(void)
 {
-	return (zfs_avx_available() && zfs_avx2_available());
+	return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available());
 }
 
 const fletcher_4_ops_t fletcher_4_avx2_ops = {
diff --git a/module/zcommon/zfs_fletcher_sse.c b/module/zcommon/zfs_fletcher_sse.c
index a0b42e5f5..e6389d6e5 100644
--- a/module/zcommon/zfs_fletcher_sse.c
+++ b/module/zcommon/zfs_fletcher_sse.c
@@ -157,7 +157,7 @@ fletcher_4_sse2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
 
 static boolean_t fletcher_4_sse2_valid(void)
 {
-	return (zfs_sse2_available());
+	return (kfpu_allowed() && zfs_sse2_available());
 }
 
 const fletcher_4_ops_t fletcher_4_sse2_ops = {
@@ -214,7 +214,8 @@ fletcher_4_ssse3_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
 
 static boolean_t fletcher_4_ssse3_valid(void)
 {
-	return (zfs_sse2_available() && zfs_ssse3_available());
+	return (kfpu_allowed() && zfs_sse2_available() &&
+	    zfs_ssse3_available());
 }
 
 const fletcher_4_ops_t fletcher_4_ssse3_ops = {
1289 | diff --git a/module/zfs/vdev_raidz_math.c b/module/zfs/vdev_raidz_math.c | |
08743f90 | 1290 | index 3ef67768f..ef514e9e1 100644 |
f43dbfa7 FG |
1291 | --- a/module/zfs/vdev_raidz_math.c |
1292 | +++ b/module/zfs/vdev_raidz_math.c | |
1293 | @@ -27,9 +27,9 @@ | |
1294 | #include <sys/zio.h> | |
1295 | #include <sys/debug.h> | |
1296 | #include <sys/zfs_debug.h> | |
1297 | - | |
1298 | #include <sys/vdev_raidz.h> | |
1299 | #include <sys/vdev_raidz_impl.h> | |
1300 | +#include <linux/simd.h> | |
1301 | ||
1302 | extern boolean_t raidz_will_scalar_work(void); | |
1303 | ||
1304 | @@ -87,6 +87,7 @@ static uint32_t user_sel_impl = IMPL_FASTEST; | |
1305 | static size_t raidz_supp_impl_cnt = 0; | |
1306 | static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)]; | |
1307 | ||
1308 | +#if defined(_KERNEL) | |
1309 | /* | |
1310 | * kstats values for supported implementations | |
1311 | * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s] | |
1312 | @@ -95,14 +96,19 @@ static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1]; | |
1313 | ||
1314 | /* kstat for benchmarked implementations */ | |
1315 | static kstat_t *raidz_math_kstat = NULL; | |
1316 | +#endif | |
1317 | ||
1318 | /* | |
1319 | - * Selects the raidz operation for raidz_map | |
1320 | - * If rm_ops is set to NULL original raidz implementation will be used | |
1321 | + * Returns the RAIDZ operations for raidz_map() parity calculations. When | |
1322 | + * a SIMD implementation is not allowed in the current context, then fallback | |
1323 | + * to the fastest generic implementation. | |
1324 | */ | |
1325 | -raidz_impl_ops_t * | |
1326 | -vdev_raidz_math_get_ops() | |
1327 | +const raidz_impl_ops_t * | |
1328 | +vdev_raidz_math_get_ops(void) | |
1329 | { | |
1330 | + if (!kfpu_allowed()) | |
1331 | + return (&vdev_raidz_scalar_impl); | |
1332 | + | |
1333 | raidz_impl_ops_t *ops = NULL; | |
1334 | const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl); | |
1335 | ||
1336 | @@ -111,18 +117,14 @@ vdev_raidz_math_get_ops() | |
1337 | ASSERT(raidz_math_initialized); | |
1338 | ops = &vdev_raidz_fastest_impl; | |
1339 | break; | |
1340 | -#if !defined(_KERNEL) | |
1341 | case IMPL_CYCLE: | |
1342 | - { | |
1343 | + /* Cycle through all supported implementations */ | |
1344 | ASSERT(raidz_math_initialized); | |
1345 | ASSERT3U(raidz_supp_impl_cnt, >, 0); | |
1346 | - /* Cycle through all supported implementations */ | |
1347 | static size_t cycle_impl_idx = 0; | |
1348 | size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt; | |
1349 | ops = raidz_supp_impl[idx]; | |
1350 | - } | |
1351 | - break; | |
1352 | -#endif | |
1353 | + break; | |
1354 | case IMPL_ORIGINAL: | |
1355 | ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl; | |
1356 | break; | |
@@ -273,6 +275,8 @@ const char *raidz_rec_name[] = {
 	"rec_pq", "rec_pr", "rec_qr", "rec_pqr"
 };
 
+#if defined(_KERNEL)
+
 #define	RAIDZ_KSTAT_LINE_LEN	(17 + 10*12 + 1)
 
 static int
@@ -435,21 +439,21 @@ benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn)
 		}
 	}
 }
+#endif
 
-void
-vdev_raidz_math_init(void)
+/*
+ * Initialize and benchmark all supported implementations.
+ */
+static void
+benchmark_raidz(void *arg)
 {
 	raidz_impl_ops_t *curr_impl;
-	zio_t *bench_zio = NULL;
-	raidz_map_t *bench_rm = NULL;
-	uint64_t bench_parity;
-	int i, c, fn;
+	int i, c;
 
-	/* move supported impl into raidz_supp_impl */
+	/* Move supported impl into raidz_supp_impl */
 	for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
 		curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i];
 
-		/* initialize impl */
 		if (curr_impl->init)
 			curr_impl->init();
 
@@ -459,18 +463,10 @@ vdev_raidz_math_init(void)
 	membar_producer();		/* complete raidz_supp_impl[] init */
 	raidz_supp_impl_cnt = c;	/* number of supported impl */
 
-#if !defined(_KERNEL)
-	/* Skip benchmarking and use last implementation as fastest */
-	memcpy(&vdev_raidz_fastest_impl, raidz_supp_impl[raidz_supp_impl_cnt-1],
-	    sizeof (vdev_raidz_fastest_impl));
-	strcpy(vdev_raidz_fastest_impl.name, "fastest");
-
-	raidz_math_initialized = B_TRUE;
-
-	/* Use 'cycle' math selection method for userspace */
-	VERIFY0(vdev_raidz_impl_set("cycle"));
-	return;
-#endif
+#if defined(_KERNEL)
+	zio_t *bench_zio = NULL;
+	raidz_map_t *bench_rm = NULL;
+	uint64_t bench_parity;
 
 	/* Fake a zio and run the benchmark on a warmed up buffer */
 	bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP);
@@ -480,7 +476,7 @@ vdev_raidz_math_init(void)
 	memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE);
 
 	/* Benchmark parity generation methods */
-	for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
+	for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) {
 		bench_parity = fn + 1;
 		/* New raidz_map is needed for each generate_p/q/r */
 		bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
@@ -495,7 +491,7 @@ vdev_raidz_math_init(void)
 	bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT,
 	    BENCH_COLS, PARITY_PQR);
 
-	for (fn = 0; fn < RAIDZ_REC_NUM; fn++)
+	for (int fn = 0; fn < RAIDZ_REC_NUM; fn++)
 		benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl);
 
 	vdev_raidz_map_free(bench_rm);
@@ -503,11 +499,39 @@ vdev_raidz_math_init(void)
 	/* cleanup the bench zio */
 	abd_free(bench_zio->io_abd);
 	kmem_free(bench_zio, sizeof (zio_t));
+#else
+	/*
+	 * Skip the benchmark in user space to avoid impacting libzpool
+	 * consumers (zdb, zhack, zinject, ztest).  The last implementation
+	 * is assumed to be the fastest and used by default.
+	 */
+	memcpy(&vdev_raidz_fastest_impl,
+	    raidz_supp_impl[raidz_supp_impl_cnt - 1],
+	    sizeof (vdev_raidz_fastest_impl));
+	strcpy(vdev_raidz_fastest_impl.name, "fastest");
+#endif /* _KERNEL */
+}
 
-	/* install kstats for all impl */
+void
+vdev_raidz_math_init(void)
+{
+#if defined(_KERNEL)
+	/*
+	 * For 5.0 and later Linux kernels the raidz benchmarks are run
+	 * in a kernel thread.  This is needed to take advantage of the
+	 * SIMD functionality, see include/linux/simd_x86.h for details.
+	 */
+	taskqid_t id = taskq_dispatch(system_taskq, benchmark_raidz,
+	    NULL, TQ_SLEEP);
+	if (id != TASKQID_INVALID) {
+		taskq_wait_id(system_taskq, id);
+	} else {
+		benchmark_raidz(NULL);
+	}
+
+	/* Install kstats for all implementations */
 	raidz_math_kstat = kstat_create("zfs", 0, "vdev_raidz_bench", "misc",
 	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
-
 	if (raidz_math_kstat != NULL) {
 		raidz_math_kstat->ks_data = NULL;
 		raidz_math_kstat->ks_ndata = UINT32_MAX;
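The dispatch-and-wait dance above deserves a note: the benchmark must run in a taskq worker (a kernel thread) so that kfpu_allowed() holds, yet module initialization cannot proceed until it finishes, and a failed dispatch still has to fall back to the caller's context. The same pattern in isolation, with a hypothetical callback name standing in for benchmark_raidz:

static void
my_benchmark(void *arg)		/* hypothetical callback */
{
	/* Runs in a taskq worker thread, where kfpu_allowed() is true */
}

static void
run_in_kernel_thread(void)
{
	taskqid_t id = taskq_dispatch(system_taskq, my_benchmark,
	    NULL, TQ_SLEEP);
	if (id != TASKQID_INVALID)
		taskq_wait_id(system_taskq, id);	/* block until done */
	else
		my_benchmark(NULL);	/* dispatch failed; run here instead */
}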
@@ -517,6 +541,9 @@ vdev_raidz_math_init(void)
 		    raidz_math_kstat_addr);
 		kstat_install(raidz_math_kstat);
 	}
+#else
+	benchmark_raidz(NULL);
+#endif
 
 	/* Finish initialization */
 	atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl);
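For readers unfamiliar with the kstat calls in these hunks: the benchmark results are exposed through a virtual raw kstat, i.e. one with no preallocated ks_data buffer, whose output is formatted on demand by callbacks. A sketch of that wiring, assuming the SPL's kstat_set_raw_ops() interface and with hypothetical callback names standing in for raidz_math_kstat_headers/_data/_addr:

kstat_t *ksp = kstat_create("zfs", 0, "vdev_raidz_bench", "misc",
    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
if (ksp != NULL) {
	ksp->ks_data = NULL;		/* virtual: no backing buffer */
	ksp->ks_ndata = UINT32_MAX;
	kstat_set_raw_ops(ksp, my_headers, my_data, my_addr);
	kstat_install(ksp);
}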
@@ -527,15 +554,15 @@ void
 vdev_raidz_math_fini(void)
 {
 	raidz_impl_ops_t const *curr_impl;
-	int i;
 
+#if defined(_KERNEL)
 	if (raidz_math_kstat != NULL) {
 		kstat_delete(raidz_math_kstat);
 		raidz_math_kstat = NULL;
 	}
+#endif
 
-	/* fini impl */
-	for (i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
+	for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) {
 		curr_impl = raidz_all_maths[i];
 		if (curr_impl->fini)
 			curr_impl->fini();
@@ -546,9 +573,7 @@ static const struct {
 	char *name;
 	uint32_t sel;
 } math_impl_opts[] = {
-#if !defined(_KERNEL)
 	{ "cycle", IMPL_CYCLE },
-#endif
 	{ "fastest", IMPL_FASTEST },
 	{ "original", IMPL_ORIGINAL },
 	{ "scalar", IMPL_SCALAR }
diff --git a/module/zfs/vdev_raidz_math_aarch64_neon.c b/module/zfs/vdev_raidz_math_aarch64_neon.c
index e3ad06776..0a67ceb84 100644
--- a/module/zfs/vdev_raidz_math_aarch64_neon.c
+++ b/module/zfs/vdev_raidz_math_aarch64_neon.c
@@ -207,7 +207,7 @@ DEFINE_REC_METHODS(aarch64_neon);
 static boolean_t
 raidz_will_aarch64_neon_work(void)
 {
-	return (B_TRUE); // __arch64__ requires NEON
+	return (kfpu_allowed());
 }
 
 const raidz_impl_ops_t vdev_raidz_aarch64_neon_impl = {
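The remaining hunks in this patch all apply the same one-line change: each implementation's *_will_work() predicate gains a kfpu_allowed() conjunct ahead of its CPU-feature probes, so context eligibility and hardware support are decided in one place. In miniature — a hypothetical predicate, reusing feature probes visible in this diff:

static boolean_t
raidz_will_example_work(void)	/* hypothetical predicate */
{
	/* Context check first: SIMD only where FPU use is safe... */
	return (kfpu_allowed() &&
	    zfs_sse_available() && zfs_sse2_available());	/* ...then HW */
}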
diff --git a/module/zfs/vdev_raidz_math_aarch64_neonx2.c b/module/zfs/vdev_raidz_math_aarch64_neonx2.c
index f8688a06a..e072f51cd 100644
--- a/module/zfs/vdev_raidz_math_aarch64_neonx2.c
+++ b/module/zfs/vdev_raidz_math_aarch64_neonx2.c
@@ -217,7 +217,7 @@ DEFINE_REC_METHODS(aarch64_neonx2);
 static boolean_t
 raidz_will_aarch64_neonx2_work(void)
 {
-	return (B_TRUE); // __arch64__ requires NEON
+	return (kfpu_allowed());
 }
 
 const raidz_impl_ops_t vdev_raidz_aarch64_neonx2_impl = {
diff --git a/module/zfs/vdev_raidz_math_avx2.c b/module/zfs/vdev_raidz_math_avx2.c
index 063d29bcd..a12eb6720 100644
--- a/module/zfs/vdev_raidz_math_avx2.c
+++ b/module/zfs/vdev_raidz_math_avx2.c
@@ -396,7 +396,7 @@ DEFINE_REC_METHODS(avx2);
 static boolean_t
 raidz_will_avx2_work(void)
 {
-	return (zfs_avx_available() && zfs_avx2_available());
+	return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available());
 }
 
 const raidz_impl_ops_t vdev_raidz_avx2_impl = {
diff --git a/module/zfs/vdev_raidz_math_avx512bw.c b/module/zfs/vdev_raidz_math_avx512bw.c
index d605653db..2f545c9ec 100644
--- a/module/zfs/vdev_raidz_math_avx512bw.c
+++ b/module/zfs/vdev_raidz_math_avx512bw.c
@@ -393,9 +393,8 @@ DEFINE_REC_METHODS(avx512bw);
 static boolean_t
 raidz_will_avx512bw_work(void)
 {
-	return (zfs_avx_available() &&
-	    zfs_avx512f_available() &&
-	    zfs_avx512bw_available());
+	return (kfpu_allowed() && zfs_avx_available() &&
+	    zfs_avx512f_available() && zfs_avx512bw_available());
 }
 
 const raidz_impl_ops_t vdev_raidz_avx512bw_impl = {
diff --git a/module/zfs/vdev_raidz_math_avx512f.c b/module/zfs/vdev_raidz_math_avx512f.c
index f4e4560ce..75af7a8ee 100644
--- a/module/zfs/vdev_raidz_math_avx512f.c
+++ b/module/zfs/vdev_raidz_math_avx512f.c
@@ -470,9 +470,8 @@ DEFINE_REC_METHODS(avx512f);
 static boolean_t
 raidz_will_avx512f_work(void)
 {
-	return (zfs_avx_available() &&
-	    zfs_avx2_available() &&
-	    zfs_avx512f_available());
+	return (kfpu_allowed() && zfs_avx_available() &&
+	    zfs_avx2_available() && zfs_avx512f_available());
 }
 
 const raidz_impl_ops_t vdev_raidz_avx512f_impl = {
diff --git a/module/zfs/vdev_raidz_math_sse2.c b/module/zfs/vdev_raidz_math_sse2.c
index 9985da273..5b3a9385c 100644
--- a/module/zfs/vdev_raidz_math_sse2.c
+++ b/module/zfs/vdev_raidz_math_sse2.c
@@ -607,7 +607,7 @@ DEFINE_REC_METHODS(sse2);
 static boolean_t
 raidz_will_sse2_work(void)
 {
-	return (zfs_sse_available() && zfs_sse2_available());
+	return (kfpu_allowed() && zfs_sse_available() && zfs_sse2_available());
 }
 
 const raidz_impl_ops_t vdev_raidz_sse2_impl = {
diff --git a/module/zfs/vdev_raidz_math_ssse3.c b/module/zfs/vdev_raidz_math_ssse3.c
index 047a48d54..62247cf8e 100644
--- a/module/zfs/vdev_raidz_math_ssse3.c
+++ b/module/zfs/vdev_raidz_math_ssse3.c
@@ -399,8 +399,8 @@ DEFINE_REC_METHODS(ssse3);
 static boolean_t
 raidz_will_ssse3_work(void)
 {
-	return (zfs_sse_available() && zfs_sse2_available() &&
-	    zfs_ssse3_available());
+	return (kfpu_allowed() && zfs_sse_available() &&
+	    zfs_sse2_available() && zfs_ssse3_available());
 }
 
 const raidz_impl_ops_t vdev_raidz_ssse3_impl = {