2 * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions are met:
7 * 1. Redistributions of source code must retain the above copyright notice,
8 * this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright notice,
11 * this list of conditions and the following disclaimer in the documentation
12 * and/or other materials provided with the distribution.
14 * 3. Neither the name of the copyright holder nor the names of its
15 * contributors may be used to endorse or promote products derived from this
16 * software without specific prior written permission.
18 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28 * POSSIBILITY OF SUCH DAMAGE.
32 * Copyright (c) 2016-2018, Klara Inc.
33 * Copyright (c) 2016-2018, Allan Jude
34 * Copyright (c) 2018-2020, Sebastian Gottschall
35 * Copyright (c) 2019-2020, Michael Niewöhner
36 * Copyright (c) 2020, The FreeBSD Foundation [1]
38 * [1] Portions of this software were developed by Allan Jude
39 * under sponsorship from the FreeBSD Foundation.
42 #include <sys/param.h>
43 #include <sys/sysmacros.h>
44 #include <sys/zfs_context.h>
45 #include <sys/zio_compress.h>
47 #include <sys/zstd/zstd.h>
49 #define ZSTD_STATIC_LINKING_ONLY
51 #include "lib/common/zstd_errors.h"
53 static uint_t zstd_earlyabort_pass
= 1;
54 static int zstd_cutoff_level
= ZIO_ZSTD_LEVEL_3
;
55 static unsigned int zstd_abort_size
= (128 * 1024);
57 static kstat_t
*zstd_ksp
= NULL
;
59 typedef struct zstd_stats
{
60 kstat_named_t zstd_stat_alloc_fail
;
61 kstat_named_t zstd_stat_alloc_fallback
;
62 kstat_named_t zstd_stat_com_alloc_fail
;
63 kstat_named_t zstd_stat_dec_alloc_fail
;
64 kstat_named_t zstd_stat_com_inval
;
65 kstat_named_t zstd_stat_dec_inval
;
66 kstat_named_t zstd_stat_dec_header_inval
;
67 kstat_named_t zstd_stat_com_fail
;
68 kstat_named_t zstd_stat_dec_fail
;
70 * LZ4 first-pass early abort verdict
72 kstat_named_t zstd_stat_lz4pass_allowed
;
73 kstat_named_t zstd_stat_lz4pass_rejected
;
75 * zstd-1 second-pass early abort verdict
77 kstat_named_t zstd_stat_zstdpass_allowed
;
78 kstat_named_t zstd_stat_zstdpass_rejected
;
80 * We excluded this from early abort for some reason
82 kstat_named_t zstd_stat_passignored
;
83 kstat_named_t zstd_stat_passignored_size
;
84 kstat_named_t zstd_stat_buffers
;
85 kstat_named_t zstd_stat_size
;
88 static zstd_stats_t zstd_stats
= {
89 { "alloc_fail", KSTAT_DATA_UINT64
},
90 { "alloc_fallback", KSTAT_DATA_UINT64
},
91 { "compress_alloc_fail", KSTAT_DATA_UINT64
},
92 { "decompress_alloc_fail", KSTAT_DATA_UINT64
},
93 { "compress_level_invalid", KSTAT_DATA_UINT64
},
94 { "decompress_level_invalid", KSTAT_DATA_UINT64
},
95 { "decompress_header_invalid", KSTAT_DATA_UINT64
},
96 { "compress_failed", KSTAT_DATA_UINT64
},
97 { "decompress_failed", KSTAT_DATA_UINT64
},
98 { "lz4pass_allowed", KSTAT_DATA_UINT64
},
99 { "lz4pass_rejected", KSTAT_DATA_UINT64
},
100 { "zstdpass_allowed", KSTAT_DATA_UINT64
},
101 { "zstdpass_rejected", KSTAT_DATA_UINT64
},
102 { "passignored", KSTAT_DATA_UINT64
},
103 { "passignored_size", KSTAT_DATA_UINT64
},
104 { "buffers", KSTAT_DATA_UINT64
},
105 { "size", KSTAT_DATA_UINT64
},
110 kstat_zstd_update(kstat_t
*ksp
, int rw
)
114 if (rw
== KSTAT_WRITE
&& ksp
== zstd_ksp
) {
115 ZSTDSTAT_ZERO(zstd_stat_alloc_fail
);
116 ZSTDSTAT_ZERO(zstd_stat_alloc_fallback
);
117 ZSTDSTAT_ZERO(zstd_stat_com_alloc_fail
);
118 ZSTDSTAT_ZERO(zstd_stat_dec_alloc_fail
);
119 ZSTDSTAT_ZERO(zstd_stat_com_inval
);
120 ZSTDSTAT_ZERO(zstd_stat_dec_inval
);
121 ZSTDSTAT_ZERO(zstd_stat_dec_header_inval
);
122 ZSTDSTAT_ZERO(zstd_stat_com_fail
);
123 ZSTDSTAT_ZERO(zstd_stat_dec_fail
);
124 ZSTDSTAT_ZERO(zstd_stat_lz4pass_allowed
);
125 ZSTDSTAT_ZERO(zstd_stat_lz4pass_rejected
);
126 ZSTDSTAT_ZERO(zstd_stat_zstdpass_allowed
);
127 ZSTDSTAT_ZERO(zstd_stat_zstdpass_rejected
);
128 ZSTDSTAT_ZERO(zstd_stat_passignored
);
129 ZSTDSTAT_ZERO(zstd_stat_passignored_size
);
136 /* Enums describing the allocator type specified by kmem_type in zstd_kmem */
137 enum zstd_kmem_type
{
138 ZSTD_KMEM_UNKNOWN
= 0,
139 /* Allocation type using kmem_vmalloc */
141 /* Pool based allocation using mempool_alloc */
143 /* Reserved fallback memory for decompression only */
148 /* Structure for pooled memory objects */
156 /* Global structure for handling memory allocations */
158 enum zstd_kmem_type kmem_type
;
160 struct zstd_pool
*pool
;
163 /* Fallback memory structure used for decompression only if memory runs out */
164 struct zstd_fallback_mem
{
170 struct zstd_levelmap
{
172 enum zio_zstd_levels level
;
/*
 * ZSTD memory handlers
 *
 * Compression and decompression use separate allocation handlers. The
 * decompression handler additionally provides fallback memory allocation
 * in case memory runs out.
 *
 * The ZSTD handlers were split up for the most simplified implementation.
 */

/* Allocator for compression contexts. */
static void *zstd_alloc(void *opaque, size_t size);

/* Allocator for decompression contexts (with reserved-memory fallback). */
static void *zstd_dctx_alloc(void *opaque, size_t size);

/* Releases memory handed out by the allocators above. */
static void zstd_free(void *opaque, void *ptr);
187 /* Compression memory handler */
188 static const ZSTD_customMem zstd_malloc
= {
194 /* Decompression memory handler */
195 static const ZSTD_customMem zstd_dctx_malloc
= {
201 /* Level map for converting ZFS internal levels to ZSTD levels and vice versa */
202 static struct zstd_levelmap zstd_levels
[] = {
203 {ZIO_ZSTD_LEVEL_1
, ZIO_ZSTD_LEVEL_1
},
204 {ZIO_ZSTD_LEVEL_2
, ZIO_ZSTD_LEVEL_2
},
205 {ZIO_ZSTD_LEVEL_3
, ZIO_ZSTD_LEVEL_3
},
206 {ZIO_ZSTD_LEVEL_4
, ZIO_ZSTD_LEVEL_4
},
207 {ZIO_ZSTD_LEVEL_5
, ZIO_ZSTD_LEVEL_5
},
208 {ZIO_ZSTD_LEVEL_6
, ZIO_ZSTD_LEVEL_6
},
209 {ZIO_ZSTD_LEVEL_7
, ZIO_ZSTD_LEVEL_7
},
210 {ZIO_ZSTD_LEVEL_8
, ZIO_ZSTD_LEVEL_8
},
211 {ZIO_ZSTD_LEVEL_9
, ZIO_ZSTD_LEVEL_9
},
212 {ZIO_ZSTD_LEVEL_10
, ZIO_ZSTD_LEVEL_10
},
213 {ZIO_ZSTD_LEVEL_11
, ZIO_ZSTD_LEVEL_11
},
214 {ZIO_ZSTD_LEVEL_12
, ZIO_ZSTD_LEVEL_12
},
215 {ZIO_ZSTD_LEVEL_13
, ZIO_ZSTD_LEVEL_13
},
216 {ZIO_ZSTD_LEVEL_14
, ZIO_ZSTD_LEVEL_14
},
217 {ZIO_ZSTD_LEVEL_15
, ZIO_ZSTD_LEVEL_15
},
218 {ZIO_ZSTD_LEVEL_16
, ZIO_ZSTD_LEVEL_16
},
219 {ZIO_ZSTD_LEVEL_17
, ZIO_ZSTD_LEVEL_17
},
220 {ZIO_ZSTD_LEVEL_18
, ZIO_ZSTD_LEVEL_18
},
221 {ZIO_ZSTD_LEVEL_19
, ZIO_ZSTD_LEVEL_19
},
222 {-1, ZIO_ZSTD_LEVEL_FAST_1
},
223 {-2, ZIO_ZSTD_LEVEL_FAST_2
},
224 {-3, ZIO_ZSTD_LEVEL_FAST_3
},
225 {-4, ZIO_ZSTD_LEVEL_FAST_4
},
226 {-5, ZIO_ZSTD_LEVEL_FAST_5
},
227 {-6, ZIO_ZSTD_LEVEL_FAST_6
},
228 {-7, ZIO_ZSTD_LEVEL_FAST_7
},
229 {-8, ZIO_ZSTD_LEVEL_FAST_8
},
230 {-9, ZIO_ZSTD_LEVEL_FAST_9
},
231 {-10, ZIO_ZSTD_LEVEL_FAST_10
},
232 {-20, ZIO_ZSTD_LEVEL_FAST_20
},
233 {-30, ZIO_ZSTD_LEVEL_FAST_30
},
234 {-40, ZIO_ZSTD_LEVEL_FAST_40
},
235 {-50, ZIO_ZSTD_LEVEL_FAST_50
},
236 {-60, ZIO_ZSTD_LEVEL_FAST_60
},
237 {-70, ZIO_ZSTD_LEVEL_FAST_70
},
238 {-80, ZIO_ZSTD_LEVEL_FAST_80
},
239 {-90, ZIO_ZSTD_LEVEL_FAST_90
},
240 {-100, ZIO_ZSTD_LEVEL_FAST_100
},
241 {-500, ZIO_ZSTD_LEVEL_FAST_500
},
242 {-1000, ZIO_ZSTD_LEVEL_FAST_1000
},
/*
 * This variable represents the maximum count of the pool based on the number
 * of CPUs plus some buffer. The initializer below is only a default;
 * initialization code overwrites it with boot_ncpus * 4.
 */
static int pool_count = 16;

/* Number of slots in each pool; follows pool_count at run time. */
#define ZSTD_POOL_MAX pool_count

/*
 * Timeout of a pooled object in seconds (2 minutes).
 * The replacement list is parenthesized so the macro always expands to a
 * single value, no matter which operators surround it at the use site.
 */
#define ZSTD_POOL_TIMEOUT (60 * 2)
254 static struct zstd_fallback_mem zstd_dctx_fallback
;
255 static struct zstd_pool
*zstd_mempool_cctx
;
256 static struct zstd_pool
*zstd_mempool_dctx
;
/*
 * The library zstd code expects these if ADDRESS_SANITIZER gets defined,
 * and while ASAN does this, KASAN defines that and does not. So to avoid
 * changing the external code, we do this.
 */
#if defined(ZFS_ASAN_ENABLED)
#define ADDRESS_SANITIZER 1
#endif
#if defined(_KERNEL) && defined(ADDRESS_SANITIZER)
/* Declared first so the definitions below have a visible prototype. */
void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
void __asan_poison_memory_region(void const volatile *addr, size_t size);

/* Empty stand-in: intentionally does nothing with the region. */
void
__asan_unpoison_memory_region(void const volatile *addr, size_t size)
{
	(void) addr;
	(void) size;
}

/* Empty stand-in: intentionally does nothing with the region. */
void
__asan_poison_memory_region(void const volatile *addr, size_t size)
{
	(void) addr;
	(void) size;
}
#endif
275 zstd_mempool_reap(struct zstd_pool
*zstd_mempool
)
277 struct zstd_pool
*pool
;
279 if (!zstd_mempool
|| !ZSTDSTAT(zstd_stat_buffers
)) {
283 /* free obsolete slots */
284 for (int i
= 0; i
< ZSTD_POOL_MAX
; i
++) {
285 pool
= &zstd_mempool
[i
];
286 if (pool
->mem
&& mutex_tryenter(&pool
->barrier
)) {
287 /* Free memory if unused object older than 2 minutes */
288 if (pool
->mem
&& gethrestime_sec() > pool
->timeout
) {
289 vmem_free(pool
->mem
, pool
->size
);
290 ZSTDSTAT_SUB(zstd_stat_buffers
, 1);
291 ZSTDSTAT_SUB(zstd_stat_size
, pool
->size
);
296 mutex_exit(&pool
->barrier
);
302 * Try to get a cached allocated buffer from memory pool or allocate a new one
303 * if necessary. If a object is older than 2 minutes and does not fit the
304 * requested size, it will be released and a new cached entry will be allocated.
305 * If other pooled objects are detected without being used for 2 minutes, they
306 * will be released, too.
308 * The concept is that high frequency memory allocations of bigger objects are
309 * expensive. So if a lot of work is going on, allocations will be kept for a
310 * while and can be reused in that time frame.
312 * The scheduled release will be updated every time an object is reused.
316 zstd_mempool_alloc(struct zstd_pool
*zstd_mempool
, size_t size
)
318 struct zstd_pool
*pool
;
319 struct zstd_kmem
*mem
= NULL
;
325 /* Seek for preallocated memory slot and free obsolete slots */
326 for (int i
= 0; i
< ZSTD_POOL_MAX
; i
++) {
327 pool
= &zstd_mempool
[i
];
329 * This lock is simply a marker for a pool object being in use.
330 * If it's already held, it will be skipped.
332 * We need to create it before checking it to avoid race
333 * conditions caused by running in a threaded context.
335 * The lock is later released by zstd_mempool_free.
337 if (mutex_tryenter(&pool
->barrier
)) {
339 * Check if objects fits the size, if so we take it and
340 * update the timestamp.
342 if (pool
->mem
&& size
<= pool
->size
) {
343 pool
->timeout
= gethrestime_sec() +
348 mutex_exit(&pool
->barrier
);
353 * If no preallocated slot was found, try to fill in a new one.
355 * We run a similar algorithm twice here to avoid pool fragmentation.
356 * The first one may generate holes in the list if objects get released.
357 * We always make sure that these holes get filled instead of adding new
358 * allocations constantly at the end.
360 for (int i
= 0; i
< ZSTD_POOL_MAX
; i
++) {
361 pool
= &zstd_mempool
[i
];
362 if (mutex_tryenter(&pool
->barrier
)) {
363 /* Object is free, try to allocate new one */
365 mem
= vmem_alloc(size
, KM_SLEEP
);
367 ZSTDSTAT_ADD(zstd_stat_buffers
, 1);
368 ZSTDSTAT_ADD(zstd_stat_size
, size
);
371 /* Keep track for later release */
373 mem
->kmem_type
= ZSTD_KMEM_POOL
;
374 mem
->kmem_size
= size
;
378 if (size
<= pool
->size
) {
379 /* Update timestamp */
380 pool
->timeout
= gethrestime_sec() +
386 mutex_exit(&pool
->barrier
);
391 * If the pool is full or the allocation failed, try lazy allocation
395 mem
= vmem_alloc(size
, KM_NOSLEEP
);
398 mem
->kmem_type
= ZSTD_KMEM_DEFAULT
;
399 mem
->kmem_size
= size
;
406 /* Mark object as released by releasing the barrier mutex */
408 zstd_mempool_free(struct zstd_kmem
*z
)
410 mutex_exit(&z
->pool
->barrier
);
413 /* Convert ZFS internal enum to ZSTD level */
415 zstd_enum_to_level(enum zio_zstd_levels level
, int16_t *zstd_level
)
417 if (level
> 0 && level
<= ZIO_ZSTD_LEVEL_19
) {
418 *zstd_level
= zstd_levels
[level
- 1].zstd_level
;
421 if (level
>= ZIO_ZSTD_LEVEL_FAST_1
&&
422 level
<= ZIO_ZSTD_LEVEL_FAST_1000
) {
423 *zstd_level
= zstd_levels
[level
- ZIO_ZSTD_LEVEL_FAST_1
424 + ZIO_ZSTD_LEVEL_19
].zstd_level
;
428 /* Invalid/unknown zfs compression enum - this should never happen. */
434 zfs_zstd_compress_wrap(void *s_start
, void *d_start
, size_t s_len
, size_t d_len
,
438 if (zstd_enum_to_level(level
, &zstd_level
)) {
439 ZSTDSTAT_BUMP(zstd_stat_com_inval
);
443 * A zstd early abort heuristic.
445 * - Zeroth, if this is < zstd-3, or < zstd_abort_size (currently
446 * 128k), don't try any of this, just go.
447 * (because experimentally that was a reasonable cutoff for a perf win
448 * with tiny ratio change)
449 * - First, we try LZ4 compression, and if it doesn't early abort, we
450 * jump directly to whatever compression level we intended to try.
451 * - Second, we try zstd-1 - if that errors out (usually, but not
452 * exclusively, if it would overflow), we give up early.
454 * If it works, instead we go on and compress anyway.
456 * Why two passes? LZ4 alone gets you a lot of the way, but on highly
457 * compressible data, it was losing up to 8.5% of the compressed
458 * savings versus no early abort, and all the zstd-fast levels are
459 * worse indications on their own than LZ4, and don't improve the LZ4
460 * pass noticeably if stacked like this.
462 size_t actual_abort_size
= zstd_abort_size
;
463 if (zstd_earlyabort_pass
> 0 && zstd_level
>= zstd_cutoff_level
&&
464 s_len
>= actual_abort_size
) {
466 pass_len
= lz4_compress_zfs(s_start
, d_start
, s_len
, d_len
, 0);
467 if (pass_len
< d_len
) {
468 ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed
);
471 ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected
);
473 pass_len
= zfs_zstd_compress(s_start
, d_start
, s_len
, d_len
,
475 if (pass_len
== s_len
|| pass_len
<= 0 || pass_len
> d_len
) {
476 ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected
);
479 ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed
);
481 ZSTDSTAT_BUMP(zstd_stat_passignored
);
482 if (s_len
< actual_abort_size
) {
483 ZSTDSTAT_BUMP(zstd_stat_passignored_size
);
487 return (zfs_zstd_compress(s_start
, d_start
, s_len
, d_len
, level
));
491 /* Compress block using zstd */
493 zfs_zstd_compress(void *s_start
, void *d_start
, size_t s_len
, size_t d_len
,
501 hdr
= (zfs_zstdhdr_t
*)d_start
;
503 /* Skip compression if the specified level is invalid */
504 if (zstd_enum_to_level(level
, &zstd_level
)) {
505 ZSTDSTAT_BUMP(zstd_stat_com_inval
);
509 ASSERT3U(d_len
, >=, sizeof (*hdr
));
510 ASSERT3U(d_len
, <=, s_len
);
511 ASSERT3U(zstd_level
, !=, 0);
513 cctx
= ZSTD_createCCtx_advanced(zstd_malloc
);
516 * Out of kernel memory, gently fall through - this will disable
517 * compression in zio_compress_data
520 ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail
);
524 /* Set the compression level */
525 ZSTD_CCtx_setParameter(cctx
, ZSTD_c_compressionLevel
, zstd_level
);
527 /* Use the "magicless" zstd header which saves us 4 header bytes */
528 ZSTD_CCtx_setParameter(cctx
, ZSTD_c_format
, ZSTD_f_zstd1_magicless
);
531 * Disable redundant checksum calculation and content size storage since
532 * this is already done by ZFS itself.
534 ZSTD_CCtx_setParameter(cctx
, ZSTD_c_checksumFlag
, 0);
535 ZSTD_CCtx_setParameter(cctx
, ZSTD_c_contentSizeFlag
, 0);
537 c_len
= ZSTD_compress2(cctx
,
539 d_len
- sizeof (*hdr
),
544 /* Error in the compression routine, disable compression. */
545 if (ZSTD_isError(c_len
)) {
547 * If we are aborting the compression because the saves are
548 * too small, that is not a failure. Everything else is a
549 * failure, so increment the compression failure counter.
551 int err
= ZSTD_getErrorCode(c_len
);
552 if (err
!= ZSTD_error_dstSize_tooSmall
) {
553 ZSTDSTAT_BUMP(zstd_stat_com_fail
);
554 dprintf("Error: %s", ZSTD_getErrorString(err
));
560 * Encode the compressed buffer size at the start. We'll need this in
561 * decompression to counter the effects of padding which might be added
562 * to the compressed buffer and which, if unhandled, would confuse the
563 * hell out of our decompression function.
565 hdr
->c_len
= BE_32(c_len
);
568 * Check version for overflow.
569 * The limit of 24 bits must not be exceeded. This allows a maximum
570 * version 1677.72.15 which we don't expect to be ever reached.
572 ASSERT3U(ZSTD_VERSION_NUMBER
, <=, 0xFFFFFF);
575 * Encode the compression level as well. We may need to know the
576 * original compression level if compressed_arc is disabled, to match
577 * the compression settings to write this block to the L2ARC.
579 * Encode the actual level, so if the enum changes in the future, we
580 * will be compatible.
582 * The upper 24 bits store the ZSTD version to be able to provide
583 * future compatibility, since new versions might enhance the
584 * compression algorithm in a way, where the compressed data will
587 * As soon as such incompatibility occurs, handling code needs to be
588 * added, differentiating between the versions.
590 zfs_set_hdrversion(hdr
, ZSTD_VERSION_NUMBER
);
591 zfs_set_hdrlevel(hdr
, level
);
592 hdr
->raw_version_level
= BE_32(hdr
->raw_version_level
);
594 return (c_len
+ sizeof (*hdr
));
597 /* Decompress block using zstd and return its stored level */
599 zfs_zstd_decompress_level(void *s_start
, void *d_start
, size_t s_len
,
600 size_t d_len
, uint8_t *level
)
606 const zfs_zstdhdr_t
*hdr
;
607 zfs_zstdhdr_t hdr_copy
;
609 hdr
= (const zfs_zstdhdr_t
*)s_start
;
610 c_len
= BE_32(hdr
->c_len
);
613 * Make a copy instead of directly converting the header, since we must
614 * not modify the original data that may be used again later.
616 hdr_copy
.raw_version_level
= BE_32(hdr
->raw_version_level
);
617 uint8_t curlevel
= zfs_get_hdrlevel(&hdr_copy
);
620 * NOTE: We ignore the ZSTD version for now. As soon as any
621 * incompatibility occurs, it has to be handled accordingly.
622 * The version can be accessed via `hdr_copy.version`.
626 * Convert and check the level
627 * An invalid level is a strong indicator for data corruption! In such
628 * case return an error so the upper layers can try to fix it.
630 if (zstd_enum_to_level(curlevel
, &zstd_level
)) {
631 ZSTDSTAT_BUMP(zstd_stat_dec_inval
);
635 ASSERT3U(d_len
, >=, s_len
);
636 ASSERT3U(curlevel
, !=, ZIO_COMPLEVEL_INHERIT
);
638 /* Invalid compressed buffer size encoded at start */
639 if (c_len
+ sizeof (*hdr
) > s_len
) {
640 ZSTDSTAT_BUMP(zstd_stat_dec_header_inval
);
644 dctx
= ZSTD_createDCtx_advanced(zstd_dctx_malloc
);
646 ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail
);
650 /* Set header type to "magicless" */
651 ZSTD_DCtx_setParameter(dctx
, ZSTD_d_format
, ZSTD_f_zstd1_magicless
);
653 /* Decompress the data and release the context */
654 result
= ZSTD_decompressDCtx(dctx
, d_start
, d_len
, hdr
->data
, c_len
);
658 * Returns 0 on success (decompression function returned non-negative)
659 * and non-zero on failure (decompression function returned negative).
661 if (ZSTD_isError(result
)) {
662 ZSTDSTAT_BUMP(zstd_stat_dec_fail
);
673 /* Decompress datablock using zstd */
675 zfs_zstd_decompress(void *s_start
, void *d_start
, size_t s_len
, size_t d_len
,
676 int level __maybe_unused
)
679 return (zfs_zstd_decompress_level(s_start
, d_start
, s_len
, d_len
,
683 /* Allocator for zstd compression context using mempool_allocator */
685 zstd_alloc(void *opaque __maybe_unused
, size_t size
)
687 size_t nbytes
= sizeof (struct zstd_kmem
) + size
;
688 struct zstd_kmem
*z
= NULL
;
690 z
= (struct zstd_kmem
*)zstd_mempool_alloc(zstd_mempool_cctx
, nbytes
);
693 ZSTDSTAT_BUMP(zstd_stat_alloc_fail
);
697 return ((void*)z
+ (sizeof (struct zstd_kmem
)));
701 * Allocator for zstd decompression context using mempool_allocator with
702 * fallback to reserved memory if allocation fails
705 zstd_dctx_alloc(void *opaque __maybe_unused
, size_t size
)
707 size_t nbytes
= sizeof (struct zstd_kmem
) + size
;
708 struct zstd_kmem
*z
= NULL
;
709 enum zstd_kmem_type type
= ZSTD_KMEM_DEFAULT
;
711 z
= (struct zstd_kmem
*)zstd_mempool_alloc(zstd_mempool_dctx
, nbytes
);
713 /* Try harder, decompression shall not fail */
714 z
= vmem_alloc(nbytes
, KM_SLEEP
);
718 ZSTDSTAT_BUMP(zstd_stat_alloc_fail
);
720 return ((void*)z
+ (sizeof (struct zstd_kmem
)));
723 /* Fallback if everything fails */
726 * Barrier since we only can handle it in a single thread. All
727 * other following threads need to wait here until decompression
728 * is completed. zstd_free will release this barrier later.
730 mutex_enter(&zstd_dctx_fallback
.barrier
);
732 z
= zstd_dctx_fallback
.mem
;
733 type
= ZSTD_KMEM_DCTX
;
734 ZSTDSTAT_BUMP(zstd_stat_alloc_fallback
);
737 /* Allocation should always be successful */
743 z
->kmem_size
= nbytes
;
745 return ((void*)z
+ (sizeof (struct zstd_kmem
)));
748 /* Free allocated memory by its specific type */
750 zstd_free(void *opaque __maybe_unused
, void *ptr
)
752 struct zstd_kmem
*z
= (ptr
- sizeof (struct zstd_kmem
));
753 enum zstd_kmem_type type
;
755 ASSERT3U(z
->kmem_type
, <, ZSTD_KMEM_COUNT
);
756 ASSERT3U(z
->kmem_type
, >, ZSTD_KMEM_UNKNOWN
);
760 case ZSTD_KMEM_DEFAULT
:
761 vmem_free(z
, z
->kmem_size
);
764 zstd_mempool_free(z
);
767 mutex_exit(&zstd_dctx_fallback
.barrier
);
774 /* Allocate fallback memory to ensure safe decompression */
776 create_fallback_mem(struct zstd_fallback_mem
*mem
, size_t size
)
778 mem
->mem_size
= size
;
779 mem
->mem
= vmem_zalloc(mem
->mem_size
, KM_SLEEP
);
780 mutex_init(&mem
->barrier
, NULL
, MUTEX_DEFAULT
, NULL
);
783 /* Initialize memory pool barrier mutexes */
785 zstd_mempool_init(void)
788 kmem_zalloc(ZSTD_POOL_MAX
* sizeof (struct zstd_pool
), KM_SLEEP
);
790 kmem_zalloc(ZSTD_POOL_MAX
* sizeof (struct zstd_pool
), KM_SLEEP
);
792 for (int i
= 0; i
< ZSTD_POOL_MAX
; i
++) {
793 mutex_init(&zstd_mempool_cctx
[i
].barrier
, NULL
,
794 MUTEX_DEFAULT
, NULL
);
795 mutex_init(&zstd_mempool_dctx
[i
].barrier
, NULL
,
796 MUTEX_DEFAULT
, NULL
);
800 /* Initialize zstd-related memory handling */
807 * Estimate the size of the fallback decompression context.
808 * The expected size on x64 with current ZSTD should be about 160 KB.
810 create_fallback_mem(&zstd_dctx_fallback
,
811 P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem
),
817 /* Release object from pool and free memory */
819 release_pool(struct zstd_pool
*pool
)
821 mutex_destroy(&pool
->barrier
);
822 vmem_free(pool
->mem
, pool
->size
);
827 /* Release memory pool objects */
829 zstd_mempool_deinit(void)
831 for (int i
= 0; i
< ZSTD_POOL_MAX
; i
++) {
832 release_pool(&zstd_mempool_cctx
[i
]);
833 release_pool(&zstd_mempool_dctx
[i
]);
836 kmem_free(zstd_mempool_dctx
, ZSTD_POOL_MAX
* sizeof (struct zstd_pool
));
837 kmem_free(zstd_mempool_cctx
, ZSTD_POOL_MAX
* sizeof (struct zstd_pool
));
838 zstd_mempool_dctx
= NULL
;
839 zstd_mempool_cctx
= NULL
;
842 /* release unused memory from pool */
845 zfs_zstd_cache_reap_now(void)
848 * walk both memory pools and release
849 * cached objects whose timeout has expired
851 zstd_mempool_reap(zstd_mempool_cctx
);
852 zstd_mempool_reap(zstd_mempool_dctx
);
858 /* Set pool size by using maximum sane thread count * 4 */
859 pool_count
= (boot_ncpus
* 4);
862 /* Initialize kstat */
863 zstd_ksp
= kstat_create("zfs", 0, "zstd", "misc",
864 KSTAT_TYPE_NAMED
, sizeof (zstd_stats
) / sizeof (kstat_named_t
),
866 if (zstd_ksp
!= NULL
) {
867 zstd_ksp
->ks_data
= &zstd_stats
;
868 kstat_install(zstd_ksp
);
870 zstd_ksp
->ks_update
= kstat_zstd_update
;
880 /* Deinitialize kstat */
881 if (zstd_ksp
!= NULL
) {
882 kstat_delete(zstd_ksp
);
886 /* Release fallback memory */
887 vmem_free(zstd_dctx_fallback
.mem
, zstd_dctx_fallback
.mem_size
);
888 mutex_destroy(&zstd_dctx_fallback
.barrier
);
890 /* Deinit memory pool */
891 zstd_mempool_deinit();
896 module_init(zstd_init
);
897 module_exit(zstd_fini
);
900 ZFS_MODULE_PARAM(zfs
, zstd_
, earlyabort_pass
, UINT
, ZMOD_RW
,
901 "Enable early abort attempts when using zstd");
902 ZFS_MODULE_PARAM(zfs
, zstd_
, abort_size
, UINT
, ZMOD_RW
,
903 "Minimal size of block to attempt early abort");