/*
 * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * 1. Redistributions of source code must retain the above copyright notice,
 * this list of conditions and the following disclaimer.
 *
 * 2. Redistributions in binary form must reproduce the above copyright notice,
 * this list of conditions and the following disclaimer in the documentation
 * and/or other materials provided with the distribution.
 *
 * 3. Neither the name of the copyright holder nor the names of its
 * contributors may be used to endorse or promote products derived from this
 * software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2016-2018, Klara Inc.
 * Copyright (c) 2016-2018, Allan Jude
 * Copyright (c) 2018-2020, Sebastian Gottschall
 * Copyright (c) 2019-2020, Michael Niewöhner
 * Copyright (c) 2020, The FreeBSD Foundation [1]
 *
 * [1] Portions of this software were developed by Allan Jude
 * under sponsorship from the FreeBSD Foundation.
 */

#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/zfs_context.h>
#include <sys/zio_compress.h>
#include <sys/spa.h>
#include <sys/zstd/zstd.h>

#define	ZSTD_STATIC_LINKING_ONLY
#include "lib/zstd.h"
#include "lib/common/zstd_errors.h"

static uint_t zstd_earlyabort_pass = 1;
static int zstd_cutoff_level = ZIO_ZSTD_LEVEL_3;
static unsigned int zstd_abort_size = (128 * 1024);

static kstat_t *zstd_ksp = NULL;

typedef struct zstd_stats {
	kstat_named_t	zstd_stat_alloc_fail;
	kstat_named_t	zstd_stat_alloc_fallback;
	kstat_named_t	zstd_stat_com_alloc_fail;
	kstat_named_t	zstd_stat_dec_alloc_fail;
	kstat_named_t	zstd_stat_com_inval;
	kstat_named_t	zstd_stat_dec_inval;
	kstat_named_t	zstd_stat_dec_header_inval;
	kstat_named_t	zstd_stat_com_fail;
	kstat_named_t	zstd_stat_dec_fail;
	/*
	 * LZ4 first-pass early abort verdict
	 */
	kstat_named_t	zstd_stat_lz4pass_allowed;
	kstat_named_t	zstd_stat_lz4pass_rejected;
	/*
	 * zstd-1 second-pass early abort verdict
	 */
	kstat_named_t	zstd_stat_zstdpass_allowed;
	kstat_named_t	zstd_stat_zstdpass_rejected;
	/*
	 * We excluded this from early abort for some reason
	 */
	kstat_named_t	zstd_stat_passignored;
	kstat_named_t	zstd_stat_passignored_size;
	kstat_named_t	zstd_stat_buffers;
	kstat_named_t	zstd_stat_size;
} zstd_stats_t;

static zstd_stats_t zstd_stats = {
	{ "alloc_fail",			KSTAT_DATA_UINT64 },
	{ "alloc_fallback",		KSTAT_DATA_UINT64 },
	{ "compress_alloc_fail",	KSTAT_DATA_UINT64 },
	{ "decompress_alloc_fail",	KSTAT_DATA_UINT64 },
	{ "compress_level_invalid",	KSTAT_DATA_UINT64 },
	{ "decompress_level_invalid",	KSTAT_DATA_UINT64 },
	{ "decompress_header_invalid",	KSTAT_DATA_UINT64 },
	{ "compress_failed",		KSTAT_DATA_UINT64 },
	{ "decompress_failed",		KSTAT_DATA_UINT64 },
	{ "lz4pass_allowed",		KSTAT_DATA_UINT64 },
	{ "lz4pass_rejected",		KSTAT_DATA_UINT64 },
	{ "zstdpass_allowed",		KSTAT_DATA_UINT64 },
	{ "zstdpass_rejected",		KSTAT_DATA_UINT64 },
	{ "passignored",		KSTAT_DATA_UINT64 },
	{ "passignored_size",		KSTAT_DATA_UINT64 },
	{ "buffers",			KSTAT_DATA_UINT64 },
	{ "size",			KSTAT_DATA_UINT64 },
};
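
/*
 * Note: the counters above are published through the zstd kstat created in
 * zstd_init(); on Linux they typically appear under
 * /proc/spl/kstat/zfs/zstd.
 */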

#ifdef _KERNEL
static int
kstat_zstd_update(kstat_t *ksp, int rw)
{
	ASSERT(ksp != NULL);

	if (rw == KSTAT_WRITE && ksp == zstd_ksp) {
		ZSTDSTAT_ZERO(zstd_stat_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_alloc_fallback);
		ZSTDSTAT_ZERO(zstd_stat_com_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_dec_alloc_fail);
		ZSTDSTAT_ZERO(zstd_stat_com_inval);
		ZSTDSTAT_ZERO(zstd_stat_dec_inval);
		ZSTDSTAT_ZERO(zstd_stat_dec_header_inval);
		ZSTDSTAT_ZERO(zstd_stat_com_fail);
		ZSTDSTAT_ZERO(zstd_stat_dec_fail);
		ZSTDSTAT_ZERO(zstd_stat_lz4pass_allowed);
		ZSTDSTAT_ZERO(zstd_stat_lz4pass_rejected);
		ZSTDSTAT_ZERO(zstd_stat_zstdpass_allowed);
		ZSTDSTAT_ZERO(zstd_stat_zstdpass_rejected);
		ZSTDSTAT_ZERO(zstd_stat_passignored);
		ZSTDSTAT_ZERO(zstd_stat_passignored_size);
	}

	return (0);
}
#endif

/* Enums describing the allocator type specified by kmem_type in zstd_kmem */
enum zstd_kmem_type {
	ZSTD_KMEM_UNKNOWN = 0,
	/* Allocation type using kmem_vmalloc */
	ZSTD_KMEM_DEFAULT,
	/* Pool based allocation using mempool_alloc */
	ZSTD_KMEM_POOL,
	/* Reserved fallback memory for decompression only */
	ZSTD_KMEM_DCTX,
	ZSTD_KMEM_COUNT,
};

/* Structure for pooled memory objects */
struct zstd_pool {
	void *mem;
	size_t size;
	kmutex_t barrier;
	hrtime_t timeout;
};

/* Global structure for handling memory allocations */
struct zstd_kmem {
	enum zstd_kmem_type kmem_type;
	size_t kmem_size;
	struct zstd_pool *pool;
};

/* Fallback memory structure used for decompression only if memory runs out */
struct zstd_fallback_mem {
	size_t mem_size;
	void *mem;
	kmutex_t barrier;
};

struct zstd_levelmap {
	int16_t zstd_level;
	enum zio_zstd_levels level;
};

/*
 * ZSTD memory handlers
 *
 * For decompression we use a different handler which also provides fallback
 * memory allocation in case memory runs out.
 *
 * The handlers were split up to keep the implementation as simple as possible.
 */
static void *zstd_alloc(void *opaque, size_t size);
static void *zstd_dctx_alloc(void *opaque, size_t size);
static void zstd_free(void *opaque, void *ptr);

/* Compression memory handler */
static const ZSTD_customMem zstd_malloc = {
	zstd_alloc,
	zstd_free,
	NULL,
};

/* Decompression memory handler */
static const ZSTD_customMem zstd_dctx_malloc = {
	zstd_dctx_alloc,
	zstd_free,
	NULL,
};

/* Level map for converting ZFS internal levels to ZSTD levels and vice versa */
static struct zstd_levelmap zstd_levels[] = {
	{ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1},
	{ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2},
	{ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3},
	{ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4},
	{ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5},
	{ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6},
	{ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7},
	{ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8},
	{ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9},
	{ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10},
	{ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11},
	{ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12},
	{ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13},
	{ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14},
	{ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15},
	{ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16},
	{ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17},
	{ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18},
	{ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19},
	{-1, ZIO_ZSTD_LEVEL_FAST_1},
	{-2, ZIO_ZSTD_LEVEL_FAST_2},
	{-3, ZIO_ZSTD_LEVEL_FAST_3},
	{-4, ZIO_ZSTD_LEVEL_FAST_4},
	{-5, ZIO_ZSTD_LEVEL_FAST_5},
	{-6, ZIO_ZSTD_LEVEL_FAST_6},
	{-7, ZIO_ZSTD_LEVEL_FAST_7},
	{-8, ZIO_ZSTD_LEVEL_FAST_8},
	{-9, ZIO_ZSTD_LEVEL_FAST_9},
	{-10, ZIO_ZSTD_LEVEL_FAST_10},
	{-20, ZIO_ZSTD_LEVEL_FAST_20},
	{-30, ZIO_ZSTD_LEVEL_FAST_30},
	{-40, ZIO_ZSTD_LEVEL_FAST_40},
	{-50, ZIO_ZSTD_LEVEL_FAST_50},
	{-60, ZIO_ZSTD_LEVEL_FAST_60},
	{-70, ZIO_ZSTD_LEVEL_FAST_70},
	{-80, ZIO_ZSTD_LEVEL_FAST_80},
	{-90, ZIO_ZSTD_LEVEL_FAST_90},
	{-100, ZIO_ZSTD_LEVEL_FAST_100},
	{-500, ZIO_ZSTD_LEVEL_FAST_500},
	{-1000, ZIO_ZSTD_LEVEL_FAST_1000},
};

/*
 * This variable represents the maximum count of the pool based on the number
 * of CPUs plus some buffer. We default to cpu count * 4, see zstd_init.
 */
static int pool_count = 16;

#define	ZSTD_POOL_MAX		pool_count
#define	ZSTD_POOL_TIMEOUT	60 * 2
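/*
 * Pool entries that sit unused for ZSTD_POOL_TIMEOUT seconds (two minutes)
 * are considered stale and become eligible for release by
 * zstd_mempool_reap() below.
 */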

static struct zstd_fallback_mem zstd_dctx_fallback;
static struct zstd_pool *zstd_mempool_cctx;
static struct zstd_pool *zstd_mempool_dctx;

/*
 * The library zstd code expects these functions whenever ADDRESS_SANITIZER
 * is defined. Userspace ASAN provides them, but KASAN does not, so we supply
 * empty stubs here to avoid changing the external code.
 */
#if defined(ZFS_ASAN_ENABLED)
#define	ADDRESS_SANITIZER 1
#endif
#if defined(_KERNEL) && defined(ADDRESS_SANITIZER)
void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
void __asan_poison_memory_region(void const volatile *addr, size_t size);
void __asan_unpoison_memory_region(void const volatile *addr, size_t size) {};
void __asan_poison_memory_region(void const volatile *addr, size_t size) {};
#endif


static void
zstd_mempool_reap(struct zstd_pool *zstd_mempool)
{
	struct zstd_pool *pool;

	if (!zstd_mempool || !ZSTDSTAT(zstd_stat_buffers)) {
		return;
	}

	/* free obsolete slots */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (pool->mem && mutex_tryenter(&pool->barrier)) {
			/* Free memory if unused object older than 2 minutes */
			if (pool->mem && gethrestime_sec() > pool->timeout) {
				vmem_free(pool->mem, pool->size);
				ZSTDSTAT_SUB(zstd_stat_buffers, 1);
				ZSTDSTAT_SUB(zstd_stat_size, pool->size);
				pool->mem = NULL;
				pool->size = 0;
				pool->timeout = 0;
			}
			mutex_exit(&pool->barrier);
		}
	}
}

/*
 * Try to get a cached allocated buffer from memory pool or allocate a new one
 * if necessary. If an object is older than 2 minutes and does not fit the
 * requested size, it will be released and a new cached entry will be allocated.
 * If other pooled objects are detected without being used for 2 minutes, they
 * will be released, too.
 *
 * The concept is that high frequency memory allocations of bigger objects are
 * expensive. So if a lot of work is going on, allocations will be kept for a
 * while and can be reused in that time frame.
 *
 * The scheduled release will be updated every time an object is reused.
 */

static void *
zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
{
	struct zstd_pool *pool;
	struct zstd_kmem *mem = NULL;

	if (!zstd_mempool) {
		return (NULL);
	}

	/* Seek for preallocated memory slot and free obsolete slots */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		/*
		 * This lock is simply a marker for a pool object being in use.
		 * If it's already held, it will be skipped.
		 *
		 * We need to create it before checking it to avoid race
		 * conditions caused by running in a threaded context.
		 *
		 * The lock is later released by zstd_mempool_free.
		 */
		if (mutex_tryenter(&pool->barrier)) {
			/*
			 * Check if the object fits the size, if so we take it
			 * and update the timestamp.
			 */
			if (pool->mem && size <= pool->size) {
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;
				mem = pool->mem;
				return (mem);
			}
			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If no preallocated slot was found, try to fill in a new one.
	 *
	 * We run a similar algorithm twice here to avoid pool fragmentation.
	 * The first one may generate holes in the list if objects get released.
	 * We always make sure that these holes get filled instead of adding new
	 * allocations constantly at the end.
	 */
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		pool = &zstd_mempool[i];
		if (mutex_tryenter(&pool->barrier)) {
			/* Object is free, try to allocate new one */
			if (!pool->mem) {
				mem = vmem_alloc(size, KM_SLEEP);
				if (mem) {
					ZSTDSTAT_ADD(zstd_stat_buffers, 1);
					ZSTDSTAT_ADD(zstd_stat_size, size);
					pool->mem = mem;
					pool->size = size;
					/* Keep track for later release */
					mem->pool = pool;
					mem->kmem_type = ZSTD_KMEM_POOL;
					mem->kmem_size = size;
				}
			}

			if (size <= pool->size) {
				/* Update timestamp */
				pool->timeout = gethrestime_sec() +
				    ZSTD_POOL_TIMEOUT;

				return (pool->mem);
			}

			mutex_exit(&pool->barrier);
		}
	}

	/*
	 * If the pool is full or the allocation failed, try lazy allocation
	 * instead.
	 */
	if (!mem) {
		mem = vmem_alloc(size, KM_NOSLEEP);
		if (mem) {
			mem->pool = NULL;
			mem->kmem_type = ZSTD_KMEM_DEFAULT;
			mem->kmem_size = size;
		}
	}

	return (mem);
}

/* Mark object as released by releasing the barrier mutex */
static void
zstd_mempool_free(struct zstd_kmem *z)
{
	mutex_exit(&z->pool->barrier);
}

/* Convert ZFS internal enum to ZSTD level */
static int
zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
{
	if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) {
		*zstd_level = zstd_levels[level - 1].zstd_level;
		return (0);
	}
	if (level >= ZIO_ZSTD_LEVEL_FAST_1 &&
	    level <= ZIO_ZSTD_LEVEL_FAST_1000) {
		*zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1
		    + ZIO_ZSTD_LEVEL_19].zstd_level;
		return (0);
	}

	/* Invalid/unknown zfs compression enum - this should never happen. */
	return (1);
}
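
/*
 * Usage sketch (illustrative only): translating the on-disk level enum back
 * to a native zstd level before handing it to the library:
 *
 *	int16_t lvl;
 *	if (zstd_enum_to_level(ZIO_ZSTD_LEVEL_FAST_10, &lvl) == 0)
 *		ASSERT3S(lvl, ==, -10);
 */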

size_t
zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	int16_t zstd_level;
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}
	/*
	 * A zstd early abort heuristic.
	 *
	 * - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently
	 *   128k), don't try any of this, just go.
	 *   (because experimentally that was a reasonable cutoff for a perf win
	 *   with tiny ratio change)
	 * - First, we try LZ4 compression, and if it doesn't early abort, we
	 *   jump directly to whatever compression level we intended to try.
	 * - Second, we try zstd-1 - if that errors out (usually, but not
	 *   exclusively, if it would overflow), we give up early.
	 *
	 *   If it works, instead we go on and compress anyway.
	 *
	 * Why two passes? LZ4 alone gets you a lot of the way, but on highly
	 * compressible data, it was losing up to 8.5% of the compressed
	 * savings versus no early abort, and all the zstd-fast levels are
	 * worse indications on their own than LZ4, and don't improve the LZ4
	 * pass noticeably if stacked like this.
	 */
	size_t actual_abort_size = zstd_abort_size;
	if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
	    s_len >= actual_abort_size) {
		int pass_len = 1;
		pass_len = lz4_compress_zfs(s_start, d_start, s_len, d_len, 0);
		if (pass_len < d_len) {
			ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
			goto keep_trying;
		}
		ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);

		pass_len = zfs_zstd_compress(s_start, d_start, s_len, d_len,
		    ZIO_ZSTD_LEVEL_1);
		if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
			ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
			return (s_len);
		}
		ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
	} else {
		ZSTDSTAT_BUMP(zstd_stat_passignored);
		if (s_len < actual_abort_size) {
			ZSTDSTAT_BUMP(zstd_stat_passignored_size);
		}
	}
keep_trying:
	return (zfs_zstd_compress(s_start, d_start, s_len, d_len, level));

}
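
/*
 * Worked example with the default tunables (zstd_earlyabort_pass = 1,
 * zstd_cutoff_level = zstd-3, zstd_abort_size = 128k): a 1M record written
 * at zstd-9 first gets an LZ4 pass. If LZ4 shrinks it below d_len, we go
 * straight to zstd-9; otherwise a zstd-1 pass is the last chance before the
 * block is treated as incompressible and s_len is returned.
 */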

/* Compress block using zstd */
size_t
zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level)
{
	size_t c_len;
	int16_t zstd_level;
	zfs_zstdhdr_t *hdr;
	ZSTD_CCtx *cctx;

	hdr = (zfs_zstdhdr_t *)d_start;

	/* Skip compression if the specified level is invalid */
	if (zstd_enum_to_level(level, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_com_inval);
		return (s_len);
	}

	ASSERT3U(d_len, >=, sizeof (*hdr));
	ASSERT3U(d_len, <=, s_len);
	ASSERT3U(zstd_level, !=, 0);

	cctx = ZSTD_createCCtx_advanced(zstd_malloc);

	/*
	 * Out of kernel memory, gently fall through - this will disable
	 * compression in zio_compress_data
	 */
	if (!cctx) {
		ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail);
		return (s_len);
	}

	/* Set the compression level */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level);

	/* Use the "magicless" zstd header which saves us 4 header bytes */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);

	/*
	 * Disable redundant checksum calculation and content size storage since
	 * this is already done by ZFS itself.
	 */
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
	ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);

	c_len = ZSTD_compress2(cctx,
	    hdr->data,
	    d_len - sizeof (*hdr),
	    s_start, s_len);

	ZSTD_freeCCtx(cctx);

	/* Error in the compression routine, disable compression. */
	if (ZSTD_isError(c_len)) {
		/*
		 * If we are aborting the compression because the savings are
		 * too small, that is not a failure. Everything else is a
		 * failure, so increment the compression failure counter.
		 */
		int err = ZSTD_getErrorCode(c_len);
		if (err != ZSTD_error_dstSize_tooSmall) {
			ZSTDSTAT_BUMP(zstd_stat_com_fail);
			dprintf("Error: %s", ZSTD_getErrorString(err));
		}
		return (s_len);
	}

	/*
	 * Encode the compressed buffer size at the start. We'll need this in
	 * decompression to counter the effects of padding which might be added
	 * to the compressed buffer and which, if unhandled, would confuse the
	 * hell out of our decompression function.
	 */
	hdr->c_len = BE_32(c_len);

	/*
	 * Check version for overflow.
	 * The limit of 24 bits must not be exceeded. This allows a maximum
	 * version 1677.72.15 which we don't expect to be ever reached.
	 */
	ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF);

	/*
	 * Encode the compression level as well. We may need to know the
	 * original compression level if compressed_arc is disabled, to match
	 * the compression settings to write this block to the L2ARC.
	 *
	 * Encode the actual level, so if the enum changes in the future, we
	 * will be compatible.
	 *
	 * The upper 24 bits store the ZSTD version to be able to provide
	 * future compatibility, since new versions might enhance the
	 * compression algorithm in a way, where the compressed data will
	 * change.
	 *
	 * As soon as such incompatibility occurs, handling code needs to be
	 * added, differentiating between the versions.
	 */
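	/*
	 * For example (illustrative): zstd 1.4.5 has ZSTD_VERSION_NUMBER
	 * 10405 (1 * 10000 + 4 * 100 + 5), so a block written at level 7
	 * carries version 10405 together with ZIO_ZSTD_LEVEL_7 in
	 * raw_version_level.
	 */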
	zfs_set_hdrversion(hdr, ZSTD_VERSION_NUMBER);
	zfs_set_hdrlevel(hdr, level);
	hdr->raw_version_level = BE_32(hdr->raw_version_level);

	return (c_len + sizeof (*hdr));
}

/* Decompress block using zstd and return its stored level */
int
zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
    size_t d_len, uint8_t *level)
{
	ZSTD_DCtx *dctx;
	size_t result;
	int16_t zstd_level;
	uint32_t c_len;
	const zfs_zstdhdr_t *hdr;
	zfs_zstdhdr_t hdr_copy;

	hdr = (const zfs_zstdhdr_t *)s_start;
	c_len = BE_32(hdr->c_len);

	/*
	 * Make a copy instead of directly converting the header, since we must
	 * not modify the original data that may be used again later.
	 */
	hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);
	uint8_t curlevel = zfs_get_hdrlevel(&hdr_copy);

	/*
	 * NOTE: We ignore the ZSTD version for now. As soon as any
	 * incompatibility occurs, it has to be handled accordingly.
	 * The version can be accessed via `hdr_copy.version`.
	 */

	/*
	 * Convert and check the level.
	 * An invalid level is a strong indicator for data corruption! In such
	 * a case return an error so the upper layers can try to fix it.
	 */
	if (zstd_enum_to_level(curlevel, &zstd_level)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_inval);
		return (1);
	}

	ASSERT3U(d_len, >=, s_len);
	ASSERT3U(curlevel, !=, ZIO_COMPLEVEL_INHERIT);

	/* Invalid compressed buffer size encoded at start */
	if (c_len + sizeof (*hdr) > s_len) {
		ZSTDSTAT_BUMP(zstd_stat_dec_header_inval);
		return (1);
	}

	dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc);
	if (!dctx) {
		ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail);
		return (1);
	}

	/* Set header type to "magicless" */
	ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless);

	/* Decompress the data and release the context */
	result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len);
	ZSTD_freeDCtx(dctx);

	/*
	 * Returns 0 on success (decompression function returned non-negative)
	 * and non-zero on failure (decompression function returned negative).
	 */
	if (ZSTD_isError(result)) {
		ZSTDSTAT_BUMP(zstd_stat_dec_fail);
		return (1);
	}

	if (level) {
		*level = curlevel;
	}

	return (0);
}

/* Decompress datablock using zstd */
int
zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len,
    int level __maybe_unused)
{

	return (zfs_zstd_decompress_level(s_start, d_start, s_len, d_len,
	    NULL));
}

/* Allocator for zstd compression context using mempool_allocator */
static void *
zstd_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes);

	if (!z) {
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
		return (NULL);
	}

	return ((void*)z + (sizeof (struct zstd_kmem)));
}
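
/*
 * Sketch of the buffer layout used by the allocators here (zstd_alloc and
 * zstd_dctx_alloc) and unwound again by zstd_free():
 *
 *	+------------------+-------------------------------------+
 *	| struct zstd_kmem | "size" bytes usable by the library  |
 *	+------------------+-------------------------------------+
 *	                   ^-- pointer handed back to zstd
 */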

/*
 * Allocator for zstd decompression context using mempool_allocator with
 * fallback to reserved memory if allocation fails
 */
static void *
zstd_dctx_alloc(void *opaque __maybe_unused, size_t size)
{
	size_t nbytes = sizeof (struct zstd_kmem) + size;
	struct zstd_kmem *z = NULL;
	enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT;

	z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes);
	if (!z) {
		/* Try harder, decompression shall not fail */
		z = vmem_alloc(nbytes, KM_SLEEP);
		if (z) {
			z->pool = NULL;
		}
		ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
	} else {
		return ((void*)z + (sizeof (struct zstd_kmem)));
	}

	/* Fallback if everything fails */
	if (!z) {
		/*
		 * Barrier since we can only handle it in a single thread. All
		 * other following threads need to wait here until decompression
		 * is completed. zstd_free will release this barrier later.
		 */
		mutex_enter(&zstd_dctx_fallback.barrier);

		z = zstd_dctx_fallback.mem;
		type = ZSTD_KMEM_DCTX;
		ZSTDSTAT_BUMP(zstd_stat_alloc_fallback);
	}

	/* Allocation should always be successful */
	if (!z) {
		return (NULL);
	}

	z->kmem_type = type;
	z->kmem_size = nbytes;

	return ((void*)z + (sizeof (struct zstd_kmem)));
}

/* Free allocated memory by its specific type */
static void
zstd_free(void *opaque __maybe_unused, void *ptr)
{
	struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem));
	enum zstd_kmem_type type;

	ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT);
	ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN);

	type = z->kmem_type;
	switch (type) {
	case ZSTD_KMEM_DEFAULT:
		vmem_free(z, z->kmem_size);
		break;
	case ZSTD_KMEM_POOL:
		zstd_mempool_free(z);
		break;
	case ZSTD_KMEM_DCTX:
		mutex_exit(&zstd_dctx_fallback.barrier);
		break;
	default:
		break;
	}
}

/* Allocate fallback memory to ensure safe decompression */
static void __init
create_fallback_mem(struct zstd_fallback_mem *mem, size_t size)
{
	mem->mem_size = size;
	mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP);
	mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL);
}

/* Initialize memory pool barrier mutexes */
static void __init
zstd_mempool_init(void)
{
	zstd_mempool_cctx =
	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
	zstd_mempool_dctx =
	    kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);

	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		mutex_init(&zstd_mempool_cctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
		mutex_init(&zstd_mempool_dctx[i].barrier, NULL,
		    MUTEX_DEFAULT, NULL);
	}
}

/* Initialize zstd-related memory handling */
static int __init
zstd_meminit(void)
{
	zstd_mempool_init();

	/*
	 * Estimate the size of the fallback decompression context.
	 * The expected size on x64 with current ZSTD should be about 160 KB.
	 */
	create_fallback_mem(&zstd_dctx_fallback,
	    P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem),
	    PAGESIZE));

	return (0);
}

/* Release object from pool and free memory */
static void
release_pool(struct zstd_pool *pool)
{
	mutex_destroy(&pool->barrier);
	vmem_free(pool->mem, pool->size);
	pool->mem = NULL;
	pool->size = 0;
}

/* Release memory pool objects */
static void
zstd_mempool_deinit(void)
{
	for (int i = 0; i < ZSTD_POOL_MAX; i++) {
		release_pool(&zstd_mempool_cctx[i]);
		release_pool(&zstd_mempool_dctx[i]);
	}

	kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
	zstd_mempool_dctx = NULL;
	zstd_mempool_cctx = NULL;
}

/* Release unused memory from the pools */
void
zfs_zstd_cache_reap_now(void)
{
	/*
	 * Scan both pools and release cached objects that have gone unused
	 * for longer than the timeout.
	 */
	zstd_mempool_reap(zstd_mempool_cctx);
	zstd_mempool_reap(zstd_mempool_dctx);
}

extern int __init
zstd_init(void)
{
	/* Set pool size by using maximum sane thread count * 4 */
	pool_count = (boot_ncpus * 4);
	zstd_meminit();

	/* Initialize kstat */
	zstd_ksp = kstat_create("zfs", 0, "zstd", "misc",
	    KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (zstd_ksp != NULL) {
		zstd_ksp->ks_data = &zstd_stats;
		kstat_install(zstd_ksp);
#ifdef _KERNEL
		zstd_ksp->ks_update = kstat_zstd_update;
#endif
	}

	return (0);
}

extern void
zstd_fini(void)
{
	/* Deinitialize kstat */
	if (zstd_ksp != NULL) {
		kstat_delete(zstd_ksp);
		zstd_ksp = NULL;
	}

	/* Release fallback memory */
	vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size);
	mutex_destroy(&zstd_dctx_fallback.barrier);

	/* Deinit memory pool */
	zstd_mempool_deinit();
}

#if defined(_KERNEL)
#ifdef __FreeBSD__
module_init(zstd_init);
module_exit(zstd_fini);
#endif

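/*
 * On Linux the tunables below are exposed as module parameters, e.g.
 * /sys/module/zfs/parameters/zstd_earlyabort_pass and
 * /sys/module/zfs/parameters/zstd_abort_size.
 */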
ZFS_MODULE_PARAM(zfs, zstd_, earlyabort_pass, UINT, ZMOD_RW,
	"Enable early abort attempts when using zstd");
ZFS_MODULE_PARAM(zfs, zstd_, abort_size, UINT, ZMOD_RW,
	"Minimal size of block to attempt early abort");
#endif