module/zstd/zfs_zstd.c

   1 /*
   2  * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
   3  *
   4  * Redistribution and use in source and binary forms, with or without
   5  * modification, are permitted provided that the following conditions are met:
   6  *
   7  * 1. Redistributions of source code must retain the above copyright notice,
   8  * this list of conditions and the following disclaimer.
   9  *
  10  * 2. Redistributions in binary form must reproduce the above copyright notice,
  11  * this list of conditions and the following disclaimer in the documentation
  12  * and/or other materials provided with the distribution.
  13  *
  14  * 3. Neither the name of the copyright holder nor the names of its
  15  * contributors may be used to endorse or promote products derived from this
  16  * software without specific prior written permission.
  17  *
  18  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
  19  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  21  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
  22  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  23  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
  24  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
  25  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
  26  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  27  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  28  * POSSIBILITY OF SUCH DAMAGE.
  29  */
  30
  31 /*
  32  * Copyright (c) 2016-2018, Klara Inc.
  33  * Copyright (c) 2016-2018, Allan Jude
  34  * Copyright (c) 2018-2020, Sebastian Gottschall
  35  * Copyright (c) 2019-2020, Michael Niewöhner
  36  * Copyright (c) 2020, The FreeBSD Foundation [1]
  37  *
  38  * [1] Portions of this software were developed by Allan Jude
  39  *     under sponsorship from the FreeBSD Foundation.
  40  */
  41
  42 #include <sys/param.h>
  43 #include <sys/sysmacros.h>
  44 #include <sys/zfs_context.h>
  45 #include <sys/zio_compress.h>
  46 #include <sys/spa.h>
  47 #include <sys/zstd/zstd.h>
  48
  49 #define ZSTD_STATIC_LINKING_ONLY
  50 #include "lib/zstd.h"
  51 #include "lib/common/zstd_errors.h"
  52
  53 static uint_t zstd_earlyabort_pass = 1;
  54 static int zstd_cutoff_level = ZIO_ZSTD_LEVEL_3;
  55 static unsigned int zstd_abort_size = (128 * 1024);
  56
  57 static kstat_t *zstd_ksp = NULL;
  58
  59 typedef struct zstd_stats {
  60         kstat_named_t   zstd_stat_alloc_fail;
  61         kstat_named_t   zstd_stat_alloc_fallback;
  62         kstat_named_t   zstd_stat_com_alloc_fail;
  63         kstat_named_t   zstd_stat_dec_alloc_fail;
  64         kstat_named_t   zstd_stat_com_inval;
  65         kstat_named_t   zstd_stat_dec_inval;
  66         kstat_named_t   zstd_stat_dec_header_inval;
  67         kstat_named_t   zstd_stat_com_fail;
  68         kstat_named_t   zstd_stat_dec_fail;
  69         /*
  70          * LZ4 first-pass early abort verdict
  71          */
  72         kstat_named_t   zstd_stat_lz4pass_allowed;
  73         kstat_named_t   zstd_stat_lz4pass_rejected;
  74         /*
  75          * zstd-1 second-pass early abort verdict
  76          */
  77         kstat_named_t   zstd_stat_zstdpass_allowed;
  78         kstat_named_t   zstd_stat_zstdpass_rejected;
  79         /*
  80          * We excluded this from early abort for some reason
  81          */
  82         kstat_named_t   zstd_stat_passignored;
  83         kstat_named_t   zstd_stat_passignored_size;
  84         kstat_named_t   zstd_stat_buffers;
  85         kstat_named_t   zstd_stat_size;
  86 } zstd_stats_t;
  87
  88 static zstd_stats_t zstd_stats = {
  89         { "alloc_fail",                 KSTAT_DATA_UINT64 },
  90         { "alloc_fallback",             KSTAT_DATA_UINT64 },
  91         { "compress_alloc_fail",        KSTAT_DATA_UINT64 },
  92         { "decompress_alloc_fail",      KSTAT_DATA_UINT64 },
  93         { "compress_level_invalid",     KSTAT_DATA_UINT64 },
  94         { "decompress_level_invalid",   KSTAT_DATA_UINT64 },
  95         { "decompress_header_invalid",  KSTAT_DATA_UINT64 },
  96         { "compress_failed",            KSTAT_DATA_UINT64 },
  97         { "decompress_failed",          KSTAT_DATA_UINT64 },
  98         { "lz4pass_allowed",            KSTAT_DATA_UINT64 },
  99         { "lz4pass_rejected",           KSTAT_DATA_UINT64 },
 100         { "zstdpass_allowed",           KSTAT_DATA_UINT64 },
 101         { "zstdpass_rejected",          KSTAT_DATA_UINT64 },
 102         { "passignored",                KSTAT_DATA_UINT64 },
 103         { "passignored_size",           KSTAT_DATA_UINT64 },
 104         { "buffers",                    KSTAT_DATA_UINT64 },
 105         { "size",                       KSTAT_DATA_UINT64 },
 106 };
 107
 108 #ifdef _KERNEL
 109 static int
 110 kstat_zstd_update(kstat_t *ksp, int rw)
 111 {
 112         ASSERT(ksp != NULL);
 113
 114         if (rw == KSTAT_WRITE && ksp == zstd_ksp) {
 115                 ZSTDSTAT_ZERO(zstd_stat_alloc_fail);
 116                 ZSTDSTAT_ZERO(zstd_stat_alloc_fallback);
 117                 ZSTDSTAT_ZERO(zstd_stat_com_alloc_fail);
 118                 ZSTDSTAT_ZERO(zstd_stat_dec_alloc_fail);
 119                 ZSTDSTAT_ZERO(zstd_stat_com_inval);
 120                 ZSTDSTAT_ZERO(zstd_stat_dec_inval);
 121                 ZSTDSTAT_ZERO(zstd_stat_dec_header_inval);
 122                 ZSTDSTAT_ZERO(zstd_stat_com_fail);
 123                 ZSTDSTAT_ZERO(zstd_stat_dec_fail);
 124                 ZSTDSTAT_ZERO(zstd_stat_lz4pass_allowed);
 125                 ZSTDSTAT_ZERO(zstd_stat_lz4pass_rejected);
 126                 ZSTDSTAT_ZERO(zstd_stat_zstdpass_allowed);
 127                 ZSTDSTAT_ZERO(zstd_stat_zstdpass_rejected);
 128                 ZSTDSTAT_ZERO(zstd_stat_passignored);
 129                 ZSTDSTAT_ZERO(zstd_stat_passignored_size);
 130         }
 131
 132         return (0);
 133 }
 134 #endif
 135
 136 /* Enums describing the allocator type specified by kmem_type in zstd_kmem */
 137 enum zstd_kmem_type {
 138         ZSTD_KMEM_UNKNOWN = 0,
 139         /* Allocation type using kmem_vmalloc */
 140         ZSTD_KMEM_DEFAULT,
 141         /* Pool based allocation using mempool_alloc */
 142         ZSTD_KMEM_POOL,
 143         /* Reserved fallback memory for decompression only */
 144         ZSTD_KMEM_DCTX,
 145         ZSTD_KMEM_COUNT,
 146 };
 147
 148 /* Structure for pooled memory objects */
 149 struct zstd_pool {
 150         void *mem;
 151         size_t size;
 152         kmutex_t barrier;
 153         hrtime_t timeout;
 154 };
 155
 156 /* Global structure for handling memory allocations */
 157 struct zstd_kmem {
 158         enum zstd_kmem_type kmem_type;
 159         size_t kmem_size;
 160         struct zstd_pool *pool;
 161 };
 162
 163 /* Fallback memory structure used for decompression only if memory runs out */
 164 struct zstd_fallback_mem {
 165         size_t mem_size;
 166         void *mem;
 167         kmutex_t barrier;
 168 };
 169
 170 struct zstd_levelmap {
 171         int16_t zstd_level;
 172         enum zio_zstd_levels level;
 173 };
 174
 175 /*
 176  * ZSTD memory handlers
 177  *
 178  * For decompression we use a different handler which also provides fallback
 179  * memory allocation in case memory runs out.
 180  *
 181  * The ZSTD handlers were split up for the most simplified implementation.
 182  */
 183 static void *zstd_alloc(void *opaque, size_t size);
 184 static void *zstd_dctx_alloc(void *opaque, size_t size);
 185 static void zstd_free(void *opaque, void *ptr);
 186
 187 /* Compression memory handler */
 188 static const ZSTD_customMem zstd_malloc = {
 189         zstd_alloc,
 190         zstd_free,
 191         NULL,
 192 };
 193
 194 /* Decompression memory handler */
 195 static const ZSTD_customMem zstd_dctx_malloc = {
 196         zstd_dctx_alloc,
 197         zstd_free,
 198         NULL,
 199 };
 200
 201 /* Level map for converting ZFS internal levels to ZSTD levels and vice versa */
 202 static struct zstd_levelmap zstd_levels[] = {
 203         {ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1},
 204         {ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2},
 205         {ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3},
 206         {ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4},
 207         {ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5},
 208         {ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6},
 209         {ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7},
 210         {ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8},
 211         {ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9},
 212         {ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10},
 213         {ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11},
 214         {ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12},
 215         {ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13},
 216         {ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14},
 217         {ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15},
 218         {ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16},
 219         {ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17},
 220         {ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18},
 221         {ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19},
 222         {-1, ZIO_ZSTD_LEVEL_FAST_1},
 223         {-2, ZIO_ZSTD_LEVEL_FAST_2},
 224         {-3, ZIO_ZSTD_LEVEL_FAST_3},
 225         {-4, ZIO_ZSTD_LEVEL_FAST_4},
 226         {-5, ZIO_ZSTD_LEVEL_FAST_5},
 227         {-6, ZIO_ZSTD_LEVEL_FAST_6},
 228         {-7, ZIO_ZSTD_LEVEL_FAST_7},
 229         {-8, ZIO_ZSTD_LEVEL_FAST_8},
 230         {-9, ZIO_ZSTD_LEVEL_FAST_9},
 231         {-10, ZIO_ZSTD_LEVEL_FAST_10},
 232         {-20, ZIO_ZSTD_LEVEL_FAST_20},
 233         {-30, ZIO_ZSTD_LEVEL_FAST_30},
 234         {-40, ZIO_ZSTD_LEVEL_FAST_40},
 235         {-50, ZIO_ZSTD_LEVEL_FAST_50},
 236         {-60, ZIO_ZSTD_LEVEL_FAST_60},
 237         {-70, ZIO_ZSTD_LEVEL_FAST_70},
 238         {-80, ZIO_ZSTD_LEVEL_FAST_80},
 239         {-90, ZIO_ZSTD_LEVEL_FAST_90},
 240         {-100, ZIO_ZSTD_LEVEL_FAST_100},
 241         {-500, ZIO_ZSTD_LEVEL_FAST_500},
 242         {-1000, ZIO_ZSTD_LEVEL_FAST_1000},
 243 };
 244
 245 /*
 246  * This variable represents the maximum count of the pool based on the number
 247  * of CPUs plus some buffer. We default to cpu count * 4, see init_zstd.
 248  */
 249 static int pool_count = 16;
 250
 251 #define ZSTD_POOL_MAX           pool_count
 252 #define ZSTD_POOL_TIMEOUT       60 * 2
 253
 254 static struct zstd_fallback_mem zstd_dctx_fallback;
 255 static struct zstd_pool *zstd_mempool_cctx;
 256 static struct zstd_pool *zstd_mempool_dctx;
 257
 258 /*
 259  * The library zstd code expects these if ADDRESS_SANITIZER gets defined,
 260  * and while ASAN does this, KASAN defines that and does not. So to avoid
 261  * changing the external code, we do this.
 262  */
 263 #if defined(ZFS_ASAN_ENABLED)
 264 #define ADDRESS_SANITIZER 1
 265 #endif
 266 #if defined(_KERNEL) && defined(ADDRESS_SANITIZER)
 267 void __asan_unpoison_memory_region(void const volatile *addr, size_t size);
 268 void __asan_poison_memory_region(void const volatile *addr, size_t size);
 269 void __asan_unpoison_memory_region(void const volatile *addr, size_t size) {};
 270 void __asan_poison_memory_region(void const volatile *addr, size_t size) {};
 271 #endif
 272
 273
 274 static void
 275 zstd_mempool_reap(struct zstd_pool *zstd_mempool)
 276 {
 277         struct zstd_pool *pool;
 278
 279         if (!zstd_mempool || !ZSTDSTAT(zstd_stat_buffers)) {
 280                 return;
 281         }
 282
 283         /* free obsolete slots */
 284         for (int i = 0; i < ZSTD_POOL_MAX; i++) {
 285                 pool = &zstd_mempool[i];
 286                 if (pool->mem && mutex_tryenter(&pool->barrier)) {
 287                         /* Free memory if unused object older than 2 minutes */
 288                         if (pool->mem && gethrestime_sec() > pool->timeout) {
 289                                 vmem_free(pool->mem, pool->size);
 290                                 ZSTDSTAT_SUB(zstd_stat_buffers, 1);
 291                                 ZSTDSTAT_SUB(zstd_stat_size, pool->size);
 292                                 pool->mem = NULL;
 293                                 pool->size = 0;
 294                                 pool->timeout = 0;
 295                         }
 296                         mutex_exit(&pool->barrier);
 297                 }
 298         }
 299 }
 300
 301 /*
 302  * Try to get a cached allocated buffer from memory pool or allocate a new one
 303  * if necessary. If a object is older than 2 minutes and does not fit the
 304  * requested size, it will be released and a new cached entry will be allocated.
 305  * If other pooled objects are detected without being used for 2 minutes, they
 306  * will be released, too.
 307  *
 308  * The concept is that high frequency memory allocations of bigger objects are
 309  * expensive. So if a lot of work is going on, allocations will be kept for a
 310  * while and can be reused in that time frame.
 311  *
 312  * The scheduled release will be updated every time a object is reused.
 313  */
 314
 315 static void *
 316 zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
 317 {
 318         struct zstd_pool *pool;
 319         struct zstd_kmem *mem = NULL;
 320
 321         if (!zstd_mempool) {
 322                 return (NULL);
 323         }
 324
 325         /* Seek for preallocated memory slot and free obsolete slots */
 326         for (int i = 0; i < ZSTD_POOL_MAX; i++) {
 327                 pool = &zstd_mempool[i];
 328                 /*
 329                  * This lock is simply a marker for a pool object being in use.
 330                  * If it's already hold, it will be skipped.
 331                  *
 332                  * We need to create it before checking it to avoid race
 333                  * conditions caused by running in a threaded context.
 334                  *
 335                  * The lock is later released by zstd_mempool_free.
 336                  */
 337                 if (mutex_tryenter(&pool->barrier)) {
 338                         /*
 339                          * Check if objects fits the size, if so we take it and
 340                          * update the timestamp.
 341                          */
 342                         if (pool->mem && size <= pool->size) {
 343                                 pool->timeout = gethrestime_sec() +
 344                                     ZSTD_POOL_TIMEOUT;
 345                                 mem = pool->mem;
 346                                 return (mem);
 347                         }
 348                         mutex_exit(&pool->barrier);
 349                 }
 350         }
 351
 352         /*
 353          * If no preallocated slot was found, try to fill in a new one.
 354          *
 355          * We run a similar algorithm twice here to avoid pool fragmentation.
 356          * The first one may generate holes in the list if objects get released.
 357          * We always make sure that these holes get filled instead of adding new
 358          * allocations constantly at the end.
 359          */
 360         for (int i = 0; i < ZSTD_POOL_MAX; i++) {
 361                 pool = &zstd_mempool[i];
 362                 if (mutex_tryenter(&pool->barrier)) {
 363                         /* Object is free, try to allocate new one */
 364                         if (!pool->mem) {
 365                                 mem = vmem_alloc(size, KM_SLEEP);
 366                                 if (mem) {
 367                                         ZSTDSTAT_ADD(zstd_stat_buffers, 1);
 368                                         ZSTDSTAT_ADD(zstd_stat_size, size);
 369                                         pool->mem = mem;
 370                                         pool->size = size;
 371                                         /* Keep track for later release */
 372                                         mem->pool = pool;
 373                                         mem->kmem_type = ZSTD_KMEM_POOL;
 374                                         mem->kmem_size = size;
 375                                 }
 376                         }
 377
 378                         if (size <= pool->size) {
 379                                 /* Update timestamp */
 380                                 pool->timeout = gethrestime_sec() +
 381                                     ZSTD_POOL_TIMEOUT;
 382
 383                                 return (pool->mem);
 384                         }
 385
 386                         mutex_exit(&pool->barrier);
 387                 }
 388         }
 389
 390         /*
 391          * If the pool is full or the allocation failed, try lazy allocation
 392          * instead.
 393          */
 394         if (!mem) {
 395                 mem = vmem_alloc(size, KM_NOSLEEP);
 396                 if (mem) {
 397                         mem->pool = NULL;
 398                         mem->kmem_type = ZSTD_KMEM_DEFAULT;
 399                         mem->kmem_size = size;
 400                 }
 401         }
 402
 403         return (mem);
 404 }
 405
 406 /* Mark object as released by releasing the barrier mutex */
 407 static void
 408 zstd_mempool_free(struct zstd_kmem *z)
 409 {
 410         mutex_exit(&z->pool->barrier);
 411 }
 412
 413 /* Convert ZFS internal enum to ZSTD level */
 414 static int
 415 zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
 416 {
 417         if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) {
 418                 *zstd_level = zstd_levels[level - 1].zstd_level;
 419                 return (0);
 420         }
 421         if (level >= ZIO_ZSTD_LEVEL_FAST_1 &&
 422             level <= ZIO_ZSTD_LEVEL_FAST_1000) {
 423                 *zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1
 424                     + ZIO_ZSTD_LEVEL_19].zstd_level;
 425                 return (0);
 426         }
 427
 428         /* Invalid/unknown zfs compression enum - this should never happen. */
 429         return (1);
 430 }
 431
 432
 433 size_t
 434 zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, size_t d_len,
 435     int level)
 436 {
 437         int16_t zstd_level;
 438         if (zstd_enum_to_level(level, &zstd_level)) {
 439                 ZSTDSTAT_BUMP(zstd_stat_com_inval);
 440                 return (s_len);
 441         }
 442         /*
 443          * A zstd early abort heuristic.
 444          *
 445          * - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently
 446          *   128k), don't try any of this, just go.
 447          *   (because experimentally that was a reasonable cutoff for a perf win
 448          *   with tiny ratio change)
 449          * - First, we try LZ4 compression, and if it doesn't early abort, we
 450          *   jump directly to whatever compression level we intended to try.
 451          * - Second, we try zstd-1 - if that errors out (usually, but not
 452          *   exclusively, if it would overflow), we give up early.
 453          *
 454          *   If it works, instead we go on and compress anyway.
 455          *
 456          * Why two passes? LZ4 alone gets you a lot of the way, but on highly
 457          * compressible data, it was losing up to 8.5% of the compressed
 458          * savings versus no early abort, and all the zstd-fast levels are
 459          * worse indications on their own than LZ4, and don't improve the LZ4
 460          * pass noticably if stacked like this.
 461          */
 462         size_t actual_abort_size = zstd_abort_size;
 463         if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
 464             s_len >= actual_abort_size) {
 465                 int pass_len = 1;
 466                 pass_len = lz4_compress_zfs(s_start, d_start, s_len, d_len, 0);
 467                 if (pass_len < d_len) {
 468                         ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
 469                         goto keep_trying;
 470                 }
 471                 ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);
 472
 473                 pass_len = zfs_zstd_compress(s_start, d_start, s_len, d_len,
 474                     ZIO_ZSTD_LEVEL_1);
 475                 if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
 476                         ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
 477                         return (s_len);
 478                 }
 479                 ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
 480         } else {
 481                 ZSTDSTAT_BUMP(zstd_stat_passignored);
 482                 if (s_len < actual_abort_size) {
 483                         ZSTDSTAT_BUMP(zstd_stat_passignored_size);
 484                 }
 485         }
 486 keep_trying:
 487         return (zfs_zstd_compress(s_start, d_start, s_len, d_len, level));
 488
 489 }
 490
 491 /* Compress block using zstd */
 492 size_t
 493 zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
 494     int level)
 495 {
 496         size_t c_len;
 497         int16_t zstd_level;
 498         zfs_zstdhdr_t *hdr;
 499         ZSTD_CCtx *cctx;
 500
 501         hdr = (zfs_zstdhdr_t *)d_start;
 502
 503         /* Skip compression if the specified level is invalid */
 504         if (zstd_enum_to_level(level, &zstd_level)) {
 505                 ZSTDSTAT_BUMP(zstd_stat_com_inval);
 506                 return (s_len);
 507         }
 508
 509         ASSERT3U(d_len, >=, sizeof (*hdr));
 510         ASSERT3U(d_len, <=, s_len);
 511         ASSERT3U(zstd_level, !=, 0);
 512
 513         cctx = ZSTD_createCCtx_advanced(zstd_malloc);
 514
 515         /*
 516          * Out of kernel memory, gently fall through - this will disable
 517          * compression in zio_compress_data
 518          */
 519         if (!cctx) {
 520                 ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail);
 521                 return (s_len);
 522         }
 523
 524         /* Set the compression level */
 525         ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level);
 526
 527         /* Use the "magicless" zstd header which saves us 4 header bytes */
 528         ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);
 529
 530         /*
 531          * Disable redundant checksum calculation and content size storage since
 532          * this is already done by ZFS itself.
 533          */
 534         ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
 535         ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);
 536
 537         c_len = ZSTD_compress2(cctx,
 538             hdr->data,
 539             d_len - sizeof (*hdr),
 540             s_start, s_len);
 541
 542         ZSTD_freeCCtx(cctx);
 543
 544         /* Error in the compression routine, disable compression. */
 545         if (ZSTD_isError(c_len)) {
 546                 /*
 547                  * If we are aborting the compression because the saves are
 548                  * too small, that is not a failure. Everything else is a
 549                  * failure, so increment the compression failure counter.
 550                  */
 551                 int err = ZSTD_getErrorCode(c_len);
 552                 if (err != ZSTD_error_dstSize_tooSmall) {
 553                         ZSTDSTAT_BUMP(zstd_stat_com_fail);
 554                         dprintf("Error: %s", ZSTD_getErrorString(err));
 555                 }
 556                 return (s_len);
 557         }
 558
 559         /*
 560          * Encode the compressed buffer size at the start. We'll need this in
 561          * decompression to counter the effects of padding which might be added
 562          * to the compressed buffer and which, if unhandled, would confuse the
 563          * hell out of our decompression function.
 564          */
 565         hdr->c_len = BE_32(c_len);
 566
 567         /*
 568          * Check version for overflow.
 569          * The limit of 24 bits must not be exceeded. This allows a maximum
 570          * version 1677.72.15 which we don't expect to be ever reached.
 571          */
 572         ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF);
 573
 574         /*
 575          * Encode the compression level as well. We may need to know the
 576          * original compression level if compressed_arc is disabled, to match
 577          * the compression settings to write this block to the L2ARC.
 578          *
 579          * Encode the actual level, so if the enum changes in the future, we
 580          * will be compatible.
 581          *
 582          * The upper 24 bits store the ZSTD version to be able to provide
 583          * future compatibility, since new versions might enhance the
 584          * compression algorithm in a way, where the compressed data will
 585          * change.
 586          *
 587          * As soon as such incompatibility occurs, handling code needs to be
 588          * added, differentiating between the versions.
 589          */
 590         zfs_set_hdrversion(hdr, ZSTD_VERSION_NUMBER);
 591         zfs_set_hdrlevel(hdr, level);
 592         hdr->raw_version_level = BE_32(hdr->raw_version_level);
 593
 594         return (c_len + sizeof (*hdr));
 595 }
 596
 597 /* Decompress block using zstd and return its stored level */
 598 int
 599 zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
 600     size_t d_len, uint8_t *level)
 601 {
 602         ZSTD_DCtx *dctx;
 603         size_t result;
 604         int16_t zstd_level;
 605         uint32_t c_len;
 606         const zfs_zstdhdr_t *hdr;
 607         zfs_zstdhdr_t hdr_copy;
 608
 609         hdr = (const zfs_zstdhdr_t *)s_start;
 610         c_len = BE_32(hdr->c_len);
 611
 612         /*
 613          * Make a copy instead of directly converting the header, since we must
 614          * not modify the original data that may be used again later.
 615          */
 616         hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);
 617         uint8_t curlevel = zfs_get_hdrlevel(&hdr_copy);
 618
 619         /*
 620          * NOTE: We ignore the ZSTD version for now. As soon as any
 621          * incompatibility occurs, it has to be handled accordingly.
 622          * The version can be accessed via `hdr_copy.version`.
 623          */
 624
 625         /*
 626          * Convert and check the level
 627          * An invalid level is a strong indicator for data corruption! In such
 628          * case return an error so the upper layers can try to fix it.
 629          */
 630         if (zstd_enum_to_level(curlevel, &zstd_level)) {
 631                 ZSTDSTAT_BUMP(zstd_stat_dec_inval);
 632                 return (1);
 633         }
 634
 635         ASSERT3U(d_len, >=, s_len);
 636         ASSERT3U(curlevel, !=, ZIO_COMPLEVEL_INHERIT);
 637
 638         /* Invalid compressed buffer size encoded at start */
 639         if (c_len + sizeof (*hdr) > s_len) {
 640                 ZSTDSTAT_BUMP(zstd_stat_dec_header_inval);
 641                 return (1);
 642         }
 643
 644         dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc);
 645         if (!dctx) {
 646                 ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail);
 647                 return (1);
 648         }
 649
 650         /* Set header type to "magicless" */
 651         ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless);
 652
 653         /* Decompress the data and release the context */
 654         result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len);
 655         ZSTD_freeDCtx(dctx);
 656
 657         /*
 658          * Returns 0 on success (decompression function returned non-negative)
 659          * and non-zero on failure (decompression function returned negative.
 660          */
 661         if (ZSTD_isError(result)) {
 662                 ZSTDSTAT_BUMP(zstd_stat_dec_fail);
 663                 return (1);
 664         }
 665
 666         if (level) {
 667                 *level = curlevel;
 668         }
 669
 670         return (0);
 671 }
 672
 673 /* Decompress datablock using zstd */
 674 int
 675 zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len,
 676     int level __maybe_unused)
 677 {
 678
 679         return (zfs_zstd_decompress_level(s_start, d_start, s_len, d_len,
 680             NULL));
 681 }
 682
 683 /* Allocator for zstd compression context using mempool_allocator */
 684 static void *
 685 zstd_alloc(void *opaque __maybe_unused, size_t size)
 686 {
 687         size_t nbytes = sizeof (struct zstd_kmem) + size;
 688         struct zstd_kmem *z = NULL;
 689
 690         z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes);
 691
 692         if (!z) {
 693                 ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
 694                 return (NULL);
 695         }
 696
 697         return ((void*)z + (sizeof (struct zstd_kmem)));
 698 }
 699
 700 /*
 701  * Allocator for zstd decompression context using mempool_allocator with
 702  * fallback to reserved memory if allocation fails
 703  */
 704 static void *
 705 zstd_dctx_alloc(void *opaque __maybe_unused, size_t size)
 706 {
 707         size_t nbytes = sizeof (struct zstd_kmem) + size;
 708         struct zstd_kmem *z = NULL;
 709         enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT;
 710
 711         z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes);
 712         if (!z) {
 713                 /* Try harder, decompression shall not fail */
 714                 z = vmem_alloc(nbytes, KM_SLEEP);
 715                 if (z) {
 716                         z->pool = NULL;
 717                 }
 718                 ZSTDSTAT_BUMP(zstd_stat_alloc_fail);
 719         } else {
 720                 return ((void*)z + (sizeof (struct zstd_kmem)));
 721         }
 722
 723         /* Fallback if everything fails */
 724         if (!z) {
 725                 /*
 726                  * Barrier since we only can handle it in a single thread. All
 727                  * other following threads need to wait here until decompression
 728                  * is completed. zstd_free will release this barrier later.
 729                  */
 730                 mutex_enter(&zstd_dctx_fallback.barrier);
 731
 732                 z = zstd_dctx_fallback.mem;
 733                 type = ZSTD_KMEM_DCTX;
 734                 ZSTDSTAT_BUMP(zstd_stat_alloc_fallback);
 735         }
 736
 737         /* Allocation should always be successful */
 738         if (!z) {
 739                 return (NULL);
 740         }
 741
 742         z->kmem_type = type;
 743         z->kmem_size = nbytes;
 744
 745         return ((void*)z + (sizeof (struct zstd_kmem)));
 746 }
 747
 748 /* Free allocated memory by its specific type */
 749 static void
 750 zstd_free(void *opaque __maybe_unused, void *ptr)
 751 {
 752         struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem));
 753         enum zstd_kmem_type type;
 754
 755         ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT);
 756         ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN);
 757
 758         type = z->kmem_type;
 759         switch (type) {
 760         case ZSTD_KMEM_DEFAULT:
 761                 vmem_free(z, z->kmem_size);
 762                 break;
 763         case ZSTD_KMEM_POOL:
 764                 zstd_mempool_free(z);
 765                 break;
 766         case ZSTD_KMEM_DCTX:
 767                 mutex_exit(&zstd_dctx_fallback.barrier);
 768                 break;
 769         default:
 770                 break;
 771         }
 772 }
 773
 774 /* Allocate fallback memory to ensure safe decompression */
 775 static void __init
 776 create_fallback_mem(struct zstd_fallback_mem *mem, size_t size)
 777 {
 778         mem->mem_size = size;
 779         mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP);
 780         mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL);
 781 }
 782
 783 /* Initialize memory pool barrier mutexes */
 784 static void __init
 785 zstd_mempool_init(void)
 786 {
 787         zstd_mempool_cctx =
 788             kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
 789         zstd_mempool_dctx =
 790             kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP);
 791
 792         for (int i = 0; i < ZSTD_POOL_MAX; i++) {
 793                 mutex_init(&zstd_mempool_cctx[i].barrier, NULL,
 794                     MUTEX_DEFAULT, NULL);
 795                 mutex_init(&zstd_mempool_dctx[i].barrier, NULL,
 796                     MUTEX_DEFAULT, NULL);
 797         }
 798 }
 799
 800 /* Initialize zstd-related memory handling */
 801 static int __init
 802 zstd_meminit(void)
 803 {
 804         zstd_mempool_init();
 805
 806         /*
 807          * Estimate the size of the fallback decompression context.
 808          * The expected size on x64 with current ZSTD should be about 160 KB.
 809          */
 810         create_fallback_mem(&zstd_dctx_fallback,
 811             P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem),
 812             PAGESIZE));
 813
 814         return (0);
 815 }
 816
 817 /* Release object from pool and free memory */
 818 static void
 819 release_pool(struct zstd_pool *pool)
 820 {
 821         mutex_destroy(&pool->barrier);
 822         vmem_free(pool->mem, pool->size);
 823         pool->mem = NULL;
 824         pool->size = 0;
 825 }
 826
 827 /* Release memory pool objects */
 828 static void
 829 zstd_mempool_deinit(void)
 830 {
 831         for (int i = 0; i < ZSTD_POOL_MAX; i++) {
 832                 release_pool(&zstd_mempool_cctx[i]);
 833                 release_pool(&zstd_mempool_dctx[i]);
 834         }
 835
 836         kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
 837         kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool));
 838         zstd_mempool_dctx = NULL;
 839         zstd_mempool_cctx = NULL;
 840 }
 841
 842 /* release unused memory from pool */
 843
 844 void
 845 zfs_zstd_cache_reap_now(void)
 846 {
 847         /*
 848          * calling alloc with zero size seeks
 849          * and releases old unused objects
 850          */
 851         zstd_mempool_reap(zstd_mempool_cctx);
 852         zstd_mempool_reap(zstd_mempool_dctx);
 853 }
 854
 855 extern int __init
 856 zstd_init(void)
 857 {
 858         /* Set pool size by using maximum sane thread count * 4 */
 859         pool_count = (boot_ncpus * 4);
 860         zstd_meminit();
 861
 862         /* Initialize kstat */
 863         zstd_ksp = kstat_create("zfs", 0, "zstd", "misc",
 864             KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t),
 865             KSTAT_FLAG_VIRTUAL);
 866         if (zstd_ksp != NULL) {
 867                 zstd_ksp->ks_data = &zstd_stats;
 868                 kstat_install(zstd_ksp);
 869 #ifdef _KERNEL
 870                 zstd_ksp->ks_update = kstat_zstd_update;
 871 #endif
 872         }
 873
 874         return (0);
 875 }
 876
 877 extern void
 878 zstd_fini(void)
 879 {
 880         /* Deinitialize kstat */
 881         if (zstd_ksp != NULL) {
 882                 kstat_delete(zstd_ksp);
 883                 zstd_ksp = NULL;
 884         }
 885
 886         /* Release fallback memory */
 887         vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size);
 888         mutex_destroy(&zstd_dctx_fallback.barrier);
 889
 890         /* Deinit memory pool */
 891         zstd_mempool_deinit();
 892 }
 893
 894 #if defined(_KERNEL)
 895 #ifdef __FreeBSD__
 896 module_init(zstd_init);
 897 module_exit(zstd_fini);
 898 #endif
 899
 900 ZFS_MODULE_PARAM(zfs, zstd_, earlyabort_pass, UINT, ZMOD_RW,
 901         "Enable early abort attempts when using zstd");
 902 ZFS_MODULE_PARAM(zfs, zstd_, abort_size, UINT, ZMOD_RW,
 903         "Minimal size of block to attempt early abort");
 904 #endif