git.proxmox.com Git - mirror_zfs.git/commitdiff
Tiered early abort, zstd edition
authorRich Ercolani <214141+rincebrain@users.noreply.github.com>
Tue, 24 May 2022 16:43:22 +0000 (12:43 -0400)
committerGitHub <noreply@github.com>
Tue, 24 May 2022 16:43:22 +0000 (09:43 -0700)
It turns out that "do LZ4 and zstd-1 both fail" is a great heuristic
for "don't even bother trying higher zstd tiers".

By way of illustration:
$ cat /incompress | mbuffer | zfs recv -o compression=zstd-12 evenfaster/lowcomp_1M_zstd12_normal
summary: 39.8 GiByte in  3min 40.2sec - average of  185 MiB/s
$ echo 3 | sudo tee /sys/module/zzstd/parameters/zstd_lz4_pass
3
$ cat /incompress | mbuffer -m 4G | zfs recv -o compression=zstd-12 evenfaster/lowcomp_1M_zstd12_patched
summary: 39.8 GiByte in 48.6sec - average of  839 MiB/s
$ sudo zfs list -p -o name,used,lused,ratio evenfaster/lowcomp_1M_zstd12_normal evenfaster/lowcomp_1M_zstd12_patched
NAME                                         USED        LUSED  RATIO
evenfaster/lowcomp_1M_zstd12_normal   39549931520  42721221632   1.08
evenfaster/lowcomp_1M_zstd12_patched  39626399744  42721217536   1.07
$ python3 -c "print(39626399744 - 39549931520)"
76468224
$

I'll take 76 MB out of 42 GB for > 4x speedup.

Reviewed-by: Allan Jude <allan@klarasystems.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: George Melikov <mail@gmelikov.ru>
Reviewed-by: Kjeld Schouten <kjeld@schouten-lebbing.nl>
Reviewed-by: Ahelenia ZiemiaƄska <nabijaczleweli@nabijaczleweli.xyz>
Signed-off-by: Rich Ercolani <rincebrain@gmail.com>
Closes #13244

include/sys/zstd/zstd.h
man/man4/zfs.4
module/zfs/zio_compress.c
module/zstd/zfs_zstd.c

index ca32a7464556e95b53d946ac1d119fad6d4b8e9d..ec2341b7693014ef4d6fbef6f7679122f830c3c2 100644 (file)
@@ -78,6 +78,8 @@ typedef struct zfs_zstd_meta {
  * kstat helper macros
  */
 #define        ZSTDSTAT(stat)          (zstd_stats.stat.value.ui64)
+#define        ZSTDSTAT_ZERO(stat)     \
+       (atomic_store_64(&zstd_stats.stat.value.ui64, 0))
 #define        ZSTDSTAT_ADD(stat, val) \
        atomic_add_64(&zstd_stats.stat.value.ui64, (val))
 #define        ZSTDSTAT_SUB(stat, val) \
@@ -90,6 +92,8 @@ void zstd_fini(void);
 
 size_t zfs_zstd_compress(void *s_start, void *d_start, size_t s_len,
     size_t d_len, int level);
+size_t zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len,
+    size_t d_len, int level);
 int zfs_zstd_get_level(void *s_start, size_t s_len, uint8_t *level);
 int zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
     size_t d_len, uint8_t *level);
index 5ef517c46faaf71d26cfbed10eb460832cddfe5d..c95fa98c5fd4a5a85330248d9d1b7bc8a88249fc 100644 (file)
@@ -2129,6 +2129,14 @@ However, if there are fewer than
 metaslabs in the vdev, this functionality is disabled.
 This ensures that we don't set aside an unreasonable amount of space for the ZIL.
 .
+.It Sy zfs_zstd_earlyabort_pass Ns = Ns Sy 1 Pq int
+Whether heuristic for detection of incompressible data with zstd levels >= 3
+using LZ4 and zstd-1 passes is enabled.
+.
+.It Sy zfs_zstd_abort_size Ns = Ns Sy 131072 Pq int
+Minimal uncompressed size (inclusive) of a record before the early abort
+heuristic will be attempted.
+.
 .It Sy zio_deadman_log_all Ns = Ns Sy 0 Ns | Ns 1 Pq int
 If non-zero, the zio deadman will produce debugging messages
 .Pq see Sy zfs_dbgmsg_enable
index cded11f4cbd58528082580faddf2fd0979ab789f..38020ce220b10b7f625e7cdd196d728cb66b722e 100644 (file)
@@ -66,7 +66,7 @@ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = {
        {"gzip-9",      9,      gzip_compress,  gzip_decompress, NULL},
        {"zle",         64,     zle_compress,   zle_decompress, NULL},
        {"lz4",         0,      lz4_compress_zfs, lz4_decompress_zfs, NULL},
-       {"zstd",        ZIO_ZSTD_LEVEL_DEFAULT, zfs_zstd_compress,
+       {"zstd",        ZIO_ZSTD_LEVEL_DEFAULT, zfs_zstd_compress_wrap,
            zfs_zstd_decompress, zfs_zstd_decompress_level},
 };
 
index 04e52ae3cec670ea92b0a495e3a7cec7eccebe50..41351898981aab359b6c0638e933ada2a019bd25 100644 (file)
 #include "lib/zstd.h"
 #include "lib/common/zstd_errors.h"
 
+static int zstd_earlyabort_pass = 1;
+static int zstd_cutoff_level = ZIO_ZSTD_LEVEL_3;
+static unsigned int zstd_abort_size = (128 * 1024);
+
 static kstat_t *zstd_ksp = NULL;
 
 typedef struct zstd_stats {
@@ -62,6 +66,21 @@ typedef struct zstd_stats {
        kstat_named_t   zstd_stat_dec_header_inval;
        kstat_named_t   zstd_stat_com_fail;
        kstat_named_t   zstd_stat_dec_fail;
+       /*
+        * LZ4 first-pass early abort verdict
+        */
+       kstat_named_t   zstd_stat_lz4pass_allowed;
+       kstat_named_t   zstd_stat_lz4pass_rejected;
+       /*
+        * zstd-1 second-pass early abort verdict
+        */
+       kstat_named_t   zstd_stat_zstdpass_allowed;
+       kstat_named_t   zstd_stat_zstdpass_rejected;
+       /*
+        * We excluded this from early abort for some reason
+        */
+       kstat_named_t   zstd_stat_passignored;
+       kstat_named_t   zstd_stat_passignored_size;
        kstat_named_t   zstd_stat_buffers;
        kstat_named_t   zstd_stat_size;
 } zstd_stats_t;
@@ -76,10 +95,44 @@ static zstd_stats_t zstd_stats = {
        { "decompress_header_invalid",  KSTAT_DATA_UINT64 },
        { "compress_failed",            KSTAT_DATA_UINT64 },
        { "decompress_failed",          KSTAT_DATA_UINT64 },
+       { "lz4pass_allowed",            KSTAT_DATA_UINT64 },
+       { "lz4pass_rejected",           KSTAT_DATA_UINT64 },
+       { "zstdpass_allowed",           KSTAT_DATA_UINT64 },
+       { "zstdpass_rejected",          KSTAT_DATA_UINT64 },
+       { "passignored",                KSTAT_DATA_UINT64 },
+       { "passignored_size",           KSTAT_DATA_UINT64 },
        { "buffers",                    KSTAT_DATA_UINT64 },
        { "size",                       KSTAT_DATA_UINT64 },
 };
 
+#ifdef _KERNEL
+/*
+ * kstat ks_update callback, installed on zstd_ksp in zstd_init().
+ * Writing to the kstat (rw == KSTAT_WRITE) zeroes every event counter,
+ * letting an administrator reset the statistics; reads fall through
+ * unchanged.
+ *
+ * NOTE(review): zstd_stat_buffers and zstd_stat_size are deliberately
+ * not cleared here -- they appear to track current allocator state
+ * rather than cumulative event counts; confirm against the buffer
+ * accounting code before treating the omission as intentional.
+ */
+static int
+kstat_zstd_update(kstat_t *ksp, int rw)
+{
+       ASSERT(ksp != NULL);
+
+       /* Only react to a write aimed at our own kstat instance. */
+       if (rw == KSTAT_WRITE && ksp == zstd_ksp) {
+               ZSTDSTAT_ZERO(zstd_stat_alloc_fail);
+               ZSTDSTAT_ZERO(zstd_stat_alloc_fallback);
+               ZSTDSTAT_ZERO(zstd_stat_com_alloc_fail);
+               ZSTDSTAT_ZERO(zstd_stat_dec_alloc_fail);
+               ZSTDSTAT_ZERO(zstd_stat_com_inval);
+               ZSTDSTAT_ZERO(zstd_stat_dec_inval);
+               ZSTDSTAT_ZERO(zstd_stat_dec_header_inval);
+               ZSTDSTAT_ZERO(zstd_stat_com_fail);
+               ZSTDSTAT_ZERO(zstd_stat_dec_fail);
+               ZSTDSTAT_ZERO(zstd_stat_lz4pass_allowed);
+               ZSTDSTAT_ZERO(zstd_stat_lz4pass_rejected);
+               ZSTDSTAT_ZERO(zstd_stat_zstdpass_allowed);
+               ZSTDSTAT_ZERO(zstd_stat_zstdpass_rejected);
+               ZSTDSTAT_ZERO(zstd_stat_passignored);
+               ZSTDSTAT_ZERO(zstd_stat_passignored_size);
+       }
+
+       /* Both reads and writes report success. */
+       return (0);
+}
+#endif
+
 /* Enums describing the allocator type specified by kmem_type in zstd_kmem */
 enum zstd_kmem_type {
        ZSTD_KMEM_UNKNOWN = 0,
@@ -377,6 +430,64 @@ zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
 }
 
 
+/*
+ * Early-abort wrapper around zfs_zstd_compress().  Installed in
+ * zio_compress_table in place of zfs_zstd_compress, so every zstd
+ * compression request funnels through the heuristic below before the
+ * (potentially expensive) real compression runs.
+ *
+ * Returns the compressed length on success, or s_len to signal
+ * "incompressible / give up" -- the same convention zfs_zstd_compress
+ * itself uses.
+ */
+size_t
+zfs_zstd_compress_wrap(void *s_start, void *d_start, size_t s_len, size_t d_len,
+    int level)
+{
+       int16_t zstd_level;
+       /* Map the zio_zstd level enum to a raw zstd level; reject junk. */
+       if (zstd_enum_to_level(level, &zstd_level)) {
+               ZSTDSTAT_BUMP(zstd_stat_com_inval);
+               return (s_len);
+       }
+       /*
+        * A zstd early abort heuristic.
+        *
+        * - Zeroth, if this is <= zstd-3, or < zstd_abort_size (currently
+        *   128k), don't try any of this, just go.
+        *   (because experimentally that was a reasonable cutoff for a perf win
+        *   with tiny ratio change)
+        * - First, we try LZ4 compression, and if it doesn't early abort, we
+        *   jump directly to whatever compression level we intended to try.
+        * - Second, we try zstd-1 - if that errors out (usually, but not
+        *   exclusively, if it would overflow), we give up early.
+        *
+        *   If it works, instead we go on and compress anyway.
+        *
+        * Why two passes? LZ4 alone gets you a lot of the way, but on highly
+        * compressible data, it was losing up to 8.5% of the compressed
+        * savings versus no early abort, and all the zstd-fast levels are
+        * worse indications on their own than LZ4, and don't improve the LZ4
+        * pass noticeably if stacked like this.
+        */
+       /* Snapshot the tunable so a concurrent parameter write can't
+        * change the threshold between the two uses below -- TODO confirm
+        * that is the intent. */
+       size_t actual_abort_size = zstd_abort_size;
+       if (zstd_earlyabort_pass > 0 && zstd_level >= zstd_cutoff_level &&
+           s_len >= actual_abort_size) {
+               /*
+                * NOTE(review): pass_len is a signed int compared against the
+                * size_t d_len below; the comparisons promote it to unsigned.
+                * Safe only as long as these compressors never return a
+                * negative value -- verify their contracts.
+                */
+               int pass_len = 1;
+               /* Pass 1: cheap LZ4 probe into the real destination buffer. */
+               pass_len = lz4_compress_zfs(s_start, d_start, s_len, d_len, 0);
+               if (pass_len < d_len) {
+                       /* LZ4 managed to shrink it: data looks compressible. */
+                       ZSTDSTAT_BUMP(zstd_stat_lz4pass_allowed);
+                       goto keep_trying;
+               }
+               ZSTDSTAT_BUMP(zstd_stat_lz4pass_rejected);
+
+               /* Pass 2: zstd-1 probe; only give up if it also fails. */
+               pass_len = zfs_zstd_compress(s_start, d_start, s_len, d_len,
+                   ZIO_ZSTD_LEVEL_1);
+               if (pass_len == s_len || pass_len <= 0 || pass_len > d_len) {
+                       ZSTDSTAT_BUMP(zstd_stat_zstdpass_rejected);
+                       return (s_len);
+               }
+               ZSTDSTAT_BUMP(zstd_stat_zstdpass_allowed);
+       } else {
+               /* Heuristic disabled, level too low, or record too small. */
+               ZSTDSTAT_BUMP(zstd_stat_passignored);
+               if (s_len < actual_abort_size) {
+                       ZSTDSTAT_BUMP(zstd_stat_passignored_size);
+               }
+       }
+keep_trying:
+       /* Fall through: run the real compression at the requested level. */
+       return (zfs_zstd_compress(s_start, d_start, s_len, d_len, level));
+
+}
+
 /* Compress block using zstd */
 size_t
 zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
@@ -437,8 +548,10 @@ zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
                 * too small, that is not a failure. Everything else is a
                 * failure, so increment the compression failure counter.
                 */
-               if (ZSTD_getErrorCode(c_len) != ZSTD_error_dstSize_tooSmall) {
+               int err = ZSTD_getErrorCode(c_len);
+               if (err != ZSTD_error_dstSize_tooSmall) {
                        ZSTDSTAT_BUMP(zstd_stat_com_fail);
+                       dprintf("Error: %s", ZSTD_getErrorString(err));
                }
                return (s_len);
        }
@@ -753,6 +866,9 @@ zstd_init(void)
        if (zstd_ksp != NULL) {
                zstd_ksp->ks_data = &zstd_stats;
                kstat_install(zstd_ksp);
+#ifdef _KERNEL
+               zstd_ksp->ks_update = kstat_zstd_update;
+#endif
        }
 
        return (0);
@@ -781,8 +897,8 @@ module_init(zstd_init);
 module_exit(zstd_fini);
 #endif
 
-EXPORT_SYMBOL(zfs_zstd_compress);
-EXPORT_SYMBOL(zfs_zstd_decompress_level);
-EXPORT_SYMBOL(zfs_zstd_decompress);
-EXPORT_SYMBOL(zfs_zstd_cache_reap_now);
+ZFS_MODULE_PARAM(zfs, zstd_, earlyabort_pass, INT, ZMOD_RW,
+       "Enable early abort attempts when using zstd");
+ZFS_MODULE_PARAM(zfs, zstd_, abort_size, UINT, ZMOD_RW,
+       "Minimal size of block to attempt early abort");
 #endif