git.proxmox.com Git - mirror_zfs.git/commitdiff
Implementation of block cloning for ZFS
authorPawel Jakub Dawidek <pawel@dawidek.net>
Fri, 10 Mar 2023 19:59:53 +0000 (20:59 +0100)
committerGitHub <noreply@github.com>
Fri, 10 Mar 2023 19:59:53 +0000 (11:59 -0800)
Block Cloning makes it possible to manually clone a file (or a subset
of its blocks) into another (or the same) file by just creating
additional references to the data blocks without copying the data itself.
Those references are kept in the Block Reference Tables (BRTs).

The whole design of block cloning is documented in module/zfs/brt.c.

Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Reviewed-by: Christian Schwarz <christian.schwarz@nutanix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Rich Ercolani <rincebrain@gmail.com>
Signed-off-by: Pawel Jakub Dawidek <pawel@dawidek.net>
Closes #13392

51 files changed:
cmd/zdb/zdb_il.c
cmd/ztest.c
include/Makefile.am
include/os/freebsd/zfs/sys/zfs_znode_impl.h
include/os/linux/kernel/linux/mod_compat.h
include/os/linux/zfs/sys/zfs_znode_impl.h
include/sys/bitmap.h [new file with mode: 0644]
include/sys/brt.h [new file with mode: 0644]
include/sys/dbuf.h
include/sys/ddt.h
include/sys/dmu.h
include/sys/dmu_tx.h
include/sys/fs/zfs.h
include/sys/spa_impl.h
include/sys/zfs_debug.h
include/sys/zfs_vnops.h
include/sys/zfs_znode.h
include/sys/zil.h
include/sys/zio.h
include/sys/zio_impl.h
include/zfeature_common.h
lib/libzfs/libzfs.abi
lib/libzfs/libzfs_pool.c
lib/libzpool/Makefile.am
man/man7/zpool-features.7
man/man7/zpoolprops.7
module/Kbuild.in
module/Makefile.bsd
module/os/freebsd/zfs/sysctl_os.c
module/os/freebsd/zfs/zfs_vfsops.c
module/os/freebsd/zfs/zfs_vnops_os.c
module/os/freebsd/zfs/zfs_znode.c
module/zcommon/zfeature_common.c
module/zcommon/zpool_prop.c
module/zfs/brt.c [new file with mode: 0644]
module/zfs/dbuf.c
module/zfs/ddt.c
module/zfs/dmu.c
module/zfs/dmu_tx.c
module/zfs/dsl_scan.c
module/zfs/spa.c
module/zfs/spa_misc.c
module/zfs/zfs_ioctl.c
module/zfs/zfs_log.c
module/zfs/zfs_quota.c
module/zfs/zfs_replay.c
module/zfs/zfs_vnops.c
module/zfs/zil.c
module/zfs/zio.c
module/zfs/zvol.c
tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg

index 55df1f559f6e218378de3132e65b52b19175e5c9..970c45c9b3bbe979e4b1dd2ec50cc645a639b68e 100644 (file)
@@ -307,6 +307,23 @@ zil_prt_rec_acl(zilog_t *zilog, int txtype, const void *arg)
            (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_aclcnt);
 }
 
+static void
+zil_prt_rec_clone_range(zilog_t *zilog, int txtype, const void *arg)
+{
+       (void) zilog, (void) txtype;
+       const lr_clone_range_t *lr = arg;
+
+       (void) printf("%sfoid %llu, offset %llx, length %llx, blksize %llx\n",
+           tab_prefix, (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset,
+           (u_longlong_t)lr->lr_length, (u_longlong_t)lr->lr_blksz);
+
+       for (unsigned int i = 0; i < lr->lr_nbps; i++) {
+               (void) printf("%s[%u/%llu] ", tab_prefix, i + 1,
+                   (u_longlong_t)lr->lr_nbps);
+               print_log_bp(&lr->lr_bps[i], "");
+       }
+}
+
 typedef void (*zil_prt_rec_func_t)(zilog_t *, int, const void *);
 typedef struct zil_rec_info {
        zil_prt_rec_func_t      zri_print;
@@ -340,6 +357,8 @@ static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = {
            .zri_name = "TX_SETSAXATTR      "},
        {.zri_print = zil_prt_rec_rename,   .zri_name = "TX_RENAME_EXCHANGE "},
        {.zri_print = zil_prt_rec_rename,   .zri_name = "TX_RENAME_WHITEOUT "},
+       {.zri_print = zil_prt_rec_clone_range,
+           .zri_name = "TX_CLONE_RANGE     "},
 };
 
 static int
index fb9f83032e8f9a4700f9d1c4a127260366bd3d62..b6b99bfff6db7fa8b8adbaa8d728ca8e1664fe2f 100644 (file)
@@ -1902,7 +1902,7 @@ ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr)
        if (zil_replaying(zd->zd_zilog, tx))
                return;
 
-       if (lr->lr_length > zil_max_log_data(zd->zd_zilog))
+       if (lr->lr_length > zil_max_log_data(zd->zd_zilog, sizeof (lr_write_t)))
                write_state = WR_INDIRECT;
 
        itx = zil_itx_create(TX_WRITE,
index 6897e3c5e337b4038977d7b9658f262f460f0c39..569de6dfa781311adf8b74794122257b774996ac 100644 (file)
@@ -23,6 +23,7 @@ COMMON_H = \
        sys/asm_linkage.h \
        sys/avl.h \
        sys/avl_impl.h \
+       sys/bitmap.h \
        sys/bitops.h \
        sys/blake3.h \
        sys/blkptr.h \
@@ -31,6 +32,7 @@ COMMON_H = \
        sys/bptree.h \
        sys/bqueue.h \
        sys/btree.h \
+       sys/brt.h \
        sys/dataset_kstats.h \
        sys/dbuf.h \
        sys/ddt.h \
index 8cde33dbcbbbf5c87a3eb1fc31856fd9a74d9e85..050fc3036f87b4995ba298456c4b2df72b1c2b10 100644 (file)
@@ -119,7 +119,8 @@ typedef struct zfs_soft_state {
 #define        zn_has_cached_data(zp, start, end) \
     vn_has_cached_data(ZTOV(zp))
 #define        zn_flush_cached_data(zp, sync)  vn_flush_cached_data(ZTOV(zp), sync)
-#define        zn_rlimit_fsize(zp, uio) \
+#define        zn_rlimit_fsize(size)           zfs_rlimit_fsize(size)
+#define        zn_rlimit_fsize_uio(zp, uio) \
     vn_rlimit_fsize(ZTOV(zp), GET_UIO_STRUCT(uio), zfs_uio_td(uio))
 
 /* Called on entry to each ZFS vnode and vfs operation  */
@@ -179,6 +180,8 @@ extern zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE];
 
 extern int zfs_znode_parent_and_name(struct znode *zp, struct znode **dzpp,
     char *buf);
+
+extern int zfs_rlimit_fsize(off_t fsize);
 #ifdef __cplusplus
 }
 #endif
index 67b4fc90653c8c62b2adf3801d8883011db0f582..09d109d191bfc349d63dd3e2368b67a172f338e2 100644 (file)
@@ -47,6 +47,7 @@ typedef const struct kernel_param zfs_kernel_param_t;
 enum scope_prefix_types {
        zfs,
        zfs_arc,
+       zfs_brt,
        zfs_condense,
        zfs_dbuf,
        zfs_dbuf_cache,
index 81607ef2a25eac207ac02312d52fa4929cf38614..0be2c445ab761505ad7af6dc1b245335c17b2dc3 100644 (file)
@@ -86,7 +86,8 @@ extern "C" {
 #endif
 
 #define        zn_flush_cached_data(zp, sync)  write_inode_now(ZTOI(zp), sync)
-#define        zn_rlimit_fsize(zp, uio)        (0)
+#define        zn_rlimit_fsize(size)           (0)
+#define        zn_rlimit_fsize_uio(zp, uio)    (0)
 
 /*
  * zhold() wraps igrab() on Linux, and igrab() may fail when the
diff --git a/include/sys/bitmap.h b/include/sys/bitmap.h
new file mode 100644 (file)
index 0000000..7b92507
--- /dev/null
@@ -0,0 +1,93 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*     Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/*       All Rights Reserved   */
+
+
+#ifndef _SYS_BITMAP_H
+#define        _SYS_BITMAP_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Operations on bitmaps of arbitrary size
+ * A bitmap is a vector of 1 or more ulong_t's.
+ * The user of the package is responsible for range checks and keeping
+ * track of sizes.
+ */
+
+#ifdef _LP64
+#define        BT_ULSHIFT      6 /* log base 2 of BT_NBIPUL, to extract word index */
+#define        BT_ULSHIFT32    5 /* log base 2 of BT_NBIPUL, to extract word index */
+#else
+#define        BT_ULSHIFT      5 /* log base 2 of BT_NBIPUL, to extract word index */
+#endif
+
+#define        BT_NBIPUL       (1 << BT_ULSHIFT)       /* n bits per ulong_t */
+#define        BT_ULMASK       (BT_NBIPUL - 1)         /* to extract bit index */
+
+/*
+ * bitmap is a ulong_t *, bitindex an index_t
+ *
+ * The macros BT_WIM and BT_BIW internal; there is no need
+ * for users of this package to use them.
+ */
+
+/*
+ * word in map
+ */
+#define        BT_WIM(bitmap, bitindex) \
+       ((bitmap)[(bitindex) >> BT_ULSHIFT])
+/*
+ * bit in word
+ */
+#define        BT_BIW(bitindex) \
+       (1UL << ((bitindex) & BT_ULMASK))
+
+/*
+ * These are public macros
+ *
+ * BT_BITOUL == n bits to n ulong_t's
+ */
+#define        BT_BITOUL(nbits) \
+       (((nbits) + BT_NBIPUL - 1l) / BT_NBIPUL)
+#define        BT_SIZEOFMAP(nbits) \
+       (BT_BITOUL(nbits) * sizeof (ulong_t))
+#define        BT_TEST(bitmap, bitindex) \
+       ((BT_WIM((bitmap), (bitindex)) & BT_BIW(bitindex)) ? 1 : 0)
+#define        BT_SET(bitmap, bitindex) \
+       { BT_WIM((bitmap), (bitindex)) |= BT_BIW(bitindex); }
+#define        BT_CLEAR(bitmap, bitindex) \
+       { BT_WIM((bitmap), (bitindex)) &= ~BT_BIW(bitindex); }
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_BITMAP_H */
diff --git a/include/sys/brt.h b/include/sys/brt.h
new file mode 100644 (file)
index 0000000..b1f7010
--- /dev/null
@@ -0,0 +1,62 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
+ */
+
+#ifndef _SYS_BRT_H
+#define        _SYS_BRT_H
+
+#include <sys/sysmacros.h>
+#include <sys/types.h>
+#include <sys/fs/zfs.h>
+#include <sys/zio.h>
+#include <sys/dmu.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern boolean_t brt_entry_decref(spa_t *spa, const blkptr_t *bp);
+
+extern uint64_t brt_get_dspace(spa_t *spa);
+extern uint64_t brt_get_used(spa_t *spa);
+extern uint64_t brt_get_saved(spa_t *spa);
+extern uint64_t brt_get_ratio(spa_t *spa);
+
+extern boolean_t brt_maybe_exists(spa_t *spa, const blkptr_t *bp);
+extern void brt_init(void);
+extern void brt_fini(void);
+
+extern void brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx);
+extern void brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx);
+extern void brt_pending_apply(spa_t *spa, uint64_t txg);
+
+extern void brt_create(spa_t *spa);
+extern int brt_load(spa_t *spa);
+extern void brt_unload(spa_t *spa);
+extern void brt_sync(spa_t *spa, uint64_t txg);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_BRT_H */
index a1ce76b1c763633fb5335ce2db9402ce0eef8874..a06316362e57394fa265a0671afd331984b35b7d 100644 (file)
@@ -172,6 +172,7 @@ typedef struct dbuf_dirty_record {
                        override_states_t dr_override_state;
                        uint8_t dr_copies;
                        boolean_t dr_nopwrite;
+                       boolean_t dr_brtwrite;
                        boolean_t dr_has_raw_params;
 
                        /*
index d72401dcf7a461a8e8586c32054f55bdb1e35c10..6378c042c705cfc62c8835fc8ca80117ba325d7f 100644 (file)
@@ -248,6 +248,8 @@ extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde);
 extern int ddt_object_update(ddt_t *ddt, enum ddt_type type,
     enum ddt_class clazz, ddt_entry_t *dde, dmu_tx_t *tx);
 
+extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp);
+
 extern const ddt_ops_t ddt_zap_ops;
 
 #ifdef __cplusplus
index 93de991ccd86318eec381ce2c102ed078f7ff245..1b82ff620f2739047f7921616eebbc7b0f83738c 100644 (file)
@@ -782,6 +782,8 @@ dmu_tx_t *dmu_tx_create(objset_t *os);
 void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
 void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
     int len);
+void dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
+    int len);
 void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
     uint64_t len);
 void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
@@ -1059,6 +1061,12 @@ int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd);
 int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
     uint64_t *off);
 
+int dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset,
+    uint64_t length, dmu_tx_t *tx, struct blkptr *bps, size_t *nbpsp);
+void dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset,
+    uint64_t length, dmu_tx_t *tx, const struct blkptr *bps, size_t nbps,
+    boolean_t replay);
+
 /*
  * Initial setup and final teardown.
  */
index 81e1ef6c147720ec6078c82f2390c898e275913b..ca8514e5d2d018df8199829678dfd986464de239 100644 (file)
@@ -90,6 +90,7 @@ enum dmu_tx_hold_type {
        THT_ZAP,
        THT_SPACE,
        THT_SPILL,
+       THT_CLONE,
        THT_NUMTYPES
 };
 
index e869685c5e2d3eaeaccc6f17a1f60dda48c857d7..25babd4ea8cffb08f73c62865260dfed8b9eec19 100644 (file)
@@ -253,6 +253,9 @@ typedef enum {
        ZPOOL_PROP_LOAD_GUID,
        ZPOOL_PROP_AUTOTRIM,
        ZPOOL_PROP_COMPATIBILITY,
+       ZPOOL_PROP_BCLONEUSED,
+       ZPOOL_PROP_BCLONESAVED,
+       ZPOOL_PROP_BCLONERATIO,
        ZPOOL_NUM_PROPS
 } zpool_prop_t;
 
index cde08ec9b0f463ab6bbf9abb998e6c0733c2cf1c..8ccd58b584ca0f7461aa9a0af9199ab2e130e919 100644 (file)
@@ -380,6 +380,7 @@ struct spa {
        uint64_t        spa_dedup_dspace;       /* Cache get_dedup_dspace() */
        uint64_t        spa_dedup_checksum;     /* default dedup checksum */
        uint64_t        spa_dspace;             /* dspace in normal class */
+       struct brt      *spa_brt;               /* in-core BRT */
        kmutex_t        spa_vdev_top_lock;      /* dueling offline/remove */
        kmutex_t        spa_proc_lock;          /* protects spa_proc* */
        kcondvar_t      spa_proc_cv;            /* spa_proc_state transitions */
index 481209b241aa879a1ea264723a391119c8a0ae6b..a1dfef1d89ff78d5a2d90ed8edaf01c0ebf00ba7 100644 (file)
@@ -57,6 +57,7 @@ extern int zfs_dbgmsg_enable;
 #define        ZFS_DEBUG_TRIM                  (1 << 11)
 #define        ZFS_DEBUG_LOG_SPACEMAP          (1 << 12)
 #define        ZFS_DEBUG_METASLAB_ALLOC        (1 << 13)
+#define        ZFS_DEBUG_BRT                   (1 << 14)
 
 extern void __set_error(const char *file, const char *func, int line, int err);
 extern void __zfs_dbgmsg(char *buf);
index edff8f681dd4623482a28dc3bf9952dfe143459d..5da103f17783ed13aa2e4924ba2045b6f3fb0f0e 100644 (file)
@@ -31,6 +31,10 @@ extern int zfs_read(znode_t *, zfs_uio_t *, int, cred_t *);
 extern int zfs_write(znode_t *, zfs_uio_t *, int, cred_t *);
 extern int zfs_holey(znode_t *, ulong_t, loff_t *);
 extern int zfs_access(znode_t *, int, int, cred_t *);
+extern int zfs_clone_range(znode_t *, uint64_t *, znode_t *, uint64_t *,
+    uint64_t *, cred_t *);
+extern int zfs_clone_range_replay(znode_t *, uint64_t, uint64_t, uint64_t,
+    const blkptr_t *, size_t);
 
 extern int zfs_getsecattr(znode_t *, vsecattr_t *, int, cred_t *);
 extern int zfs_setsecattr(znode_t *, vsecattr_t *, int, cred_t *);
index fcee55b0199dba18cdeac57ba3fdb67adae0acaf..012e7403e2a6e2564ff732602fabc568baa7d3de 100644 (file)
@@ -315,6 +315,9 @@ extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
     znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp);
 extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
     vsecattr_t *vsecp, zfs_fuid_info_t *fuidp);
+extern void zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype,
+    znode_t *zp, uint64_t offset, uint64_t length, uint64_t blksz,
+    const blkptr_t *bps, size_t nbps);
 extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx);
 extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx);
 extern void zfs_log_setsaxattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
index 9ac421043377a26ebca77029668310a418562b0d..cff8ebcad8196822a0e5a19341fdc8f1945701ce 100644 (file)
@@ -166,7 +166,8 @@ typedef enum zil_create {
 #define        TX_SETSAXATTR           21      /* Set sa xattrs on file */
 #define        TX_RENAME_EXCHANGE      22      /* Atomic swap via renameat2 */
 #define        TX_RENAME_WHITEOUT      23      /* Atomic whiteout via renameat2 */
-#define        TX_MAX_TYPE             24      /* Max transaction type */
+#define        TX_CLONE_RANGE          24      /* Clone a file range */
+#define        TX_MAX_TYPE             25      /* Max transaction type */
 
 /*
  * The transactions for mkdir, symlink, remove, rmdir, link, and rename
@@ -176,9 +177,9 @@ typedef enum zil_create {
 #define        TX_CI   ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */
 
 /*
- * Transactions for write, truncate, setattr, acl_v0, and acl can be logged
- * out of order.  For convenience in the code, all such records must have
- * lr_foid at the same offset.
+ * Transactions for operations below can be logged out of order.
+ * For convenience in the code, all such records must have lr_foid
+ * at the same offset.
  */
 #define        TX_OOO(txtype)                  \
        ((txtype) == TX_WRITE ||        \
@@ -187,7 +188,8 @@ typedef enum zil_create {
        (txtype) == TX_ACL_V0 ||        \
        (txtype) == TX_ACL ||           \
        (txtype) == TX_WRITE2 ||        \
-       (txtype) == TX_SETSAXATTR)
+       (txtype) == TX_SETSAXATTR ||    \
+       (txtype) == TX_CLONE_RANGE)
 
 /*
  * The number of dnode slots consumed by the object is stored in the 8
@@ -387,6 +389,17 @@ typedef struct {
        /* lr_acl_bytes number of variable sized ace's follows */
 } lr_acl_t;
 
+typedef struct {
+       lr_t            lr_common;      /* common portion of log record */
+       uint64_t        lr_foid;        /* file object to clone into */
+       uint64_t        lr_offset;      /* offset to clone to */
+       uint64_t        lr_length;      /* length of the blocks to clone */
+       uint64_t        lr_blksz;       /* file's block size */
+       uint64_t        lr_nbps;        /* number of block pointers */
+       blkptr_t        lr_bps[];
+       /* block pointers of the blocks to clone follows */
+} lr_clone_range_t;
+
 /*
  * ZIL structure definitions, interface function prototype and globals.
  */
@@ -574,7 +587,7 @@ extern void zil_set_sync(zilog_t *zilog, uint64_t syncval);
 extern void    zil_set_logbias(zilog_t *zilog, uint64_t slogval);
 
 extern uint64_t        zil_max_copied_data(zilog_t *zilog);
-extern uint64_t        zil_max_log_data(zilog_t *zilog);
+extern uint64_t        zil_max_log_data(zilog_t *zilog, size_t hdrsize);
 
 extern void zil_sums_init(zil_sums_t *zs);
 extern void zil_sums_fini(zil_sums_t *zs);
index 28ed837d829ef5b9c1ca91c2551794038cf5a239..78603d0ebebac9c2177548f518ea4ecbf1427381 100644 (file)
@@ -348,6 +348,7 @@ typedef struct zio_prop {
        boolean_t               zp_dedup;
        boolean_t               zp_dedup_verify;
        boolean_t               zp_nopwrite;
+       boolean_t               zp_brtwrite;
        boolean_t               zp_encrypt;
        boolean_t               zp_byteorder;
        uint8_t                 zp_salt[ZIO_DATA_SALT_LEN];
@@ -556,7 +557,7 @@ extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
     zio_priority_t priority, zio_flag_t flags, zbookmark_phys_t *zb);
 
 extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
-    boolean_t nopwrite);
+    boolean_t nopwrite, boolean_t brtwrite);
 
 extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp);
 
index 199cca291edfb96b593a9bf673bdee963ba42032..29a05986cd4f06731b369573438d5490d03735ef 100644 (file)
@@ -77,6 +77,12 @@ extern "C" {
  * and zstd. Compression occurs as part of the write pipeline and is
  * performed in the ZIO_STAGE_WRITE_BP_INIT stage.
  *
+ * Block cloning:
+ * The block cloning functionality introduces ZIO_STAGE_BRT_FREE stage which
+ * is called during a free pipeline. If the block is referenced in the
+ * Block Cloning Table (BRT) we will just decrease its reference counter
+ * instead of actually freeing the block.
+ *
  * Dedup:
  * Dedup reads are handled by the ZIO_STAGE_DDT_READ_START and
  * ZIO_STAGE_DDT_READ_DONE stages. These stages are added to an existing
@@ -127,28 +133,30 @@ enum zio_stage {
 
        ZIO_STAGE_NOP_WRITE             = 1 << 8,       /* -W--- */
 
-       ZIO_STAGE_DDT_READ_START        = 1 << 9,       /* R---- */
-       ZIO_STAGE_DDT_READ_DONE         = 1 << 10,      /* R---- */
-       ZIO_STAGE_DDT_WRITE             = 1 << 11,      /* -W--- */
-       ZIO_STAGE_DDT_FREE              = 1 << 12,      /* --F-- */
+       ZIO_STAGE_BRT_FREE              = 1 << 9,       /* --F-- */
+
+       ZIO_STAGE_DDT_READ_START        = 1 << 10,      /* R---- */
+       ZIO_STAGE_DDT_READ_DONE         = 1 << 11,      /* R---- */
+       ZIO_STAGE_DDT_WRITE             = 1 << 12,      /* -W--- */
+       ZIO_STAGE_DDT_FREE              = 1 << 13,      /* --F-- */
 
-       ZIO_STAGE_GANG_ASSEMBLE         = 1 << 13,      /* RWFC- */
-       ZIO_STAGE_GANG_ISSUE            = 1 << 14,      /* RWFC- */
+       ZIO_STAGE_GANG_ASSEMBLE         = 1 << 14,      /* RWFC- */
+       ZIO_STAGE_GANG_ISSUE            = 1 << 15,      /* RWFC- */
 
-       ZIO_STAGE_DVA_THROTTLE          = 1 << 15,      /* -W--- */
-       ZIO_STAGE_DVA_ALLOCATE          = 1 << 16,      /* -W--- */
-       ZIO_STAGE_DVA_FREE              = 1 << 17,      /* --F-- */
-       ZIO_STAGE_DVA_CLAIM             = 1 << 18,      /* ---C- */
+       ZIO_STAGE_DVA_THROTTLE          = 1 << 16,      /* -W--- */
+       ZIO_STAGE_DVA_ALLOCATE          = 1 << 17,      /* -W--- */
+       ZIO_STAGE_DVA_FREE              = 1 << 18,      /* --F-- */
+       ZIO_STAGE_DVA_CLAIM             = 1 << 19,      /* ---C- */
 
-       ZIO_STAGE_READY                 = 1 << 19,      /* RWFCI */
+       ZIO_STAGE_READY                 = 1 << 20,      /* RWFCI */
 
-       ZIO_STAGE_VDEV_IO_START         = 1 << 20,      /* RW--I */
-       ZIO_STAGE_VDEV_IO_DONE          = 1 << 21,      /* RW--I */
-       ZIO_STAGE_VDEV_IO_ASSESS        = 1 << 22,      /* RW--I */
+       ZIO_STAGE_VDEV_IO_START         = 1 << 21,      /* RW--I */
+       ZIO_STAGE_VDEV_IO_DONE          = 1 << 22,      /* RW--I */
+       ZIO_STAGE_VDEV_IO_ASSESS        = 1 << 23,      /* RW--I */
 
-       ZIO_STAGE_CHECKSUM_VERIFY       = 1 << 23,      /* R---- */
+       ZIO_STAGE_CHECKSUM_VERIFY       = 1 << 24,      /* R---- */
 
-       ZIO_STAGE_DONE                  = 1 << 24       /* RWFCI */
+       ZIO_STAGE_DONE                  = 1 << 25       /* RWFCI */
 };
 
 #define        ZIO_INTERLOCK_STAGES                    \
@@ -233,6 +241,7 @@ enum zio_stage {
 #define        ZIO_FREE_PIPELINE                       \
        (ZIO_INTERLOCK_STAGES |                 \
        ZIO_STAGE_FREE_BP_INIT |                \
+       ZIO_STAGE_BRT_FREE |                    \
        ZIO_STAGE_DVA_FREE)
 
 #define        ZIO_DDT_FREE_PIPELINE                   \
index 0930bc900f8218ff7a64a11b9fb3d59b5e6eed19..ef915a70952ec4da497cc8477093f17292d9cf60 100644 (file)
@@ -78,6 +78,7 @@ typedef enum spa_feature {
        SPA_FEATURE_ZILSAXATTR,
        SPA_FEATURE_HEAD_ERRLOG,
        SPA_FEATURE_BLAKE3,
+       SPA_FEATURE_BLOCK_CLONING,
        SPA_FEATURES
 } spa_feature_t;
 
index 16fea63f895cb9c74279bc65ff8dd90df6c61bf4..79c0201678b0438c36b9dbbc7ef17383dc6a0e42 100644 (file)
     <elf-symbol name='fletcher_4_superscalar_ops' size='64' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='libzfs_config_ops' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='sa_protocol_names' size='16' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
-    <elf-symbol name='spa_feature_table' size='2072' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='spa_feature_table' size='2128' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zfeature_checks_disable' size='4' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zfs_deleg_perm_tab' size='512' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='zfs_history_event_names' size='328' type='object-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
       <enumerator name='ZPOOL_PROP_LOAD_GUID' value='30'/>
       <enumerator name='ZPOOL_PROP_AUTOTRIM' value='31'/>
       <enumerator name='ZPOOL_PROP_COMPATIBILITY' value='32'/>
-      <enumerator name='ZPOOL_NUM_PROPS' value='33'/>
+      <enumerator name='ZPOOL_PROP_BCLONEUSED' value='33'/>
+      <enumerator name='ZPOOL_PROP_BCLONESAVED' value='34'/>
+      <enumerator name='ZPOOL_PROP_BCLONERATIO' value='35'/>
+      <enumerator name='ZPOOL_NUM_PROPS' value='36'/>
     </enum-decl>
     <typedef-decl name='zpool_prop_t' type-id='af1ba157' id='5d0c23fb'/>
     <enum-decl name='vdev_prop_t' naming-typedef-id='5aa5c90c' id='1573bec8'>
     </function-decl>
   </abi-instr>
   <abi-instr address-size='64' path='module/zcommon/zfeature_common.c' language='LANG_C99'>
-    <array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='16576' id='d95b2b0b'>
-      <subrange length='37' type-id='7359adad' id='aa6426fb'/>
+    <array-type-def dimensions='1' type-id='83f29ca2' size-in-bits='17024' id='d95b2b0b'>
+      <subrange length='38' type-id='7359adad' id='aa6426fb'/>
     </array-type-def>
     <enum-decl name='spa_feature' id='33ecb627'>
       <underlying-type type-id='9cac1fee'/>
       <enumerator name='SPA_FEATURE_ZILSAXATTR' value='34'/>
       <enumerator name='SPA_FEATURE_HEAD_ERRLOG' value='35'/>
       <enumerator name='SPA_FEATURE_BLAKE3' value='36'/>
-      <enumerator name='SPA_FEATURES' value='37'/>
+      <enumerator name='SPA_FEATURE_BLOCK_CLONING' value='37'/>
+      <enumerator name='SPA_FEATURES' value='38'/>
     </enum-decl>
     <typedef-decl name='spa_feature_t' type-id='33ecb627' id='d6618c78'/>
     <enum-decl name='zfeature_flags' id='6db816a4'>
index b3e12bd84a2dff06c9ff31c1aceafe40690ea849..82965f8b993ab070a1d4f08d9c9fc689dc6d73b4 100644 (file)
@@ -339,6 +339,8 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf,
                case ZPOOL_PROP_ASHIFT:
                case ZPOOL_PROP_MAXBLOCKSIZE:
                case ZPOOL_PROP_MAXDNODESIZE:
+               case ZPOOL_PROP_BCLONESAVED:
+               case ZPOOL_PROP_BCLONEUSED:
                        if (literal)
                                (void) snprintf(buf, len, "%llu",
                                    (u_longlong_t)intval);
@@ -380,6 +382,7 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf,
                        }
                        break;
 
+               case ZPOOL_PROP_BCLONERATIO:
                case ZPOOL_PROP_DEDUPRATIO:
                        if (literal)
                                (void) snprintf(buf, len, "%llu.%02llu",
index 0748f1240db980eaeca71a135527adc4aa03de99..ceac2963e64780b8257ca836241b08b60083b9ee 100644 (file)
@@ -74,6 +74,7 @@ nodist_libzpool_la_SOURCES = \
        module/zfs/bptree.c \
        module/zfs/bqueue.c \
        module/zfs/btree.c \
+       module/zfs/brt.c \
        module/zfs/dbuf.c \
        module/zfs/dbuf_stats.c \
        module/zfs/ddt.c \
index 3ff3d97ba70c5c9291779deefe3799a2df954b4a..a4d595cd3cd95ccca26ce7277c7be4cd75cb50b6 100644 (file)
@@ -347,6 +347,20 @@ BLAKE3 is a secure hash algorithm focused on high performance.
 .Pp
 .checksum-spiel blake3
 .
+.feature com.fudosecurity block_cloning yes
+When this feature is enabled ZFS will use block cloning for operations like
+.Fn copy_file_range 2 .
+Block cloning allows to create multiple references to a single block.
+It is much faster than copying the data (as the actual data is neither read nor
+written) and takes no additional space.
+Blocks can be cloned across datasets under some conditions (like disabled
+encryption and equal
+.Nm recordsize ) .
+.Pp
+This feature becomes
+.Sy active
+when first block is cloned.
+When the last cloned block is freed, it goes back to the enabled state.
 .feature com.delphix bookmarks yes extensible_dataset
 This feature enables use of the
 .Nm zfs Cm bookmark
index 7be0a21d980ad345b2ec8560cf3a66b6ba8361f7..12b9b11903dfafa6441e928332d73c230e2d9d3e 100644 (file)
@@ -42,13 +42,26 @@ change the behavior of the pool.
 .Pp
 The following are read-only properties:
 .Bl -tag -width "unsupported@guid"
-.It Cm allocated
+.It Sy allocated
 Amount of storage used within the pool.
 See
 .Sy fragmentation
 and
 .Sy free
 for more information.
+.It Sy bcloneratio
+The ratio of the total amount of storage that would be required to store all
+the cloned blocks without cloning to the actual storage used.
+The
+.Sy bcloneratio
+property is calculated as:
+.Pp
+.Sy ( ( bclonesaved + bcloneused ) * 100 ) / bcloneused
+.It Sy bclonesaved
+The amount of additional storage that would be required if block cloning
+was not used.
+.It Sy bcloneused
+The amount of storage used by cloned blocks.
 .It Sy capacity
 Percentage of pool space used.
 This property can also be referred to by its shortened column name,
@@ -103,16 +116,16 @@ Over time
 will decrease while
 .Sy free
 increases.
-.It Sy leaked
-Space not released while
-.Sy freeing
-due to corruption, now permanently leaked into the pool.
+.It Sy guid
+A unique identifier for the pool.
 .It Sy health
 The current health of the pool.
 Health can be one of
 .Sy ONLINE , DEGRADED , FAULTED , OFFLINE, REMOVED , UNAVAIL .
-.It Sy guid
-A unique identifier for the pool.
+.It Sy leaked
+Space not released while
+.Sy freeing
+due to corruption, now permanently leaked into the pool.
 .It Sy load_guid
 A unique identifier for the pool.
 Unlike the
index 21606b8cae2779b70e2b5c771832473d3b0ada96..8d29f56c2fb8f50e93ad781d2caf82f64f1de5e6 100644 (file)
@@ -305,6 +305,7 @@ ZFS_OBJS := \
        bpobj.o \
        bptree.o \
        bqueue.o \
+       brt.o \
        btree.o \
        dataset_kstats.o \
        dbuf.o \
index 6676787967796f4cce15b61cac5c7d1a449c0166..8ec094d4ad1c854d13343a7d4810d031b8b5c305 100644 (file)
@@ -33,11 +33,11 @@ KMOD=       openzfs
        ${SRCDIR}/zstd/lib/decompress
 
 CFLAGS+= -I${INCDIR}
+CFLAGS+= -I${SRCDIR}/icp/include
 CFLAGS+= -I${INCDIR}/os/freebsd
 CFLAGS+= -I${INCDIR}/os/freebsd/spl
 CFLAGS+= -I${INCDIR}/os/freebsd/zfs
 CFLAGS+= -I${SRCDIR}/zstd/include
-CFLAGS+= -I${SRCDIR}/icp/include
 CFLAGS+= -include ${INCDIR}/os/freebsd/spl/sys/ccompile.h
 CFLAGS+= -I${.CURDIR}
 
@@ -243,6 +243,7 @@ SRCS+=      abd.c \
        blkptr.c \
        bplist.c \
        bpobj.c \
+       brt.c \
        btree.c \
        cityhash.c \
        dbuf.c \
index 35edea0a2427ec97f2b746ad40e7a71db6d7c023..eccb91deff4f619c67a752645b73435599c67a97 100644 (file)
@@ -97,6 +97,8 @@ __FBSDID("$FreeBSD$");
 SYSCTL_DECL(_vfs_zfs);
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, arc, CTLFLAG_RW, 0,
        "ZFS adaptive replacement cache");
+SYSCTL_NODE(_vfs_zfs, OID_AUTO, brt, CTLFLAG_RW, 0,
+       "ZFS Block Reference Table");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, condense, CTLFLAG_RW, 0, "ZFS condense");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf, CTLFLAG_RW, 0, "ZFS disk buf cache");
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf_cache, CTLFLAG_RW, 0,
index 9fb2873132bf9bae749deec61a2699f2a7987a86..30851f5273a2e0531cca926aeebab182a1642ad3 100644 (file)
@@ -153,7 +153,12 @@ struct vfsops zfs_vfsops = {
        .vfs_quotactl =         zfs_quotactl,
 };
 
-VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
+#ifdef VFCF_CROSS_COPY_FILE_RANGE
+VFS_SET(zfs_vfsops, zfs,
+    VFCF_DELEGADMIN | VFCF_JAIL | VFCF_CROSS_COPY_FILE_RANGE);
+#else
+VFS_SET(zfs_vfsops, zfs, VFCF_DELEGADMIN | VFCF_JAIL);
+#endif
 
 /*
  * We need to keep a count of active fs's.
index 148def20ce577c7c3db02b3503c5df778f3d1e4b..9169244b1a1381a09f102393f2399e4210910d80 100644 (file)
@@ -30,7 +30,6 @@
 /* Portions Copyright 2010 Robert Milkowski */
 
 
-#include <sys/types.h>
 #include <sys/param.h>
 #include <sys/time.h>
 #include <sys/systm.h>
 #include <vm/vm_param.h>
 #include <sys/zil.h>
 #include <sys/zfs_vnops.h>
+#include <sys/module.h>
+#include <sys/sysent.h>
+#include <security/mac/mac_framework.h>
+#include <sys/dmu_impl.h>
+#include <sys/brt.h>
+#include <sys/zfeature.h>
 
 #include <vm/vm_object.h>
 
@@ -6209,6 +6214,93 @@ zfs_deallocate(struct vop_deallocate_args *ap)
 }
 #endif
 
+#ifndef _SYS_SYSPROTO_H_
+/*
+ * Local declaration of the VOP_COPY_FILE_RANGE argument structure, used
+ * when the generated prototype header (_SYS_SYSPROTO_H_) is not present.
+ */
+struct vop_copy_file_range_args {
+       struct vnode *a_invp;
+       off_t *a_inoffp;
+       struct vnode *a_outvp;
+       off_t *a_outoffp;
+       size_t *a_lenp;
+       unsigned int a_flags;
+       struct ucred *a_incred;
+       struct ucred *a_outcred;
+       struct thread *a_fsizetd;
+};
+#endif
+/*
+ * TODO: FreeBSD will only call file system-specific copy_file_range() if both
+ * files reside under the same mountpoint. In case of ZFS we want to be called
+ * even if the files are in different datasets (but on the same pool; we need
+ * to check that ourselves).
+ */
+/*
+ * VOP_COPY_FILE_RANGE for ZFS: locks both vnodes, performs MAC and
+ * RLIMIT_FSIZE checks on the destination, then delegates the actual
+ * cloning to the OS-independent zfs_clone_range().
+ */
+static int
+zfs_freebsd_copy_file_range(struct vop_copy_file_range_args *ap)
+{
+       struct vnode *invp = ap->a_invp;
+       struct vnode *outvp = ap->a_outvp;
+       struct mount *mp;
+       struct uio io;
+       int error;
+
+       /*
+        * TODO: If offset/length is not aligned to recordsize, use
+        * vn_generic_copy_file_range() on this fragment.
+        * It would be better to do this after we lock the vnodes, but then we
+        * need something else than vn_generic_copy_file_range().
+        */
+
+       /* Lock both vnodes, avoiding risk of deadlock. */
+       do {
+               mp = NULL;
+               error = vn_start_write(outvp, &mp, V_WAIT);
+               if (error == 0) {
+                       error = vn_lock(outvp, LK_EXCLUSIVE);
+                       if (error == 0) {
+                               if (invp == outvp)
+                                       break;
+                               /* Try the input lock without sleeping. */
+                               error = vn_lock(invp, LK_SHARED | LK_NOWAIT);
+                               if (error == 0)
+                                       break;
+                               /*
+                                * Contended: drop the output lock, wait for
+                                * the input lock, release it, and retry the
+                                * whole sequence from scratch.
+                                */
+                               VOP_UNLOCK(outvp);
+                               if (mp != NULL)
+                                       vn_finished_write(mp);
+                               mp = NULL;
+                               error = vn_lock(invp, LK_SHARED);
+                               if (error == 0)
+                                       VOP_UNLOCK(invp);
+                       }
+               }
+               if (mp != NULL)
+                       vn_finished_write(mp);
+       } while (error == 0);
+       if (error != 0)
+               return (error);
+#ifdef MAC
+       /* MAC: verify the caller may write to the destination vnode. */
+       error = mac_vnode_check_write(curthread->td_ucred, ap->a_outcred,
+           outvp);
+       if (error != 0)
+               goto unlock;
+#endif
+
+       /* Enforce RLIMIT_FSIZE against the destination offset/length. */
+       io.uio_offset = *ap->a_outoffp;
+       io.uio_resid = *ap->a_lenp;
+       error = vn_rlimit_fsize(outvp, &io, ap->a_fsizetd);
+       if (error != 0)
+               goto unlock;
+
+       /*
+        * NOTE(review): it looks like a_fsizetd may be NULL for callers with
+        * no size-limiting thread — confirm before dereferencing td_ucred.
+        */
+       error = zfs_clone_range(VTOZ(invp), ap->a_inoffp, VTOZ(outvp),
+           ap->a_outoffp, ap->a_lenp, ap->a_fsizetd->td_ucred);
+
+unlock:
+       if (invp != outvp)
+               VOP_UNLOCK(invp);
+       VOP_UNLOCK(outvp);
+       if (mp != NULL)
+               vn_finished_write(mp);
+
+       return (error);
+}
+
 struct vop_vector zfs_vnodeops;
 struct vop_vector zfs_fifoops;
 struct vop_vector zfs_shareops;
@@ -6272,6 +6364,7 @@ struct vop_vector zfs_vnodeops = {
 #if __FreeBSD_version >= 1400043
        .vop_add_writecount =   vop_stdadd_writecount_nomsync,
 #endif
+       .vop_copy_file_range =  zfs_freebsd_copy_file_range,
 };
 VFS_VOP_VECTOR_REGISTER(zfs_vnodeops);
 
index 304bc71f90dbd4afcffce4ed925e19fca953b80c..dc1d31e1bd1160a4e06480bec2dff819d609db65 100644 (file)
@@ -34,6 +34,7 @@
 #include <sys/systm.h>
 #include <sys/sysmacros.h>
 #include <sys/resource.h>
+#include <sys/resourcevar.h>
 #include <sys/mntent.h>
 #include <sys/u8_textprep.h>
 #include <sys/dsl_dataset.h>
@@ -2113,3 +2114,28 @@ zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf)
        return (err);
 }
 #endif /* _KERNEL */
+
+#ifdef _KERNEL
+/*
+ * Check a proposed file size against the calling thread's RLIMIT_FSIZE.
+ * Returns 0 if the size is within the limit (or there is no thread
+ * context); otherwise posts SIGXFSZ to the process and returns EFBIG.
+ */
+int
+zfs_rlimit_fsize(off_t fsize)
+{
+       struct thread *td = curthread;
+       off_t lim;
+
+       /* No thread context: nothing to enforce. */
+       if (td == NULL)
+               return (0);
+
+       lim = lim_cur(td, RLIMIT_FSIZE);
+       if (__predict_true((uoff_t)fsize <= lim))
+               return (0);
+
+       /*
+        * The limit is reached.
+        */
+       PROC_LOCK(td->td_proc);
+       kern_psignal(td->td_proc, SIGXFSZ);
+       PROC_UNLOCK(td->td_proc);
+
+       return (EFBIG);
+}
+#endif /* _KERNEL */
index ea45c9f8afa9260e1d04c579b5cbf4fc399a2b48..6fe1da8ed46f22fc176dadb5f8c600fc3275a4af 100644 (file)
@@ -725,6 +725,12 @@ zpool_feature_init(void)
                    blake3_deps, sfeatures);
        }
 
+       zfeature_register(SPA_FEATURE_BLOCK_CLONING,
+           "com.fudosecurity:block_cloning", "block_cloning",
+           "Support for block cloning via Block Reference Table.",
+           ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL,
+           sfeatures);
+
        zfs_mod_list_supported_free(sfeatures);
 }
 
index e99acef5a8fb9b25a55fb586d84019d160b16509..459ff62fc996c257b7e6845745b82148d55fe4c0 100644 (file)
@@ -116,6 +116,15 @@ zpool_prop_init(void)
        zprop_register_number(ZPOOL_PROP_DEDUPRATIO, "dedupratio", 0,
            PROP_READONLY, ZFS_TYPE_POOL, "<1.00x or higher if deduped>",
            "DEDUP", B_FALSE, sfeatures);
+       zprop_register_number(ZPOOL_PROP_BCLONEUSED, "bcloneused", 0,
+           PROP_READONLY, ZFS_TYPE_POOL, "<size>",
+           "BCLONE_USED", B_FALSE, sfeatures);
+       zprop_register_number(ZPOOL_PROP_BCLONESAVED, "bclonesaved", 0,
+           PROP_READONLY, ZFS_TYPE_POOL, "<size>",
+           "BCLONE_SAVED", B_FALSE, sfeatures);
+       zprop_register_number(ZPOOL_PROP_BCLONERATIO, "bcloneratio", 0,
+           PROP_READONLY, ZFS_TYPE_POOL, "<1.00x or higher if cloned>",
+           "BCLONE_RATIO", B_FALSE, sfeatures);
 
        /* default number properties */
        zprop_register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION,
diff --git a/module/zfs/brt.c b/module/zfs/brt.c
new file mode 100644 (file)
index 0000000..ca9c4e6
--- /dev/null
@@ -0,0 +1,1884 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2020, 2021, 2022 by Pawel Jakub Dawidek
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/brt.h>
+#include <sys/ddt.h>
+#include <sys/bitmap.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_scan.h>
+#include <sys/vdev_impl.h>
+#include <sys/kstat.h>
+#include <sys/wmsum.h>
+
+/*
+ * Block Cloning design.
+ *
+ * Block Cloning allows to manually clone a file (or a subset of its blocks)
+ * into another (or the same) file by just creating additional references to
+ * the data blocks without copying the data itself. Those references are kept
+ * in the Block Reference Tables (BRTs).
+ *
+ * In many ways this is similar to the existing deduplication, but there are
+ * some important differences:
+ *
+ * - Deduplication is automatic and Block Cloning is not - one has to use a
+ *   dedicated system call(s) to clone the given file/blocks.
+ * - Deduplication keeps all data blocks in its table, even those referenced
+ *   just once. Block Cloning creates an entry in its tables only when there
+ *   are at least two references to the given data block. If the block was
+ *   never explicitly cloned or the second to last reference was dropped,
+ *   there will be neither space nor performance overhead.
+ * - Deduplication needs data to work - one needs to pass real data to the
+ *   write(2) syscall, so hash can be calculated. Block Cloning doesn't require
+ *   data, just block pointers to the data, so it is extremely fast, as we pay
+ *   neither the cost of reading the data, nor the cost of writing the data -
+ *   we operate exclusively on metadata.
+ * - If the D (dedup) bit is not set in the block pointer, it means that
+ *   the block is not in the dedup table (DDT) and we won't consult the DDT
+ *   when we need to free the block. Block Cloning must be consulted on every
+ *   free, because we cannot modify the source BP (eg. by setting something
+ *   similar to the D bit), thus we have no hint if the block is in the
+ *   Block Reference Table (BRT), so we need to look into the BRT. There is
+ *   an optimization in place that allows us to eliminate the majority of BRT
+ *   lookups which is described below in the "Minimizing free penalty" section.
+ * - The BRT entry is much smaller than the DDT entry - for BRT we only store
+ *   64bit offset and 64bit reference counter.
+ * - Dedup keys are cryptographic hashes, so two blocks that are close to each
+ *   other on disk are most likely in totally different parts of the DDT.
+ *   The BRT entry keys are offsets into a single top-level VDEV, so data blocks
+ *   from one file should have BRT entries close to each other.
+ * - Scrub will only do a single pass over a block that is referenced multiple
+ *   times in the DDT. Unfortunately it is not currently (if at all) possible
+ *   with Block Cloning and block referenced multiple times will be scrubbed
+ *   multiple times. The new, sorted scrub should be able to eliminate
+ *   duplicated reads given enough memory.
+ * - Deduplication requires cryptographically strong hash as a checksum or
+ *   additional data verification. Block Cloning works with any checksum
+ *   algorithm or even with checksumming disabled.
+ *
+ * As mentioned above, the BRT entries are much smaller than the DDT entries.
+ * To uniquely identify a block we just need its vdev id and offset. We also
+ * need to maintain a reference counter. The vdev id will often repeat, as there
+ * is a small number of top-level VDEVs and a large number of blocks stored in
+ * each VDEV. We take advantage of that to reduce the BRT entry size further by
+ * maintaining one BRT for each top-level VDEV, so we can then have only offset
+ * and counter as the BRT entry.
+ *
+ * Minimizing free penalty.
+ *
+ * Block Cloning allows creating additional references to any existing block.
+ * When we free a block there is no hint in the block pointer whether the block
+ * was cloned or not, so on each free we have to check if there is a
+ * corresponding entry in the BRT or not. If there is, we need to decrease
+ * the reference counter. Doing BRT lookup on every free can potentially be
+ * expensive by requiring additional I/Os if the BRT doesn't fit into memory.
+ * This is the main problem with deduplication, so we've learned our lesson and
+ * try not to repeat the same mistake here. How do we do that? We divide each
+ * top-level VDEV into 16MB regions. For each region we maintain a counter that
+ * is a sum of all the BRT entries that have offsets within the region. This
+ * creates the entries count array of 16bit numbers for each top-level VDEV.
+ * The entries count array is always kept in memory and updated on disk in the
+ * same transaction group as the BRT updates to keep everything in-sync. We can
+ * keep the array in memory, because it is very small. With 16MB regions and
+ * 1TB VDEV the array requires only 128kB of memory (we may decide to decrease
+ * the region size even further in the future). Now, when we want to free
+ * a block, we first consult the array. If the counter for the whole region is
+ * zero, there is no need to look for the BRT entry, as there isn't one for
+ * sure. If the counter for the region is greater than zero, only then we will
+ * do a BRT lookup and if an entry is found we will decrease the reference
+ * counter in the BRT entry and in the entry counters array.
+ *
+ * The entry counters array is small, but can potentially be larger for very
+ * large VDEVs or smaller regions. In this case we don't want to rewrite entire
+ * array on every change. We then divide the array into 32kB blocks and keep
+ * a bitmap of dirty blocks within a transaction group. When we sync the
+ * transaction group we can only update the parts of the entry counters array
+ * that were modified. Note: Keeping track of the dirty parts of the entry
+ * counters array is implemented, but updating only parts of the array on disk
+ * is not yet implemented - for now we will update entire array if there was
+ * any change.
+ *
+ * The implementation tries to be economic: if BRT is not used, or no longer
+ * used, there will be no entries in the MOS and no additional memory used (eg.
+ * the entry counters array is only allocated if needed).
+ *
+ * Interaction between Deduplication and Block Cloning.
+ *
+ * If both functionalities are in use, we could end up with a block that is
+ * referenced multiple times in both DDT and BRT. When we free one of the
+ * references we couldn't tell where it belongs, so we would have to decide
+ * what table takes the precedence: do we first clear DDT references or BRT
+ * references? To avoid this dilemma BRT cooperates with DDT - if a given block
+ * is being cloned using BRT and the BP has the D (dedup) bit set, BRT will
+ * lookup DDT entry instead and increase the counter there. No BRT entry
+ * will be created for a block which has the D (dedup) bit set.
+ * BRT may be more efficient for manual deduplication, but if the block is
+ * already in the DDT, then creating additional BRT entry would be less
+ * efficient. This clever idea was proposed by Allan Jude.
+ *
+ * Block Cloning across datasets.
+ *
+ * Block Cloning is not limited to cloning blocks within the same dataset.
+ * It is possible (and very useful) to clone blocks between different datasets.
+ * One use case is recovering files from snapshots. By cloning the files into
+ * a dataset we need no additional storage. Without Block Cloning we would need
+ * additional space for those files.
+ * Another interesting use case is moving the files between datasets
+ * (copying the file content to the new dataset and removing the source file).
+ * In that case Block Cloning will only be used briefly, because the BRT entries
+ * will be removed when the source is removed.
+ * Note: currently it is not possible to clone blocks between encrypted
+ * datasets, even if those datasets use the same encryption key (this includes
+ * snapshots of encrypted datasets). Cloning blocks between datasets that use
+ * the same keys should be possible and should be implemented in the future.
+ *
+ * Block Cloning flow through ZFS layers.
+ *
+ * Note: Block Cloning can be used both for cloning file system blocks and ZVOL
+ * blocks. As of this writing no interface is implemented that allows for block
+ * cloning within a ZVOL.
+ * FreeBSD and Linux provide the copy_file_range(2) system call and we will use
+ * it for block cloning.
+ *
+ *     ssize_t
+ *     copy_file_range(int infd, off_t *inoffp, int outfd, off_t *outoffp,
+ *                     size_t len, unsigned int flags);
+ *
+ * Even though offsets and length represent bytes, they have to be
+ * block-aligned or we will return the EXDEV error so the upper layer can
+ * fallback to the generic mechanism that will just copy the data.
+ * Using copy_file_range(2) will call OS-independent zfs_clone_range() function.
+ * This function was implemented based on zfs_write(), but instead of writing
+ * the given data we first read block pointers using the new dmu_read_l0_bps()
+ * function from the source file. Once we have BPs from the source file we call
+ * the dmu_brt_clone() function on the destination file. This function
+ * allocates BPs for us. We iterate over all source BPs. If the given BP is
+ * a hole or an embedded block, we just copy BP as-is. If it points to a real
+ * data we place this BP on a BRT pending list using the brt_pending_add()
+ * function.
+ *
+ * We use this pending list to keep track of all BPs that got new references
+ * within this transaction group.
+ *
+ * Some special cases to consider and how we address them:
+ * - The block we want to clone may have been created within the same
+ *   transaction group that we are trying to clone. Such block has no BP
+ *   allocated yet, so cannot be immediately cloned. We return EXDEV.
+ * - The block we want to clone may have been modified within the same
+ *   transaction group. We return EXDEV.
+ * - A block may be cloned multiple times during one transaction group (that's
+ *   why pending list is actually a tree and not an append-only list - this
+ *   way we can figure out faster if this block is cloned for the first time
+ *   in this txg or consecutive time).
+ * - A block may be cloned and freed within the same transaction group
+ *   (see dbuf_undirty()).
+ * - A block may be cloned and within the same transaction group the clone
+ *   can be cloned again (see dmu_read_l0_bps()).
+ * - A file might have been deleted, but the caller still has a file descriptor
+ *   open to this file and clones it.
+ *
+ * When we free a block we have an additional step in the ZIO pipeline where we
+ * call the zio_brt_free() function. We then call the brt_entry_decref()
+ * that loads the corresponding BRT entry (if one exists) and decreases
+ * reference counter. If this is not the last reference we will stop ZIO
+ * pipeline here. If this is the last reference or the block is not in the
+ * BRT, we continue the pipeline and free the block as usual.
+ *
+ * At the beginning of spa_sync() where there can be no more block cloning,
+ * but before issuing frees we call brt_pending_apply(). This function applies
+ * all the new clones to the BRT table - we load BRT entries and update
+ * reference counters. To sync new BRT entries to disk, we use brt_sync()
+ * function. This function will sync all dirty per-top-level-vdev BRTs,
+ * the entry counters arrays, etc.
+ *
+ * Block Cloning and ZIL.
+ *
+ * Every clone operation is divided into chunks (similar to write) and each
+ * chunk is cloned in a separate transaction. The chunk size is determined by
+ * how many BPs we can fit into a single ZIL entry.
+ * Replaying clone operation is different from the regular clone operation,
+ * as when we log clone operations we cannot use the source object - it may
+ * reside on a different dataset, so we log BPs we want to clone.
+ * The ZIL is replayed when we mount the given dataset, not when the pool is
+ * imported. Taking this into account it is possible that the pool is imported
+ * without mounting datasets and the source dataset is destroyed before the
+ * destination dataset is mounted and its ZIL replayed.
+ * To address this situation we leverage zil_claim() mechanism where ZFS will
+ * parse all the ZILs on pool import. When we come across TX_CLONE_RANGE
+ * entries, we will bump reference counters for their BPs in the BRT and then
+ * on mount and ZIL replay we will just attach BPs to the file without
+ * bumping reference counters.
+ * Note it is still possible that after zil_claim() we never mount the
+ * destination, so we never replay its ZIL and we destroy it. This way we would
+ * end up with leaked references in BRT. We address that too as ZFS gives us
+ * a chance to clean this up on dataset destroy (see zil_free_clone_range()).
+ */
+
+/*
+ * BRT - Block Reference Table.
+ */
+#define        BRT_OBJECT_VDEV_PREFIX  "com.fudosecurity:brt:vdev:"
+
+/*
+ * We divide each VDEV into 16MB chunks. Each chunk is represented in memory
+ * by a 16bit counter, thus 1TB VDEV requires 128kB of memory: (1TB / 16MB) * 2B
+ * Each element in this array represents how many BRT entries do we have in this
+ * chunk of storage. We always load this entire array into memory and update as
+ * needed. By having it in memory we can quickly tell (during zio_free()) if
+ * there are any BRT entries that we might need to update.
+ *
+ * This value cannot be larger than 16MB, at least as long as we support
+ * 512 byte block sizes. With 512 byte block size we can have exactly
+ * 32768 blocks in 16MB. In 32MB we could have 65536 blocks, which is one too
+ * many for a 16bit counter.
+ */
+#define        BRT_RANGESIZE   (16 * 1024 * 1024)
+_Static_assert(BRT_RANGESIZE / SPA_MINBLOCKSIZE <= UINT16_MAX,
+       "BRT_RANGESIZE is too large.");
+/*
+ * We don't want to update the whole structure every time. Maintain bitmap
+ * of dirty blocks within the regions, so that a single bit represents a
+ * block size of entcounts. For example if we have a 1PB vdev then all
+ * entcounts take 128MB of memory ((64TB / 16MB) * 2B). We can divide this
+ * 128MB array of entcounts into 32kB disk blocks, as we don't want to update
+ * the whole 128MB on disk when we have updated only a single entcount.
+ * We maintain a bitmap where each 32kB disk block within 128MB entcounts array
+ * is represented by a single bit. This gives us 4096 bits. A set bit in the
+ * bitmap means that we had a change in at least one of the 16384 entcounts
+ * that reside on a 32kB disk block (32kB / sizeof (uint16_t)).
+ */
+#define        BRT_BLOCKSIZE   (32 * 1024)
+#define        BRT_RANGESIZE_TO_NBLOCKS(size)                                  \
+       (((size) - 1) / BRT_BLOCKSIZE / sizeof (uint16_t) + 1)
+
+#define        BRT_LITTLE_ENDIAN       0
+#define        BRT_BIG_ENDIAN          1
+#ifdef _ZFS_LITTLE_ENDIAN
+#define        BRT_NATIVE_BYTEORDER            BRT_LITTLE_ENDIAN
+#define        BRT_NON_NATIVE_BYTEORDER        BRT_BIG_ENDIAN
+#else
+#define        BRT_NATIVE_BYTEORDER            BRT_BIG_ENDIAN
+#define        BRT_NON_NATIVE_BYTEORDER        BRT_LITTLE_ENDIAN
+#endif
+
+typedef struct brt_vdev_phys {
+       uint64_t        bvp_mos_entries;
+       uint64_t        bvp_size;
+       uint64_t        bvp_byteorder;
+       uint64_t        bvp_totalcount;
+       uint64_t        bvp_rangesize;
+       uint64_t        bvp_usedspace;
+       uint64_t        bvp_savedspace;
+} brt_vdev_phys_t;
+
+typedef struct brt_vdev {
+       /*
+        * VDEV id.
+        */
+       uint64_t        bv_vdevid;
+       /*
+        * Is the structure initialized?
+        * (bv_entcount and bv_bitmap are allocated?)
+        */
+       boolean_t       bv_initiated;
+       /*
+        * Object number in the MOS for the entcount array and brt_vdev_phys.
+        */
+       uint64_t        bv_mos_brtvdev;
+       /*
+        * Object number in the MOS for the entries table.
+        */
+       uint64_t        bv_mos_entries;
+       /*
+        * Entries to sync.
+        */
+       avl_tree_t      bv_tree;
+       /*
+        * Does the bv_entcount[] array need byte swapping?
+        */
+       boolean_t       bv_need_byteswap;
+       /*
+        * Number of entries in the bv_entcount[] array.
+        */
+       uint64_t        bv_size;
+       /*
+        * This is the array with BRT entry count per BRT_RANGESIZE.
+        */
+       uint16_t        *bv_entcount;
+       /*
+        * Sum of all bv_entcount[]s.
+        */
+       uint64_t        bv_totalcount;
+       /*
+        * Space on disk occupied by cloned blocks (without compression).
+        */
+       uint64_t        bv_usedspace;
+       /*
+        * How much additional space would be occupied without block cloning.
+        */
+       uint64_t        bv_savedspace;
+       /*
+        * brt_vdev_phys needs updating on disk.
+        */
+       boolean_t       bv_meta_dirty;
+       /*
+        * bv_entcount[] needs updating on disk.
+        */
+       boolean_t       bv_entcount_dirty;
+       /*
+        * bv_entcount[] can potentially be a bit too big to synchronize it all
+        * when we have just changed a few entcounts. The fields below allow us to
+        * track updates to bv_entcount[] array since the last sync.
+        * A single bit in the bv_bitmap represents as many entcounts as can
+        * fit into a single BRT_BLOCKSIZE.
+        * For example we have 65536 entcounts in the bv_entcount array
+        * (so the whole array is 128kB). We updated bv_entcount[2] and
+        * bv_entcount[5]. In that case only first bit in the bv_bitmap will
+        * be set and we will write only first BRT_BLOCKSIZE out of 128kB.
+        */
+       ulong_t         *bv_bitmap;
+       uint64_t        bv_nblocks;
+} brt_vdev_t;
+
+/*
+ * In-core brt
+ */
+typedef struct brt {
+       krwlock_t       brt_lock;
+       spa_t           *brt_spa;
+#define        brt_mos         brt_spa->spa_meta_objset
+       uint64_t        brt_rangesize;
+       uint64_t        brt_usedspace;
+       uint64_t        brt_savedspace;
+       avl_tree_t      brt_pending_tree[TXG_SIZE];
+       kmutex_t        brt_pending_lock[TXG_SIZE];
+       /* Sum of all entries across all bv_trees. */
+       uint64_t        brt_nentries;
+       brt_vdev_t      *brt_vdevs;
+       uint64_t        brt_nvdevs;
+} brt_t;
+
+/* Size of bre_offset / sizeof (uint64_t). */
+#define        BRT_KEY_WORDS   (1)
+
+/*
+ * In-core brt entry.
+ * On-disk we use bre_offset as the key and bre_refcount as the value.
+ */
+typedef struct brt_entry {
+       uint64_t        bre_offset;
+       uint64_t        bre_refcount;
+       avl_node_t      bre_node;
+} brt_entry_t;
+
+typedef struct brt_pending_entry {
+       blkptr_t        bpe_bp;
+       int             bpe_count;
+       avl_node_t      bpe_node;
+} brt_pending_entry_t;
+
+static kmem_cache_t *brt_entry_cache;
+static kmem_cache_t *brt_pending_entry_cache;
+
+/*
+ * Enable/disable prefetching of BRT entries that we are going to modify.
+ */
+int zfs_brt_prefetch = 1;
+
+#ifdef ZFS_DEBUG
+#define        BRT_DEBUG(...)  do {                                            \
+       if ((zfs_flags & ZFS_DEBUG_BRT) != 0) {                         \
+               __dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__); \
+       }                                                               \
+} while (0)
+#else
+#define        BRT_DEBUG(...)  do { } while (0)
+#endif
+
+int brt_zap_leaf_blockshift = 12;
+int brt_zap_indirect_blockshift = 12;
+
+static kstat_t *brt_ksp;
+
+typedef struct brt_stats {
+       kstat_named_t brt_addref_entry_in_memory;
+       kstat_named_t brt_addref_entry_not_on_disk;
+       kstat_named_t brt_addref_entry_on_disk;
+       kstat_named_t brt_addref_entry_read_lost_race;
+       kstat_named_t brt_decref_entry_in_memory;
+       kstat_named_t brt_decref_entry_loaded_from_disk;
+       kstat_named_t brt_decref_entry_not_in_memory;
+       kstat_named_t brt_decref_entry_not_on_disk;
+       kstat_named_t brt_decref_entry_read_lost_race;
+       kstat_named_t brt_decref_entry_still_referenced;
+       kstat_named_t brt_decref_free_data_later;
+       kstat_named_t brt_decref_free_data_now;
+       kstat_named_t brt_decref_no_entry;
+} brt_stats_t;
+
+static brt_stats_t brt_stats = {
+       { "addref_entry_in_memory",             KSTAT_DATA_UINT64 },
+       { "addref_entry_not_on_disk",           KSTAT_DATA_UINT64 },
+       { "addref_entry_on_disk",               KSTAT_DATA_UINT64 },
+       { "addref_entry_read_lost_race",        KSTAT_DATA_UINT64 },
+       { "decref_entry_in_memory",             KSTAT_DATA_UINT64 },
+       { "decref_entry_loaded_from_disk",      KSTAT_DATA_UINT64 },
+       { "decref_entry_not_in_memory",         KSTAT_DATA_UINT64 },
+       { "decref_entry_not_on_disk",           KSTAT_DATA_UINT64 },
+       { "decref_entry_read_lost_race",        KSTAT_DATA_UINT64 },
+       { "decref_entry_still_referenced",      KSTAT_DATA_UINT64 },
+       { "decref_free_data_later",             KSTAT_DATA_UINT64 },
+       { "decref_free_data_now",               KSTAT_DATA_UINT64 },
+       { "decref_no_entry",                    KSTAT_DATA_UINT64 }
+};
+
+struct {
+       wmsum_t brt_addref_entry_in_memory;
+       wmsum_t brt_addref_entry_not_on_disk;
+       wmsum_t brt_addref_entry_on_disk;
+       wmsum_t brt_addref_entry_read_lost_race;
+       wmsum_t brt_decref_entry_in_memory;
+       wmsum_t brt_decref_entry_loaded_from_disk;
+       wmsum_t brt_decref_entry_not_in_memory;
+       wmsum_t brt_decref_entry_not_on_disk;
+       wmsum_t brt_decref_entry_read_lost_race;
+       wmsum_t brt_decref_entry_still_referenced;
+       wmsum_t brt_decref_free_data_later;
+       wmsum_t brt_decref_free_data_now;
+       wmsum_t brt_decref_no_entry;
+} brt_sums;
+
+#define        BRTSTAT_BUMP(stat)      wmsum_add(&brt_sums.stat, 1)
+
+static int brt_entry_compare(const void *x1, const void *x2);
+static int brt_pending_entry_compare(const void *x1, const void *x2);
+
+/* Take the global BRT lock as reader. */
+static void
+brt_rlock(brt_t *brt)
+{
+       rw_enter(&brt->brt_lock, RW_READER);
+}
+
+/* Take the global BRT lock as writer. */
+static void
+brt_wlock(brt_t *brt)
+{
+       rw_enter(&brt->brt_lock, RW_WRITER);
+}
+
+/* Drop the global BRT lock (reader or writer). */
+static void
+brt_unlock(brt_t *brt)
+{
+       rw_exit(&brt->brt_lock);
+}
+
+/*
+ * Read the per-region entry count at idx, byte-swapping when the in-core
+ * array is kept in non-native endianness (bv_need_byteswap).
+ */
+static uint16_t
+brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx)
+{
+
+       ASSERT3U(idx, <, brtvd->bv_size);
+
+       if (brtvd->bv_need_byteswap) {
+               return (BSWAP_16(brtvd->bv_entcount[idx]));
+       } else {
+               return (brtvd->bv_entcount[idx]);
+       }
+}
+
+/*
+ * Store entcnt at idx, byte-swapping when the in-core array is kept in
+ * non-native endianness (bv_need_byteswap).
+ */
+static void
+brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt)
+{
+
+       ASSERT3U(idx, <, brtvd->bv_size);
+
+       if (brtvd->bv_need_byteswap) {
+               brtvd->bv_entcount[idx] = BSWAP_16(entcnt);
+       } else {
+               brtvd->bv_entcount[idx] = entcnt;
+       }
+}
+
+/* Increment the entry count at idx; asserts it is not already UINT16_MAX. */
+static void
+brt_vdev_entcount_inc(brt_vdev_t *brtvd, uint64_t idx)
+{
+       uint16_t entcnt;
+
+       ASSERT3U(idx, <, brtvd->bv_size);
+
+       entcnt = brt_vdev_entcount_get(brtvd, idx);
+       ASSERT(entcnt < UINT16_MAX);
+
+       brt_vdev_entcount_set(brtvd, idx, entcnt + 1);
+}
+
+/* Decrement the entry count at idx; asserts it is non-zero. */
+static void
+brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx)
+{
+       uint16_t entcnt;
+
+       ASSERT3U(idx, <, brtvd->bv_size);
+
+       entcnt = brt_vdev_entcount_get(brtvd, idx);
+       ASSERT(entcnt > 0);
+
+       brt_vdev_entcount_set(brtvd, idx, entcnt - 1);
+}
+
+#ifdef ZFS_DEBUG
+/*
+ * Dump the in-core state of every BRT VDEV to the debug log.  No-op
+ * unless the ZFS_DEBUG_BRT flag is set in zfs_flags.  Debug-only.
+ */
+static void
+brt_vdev_dump(brt_t *brt)
+{
+       brt_vdev_t *brtvd;
+       uint64_t vdevid;
+
+       if ((zfs_flags & ZFS_DEBUG_BRT) == 0) {
+               return;
+       }
+
+       if (brt->brt_nvdevs == 0) {
+               zfs_dbgmsg("BRT empty");
+               return;
+       }
+
+       zfs_dbgmsg("BRT vdev dump:");
+       for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
+               uint64_t idx;
+
+               brtvd = &brt->brt_vdevs[vdevid];
+               zfs_dbgmsg("  vdevid=%llu/%llu meta_dirty=%d entcount_dirty=%d "
+                   "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n",
+                   (u_longlong_t)vdevid, (u_longlong_t)brtvd->bv_vdevid,
+                   brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty,
+                   (u_longlong_t)brtvd->bv_size,
+                   (u_longlong_t)brtvd->bv_totalcount,
+                   (u_longlong_t)brtvd->bv_nblocks,
+                   (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks));
+               if (brtvd->bv_totalcount > 0) {
+                       zfs_dbgmsg("    entcounts:");
+                       for (idx = 0; idx < brtvd->bv_size; idx++) {
+                               if (brt_vdev_entcount_get(brtvd, idx) > 0) {
+                                       zfs_dbgmsg("      [%04llu] %hu",
+                                           (u_longlong_t)idx,
+                                           brt_vdev_entcount_get(brtvd, idx));
+                               }
+                       }
+               }
+               if (brtvd->bv_entcount_dirty) {
+                       char *bitmap;
+
+                       /* Render the dirty bitmap as an 'x'/'.' string. */
+                       bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP);
+                       for (idx = 0; idx < brtvd->bv_nblocks; idx++) {
+                               bitmap[idx] =
+                                   BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.';
+                       }
+                       bitmap[idx] = '\0';
+                       zfs_dbgmsg("    bitmap: %s", bitmap);
+                       kmem_free(bitmap, brtvd->bv_nblocks + 1);
+               }
+       }
+}
+#endif
+
+/*
+ * Translate a vdev id into its BRT VDEV slot, or NULL when the id is
+ * beyond the currently allocated brt_vdevs[] array.  The BRT lock must
+ * be held (any mode).
+ */
+static brt_vdev_t *
+brt_vdev(brt_t *brt, uint64_t vdevid)
+{
+
+       ASSERT(RW_LOCK_HELD(&brt->brt_lock));
+
+       if (vdevid >= brt->brt_nvdevs)
+               return (NULL);
+
+       return (&brt->brt_vdevs[vdevid]);
+}
+
+/*
+ * Create the on-disk (MOS) state for a BRT VDEV: the entries ZAP, the
+ * object backing bv_entcount[] (with a brt_vdev_phys_t bonus buffer),
+ * and a pool-directory entry pointing at the latter.  Also takes a
+ * reference on the block_cloning feature.
+ */
+static void
+brt_vdev_create(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
+{
+       char name[64];
+
+       ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+       ASSERT0(brtvd->bv_mos_brtvdev);
+       ASSERT0(brtvd->bv_mos_entries);
+       ASSERT(brtvd->bv_entcount != NULL);
+       ASSERT(brtvd->bv_size > 0);
+       ASSERT(brtvd->bv_bitmap != NULL);
+       ASSERT(brtvd->bv_nblocks > 0);
+
+       brtvd->bv_mos_entries = zap_create_flags(brt->brt_mos, 0,
+           ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY, DMU_OTN_ZAP_METADATA,
+           brt_zap_leaf_blockshift, brt_zap_indirect_blockshift, DMU_OT_NONE,
+           0, tx);
+       VERIFY(brtvd->bv_mos_entries != 0);
+       BRT_DEBUG("MOS entries created, object=%llu",
+           (u_longlong_t)brtvd->bv_mos_entries);
+
+       /*
+        * We allocate DMU buffer to store the bv_entcount[] array.
+        * We will keep array size (bv_size) and cummulative count for all
+        * bv_entcount[]s (bv_totalcount) in the bonus buffer.
+        */
+       brtvd->bv_mos_brtvdev = dmu_object_alloc(brt->brt_mos,
+           DMU_OTN_UINT64_METADATA, BRT_BLOCKSIZE,
+           DMU_OTN_UINT64_METADATA, sizeof (brt_vdev_phys_t), tx);
+       VERIFY(brtvd->bv_mos_brtvdev != 0);
+       BRT_DEBUG("MOS BRT VDEV created, object=%llu",
+           (u_longlong_t)brtvd->bv_mos_brtvdev);
+
+       snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
+           (u_longlong_t)brtvd->bv_vdevid);
+       VERIFY0(zap_add(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
+           sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev, tx));
+       BRT_DEBUG("Pool directory object created, object=%s", name);
+
+       spa_feature_incr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
+}
+
+/*
+ * (Re)allocate the in-core bv_entcount[] array and dirty bitmap to
+ * match the vdev's current asize, preserving any existing counts.
+ * The first call for a vdev also creates its AVL tree of in-core
+ * entries and marks it initiated.
+ */
+static void
+brt_vdev_realloc(brt_t *brt, brt_vdev_t *brtvd)
+{
+       vdev_t *vd;
+       uint16_t *entcount;
+       ulong_t *bitmap;
+       uint64_t nblocks, size;
+
+       ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+
+       /* Compute how many brt_rangesize ranges cover the vdev. */
+       spa_config_enter(brt->brt_spa, SCL_VDEV, FTAG, RW_READER);
+       vd = vdev_lookup_top(brt->brt_spa, brtvd->bv_vdevid);
+       size = (vdev_get_min_asize(vd) - 1) / brt->brt_rangesize + 1;
+       spa_config_exit(brt->brt_spa, SCL_VDEV, FTAG);
+
+       entcount = kmem_zalloc(sizeof (entcount[0]) * size, KM_SLEEP);
+       nblocks = BRT_RANGESIZE_TO_NBLOCKS(size);
+       bitmap = kmem_zalloc(BT_SIZEOFMAP(nblocks), KM_SLEEP);
+
+       if (!brtvd->bv_initiated) {
+               ASSERT0(brtvd->bv_size);
+               ASSERT(brtvd->bv_entcount == NULL);
+               ASSERT(brtvd->bv_bitmap == NULL);
+               ASSERT0(brtvd->bv_nblocks);
+
+               avl_create(&brtvd->bv_tree, brt_entry_compare,
+                   sizeof (brt_entry_t), offsetof(brt_entry_t, bre_node));
+       } else {
+               ASSERT(brtvd->bv_size > 0);
+               ASSERT(brtvd->bv_entcount != NULL);
+               ASSERT(brtvd->bv_bitmap != NULL);
+               ASSERT(brtvd->bv_nblocks > 0);
+               /*
+                * TODO: Allow vdev shrinking. We only need to implement
+                * shrinking the on-disk BRT VDEV object.
+                * dmu_free_range(brt->brt_mos, brtvd->bv_mos_brtvdev, offset,
+                *     size, tx);
+                */
+               ASSERT3U(brtvd->bv_size, <=, size);
+
+               /* Carry the old counts/bitmap over to the new arrays. */
+               memcpy(entcount, brtvd->bv_entcount,
+                   sizeof (entcount[0]) * MIN(size, brtvd->bv_size));
+               memcpy(bitmap, brtvd->bv_bitmap, MIN(BT_SIZEOFMAP(nblocks),
+                   BT_SIZEOFMAP(brtvd->bv_nblocks)));
+               kmem_free(brtvd->bv_entcount,
+                   sizeof (entcount[0]) * brtvd->bv_size);
+               kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
+       }
+
+       brtvd->bv_size = size;
+       brtvd->bv_entcount = entcount;
+       brtvd->bv_bitmap = bitmap;
+       brtvd->bv_nblocks = nblocks;
+       if (!brtvd->bv_initiated) {
+               brtvd->bv_need_byteswap = FALSE;
+               brtvd->bv_initiated = TRUE;
+               BRT_DEBUG("BRT VDEV %llu initiated.",
+                   (u_longlong_t)brtvd->bv_vdevid);
+       }
+}
+
+/*
+ * Load a BRT VDEV's persistent state from the MOS, if any.  Silently
+ * returns when no pool-directory entry exists for this vdev (it has no
+ * cloned blocks).  Also establishes/validates brt_rangesize from the
+ * on-disk value.
+ */
+static void
+brt_vdev_load(brt_t *brt, brt_vdev_t *brtvd)
+{
+       char name[64];
+       dmu_buf_t *db;
+       brt_vdev_phys_t *bvphys;
+       int error;
+
+       snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
+           (u_longlong_t)brtvd->bv_vdevid);
+       error = zap_lookup(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name,
+           sizeof (uint64_t), 1, &brtvd->bv_mos_brtvdev);
+       if (error != 0)
+               return;
+       ASSERT(brtvd->bv_mos_brtvdev != 0);
+
+       error = dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db);
+       ASSERT0(error);
+       if (error != 0)
+               return;
+
+       bvphys = db->db_data;
+       if (brt->brt_rangesize == 0) {
+               brt->brt_rangesize = bvphys->bvp_rangesize;
+       } else {
+               ASSERT3U(brt->brt_rangesize, ==, bvphys->bvp_rangesize);
+       }
+
+       ASSERT(!brtvd->bv_initiated);
+       brt_vdev_realloc(brt, brtvd);
+
+       /* TODO: We don't support VDEV shrinking. */
+       ASSERT3U(bvphys->bvp_size, <=, brtvd->bv_size);
+
+       /*
+        * If VDEV grew, we will leave new bv_entcount[] entries zeroed out.
+        */
+       error = dmu_read(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
+           MIN(brtvd->bv_size, bvphys->bvp_size) * sizeof (uint16_t),
+           brtvd->bv_entcount, DMU_READ_NO_PREFETCH);
+       ASSERT0(error);
+
+       brtvd->bv_mos_entries = bvphys->bvp_mos_entries;
+       ASSERT(brtvd->bv_mos_entries != 0);
+       brtvd->bv_need_byteswap =
+           (bvphys->bvp_byteorder != BRT_NATIVE_BYTEORDER);
+       brtvd->bv_totalcount = bvphys->bvp_totalcount;
+       brtvd->bv_usedspace = bvphys->bvp_usedspace;
+       brtvd->bv_savedspace = bvphys->bvp_savedspace;
+       brt->brt_usedspace += brtvd->bv_usedspace;
+       brt->brt_savedspace += brtvd->bv_savedspace;
+
+       dmu_buf_rele(db, FTAG);
+
+       BRT_DEBUG("MOS BRT VDEV %s loaded: mos_brtvdev=%llu, mos_entries=%llu",
+           name, (u_longlong_t)brtvd->bv_mos_brtvdev,
+           (u_longlong_t)brtvd->bv_mos_entries);
+}
+
+/*
+ * Free a BRT VDEV's in-core state (entcount array, bitmap, AVL tree).
+ * The on-disk state, if any, is left untouched.  The entry tree must
+ * already be empty.
+ */
+static void
+brt_vdev_dealloc(brt_t *brt, brt_vdev_t *brtvd)
+{
+
+       ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+       ASSERT(brtvd->bv_initiated);
+
+       kmem_free(brtvd->bv_entcount, sizeof (uint16_t) * brtvd->bv_size);
+       brtvd->bv_entcount = NULL;
+       kmem_free(brtvd->bv_bitmap, BT_SIZEOFMAP(brtvd->bv_nblocks));
+       brtvd->bv_bitmap = NULL;
+       ASSERT0(avl_numnodes(&brtvd->bv_tree));
+       avl_destroy(&brtvd->bv_tree);
+
+       brtvd->bv_size = 0;
+       brtvd->bv_nblocks = 0;
+
+       brtvd->bv_initiated = FALSE;
+       BRT_DEBUG("BRT VDEV %llu deallocated.", (u_longlong_t)brtvd->bv_vdevid);
+}
+
+/*
+ * Destroy the now-empty on-disk state of a BRT VDEV (entries ZAP,
+ * entcount object, pool-directory entry), free its in-core state, and
+ * release the block_cloning feature reference taken in
+ * brt_vdev_create().
+ */
+static void
+brt_vdev_destroy(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
+{
+       char name[64];
+       uint64_t count;
+       dmu_buf_t *db;
+       brt_vdev_phys_t *bvphys;
+
+       ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+       ASSERT(brtvd->bv_mos_brtvdev != 0);
+       ASSERT(brtvd->bv_mos_entries != 0);
+
+       VERIFY0(zap_count(brt->brt_mos, brtvd->bv_mos_entries, &count));
+       VERIFY0(count);
+       VERIFY0(zap_destroy(brt->brt_mos, brtvd->bv_mos_entries, tx));
+       BRT_DEBUG("MOS entries destroyed, object=%llu",
+           (u_longlong_t)brtvd->bv_mos_entries);
+       brtvd->bv_mos_entries = 0;
+
+       VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
+       bvphys = db->db_data;
+       ASSERT0(bvphys->bvp_totalcount);
+       ASSERT0(bvphys->bvp_usedspace);
+       ASSERT0(bvphys->bvp_savedspace);
+       dmu_buf_rele(db, FTAG);
+
+       VERIFY0(dmu_object_free(brt->brt_mos, brtvd->bv_mos_brtvdev, tx));
+       BRT_DEBUG("MOS BRT VDEV destroyed, object=%llu",
+           (u_longlong_t)brtvd->bv_mos_brtvdev);
+       brtvd->bv_mos_brtvdev = 0;
+
+       snprintf(name, sizeof (name), "%s%llu", BRT_OBJECT_VDEV_PREFIX,
+           (u_longlong_t)brtvd->bv_vdevid);
+       VERIFY0(zap_remove(brt->brt_mos, DMU_POOL_DIRECTORY_OBJECT, name, tx));
+       BRT_DEBUG("Pool directory object removed, object=%s", name);
+
+       brt_vdev_dealloc(brt, brtvd);
+
+       spa_feature_decr(brt->brt_spa, SPA_FEATURE_BLOCK_CLONING, tx);
+}
+
+/*
+ * Grow the brt_vdevs[] array to hold nvdevs slots, copying the old
+ * contents and zero-initializing the new tail.  The old array is
+ * freed, so any cached brt_vdev_t pointers become invalid.
+ */
+static void
+brt_vdevs_expand(brt_t *brt, uint64_t nvdevs)
+{
+       brt_vdev_t *brtvd, *vdevs;
+       uint64_t vdevid;
+
+       ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+       ASSERT3U(nvdevs, >, brt->brt_nvdevs);
+
+       vdevs = kmem_zalloc(sizeof (vdevs[0]) * nvdevs, KM_SLEEP);
+       if (brt->brt_nvdevs > 0) {
+               ASSERT(brt->brt_vdevs != NULL);
+
+               memcpy(vdevs, brt->brt_vdevs,
+                   sizeof (brt_vdev_t) * brt->brt_nvdevs);
+               kmem_free(brt->brt_vdevs,
+                   sizeof (brt_vdev_t) * brt->brt_nvdevs);
+       }
+       for (vdevid = brt->brt_nvdevs; vdevid < nvdevs; vdevid++) {
+               brtvd = &vdevs[vdevid];
+
+               brtvd->bv_vdevid = vdevid;
+               brtvd->bv_initiated = FALSE;
+       }
+
+       BRT_DEBUG("BRT VDEVs expanded from %llu to %llu.",
+           (u_longlong_t)brt->brt_nvdevs, (u_longlong_t)nvdevs);
+
+       brt->brt_vdevs = vdevs;
+       brt->brt_nvdevs = nvdevs;
+}
+
+/*
+ * Quick membership test: TRUE when the range containing this entry's
+ * offset has a non-zero reference count.  FALSE is authoritative;
+ * TRUE only means an on-disk ZAP lookup is worthwhile.
+ */
+static boolean_t
+brt_vdev_lookup(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre)
+{
+       uint64_t idx;
+
+       ASSERT(RW_LOCK_HELD(&brt->brt_lock));
+
+       idx = bre->bre_offset / brt->brt_rangesize;
+       if (brtvd->bv_entcount == NULL || idx >= brtvd->bv_size) {
+               /* VDEV was expanded past the allocated entcount array. */
+               return (FALSE);
+       }
+
+       return (brt_vdev_entcount_get(brtvd, idx) > 0);
+}
+
+/*
+ * Account a new reference on an entry: update saved/used space and,
+ * for the first reference (the caller bumped bre_refcount to 1 before
+ * calling us), the per-range entry counts and the dirty bitmap.
+ */
+static void
+brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
+    uint64_t dsize)
+{
+       uint64_t idx;
+
+       ASSERT(RW_LOCK_HELD(&brt->brt_lock));
+       ASSERT(brtvd != NULL);
+       ASSERT(brtvd->bv_entcount != NULL);
+
+       brt->brt_savedspace += dsize;
+       brtvd->bv_savedspace += dsize;
+       brtvd->bv_meta_dirty = TRUE;
+
+       /* Only the first reference changes the entry counts. */
+       if (bre->bre_refcount > 1) {
+               return;
+       }
+
+       brt->brt_usedspace += dsize;
+       brtvd->bv_usedspace += dsize;
+
+       idx = bre->bre_offset / brt->brt_rangesize;
+       if (idx >= brtvd->bv_size) {
+               /* VDEV has been expanded. */
+               brt_vdev_realloc(brt, brtvd);
+       }
+
+       ASSERT3U(idx, <, brtvd->bv_size);
+
+       brtvd->bv_totalcount++;
+       brt_vdev_entcount_inc(brtvd, idx);
+       brtvd->bv_entcount_dirty = TRUE;
+       /*
+        * NOTE(review): dividing by BRT_BLOCKSIZE*8 maps many entcount
+        * blocks onto one bitmap bit — coarser than one bit per
+        * BRT_BLOCKSIZE block of uint16_t entries.  Harmless today
+        * because brt_vdev_sync() rewrites the whole array, but confirm
+        * before the bitmap is ever used for partial writes.
+        */
+       idx = idx / BRT_BLOCKSIZE / 8;
+       BT_SET(brtvd->bv_bitmap, idx);
+
+#ifdef ZFS_DEBUG
+       brt_vdev_dump(brt);
+#endif
+}
+
+/*
+ * Account a dropped reference: update saved/used space and, when the
+ * last reference went away (caller already decremented bre_refcount to
+ * 0), the per-range entry counts and the dirty bitmap.
+ */
+static void
+brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre,
+    uint64_t dsize)
+{
+       uint64_t idx;
+
+       ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+       ASSERT(brtvd != NULL);
+       ASSERT(brtvd->bv_entcount != NULL);
+
+       brt->brt_savedspace -= dsize;
+       brtvd->bv_savedspace -= dsize;
+       brtvd->bv_meta_dirty = TRUE;
+
+       /* Only the removal of the last reference changes the counts. */
+       if (bre->bre_refcount > 0) {
+               return;
+       }
+
+       brt->brt_usedspace -= dsize;
+       brtvd->bv_usedspace -= dsize;
+
+       idx = bre->bre_offset / brt->brt_rangesize;
+       ASSERT3U(idx, <, brtvd->bv_size);
+
+       ASSERT(brtvd->bv_totalcount > 0);
+       brtvd->bv_totalcount--;
+       brt_vdev_entcount_dec(brtvd, idx);
+       brtvd->bv_entcount_dirty = TRUE;
+       /*
+        * NOTE(review): same coarse bitmap-index computation as in
+        * brt_vdev_addref() — verify the /8 factor if partial-block
+        * syncing is ever implemented.
+        */
+       idx = idx / BRT_BLOCKSIZE / 8;
+       BT_SET(brtvd->bv_bitmap, idx);
+
+#ifdef ZFS_DEBUG
+       brt_vdev_dump(brt);
+#endif
+}
+
+/*
+ * Persist a dirty BRT VDEV to the MOS in syncing context: rewrite the
+ * whole bv_entcount[] array if any counts changed, then refresh the
+ * brt_vdev_phys_t bonus buffer.
+ */
+static void
+brt_vdev_sync(brt_t *brt, brt_vdev_t *brtvd, dmu_tx_t *tx)
+{
+       dmu_buf_t *db;
+       brt_vdev_phys_t *bvphys;
+
+       ASSERT(brtvd->bv_meta_dirty);
+       ASSERT(brtvd->bv_mos_brtvdev != 0);
+       ASSERT(dmu_tx_is_syncing(tx));
+
+       VERIFY0(dmu_bonus_hold(brt->brt_mos, brtvd->bv_mos_brtvdev, FTAG, &db));
+
+       if (brtvd->bv_entcount_dirty) {
+               /*
+                * TODO: Walk brtvd->bv_bitmap and write only the dirty blocks.
+                */
+               dmu_write(brt->brt_mos, brtvd->bv_mos_brtvdev, 0,
+                   brtvd->bv_size * sizeof (brtvd->bv_entcount[0]),
+                   brtvd->bv_entcount, tx);
+               memset(brtvd->bv_bitmap, 0, BT_SIZEOFMAP(brtvd->bv_nblocks));
+               brtvd->bv_entcount_dirty = FALSE;
+       }
+
+       dmu_buf_will_dirty(db, tx);
+       bvphys = db->db_data;
+       bvphys->bvp_mos_entries = brtvd->bv_mos_entries;
+       bvphys->bvp_size = brtvd->bv_size;
+       /* Record the byte order the in-core array is kept in. */
+       if (brtvd->bv_need_byteswap) {
+               bvphys->bvp_byteorder = BRT_NON_NATIVE_BYTEORDER;
+       } else {
+               bvphys->bvp_byteorder = BRT_NATIVE_BYTEORDER;
+       }
+       bvphys->bvp_totalcount = brtvd->bv_totalcount;
+       bvphys->bvp_rangesize = brt->brt_rangesize;
+       bvphys->bvp_usedspace = brtvd->bv_usedspace;
+       bvphys->bvp_savedspace = brtvd->bv_savedspace;
+       dmu_buf_rele(db, FTAG);
+
+       brtvd->bv_meta_dirty = FALSE;
+}
+
+/*
+ * Size brt_vdevs[] for every top-level vdev and, when load is set,
+ * read each vdev's persistent BRT state from the MOS.  Falls back to
+ * the default range size when no on-disk BRT established one.
+ */
+static void
+brt_vdevs_alloc(brt_t *brt, boolean_t load)
+{
+       brt_vdev_t *brtvd;
+       uint64_t vdevid;
+
+       brt_wlock(brt);
+
+       brt_vdevs_expand(brt, brt->brt_spa->spa_root_vdev->vdev_children);
+
+       if (load) {
+               for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
+                       brtvd = &brt->brt_vdevs[vdevid];
+                       ASSERT(brtvd->bv_entcount == NULL);
+
+                       brt_vdev_load(brt, brtvd);
+               }
+       }
+
+       if (brt->brt_rangesize == 0) {
+               brt->brt_rangesize = BRT_RANGESIZE;
+       }
+
+       brt_unlock(brt);
+}
+
+/*
+ * Tear down the in-core state of every initiated BRT VDEV and free
+ * the brt_vdevs[] array itself.
+ */
+static void
+brt_vdevs_free(brt_t *brt)
+{
+       brt_vdev_t *brtvd;
+       uint64_t vdevid;
+
+       brt_wlock(brt);
+
+       for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
+               brtvd = &brt->brt_vdevs[vdevid];
+               if (brtvd->bv_initiated)
+                       brt_vdev_dealloc(brt, brtvd);
+       }
+       kmem_free(brt->brt_vdevs, sizeof (brt_vdev_t) * brt->brt_nvdevs);
+
+       brt_unlock(brt);
+}
+
+/*
+ * Initialize a BRT entry key from the block pointer's first DVA and
+ * report which vdev the block lives on.  The refcount starts at zero.
+ */
+static void
+brt_entry_fill(const blkptr_t *bp, brt_entry_t *bre, uint64_t *vdevidp)
+{
+       const dva_t *dva = &bp->blk_dva[0];
+
+       bre->bre_offset = DVA_GET_OFFSET(dva);
+       bre->bre_refcount = 0;
+       *vdevidp = DVA_GET_VDEV(dva);
+}
+
+/* AVL comparator: order BRT entries by block offset within the vdev. */
+static int
+brt_entry_compare(const void *x1, const void *x2)
+{
+       const brt_entry_t *bre1 = x1;
+       const brt_entry_t *bre2 = x2;
+
+       return (TREE_CMP(bre1->bre_offset, bre2->bre_offset));
+}
+
+/*
+ * Look up an entry's refcount in the on-disk ZAP.  Drops the BRT lock
+ * around the I/O and reacquires it as writer, so on return the caller
+ * must revalidate any brt_vdevs[] pointers.  Returns 0 with
+ * bre->bre_refcount filled in, or ENOENT.
+ */
+static int
+brt_entry_lookup(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre)
+{
+       uint64_t mos_entries, vdevid;
+       uint64_t one, physsize;
+       int error;
+
+       ASSERT(RW_LOCK_HELD(&brt->brt_lock));
+
+       if (!brt_vdev_lookup(brt, brtvd, bre))
+               return (SET_ERROR(ENOENT));
+
+       /*
+        * Remember the mos_entries object number and the vdev id.  Once
+        * we drop the BRT lock, brt_vdevs[] may be reallocated by
+        * brt_vdevs_expand(), making the brtvd pointer invalid, so we
+        * must not touch it again until the caller revalidates it.
+        */
+       mos_entries = brtvd->bv_mos_entries;
+       if (mos_entries == 0)
+               return (SET_ERROR(ENOENT));
+       vdevid = brtvd->bv_vdevid;
+
+       brt_unlock(brt);
+
+       error = zap_length_uint64(brt->brt_mos, mos_entries, &bre->bre_offset,
+           BRT_KEY_WORDS, &one, &physsize);
+       if (error == 0) {
+               ASSERT3U(one, ==, 1);
+               ASSERT3U(physsize, ==, sizeof (bre->bre_refcount));
+
+               error = zap_lookup_uint64(brt->brt_mos, mos_entries,
+                   &bre->bre_offset, BRT_KEY_WORDS, 1,
+                   sizeof (bre->bre_refcount), &bre->bre_refcount);
+               BRT_DEBUG("ZAP lookup: object=%llu vdev=%llu offset=%llu "
+                   "count=%llu error=%d", (u_longlong_t)mos_entries,
+                   (u_longlong_t)vdevid,
+                   (u_longlong_t)bre->bre_offset,
+                   error == 0 ? (u_longlong_t)bre->bre_refcount : 0, error);
+       }
+
+       brt_wlock(brt);
+
+       return (error);
+}
+
+/*
+ * Issue an asynchronous ZAP prefetch for the entry's on-disk record,
+ * if the vdev has an entries object.  Purely an optimization hint.
+ */
+static void
+brt_entry_prefetch(brt_t *brt, uint64_t vdevid, brt_entry_t *bre)
+{
+       brt_vdev_t *brtvd;
+       uint64_t mos_entries = 0;
+
+       /* Snapshot the object number under the lock, then drop it. */
+       brt_rlock(brt);
+       brtvd = brt_vdev(brt, vdevid);
+       if (brtvd != NULL)
+               mos_entries = brtvd->bv_mos_entries;
+       brt_unlock(brt);
+
+       if (mos_entries == 0)
+               return;
+
+       BRT_DEBUG("ZAP prefetch: object=%llu vdev=%llu offset=%llu",
+           (u_longlong_t)mos_entries, (u_longlong_t)vdevid,
+           (u_longlong_t)bre->bre_offset);
+       (void) zap_prefetch_uint64(brt->brt_mos, mos_entries,
+           (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS);
+}
+
+/*
+ * Write an entry's (non-zero) refcount to the on-disk ZAP, creating
+ * the record if needed.  Returns the zap_update_uint64() error.
+ */
+static int
+brt_entry_update(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
+{
+       int error;
+
+       ASSERT(RW_LOCK_HELD(&brt->brt_lock));
+       ASSERT(brtvd->bv_mos_entries != 0);
+       ASSERT(bre->bre_refcount > 0);
+
+       error = zap_update_uint64(brt->brt_mos, brtvd->bv_mos_entries,
+           (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, 1,
+           sizeof (bre->bre_refcount), &bre->bre_refcount, tx);
+       BRT_DEBUG("ZAP update: object=%llu vdev=%llu offset=%llu count=%llu "
+           "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
+           (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
+           (u_longlong_t)bre->bre_refcount, error);
+
+       return (error);
+}
+
+/*
+ * Remove an entry whose refcount dropped to zero from the on-disk ZAP.
+ * Returns the zap_remove_uint64() error.
+ */
+static int
+brt_entry_remove(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
+{
+       int error;
+
+       ASSERT(RW_LOCK_HELD(&brt->brt_lock));
+       ASSERT(brtvd->bv_mos_entries != 0);
+       ASSERT0(bre->bre_refcount);
+
+       error = zap_remove_uint64(brt->brt_mos, brtvd->bv_mos_entries,
+           (uint64_t *)&bre->bre_offset, BRT_KEY_WORDS, tx);
+       BRT_DEBUG("ZAP remove: object=%llu vdev=%llu offset=%llu count=%llu "
+           "error=%d", (u_longlong_t)brtvd->bv_mos_entries,
+           (u_longlong_t)brtvd->bv_vdevid, (u_longlong_t)bre->bre_offset,
+           (u_longlong_t)bre->bre_refcount, error);
+
+       return (error);
+}
+
+/*
+ * Return TRUE if we _can_ have BRT entry for this bp. It might be false
+ * positive, but gives us quick answer if we should look into BRT, which
+ * may require reads and thus will be more expensive.
+ */
+boolean_t
+brt_maybe_exists(spa_t *spa, const blkptr_t *bp)
+{
+       brt_t *brt = spa->spa_brt;
+       brt_vdev_t *brtvd;
+       brt_entry_t bre_search;
+       boolean_t mayexists = FALSE;
+       uint64_t vdevid;
+
+       brt_entry_fill(bp, &bre_search, &vdevid);
+
+       brt_rlock(brt);
+
+       brtvd = brt_vdev(brt, vdevid);
+       if (brtvd != NULL && brtvd->bv_initiated) {
+               /* Any in-core entry, or a non-zero range entcount. */
+               if (!avl_is_empty(&brtvd->bv_tree) ||
+                   brt_vdev_lookup(brt, brtvd, &bre_search)) {
+                       mayexists = TRUE;
+               }
+       }
+
+       brt_unlock(brt);
+
+       return (mayexists);
+}
+
+/* Space on disk "deduplicated" by cloning; 0 when no BRT exists. */
+uint64_t
+brt_get_dspace(spa_t *spa)
+{
+       const brt_t *brt = spa->spa_brt;
+
+       return (brt == NULL ? 0 : brt->brt_savedspace);
+}
+
+/* Space consumed by blocks that have BRT entries; 0 when no BRT exists. */
+uint64_t
+brt_get_used(spa_t *spa)
+{
+       const brt_t *brt = spa->spa_brt;
+
+       return (brt == NULL ? 0 : brt->brt_usedspace);
+}
+
+/* Space saved by block cloning; 0 when no BRT exists. */
+uint64_t
+brt_get_saved(spa_t *spa)
+{
+       const brt_t *brt = spa->spa_brt;
+
+       return (brt == NULL ? 0 : brt->brt_savedspace);
+}
+
+/*
+ * Return the cloning ratio in percent: (used + saved) / used.  A pool
+ * with no cloned blocks reports 100 (a 1.00x ratio).
+ */
+uint64_t
+brt_get_ratio(spa_t *spa)
+{
+       brt_t *brt = spa->spa_brt;
+
+       /*
+        * Tolerate a missing BRT, consistent with brt_get_dspace(),
+        * brt_get_used() and brt_get_saved() above.
+        */
+       if (brt == NULL || brt->brt_usedspace == 0)
+               return (100);
+
+       return ((brt->brt_usedspace + brt->brt_savedspace) * 100 /
+           brt->brt_usedspace);
+}
+
+/*
+ * kstat update callback: publish the current wmsum counter values.
+ * The kstat is read-only.
+ */
+static int
+brt_kstats_update(kstat_t *ksp, int rw)
+{
+       brt_stats_t *bs = ksp->ks_data;
+
+       if (rw == KSTAT_WRITE)
+               return (EACCES);
+
+       bs->brt_addref_entry_in_memory.value.ui64 =
+           wmsum_value(&brt_sums.brt_addref_entry_in_memory);
+       bs->brt_addref_entry_not_on_disk.value.ui64 =
+           wmsum_value(&brt_sums.brt_addref_entry_not_on_disk);
+       bs->brt_addref_entry_on_disk.value.ui64 =
+           wmsum_value(&brt_sums.brt_addref_entry_on_disk);
+       bs->brt_addref_entry_read_lost_race.value.ui64 =
+           wmsum_value(&brt_sums.brt_addref_entry_read_lost_race);
+       bs->brt_decref_entry_in_memory.value.ui64 =
+           wmsum_value(&brt_sums.brt_decref_entry_in_memory);
+       bs->brt_decref_entry_loaded_from_disk.value.ui64 =
+           wmsum_value(&brt_sums.brt_decref_entry_loaded_from_disk);
+       bs->brt_decref_entry_not_in_memory.value.ui64 =
+           wmsum_value(&brt_sums.brt_decref_entry_not_in_memory);
+       bs->brt_decref_entry_not_on_disk.value.ui64 =
+           wmsum_value(&brt_sums.brt_decref_entry_not_on_disk);
+       bs->brt_decref_entry_read_lost_race.value.ui64 =
+           wmsum_value(&brt_sums.brt_decref_entry_read_lost_race);
+       bs->brt_decref_entry_still_referenced.value.ui64 =
+           wmsum_value(&brt_sums.brt_decref_entry_still_referenced);
+       bs->brt_decref_free_data_later.value.ui64 =
+           wmsum_value(&brt_sums.brt_decref_free_data_later);
+       bs->brt_decref_free_data_now.value.ui64 =
+           wmsum_value(&brt_sums.brt_decref_free_data_now);
+       bs->brt_decref_no_entry.value.ui64 =
+           wmsum_value(&brt_sums.brt_decref_no_entry);
+
+       return (0);
+}
+
+/*
+ * Initialize the BRT wmsum counters and register the "brtstats" kstat.
+ * kstat_create() failure is tolerated (stats simply unavailable).
+ */
+static void
+brt_stat_init(void)
+{
+
+       wmsum_init(&brt_sums.brt_addref_entry_in_memory, 0);
+       wmsum_init(&brt_sums.brt_addref_entry_not_on_disk, 0);
+       wmsum_init(&brt_sums.brt_addref_entry_on_disk, 0);
+       wmsum_init(&brt_sums.brt_addref_entry_read_lost_race, 0);
+       wmsum_init(&brt_sums.brt_decref_entry_in_memory, 0);
+       wmsum_init(&brt_sums.brt_decref_entry_loaded_from_disk, 0);
+       wmsum_init(&brt_sums.brt_decref_entry_not_in_memory, 0);
+       wmsum_init(&brt_sums.brt_decref_entry_not_on_disk, 0);
+       wmsum_init(&brt_sums.brt_decref_entry_read_lost_race, 0);
+       wmsum_init(&brt_sums.brt_decref_entry_still_referenced, 0);
+       wmsum_init(&brt_sums.brt_decref_free_data_later, 0);
+       wmsum_init(&brt_sums.brt_decref_free_data_now, 0);
+       wmsum_init(&brt_sums.brt_decref_no_entry, 0);
+
+       brt_ksp = kstat_create("zfs", 0, "brtstats", "misc", KSTAT_TYPE_NAMED,
+           sizeof (brt_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+       if (brt_ksp != NULL) {
+               brt_ksp->ks_data = &brt_stats;
+               brt_ksp->ks_update = brt_kstats_update;
+               kstat_install(brt_ksp);
+       }
+}
+
+/* Unregister the kstat and tear down the wmsum counters. */
+static void
+brt_stat_fini(void)
+{
+       if (brt_ksp != NULL) {
+               kstat_delete(brt_ksp);
+               brt_ksp = NULL;
+       }
+
+       wmsum_fini(&brt_sums.brt_addref_entry_in_memory);
+       wmsum_fini(&brt_sums.brt_addref_entry_not_on_disk);
+       wmsum_fini(&brt_sums.brt_addref_entry_on_disk);
+       wmsum_fini(&brt_sums.brt_addref_entry_read_lost_race);
+       wmsum_fini(&brt_sums.brt_decref_entry_in_memory);
+       wmsum_fini(&brt_sums.brt_decref_entry_loaded_from_disk);
+       wmsum_fini(&brt_sums.brt_decref_entry_not_in_memory);
+       wmsum_fini(&brt_sums.brt_decref_entry_not_on_disk);
+       wmsum_fini(&brt_sums.brt_decref_entry_read_lost_race);
+       wmsum_fini(&brt_sums.brt_decref_entry_still_referenced);
+       wmsum_fini(&brt_sums.brt_decref_free_data_later);
+       wmsum_fini(&brt_sums.brt_decref_free_data_now);
+       wmsum_fini(&brt_sums.brt_decref_no_entry);
+}
+
+/* Module init: create the BRT entry kmem caches and the kstats. */
+void
+brt_init(void)
+{
+       brt_entry_cache = kmem_cache_create("brt_entry_cache",
+           sizeof (brt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+       brt_pending_entry_cache = kmem_cache_create("brt_pending_entry_cache",
+           sizeof (brt_pending_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+
+       brt_stat_init();
+}
+
+/* Module fini: destroy the kstats and the BRT entry kmem caches. */
+void
+brt_fini(void)
+{
+       brt_stat_fini();
+
+       kmem_cache_destroy(brt_entry_cache);
+       kmem_cache_destroy(brt_pending_entry_cache);
+}
+
+/* Allocate a BRT entry from the cache, copying key and refcount. */
+static brt_entry_t *
+brt_entry_alloc(const brt_entry_t *bre_init)
+{
+       brt_entry_t *bre = kmem_cache_alloc(brt_entry_cache, KM_SLEEP);
+
+       bre->bre_offset = bre_init->bre_offset;
+       bre->bre_refcount = bre_init->bre_refcount;
+       return (bre);
+}
+
+/* Return a BRT entry to its kmem cache. */
+static void
+brt_entry_free(brt_entry_t *bre)
+{
+
+       kmem_cache_free(brt_entry_cache, bre);
+}
+
+/*
+ * Add one reference to the block's BRT entry, creating the in-core
+ * entry (seeded from the on-disk ZAP when present) if needed.  Grows
+ * brt_vdevs[] when the block lives on a newly added vdev.
+ */
+static void
+brt_entry_addref(brt_t *brt, const blkptr_t *bp)
+{
+       brt_vdev_t *brtvd;
+       brt_entry_t *bre, *racebre;
+       brt_entry_t bre_search;
+       avl_index_t where;
+       uint64_t vdevid;
+       int error;
+
+       ASSERT(!RW_WRITE_HELD(&brt->brt_lock));
+
+       brt_entry_fill(bp, &bre_search, &vdevid);
+
+       brt_wlock(brt);
+
+       brtvd = brt_vdev(brt, vdevid);
+       if (brtvd == NULL) {
+               ASSERT3U(vdevid, >=, brt->brt_nvdevs);
+
+               /* New VDEV was added. */
+               brt_vdevs_expand(brt, vdevid + 1);
+               brtvd = brt_vdev(brt, vdevid);
+       }
+       ASSERT(brtvd != NULL);
+       if (!brtvd->bv_initiated)
+               brt_vdev_realloc(brt, brtvd);
+
+       bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
+       if (bre != NULL) {
+               BRTSTAT_BUMP(brt_addref_entry_in_memory);
+       } else {
+               /*
+                * brt_entry_lookup() may drop the BRT (read) lock and
+                * reacquire it (write).
+                */
+               error = brt_entry_lookup(brt, brtvd, &bre_search);
+               /* bre_search now contains correct bre_refcount */
+               ASSERT(error == 0 || error == ENOENT);
+               if (error == 0)
+                       BRTSTAT_BUMP(brt_addref_entry_on_disk);
+               else
+                       BRTSTAT_BUMP(brt_addref_entry_not_on_disk);
+               /*
+                * When the BRT lock was dropped, brt_vdevs[] may have been
+                * expanded and reallocated, we need to update brtvd's pointer.
+                */
+               brtvd = brt_vdev(brt, vdevid);
+               ASSERT(brtvd != NULL);
+
+               racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
+               if (racebre == NULL) {
+                       bre = brt_entry_alloc(&bre_search);
+                       ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+                       avl_insert(&brtvd->bv_tree, bre, where);
+                       brt->brt_nentries++;
+               } else {
+                       /*
+                        * The entry was added when the BRT lock was dropped in
+                        * brt_entry_lookup().
+                        */
+                       BRTSTAT_BUMP(brt_addref_entry_read_lost_race);
+                       bre = racebre;
+               }
+       }
+       bre->bre_refcount++;
+       brt_vdev_addref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));
+
+       brt_unlock(brt);
+}
+
+/*
+ * Drop one reference to the block.  Returns TRUE when the caller
+ * should free the data immediately: either the block was never cloned
+ * (no BRT entry exists) or its refcount was already zero.
+ */
+boolean_t
+brt_entry_decref(spa_t *spa, const blkptr_t *bp)
+{
+       brt_t *brt = spa->spa_brt;
+       brt_vdev_t *brtvd;
+       brt_entry_t *bre, *racebre;
+       brt_entry_t bre_search;
+       avl_index_t where;
+       uint64_t vdevid;
+       int error;
+
+       brt_entry_fill(bp, &bre_search, &vdevid);
+
+       brt_wlock(brt);
+
+       brtvd = brt_vdev(brt, vdevid);
+       ASSERT(brtvd != NULL);
+
+       bre = avl_find(&brtvd->bv_tree, &bre_search, NULL);
+       if (bre != NULL) {
+               BRTSTAT_BUMP(brt_decref_entry_in_memory);
+               goto out;
+       } else {
+               BRTSTAT_BUMP(brt_decref_entry_not_in_memory);
+       }
+
+       /*
+        * brt_entry_lookup() may drop the BRT lock and reacquire it.
+        */
+       error = brt_entry_lookup(brt, brtvd, &bre_search);
+       /* bre_search now contains correct bre_refcount */
+       ASSERT(error == 0 || error == ENOENT);
+       /*
+        * When the BRT lock was dropped, brt_vdevs[] may have been expanded
+        * and reallocated, we need to update brtvd's pointer.
+        */
+       brtvd = brt_vdev(brt, vdevid);
+       ASSERT(brtvd != NULL);
+
+       if (error == ENOENT) {
+               BRTSTAT_BUMP(brt_decref_entry_not_on_disk);
+               bre = NULL;
+               goto out;
+       }
+
+       racebre = avl_find(&brtvd->bv_tree, &bre_search, &where);
+       if (racebre != NULL) {
+               /*
+                * The entry was added when the BRT lock was dropped in
+                * brt_entry_lookup().
+                */
+               BRTSTAT_BUMP(brt_decref_entry_read_lost_race);
+               bre = racebre;
+               goto out;
+       }
+
+       BRTSTAT_BUMP(brt_decref_entry_loaded_from_disk);
+       bre = brt_entry_alloc(&bre_search);
+       ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+       avl_insert(&brtvd->bv_tree, bre, where);
+       brt->brt_nentries++;
+
+out:
+       if (bre == NULL) {
+               /*
+                * This is a free of a regular (not cloned) block.
+                */
+               brt_unlock(brt);
+               BRTSTAT_BUMP(brt_decref_no_entry);
+               return (B_TRUE);
+       }
+       if (bre->bre_refcount == 0) {
+               brt_unlock(brt);
+               BRTSTAT_BUMP(brt_decref_free_data_now);
+               return (B_TRUE);
+       }
+
+       ASSERT(bre->bre_refcount > 0);
+       bre->bre_refcount--;
+       if (bre->bre_refcount == 0)
+               BRTSTAT_BUMP(brt_decref_free_data_later);
+       else
+               BRTSTAT_BUMP(brt_decref_entry_still_referenced);
+       brt_vdev_decref(brt, brtvd, bre, bp_get_dsize(brt->brt_spa, bp));
+
+       brt_unlock(brt);
+
+       return (B_FALSE);
+}
+
+/*
+ * Prefetch the BRT entry that covers the given block pointer so it is
+ * likely cached by the time the syncing context needs it.  Does nothing
+ * when disabled via the zfs_brt_prefetch tunable.
+ */
+static void
+brt_prefetch(brt_t *brt, const blkptr_t *bp)
+{
+       brt_entry_t bre;
+       uint64_t vdevid;
+
+       ASSERT(bp != NULL);
+
+       if (!zfs_brt_prefetch)
+               return;
+
+       brt_entry_fill(bp, &bre, &vdevid);
+
+       brt_entry_prefetch(brt, vdevid, &bre);
+}
+
+/*
+ * AVL comparator for pending clone entries: order by the block's physical
+ * birth txg, then by the first DVA's vdev, then by the first DVA's offset.
+ */
+static int
+brt_pending_entry_compare(const void *x1, const void *x2)
+{
+       const brt_pending_entry_t *bpe1 = x1, *bpe2 = x2;
+       const blkptr_t *bp1 = &bpe1->bpe_bp, *bp2 = &bpe2->bpe_bp;
+       int cmp;
+
+       cmp = TREE_CMP(BP_PHYSICAL_BIRTH(bp1), BP_PHYSICAL_BIRTH(bp2));
+       if (cmp == 0) {
+               cmp = TREE_CMP(DVA_GET_VDEV(&bp1->blk_dva[0]),
+                   DVA_GET_VDEV(&bp2->blk_dva[0]));
+               if (cmp == 0) {
+                       cmp = TREE_CMP(DVA_GET_OFFSET(&bp1->blk_dva[0]),
+                           DVA_GET_OFFSET(&bp2->blk_dva[0]));
+               }
+       }
+
+       return (cmp);
+}
+
+/*
+ * Record that a clone of bp was created in the transaction group of tx.
+ * The reference is only queued in the per-txg pending tree here; it is
+ * turned into a real BRT/DDT reference in syncing context by
+ * brt_pending_apply().  Multiple clones of the same block within one txg
+ * share a single entry whose bpe_count is bumped.
+ */
+void
+brt_pending_add(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
+{
+       brt_t *brt;
+       avl_tree_t *pending_tree;
+       kmutex_t *pending_lock;
+       brt_pending_entry_t *bpe, *newbpe;
+       avl_index_t where;
+       uint64_t txg;
+
+       brt = spa->spa_brt;
+       txg = dmu_tx_get_txg(tx);
+       ASSERT3U(txg, !=, 0);
+       pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
+       pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
+
+       /* Preallocate; freed below if an entry for this bp already exists. */
+       newbpe = kmem_cache_alloc(brt_pending_entry_cache, KM_SLEEP);
+       newbpe->bpe_bp = *bp;
+       newbpe->bpe_count = 1;
+
+       mutex_enter(pending_lock);
+
+       bpe = avl_find(pending_tree, newbpe, &where);
+       if (bpe == NULL) {
+               avl_insert(pending_tree, newbpe, where);
+               newbpe = NULL;
+       } else {
+               bpe->bpe_count++;
+       }
+
+       mutex_exit(pending_lock);
+
+       if (newbpe != NULL) {
+               ASSERT(bpe != NULL);
+               ASSERT(bpe != newbpe);
+               kmem_cache_free(brt_pending_entry_cache, newbpe);
+       } else {
+               ASSERT(bpe == NULL);
+       }
+
+       /* Prefetch BRT entry, as we will need it in the syncing context. */
+       brt_prefetch(brt, bp);
+}
+
+/*
+ * Drop one pending clone reference for bp in the txg of tx, e.g. when a
+ * block cloned earlier in the same txg is freed again (see the
+ * dbuf_undirty() caller).  The entry is removed once its count reaches 0.
+ */
+void
+brt_pending_remove(spa_t *spa, const blkptr_t *bp, dmu_tx_t *tx)
+{
+       brt_t *brt;
+       avl_tree_t *pending_tree;
+       kmutex_t *pending_lock;
+       brt_pending_entry_t *bpe, bpe_search;
+       uint64_t txg;
+
+       brt = spa->spa_brt;
+       txg = dmu_tx_get_txg(tx);
+       ASSERT3U(txg, !=, 0);
+       pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
+       pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
+
+       bpe_search.bpe_bp = *bp;
+
+       mutex_enter(pending_lock);
+
+       bpe = avl_find(pending_tree, &bpe_search, NULL);
+       /* I believe we should always find bpe when this function is called. */
+       if (bpe != NULL) {
+               ASSERT(bpe->bpe_count > 0);
+
+               bpe->bpe_count--;
+               if (bpe->bpe_count == 0) {
+                       avl_remove(pending_tree, bpe);
+                       kmem_cache_free(brt_pending_entry_cache, bpe);
+               }
+       }
+
+       mutex_exit(pending_lock);
+}
+
+/*
+ * Called in syncing context: drain the pending tree for txg and turn each
+ * queued clone into a real reference — in the DDT when the block already
+ * carries the DEDUP bit, otherwise in the BRT.
+ */
+void
+brt_pending_apply(spa_t *spa, uint64_t txg)
+{
+       brt_t *brt;
+       brt_pending_entry_t *bpe;
+       avl_tree_t *pending_tree;
+       kmutex_t *pending_lock;
+       void *c;
+
+       ASSERT3U(txg, !=, 0);
+
+       brt = spa->spa_brt;
+       pending_tree = &brt->brt_pending_tree[txg & TXG_MASK];
+       pending_lock = &brt->brt_pending_lock[txg & TXG_MASK];
+
+       mutex_enter(pending_lock);
+
+       c = NULL;
+       while ((bpe = avl_destroy_nodes(pending_tree, &c)) != NULL) {
+               boolean_t added_to_ddt;
+
+               /* Release the pending lock while processing this entry. */
+               mutex_exit(pending_lock);
+
+               for (int i = 0; i < bpe->bpe_count; i++) {
+                       /*
+                        * If the block has DEDUP bit set, it means that it
+                        * already exists in the DEDUP table, so we can just
+                        * use that instead of creating new entry in
+                        * the BRT table.
+                        */
+                       if (BP_GET_DEDUP(&bpe->bpe_bp)) {
+                               added_to_ddt = ddt_addref(spa, &bpe->bpe_bp);
+                       } else {
+                               added_to_ddt = B_FALSE;
+                       }
+                       /*
+                        * Fall back to the BRT when the DDT reference could
+                        * not be taken (entry missing despite the D bit).
+                        */
+                       if (!added_to_ddt)
+                               brt_entry_addref(brt, &bpe->bpe_bp);
+               }
+
+               kmem_cache_free(brt_pending_entry_cache, bpe);
+               mutex_enter(pending_lock);
+       }
+
+       mutex_exit(pending_lock);
+}
+
+/*
+ * Write a single in-memory BRT entry to the per-vdev MOS object: remove it
+ * when its refcount has dropped to zero, otherwise store the updated
+ * refcount.
+ */
+static void
+brt_sync_entry(brt_t *brt, brt_vdev_t *brtvd, brt_entry_t *bre, dmu_tx_t *tx)
+{
+
+       ASSERT(RW_WRITE_HELD(&brt->brt_lock));
+       ASSERT(brtvd->bv_mos_entries != 0);
+
+       if (bre->bre_refcount == 0) {
+               int error;
+
+               error = brt_entry_remove(brt, brtvd, bre, tx);
+               ASSERT(error == 0 || error == ENOENT);
+               /*
+                * If error == ENOENT then zfs_clone_range() was done from a
+                * removed (but opened) file (open(), unlink()).
+                */
+               ASSERT(brt_entry_lookup(brt, brtvd, bre) == ENOENT);
+       } else {
+               VERIFY0(brt_entry_update(brt, brtvd, bre, tx));
+       }
+}
+
+/*
+ * Sync every dirty per-vdev BRT to disk: create the MOS objects on first
+ * use, flush all queued in-memory entries, and destroy a vdev's BRT once
+ * its total reference count drops to zero.
+ */
+static void
+brt_sync_table(brt_t *brt, dmu_tx_t *tx)
+{
+       brt_vdev_t *brtvd;
+       brt_entry_t *bre;
+       uint64_t vdevid;
+       void *c;
+
+       brt_wlock(brt);
+
+       for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) {
+               brtvd = &brt->brt_vdevs[vdevid];
+
+               if (!brtvd->bv_initiated)
+                       continue;
+
+               /* Nothing dirty on this vdev; its tree must be empty too. */
+               if (!brtvd->bv_meta_dirty) {
+                       ASSERT(!brtvd->bv_entcount_dirty);
+                       ASSERT0(avl_numnodes(&brtvd->bv_tree));
+                       continue;
+               }
+
+               ASSERT(!brtvd->bv_entcount_dirty ||
+                   avl_numnodes(&brtvd->bv_tree) != 0);
+
+               /* Lazily create the on-disk object on first sync. */
+               if (brtvd->bv_mos_brtvdev == 0)
+                       brt_vdev_create(brt, brtvd, tx);
+
+               c = NULL;
+               while ((bre = avl_destroy_nodes(&brtvd->bv_tree, &c)) != NULL) {
+                       brt_sync_entry(brt, brtvd, bre, tx);
+                       brt_entry_free(bre);
+                       ASSERT(brt->brt_nentries > 0);
+                       brt->brt_nentries--;
+               }
+
+               brt_vdev_sync(brt, brtvd, tx);
+
+               if (brtvd->bv_totalcount == 0)
+                       brt_vdev_destroy(brt, brtvd, tx);
+       }
+
+       ASSERT0(brt->brt_nentries);
+
+       brt_unlock(brt);
+}
+
+/*
+ * BRT sync entry point, called once per syncing txg.  Returns early when
+ * no in-memory entries changed in this txg.
+ */
+void
+brt_sync(spa_t *spa, uint64_t txg)
+{
+       dmu_tx_t *tx;
+       brt_t *brt;
+
+       ASSERT(spa_syncing_txg(spa) == txg);
+
+       brt = spa->spa_brt;
+       brt_rlock(brt);
+       if (brt->brt_nentries == 0) {
+               /* No changes. */
+               brt_unlock(brt);
+               return;
+       }
+       brt_unlock(brt);
+
+       tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
+
+       brt_sync_table(brt, tx);
+
+       dmu_tx_commit(tx);
+}
+
+/* Initialize the per-txg pending-clone trees and their locks. */
+static void
+brt_table_alloc(brt_t *brt)
+{
+
+       for (int i = 0; i < TXG_SIZE; i++) {
+               avl_create(&brt->brt_pending_tree[i],
+                   brt_pending_entry_compare,
+                   sizeof (brt_pending_entry_t),
+                   offsetof(brt_pending_entry_t, bpe_node));
+               mutex_init(&brt->brt_pending_lock[i], NULL, MUTEX_DEFAULT,
+                   NULL);
+       }
+}
+
+/* Destroy the per-txg pending-clone trees; they must be empty by now. */
+static void
+brt_table_free(brt_t *brt)
+{
+
+       for (int i = 0; i < TXG_SIZE; i++) {
+               ASSERT(avl_is_empty(&brt->brt_pending_tree[i]));
+
+               avl_destroy(&brt->brt_pending_tree[i]);
+               mutex_destroy(&brt->brt_pending_lock[i]);
+       }
+}
+
+/* Allocate and initialize an empty in-core BRT and attach it to the spa. */
+static void
+brt_alloc(spa_t *spa)
+{
+       brt_t *brt;
+
+       ASSERT(spa->spa_brt == NULL);
+
+       brt = kmem_zalloc(sizeof (*brt), KM_SLEEP);
+       rw_init(&brt->brt_lock, NULL, RW_DEFAULT, NULL);
+       brt->brt_spa = spa;
+       brt->brt_rangesize = 0;
+       brt->brt_nentries = 0;
+       brt->brt_vdevs = NULL;
+       brt->brt_nvdevs = 0;
+       brt_table_alloc(brt);
+
+       spa->spa_brt = brt;
+}
+
+/*
+ * Set up the in-core BRT for a newly created pool.  The B_FALSE argument
+ * presumably tells brt_vdevs_alloc() not to load on-disk state (cf. the
+ * B_TRUE used by brt_load()) -- confirm against brt_vdevs_alloc().
+ */
+void
+brt_create(spa_t *spa)
+{
+
+       brt_alloc(spa);
+       brt_vdevs_alloc(spa->spa_brt, B_FALSE);
+}
+
+/*
+ * Set up the in-core BRT when a pool is imported/loaded; always succeeds
+ * and returns 0.
+ */
+int
+brt_load(spa_t *spa)
+{
+
+       brt_alloc(spa);
+       brt_vdevs_alloc(spa->spa_brt, B_TRUE);
+
+       return (0);
+}
+
+/*
+ * Tear down the in-core BRT on pool export/unload.  Safe to call when no
+ * BRT was ever allocated.
+ */
+void
+brt_unload(spa_t *spa)
+{
+       brt_t *brt = spa->spa_brt;
+
+       if (brt == NULL)
+               return;
+
+       brt_vdevs_free(brt);
+       brt_table_free(brt);
+       rw_destroy(&brt->brt_lock);
+       kmem_free(brt, sizeof (*brt));
+       spa->spa_brt = NULL;
+}
+
+/* Module tunables exposed as zfs_brt_* parameters. */
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, prefetch, INT, ZMOD_RW,
+    "Enable prefetching of BRT entries");
+#ifdef ZFS_BRT_DEBUG
+ZFS_MODULE_PARAM(zfs_brt, zfs_brt_, debug, INT, ZMOD_RW, "BRT debug");
+#endif
+/* END CSTYLED */
index 191e5e043942115bce471bf84ff3b1a4d062949e..94c2ae9d736d55ca9b5366633873d065938ab817 100644 (file)
@@ -26,6 +26,7 @@
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright (c) 2019, Klara Inc.
  * Copyright (c) 2019, Allan Jude
+ * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
  */
 
 #include <sys/zfs_context.h>
@@ -49,6 +50,7 @@
 #include <sys/trace_zfs.h>
 #include <sys/callb.h>
 #include <sys/abd.h>
+#include <sys/brt.h>
 #include <sys/vdev.h>
 #include <cityhash.h>
 #include <sys/spa_impl.h>
@@ -1427,7 +1429,7 @@ dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
 }
 
 static void
-dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn)
+dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *dbbp)
 {
        blkptr_t *bps = db->db.db_data;
        uint32_t indbs = 1ULL << dn->dn_indblkshift;
@@ -1436,12 +1438,12 @@ dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn)
        for (int i = 0; i < n_bps; i++) {
                blkptr_t *bp = &bps[i];
 
-               ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, indbs);
-               BP_SET_LSIZE(bp, BP_GET_LEVEL(db->db_blkptr) == 1 ?
-                   dn->dn_datablksz : BP_GET_LSIZE(db->db_blkptr));
-               BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr));
-               BP_SET_LEVEL(bp, BP_GET_LEVEL(db->db_blkptr) - 1);
-               BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0);
+               ASSERT3U(BP_GET_LSIZE(dbbp), ==, indbs);
+               BP_SET_LSIZE(bp, BP_GET_LEVEL(dbbp) == 1 ?
+                   dn->dn_datablksz : BP_GET_LSIZE(dbbp));
+               BP_SET_TYPE(bp, BP_GET_TYPE(dbbp));
+               BP_SET_LEVEL(bp, BP_GET_LEVEL(dbbp) - 1);
+               BP_SET_BIRTH(bp, dbbp->blk_birth, 0);
        }
 }
 
@@ -1451,30 +1453,27 @@ dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn)
  * was taken, ENOENT if no action was taken.
  */
 static int
-dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn)
+dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
 {
        ASSERT(MUTEX_HELD(&db->db_mtx));
 
-       int is_hole = db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr);
+       int is_hole = bp == NULL || BP_IS_HOLE(bp);
        /*
         * For level 0 blocks only, if the above check fails:
         * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
         * processes the delete record and clears the bp while we are waiting
         * for the dn_mtx (resulting in a "no" from block_freed).
         */
-       if (!is_hole && db->db_level == 0) {
-               is_hole = dnode_block_freed(dn, db->db_blkid) ||
-                   BP_IS_HOLE(db->db_blkptr);
-       }
+       if (!is_hole && db->db_level == 0)
+               is_hole = dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(bp);
 
        if (is_hole) {
                dbuf_set_data(db, dbuf_alloc_arcbuf(db));
                memset(db->db.db_data, 0, db->db.db_size);
 
-               if (db->db_blkptr != NULL && db->db_level > 0 &&
-                   BP_IS_HOLE(db->db_blkptr) &&
-                   db->db_blkptr->blk_birth != 0) {
-                       dbuf_handle_indirect_hole(db, dn);
+               if (bp != NULL && db->db_level > 0 && BP_IS_HOLE(bp) &&
+                   bp->blk_birth != 0) {
+                       dbuf_handle_indirect_hole(db, dn, bp);
                }
                db->db_state = DB_CACHED;
                DTRACE_SET_STATE(db, "hole read satisfied");
@@ -1551,12 +1550,13 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
        zbookmark_phys_t zb;
        uint32_t aflags = ARC_FLAG_NOWAIT;
        int err, zio_flags;
+       blkptr_t bp, *bpp;
 
        DB_DNODE_ENTER(db);
        dn = DB_DNODE(db);
        ASSERT(!zfs_refcount_is_zero(&db->db_holds));
        ASSERT(MUTEX_HELD(&db->db_mtx));
-       ASSERT(db->db_state == DB_UNCACHED);
+       ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
        ASSERT(db->db_buf == NULL);
        ASSERT(db->db_parent == NULL ||
            RW_LOCK_HELD(&db->db_parent->db_rwlock));
@@ -1566,16 +1566,46 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
                goto early_unlock;
        }
 
-       err = dbuf_read_hole(db, dn);
+       if (db->db_state == DB_UNCACHED) {
+               if (db->db_blkptr == NULL) {
+                       bpp = NULL;
+               } else {
+                       bp = *db->db_blkptr;
+                       bpp = &bp;
+               }
+       } else {
+               struct dirty_leaf *dl;
+               dbuf_dirty_record_t *dr;
+
+               ASSERT3S(db->db_state, ==, DB_NOFILL);
+
+               dr = list_head(&db->db_dirty_records);
+               if (dr == NULL) {
+                       err = EIO;
+                       goto early_unlock;
+               } else {
+                       dl = &dr->dt.dl;
+                       if (!dl->dr_brtwrite) {
+                               err = EIO;
+                               goto early_unlock;
+                       }
+                       bp = dl->dr_overridden_by;
+                       bpp = &bp;
+               }
+       }
+
+       err = dbuf_read_hole(db, dn, bpp);
        if (err == 0)
                goto early_unlock;
 
+       ASSERT(bpp != NULL);
+
        /*
         * Any attempt to read a redacted block should result in an error. This
         * will never happen under normal conditions, but can be useful for
         * debugging purposes.
         */
-       if (BP_IS_REDACTED(db->db_blkptr)) {
+       if (BP_IS_REDACTED(bpp)) {
                ASSERT(dsl_dataset_feature_is_active(
                    db->db_objset->os_dsl_dataset,
                    SPA_FEATURE_REDACTED_DATASETS));
@@ -1590,7 +1620,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
         * All bps of an encrypted os should have the encryption bit set.
         * If this is not true it indicates tampering and we report an error.
         */
-       if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) {
+       if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) {
                spa_log_error(db->db_objset->os_spa, &zb);
                zfs_panic_recover("unencrypted block in encrypted "
                    "object set %llu", dmu_objset_id(db->db_objset));
@@ -1621,15 +1651,14 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
        if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr))
                zio_flags |= ZIO_FLAG_RAW;
        /*
-        * The zio layer will copy the provided blkptr later, but we need to
-        * do this now so that we can release the parent's rwlock. We have to
-        * do that now so that if dbuf_read_done is called synchronously (on
+        * The zio layer will copy the provided blkptr later, but we have our
+        * own copy so that we can release the parent's rwlock. We have to
+        * do that so that if dbuf_read_done is called synchronously (on
         * an l1 cache hit) we don't acquire the db_mtx while holding the
         * parent's rwlock, which would be a lock ordering violation.
         */
-       blkptr_t bp = *db->db_blkptr;
        dmu_buf_unlock_parent(db, dblt, tag);
-       (void) arc_read(zio, db->db_objset->os_spa, &bp,
+       (void) arc_read(zio, db->db_objset->os_spa, bpp,
            dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
            &aflags, &zb);
        return (err);
@@ -1731,9 +1760,6 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
         */
        ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 
-       if (db->db_state == DB_NOFILL)
-               return (SET_ERROR(EIO));
-
        DB_DNODE_ENTER(db);
        dn = DB_DNODE(db);
 
@@ -1780,13 +1806,13 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
                }
                DB_DNODE_EXIT(db);
                DBUF_STAT_BUMP(hash_hits);
-       } else if (db->db_state == DB_UNCACHED) {
+       } else if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL) {
                boolean_t need_wait = B_FALSE;
 
                db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
 
-               if (zio == NULL &&
-                   db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
+               if (zio == NULL && (db->db_state == DB_NOFILL ||
+                   (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
                        spa_t *spa = dn->dn_objset->os_spa;
                        zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
                        need_wait = B_TRUE;
@@ -1913,7 +1939,8 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
         * the buf thawed to save the effort of freezing &
         * immediately re-thawing it.
         */
-       arc_release(dr->dt.dl.dr_data, db);
+       if (!dr->dt.dl.dr_brtwrite)
+               arc_release(dr->dt.dl.dr_data, db);
 }
 
 /*
@@ -1996,6 +2023,11 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
                                    db->db_blkid > dn->dn_maxblkid)
                                        dn->dn_maxblkid = db->db_blkid;
                                dbuf_unoverride(dr);
+                               if (dr->dt.dl.dr_brtwrite) {
+                                       ASSERT(db->db.db_data == NULL);
+                                       mutex_exit(&db->db_mtx);
+                                       continue;
+                               }
                        } else {
                                /*
                                 * This dbuf is not dirty in the open context.
@@ -2285,7 +2317,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 
        dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
-       if (db->db_blkid != DMU_BONUS_BLKID) {
+       if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {
                dmu_objset_willuse_space(os, db->db.db_size, tx);
        }
 
@@ -2328,8 +2360,9 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
                    sizeof (dbuf_dirty_record_t),
                    offsetof(dbuf_dirty_record_t, dr_dirty_node));
        }
-       if (db->db_blkid != DMU_BONUS_BLKID)
+       if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {
                dr->dr_accounted = db->db.db_size;
+       }
        dr->dr_dbuf = db;
        dr->dr_txg = tx->tx_txg;
        list_insert_before(&db->db_dirty_records, dr_next, dr);
@@ -2489,6 +2522,7 @@ static boolean_t
 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
        uint64_t txg = tx->tx_txg;
+       boolean_t brtwrite;
 
        ASSERT(txg != 0);
 
@@ -2513,6 +2547,16 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
                return (B_FALSE);
        ASSERT(dr->dr_dbuf == db);
 
+       brtwrite = dr->dt.dl.dr_brtwrite;
+       if (brtwrite) {
+               /*
+                * We are freeing a block that we cloned in the same
+                * transaction group.
+                */
+               brt_pending_remove(dmu_objset_spa(db->db_objset),
+                   &dr->dt.dl.dr_overridden_by, tx);
+       }
+
        dnode_t *dn = dr->dr_dnode;
 
        dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
@@ -2542,7 +2586,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
                mutex_exit(&dn->dn_mtx);
        }
 
-       if (db->db_state != DB_NOFILL) {
+       if (db->db_state != DB_NOFILL && !brtwrite) {
                dbuf_unoverride(dr);
 
                ASSERT(db->db_buf != NULL);
@@ -2557,7 +2601,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
        db->db_dirtycnt -= 1;
 
        if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
-               ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
+               ASSERT(db->db_state == DB_NOFILL || brtwrite ||
+                   arc_released(db->db_buf));
                dbuf_destroy(db);
                return (B_TRUE);
        }
@@ -4748,8 +4793,10 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
                ASSERT(db->db_blkid != DMU_BONUS_BLKID);
                ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
                if (db->db_state != DB_NOFILL) {
-                       if (dr->dt.dl.dr_data != db->db_buf)
+                       if (dr->dt.dl.dr_data != NULL &&
+                           dr->dt.dl.dr_data != db->db_buf) {
                                arc_buf_destroy(dr->dt.dl.dr_data, db);
+                       }
                }
        } else {
                ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
@@ -5046,7 +5093,8 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
                mutex_enter(&db->db_mtx);
                dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
                zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
-                   dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
+                   dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite,
+                   dr->dt.dl.dr_brtwrite);
                mutex_exit(&db->db_mtx);
        } else if (db->db_state == DB_NOFILL) {
                ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
index 7880a899aeb1476aeefb5678aa77b58b4c785d7c..33fea0ba3d3c73989f848865d37f65104692f3b9 100644 (file)
@@ -22,6 +22,7 @@
 /*
  * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2022 by Pawel Jakub Dawidek
  */
 
 #include <sys/zfs_context.h>
@@ -1180,5 +1181,59 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde)
        return (SET_ERROR(ENOENT));
 }
 
+/*
+ * This function is used by Block Cloning (brt.c) to increase the reference
+ * counter for the DDT entry if the block is already in the DDT.
+ *
+ * Return false if the block, despite having the D bit set, is not present
+ * in the DDT. Currently this is not possible but might be in the future.
+ * See the comment below.
+ */
+boolean_t
+ddt_addref(spa_t *spa, const blkptr_t *bp)
+{
+       ddt_t *ddt;
+       ddt_entry_t *dde;
+       boolean_t result;
+
+       spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
+       ddt = ddt_select(spa, bp);
+       ddt_enter(ddt);
+
+       dde = ddt_lookup(ddt, bp, B_TRUE);
+       ASSERT(dde != NULL);
+
+       if (dde->dde_type < DDT_TYPES) {
+               ddt_phys_t *ddp;
+
+               ASSERT3S(dde->dde_class, <, DDT_CLASSES);
+
+               ddp = &dde->dde_phys[BP_GET_NDVAS(bp)];
+               if (ddp->ddp_refcnt == 0) {
+                       /* Not expected to happen; fill the empty phys. */
+                       ddt_phys_fill(ddp, bp);
+               }
+               ddt_phys_addref(ddp);
+               result = B_TRUE;
+       } else {
+               /*
+                * At the time of implementing this, if the block has the
+                * DEDUP flag set it must exist in the DEDUP table, but
+                * there are many advocates of the ability to remove
+                * entries from the DDT with refcnt=1. If that happens,
+                * we may have a block with the DEDUP bit set, but which
+                * doesn't have a corresponding entry in the DDT. Be ready.
+                */
+               ASSERT3S(dde->dde_class, ==, DDT_CLASSES);
+               ddt_remove(ddt, dde);
+               result = B_FALSE;
+       }
+
+       ddt_exit(ddt);
+       spa_config_exit(spa, SCL_ZIO, FTAG);
+
+       return (result);
+}
+
 ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW,
        "Enable prefetching dedup-ed blks");
index 9b8fc7e49b2dfe447dee148b389afce85fee4aa3..e6bade11c8593710599ef7211fb05155ffad9582 100644 (file)
@@ -29,6 +29,7 @@
  * Copyright (c) 2019, Klara Inc.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2022 Hewlett Packard Enterprise Development LP.
+ * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
  */
 
 #include <sys/dmu.h>
@@ -52,6 +53,7 @@
 #include <sys/sa.h>
 #include <sys/zfeature.h>
 #include <sys/abd.h>
+#include <sys/brt.h>
 #include <sys/trace_zfs.h>
 #include <sys/zfs_racct.h>
 #include <sys/zfs_rlock.h>
@@ -513,7 +515,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
        zio_t *zio = NULL;
        boolean_t missed = B_FALSE;
 
-       ASSERT(length <= DMU_MAX_ACCESS);
+       ASSERT(!read || length <= DMU_MAX_ACCESS);
 
        /*
         * Note: We directly notify the prefetch code of this read, so that
@@ -2165,6 +2167,155 @@ restart:
        return (err);
 }
 
+/*
+ * Collect the level-0 block pointers backing offset/length of the object
+ * into bps (capacity *nbpsp; the actual count is returned in *nbpsp).
+ *
+ * Returns EAGAIN when no stable BP exists yet (the block was dirtied or
+ * created in the current txg) and EINVAL when a block is metadata.  ESRCH
+ * from dmu_buf_hold_array() is remapped to ENXIO.
+ */
+int
+dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
+    dmu_tx_t *tx, blkptr_t *bps, size_t *nbpsp)
+{
+       dmu_buf_t **dbp, *dbuf;
+       dmu_buf_impl_t *db;
+       blkptr_t *bp;
+       int error, numbufs;
+
+       error = dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG,
+           &numbufs, &dbp);
+       if (error != 0) {
+               if (error == ESRCH) {
+                       error = SET_ERROR(ENXIO);
+               }
+               return (error);
+       }
+
+       ASSERT3U(numbufs, <=, *nbpsp);
+
+       for (int i = 0; i < numbufs; i++) {
+               dbuf = dbp[i];
+               db = (dmu_buf_impl_t *)dbuf;
+               bp = db->db_blkptr;
+
+               /*
+                * If the block is not on the disk yet, it has no BP assigned.
+                * There is not much we can do...
+                */
+               if (!list_is_empty(&db->db_dirty_records)) {
+                       dbuf_dirty_record_t *dr;
+
+                       dr = list_head(&db->db_dirty_records);
+                       if (dr->dt.dl.dr_brtwrite) {
+                               /*
+                                * This is very special case where we clone a
+                                * block and in the same transaction group we
+                                * read its BP (most likely to clone the clone).
+                                */
+                               bp = &dr->dt.dl.dr_overridden_by;
+                       } else {
+                               /*
+                                * The block was modified in the same
+                                * transaction group.
+                                */
+                               error = SET_ERROR(EAGAIN);
+                               goto out;
+                       }
+               }
+               if (bp == NULL) {
+                       /*
+                        * The block was created in this transaction group,
+                        * so it has no BP yet.
+                        */
+                       error = SET_ERROR(EAGAIN);
+                       goto out;
+               }
+               if (dmu_buf_is_dirty(dbuf, tx)) {
+                       error = SET_ERROR(EAGAIN);
+                       goto out;
+               }
+               /*
+                * Make sure we clone only data blocks.
+                */
+               if (BP_IS_METADATA(bp) && !BP_IS_HOLE(bp)) {
+                       error = SET_ERROR(EINVAL);
+                       goto out;
+               }
+
+               bps[i] = *bp;
+       }
+
+       *nbpsp = numbufs;
+out:
+       dmu_buf_rele_array(dbp, numbufs, FTAG);
+
+       return (error);
+}
+
+/*
+ * Apply the cloned block pointers bps to the object's dbufs covering
+ * offset/length: each dbuf is switched to NOFILL (if uncached) and its
+ * dirty record is overridden with the source BP.  Unless replaying the
+ * ZIL, or the BP is a hole or embedded, a pending BRT reference is queued
+ * for the syncing context.
+ */
+void
+dmu_brt_clone(objset_t *os, uint64_t object, uint64_t offset, uint64_t length,
+    dmu_tx_t *tx, const blkptr_t *bps, size_t nbps, boolean_t replay)
+{
+       spa_t *spa;
+       dmu_buf_t **dbp, *dbuf;
+       dmu_buf_impl_t *db;
+       struct dirty_leaf *dl;
+       dbuf_dirty_record_t *dr;
+       const blkptr_t *bp;
+       int numbufs;
+
+       spa = os->os_spa;
+
+       VERIFY0(dmu_buf_hold_array(os, object, offset, length, FALSE, FTAG,
+           &numbufs, &dbp));
+       ASSERT3U(nbps, ==, numbufs);
+
+       for (int i = 0; i < numbufs; i++) {
+               dbuf = dbp[i];
+               db = (dmu_buf_impl_t *)dbuf;
+               bp = &bps[i];
+
+               ASSERT0(db->db_level);
+               ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+               ASSERT(BP_IS_HOLE(bp) || dbuf->db_size == BP_GET_LSIZE(bp));
+
+               if (db->db_state == DB_UNCACHED) {
+                       /*
+                        * XXX-PJD: If the dbuf is already cached, calling
+                        * dmu_buf_will_not_fill() will panic on assertion
+                        * (db->db_buf == NULL) in dbuf_clear_data(),
+                        * which is called from dbuf_noread() in DB_NOFILL
+                        * case. I'm not 100% sure this is the right thing
+                        * to do, but it seems to work.
+                        */
+                       dmu_buf_will_not_fill(dbuf, tx);
+               }
+
+               dr = list_head(&db->db_dirty_records);
+               ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
+               dl = &dr->dt.dl;
+               dl->dr_overridden_by = *bp;
+               dl->dr_brtwrite = B_TRUE;
+
+               dl->dr_override_state = DR_OVERRIDDEN;
+               if (BP_IS_HOLE(bp)) {
+                       dl->dr_overridden_by.blk_birth = 0;
+                       dl->dr_overridden_by.blk_phys_birth = 0;
+               } else {
+                       /* Logical birth is this txg; keep the physical birth. */
+                       dl->dr_overridden_by.blk_birth = dr->dr_txg;
+                       dl->dr_overridden_by.blk_phys_birth =
+                           BP_PHYSICAL_BIRTH(bp);
+               }
+
+               /*
+                * When data is embedded in the BP there is no need to create
+                * a BRT entry as there is no data block. Just copy the BP as
+                * it contains the data.
+                * Also, when replaying ZIL we don't want to bump references
+                * in the BRT as it was already done during ZIL claim.
+                */
+               if (!replay && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
+                       brt_pending_add(spa, bp, tx);
+               }
+       }
+
+       dmu_buf_rele_array(dbp, numbufs, FTAG);
+}
+
 void
 __dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
 {
index 815e27a6c7f79c77d4354d62beff175fb48c96f4..1c5608c4541b6599ef33c4c4f1e31642fd714850 100644 (file)
@@ -349,7 +349,7 @@ dmu_tx_mark_netfree(dmu_tx_t *tx)
 }
 
 static void
-dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
+dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 {
        dmu_tx_t *tx = txh->txh_tx;
        dnode_t *dn = txh->txh_dnode;
@@ -357,15 +357,11 @@ dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
 
        ASSERT(tx->tx_txg == 0);
 
-       dmu_tx_count_dnode(txh);
-
        if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz)
                return;
        if (len == DMU_OBJECT_END)
                len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off;
 
-       dmu_tx_count_dnode(txh);
-
        /*
         * For i/o error checking, we read the first and last level-0
         * blocks if they are not aligned, and all the level-1 blocks.
@@ -445,8 +441,10 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
 
        txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
            object, THT_FREE, off, len);
-       if (txh != NULL)
-               (void) dmu_tx_hold_free_impl(txh, off, len);
+       if (txh != NULL) {
+               dmu_tx_count_dnode(txh);
+               dmu_tx_count_free(txh, off, len);
+       }
 }
 
 void
@@ -455,8 +453,35 @@ dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
        dmu_tx_hold_t *txh;
 
        txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len);
-       if (txh != NULL)
-               (void) dmu_tx_hold_free_impl(txh, off, len);
+       if (txh != NULL) {
+               dmu_tx_count_dnode(txh);
+               dmu_tx_count_free(txh, off, len);
+       }
+}
+
+static void
+dmu_tx_count_clone(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
+{
+
+       /*
+        * Reuse dmu_tx_count_free(), it does exactly what we need for clone.
+        */
+       dmu_tx_count_free(txh, off, len);
+}
+
+void
+dmu_tx_hold_clone_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
+{
+       dmu_tx_hold_t *txh;
+
+       ASSERT0(tx->tx_txg);
+       ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
+
+       txh = dmu_tx_hold_dnode_impl(tx, dn, THT_CLONE, off, len);
+       if (txh != NULL) {
+               dmu_tx_count_dnode(txh);
+               dmu_tx_count_clone(txh, off, len);
+       }
 }
 
 static void
@@ -667,6 +692,10 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
                        case THT_NEWOBJECT:
                                match_object = TRUE;
                                break;
+                       case THT_CLONE:
+                               if (blkid >= beginblk && blkid <= endblk)
+                                       match_offset = TRUE;
+                               break;
                        default:
                                cmn_err(CE_PANIC, "bad txh_type %d",
                                    txh->txh_type);
index 021cba68cd21fe463a42182fb4bd29e7ef7e2564..8e3fd126caa5c91a9449b8e59685e4deb7e161f7 100644 (file)
@@ -47,6 +47,7 @@
 #include <sys/vdev_impl.h>
 #include <sys/zil_impl.h>
 #include <sys/zio_checksum.h>
+#include <sys/brt.h>
 #include <sys/ddt.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
@@ -3499,11 +3500,12 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx)
                scn->scn_dedup_frees_this_txg = 0;
 
                /*
-                * Write out changes to the DDT that may be required as a
-                * result of the blocks freed.  This ensures that the DDT
-                * is clean when a scrub/resilver runs.
+                * Write out changes to the DDT and the BRT that may be required
+                * as a result of the blocks freed.  This ensures that the DDT
+                * and the BRT are clean when a scrub/resilver runs.
                 */
                ddt_sync(spa, tx->tx_txg);
+               brt_sync(spa, tx->tx_txg);
        }
        if (err != 0)
                return (err);
index 6be6fe11561a6ca00c6ba0c72eac35ae0d8dd421..98a302237df8c2f0e65e363754ebf72526e07297 100644 (file)
@@ -52,6 +52,7 @@
 #include <sys/dmu_tx.h>
 #include <sys/zap.h>
 #include <sys/zil.h>
+#include <sys/brt.h>
 #include <sys/ddt.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_removal.h>
@@ -341,6 +342,12 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
 
                spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
                    ddt_get_pool_dedup_ratio(spa), src);
+               spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONEUSED, NULL,
+                   brt_get_used(spa), src);
+               spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONESAVED, NULL,
+                   brt_get_saved(spa), src);
+               spa_prop_add_list(*nvp, ZPOOL_PROP_BCLONERATIO, NULL,
+                   brt_get_ratio(spa), src);
 
                spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
                    rvd->vdev_state, src);
@@ -1707,6 +1714,7 @@ spa_unload(spa_t *spa)
        }
 
        ddt_unload(spa);
+       brt_unload(spa);
        spa_unload_log_sm_metadata(spa);
 
        /*
@@ -4414,6 +4422,21 @@ spa_ld_load_dedup_tables(spa_t *spa)
        return (0);
 }
 
+static int
+spa_ld_load_brt(spa_t *spa)
+{
+       int error = 0;
+       vdev_t *rvd = spa->spa_root_vdev;
+
+       error = brt_load(spa);
+       if (error != 0) {
+               spa_load_failed(spa, "brt_load failed [error=%d]", error);
+               return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+       }
+
+       return (0);
+}
+
 static int
 spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport)
 {
@@ -4895,6 +4918,10 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
        if (error != 0)
                return (error);
 
+       error = spa_ld_load_brt(spa);
+       if (error != 0)
+               return (error);
+
        /*
         * Verify the logs now to make sure we don't have any unexpected errors
         * when we claim log blocks later.
@@ -5963,6 +5990,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
         * Create DDTs (dedup tables).
         */
        ddt_create(spa);
+       /*
+        * Create BRT table and BRT table object.
+        */
+       brt_create(spa);
 
        spa_update_dspace(spa);
 
@@ -9138,6 +9169,7 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
                            &spa->spa_deferred_bpobj, tx);
                }
 
+               brt_sync(spa, txg);
                ddt_sync(spa, txg);
                dsl_scan_sync(dp, tx);
                svr_sync(spa, tx);
@@ -9262,6 +9294,13 @@ spa_sync(spa_t *spa, uint64_t txg)
        spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
            ZIO_FLAG_CANFAIL);
 
+       /*
+        * Now that there can be no more cloning in this transaction group,
+        * but we are still before issuing frees, we can process pending BRT
+        * updates.
+        */
+       brt_pending_apply(spa, txg);
+
        /*
         * Lock out configuration changes.
         */
index 53763e915ca83b2a959bc5efde2ec0425179a178..8466fa80e1e3386628bf677cbb113477fc087044 100644 (file)
@@ -57,6 +57,7 @@
 #include <sys/fs/zfs.h>
 #include <sys/metaslab_impl.h>
 #include <sys/arc.h>
+#include <sys/brt.h>
 #include <sys/ddt.h>
 #include <sys/kstat.h>
 #include "zfs_prop.h"
@@ -1834,7 +1835,7 @@ void
 spa_update_dspace(spa_t *spa)
 {
        spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) +
-           ddt_get_dedup_dspace(spa);
+           ddt_get_dedup_dspace(spa) + brt_get_dspace(spa);
        if (spa->spa_nonallocating_dspace > 0) {
                /*
                 * Subtract the space provided by all non-allocating vdevs that
@@ -2410,6 +2411,7 @@ spa_init(spa_mode_t mode)
        unique_init();
        zfs_btree_init();
        metaslab_stat_init();
+       brt_init();
        ddt_init();
        zio_init();
        dmu_init();
@@ -2446,6 +2448,7 @@ spa_fini(void)
        dmu_fini();
        zio_fini();
        ddt_fini();
+       brt_fini();
        metaslab_stat_fini();
        zfs_btree_fini();
        unique_fini();
index 24ae0a00d92f57efd22dd4cffdb76ddfe1ebbdc1..9b859adc5551b47a9882103dbb5243e6ae2676a1 100644 (file)
@@ -23,7 +23,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Portions Copyright 2011 Martin Matuska
  * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
- * Portions Copyright 2012 Pawel Jakub Dawidek <pawel@dawidek.net>
+ * Copyright (c) 2012 Pawel Jakub Dawidek
  * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved.
  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2014, Joyent, Inc. All rights reserved.
index 77bf9140d52d013bd0bd5c4ceb5de58cd44b652e..d009c58d86448df4470c0707799b0b1e05a5e82f 100644 (file)
@@ -21,6 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2015, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2022 by Pawel Jakub Dawidek
  */
 
 
@@ -891,5 +892,56 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
        zil_itx_assign(zilog, itx, tx);
 }
 
+/*
+ * Handles TX_CLONE_RANGE transactions.
+ */
+void
+zfs_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp,
+    uint64_t off, uint64_t len, uint64_t blksz, const blkptr_t *bps,
+    size_t nbps)
+{
+       itx_t *itx;
+       lr_clone_range_t *lr;
+       uint64_t partlen, max_log_data;
+       size_t i, partnbps;
+
+       VERIFY(!zil_replaying(zilog, tx));
+
+       if (zp->z_unlinked)
+               return;
+
+       max_log_data = zil_max_log_data(zilog, sizeof (lr_clone_range_t));
+
+       while (nbps > 0) {
+               partnbps = MIN(nbps, max_log_data / sizeof (bps[0]));
+               partlen = 0;
+               for (i = 0; i < partnbps; i++) {
+                       partlen += BP_GET_LSIZE(&bps[i]);
+               }
+               partlen = MIN(partlen, len);
+
+               itx = zil_itx_create(txtype,
+                   sizeof (*lr) + sizeof (bps[0]) * partnbps);
+               lr = (lr_clone_range_t *)&itx->itx_lr;
+               lr->lr_foid = zp->z_id;
+               lr->lr_offset = off;
+               lr->lr_length = partlen;
+               lr->lr_blksz = blksz;
+               lr->lr_nbps = partnbps;
+               memcpy(lr->lr_bps, bps, sizeof (bps[0]) * partnbps);
+
+               itx->itx_sync = (zp->z_sync_cnt != 0);
+
+               zil_itx_assign(zilog, itx, tx);
+
+               bps += partnbps;
+               ASSERT3U(nbps, >=, partnbps);
+               nbps -= partnbps;
+               off += partlen;
+               ASSERT3U(len, >=, partlen);
+               len -= partlen;
+       }
+}
+
 ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, S64, ZMOD_RW,
        "Largest data block to write to zil");
index a5dc5c399b5da8064ff70798c39792e19cad03c2..9b351eefc04e5336b7f4e1647f01a2099d87c155 100644 (file)
@@ -20,8 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
- * All rights reserved.
+ * Copyright (c) 2011 Pawel Jakub Dawidek
  * Copyright (c) 2012, 2015, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
index 32be27a8ba6e3f71dfddf2b7be30f6416f51f0e1..04dfda56b3f1747ee637984b312da82c8009448c 100644 (file)
@@ -22,6 +22,7 @@
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 Cyril Plisko. All rights reserved.
  * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
  */
 
 #include <sys/types.h>
@@ -1162,6 +1163,34 @@ zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap)
        return (error);
 }
 
+static int
+zfs_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap)
+{
+       zfsvfs_t *zfsvfs = arg1;
+       lr_clone_range_t *lr = arg2;
+       znode_t *zp;
+       int error;
+
+       if (byteswap)
+               byteswap_uint64_array(lr, sizeof (*lr));
+
+       if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
+               /*
+                * Clones can be logged out of order, so don't be surprised if
+                * the file is gone - just return success.
+                */
+               if (error == ENOENT)
+                       error = 0;
+               return (error);
+       }
+
+       error = zfs_clone_range_replay(zp, lr->lr_offset, lr->lr_length,
+           lr->lr_blksz, lr->lr_bps, lr->lr_nbps);
+
+       zrele(zp);
+       return (error);
+}
+
 /*
  * Callback vectors for replaying records
  */
@@ -1190,4 +1219,5 @@ zil_replay_func_t *const zfs_replay_vector[TX_MAX_TYPE] = {
        zfs_replay_setsaxattr,  /* TX_SETSAXATTR */
        zfs_replay_rename_exchange,     /* TX_RENAME_EXCHANGE */
        zfs_replay_rename_whiteout,     /* TX_RENAME_WHITEOUT */
+       zfs_replay_clone_range, /* TX_CLONE_RANGE */
 };
index 10677d8d994792907d28d6985964bdf1b63763e3..db80be78389960f14426627092b96d1cb9dd6915 100644 (file)
@@ -24,6 +24,7 @@
  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
  */
 
 /* Portions Copyright 2007 Jeremy Teo */
@@ -50,6 +51,7 @@
 #include <sys/txg.h>
 #include <sys/dbuf.h>
 #include <sys/policy.h>
+#include <sys/zfeature.h>
 #include <sys/zfs_vnops.h>
 #include <sys/zfs_quota.h>
 #include <sys/zfs_vfsops.h>
@@ -501,7 +503,7 @@ zfs_write(znode_t *zp, zfs_uio_t *uio, int ioflag, cred_t *cr)
                lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
        }
 
-       if (zn_rlimit_fsize(zp, uio)) {
+       if (zn_rlimit_fsize_uio(zp, uio)) {
                zfs_rangelock_exit(lr);
                zfs_exit(zfsvfs, FTAG);
                return (SET_ERROR(EFBIG));
@@ -995,6 +997,467 @@ zfs_get_done(zgd_t *zgd, int error)
        kmem_free(zgd, sizeof (zgd_t));
 }
 
+static int
+zfs_enter_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag)
+{
+       int error;
+
+       /* Swap so that zfs_enter() is always called in a consistent order. */
+       if (zfsvfs1 > zfsvfs2) {
+               zfsvfs_t *tmpzfsvfs;
+
+               tmpzfsvfs = zfsvfs2;
+               zfsvfs2 = zfsvfs1;
+               zfsvfs1 = tmpzfsvfs;
+       }
+
+       error = zfs_enter(zfsvfs1, tag);
+       if (error != 0)
+               return (error);
+       if (zfsvfs1 != zfsvfs2) {
+               error = zfs_enter(zfsvfs2, tag);
+               if (error != 0) {
+                       zfs_exit(zfsvfs1, tag);
+                       return (error);
+               }
+       }
+
+       return (0);
+}
+
+static void
+zfs_exit_two(zfsvfs_t *zfsvfs1, zfsvfs_t *zfsvfs2, const char *tag)
+{
+
+       zfs_exit(zfsvfs1, tag);
+       if (zfsvfs1 != zfsvfs2)
+               zfs_exit(zfsvfs2, tag);
+}
+
+/*
+ * We split each clone request in chunks that can fit into a single ZIL
+ * log entry. Each ZIL log entry can fit 130816 bytes for a block cloning
+ * operation (see zil_max_log_data() and zfs_log_clone_range()). This gives
+ * us room for storing 1022 block pointers.
+ *
+ * On success, the function returns the number of bytes copied in *lenp.
+ * Note, it doesn't return how many bytes are left to be copied.
+ */
+int
+zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp,
+    uint64_t *outoffp, uint64_t *lenp, cred_t *cr)
+{
+       zfsvfs_t        *inzfsvfs, *outzfsvfs;
+       objset_t        *inos, *outos;
+       zfs_locked_range_t *inlr, *outlr;
+       dmu_buf_impl_t  *db;
+       dmu_tx_t        *tx;
+       zilog_t         *zilog;
+       uint64_t        inoff, outoff, len, done;
+       uint64_t        outsize, size;
+       int             error;
+       int             count = 0;
+       sa_bulk_attr_t  bulk[3];
+       uint64_t        mtime[2], ctime[2];
+       uint64_t        uid, gid, projid;
+       blkptr_t        *bps;
+       size_t          maxblocks, nbps;
+       uint_t          inblksz;
+       uint64_t        clear_setid_bits_txg = 0;
+
+       inoff = *inoffp;
+       outoff = *outoffp;
+       len = *lenp;
+       done = 0;
+
+       inzfsvfs = ZTOZSB(inzp);
+       outzfsvfs = ZTOZSB(outzp);
+       inos = inzfsvfs->z_os;
+       outos = outzfsvfs->z_os;
+
+       /*
+        * Both source and destination have to belong to the same storage pool.
+        * NOTE(review): zfs_exit_two() below runs before zfs_enter_two() is
+        * ever called on this path — confirm exiting without entering is safe.
+        */
+       if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) {
+               zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+               return (SET_ERROR(EXDEV));
+       }
+
+       /*
+        * We need to call zfs_enter() potentially on two different datasets,
+        * so we need a dedicated function for that.
+        */
+       error = zfs_enter_two(inzfsvfs, outzfsvfs, FTAG);
+       if (error != 0)
+               return (error);
+
+       ASSERT(!outzfsvfs->z_replay);
+
+       error = zfs_verify_zp(inzp);
+       if (error == 0)
+               error = zfs_verify_zp(outzp);
+       if (error != 0) {
+               zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+               return (error);
+       }
+
+       if (!spa_feature_is_enabled(dmu_objset_spa(outos),
+           SPA_FEATURE_BLOCK_CLONING)) {
+               zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+               return (SET_ERROR(EXDEV));
+       }
+
+       /*
+        * We don't copy the source file's flags; that's why we don't allow
+        * cloning of files that are in quarantine.
+        */
+       if (inzp->z_pflags & ZFS_AV_QUARANTINED) {
+               zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+               return (SET_ERROR(EACCES));
+       }
+
+       if (inoff >= inzp->z_size) {
+               *lenp = 0;
+               zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+               return (0);
+       }
+       if (len > inzp->z_size - inoff) {
+               len = inzp->z_size - inoff;
+       }
+       if (len == 0) {
+               *lenp = 0;
+               zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+               return (0);
+       }
+
+       /*
+        * Callers might not be able to detect properly that we are read-only,
+        * so check it explicitly here.
+        */
+       if (zfs_is_readonly(outzfsvfs)) {
+               zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+               return (SET_ERROR(EROFS));
+       }
+
+       /*
+        * If immutable or not appending then return EPERM.
+        * Intentionally allow ZFS_READONLY through here.
+        * See zfs_zaccess_common()
+        */
+       if ((outzp->z_pflags & ZFS_IMMUTABLE) != 0) {
+               zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+               return (SET_ERROR(EPERM));
+       }
+
+       /*
+        * No overlapping if we are cloning within the same file.
+        */
+       if (inzp == outzp) {
+               if (inoff < outoff + len && outoff < inoff + len) {
+                       zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+                       return (SET_ERROR(EINVAL));
+               }
+       }
+
+       /*
+        * Maintain predictable lock order.
+        */
+       if (inzp < outzp || (inzp == outzp && inoff < outoff)) {
+               inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
+                   RL_READER);
+               outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
+                   RL_WRITER);
+       } else {
+               outlr = zfs_rangelock_enter(&outzp->z_rangelock, outoff, len,
+                   RL_WRITER);
+               inlr = zfs_rangelock_enter(&inzp->z_rangelock, inoff, len,
+                   RL_READER);
+       }
+
+       inblksz = inzp->z_blksz;
+
+       /*
+        * We cannot clone into files with different block size.
+        */
+       if (inblksz != outzp->z_blksz && outzp->z_size > inblksz) {
+               error = SET_ERROR(EXDEV);
+               goto unlock;
+       }
+
+       /*
+        * Offsets and len must be at block boundaries.
+        */
+       if ((inoff % inblksz) != 0 || (outoff % inblksz) != 0) {
+               error = SET_ERROR(EXDEV);
+               goto unlock;
+       }
+       /*
+        * Length must be a multiple of blksz, except for the end of the file.
+        */
+       if ((len % inblksz) != 0 &&
+           (len < inzp->z_size - inoff || len < outzp->z_size - outoff)) {
+               error = SET_ERROR(EXDEV);
+               goto unlock;
+       }
+
+       error = zn_rlimit_fsize(outoff + len);
+       if (error != 0) {
+               goto unlock;
+       }
+
+       if (inoff >= MAXOFFSET_T || outoff >= MAXOFFSET_T) {
+               error = SET_ERROR(EFBIG);
+               goto unlock;
+       }
+
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(outzfsvfs), NULL,
+           &mtime, 16);
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(outzfsvfs), NULL,
+           &ctime, 16);
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(outzfsvfs), NULL,
+           &outzp->z_size, 8);
+
+       zilog = outzfsvfs->z_log;
+       maxblocks = zil_max_log_data(zilog, sizeof (lr_clone_range_t)) /
+           sizeof (bps[0]);
+
+       uid = KUID_TO_SUID(ZTOUID(outzp));
+       gid = KGID_TO_SGID(ZTOGID(outzp));
+       projid = outzp->z_projid;
+
+       bps = kmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP);
+
+       /*
+        * Clone the file in reasonable size chunks.  Each chunk is cloned
+        * in a separate transaction; this keeps the intent log records small
+        * and allows us to do more fine-grained space accounting.
+        */
+       while (len > 0) {
+               size = MIN(inblksz * maxblocks, len);
+
+               if (zfs_id_overblockquota(outzfsvfs, DMU_USERUSED_OBJECT,
+                   uid) ||
+                   zfs_id_overblockquota(outzfsvfs, DMU_GROUPUSED_OBJECT,
+                   gid) ||
+                   (projid != ZFS_DEFAULT_PROJID &&
+                   zfs_id_overblockquota(outzfsvfs, DMU_PROJECTUSED_OBJECT,
+                   projid))) {
+                       error = SET_ERROR(EDQUOT);
+                       break;
+               }
+
+               /*
+                * Start a transaction.
+                */
+               tx = dmu_tx_create(outos);
+
+               nbps = maxblocks;
+               error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, tx, bps,
+                   &nbps);
+               if (error != 0) {
+                       dmu_tx_abort(tx);
+                       /*
+                        * If we are trying to clone a block that was created
+                        * in the current transaction group, return an error,
+                        * so the caller can fallback to just copying the data.
+                        */
+                       if (error == EAGAIN) {
+                               error = SET_ERROR(EXDEV);
+                       }
+                       break;
+               }
+               /*
+                * Encrypted data is fine as long as it comes from the same
+                * dataset.
+                * TODO: We want to extend it in the future to allow cloning to
+                * datasets with the same keys, like clones or to be able to
+                * clone a file from a snapshot of an encrypted dataset into the
+                * dataset itself.
+                */
+               if (BP_IS_PROTECTED(&bps[0])) {
+                       if (inzfsvfs != outzfsvfs) {
+                               dmu_tx_abort(tx);
+                               error = SET_ERROR(EXDEV);
+                               break;
+                       }
+               }
+
+               dmu_tx_hold_sa(tx, outzp->z_sa_hdl, B_FALSE);
+               db = (dmu_buf_impl_t *)sa_get_db(outzp->z_sa_hdl);
+               DB_DNODE_ENTER(db);
+               dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), outoff, size);
+               DB_DNODE_EXIT(db);
+               zfs_sa_upgrade_txholds(tx, outzp);
+               error = dmu_tx_assign(tx, TXG_WAIT);
+               if (error != 0) {
+                       dmu_tx_abort(tx);
+                       break;
+               }
+
+               /*
+                * Copy source znode's block size. This only happens on the
+                * first iteration since zfs_rangelock_reduce() will shrink down
+                * lr_len to the appropriate size.
+                */
+               if (outlr->lr_length == UINT64_MAX) {
+                       zfs_grow_blocksize(outzp, inblksz, tx);
+                       /*
+                        * Round range lock up to the block boundary, so we
+                        * prevent appends until we are done.
+                        */
+                       zfs_rangelock_reduce(outlr, outoff,
+                           ((len - 1) / inblksz + 1) * inblksz);
+               }
+
+               dmu_brt_clone(outos, outzp->z_id, outoff, size, tx, bps, nbps,
+                   B_FALSE);
+
+               zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr,
+                   &clear_setid_bits_txg, tx);
+
+               zfs_tstamp_update_setup(outzp, CONTENT_MODIFIED, mtime, ctime);
+
+               /*
+                * Update the file size (zp_size) if it has changed;
+                * account for possible concurrent updates.
+                */
+               while ((outsize = outzp->z_size) < outoff + size) {
+                       (void) atomic_cas_64(&outzp->z_size, outsize,
+                           outoff + size);
+               }
+
+               error = sa_bulk_update(outzp->z_sa_hdl, bulk, count, tx);
+
+               zfs_log_clone_range(zilog, tx, TX_CLONE_RANGE, outzp, outoff,
+                   size, inblksz, bps, nbps);
+
+               dmu_tx_commit(tx);
+
+               if (error != 0)
+                       break;
+
+               inoff += size;
+               outoff += size;
+               len -= size;
+               done += size;
+       }
+
+       kmem_free(bps, sizeof (bps[0]) * maxblocks);
+       zfs_znode_update_vfs(outzp);
+
+unlock:
+       zfs_rangelock_exit(outlr);
+       zfs_rangelock_exit(inlr);
+
+       if (done > 0) {
+               /*
+                * If we have made at least partial progress, reset the error.
+                */
+               error = 0;
+
+               ZFS_ACCESSTIME_STAMP(inzfsvfs, inzp);
+
+               if (outos->os_sync == ZFS_SYNC_ALWAYS) {
+                       zil_commit(zilog, outzp->z_id);
+               }
+
+               *inoffp += done;
+               *outoffp += done;
+               *lenp = done;
+       }
+
+       zfs_exit_two(inzfsvfs, outzfsvfs, FTAG);
+
+       return (error);
+}
+
+/*
+ * Usual pattern would be to call zfs_clone_range() from
+ * zfs_replay_clone_range(), but we cannot do that, because when replaying we
+ * don't have the source znode available, which is why we need a dedicated
+ * replay function.
 */
+int
+zfs_clone_range_replay(znode_t *zp, uint64_t off, uint64_t len, uint64_t blksz,
+    const blkptr_t *bps, size_t nbps)
+{
+       zfsvfs_t        *zfsvfs;
+       dmu_buf_impl_t  *db;
+       dmu_tx_t        *tx;
+       int             error;
+       int             count = 0;
+       sa_bulk_attr_t  bulk[3];
+       uint64_t        mtime[2], ctime[2];
+
+       ASSERT3U(off, <, MAXOFFSET_T);
+       ASSERT3U(len, >, 0);
+       ASSERT3U(nbps, >, 0);
+
+       zfsvfs = ZTOZSB(zp);
+
+       ASSERT(spa_feature_is_enabled(dmu_objset_spa(zfsvfs->z_os),
+           SPA_FEATURE_BLOCK_CLONING));
+
+       if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
+               return (error);
+
+       ASSERT(zfsvfs->z_replay);
+       ASSERT(!zfs_is_readonly(zfsvfs));
+
+       if ((off % blksz) != 0) {
+               zfs_exit(zfsvfs, FTAG);
+               return (SET_ERROR(EINVAL));
+       }
+
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+       SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
+           &zp->z_size, 8);
+
+       /*
+        * Start a transaction.
+        */
+       tx = dmu_tx_create(zfsvfs->z_os);
+
+       dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+       db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl);
+       DB_DNODE_ENTER(db);
+       dmu_tx_hold_clone_by_dnode(tx, DB_DNODE(db), off, len);
+       DB_DNODE_EXIT(db);
+       zfs_sa_upgrade_txholds(tx, zp);
+       error = dmu_tx_assign(tx, TXG_WAIT);
+       if (error != 0) {
+               dmu_tx_abort(tx);
+               zfs_exit(zfsvfs, FTAG);
+               return (error);
+       }
+
+       if (zp->z_blksz < blksz)
+               zfs_grow_blocksize(zp, blksz, tx);
+
+       dmu_brt_clone(zfsvfs->z_os, zp->z_id, off, len, tx, bps, nbps, B_TRUE);
+
+       zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+
+       if (zp->z_size < off + len)
+               zp->z_size = off + len;
+
+       error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+
+       /*
+        * zil_replaying() not only checks if we are replaying the ZIL, but
+        * also updates the ZIL header to record replay progress.
+        */
+       VERIFY(zil_replaying(zfsvfs->z_log, tx));
+
+       dmu_tx_commit(tx);
+
+       zfs_znode_update_vfs(zp);
+
+       zfs_exit(zfsvfs, FTAG);
+
+       return (error);
+}
+
 EXPORT_SYMBOL(zfs_access);
 EXPORT_SYMBOL(zfs_fsync);
 EXPORT_SYMBOL(zfs_holey);
@@ -1002,6 +1465,8 @@ EXPORT_SYMBOL(zfs_read);
 EXPORT_SYMBOL(zfs_write);
 EXPORT_SYMBOL(zfs_getsecattr);
 EXPORT_SYMBOL(zfs_setsecattr);
+EXPORT_SYMBOL(zfs_clone_range);
+EXPORT_SYMBOL(zfs_clone_range_replay);
 
 ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW,
        "Bytes to read per chunk");
index fcf4e7357b00092b55dc4daf7b2c6ea86d9d22c4..fba1c19996123ff8fbc074db4ae5704053407ba6 100644 (file)
@@ -43,6 +43,7 @@
 #include <sys/metaslab.h>
 #include <sys/trace_zfs.h>
 #include <sys/abd.h>
+#include <sys/brt.h>
 #include <sys/wmsum.h>
 
 /*
@@ -578,14 +579,12 @@ zil_claim_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
 }
 
 static int
-zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
-    uint64_t first_txg)
+zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg)
 {
        lr_write_t *lr = (lr_write_t *)lrc;
        int error;
 
-       if (lrc->lrc_txtype != TX_WRITE)
-               return (0);
+       ASSERT(lrc->lrc_txtype == TX_WRITE);
 
        /*
         * If the block is not readable, don't claim it.  This can happen
@@ -604,6 +603,57 @@ zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
        return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg));
 }
 
+static int
+zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx)
+{
+       const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc;
+       const blkptr_t *bp;
+       spa_t *spa;
+       uint_t ii;
+
+       ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE);
+
+       if (tx == NULL) {
+               return (0);
+       }
+
+       /*
+        * XXX: Do we need to byteswap lr?
+        */
+
+       spa = zilog->zl_spa;
+
+       for (ii = 0; ii < lr->lr_nbps; ii++) {
+               bp = &lr->lr_bps[ii];
+
+               /*
+                * When data is embedded in the BP there is no need to create
+                * a BRT entry as there is no data block. Just copy the BP as
+                * it contains the data.
+                */
+               if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
+                       brt_pending_add(spa, bp, tx);
+               }
+       }
+
+       return (0);
+}
+
+static int
+zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
+    uint64_t first_txg)
+{
+
+       switch (lrc->lrc_txtype) {
+       case TX_WRITE:
+               return (zil_claim_write(zilog, lrc, tx, first_txg));
+       case TX_CLONE_RANGE:
+               return (zil_claim_clone_range(zilog, lrc, tx));
+       default:
+               return (0);
+       }
+}
+
 static int
 zil_free_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
     uint64_t claim_txg)
@@ -616,23 +666,70 @@ zil_free_log_block(zilog_t *zilog, const blkptr_t *bp, void *tx,
 }
 
 static int
-zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
-    uint64_t claim_txg)
+zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg)
 {
        lr_write_t *lr = (lr_write_t *)lrc;
        blkptr_t *bp = &lr->lr_blkptr;
 
+       ASSERT(lrc->lrc_txtype == TX_WRITE);
+
        /*
         * If we previously claimed it, we need to free it.
         */
-       if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE &&
-           bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
-           !BP_IS_HOLE(bp))
+       if (bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 &&
+           !BP_IS_HOLE(bp)) {
                zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp);
+       }
 
        return (0);
 }
 
+static int
+zil_free_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx)
+{
+       const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc;
+       const blkptr_t *bp;
+       spa_t *spa;
+       uint_t ii;
+
+       ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE);
+
+       if (tx == NULL) {
+               return (0);
+       }
+
+       spa = zilog->zl_spa;
+
+       for (ii = 0; ii < lr->lr_nbps; ii++) {
+               bp = &lr->lr_bps[ii];
+
+               if (!BP_IS_HOLE(bp)) {
+                       zio_free(spa, dmu_tx_get_txg(tx), bp);
+               }
+       }
+
+       return (0);
+}
+
+static int
+zil_free_log_record(zilog_t *zilog, const lr_t *lrc, void *tx,
+    uint64_t claim_txg)
+{
+
+       if (claim_txg == 0) {
+               return (0);
+       }
+
+       switch (lrc->lrc_txtype) {
+       case TX_WRITE:
+               return (zil_free_write(zilog, lrc, tx, claim_txg));
+       case TX_CLONE_RANGE:
+               return (zil_free_clone_range(zilog, lrc, tx));
+       default:
+               return (0);
+       }
+}
+
 static int
 zil_lwb_vdev_compare(const void *x1, const void *x2)
 {
@@ -1798,13 +1895,12 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb)
 }
 
 /*
- * Maximum amount of write data that can be put into single log block.
+ * Maximum amount of data that can be put into single log block.
  */
 uint64_t
-zil_max_log_data(zilog_t *zilog)
+zil_max_log_data(zilog_t *zilog, size_t hdrsize)
 {
-       return (zilog->zl_max_block_size -
-           sizeof (zil_chain_t) - sizeof (lr_write_t));
+       return (zilog->zl_max_block_size - sizeof (zil_chain_t) - hdrsize);
 }
 
 /*
@@ -1814,7 +1910,7 @@ zil_max_log_data(zilog_t *zilog)
 static inline uint64_t
 zil_max_waste_space(zilog_t *zilog)
 {
-       return (zil_max_log_data(zilog) / 8);
+       return (zil_max_log_data(zilog, sizeof (lr_write_t)) / 8);
 }
 
 /*
@@ -1887,7 +1983,7 @@ cont:
         * For WR_NEED_COPY optimize layout for minimal number of chunks.
         */
        lwb_sp = lwb->lwb_sz - lwb->lwb_nused;
-       max_log_data = zil_max_log_data(zilog);
+       max_log_data = zil_max_log_data(zilog, sizeof (lr_write_t));
        if (reclen > lwb_sp || (reclen + dlen > lwb_sp &&
            lwb_sp < zil_max_waste_space(zilog) &&
            (dlen % max_log_data == 0 ||
index d17ee60dcde13a2054da422c625e1e97a11cbd1d..1b1a1831f3331a628604d1ce5032dc52b755a60e 100644 (file)
@@ -41,6 +41,7 @@
 #include <sys/zio_checksum.h>
 #include <sys/dmu_objset.h>
 #include <sys/arc.h>
+#include <sys/brt.h>
 #include <sys/ddt.h>
 #include <sys/blkptr.h>
 #include <sys/zfeature.h>
@@ -1176,12 +1177,14 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data,
 }
 
 void
-zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
+zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite,
+    boolean_t brtwrite)
 {
        ASSERT(zio->io_type == ZIO_TYPE_WRITE);
        ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
        ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
        ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
+       ASSERT(!brtwrite || !nopwrite);
 
        /*
         * We must reset the io_prop to match the values that existed
@@ -1190,6 +1193,7 @@ zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
         */
        zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
        zio->io_prop.zp_nopwrite = nopwrite;
+       zio->io_prop.zp_brtwrite = brtwrite;
        zio->io_prop.zp_copies = copies;
        zio->io_bp_override = bp;
 }
@@ -1222,7 +1226,8 @@ zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
            BP_GET_DEDUP(bp) ||
            txg != spa->spa_syncing_txg ||
            (spa_sync_pass(spa) >= zfs_sync_pass_deferred_free &&
-           !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))) {
+           !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) ||
+           brt_maybe_exists(spa, bp)) {
                metaslab_check_free(spa, bp);
                bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
        } else {
@@ -1249,11 +1254,13 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
        arc_freed(spa, bp);
        dsl_scan_freed(spa, bp);
 
-       if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) {
+       if (BP_IS_GANG(bp) ||
+           BP_GET_DEDUP(bp) ||
+           brt_maybe_exists(spa, bp)) {
                /*
-                * GANG and DEDUP blocks can induce a read (for the gang block
-                * header, or the DDT), so issue them asynchronously so that
-                * this thread is not tied up.
+                * GANG, DEDUP and BRT blocks can induce a read (for the gang
+                * block header, the DDT or the BRT), so issue them
+                * asynchronously so that this thread is not tied up.
                 */
                enum zio_stage stage =
                    ZIO_FREE_PIPELINE | ZIO_STAGE_ISSUE_ASYNC;
@@ -1594,11 +1601,15 @@ zio_write_bp_init(zio_t *zio)
                zio_prop_t *zp = &zio->io_prop;
 
                ASSERT(bp->blk_birth != zio->io_txg);
-               ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
 
                *bp = *zio->io_bp_override;
                zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 
+               if (zp->zp_brtwrite)
+                       return (zio);
+
+               ASSERT(!BP_GET_DEDUP(zio->io_bp_override));
+
                if (BP_IS_EMBEDDED(bp))
                        return (zio);
 
@@ -3042,6 +3053,35 @@ zio_nop_write(zio_t *zio)
        return (zio);
 }
 
+/*
+ * ==========================================================================
+ * Block Reference Table
+ * ==========================================================================
+ */
+static zio_t *
+zio_brt_free(zio_t *zio)
+{
+       blkptr_t *bp;
+
+       bp = zio->io_bp;
+
+       if (BP_GET_LEVEL(bp) > 0 ||
+           BP_IS_METADATA(bp) ||
+           !brt_maybe_exists(zio->io_spa, bp)) {
+               return (zio);
+       }
+
+       if (!brt_entry_decref(zio->io_spa, bp)) {
+               /*
+                * This isn't the last reference, so we cannot free
+                * the data yet.
+                */
+               zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
+       }
+
+       return (zio);
+}
+
 /*
  * ==========================================================================
  * Dedup
@@ -4894,6 +4934,7 @@ static zio_pipe_stage_t *zio_pipeline[] = {
        zio_encrypt,
        zio_checksum_generate,
        zio_nop_write,
+       zio_brt_free,
        zio_ddt_read_start,
        zio_ddt_read_done,
        zio_ddt_write,
index 1511f763fd77f54d805489294d5514cc3bf2c8ac..06bc75c634a68bbdc58d26047ce771ce807fb41d 100644 (file)
@@ -482,6 +482,60 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap)
        return (error);
 }
 
+/*
+ * Replay a TX_CLONE_RANGE ZIL transaction that didn't get committed
+ * after a system failure.
+ *
+ * TODO: For now we drop block cloning transactions for ZVOLs as they are
+ *       unsupported, but we still need to inform BRT about that as we
+ *       claimed them during pool import.
+ *       This situation can occur when we try to import a pool from a ZFS
+ *       version supporting block cloning for ZVOLs into a system running
+ *       a ZFS version that doesn't support block cloning for ZVOLs.
+ */
+static int
+zvol_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap)
+{
+       char name[ZFS_MAX_DATASET_NAME_LEN];
+       zvol_state_t *zv = arg1;
+       objset_t *os = zv->zv_objset;
+       lr_clone_range_t *lr = arg2;
+       blkptr_t *bp;
+       dmu_tx_t *tx;
+       spa_t *spa;
+       uint_t ii;
+       int error;
+
+       dmu_objset_name(os, name);
+       cmn_err(CE_WARN, "ZFS dropping block cloning transaction for %s.",
+           name);
+
+       if (byteswap)
+               byteswap_uint64_array(lr, sizeof (*lr));
+
+       tx = dmu_tx_create(os);
+       error = dmu_tx_assign(tx, TXG_WAIT);
+       if (error) {
+               dmu_tx_abort(tx);
+               return (error);
+       }
+
+       spa = os->os_spa;
+
+       for (ii = 0; ii < lr->lr_nbps; ii++) {
+               bp = &lr->lr_bps[ii];
+
+               if (!BP_IS_HOLE(bp)) {
+                       zio_free(spa, dmu_tx_get_txg(tx), bp);
+               }
+       }
+
+       (void) zil_replaying(zv->zv_zilog, tx);
+       dmu_tx_commit(tx);
+
+       return (0);
+}
+
 static int
 zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap)
 {
@@ -516,6 +570,7 @@ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = {
        zvol_replay_err,        /* TX_SETSAXATTR */
        zvol_replay_err,        /* TX_RENAME_EXCHANGE */
        zvol_replay_err,        /* TX_RENAME_WHITEOUT */
+       zvol_replay_clone_range /* TX_CLONE_RANGE */
 };
 
 /*
index 99a70fa2c04d54f750dd7df3d749104d8b6684b2..097cd52e47776210ee9bf93a4aeb3cc3bc1ec368 100644 (file)
@@ -58,6 +58,9 @@ typeset -a properties=(
     "multihost"
     "autotrim"
     "compatibility"
+    "bcloneused"
+    "bclonesaved"
+    "bcloneratio"
     "feature@async_destroy"
     "feature@empty_bpobj"
     "feature@lz4_compress"
@@ -100,5 +103,6 @@ if is_linux || is_freebsd; then
            "feature@zilsaxattr"
            "feature@head_errlog"
            "feature@blake3"
+           "feature@block_cloning"
        )
 fi