Provide macros for setting and getting blkptr birth times
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index db1123d37d9812725e0811d49cf53d47d07ecf8e..4e190c131e1dde2f9977573071d597f11b9c1b28 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -26,6 +26,7 @@
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright (c) 2019, Klara Inc.
  * Copyright (c) 2019, Allan Jude
+ * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
  */
 
 #include <sys/zfs_context.h>
@@ -49,6 +50,7 @@
 #include <sys/trace_zfs.h>
 #include <sys/callb.h>
 #include <sys/abd.h>
+#include <sys/brt.h>
 #include <sys/vdev.h>
 #include <cityhash.h>
 #include <sys/spa_impl.h>
@@ -173,7 +175,6 @@ struct {
                continue;                                               \
 }
 
-static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
 static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
 static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags);
@@ -227,8 +228,8 @@ typedef struct dbuf_cache {
 dbuf_cache_t dbuf_caches[DB_CACHE_MAX];
 
 /* Size limits for the caches */
-static unsigned long dbuf_cache_max_bytes = ULONG_MAX;
-static unsigned long dbuf_metadata_cache_max_bytes = ULONG_MAX;
+static uint64_t dbuf_cache_max_bytes = UINT64_MAX;
+static uint64_t dbuf_metadata_cache_max_bytes = UINT64_MAX;
 
 /* Set the default sizes of the caches to log2 fraction of arc size */
 static uint_t dbuf_cache_shift = 5;
@@ -339,7 +340,8 @@ dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
        (dbuf)->db_blkid == (blkid))
 
 dmu_buf_impl_t *
-dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
+dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid,
+    uint64_t *hash_out)
 {
        dbuf_hash_table_t *h = &dbuf_hash_table;
        uint64_t hv;
@@ -361,6 +363,8 @@ dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
                }
        }
        mutex_exit(DBUF_HASH_MUTEX(h, idx));
+       if (hash_out != NULL)
+               *hash_out = hv;
        return (NULL);
 }
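
Note: the new hash_out parameter lets a caller reuse the hash computed during a failed lookup instead of rehashing. A minimal usage sketch, mirroring the dbuf_hold_impl() hunk later in this diff:

        uint64_t hv;
        dmu_buf_impl_t *db;

        /* On a miss, dbuf_find() reports the hash it computed ... */
        db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid, &hv);
        if (db == NULL) {
                /* ... so dbuf_create() can store it instead of rehashing. */
                db = dbuf_create(dn, level, blkid, parent, bp, hv);
        }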
 
@@ -395,13 +399,13 @@ dbuf_hash_insert(dmu_buf_impl_t *db)
        objset_t *os = db->db_objset;
        uint64_t obj = db->db.db_object;
        int level = db->db_level;
-       uint64_t blkid, hv, idx;
+       uint64_t blkid, idx;
        dmu_buf_impl_t *dbf;
        uint32_t i;
 
        blkid = db->db_blkid;
-       hv = dbuf_hash(os, obj, level, blkid);
-       idx = hv & h->hash_table_mask;
+       ASSERT3U(dbuf_hash(os, obj, level, blkid), ==, db->db_hash);
+       idx = db->db_hash & h->hash_table_mask;
 
        mutex_enter(DBUF_HASH_MUTEX(h, idx));
        for (dbf = h->hash_table[idx], i = 0; dbf != NULL;
@@ -475,12 +479,12 @@ static void
 dbuf_hash_remove(dmu_buf_impl_t *db)
 {
        dbuf_hash_table_t *h = &dbuf_hash_table;
-       uint64_t hv, idx;
+       uint64_t idx;
        dmu_buf_impl_t *dbf, **dbp;
 
-       hv = dbuf_hash(db->db_objset, db->db.db_object,
-           db->db_level, db->db_blkid);
-       idx = hv & h->hash_table_mask;
+       ASSERT3U(dbuf_hash(db->db_objset, db->db.db_object, db->db_level,
+           db->db_blkid), ==, db->db_hash);
+       idx = db->db_hash & h->hash_table_mask;
 
        /*
         * We mustn't hold db_mtx to maintain lock ordering:
@@ -565,6 +569,21 @@ dbuf_evict_user(dmu_buf_impl_t *db)
                *dbu->dbu_clear_on_evict_dbufp = NULL;
 #endif
 
+       if (db->db_caching_status != DB_NO_CACHE) {
+               /*
+                * This is a cached dbuf, so the size of the user data is
+                * included in its cached amount. We adjust it here because the
+                * user data has already been detached from the dbuf, and the
+                * sync functions are not supposed to touch it (the dbuf might
+                * not exist anymore by the time the sync functions run).
+                */
+               uint64_t size = dbu->dbu_size;
+               (void) zfs_refcount_remove_many(
+                   &dbuf_caches[db->db_caching_status].size, size, db);
+               if (db->db_caching_status == DB_DBUF_CACHE)
+                       DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
+       }
+
        /*
         * There are two eviction callbacks - one that we call synchronously
         * and one that we invoke via a taskq.  The async one is useful for
@@ -612,58 +631,58 @@ dbuf_is_metadata(dmu_buf_impl_t *db)
 boolean_t
 dbuf_is_l2cacheable(dmu_buf_impl_t *db)
 {
-       vdev_t *vd = NULL;
-       zfs_cache_type_t cache = db->db_objset->os_secondary_cache;
-       blkptr_t *bp = db->db_blkptr;
-
-       if (bp != NULL && !BP_IS_HOLE(bp)) {
+       if (db->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||
+           (db->db_objset->os_secondary_cache ==
+           ZFS_CACHE_METADATA && dbuf_is_metadata(db))) {
+               if (l2arc_exclude_special == 0)
+                       return (B_TRUE);
+
+               blkptr_t *bp = db->db_blkptr;
+               if (bp == NULL || BP_IS_HOLE(bp))
+                       return (B_FALSE);
                uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);
                vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev;
+               vdev_t *vd = NULL;
 
                if (vdev < rvd->vdev_children)
                        vd = rvd->vdev_child[vdev];
 
-               if (cache == ZFS_CACHE_ALL ||
-                   (dbuf_is_metadata(db) && cache == ZFS_CACHE_METADATA)) {
-                       if (vd == NULL)
-                               return (B_TRUE);
+               if (vd == NULL)
+                       return (B_TRUE);
 
-                       if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
-                           vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) ||
-                           l2arc_exclude_special == 0)
-                               return (B_TRUE);
-               }
+               if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
+                   vd->vdev_alloc_bias != VDEV_BIAS_DEDUP)
+                       return (B_TRUE);
        }
-
        return (B_FALSE);
 }
 
 static inline boolean_t
 dnode_level_is_l2cacheable(blkptr_t *bp, dnode_t *dn, int64_t level)
 {
-       vdev_t *vd = NULL;
-       zfs_cache_type_t cache = dn->dn_objset->os_secondary_cache;
-
-       if (bp != NULL && !BP_IS_HOLE(bp)) {
+       if (dn->dn_objset->os_secondary_cache == ZFS_CACHE_ALL ||
+           (dn->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA &&
+           (level > 0 ||
+           DMU_OT_IS_METADATA(dn->dn_handle->dnh_dnode->dn_type)))) {
+               if (l2arc_exclude_special == 0)
+                       return (B_TRUE);
+
+               if (bp == NULL || BP_IS_HOLE(bp))
+                       return (B_FALSE);
                uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);
                vdev_t *rvd = dn->dn_objset->os_spa->spa_root_vdev;
+               vdev_t *vd = NULL;
 
                if (vdev < rvd->vdev_children)
                        vd = rvd->vdev_child[vdev];
 
-               if (cache == ZFS_CACHE_ALL || ((level > 0 ||
-                   DMU_OT_IS_METADATA(dn->dn_handle->dnh_dnode->dn_type)) &&
-                   cache == ZFS_CACHE_METADATA)) {
-                       if (vd == NULL)
-                               return (B_TRUE);
+               if (vd == NULL)
+                       return (B_TRUE);
 
-                       if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
-                           vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) ||
-                           l2arc_exclude_special == 0)
-                               return (B_TRUE);
-               }
+               if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
+                   vd->vdev_alloc_bias != VDEV_BIAS_DEDUP)
+                       return (B_TRUE);
        }
-
        return (B_FALSE);
 }
 
@@ -766,12 +785,12 @@ dbuf_evict_one(void)
        if (db != NULL) {
                multilist_sublist_remove(mls, db);
                multilist_sublist_unlock(mls);
+               uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db);
                (void) zfs_refcount_remove_many(
-                   &dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db);
+                   &dbuf_caches[DB_DBUF_CACHE].size, size, db);
                DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
                DBUF_STAT_BUMPDOWN(cache_count);
-               DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
-                   db->db.db_size);
+               DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
                ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
                db->db_caching_status = DB_NO_CACHE;
                dbuf_destroy(db);
@@ -1152,7 +1171,7 @@ dbuf_verify(dmu_buf_impl_t *db)
        if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
            (db->db_buf == NULL || db->db_buf->b_data) &&
            db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
-           db->db_state != DB_FILL && !dn->dn_free_txg) {
+           db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg)) {
                /*
                 * If the blkptr isn't set but they have nonzero data,
                 * it had better be dirty, otherwise we'll lose that
@@ -1198,7 +1217,7 @@ dbuf_verify(dmu_buf_impl_t *db)
                                        ASSERT0(bp->blk_pad[1]);
                                        ASSERT(!BP_IS_EMBEDDED(bp));
                                        ASSERT(BP_IS_HOLE(bp));
-                                       ASSERT0(bp->blk_phys_birth);
+                                       ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
                                }
                        }
                }
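
Note: the BP_GET_PHYSICAL_BIRTH()/BP_GET_LOGICAL_BIRTH() conversions throughout this diff implement the commit subject above. A hedged sketch of what such accessors can look like, assuming the classic blk_birth/blk_phys_birth fields (illustrative only, not the actual OpenZFS definitions):

        /* A physical birth of 0 means "same as logical". */
        #define BP_GET_LOGICAL_BIRTH(bp)        ((bp)->blk_birth)
        #define BP_SET_LOGICAL_BIRTH(bp, x)     ((bp)->blk_birth = (x))
        #define BP_GET_PHYSICAL_BIRTH(bp)       \
                ((bp)->blk_phys_birth != 0 ?    \
                (bp)->blk_phys_birth : (bp)->blk_birth)

Funneling every access through macros lets the birth-time representation change without touching call sites like the ones in this file.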
@@ -1424,7 +1443,7 @@ dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
 }
 
 static void
-dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn)
+dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *dbbp)
 {
        blkptr_t *bps = db->db.db_data;
        uint32_t indbs = 1ULL << dn->dn_indblkshift;
@@ -1433,12 +1452,12 @@ dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn)
        for (int i = 0; i < n_bps; i++) {
                blkptr_t *bp = &bps[i];
 
-               ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, indbs);
-               BP_SET_LSIZE(bp, BP_GET_LEVEL(db->db_blkptr) == 1 ?
-                   dn->dn_datablksz : BP_GET_LSIZE(db->db_blkptr));
-               BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr));
-               BP_SET_LEVEL(bp, BP_GET_LEVEL(db->db_blkptr) - 1);
-               BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0);
+               ASSERT3U(BP_GET_LSIZE(dbbp), ==, indbs);
+               BP_SET_LSIZE(bp, BP_GET_LEVEL(dbbp) == 1 ?
+                   dn->dn_datablksz : BP_GET_LSIZE(dbbp));
+               BP_SET_TYPE(bp, BP_GET_TYPE(dbbp));
+               BP_SET_LEVEL(bp, BP_GET_LEVEL(dbbp) - 1);
+               BP_SET_BIRTH(bp, BP_GET_LOGICAL_BIRTH(dbbp), 0);
        }
 }
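
Note: a worked instance of the loop above, using this function's variables. For a level-2 indirect hole with logical birth txg T, each synthesized child BP is itself a hole, one level down, carrying the same birth:

        blkptr_t child;

        memset(&child, 0, sizeof (child));        /* still a hole */
        BP_SET_LSIZE(&child, BP_GET_LSIZE(dbbp)); /* level > 1: parent lsize */
        BP_SET_TYPE(&child, BP_GET_TYPE(dbbp));
        BP_SET_LEVEL(&child, BP_GET_LEVEL(dbbp) - 1);   /* 2 - 1 = 1 */
        BP_SET_BIRTH(&child, BP_GET_LOGICAL_BIRTH(dbbp), 0);   /* T, phys 0 */

This is what lets a read of a never-written indirect block report consistent child block pointers.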
 
@@ -1448,30 +1467,27 @@ dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn)
  * was taken, ENOENT if no action was taken.
  */
 static int
-dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn)
+dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
 {
        ASSERT(MUTEX_HELD(&db->db_mtx));
 
-       int is_hole = db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr);
+       int is_hole = bp == NULL || BP_IS_HOLE(bp);
        /*
         * For level 0 blocks only, if the above check fails:
         * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
         * processes the delete record and clears the bp while we are waiting
         * for the dn_mtx (resulting in a "no" from block_freed).
         */
-       if (!is_hole && db->db_level == 0) {
-               is_hole = dnode_block_freed(dn, db->db_blkid) ||
-                   BP_IS_HOLE(db->db_blkptr);
-       }
+       if (!is_hole && db->db_level == 0)
+               is_hole = dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(bp);
 
        if (is_hole) {
                dbuf_set_data(db, dbuf_alloc_arcbuf(db));
                memset(db->db.db_data, 0, db->db.db_size);
 
-               if (db->db_blkptr != NULL && db->db_level > 0 &&
-                   BP_IS_HOLE(db->db_blkptr) &&
-                   db->db_blkptr->blk_birth != 0) {
-                       dbuf_handle_indirect_hole(db, dn);
+               if (bp != NULL && db->db_level > 0 && BP_IS_HOLE(bp) &&
+                   BP_GET_LOGICAL_BIRTH(bp) != 0) {
+                       dbuf_handle_indirect_hole(db, dn, bp);
                }
                db->db_state = DB_CACHED;
                DTRACE_SET_STATE(db, "hole read satisfied");
@@ -1503,8 +1519,8 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
 
        ASSERT(MUTEX_HELD(&db->db_mtx));
 
-       if (!os->os_encrypted || os->os_raw_receive ||
-           (flags & DB_RF_NO_DECRYPT) != 0)
+       if ((flags & DB_RF_NO_DECRYPT) != 0 ||
+           !os->os_encrypted || os->os_raw_receive)
                return (0);
 
        DB_DNODE_ENTER(db);
@@ -1548,13 +1564,13 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
        zbookmark_phys_t zb;
        uint32_t aflags = ARC_FLAG_NOWAIT;
        int err, zio_flags;
+       blkptr_t bp, *bpp;
 
-       err = zio_flags = 0;
        DB_DNODE_ENTER(db);
        dn = DB_DNODE(db);
        ASSERT(!zfs_refcount_is_zero(&db->db_holds));
        ASSERT(MUTEX_HELD(&db->db_mtx));
-       ASSERT(db->db_state == DB_UNCACHED);
+       ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
        ASSERT(db->db_buf == NULL);
        ASSERT(db->db_parent == NULL ||
            RW_LOCK_HELD(&db->db_parent->db_rwlock));
@@ -1564,16 +1580,44 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
                goto early_unlock;
        }
 
-       err = dbuf_read_hole(db, dn);
+       if (db->db_state == DB_UNCACHED) {
+               if (db->db_blkptr == NULL) {
+                       bpp = NULL;
+               } else {
+                       bp = *db->db_blkptr;
+                       bpp = &bp;
+               }
+       } else {
+               dbuf_dirty_record_t *dr;
+
+               ASSERT3S(db->db_state, ==, DB_NOFILL);
+
+               /*
+                * Block cloning: If we have a pending block clone,
+                * we don't want to read the underlying block, but the content
+                * of the block being cloned, so we have the most recent data.
+                */
+               dr = list_head(&db->db_dirty_records);
+               if (dr == NULL || !dr->dt.dl.dr_brtwrite) {
+                       err = EIO;
+                       goto early_unlock;
+               }
+               bp = dr->dt.dl.dr_overridden_by;
+               bpp = &bp;
+       }
+
+       err = dbuf_read_hole(db, dn, bpp);
        if (err == 0)
                goto early_unlock;
 
+       ASSERT(bpp != NULL);
+
        /*
         * Any attempt to read a redacted block should result in an error. This
         * will never happen under normal conditions, but can be useful for
         * debugging purposes.
         */
-       if (BP_IS_REDACTED(db->db_blkptr)) {
+       if (BP_IS_REDACTED(bpp)) {
                ASSERT(dsl_dataset_feature_is_active(
                    db->db_objset->os_dsl_dataset,
                    SPA_FEATURE_REDACTED_DATASETS));
@@ -1588,10 +1632,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
         * All bps of an encrypted os should have the encryption bit set.
         * If this is not true it indicates tampering and we report an error.
         */
-       if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) {
-               spa_log_error(db->db_objset->os_spa, &zb);
-               zfs_panic_recover("unencrypted block in encrypted "
-                   "object set %llu", dmu_objset_id(db->db_objset));
+       if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) {
+               spa_log_error(db->db_objset->os_spa, &zb,
+                   BP_GET_LOGICAL_BIRTH(bpp));
                err = SET_ERROR(EIO);
                goto early_unlock;
        }
@@ -1606,7 +1649,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
        DTRACE_SET_STATE(db, "read issued");
        mutex_exit(&db->db_mtx);
 
-       if (dbuf_is_l2cacheable(db))
+       if (!DBUF_IS_CACHEABLE(db))
+               aflags |= ARC_FLAG_UNCACHED;
+       else if (dbuf_is_l2cacheable(db))
                aflags |= ARC_FLAG_L2CACHE;
 
        dbuf_add_ref(db, NULL);
@@ -1617,15 +1662,14 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
        if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr))
                zio_flags |= ZIO_FLAG_RAW;
        /*
-        * The zio layer will copy the provided blkptr later, but we need to
-        * do this now so that we can release the parent's rwlock. We have to
-        * do that now so that if dbuf_read_done is called synchronously (on
+        * The zio layer will copy the provided blkptr later, but we have our
+        * own copy so that we can release the parent's rwlock. We have to
+        * do that so that if dbuf_read_done is called synchronously (on
         * an l1 cache hit) we don't acquire the db_mtx while holding the
         * parent's rwlock, which would be a lock ordering violation.
         */
-       blkptr_t bp = *db->db_blkptr;
        dmu_buf_unlock_parent(db, dblt, tag);
-       (void) arc_read(zio, db->db_objset->os_spa, &bp,
+       (void) arc_read(zio, db->db_objset->os_spa, bpp,
            dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
            &aflags, &zb);
        return (err);
@@ -1727,20 +1771,18 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
         */
        ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 
-       if (db->db_state == DB_NOFILL)
-               return (SET_ERROR(EIO));
-
        DB_DNODE_ENTER(db);
        dn = DB_DNODE(db);
 
        prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
-           (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
-           DBUF_IS_CACHEABLE(db);
+           (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL;
 
        mutex_enter(&db->db_mtx);
+       if (flags & DB_RF_PARTIAL_FIRST)
+               db->db_partial_read = B_TRUE;
+       else if (!(flags & DB_RF_PARTIAL_MORE))
+               db->db_partial_read = B_FALSE;
        if (db->db_state == DB_CACHED) {
-               spa_t *spa = dn->dn_objset->os_spa;
-
                /*
                 * Ensure that this block's dnode has been decrypted if
                 * the caller has requested decrypted data.
@@ -1759,6 +1801,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
                    (arc_is_encrypted(db->db_buf) ||
                    arc_is_unauthenticated(db->db_buf) ||
                    arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
+                       spa_t *spa = dn->dn_objset->os_spa;
                        zbookmark_phys_t zb;
 
                        SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
@@ -1774,14 +1817,14 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
                }
                DB_DNODE_EXIT(db);
                DBUF_STAT_BUMP(hash_hits);
-       } else if (db->db_state == DB_UNCACHED) {
-               spa_t *spa = dn->dn_objset->os_spa;
+       } else if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL) {
                boolean_t need_wait = B_FALSE;
 
                db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
 
-               if (zio == NULL &&
-                   db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
+               if (zio == NULL && (db->db_state == DB_NOFILL ||
+                   (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
+                       spa_t *spa = dn->dn_objset->os_spa;
                        zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
                        need_wait = B_TRUE;
                }
@@ -1895,8 +1938,13 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
        if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
                zio_free(db->db_objset->os_spa, txg, bp);
 
+       if (dr->dt.dl.dr_brtwrite) {
+               ASSERT0P(dr->dt.dl.dr_data);
+               dr->dt.dl.dr_data = db->db_buf;
+       }
        dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
        dr->dt.dl.dr_nopwrite = B_FALSE;
+       dr->dt.dl.dr_brtwrite = B_FALSE;
        dr->dt.dl.dr_has_raw_params = B_FALSE;
 
        /*
@@ -1907,7 +1955,8 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
         * the buf thawed to save the effort of freezing &
         * immediately re-thawing it.
         */
-       arc_release(dr->dt.dl.dr_data, db);
+       if (dr->dt.dl.dr_data)
+               arc_release(dr->dt.dl.dr_data, db);
 }
 
 /*
@@ -2126,7 +2175,8 @@ dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
         * Otherwise the buffer contents could be inconsistent between the
         * dbuf and the lightweight dirty record.
         */
-       ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid));
+       ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid,
+           NULL));
 
        mutex_enter(&dn->dn_mtx);
        int txgoff = tx->tx_txg & TXG_MASK;
@@ -2278,7 +2328,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 
        dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
-       if (db->db_blkid != DMU_BONUS_BLKID) {
+       if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {
                dmu_objset_willuse_space(os, db->db.db_size, tx);
        }
 
@@ -2321,8 +2371,9 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
                    sizeof (dbuf_dirty_record_t),
                    offsetof(dbuf_dirty_record_t, dr_dirty_node));
        }
-       if (db->db_blkid != DMU_BONUS_BLKID)
+       if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {
                dr->dr_accounted = db->db.db_size;
+       }
        dr->dr_dbuf = db;
        dr->dr_txg = tx->tx_txg;
        list_insert_before(&db->db_dirty_records, dr_next, dr);
@@ -2478,10 +2529,11 @@ dbuf_undirty_bonus(dbuf_dirty_record_t *dr)
  * Undirty a buffer in the transaction group referenced by the given
  * transaction.  Return whether this evicted the dbuf.
  */
-static boolean_t
+boolean_t
 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
        uint64_t txg = tx->tx_txg;
+       boolean_t brtwrite;
 
        ASSERT(txg != 0);
 
@@ -2506,6 +2558,16 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
                return (B_FALSE);
        ASSERT(dr->dr_dbuf == db);
 
+       brtwrite = dr->dt.dl.dr_brtwrite;
+       if (brtwrite) {
+               /*
+                * We are freeing a block that we cloned in the same
+                * transaction group.
+                */
+               brt_pending_remove(dmu_objset_spa(db->db_objset),
+                   &dr->dt.dl.dr_overridden_by, tx);
+       }
+
        dnode_t *dn = dr->dr_dnode;
 
        dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
@@ -2535,7 +2597,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
                mutex_exit(&dn->dn_mtx);
        }
 
-       if (db->db_state != DB_NOFILL) {
+       if (db->db_state != DB_NOFILL && !brtwrite) {
                dbuf_unoverride(dr);
 
                ASSERT(db->db_buf != NULL);
@@ -2550,7 +2612,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
        db->db_dirtycnt -= 1;
 
        if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
-               ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
+               ASSERT(db->db_state == DB_NOFILL || brtwrite ||
+                   arc_released(db->db_buf));
                dbuf_destroy(db);
                return (B_TRUE);
        }
@@ -2562,6 +2625,7 @@ static void
 dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
 {
        dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+       boolean_t undirty = B_FALSE;
 
        ASSERT(tx->tx_txg != 0);
        ASSERT(!zfs_refcount_is_zero(&db->db_holds));
@@ -2574,7 +2638,7 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
         */
        mutex_enter(&db->db_mtx);
 
-       if (db->db_state == DB_CACHED) {
+       if (db->db_state == DB_CACHED || db->db_state == DB_NOFILL) {
                dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
                /*
                 * It's possible that it is already dirty but not cached,
@@ -2582,10 +2646,21 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
                 * go through dmu_buf_will_dirty().
                 */
                if (dr != NULL) {
-                       /* This dbuf is already dirty and cached. */
-                       dbuf_redirty(dr);
-                       mutex_exit(&db->db_mtx);
-                       return;
+                       if (dr->dt.dl.dr_brtwrite) {
+                               /*
+                                * Block cloning: If we are dirtying a cloned
+                                * block, we cannot simply redirty it, because
+                                * this dr has no data associated with it.
+                                * We will go through a full undirtying below,
+                                * before dirtying it again.
+                                */
+                               undirty = B_TRUE;
+                       } else {
+                               /* This dbuf is already dirty and cached. */
+                               dbuf_redirty(dr);
+                               mutex_exit(&db->db_mtx);
+                               return;
+                       }
                }
        }
        mutex_exit(&db->db_mtx);
@@ -2594,7 +2669,20 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
        if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
                flags |= DB_RF_HAVESTRUCT;
        DB_DNODE_EXIT(db);
+
+       /*
+        * Block cloning: Do the dbuf_read() before undirtying the dbuf, as we
+        * want to make sure dbuf_read() will read the pending cloned block and
+        * not the underlying block that is being replaced. dbuf_undirty() will
+        * do dbuf_unoverride(), so we will end up with the cloned block's
+        * content, without the overridden BP.
+        */
        (void) dbuf_read(db, NULL, flags);
+       if (undirty) {
+               mutex_enter(&db->db_mtx);
+               VERIFY(!dbuf_undirty(db, tx));
+               mutex_exit(&db->db_mtx);
+       }
        (void) dbuf_dirty(db, tx);
 }
 
@@ -2617,18 +2705,52 @@ dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
        return (dr != NULL);
 }
 
+void
+dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+       /*
+        * Block cloning: We are going to clone into this block, so undirty
+        * modifications done to this block so far in this txg. This includes
+        * writes and clones into this block.
+        */
+       mutex_enter(&db->db_mtx);
+       DBUF_VERIFY(db);
+       VERIFY(!dbuf_undirty(db, tx));
+       ASSERT0P(dbuf_find_dirty_eq(db, tx->tx_txg));
+       if (db->db_buf != NULL) {
+               arc_buf_destroy(db->db_buf, db);
+               db->db_buf = NULL;
+               dbuf_clear_data(db);
+       }
+
+       db->db_state = DB_NOFILL;
+       DTRACE_SET_STATE(db, "allocating NOFILL buffer for clone");
+
+       DBUF_VERIFY(db);
+       mutex_exit(&db->db_mtx);
+
+       dbuf_noread(db);
+       (void) dbuf_dirty(db, tx);
+}
+
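
Note: for context, a hedged sketch of how a block-clone caller (approximately what dmu_brt_clone() in dmu.c does; only the fields this diff uses are shown) drives this entry point:

        dmu_buf_will_clone(dbuf, tx);           /* undirty, switch to NOFILL */

        mutex_enter(&db->db_mtx);
        dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
        dr->dt.dl.dr_overridden_by = *bp;       /* BP of the clone source */
        dr->dt.dl.dr_brtwrite = B_TRUE;
        dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
        mutex_exit(&db->db_mtx);

        brt_pending_add(spa, bp, tx);           /* account the new BRT ref */

If the clone is undone in the same txg, dbuf_undirty() reverses the BRT accounting with brt_pending_remove(), as the @@ -2506 hunk above shows.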
 void
 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
        dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
+       mutex_enter(&db->db_mtx);
        db->db_state = DB_NOFILL;
        DTRACE_SET_STATE(db, "allocating NOFILL buffer");
-       dmu_buf_will_fill(db_fake, tx);
+       mutex_exit(&db->db_mtx);
+
+       dbuf_noread(db);
+       (void) dbuf_dirty(db, tx);
 }
 
 void
-dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
+dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
 {
        dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
@@ -2640,6 +2762,25 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
        ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
            dmu_tx_private_ok(tx));
 
+       mutex_enter(&db->db_mtx);
+       if (db->db_state == DB_NOFILL) {
+               /*
+                * Block cloning: We will be completely overwriting a block
+                * cloned in this transaction group, so let's undirty the
+                * pending clone and mark the block as uncached. This will be
+                * as if the clone was never done.  But if the fill can fail
+                * we need a way to fall back to the cloned data.
+                */
+               if (canfail && dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) {
+                       mutex_exit(&db->db_mtx);
+                       dmu_buf_will_dirty(db_fake, tx);
+                       return;
+               }
+               VERIFY(!dbuf_undirty(db, tx));
+               db->db_state = DB_UNCACHED;
+       }
+       mutex_exit(&db->db_mtx);
+
        dbuf_noread(db);
        (void) dbuf_dirty(db, tx);
 }
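
Note: a hedged usage sketch of the new canfail contract (the copy helper is hypothetical): a fill whose data copy can fault is bracketed so that a failure rolls the dbuf back, preserving a pending clone instead of destroying it:

        dmu_buf_will_fill(dbuf, tx, B_TRUE);    /* B_TRUE: fill may fail */
        error = copy_in(src, dbuf->db_data, size);  /* hypothetical copy */
        if (dmu_buf_fill_done(dbuf, tx, error != 0)) {
                /* Failed fill was rolled back; retry or return the error. */
        }

dmu_buf_fill_done() returns B_TRUE only when a failure was actually honored; the freed-in-flight path in the next hunk deliberately clears it.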
@@ -2687,39 +2828,49 @@ dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx)
        dbuf_dirty_record_t *dr;
 
        dr = list_head(&db->db_dirty_records);
+       ASSERT3P(dr, !=, NULL);
        ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
        dl = &dr->dt.dl;
        dl->dr_overridden_by = *bp;
        dl->dr_override_state = DR_OVERRIDDEN;
-       dl->dr_overridden_by.blk_birth = dr->dr_txg;
+       BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);
 }
 
-void
-dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx)
+boolean_t
+dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)
 {
        (void) tx;
        dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
-       dbuf_states_t old_state;
        mutex_enter(&db->db_mtx);
        DBUF_VERIFY(db);
 
-       old_state = db->db_state;
-       db->db_state = DB_CACHED;
-       if (old_state == DB_FILL) {
+       if (db->db_state == DB_FILL) {
                if (db->db_level == 0 && db->db_freed_in_flight) {
                        ASSERT(db->db_blkid != DMU_BONUS_BLKID);
                        /* we were freed while filling */
                        /* XXX dbuf_undirty? */
                        memset(db->db.db_data, 0, db->db.db_size);
                        db->db_freed_in_flight = FALSE;
+                       db->db_state = DB_CACHED;
                        DTRACE_SET_STATE(db,
                            "fill done handling freed in flight");
+                       failed = B_FALSE;
+               } else if (failed) {
+                       VERIFY(!dbuf_undirty(db, tx));
+                       db->db_buf = NULL;
+                       dbuf_clear_data(db);
+                       DTRACE_SET_STATE(db, "fill failed");
                } else {
+                       db->db_state = DB_CACHED;
                        DTRACE_SET_STATE(db, "fill done");
                }
                cv_broadcast(&db->db_changed);
+       } else {
+               db->db_state = DB_CACHED;
+               failed = B_FALSE;
        }
        mutex_exit(&db->db_mtx);
+       return (failed);
 }
 
 void
@@ -2748,6 +2899,7 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
        dmu_buf_will_not_fill(dbuf, tx);
 
        dr = list_head(&db->db_dirty_records);
+       ASSERT3P(dr, !=, NULL);
        ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
        dl = &dr->dt.dl;
        encode_embedded_bp_compressed(&dl->dr_overridden_by,
@@ -2758,7 +2910,7 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
        BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
 
        dl->dr_override_state = DR_OVERRIDDEN;
-       dl->dr_overridden_by.blk_birth = dr->dr_txg;
+       BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);
 }
 
 void
@@ -2809,7 +2961,8 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
        while (db->db_state == DB_READ || db->db_state == DB_FILL)
                cv_wait(&db->db_changed, &db->db_mtx);
 
-       ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
+       ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED ||
+           db->db_state == DB_NOFILL);
 
        if (db->db_state == DB_CACHED &&
            zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
@@ -2846,6 +2999,15 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
                        arc_buf_destroy(db->db_buf, db);
                }
                db->db_buf = NULL;
+       } else if (db->db_state == DB_NOFILL) {
+               /*
+                * We will be completely replacing the cloned block.  In case
+                * it was cloned in this transaction group, let's undirty the
+                * pending clone and mark the block as uncached. This will be
+                * as if the clone was never done.
+                */
+               VERIFY(!dbuf_undirty(db, tx));
+               db->db_state = DB_UNCACHED;
        }
        ASSERT(db->db_buf == NULL);
        dbuf_set_data(db, buf);
@@ -2853,7 +3015,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
        DTRACE_SET_STATE(db, "filling assigned arcbuf");
        mutex_exit(&db->db_mtx);
        (void) dbuf_dirty(db, tx);
-       dmu_buf_fill_done(&db->db, tx);
+       dmu_buf_fill_done(&db->db, tx, B_FALSE);
 }
 
 void
@@ -2889,6 +3051,8 @@ dbuf_destroy(dmu_buf_impl_t *db)
                    db->db_caching_status == DB_DBUF_METADATA_CACHE);
 
                multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
+
+               ASSERT0(dmu_buf_user_size(&db->db));
                (void) zfs_refcount_remove_many(
                    &dbuf_caches[db->db_caching_status].size,
                    db->db.db_size, db);
@@ -3073,7 +3237,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
 
 static dmu_buf_impl_t *
 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
-    dmu_buf_impl_t *parent, blkptr_t *blkptr)
+    dmu_buf_impl_t *parent, blkptr_t *blkptr, uint64_t hash)
 {
        objset_t *os = dn->dn_objset;
        dmu_buf_impl_t *db, *odb;
@@ -3094,6 +3258,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
        db->db_dnode_handle = dn->dn_handle;
        db->db_parent = parent;
        db->db_blkptr = blkptr;
+       db->db_hash = hash;
 
        db->db_user = NULL;
        db->db_user_immediate_evict = FALSE;
@@ -3177,6 +3342,7 @@ dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid,
 
        err = dbuf_findbp(dn, level, blkid, B_FALSE, &dbp, &bp2);
        if (err == 0) {
+               ASSERT3P(bp2, !=, NULL);
                *bp = *bp2;
                if (dbp != NULL)
                        dbuf_rele(dbp, NULL);
@@ -3322,10 +3488,10 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
        blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
            P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
 
-       ASSERT(!BP_IS_REDACTED(bp) ||
+       ASSERT(!BP_IS_REDACTED(bp) || (dpa->dpa_dnode &&
            dsl_dataset_feature_is_active(
            dpa->dpa_dnode->dn_objset->os_dsl_dataset,
-           SPA_FEATURE_REDACTED_DATASETS));
+           SPA_FEATURE_REDACTED_DATASETS)));
        if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {
                arc_buf_destroy(abuf, private);
                dbuf_prefetch_fini(dpa, B_TRUE);
@@ -3394,7 +3560,7 @@ dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid,
                goto no_issue;
 
        dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
-           level, blkid);
+           level, blkid, NULL);
        if (db != NULL) {
                mutex_exit(&db->db_mtx);
                /*
@@ -3458,8 +3624,9 @@ dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid,
        dpa->dpa_cb = cb;
        dpa->dpa_arg = arg;
 
-       /* flag if L2ARC eligible, l2arc_noprefetch then decides */
-       if (dnode_level_is_l2cacheable(&bp, dn, level))
+       if (!DNODE_LEVEL_IS_CACHEABLE(dn, level))
+               dpa->dpa_aflags |= ARC_FLAG_UNCACHED;
+       else if (dnode_level_is_l2cacheable(&bp, dn, level))
                dpa->dpa_aflags |= ARC_FLAG_L2CACHE;
 
        /*
@@ -3559,6 +3726,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
     const void *tag, dmu_buf_impl_t **dbp)
 {
        dmu_buf_impl_t *db, *parent = NULL;
+       uint64_t hv;
 
        /* If the pool has been created, verify the tx_sync_lock is not held */
        spa_t *spa = dn->dn_objset->os_spa;
@@ -3574,7 +3742,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
        *dbp = NULL;
 
        /* dbuf_find() returns with db_mtx held */
-       db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);
+       db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid, &hv);
 
        if (db == NULL) {
                blkptr_t *bp = NULL;
@@ -3596,7 +3764,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
                }
                if (err && err != ENOENT)
                        return (err);
-               db = dbuf_create(dn, level, blkid, parent, bp);
+               db = dbuf_create(dn, level, blkid, parent, bp, hv);
        }
 
        if (fail_uncached && db->db_state != DB_CACHED) {
@@ -3620,8 +3788,10 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
            dn->dn_object != DMU_META_DNODE_OBJECT &&
            db->db_state == DB_CACHED && db->db_data_pending) {
                dbuf_dirty_record_t *dr = db->db_data_pending;
-               if (dr->dt.dl.dr_data == db->db_buf)
+               if (dr->dt.dl.dr_data == db->db_buf) {
+                       ASSERT3P(db->db_buf, !=, NULL);
                        dbuf_hold_copy(dn, db);
+               }
        }
 
        if (multilist_link_active(&db->db_cache_link)) {
@@ -3630,17 +3800,17 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
                    db->db_caching_status == DB_DBUF_METADATA_CACHE);
 
                multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
+
+               uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db);
                (void) zfs_refcount_remove_many(
-                   &dbuf_caches[db->db_caching_status].size,
-                   db->db.db_size, db);
+                   &dbuf_caches[db->db_caching_status].size, size, db);
 
                if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
                        DBUF_STAT_BUMPDOWN(metadata_cache_count);
                } else {
                        DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
                        DBUF_STAT_BUMPDOWN(cache_count);
-                       DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
-                           db->db.db_size);
+                       DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
                }
                db->db_caching_status = DB_NO_CACHE;
        }
@@ -3680,7 +3850,8 @@ dbuf_create_bonus(dnode_t *dn)
        ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 
        ASSERT(dn->dn_bonus == NULL);
-       dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
+       dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL,
+           dbuf_hash(dn->dn_objset, dn->dn_object, 0, DMU_BONUS_BLKID));
 }
 
 int
@@ -3726,7 +3897,7 @@ dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
        if (blkid == DMU_BONUS_BLKID)
                found_db = dbuf_find_bonus(os, obj);
        else
-               found_db = dbuf_find(os, obj, 0, blkid);
+               found_db = dbuf_find(os, obj, 0, blkid, NULL);
 
        if (found_db != NULL) {
                if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
@@ -3846,59 +4017,39 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting)
                         * This dbuf has anonymous data associated with it.
                         */
                        dbuf_destroy(db);
-               } else {
-                       boolean_t do_arc_evict = B_FALSE;
-                       blkptr_t bp;
-                       spa_t *spa = dmu_objset_spa(db->db_objset);
-
-                       if (!DBUF_IS_CACHEABLE(db) &&
-                           db->db_blkptr != NULL &&
-                           !BP_IS_HOLE(db->db_blkptr) &&
-                           !BP_IS_EMBEDDED(db->db_blkptr)) {
-                               do_arc_evict = B_TRUE;
-                               bp = *db->db_blkptr;
-                       }
-
-                       if (!DBUF_IS_CACHEABLE(db) ||
-                           db->db_pending_evict) {
-                               dbuf_destroy(db);
-                       } else if (!multilist_link_active(&db->db_cache_link)) {
-                               ASSERT3U(db->db_caching_status, ==,
-                                   DB_NO_CACHE);
-
-                               dbuf_cached_state_t dcs =
-                                   dbuf_include_in_metadata_cache(db) ?
-                                   DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
-                               db->db_caching_status = dcs;
-
-                               multilist_insert(&dbuf_caches[dcs].cache, db);
-                               uint64_t db_size = db->db.db_size;
-                               size = zfs_refcount_add_many(
-                                   &dbuf_caches[dcs].size, db_size, db);
-                               uint8_t db_level = db->db_level;
-                               mutex_exit(&db->db_mtx);
-
-                               if (dcs == DB_DBUF_METADATA_CACHE) {
-                                       DBUF_STAT_BUMP(metadata_cache_count);
-                                       DBUF_STAT_MAX(
-                                           metadata_cache_size_bytes_max,
-                                           size);
-                               } else {
-                                       DBUF_STAT_BUMP(cache_count);
-                                       DBUF_STAT_MAX(cache_size_bytes_max,
-                                           size);
-                                       DBUF_STAT_BUMP(cache_levels[db_level]);
-                                       DBUF_STAT_INCR(
-                                           cache_levels_bytes[db_level],
-                                           db_size);
-                               }
+               } else if (!(DBUF_IS_CACHEABLE(db) || db->db_partial_read) ||
+                   db->db_pending_evict) {
+                       dbuf_destroy(db);
+               } else if (!multilist_link_active(&db->db_cache_link)) {
+                       ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
+
+                       dbuf_cached_state_t dcs =
+                           dbuf_include_in_metadata_cache(db) ?
+                           DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
+                       db->db_caching_status = dcs;
+
+                       multilist_insert(&dbuf_caches[dcs].cache, db);
+                       uint64_t db_size = db->db.db_size +
+                           dmu_buf_user_size(&db->db);
+                       size = zfs_refcount_add_many(
+                           &dbuf_caches[dcs].size, db_size, db);
+                       uint8_t db_level = db->db_level;
+                       mutex_exit(&db->db_mtx);
 
-                               if (dcs == DB_DBUF_CACHE && !evicting)
-                                       dbuf_evict_notify(size);
+                       if (dcs == DB_DBUF_METADATA_CACHE) {
+                               DBUF_STAT_BUMP(metadata_cache_count);
+                               DBUF_STAT_MAX(metadata_cache_size_bytes_max,
+                                   size);
+                       } else {
+                               DBUF_STAT_BUMP(cache_count);
+                               DBUF_STAT_MAX(cache_size_bytes_max, size);
+                               DBUF_STAT_BUMP(cache_levels[db_level]);
+                               DBUF_STAT_INCR(cache_levels_bytes[db_level],
+                                   db_size);
                        }
 
-                       if (do_arc_evict)
-                               arc_freed(spa, &bp);
+                       if (dcs == DB_DBUF_CACHE && !evicting)
+                               dbuf_evict_notify(size);
                }
        } else {
                mutex_exit(&db->db_mtx);
@@ -3975,6 +4126,35 @@ dmu_buf_get_user(dmu_buf_t *db_fake)
        return (db->db_user);
 }
 
+uint64_t
+dmu_buf_user_size(dmu_buf_t *db_fake)
+{
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+       if (db->db_user == NULL)
+               return (0);
+       return (atomic_load_64(&db->db_user->dbu_size));
+}
+
+void
+dmu_buf_add_user_size(dmu_buf_t *db_fake, uint64_t nadd)
+{
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+       ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
+       ASSERT3P(db->db_user, !=, NULL);
+       ASSERT3U(atomic_load_64(&db->db_user->dbu_size), <, UINT64_MAX - nadd);
+       atomic_add_64(&db->db_user->dbu_size, nadd);
+}
+
+void
+dmu_buf_sub_user_size(dmu_buf_t *db_fake, uint64_t nsub)
+{
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+       ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
+       ASSERT3P(db->db_user, !=, NULL);
+       ASSERT3U(atomic_load_64(&db->db_user->dbu_size), >=, nsub);
+       atomic_sub_64(&db->db_user->dbu_size, nsub);
+}
+
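
Note: a hedged sketch of a consumer using this accounting (all names hypothetical); the user's decoded data is charged to the dbuf cache alongside db_size:

        my_user_t *u = kmem_zalloc(sizeof (*u), KM_SLEEP);
        dmu_buf_init_user(&u->mu_dbu, my_evict_sync, NULL, &u->mu_db);
        if (dmu_buf_set_user(db, &u->mu_dbu) == NULL) {
                /* We attached the user; charge its size to the cache. */
                dmu_buf_add_user_size(db, my_decoded_size(u));
        }

dbuf_evict_user() subtracts dbu_size from the cache accounting when the user is detached, as the hunk near the top of this diff shows.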
 void
 dmu_buf_user_evict_wait(void)
 {
@@ -3995,21 +4175,6 @@ dmu_buf_get_objset(dmu_buf_t *db)
        return (dbi->db_objset);
 }
 
-dnode_t *
-dmu_buf_dnode_enter(dmu_buf_t *db)
-{
-       dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
-       DB_DNODE_ENTER(dbi);
-       return (DB_DNODE(dbi));
-}
-
-void
-dmu_buf_dnode_exit(dmu_buf_t *db)
-{
-       dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
-       DB_DNODE_EXIT(dbi);
-}
-
 static void
 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
 {
@@ -4270,22 +4435,6 @@ dbuf_lightweight_ready(zio_t *zio)
        rw_exit(&parent_db->db_rwlock);
 }
 
-static void
-dbuf_lightweight_physdone(zio_t *zio)
-{
-       dbuf_dirty_record_t *dr = zio->io_private;
-       dsl_pool_t *dp = spa_get_dsl(zio->io_spa);
-       ASSERT3U(dr->dr_txg, ==, zio->io_txg);
-
-       /*
-        * The callback will be called io_phys_children times.  Retire one
-        * portion of our dirty space each time we are called.  Any rounding
-        * error will be cleaned up by dbuf_lightweight_done().
-        */
-       int delta = dr->dr_accounted / zio->io_phys_children;
-       dsl_pool_undirty_space(dp, delta, zio->io_txg);
-}
-
 static void
 dbuf_lightweight_done(zio_t *zio)
 {
@@ -4304,16 +4453,8 @@ dbuf_lightweight_done(zio_t *zio)
                dsl_dataset_block_born(ds, zio->io_bp, tx);
        }
 
-       /*
-        * See comment in dbuf_write_done().
-        */
-       if (zio->io_phys_children == 0) {
-               dsl_pool_undirty_space(dmu_objset_pool(os),
-                   dr->dr_accounted, zio->io_txg);
-       } else {
-               dsl_pool_undirty_space(dmu_objset_pool(os),
-                   dr->dr_accounted % zio->io_phys_children, zio->io_txg);
-       }
+       dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
+           zio->io_txg);
 
        abd_free(dr->dt.dll.dr_abd);
        kmem_free(dr, sizeof (*dr));
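
Note: the deleted physdone callback retired dirty space incrementally as each physical child i/o completed; the done callback now retires it in one step. A worked contrast (numbers illustrative):

        /*
         * Old scheme, dr_accounted = 131072 and io_phys_children = 3:
         * each physdone call retired 131072 / 3 = 43690 bytes, and the
         * done callback cleaned up the remainder, 131072 % 3 = 2 bytes.
         * New scheme: one call retires all 131072 bytes.
         */
        dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
            zio->io_txg);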
@@ -4347,8 +4488,7 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
            dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd,
            dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd),
            &dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL,
-           dbuf_lightweight_physdone, dbuf_lightweight_done, dr,
-           ZIO_PRIORITY_ASYNC_WRITE,
+           dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE,
            ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);
 
        zio_nowait(dr->dr_zio);
@@ -4383,6 +4523,15 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
        } else if (db->db_state == DB_FILL) {
                /* This buffer was freed and is now being re-filled */
                ASSERT(db->db.db_data != dr->dt.dl.dr_data);
+       } else if (db->db_state == DB_READ) {
+               /*
+                * This buffer has a clone we need to write, and an in-flight
+                * read on the BP we're about to clone. Its safe to issue the
+                * read on the BP we're about to clone. It's safe to issue the
+                * contents won't change.
+                */
+               ASSERT(dr->dt.dl.dr_brtwrite &&
+                   dr->dt.dl.dr_override_state == DR_OVERRIDDEN);
        } else {
                ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
        }
@@ -4439,7 +4588,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
        while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
                ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
                cv_wait(&db->db_changed, &db->db_mtx);
-               ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
        }
 
        /*
@@ -4505,6 +4653,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
        }
 }
 
+/*
+ * Syncs out a range of dirty records for indirect or leaf dbufs.  May be
+ * called recursively from dbuf_sync_indirect().
+ */
 void
 dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
 {
@@ -4561,7 +4713,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
        dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
        zio->io_prev_space_delta = delta;
 
-       if (bp->blk_birth != 0) {
+       if (BP_GET_LOGICAL_BIRTH(bp) != 0) {
                ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
                    BP_GET_TYPE(bp) == dn->dn_type) ||
                    (db->db_blkid == DMU_SPILL_BLKID &&
@@ -4598,6 +4750,20 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
                                i += DNODE_MIN_SIZE;
                                if (dnp->dn_type != DMU_OT_NONE) {
                                        fill++;
+                                       for (int j = 0; j < dnp->dn_nblkptr;
+                                           j++) {
+                                               (void) zfs_blkptr_verify(spa,
+                                                   &dnp->dn_blkptr[j],
+                                                   BLK_CONFIG_SKIP,
+                                                   BLK_VERIFY_HALT);
+                                       }
+                                       if (dnp->dn_flags &
+                                           DNODE_FLAG_SPILL_BLKPTR) {
+                                               (void) zfs_blkptr_verify(spa,
+                                                   DN_SPILL_BLKPTR(dnp),
+                                                   BLK_CONFIG_SKIP,
+                                                   BLK_VERIFY_HALT);
+                                       }
                                        i += dnp->dn_extra_slots *
                                            DNODE_MIN_SIZE;
                                }
@@ -4615,6 +4781,8 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
                for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
                        if (BP_IS_HOLE(ibp))
                                continue;
+                       (void) zfs_blkptr_verify(spa, ibp,
+                           BLK_CONFIG_SKIP, BLK_VERIFY_HALT);
                        fill += BP_GET_FILL(ibp);
                }
        }
@@ -4675,37 +4843,6 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
        DB_DNODE_EXIT(db);
 }
 
-/*
- * The SPA will call this callback several times for each zio - once
- * for every physical child i/o (zio->io_phys_children times).  This
- * allows the DMU to monitor the progress of each logical i/o.  For example,
- * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
- * block.  There may be a long delay before all copies/fragments are completed,
- * so this callback allows us to retire dirty space gradually, as the physical
- * i/os complete.
- */
-static void
-dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
-{
-       (void) buf;
-       dmu_buf_impl_t *db = arg;
-       objset_t *os = db->db_objset;
-       dsl_pool_t *dp = dmu_objset_pool(os);
-       dbuf_dirty_record_t *dr;
-       int delta = 0;
-
-       dr = db->db_data_pending;
-       ASSERT3U(dr->dr_txg, ==, zio->io_txg);
-
-       /*
-        * The callback will be called io_phys_children times.  Retire one
-        * portion of our dirty space each time we are called.  Any rounding
-        * error will be cleaned up by dbuf_write_done().
-        */
-       delta = dr->dr_accounted / zio->io_phys_children;
-       dsl_pool_undirty_space(dp, delta, zio->io_txg);
-}
-
 static void
 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
@@ -4754,8 +4891,10 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
                ASSERT(db->db_blkid != DMU_BONUS_BLKID);
                ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
                if (db->db_state != DB_NOFILL) {
-                       if (dr->dt.dl.dr_data != db->db_buf)
+                       if (dr->dt.dl.dr_data != NULL &&
+                           dr->dt.dl.dr_data != db->db_buf) {
                                arc_buf_destroy(dr->dt.dl.dr_data, db);
+                       }
                }
        } else {
                ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
@@ -4778,27 +4917,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
        db->db_data_pending = NULL;
        dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
 
-       /*
-        * If we didn't do a physical write in this ZIO and we
-        * still ended up here, it means that the space of the
-        * dbuf that we just released (and undirtied) above hasn't
-        * been marked as undirtied in the pool's accounting.
-        *
-        * Thus, we undirty that space in the pool's view of the
-        * world here. For physical writes this type of update
-        * happens in dbuf_write_physdone().
-        *
-        * If we did a physical write, cleanup any rounding errors
-        * that came up due to writing multiple copies of a block
-        * on disk [see dbuf_write_physdone()].
-        */
-       if (zio->io_phys_children == 0) {
-               dsl_pool_undirty_space(dmu_objset_pool(os),
-                   dr->dr_accounted, zio->io_txg);
-       } else {
-               dsl_pool_undirty_space(dmu_objset_pool(os),
-                   dr->dr_accounted % zio->io_phys_children, zio->io_txg);
-       }
+       dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
+           zio->io_txg);
 
        kmem_free(dr, sizeof (dbuf_dirty_record_t));
 }
@@ -4880,7 +5000,7 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
        ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
 
        drica.drica_os = dn->dn_objset;
-       drica.drica_blk_birth = bp->blk_birth;
+       drica.drica_blk_birth = BP_GET_LOGICAL_BIRTH(bp);
        drica.drica_tx = tx;
        if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
            &drica)) {
@@ -4895,7 +5015,8 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
                if (dn->dn_objset != spa_meta_objset(spa)) {
                        dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset);
                        if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
-                           bp->blk_birth > ds->ds_dir->dd_origin_txg) {
+                           BP_GET_LOGICAL_BIRTH(bp) >
+                           ds->ds_dir->dd_origin_txg) {
                                ASSERT(!BP_IS_EMBEDDED(bp));
                                ASSERT(dsl_dir_is_clone(ds->ds_dir));
                                ASSERT(spa_feature_is_enabled(spa,
@@ -4955,7 +5076,10 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
 }
 
 
-/* Issue I/O to commit a dirty buffer to disk. */
+/*
+ * Populate dr->dr_zio with a zio to commit a dirty buffer to disk.
+ * The caller is responsible for issuing zio_[no]wait() on dr->dr_zio.
+ */
 static void
 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 {
@@ -5014,7 +5138,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
        }
 
        ASSERT(db->db_level == 0 || data == db->db_buf);
-       ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
+       ASSERT3U(BP_GET_LOGICAL_BIRTH(db->db_blkptr), <=, txg);
        ASSERT(pio);
 
        SET_BOOKMARK(&zb, os->os_dsl_dataset ?
@@ -5046,20 +5170,21 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 
                dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy,
                    contents, db->db.db_size, db->db.db_size, &zp,
-                   dbuf_write_override_ready, NULL, NULL,
+                   dbuf_write_override_ready, NULL,
                    dbuf_write_override_done,
                    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
                mutex_enter(&db->db_mtx);
                dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
                zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
-                   dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
+                   dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite,
+                   dr->dt.dl.dr_brtwrite);
                mutex_exit(&db->db_mtx);
        } else if (db->db_state == DB_NOFILL) {
                ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
                    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
                dr->dr_zio = zio_write(pio, os->os_spa, txg,
                    &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
-                   dbuf_write_nofill_ready, NULL, NULL,
+                   dbuf_write_nofill_ready, NULL,
                    dbuf_write_nofill_done, db,
                    ZIO_PRIORITY_ASYNC_WRITE,
                    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
@@ -5076,11 +5201,10 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
                        children_ready_cb = dbuf_write_children_ready;
 
                dr->dr_zio = arc_write(pio, os->os_spa, txg,
-                   &dr->dr_bp_copy, data, dbuf_is_l2cacheable(db),
-                   &zp, dbuf_write_ready,
-                   children_ready_cb, dbuf_write_physdone,
-                   dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
-                   ZIO_FLAG_MUSTSUCCEED, &zb);
+                   &dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db),
+                   dbuf_is_l2cacheable(db), &zp, dbuf_write_ready,
+                   children_ready_cb, dbuf_write_done, db,
+                   ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
        }
 }
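
Note: for reference, the shape of the arc_write() call after this change, with the two boolean arguments annotated (a paraphrase of the hunk above, not a new API claim):

        dr->dr_zio = arc_write(pio, os->os_spa, txg, &dr->dr_bp_copy, data,
            !DBUF_IS_CACHEABLE(db),     /* uncached: drop from ARC when done */
            dbuf_is_l2cacheable(db),    /* eligible for L2ARC */
            &zp, dbuf_write_ready, children_ready_cb, dbuf_write_done, db,
            ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);

The physdone callback slot is gone from both arc_write() and zio_write(), matching the accounting simplification in dbuf_write_done() above.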
 
@@ -5098,6 +5222,7 @@ EXPORT_SYMBOL(dbuf_dirty);
 EXPORT_SYMBOL(dmu_buf_set_crypt_params);
 EXPORT_SYMBOL(dmu_buf_will_dirty);
 EXPORT_SYMBOL(dmu_buf_is_dirty);
+EXPORT_SYMBOL(dmu_buf_will_clone);
 EXPORT_SYMBOL(dmu_buf_will_not_fill);
 EXPORT_SYMBOL(dmu_buf_will_fill);
 EXPORT_SYMBOL(dmu_buf_fill_done);
@@ -5120,7 +5245,7 @@ EXPORT_SYMBOL(dmu_buf_set_user_ie);
 EXPORT_SYMBOL(dmu_buf_get_user);
 EXPORT_SYMBOL(dmu_buf_get_blkptr);
 
-ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, U64, ZMOD_RW,
        "Maximum size in bytes of the dbuf cache.");
 
 ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW,
@@ -5129,7 +5254,7 @@ ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, lowater_pct, UINT, ZMOD_RW,
        "Percentage below dbuf_cache_max_bytes when dbuf eviction stops.");
 
-ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, U64, ZMOD_RW,
        "Maximum size in bytes of dbuf metadata cache.");
 
 ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, UINT, ZMOD_RW,