Provide macros for setting and getting blkptr birth times
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index db1123d37d9812725e0811d49cf53d47d07ecf8e..4e190c131e1dde2f9977573071d597f11b9c1b28 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -26,6 +26,7 @@
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright (c) 2019, Klara Inc.
  * Copyright (c) 2019, Allan Jude
+ * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
  */
 
 #include <sys/zfs_context.h>
@@ -49,6 +50,7 @@
 #include <sys/trace_zfs.h>
 #include <sys/callb.h>
 #include <sys/abd.h>
+#include <sys/brt.h>
 #include <sys/vdev.h>
 #include <cityhash.h>
 #include <sys/spa_impl.h>
@@ -173,7 +175,6 @@ struct {
                continue;                                               \
 }
 
-static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
 static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
 static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags);
@@ -227,8 +228,8 @@ typedef struct dbuf_cache {
 dbuf_cache_t dbuf_caches[DB_CACHE_MAX];
 
 /* Size limits for the caches */
-static unsigned long dbuf_cache_max_bytes = ULONG_MAX;
-static unsigned long dbuf_metadata_cache_max_bytes = ULONG_MAX;
+static uint64_t dbuf_cache_max_bytes = UINT64_MAX;
+static uint64_t dbuf_metadata_cache_max_bytes = UINT64_MAX;
 
 /* Set the default sizes of the caches to log2 fraction of arc size */
 static uint_t dbuf_cache_shift = 5;
@@ -339,7 +340,8 @@ dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
        (dbuf)->db_blkid == (blkid))
 
 dmu_buf_impl_t *
-dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
+dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid,
+    uint64_t *hash_out)
 {
        dbuf_hash_table_t *h = &dbuf_hash_table;
        uint64_t hv;
@@ -361,6 +363,8 @@ dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
                }
        }
        mutex_exit(DBUF_HASH_MUTEX(h, idx));
+       if (hash_out != NULL)
+               *hash_out = hv;
        return (NULL);
 }
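
Note: the new hash_out parameter lets a caller reuse the hash computed during a failed lookup instead of rehashing. A minimal usage sketch, mirroring the dbuf_hold_impl() hunk later in this diff:

        uint64_t hv;
        dmu_buf_impl_t *db;

        /* On a miss, dbuf_find() reports the hash it computed ... */
        db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid, &hv);
        if (db == NULL) {
                /* ... so dbuf_create() can store it instead of rehashing. */
                db = dbuf_create(dn, level, blkid, parent, bp, hv);
        }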
 
@@ -395,13 +399,13 @@ dbuf_hash_insert(dmu_buf_impl_t *db)
        objset_t *os = db->db_objset;
        uint64_t obj = db->db.db_object;
        int level = db->db_level;
-       uint64_t blkid, hv, idx;
+       uint64_t blkid, idx;
        dmu_buf_impl_t *dbf;
        uint32_t i;
 
        blkid = db->db_blkid;
-       hv = dbuf_hash(os, obj, level, blkid);
-       idx = hv & h->hash_table_mask;
+       ASSERT3U(dbuf_hash(os, obj, level, blkid), ==, db->db_hash);
+       idx = db->db_hash & h->hash_table_mask;
 
        mutex_enter(DBUF_HASH_MUTEX(h, idx));
        for (dbf = h->hash_table[idx], i = 0; dbf != NULL;
@@ -475,12 +479,12 @@ static void
 dbuf_hash_remove(dmu_buf_impl_t *db)
 {
        dbuf_hash_table_t *h = &dbuf_hash_table;
-       uint64_t hv, idx;
+       uint64_t idx;
        dmu_buf_impl_t *dbf, **dbp;
 
-       hv = dbuf_hash(db->db_objset, db->db.db_object,
-           db->db_level, db->db_blkid);
-       idx = hv & h->hash_table_mask;
+       ASSERT3U(dbuf_hash(db->db_objset, db->db.db_object, db->db_level,
+           db->db_blkid), ==, db->db_hash);
+       idx = db->db_hash & h->hash_table_mask;
 
        /*
         * We mustn't hold db_mtx to maintain lock ordering:
@@ -565,6 +569,21 @@ dbuf_evict_user(dmu_buf_impl_t *db)
                *dbu->dbu_clear_on_evict_dbufp = NULL;
 #endif
 
+       if (db->db_caching_status != DB_NO_CACHE) {
+               /*
+                * This is a cached dbuf, so the size of the user data is
+                * included in its cached amount. We adjust it here because the
+                * user data has already been detached from the dbuf, and the
+                * sync functions are not supposed to touch it (the dbuf might
+                * not exist anymore by the time the sync functions run).
+                */
+               uint64_t size = dbu->dbu_size;
+               (void) zfs_refcount_remove_many(
+                   &dbuf_caches[db->db_caching_status].size, size, db);
+               if (db->db_caching_status == DB_DBUF_CACHE)
+                       DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
+       }
+
        /*
         * There are two eviction callbacks - one that we call synchronously
         * and one that we invoke via a taskq.  The async one is useful for
@@ -612,58 +631,58 @@ dbuf_is_metadata(dmu_buf_impl_t *db)
 boolean_t
 dbuf_is_l2cacheable(dmu_buf_impl_t *db)
 {
-       vdev_t *vd = NULL;
-       zfs_cache_type_t cache = db->db_objset->os_secondary_cache;
-       blkptr_t *bp = db->db_blkptr;
-
-       if (bp != NULL && !BP_IS_HOLE(bp)) {
+       if (db->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||
+           (db->db_objset->os_secondary_cache ==
+           ZFS_CACHE_METADATA && dbuf_is_metadata(db))) {
+               if (l2arc_exclude_special == 0)
+                       return (B_TRUE);
+
+               blkptr_t *bp = db->db_blkptr;
+               if (bp == NULL || BP_IS_HOLE(bp))
+                       return (B_FALSE);
                uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);
                vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev;
+               vdev_t *vd = NULL;
 
                if (vdev < rvd->vdev_children)
                        vd = rvd->vdev_child[vdev];
 
-               if (cache == ZFS_CACHE_ALL ||
-                   (dbuf_is_metadata(db) && cache == ZFS_CACHE_METADATA)) {
-                       if (vd == NULL)
-                               return (B_TRUE);
+               if (vd == NULL)
+                       return (B_TRUE);
 
-                       if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
-                           vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) ||
-                           l2arc_exclude_special == 0)
-                               return (B_TRUE);
-               }
+               if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
+                   vd->vdev_alloc_bias != VDEV_BIAS_DEDUP)
+                       return (B_TRUE);
        }
-
        return (B_FALSE);
 }
 
 static inline boolean_t
 dnode_level_is_l2cacheable(blkptr_t *bp, dnode_t *dn, int64_t level)
 {
-       vdev_t *vd = NULL;
-       zfs_cache_type_t cache = dn->dn_objset->os_secondary_cache;
-
-       if (bp != NULL && !BP_IS_HOLE(bp)) {
+       if (dn->dn_objset->os_secondary_cache == ZFS_CACHE_ALL ||
+           (dn->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA &&
+           (level > 0 ||
+           DMU_OT_IS_METADATA(dn->dn_handle->dnh_dnode->dn_type)))) {
+               if (l2arc_exclude_special == 0)
+                       return (B_TRUE);
+
+               if (bp == NULL || BP_IS_HOLE(bp))
+                       return (B_FALSE);
                uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);
                vdev_t *rvd = dn->dn_objset->os_spa->spa_root_vdev;
+               vdev_t *vd = NULL;
 
                if (vdev < rvd->vdev_children)
                        vd = rvd->vdev_child[vdev];
 
-               if (cache == ZFS_CACHE_ALL || ((level > 0 ||
-                   DMU_OT_IS_METADATA(dn->dn_handle->dnh_dnode->dn_type)) &&
-                   cache == ZFS_CACHE_METADATA)) {
-                       if (vd == NULL)
-                               return (B_TRUE);
+               if (vd == NULL)
+                       return (B_TRUE);
 
-                       if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
-                           vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) ||
-                           l2arc_exclude_special == 0)
-                               return (B_TRUE);
-               }
+               if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
+                   vd->vdev_alloc_bias != VDEV_BIAS_DEDUP)
+                       return (B_TRUE);
        }
-
        return (B_FALSE);
 }
 
@@ -766,12 +785,12 @@ dbuf_evict_one(void)
        if (db != NULL) {
                multilist_sublist_remove(mls, db);
                multilist_sublist_unlock(mls);
+               uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db);
                (void) zfs_refcount_remove_many(
-                   &dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db);
+                   &dbuf_caches[DB_DBUF_CACHE].size, size, db);
                DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
                DBUF_STAT_BUMPDOWN(cache_count);
-               DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
-                   db->db.db_size);
+               DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
                ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
                db->db_caching_status = DB_NO_CACHE;
                dbuf_destroy(db);
@@ -1152,7 +1171,7 @@ dbuf_verify(dmu_buf_impl_t *db)
        if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
            (db->db_buf == NULL || db->db_buf->b_data) &&
            db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
-           db->db_state != DB_FILL && !dn->dn_free_txg) {
+           db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg)) {
                /*
                 * If the blkptr isn't set but they have nonzero data,
                 * it had better be dirty, otherwise we'll lose that
@@ -1198,7 +1217,7 @@ dbuf_verify(dmu_buf_impl_t *db)
                                        ASSERT0(bp->blk_pad[1]);
                                        ASSERT(!BP_IS_EMBEDDED(bp));
                                        ASSERT(BP_IS_HOLE(bp));
-                                       ASSERT0(bp->blk_phys_birth);
+                                       ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
                                }
                        }
                }
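
Note: the BP_GET_PHYSICAL_BIRTH()/BP_GET_LOGICAL_BIRTH() conversions throughout this diff implement the commit subject above. A hedged sketch of what such accessors can look like, assuming the classic blk_birth/blk_phys_birth fields (illustrative only, not the actual OpenZFS definitions):

        /* A physical birth of 0 means "same as logical". */
        #define BP_GET_LOGICAL_BIRTH(bp)        ((bp)->blk_birth)
        #define BP_SET_LOGICAL_BIRTH(bp, x)     ((bp)->blk_birth = (x))
        #define BP_GET_PHYSICAL_BIRTH(bp)       \
                ((bp)->blk_phys_birth != 0 ?    \
                (bp)->blk_phys_birth : (bp)->blk_birth)

Funneling every access through macros lets the birth-time representation change without touching call sites like the ones in this file.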
@@ -1424,7 +1443,7 @@ dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
 }
 
 static void
-dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn)
+dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *dbbp)
 {
        blkptr_t *bps = db->db.db_data;
        uint32_t indbs = 1ULL << dn->dn_indblkshift;
@@ -1433,12 +1452,12 @@ dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn)
        for (int i = 0; i < n_bps; i++) {
                blkptr_t *bp = &bps[i];
 
-               ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, indbs);
-               BP_SET_LSIZE(bp, BP_GET_LEVEL(db->db_blkptr) == 1 ?
-                   dn->dn_datablksz : BP_GET_LSIZE(db->db_blkptr));
-               BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr));
-               BP_SET_LEVEL(bp, BP_GET_LEVEL(db->db_blkptr) - 1);
-               BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0);
+               ASSERT3U(BP_GET_LSIZE(dbbp), ==, indbs);
+               BP_SET_LSIZE(bp, BP_GET_LEVEL(dbbp) == 1 ?
+                   dn->dn_datablksz : BP_GET_LSIZE(dbbp));
+               BP_SET_TYPE(bp, BP_GET_TYPE(dbbp));
+               BP_SET_LEVEL(bp, BP_GET_LEVEL(dbbp) - 1);
+               BP_SET_BIRTH(bp, BP_GET_LOGICAL_BIRTH(dbbp), 0);
        }
 }
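
Note: a worked instance of the loop above, using this function's variables. For a level-2 indirect hole with logical birth txg T, each synthesized child BP is itself a hole, one level down, carrying the same birth:

        blkptr_t child;

        memset(&child, 0, sizeof (child));        /* still a hole */
        BP_SET_LSIZE(&child, BP_GET_LSIZE(dbbp)); /* level > 1: parent lsize */
        BP_SET_TYPE(&child, BP_GET_TYPE(dbbp));
        BP_SET_LEVEL(&child, BP_GET_LEVEL(dbbp) - 1);   /* 2 - 1 = 1 */
        BP_SET_BIRTH(&child, BP_GET_LOGICAL_BIRTH(dbbp), 0);   /* T, phys 0 */

This is what lets a read of a never-written indirect block report consistent child block pointers.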
 
@@ -1448,30 +1467,27 @@ dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn)
  * was taken, ENOENT if no action was taken.
  */
 static int
-dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn)
+dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
 {
        ASSERT(MUTEX_HELD(&db->db_mtx));
 
-       int is_hole = db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr);
+       int is_hole = bp == NULL || BP_IS_HOLE(bp);
        /*
         * For level 0 blocks only, if the above check fails:
         * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
         * processes the delete record and clears the bp while we are waiting
         * for the dn_mtx (resulting in a "no" from block_freed).
         */
-       if (!is_hole && db->db_level == 0) {
-               is_hole = dnode_block_freed(dn, db->db_blkid) ||
-                   BP_IS_HOLE(db->db_blkptr);
-       }
+       if (!is_hole && db->db_level == 0)
+               is_hole = dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(bp);
 
        if (is_hole) {
                dbuf_set_data(db, dbuf_alloc_arcbuf(db));
                memset(db->db.db_data, 0, db->db.db_size);
 
-               if (db->db_blkptr != NULL && db->db_level > 0 &&
-                   BP_IS_HOLE(db->db_blkptr) &&
-                   db->db_blkptr->blk_birth != 0) {
-                       dbuf_handle_indirect_hole(db, dn);
+               if (bp != NULL && db->db_level > 0 && BP_IS_HOLE(bp) &&
+                   BP_GET_LOGICAL_BIRTH(bp) != 0) {
+                       dbuf_handle_indirect_hole(db, dn, bp);
                }
                db->db_state = DB_CACHED;
                DTRACE_SET_STATE(db, "hole read satisfied");
@@ -1503,8 +1519,8 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
 
        ASSERT(MUTEX_HELD(&db->db_mtx));
 
-       if (!os->os_encrypted || os->os_raw_receive ||
-           (flags & DB_RF_NO_DECRYPT) != 0)
+       if ((flags & DB_RF_NO_DECRYPT) != 0 ||
+           !os->os_encrypted || os->os_raw_receive)
                return (0);
 
        DB_DNODE_ENTER(db);
@@ -1548,13 +1564,13 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
        zbookmark_phys_t zb;
        uint32_t aflags = ARC_FLAG_NOWAIT;
        int err, zio_flags;
+       blkptr_t bp, *bpp;
 
-       err = zio_flags = 0;
        DB_DNODE_ENTER(db);
        dn = DB_DNODE(db);
        ASSERT(!zfs_refcount_is_zero(&db->db_holds));
        ASSERT(MUTEX_HELD(&db->db_mtx));
-       ASSERT(db->db_state == DB_UNCACHED);
+       ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
        ASSERT(db->db_buf == NULL);
        ASSERT(db->db_parent == NULL ||
            RW_LOCK_HELD(&db->db_parent->db_rwlock));
@@ -1564,16 +1580,44 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
                goto early_unlock;
        }
 
-       err = dbuf_read_hole(db, dn);
+       if (db->db_state == DB_UNCACHED) {
+               if (db->db_blkptr == NULL) {
+                       bpp = NULL;
+               } else {
+                       bp = *db->db_blkptr;
+                       bpp = &bp;
+               }
+       } else {
+               dbuf_dirty_record_t *dr;
+
+               ASSERT3S(db->db_state, ==, DB_NOFILL);
+
+               /*
+                * Block cloning: If we have a pending block clone,
+                * we don't want to read the underlying block, but the content
+                * of the block being cloned, so we have the most recent data.
+                */
+               dr = list_head(&db->db_dirty_records);
+               if (dr == NULL || !dr->dt.dl.dr_brtwrite) {
+                       err = EIO;
+                       goto early_unlock;
+               }
+               bp = dr->dt.dl.dr_overridden_by;
+               bpp = &bp;
+       }
+
+       err = dbuf_read_hole(db, dn, bpp);
        if (err == 0)
                goto early_unlock;
 
+       ASSERT(bpp != NULL);
+
        /*
         * Any attempt to read a redacted block should result in an error. This
         * will never happen under normal conditions, but can be useful for
         * debugging purposes.
         */
-       if (BP_IS_REDACTED(db->db_blkptr)) {
+       if (BP_IS_REDACTED(bpp)) {
                ASSERT(dsl_dataset_feature_is_active(
                    db->db_objset->os_dsl_dataset,
                    SPA_FEATURE_REDACTED_DATASETS));
@@ -1588,10 +1632,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
         * All bps of an encrypted os should have the encryption bit set.
         * If this is not true it indicates tampering and we report an error.
         */
-       if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) {
-               spa_log_error(db->db_objset->os_spa, &zb);
-               zfs_panic_recover("unencrypted block in encrypted "
-                   "object set %llu", dmu_objset_id(db->db_objset));
+       if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) {
+               spa_log_error(db->db_objset->os_spa, &zb,
+                   BP_GET_LOGICAL_BIRTH(bpp));
                err = SET_ERROR(EIO);
                goto early_unlock;
        }
@@ -1606,7 +1649,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
        DTRACE_SET_STATE(db, "read issued");
        mutex_exit(&db->db_mtx);
 
-       if (dbuf_is_l2cacheable(db))
+       if (!DBUF_IS_CACHEABLE(db))
+               aflags |= ARC_FLAG_UNCACHED;
+       else if (dbuf_is_l2cacheable(db))
                aflags |= ARC_FLAG_L2CACHE;
 
        dbuf_add_ref(db, NULL);
@@ -1617,15 +1662,14 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
        if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr))
                zio_flags |= ZIO_FLAG_RAW;
        /*
-        * The zio layer will copy the provided blkptr later, but we need to
-        * do this now so that we can release the parent's rwlock. We have to
-        * do that now so that if dbuf_read_done is called synchronously (on
+        * The zio layer will copy the provided blkptr later, but we have our
+        * own copy so that we can release the parent's rwlock. We have to
+        * do that so that if dbuf_read_done is called synchronously (on
         * an l1 cache hit) we don't acquire the db_mtx while holding the
         * parent's rwlock, which would be a lock ordering violation.
         */
-       blkptr_t bp = *db->db_blkptr;
        dmu_buf_unlock_parent(db, dblt, tag);
-       (void) arc_read(zio, db->db_objset->os_spa, &bp,
+       (void) arc_read(zio, db->db_objset->os_spa, bpp,
            dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
            &aflags, &zb);
        return (err);
@@ -1727,20 +1771,18 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
         */
        ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 
-       if (db->db_state == DB_NOFILL)
-               return (SET_ERROR(EIO));
-
        DB_DNODE_ENTER(db);
        dn = DB_DNODE(db);
 
        prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
-           (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
-           DBUF_IS_CACHEABLE(db);
+           (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL;
 
        mutex_enter(&db->db_mtx);
+       if (flags & DB_RF_PARTIAL_FIRST)
+               db->db_partial_read = B_TRUE;
+       else if (!(flags & DB_RF_PARTIAL_MORE))
+               db->db_partial_read = B_FALSE;
        if (db->db_state == DB_CACHED) {
-               spa_t *spa = dn->dn_objset->os_spa;
-
                /*
                 * Ensure that this block's dnode has been decrypted if
                 * the caller has requested decrypted data.
@@ -1759,6 +1801,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
                    (arc_is_encrypted(db->db_buf) ||
                    arc_is_unauthenticated(db->db_buf) ||
                    arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
+                       spa_t *spa = dn->dn_objset->os_spa;
                        zbookmark_phys_t zb;
 
                        SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
@@ -1774,14 +1817,14 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
                }
                DB_DNODE_EXIT(db);
                DBUF_STAT_BUMP(hash_hits);
-       } else if (db->db_state == DB_UNCACHED) {
-               spa_t *spa = dn->dn_objset->os_spa;
+       } else if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL) {
                boolean_t need_wait = B_FALSE;
 
                db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
 
-               if (zio == NULL &&
-                   db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
+               if (zio == NULL && (db->db_state == DB_NOFILL ||
+                   (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
+                       spa_t *spa = dn->dn_objset->os_spa;
                        zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
                        need_wait = B_TRUE;
                }
@@ -1895,8 +1938,13 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
        if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
                zio_free(db->db_objset->os_spa, txg, bp);
 
+       if (dr->dt.dl.dr_brtwrite) {
+               ASSERT0P(dr->dt.dl.dr_data);
+               dr->dt.dl.dr_data = db->db_buf;
+       }
        dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
        dr->dt.dl.dr_nopwrite = B_FALSE;
+       dr->dt.dl.dr_brtwrite = B_FALSE;
        dr->dt.dl.dr_has_raw_params = B_FALSE;
 
        /*
@@ -1907,7 +1955,8 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
         * the buf thawed to save the effort of freezing &
         * immediately re-thawing it.
         */
-       arc_release(dr->dt.dl.dr_data, db);
+       if (dr->dt.dl.dr_data)
+               arc_release(dr->dt.dl.dr_data, db);
 }
 
 /*
@@ -2126,7 +2175,8 @@ dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
         * Otherwise the buffer contents could be inconsistent between the
         * dbuf and the lightweight dirty record.
         */
-       ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid));
+       ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid,
+           NULL));
 
        mutex_enter(&dn->dn_mtx);
        int txgoff = tx->tx_txg & TXG_MASK;
@@ -2278,7 +2328,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 
        dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
-       if (db->db_blkid != DMU_BONUS_BLKID) {
+       if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {
                dmu_objset_willuse_space(os, db->db.db_size, tx);
        }
 
@@ -2321,8 +2371,9 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
                    sizeof (dbuf_dirty_record_t),
                    offsetof(dbuf_dirty_record_t, dr_dirty_node));
        }
-       if (db->db_blkid != DMU_BONUS_BLKID)
+       if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {
                dr->dr_accounted = db->db.db_size;
+       }
        dr->dr_dbuf = db;
        dr->dr_txg = tx->tx_txg;
        list_insert_before(&db->db_dirty_records, dr_next, dr);
@@ -2478,10 +2529,11 @@ dbuf_undirty_bonus(dbuf_dirty_record_t *dr)
  * Undirty a buffer in the transaction group referenced by the given
  * transaction.  Return whether this evicted the dbuf.
  */
-static boolean_t
+boolean_t
 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
        uint64_t txg = tx->tx_txg;
+       boolean_t brtwrite;
 
        ASSERT(txg != 0);
 
@@ -2506,6 +2558,16 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
                return (B_FALSE);
        ASSERT(dr->dr_dbuf == db);
 
+       brtwrite = dr->dt.dl.dr_brtwrite;
+       if (brtwrite) {
+               /*
+                * We are freeing a block that we cloned in the same
+                * transaction group.
+                */
+               brt_pending_remove(dmu_objset_spa(db->db_objset),
+                   &dr->dt.dl.dr_overridden_by, tx);
+       }
+
        dnode_t *dn = dr->dr_dnode;
 
        dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
@@ -2535,7 +2597,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
                mutex_exit(&dn->dn_mtx);
        }
 
-       if (db->db_state != DB_NOFILL) {
+       if (db->db_state != DB_NOFILL && !brtwrite) {
                dbuf_unoverride(dr);
 
                ASSERT(db->db_buf != NULL);
@@ -2550,7 +2612,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
        db->db_dirtycnt -= 1;
 
        if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
-               ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
+               ASSERT(db->db_state == DB_NOFILL || brtwrite ||
+                   arc_released(db->db_buf));
                dbuf_destroy(db);
                return (B_TRUE);
        }
@@ -2562,6 +2625,7 @@ static void
 dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
 {
        dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+       boolean_t undirty = B_FALSE;
 
        ASSERT(tx->tx_txg != 0);
        ASSERT(!zfs_refcount_is_zero(&db->db_holds));
@@ -2574,7 +2638,7 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
         */
        mutex_enter(&db->db_mtx);
 
-       if (db->db_state == DB_CACHED) {
+       if (db->db_state == DB_CACHED || db->db_state == DB_NOFILL) {
                dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
                /*
                 * It's possible that it is already dirty but not cached,
@@ -2582,10 +2646,21 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
                 * go through dmu_buf_will_dirty().
                 */
                if (dr != NULL) {
-                       /* This dbuf is already dirty and cached. */
-                       dbuf_redirty(dr);
-                       mutex_exit(&db->db_mtx);
-                       return;
+                       if (dr->dt.dl.dr_brtwrite) {
+                               /*
+                                * Block cloning: If we are dirtying a cloned
+                                * block, we cannot simply redirty it, because
+                                * this dr has no data associated with it.
+                                * We will go through a full undirtying below,
+                                * before dirtying it again.
+                                */
+                               undirty = B_TRUE;
+                       } else {
+                               /* This dbuf is already dirty and cached. */
+                               dbuf_redirty(dr);
+                               mutex_exit(&db->db_mtx);
+                               return;
+                       }
                }
        }
        mutex_exit(&db->db_mtx);
@@ -2594,7 +2669,20 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
        if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
                flags |= DB_RF_HAVESTRUCT;
        DB_DNODE_EXIT(db);
+
+       /*
+        * Block cloning: Do the dbuf_read() before undirtying the dbuf, as we
+        * want to make sure dbuf_read() will read the pending cloned block and
+        * not the underlying block that is being replaced. dbuf_undirty() will
+        * do dbuf_unoverride(), so we will end up with the cloned block's
+        * content, without the overridden BP.
+        */
        (void) dbuf_read(db, NULL, flags);
+       if (undirty) {
+               mutex_enter(&db->db_mtx);
+               VERIFY(!dbuf_undirty(db, tx));
+               mutex_exit(&db->db_mtx);
+       }
        (void) dbuf_dirty(db, tx);
 }
 
@@ -2617,18 +2705,52 @@ dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
        return (dr != NULL);
 }
 
+void
+dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+       /*
+        * Block cloning: We are going to clone into this block, so undirty
+        * modifications done to this block so far in this txg. This includes
+        * writes and clones into this block.
+        */
+       mutex_enter(&db->db_mtx);
+       DBUF_VERIFY(db);
+       VERIFY(!dbuf_undirty(db, tx));
+       ASSERT0P(dbuf_find_dirty_eq(db, tx->tx_txg));
+       if (db->db_buf != NULL) {
+               arc_buf_destroy(db->db_buf, db);
+               db->db_buf = NULL;
+               dbuf_clear_data(db);
+       }
+
+       db->db_state = DB_NOFILL;
+       DTRACE_SET_STATE(db, "allocating NOFILL buffer for clone");
+
+       DBUF_VERIFY(db);
+       mutex_exit(&db->db_mtx);
+
+       dbuf_noread(db);
+       (void) dbuf_dirty(db, tx);
+}
+
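
Note: for context, a hedged sketch of how a block-clone caller (approximately what dmu_brt_clone() in dmu.c does; only the fields this diff uses are shown) drives this entry point:

        dmu_buf_will_clone(dbuf, tx);           /* undirty, switch to NOFILL */

        mutex_enter(&db->db_mtx);
        dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
        dr->dt.dl.dr_overridden_by = *bp;       /* BP of the clone source */
        dr->dt.dl.dr_brtwrite = B_TRUE;
        dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
        mutex_exit(&db->db_mtx);

        brt_pending_add(spa, bp, tx);           /* account the new BRT ref */

If the clone is undone in the same txg, dbuf_undirty() reverses the BRT accounting with brt_pending_remove(), as the @@ -2506 hunk above shows.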
 void
 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
        dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
+       mutex_enter(&db->db_mtx);
        db->db_state = DB_NOFILL;
        DTRACE_SET_STATE(db, "allocating NOFILL buffer");
-       dmu_buf_will_fill(db_fake, tx);
+       mutex_exit(&db->db_mtx);
+
+       dbuf_noread(db);
+       (void) dbuf_dirty(db, tx);
 }
 
 void
-dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
+dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
 {
        dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
@@ -2640,6 +2762,25 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
        ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
            dmu_tx_private_ok(tx));
 
+       mutex_enter(&db->db_mtx);
+       if (db->db_state == DB_NOFILL) {
+               /*
+                * Block cloning: We will be completely overwriting a block
+                * cloned in this transaction group, so let's undirty the
+                * pending clone and mark the block as uncached. This will be
+                * as if the clone was never done.  But if the fill can fail
+                * we need a way to fall back to the cloned data.
+                */
+               if (canfail && dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) {
+                       mutex_exit(&db->db_mtx);
+                       dmu_buf_will_dirty(db_fake, tx);
+                       return;
+               }
+               VERIFY(!dbuf_undirty(db, tx));
+               db->db_state = DB_UNCACHED;
+       }
+       mutex_exit(&db->db_mtx);
+
        dbuf_noread(db);
        (void) dbuf_dirty(db, tx);
 }
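
Note: a hedged usage sketch of the new canfail contract (the copy helper is hypothetical): a fill whose data copy can fault is bracketed so that a failure rolls the dbuf back, preserving a pending clone instead of destroying it:

        dmu_buf_will_fill(dbuf, tx, B_TRUE);    /* B_TRUE: fill may fail */
        error = copy_in(src, dbuf->db_data, size);  /* hypothetical copy */
        if (dmu_buf_fill_done(dbuf, tx, error != 0)) {
                /* Failed fill was rolled back; retry or return the error. */
        }

dmu_buf_fill_done() returns B_TRUE only when a failure was actually honored; the freed-in-flight path in the next hunk deliberately clears it.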
@@ -2687,39 +2828,49 @@ dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx)
        dbuf_dirty_record_t *dr;
 
        dr = list_head(&db->db_dirty_records);
+       ASSERT3P(dr, !=, NULL);
        ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
        dl = &dr->dt.dl;
        dl->dr_overridden_by = *bp;
        dl->dr_override_state = DR_OVERRIDDEN;
-       dl->dr_overridden_by.blk_birth = dr->dr_txg;
+       BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);
 }
 
-void
-dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx)
+boolean_t
+dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)
 {
        (void) tx;
        dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
-       dbuf_states_t old_state;
        mutex_enter(&db->db_mtx);
        DBUF_VERIFY(db);
 
-       old_state = db->db_state;
-       db->db_state = DB_CACHED;
-       if (old_state == DB_FILL) {
+       if (db->db_state == DB_FILL) {
                if (db->db_level == 0 && db->db_freed_in_flight) {
                        ASSERT(db->db_blkid != DMU_BONUS_BLKID);
                        /* we were freed while filling */
                        /* XXX dbuf_undirty? */
                        memset(db->db.db_data, 0, db->db.db_size);
                        db->db_freed_in_flight = FALSE;
+                       db->db_state = DB_CACHED;
                        DTRACE_SET_STATE(db,
                            "fill done handling freed in flight");
+                       failed = B_FALSE;
+               } else if (failed) {
+                       VERIFY(!dbuf_undirty(db, tx));
+                       db->db_buf = NULL;
+                       dbuf_clear_data(db);
+                       DTRACE_SET_STATE(db, "fill failed");
                } else {
+                       db->db_state = DB_CACHED;
                        DTRACE_SET_STATE(db, "fill done");
                }
                cv_broadcast(&db->db_changed);
+       } else {
+               db->db_state = DB_CACHED;
+               failed = B_FALSE;
        }
        mutex_exit(&db->db_mtx);
+       return (failed);
 }
 
 void
@@ -2748,6 +2899,7 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
        dmu_buf_will_not_fill(dbuf, tx);
 
        dr = list_head(&db->db_dirty_records);
+       ASSERT3P(dr, !=, NULL);
        ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
        dl = &dr->dt.dl;
        encode_embedded_bp_compressed(&dl->dr_overridden_by,
@@ -2758,7 +2910,7 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
        BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
 
        dl->dr_override_state = DR_OVERRIDDEN;
-       dl->dr_overridden_by.blk_birth = dr->dr_txg;
+       BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);
 }
 
 void
@@ -2809,7 +2961,8 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
        while (db->db_state == DB_READ || db->db_state == DB_FILL)
                cv_wait(&db->db_changed, &db->db_mtx);
 
-       ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
+       ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED ||
+           db->db_state == DB_NOFILL);
 
        if (db->db_state == DB_CACHED &&
            zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
@@ -2846,6 +2999,15 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
                        arc_buf_destroy(db->db_buf, db);
                }
                db->db_buf = NULL;
+       } else if (db->db_state == DB_NOFILL) {
+               /*
+                * We will be completely replacing the cloned block.  In case
+                * it was cloned in this transaction group, let's undirty the
+                * pending clone and mark the block as uncached. This will be
+                * as if the clone was never done.
+                */
+               VERIFY(!dbuf_undirty(db, tx));
+               db->db_state = DB_UNCACHED;
        }
        ASSERT(db->db_buf == NULL);
        dbuf_set_data(db, buf);
@@ -2853,7 +3015,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
        DTRACE_SET_STATE(db, "filling assigned arcbuf");
        mutex_exit(&db->db_mtx);
        (void) dbuf_dirty(db, tx);
-       dmu_buf_fill_done(&db->db, tx);
+       dmu_buf_fill_done(&db->db, tx, B_FALSE);
 }
 
 void
@@ -2889,6 +3051,8 @@ dbuf_destroy(dmu_buf_impl_t *db)
                    db->db_caching_status == DB_DBUF_METADATA_CACHE);
 
                multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
+
+               ASSERT0(dmu_buf_user_size(&db->db));
                (void) zfs_refcount_remove_many(
                    &dbuf_caches[db->db_caching_status].size,
                    db->db.db_size, db);
@@ -3073,7 +3237,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
 
 static dmu_buf_impl_t *
 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
-    dmu_buf_impl_t *parent, blkptr_t *blkptr)
+    dmu_buf_impl_t *parent, blkptr_t *blkptr, uint64_t hash)
 {
        objset_t *os = dn->dn_objset;
        dmu_buf_impl_t *db, *odb;
@@ -3094,6 +3258,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
        db->db_dnode_handle = dn->dn_handle;
        db->db_parent = parent;
        db->db_blkptr = blkptr;
+       db->db_hash = hash;
 
        db->db_user = NULL;
        db->db_user_immediate_evict = FALSE;
@@ -3177,6 +3342,7 @@ dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid,
 
        err = dbuf_findbp(dn, level, blkid, B_FALSE, &dbp, &bp2);
        if (err == 0) {
+               ASSERT3P(bp2, !=, NULL);
                *bp = *bp2;
                if (dbp != NULL)
                        dbuf_rele(dbp, NULL);
@@ -3322,10 +3488,10 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
        blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
            P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
 
-       ASSERT(!BP_IS_REDACTED(bp) ||
+       ASSERT(!BP_IS_REDACTED(bp) || (dpa->dpa_dnode &&
            dsl_dataset_feature_is_active(
            dpa->dpa_dnode->dn_objset->os_dsl_dataset,
-           SPA_FEATURE_REDACTED_DATASETS));
+           SPA_FEATURE_REDACTED_DATASETS)));
        if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {
                arc_buf_destroy(abuf, private);
                dbuf_prefetch_fini(dpa, B_TRUE);
@@ -3394,7 +3560,7 @@ dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid,
                goto no_issue;
 
        dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
-           level, blkid);
+           level, blkid, NULL);
        if (db != NULL) {
                mutex_exit(&db->db_mtx);
                /*
@@ -3458,8 +3624,9 @@ dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid,
        dpa->dpa_cb = cb;
        dpa->dpa_arg = arg;
 
-       /* flag if L2ARC eligible, l2arc_noprefetch then decides */
-       if (dnode_level_is_l2cacheable(&bp, dn, level))
+       if (!DNODE_LEVEL_IS_CACHEABLE(dn, level))
+               dpa->dpa_aflags |= ARC_FLAG_UNCACHED;
+       else if (dnode_level_is_l2cacheable(&bp, dn, level))
                dpa->dpa_aflags |= ARC_FLAG_L2CACHE;
 
        /*
@@ -3559,6 +3726,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
     const void *tag, dmu_buf_impl_t **dbp)
 {
        dmu_buf_impl_t *db, *parent = NULL;
+       uint64_t hv;
 
        /* If the pool has been created, verify the tx_sync_lock is not held */
        spa_t *spa = dn->dn_objset->os_spa;
@@ -3574,7 +3742,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
        *dbp = NULL;
 
        /* dbuf_find() returns with db_mtx held */
-       db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);
+       db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid, &hv);
 
        if (db == NULL) {
                blkptr_t *bp = NULL;
@@ -3596,7 +3764,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
                }
                if (err && err != ENOENT)
                        return (err);
-               db = dbuf_create(dn, level, blkid, parent, bp);
+               db = dbuf_create(dn, level, blkid, parent, bp, hv);
        }
 
        if (fail_uncached && db->db_state != DB_CACHED) {
@@ -3620,8 +3788,10 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
            dn->dn_object != DMU_META_DNODE_OBJECT &&
            db->db_state == DB_CACHED && db->db_data_pending) {
                dbuf_dirty_record_t *dr = db->db_data_pending;
-               if (dr->dt.dl.dr_data == db->db_buf)
+               if (dr->dt.dl.dr_data == db->db_buf) {
+                       ASSERT3P(db->db_buf, !=, NULL);
                        dbuf_hold_copy(dn, db);
+               }
        }
 
        if (multilist_link_active(&db->db_cache_link)) {
@@ -3630,17 +3800,17 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
                    db->db_caching_status == DB_DBUF_METADATA_CACHE);
 
                multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
+
+               uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db);
                (void) zfs_refcount_remove_many(
-                   &dbuf_caches[db->db_caching_status].size,
-                   db->db.db_size, db);
+                   &dbuf_caches[db->db_caching_status].size, size, db);
 
                if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
                        DBUF_STAT_BUMPDOWN(metadata_cache_count);
                } else {
                        DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
                        DBUF_STAT_BUMPDOWN(cache_count);
-                       DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
-                           db->db.db_size);
+                       DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
                }
                db->db_caching_status = DB_NO_CACHE;
        }
@@ -3680,7 +3850,8 @@ dbuf_create_bonus(dnode_t *dn)
        ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 
        ASSERT(dn->dn_bonus == NULL);
-       dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
+       dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL,
+           dbuf_hash(dn->dn_objset, dn->dn_object, 0, DMU_BONUS_BLKID));
 }
 
 int
@@ -3726,7 +3897,7 @@ dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
        if (blkid == DMU_BONUS_BLKID)
                found_db = dbuf_find_bonus(os, obj);
        else
-               found_db = dbuf_find(os, obj, 0, blkid);
+               found_db = dbuf_find(os, obj, 0, blkid, NULL);
 
        if (found_db != NULL) {
                if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
@@ -3846,59 +4017,39 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting)
                         * This dbuf has anonymous data associated with it.
                         */
                        dbuf_destroy(db);
-               } else {
-                       boolean_t do_arc_evict = B_FALSE;
-                       blkptr_t bp;
-                       spa_t *spa = dmu_objset_spa(db->db_objset);
-
-                       if (!DBUF_IS_CACHEABLE(db) &&
-                           db->db_blkptr != NULL &&
-                           !BP_IS_HOLE(db->db_blkptr) &&
-                           !BP_IS_EMBEDDED(db->db_blkptr)) {
-                               do_arc_evict = B_TRUE;
-                               bp = *db->db_blkptr;
-                       }
-
-                       if (!DBUF_IS_CACHEABLE(db) ||
-                           db->db_pending_evict) {
-                               dbuf_destroy(db);
-                       } else if (!multilist_link_active(&db->db_cache_link)) {
-                               ASSERT3U(db->db_caching_status, ==,
-                                   DB_NO_CACHE);
-
-                               dbuf_cached_state_t dcs =
-                                   dbuf_include_in_metadata_cache(db) ?
-                                   DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
-                               db->db_caching_status = dcs;
-
-                               multilist_insert(&dbuf_caches[dcs].cache, db);
-                               uint64_t db_size = db->db.db_size;
-                               size = zfs_refcount_add_many(
-                                   &dbuf_caches[dcs].size, db_size, db);
-                               uint8_t db_level = db->db_level;
-                               mutex_exit(&db->db_mtx);
-
-                               if (dcs == DB_DBUF_METADATA_CACHE) {
-                                       DBUF_STAT_BUMP(metadata_cache_count);
-                                       DBUF_STAT_MAX(
-                                           metadata_cache_size_bytes_max,
-                                           size);
-                               } else {
-                                       DBUF_STAT_BUMP(cache_count);
-                                       DBUF_STAT_MAX(cache_size_bytes_max,
-                                           size);
-                                       DBUF_STAT_BUMP(cache_levels[db_level]);
-                                       DBUF_STAT_INCR(
-                                           cache_levels_bytes[db_level],
-                                           db_size);
-                               }
+               } else if (!(DBUF_IS_CACHEABLE(db) || db->db_partial_read) ||
+                   db->db_pending_evict) {
+                       dbuf_destroy(db);
+               } else if (!multilist_link_active(&db->db_cache_link)) {
+                       ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
+
+                       dbuf_cached_state_t dcs =
+                           dbuf_include_in_metadata_cache(db) ?
+                           DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
+                       db->db_caching_status = dcs;
+
+                       multilist_insert(&dbuf_caches[dcs].cache, db);
+                       uint64_t db_size = db->db.db_size +
+                           dmu_buf_user_size(&db->db);
+                       size = zfs_refcount_add_many(
+                           &dbuf_caches[dcs].size, db_size, db);
+                       uint8_t db_level = db->db_level;
+                       mutex_exit(&db->db_mtx);
 
-                               if (dcs == DB_DBUF_CACHE && !evicting)
-                                       dbuf_evict_notify(size);
+                       if (dcs == DB_DBUF_METADATA_CACHE) {
+                               DBUF_STAT_BUMP(metadata_cache_count);
+                               DBUF_STAT_MAX(metadata_cache_size_bytes_max,
+                                   size);
+                       } else {
+                               DBUF_STAT_BUMP(cache_count);
+                               DBUF_STAT_MAX(cache_size_bytes_max, size);
+                               DBUF_STAT_BUMP(cache_levels[db_level]);
+                               DBUF_STAT_INCR(cache_levels_bytes[db_level],
+                                   db_size);
                        }
 
-                       if (do_arc_evict)
-                               arc_freed(spa, &bp);
+                       if (dcs == DB_DBUF_CACHE && !evicting)
+                               dbuf_evict_notify(size);
                }
        } else {
                mutex_exit(&db->db_mtx);
@@ -3975,6 +4126,35 @@ dmu_buf_get_user(dmu_buf_t *db_fake)
        return (db->db_user);
 }
 
+uint64_t
+dmu_buf_user_size(dmu_buf_t *db_fake)
+{
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+       if (db->db_user == NULL)
+               return (0);
+       return (atomic_load_64(&db->db_user->dbu_size));
+}
+
+void
+dmu_buf_add_user_size(dmu_buf_t *db_fake, uint64_t nadd)
+{
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+       ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
+       ASSERT3P(db->db_user, !=, NULL);
+       ASSERT3U(atomic_load_64(&db->db_user->dbu_size), <, UINT64_MAX - nadd);
+       atomic_add_64(&db->db_user->dbu_size, nadd);
+}
+
+void
+dmu_buf_sub_user_size(dmu_buf_t *db_fake, uint64_t nsub)
+{
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+       ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
+       ASSERT3P(db->db_user, !=, NULL);
+       ASSERT3U(atomic_load_64(&db->db_user->dbu_size), >=, nsub);
+       atomic_sub_64(&db->db_user->dbu_size, nsub);
+}
+
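
Note: a hedged sketch of a consumer using this accounting (all names hypothetical); the user's decoded data is charged to the dbuf cache alongside db_size:

        my_user_t *u = kmem_zalloc(sizeof (*u), KM_SLEEP);
        dmu_buf_init_user(&u->mu_dbu, my_evict_sync, NULL, &u->mu_db);
        if (dmu_buf_set_user(db, &u->mu_dbu) == NULL) {
                /* We attached the user; charge its size to the cache. */
                dmu_buf_add_user_size(db, my_decoded_size(u));
        }

dbuf_evict_user() subtracts dbu_size from the cache accounting when the user is detached, as the hunk near the top of this diff shows.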
 void
 dmu_buf_user_evict_wait(void)
 {
@@ -3995,21 +4175,6 @@ dmu_buf_get_objset(dmu_buf_t *db)
        return (dbi->db_objset);
 }
 
-dnode_t *
-dmu_buf_dnode_enter(dmu_buf_t *db)
-{
-       dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
-       DB_DNODE_ENTER(dbi);
-       return (DB_DNODE(dbi));
-}
-
-void
-dmu_buf_dnode_exit(dmu_buf_t *db)
-{
-       dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
-       DB_DNODE_EXIT(dbi);
-}
-
 static void
 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
 {
@@ -4270,22 +4435,6 @@ dbuf_lightweight_ready(zio_t *zio)
        rw_exit(&parent_db->db_rwlock);
 }
 
-static void
-dbuf_lightweight_physdone(zio_t *zio)
-{
-       dbuf_dirty_record_t *dr = zio->io_private;
-       dsl_pool_t *dp = spa_get_dsl(zio->io_spa);
-       ASSERT3U(dr->dr_txg, ==, zio->io_txg);
-
-       /*
-        * The callback will be called io_phys_children times.  Retire one
-        * portion of our dirty space each time we are called.  Any rounding
-        * error will be cleaned up by dbuf_lightweight_done().
-        */
-       int delta = dr->dr_accounted / zio->io_phys_children;
-       dsl_pool_undirty_space(dp, delta, zio->io_txg);
-}
-
 static void
 dbuf_lightweight_done(zio_t *zio)
 {
@@ -4304,16 +4453,8 @@ dbuf_lightweight_done(zio_t *zio)
                dsl_dataset_block_born(ds, zio->io_bp, tx);
        }
 
-       /*
-        * See comment in dbuf_write_done().
-        */
-       if (zio->io_phys_children == 0) {
-               dsl_pool_undirty_space(dmu_objset_pool(os),
-                   dr->dr_accounted, zio->io_txg);
-       } else {
-               dsl_pool_undirty_space(dmu_objset_pool(os),
-                   dr->dr_accounted % zio->io_phys_children, zio->io_txg);
-       }
+       dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
+           zio->io_txg);
 
        abd_free(dr->dt.dll.dr_abd);
        kmem_free(dr, sizeof (*dr));
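
Note: the deleted physdone callback retired dirty space incrementally as each physical child i/o completed; the done callback now retires it in one step. A worked contrast (numbers illustrative):

        /*
         * Old scheme, dr_accounted = 131072 and io_phys_children = 3:
         * each physdone call retired 131072 / 3 = 43690 bytes, and the
         * done callback cleaned up the remainder, 131072 % 3 = 2 bytes.
         * New scheme: one call retires all 131072 bytes.
         */
        dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
            zio->io_txg);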
@@ -4347,8 +4488,7 @@ dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
            dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd,
            dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd),
            &dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL,
-           dbuf_lightweight_physdone, dbuf_lightweight_done, dr,
-           ZIO_PRIORITY_ASYNC_WRITE,
+           dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE,
            ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);
 
        zio_nowait(dr->dr_zio);
@@ -4383,6 +4523,15 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
        } else if (db->db_state == DB_FILL) {
                /* This buffer was freed and is now being re-filled */
                ASSERT(db->db.db_data != dr->dt.dl.dr_data);
+       } else if (db->db_state == DB_READ) {
+               /*
+                * This buffer has a clone we need to write, and an in-flight
+                * read on the BP we're about to clone. Its safe to issue the
+                * read on the BP we're about to clone. It's safe to issue the
+                * contents won't change.
+                */
+               ASSERT(dr->dt.dl.dr_brtwrite &&
+                   dr->dt.dl.dr_override_state == DR_OVERRIDDEN);
        } else {
                ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
        }
@@ -4439,7 +4588,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
        while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
                ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
                cv_wait(&db->db_changed, &db->db_mtx);
-               ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
        }
 
        /*
@@ -4505,6 +4653,10 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
        }
 }
 
+/*
+ * Syncs out a range of dirty records for indirect or leaf dbufs.  May be
+ * called recursively from dbuf_sync_indirect().
+ */
 void
 dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
 {
@@ -4561,7 +4713,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
        dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
        zio->io_prev_space_delta = delta;
 
-       if (bp->blk_birth != 0) {
+       if (BP_GET_LOGICAL_BIRTH(bp) != 0) {
                ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
                    BP_GET_TYPE(bp) == dn->dn_type) ||
                    (db->db_blkid == DMU_SPILL_BLKID &&
@@ -4598,6 +4750,20 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
                                i += DNODE_MIN_SIZE;
                                if (dnp->dn_type != DMU_OT_NONE) {
                                        fill++;
+                                       for (int j = 0; j < dnp->dn_nblkptr;
+                                           j++) {
+                                               (void) zfs_blkptr_verify(spa,
+                                                   &dnp->dn_blkptr[j],
+                                                   BLK_CONFIG_SKIP,
+                                                   BLK_VERIFY_HALT);
+                                       }
+                                       if (dnp->dn_flags &
+                                           DNODE_FLAG_SPILL_BLKPTR) {
+                                               (void) zfs_blkptr_verify(spa,
+                                                   DN_SPILL_BLKPTR(dnp),
+                                                   BLK_CONFIG_SKIP,
+                                                   BLK_VERIFY_HALT);
+                                       }
                                        i += dnp->dn_extra_slots *
                                            DNODE_MIN_SIZE;
                                }
@@ -4615,6 +4781,8 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
                for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
                        if (BP_IS_HOLE(ibp))
                                continue;
+                       (void) zfs_blkptr_verify(spa, ibp,
+                           BLK_CONFIG_SKIP, BLK_VERIFY_HALT);
                        fill += BP_GET_FILL(ibp);
                }
        }
@@ -4675,37 +4843,6 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
        DB_DNODE_EXIT(db);
 }
 
-/*
- * The SPA will call this callback several times for each zio - once
- * for every physical child i/o (zio->io_phys_children times).  This
- * allows the DMU to monitor the progress of each logical i/o.  For example,
- * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
- * block.  There may be a long delay before all copies/fragments are completed,
- * so this callback allows us to retire dirty space gradually, as the physical
- * i/os complete.
- */
-static void
-dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
-{
-       (void) buf;
-       dmu_buf_impl_t *db = arg;
-       objset_t *os = db->db_objset;
-       dsl_pool_t *dp = dmu_objset_pool(os);
-       dbuf_dirty_record_t *dr;
-       int delta = 0;
-
-       dr = db->db_data_pending;
-       ASSERT3U(dr->dr_txg, ==, zio->io_txg);
-
-       /*
-        * The callback will be called io_phys_children times.  Retire one
-        * portion of our dirty space each time we are called.  Any rounding
-        * error will be cleaned up by dbuf_write_done().
-        */
-       delta = dr->dr_accounted / zio->io_phys_children;
-       dsl_pool_undirty_space(dp, delta, zio->io_txg);
-}
-
 static void
 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
@@ -4754,8 +4891,10 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
                ASSERT(db->db_blkid != DMU_BONUS_BLKID);
                ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
                if (db->db_state != DB_NOFILL) {
-                       if (dr->dt.dl.dr_data != db->db_buf)
+                       if (dr->dt.dl.dr_data != NULL &&
+                           dr->dt.dl.dr_data != db->db_buf) {
                                arc_buf_destroy(dr->dt.dl.dr_data, db);
+                       }
                }
        } else {
                ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
@@ -4778,27 +4917,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
        db->db_data_pending = NULL;
        dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
 
-       /*
-        * If we didn't do a physical write in this ZIO and we
-        * still ended up here, it means that the space of the
-        * dbuf that we just released (and undirtied) above hasn't
-        * been marked as undirtied in the pool's accounting.
-        *
-        * Thus, we undirty that space in the pool's view of the
-        * world here. For physical writes this type of update
-        * happens in dbuf_write_physdone().
-        *
-        * If we did a physical write, cleanup any rounding errors
-        * that came up due to writing multiple copies of a block
-        * on disk [see dbuf_write_physdone()].
-        */
-       if (zio->io_phys_children == 0) {
-               dsl_pool_undirty_space(dmu_objset_pool(os),
-                   dr->dr_accounted, zio->io_txg);
-       } else {
-               dsl_pool_undirty_space(dmu_objset_pool(os),
-                   dr->dr_accounted % zio->io_phys_children, zio->io_txg);
-       }
+       dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
+           zio->io_txg);
 
        kmem_free(dr, sizeof (dbuf_dirty_record_t));
 }
@@ -4880,7 +5000,7 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
        ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
 
        drica.drica_os = dn->dn_objset;
-       drica.drica_blk_birth = bp->blk_birth;
+       drica.drica_blk_birth = BP_GET_LOGICAL_BIRTH(bp);
        drica.drica_tx = tx;
        if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
            &drica)) {
@@ -4895,7 +5015,8 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
                if (dn->dn_objset != spa_meta_objset(spa)) {
                        dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset);
                        if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
-                           bp->blk_birth > ds->ds_dir->dd_origin_txg) {
+                           BP_GET_LOGICAL_BIRTH(bp) >
+                           ds->ds_dir->dd_origin_txg) {
                                ASSERT(!BP_IS_EMBEDDED(bp));
                                ASSERT(dsl_dir_is_clone(ds->ds_dir));
                                ASSERT(spa_feature_is_enabled(spa,
@@ -4955,7 +5076,10 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
 }
 
 
-/* Issue I/O to commit a dirty buffer to disk. */
+/*
+ * Populate dr->dr_zio with a zio to commit a dirty buffer to disk.
+ * The caller is responsible for issuing zio_[no]wait() on dr->dr_zio.
+ */
 static void
 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 {
@@ -5014,7 +5138,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
        }
 
        ASSERT(db->db_level == 0 || data == db->db_buf);
-       ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
+       ASSERT3U(BP_GET_LOGICAL_BIRTH(db->db_blkptr), <=, txg);
        ASSERT(pio);
 
        SET_BOOKMARK(&zb, os->os_dsl_dataset ?
@@ -5046,20 +5170,21 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 
                dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy,
                    contents, db->db.db_size, db->db.db_size, &zp,
-                   dbuf_write_override_ready, NULL, NULL,
+                   dbuf_write_override_ready, NULL,
                    dbuf_write_override_done,
                    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
                mutex_enter(&db->db_mtx);
                dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
                zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
-                   dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
+                   dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite,
+                   dr->dt.dl.dr_brtwrite);
                mutex_exit(&db->db_mtx);
        } else if (db->db_state == DB_NOFILL) {
                ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
                    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
                dr->dr_zio = zio_write(pio, os->os_spa, txg,
                    &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
-                   dbuf_write_nofill_ready, NULL, NULL,
+                   dbuf_write_nofill_ready, NULL,
                    dbuf_write_nofill_done, db,
                    ZIO_PRIORITY_ASYNC_WRITE,
                    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
@@ -5076,11 +5201,10 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
                        children_ready_cb = dbuf_write_children_ready;
 
                dr->dr_zio = arc_write(pio, os->os_spa, txg,
-                   &dr->dr_bp_copy, data, dbuf_is_l2cacheable(db),
-                   &zp, dbuf_write_ready,
-                   children_ready_cb, dbuf_write_physdone,
-                   dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
-                   ZIO_FLAG_MUSTSUCCEED, &zb);
+                   &dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db),
+                   dbuf_is_l2cacheable(db), &zp, dbuf_write_ready,
+                   children_ready_cb, dbuf_write_done, db,
+                   ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
        }
 }
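
Note: for reference, the shape of the arc_write() call after this change, with the two boolean arguments annotated (a paraphrase of the hunk above, not a new API claim):

        dr->dr_zio = arc_write(pio, os->os_spa, txg, &dr->dr_bp_copy, data,
            !DBUF_IS_CACHEABLE(db),     /* uncached: drop from ARC when done */
            dbuf_is_l2cacheable(db),    /* eligible for L2ARC */
            &zp, dbuf_write_ready, children_ready_cb, dbuf_write_done, db,
            ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);

The physdone callback slot is gone from both arc_write() and zio_write(), matching the accounting simplification in dbuf_write_done() above.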
 
@@ -5098,6 +5222,7 @@ EXPORT_SYMBOL(dbuf_dirty);
 EXPORT_SYMBOL(dmu_buf_set_crypt_params);
 EXPORT_SYMBOL(dmu_buf_will_dirty);
 EXPORT_SYMBOL(dmu_buf_is_dirty);
+EXPORT_SYMBOL(dmu_buf_will_clone);
 EXPORT_SYMBOL(dmu_buf_will_not_fill);
 EXPORT_SYMBOL(dmu_buf_will_fill);
 EXPORT_SYMBOL(dmu_buf_fill_done);
@@ -5120,7 +5245,7 @@ EXPORT_SYMBOL(dmu_buf_set_user_ie);
 EXPORT_SYMBOL(dmu_buf_get_user);
 EXPORT_SYMBOL(dmu_buf_get_blkptr);
 
-ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, U64, ZMOD_RW,
        "Maximum size in bytes of the dbuf cache.");
 
 ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW,
@@ -5129,7 +5254,7 @@ ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, lowater_pct, UINT, ZMOD_RW,
        "Percentage below dbuf_cache_max_bytes when dbuf eviction stops.");
 
-ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, U64, ZMOD_RW,
        "Maximum size in bytes of dbuf metadata cache.");
 
 ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, UINT, ZMOD_RW,