Provide macros for setting and getting blkptr birth times
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c
index be6a76830af394de43488d0de8bea4809b84f79b..4e190c131e1dde2f9977573071d597f11b9c1b28 100644
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -6,7 +6,7 @@
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
- * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
  */
 
 #include <sys/zfs_context.h>
 #include <sys/trace_zfs.h>
 #include <sys/callb.h>
 #include <sys/abd.h>
+#include <sys/brt.h>
 #include <sys/vdev.h>
 #include <cityhash.h>
 #include <sys/spa_impl.h>
+#include <sys/wmsum.h>
+#include <sys/vdev_impl.h>
 
-kstat_t *dbuf_ksp;
+static kstat_t *dbuf_ksp;
 
 typedef struct dbuf_stats {
        /*
@@ -95,6 +101,11 @@ typedef struct dbuf_stats {
         * already created and in the dbuf hash table.
         */
        kstat_named_t hash_insert_race;
+       /*
+        * Number of entries in the hash table dbuf and mutex arrays.
+        */
+       kstat_named_t hash_table_count;
+       kstat_named_t hash_mutex_count;
        /*
         * Statistics about the size of the metadata dbuf cache.
         */
@@ -127,14 +138,30 @@ dbuf_stats_t dbuf_stats = {
        { "hash_chains",                        KSTAT_DATA_UINT64 },
        { "hash_chain_max",                     KSTAT_DATA_UINT64 },
        { "hash_insert_race",                   KSTAT_DATA_UINT64 },
+       { "hash_table_count",                   KSTAT_DATA_UINT64 },
+       { "hash_mutex_count",                   KSTAT_DATA_UINT64 },
        { "metadata_cache_count",               KSTAT_DATA_UINT64 },
        { "metadata_cache_size_bytes",          KSTAT_DATA_UINT64 },
        { "metadata_cache_size_bytes_max",      KSTAT_DATA_UINT64 },
        { "metadata_cache_overflow",            KSTAT_DATA_UINT64 }
 };
 
+struct {
+       wmsum_t cache_count;
+       wmsum_t cache_total_evicts;
+       wmsum_t cache_levels[DN_MAX_LEVELS];
+       wmsum_t cache_levels_bytes[DN_MAX_LEVELS];
+       wmsum_t hash_hits;
+       wmsum_t hash_misses;
+       wmsum_t hash_collisions;
+       wmsum_t hash_chains;
+       wmsum_t hash_insert_race;
+       wmsum_t metadata_cache_count;
+       wmsum_t metadata_cache_overflow;
+} dbuf_sums;
+
 #define        DBUF_STAT_INCR(stat, val)       \
-       atomic_add_64(&dbuf_stats.stat.value.ui64, (val));
+       wmsum_add(&dbuf_sums.stat, val);
 #define        DBUF_STAT_DECR(stat, val)       \
        DBUF_STAT_INCR(stat, -(val));
 #define        DBUF_STAT_BUMP(stat)            \
@@ -148,16 +175,10 @@ dbuf_stats_t dbuf_stats = {
                continue;                                               \
 }
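
The DBUF_STAT_* macros above now feed wmsum counters rather than a shared
atomic kstat word, so hot-path updates stay cheap and aggregation happens
only when the kstat is read (see dbuf_kstat_update() further down). A
minimal sketch of that pattern, using illustrative example_* names that
are not part of this file:

	#include <sys/wmsum.h>

	static wmsum_t example_hits;

	static void
	example_setup(void)
	{
		wmsum_init(&example_hits, 0);	/* counter starts at zero */
	}

	static void
	example_hot_path(void)
	{
		/* Cheap, low-contention update on every hit. */
		wmsum_add(&example_hits, 1);
	}

	static uint64_t
	example_kstat_read(void)
	{
		/* Aggregation is deferred to the rare read side. */
		return (wmsum_value(&example_hits));
	}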
 
-static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
 static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
 static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags);
 
-extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
-    dmu_buf_evict_func_t *evict_func_sync,
-    dmu_buf_evict_func_t *evict_func_async,
-    dmu_buf_t **clear_on_evict_dbufp);
-
 /*
  * Global data structures and functions for the dbuf cache.
  */
@@ -201,17 +222,24 @@ static boolean_t dbuf_evict_thread_exit;
  * by those caches' matching enum values (from dbuf_cached_state_t).
  */
 typedef struct dbuf_cache {
-       multilist_t *cache;
-       zfs_refcount_t size;
+       multilist_t cache;
+       zfs_refcount_t size ____cacheline_aligned;
 } dbuf_cache_t;
 dbuf_cache_t dbuf_caches[DB_CACHE_MAX];
 
 /* Size limits for the caches */
-unsigned long dbuf_cache_max_bytes = 0;
-unsigned long dbuf_metadata_cache_max_bytes = 0;
+static uint64_t dbuf_cache_max_bytes = UINT64_MAX;
+static uint64_t dbuf_metadata_cache_max_bytes = UINT64_MAX;
+
 /* Set the default sizes of the caches to log2 fraction of arc size */
-int dbuf_cache_shift = 5;
-int dbuf_metadata_cache_shift = 6;
+static uint_t dbuf_cache_shift = 5;
+static uint_t dbuf_metadata_cache_shift = 6;
+
+/* Set the dbuf hash mutex count as log2 shift (dynamic by default) */
+static uint_t dbuf_mutex_cache_shift = 0;
+
+static unsigned long dbuf_cache_target_bytes(void);
+static unsigned long dbuf_metadata_cache_target_bytes(void);
 
 /*
  * The LRU dbuf cache uses a three-stage eviction policy:
@@ -255,15 +283,15 @@ int dbuf_metadata_cache_shift = 6;
 /*
  * The percentage above and below the maximum cache size.
  */
-uint_t dbuf_cache_hiwater_pct = 10;
-uint_t dbuf_cache_lowater_pct = 10;
+static uint_t dbuf_cache_hiwater_pct = 10;
+static uint_t dbuf_cache_lowater_pct = 10;
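
As a worked example of these bands, assuming a 128 MiB cache target and the
default 10 percent settings: callers begin evicting synchronously once the
cache exceeds about 141 MiB (target plus 10%), and the eviction thread then
works the cache back down to about 115 MiB (target minus 10%).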
 
-/* ARGSUSED */
 static int
 dbuf_cons(void *vdb, void *unused, int kmflag)
 {
+       (void) unused, (void) kmflag;
        dmu_buf_impl_t *db = vdb;
-       bzero(db, sizeof (dmu_buf_impl_t));
+       memset(db, 0, sizeof (dmu_buf_impl_t));
 
        mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
        rw_init(&db->db_rwlock, NULL, RW_DEFAULT, NULL);
@@ -274,10 +302,10 @@ dbuf_cons(void *vdb, void *unused, int kmflag)
        return (0);
 }
 
-/* ARGSUSED */
 static void
 dbuf_dest(void *vdb, void *unused)
 {
+       (void) unused;
        dmu_buf_impl_t *db = vdb;
        mutex_destroy(&db->db_mtx);
        rw_destroy(&db->db_rwlock);
@@ -291,8 +319,6 @@ dbuf_dest(void *vdb, void *unused)
  */
 static dbuf_hash_table_t dbuf_hash_table;
 
-static uint64_t dbuf_hash_count;
-
 /*
  * We use Cityhash for this. It's fast, and has good hash properties without
  * requiring any large static buffers.
@@ -314,7 +340,8 @@ dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
        (dbuf)->db_blkid == (blkid))
 
 dmu_buf_impl_t *
-dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
+dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid,
+    uint64_t *hash_out)
 {
        dbuf_hash_table_t *h = &dbuf_hash_table;
        uint64_t hv;
@@ -336,6 +363,8 @@ dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
                }
        }
        mutex_exit(DBUF_HASH_MUTEX(h, idx));
+       if (hash_out != NULL)
+               *hash_out = hv;
        return (NULL);
 }
 
@@ -370,13 +399,13 @@ dbuf_hash_insert(dmu_buf_impl_t *db)
        objset_t *os = db->db_objset;
        uint64_t obj = db->db.db_object;
        int level = db->db_level;
-       uint64_t blkid, hv, idx;
+       uint64_t blkid, idx;
        dmu_buf_impl_t *dbf;
        uint32_t i;
 
        blkid = db->db_blkid;
-       hv = dbuf_hash(os, obj, level, blkid);
-       idx = hv & h->hash_table_mask;
+       ASSERT3U(dbuf_hash(os, obj, level, blkid), ==, db->db_hash);
+       idx = db->db_hash & h->hash_table_mask;
 
        mutex_enter(DBUF_HASH_MUTEX(h, idx));
        for (dbf = h->hash_table[idx], i = 0; dbf != NULL;
@@ -403,8 +432,8 @@ dbuf_hash_insert(dmu_buf_impl_t *db)
        db->db_hash_next = h->hash_table[idx];
        h->hash_table[idx] = db;
        mutex_exit(DBUF_HASH_MUTEX(h, idx));
-       atomic_inc_64(&dbuf_hash_count);
-       DBUF_STAT_MAX(hash_elements_max, dbuf_hash_count);
+       uint64_t he = atomic_inc_64_nv(&dbuf_stats.hash_elements.value.ui64);
+       DBUF_STAT_MAX(hash_elements_max, he);
 
        return (NULL);
 }
@@ -432,7 +461,7 @@ dbuf_include_in_metadata_cache(dmu_buf_impl_t *db)
                 */
                if (zfs_refcount_count(
                    &dbuf_caches[DB_DBUF_METADATA_CACHE].size) >
-                   dbuf_metadata_cache_max_bytes) {
+                   dbuf_metadata_cache_target_bytes()) {
                        DBUF_STAT_BUMP(metadata_cache_overflow);
                        return (B_FALSE);
                }
@@ -450,12 +479,12 @@ static void
 dbuf_hash_remove(dmu_buf_impl_t *db)
 {
        dbuf_hash_table_t *h = &dbuf_hash_table;
-       uint64_t hv, idx;
+       uint64_t idx;
        dmu_buf_impl_t *dbf, **dbp;
 
-       hv = dbuf_hash(db->db_objset, db->db.db_object,
-           db->db_level, db->db_blkid);
-       idx = hv & h->hash_table_mask;
+       ASSERT3U(dbuf_hash(db->db_objset, db->db.db_object, db->db_level,
+           db->db_blkid), ==, db->db_hash);
+       idx = db->db_hash & h->hash_table_mask;
 
        /*
         * We mustn't hold db_mtx to maintain lock ordering:
@@ -477,7 +506,7 @@ dbuf_hash_remove(dmu_buf_impl_t *db)
            h->hash_table[idx]->db_hash_next == NULL)
                DBUF_STAT_BUMPDOWN(hash_chains);
        mutex_exit(DBUF_HASH_MUTEX(h, idx));
-       atomic_dec_64(&dbuf_hash_count);
+       atomic_dec_64(&dbuf_stats.hash_elements.value.ui64);
 }
 
 typedef enum {
@@ -540,6 +569,21 @@ dbuf_evict_user(dmu_buf_impl_t *db)
                *dbu->dbu_clear_on_evict_dbufp = NULL;
 #endif
 
+       if (db->db_caching_status != DB_NO_CACHE) {
+               /*
+                * This is a cached dbuf, so the size of the user data is
+                * included in its cached amount. We adjust it here because the
+                * user data has already been detached from the dbuf, and the
+                * sync functions are not supposed to touch it (the dbuf might
+        * not exist anymore by the time the sync functions run).
+                */
+               uint64_t size = dbu->dbu_size;
+               (void) zfs_refcount_remove_many(
+                   &dbuf_caches[db->db_caching_status].size, size, db);
+               if (db->db_caching_status == DB_DBUF_CACHE)
+                       DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
+       }
+
        /*
         * There are two eviction callbacks - one that we call synchronously
         * and one that we invoke via a taskq.  The async one is useful for
@@ -580,6 +624,68 @@ dbuf_is_metadata(dmu_buf_impl_t *db)
        }
 }
 
+/*
+ * We want to exclude buffers that are on a special allocation class from
+ * L2ARC.
+ */
+boolean_t
+dbuf_is_l2cacheable(dmu_buf_impl_t *db)
+{
+       if (db->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||
+           (db->db_objset->os_secondary_cache ==
+           ZFS_CACHE_METADATA && dbuf_is_metadata(db))) {
+               if (l2arc_exclude_special == 0)
+                       return (B_TRUE);
+
+               blkptr_t *bp = db->db_blkptr;
+               if (bp == NULL || BP_IS_HOLE(bp))
+                       return (B_FALSE);
+               uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);
+               vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev;
+               vdev_t *vd = NULL;
+
+               if (vdev < rvd->vdev_children)
+                       vd = rvd->vdev_child[vdev];
+
+               if (vd == NULL)
+                       return (B_TRUE);
+
+               if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
+                   vd->vdev_alloc_bias != VDEV_BIAS_DEDUP)
+                       return (B_TRUE);
+       }
+       return (B_FALSE);
+}
+
+static inline boolean_t
+dnode_level_is_l2cacheable(blkptr_t *bp, dnode_t *dn, int64_t level)
+{
+       if (dn->dn_objset->os_secondary_cache == ZFS_CACHE_ALL ||
+           (dn->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA &&
+           (level > 0 ||
+           DMU_OT_IS_METADATA(dn->dn_handle->dnh_dnode->dn_type)))) {
+               if (l2arc_exclude_special == 0)
+                       return (B_TRUE);
+
+               if (bp == NULL || BP_IS_HOLE(bp))
+                       return (B_FALSE);
+               uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);
+               vdev_t *rvd = dn->dn_objset->os_spa->spa_root_vdev;
+               vdev_t *vd = NULL;
+
+               if (vdev < rvd->vdev_children)
+                       vd = rvd->vdev_child[vdev];
+
+               if (vd == NULL)
+                       return (B_TRUE);
+
+               if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
+                   vd->vdev_alloc_bias != VDEV_BIAS_DEDUP)
+                       return (B_TRUE);
+       }
+       return (B_FALSE);
+}
+
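
Both predicates above end with the same DVA-to-vdev allocation-bias test.
That logic is not factored out in this change, but a shared helper could
look roughly like this (hypothetical sketch; bp_is_on_special_vdev() does
not exist in this file):

	static boolean_t
	bp_is_on_special_vdev(spa_t *spa, const blkptr_t *bp)
	{
		uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);
		vdev_t *rvd = spa->spa_root_vdev;
		vdev_t *vd = NULL;

		if (vdev < rvd->vdev_children)
			vd = rvd->vdev_child[vdev];

		/* Callers treat a missing child vdev as not special. */
		if (vd == NULL)
			return (B_FALSE);

		return (vd->vdev_alloc_bias == VDEV_BIAS_SPECIAL ||
		    vd->vdev_alloc_bias == VDEV_BIAS_DEDUP);
	}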
 
 /*
  * This function *must* return indices evenly distributed between all
@@ -588,7 +694,7 @@ dbuf_is_metadata(dmu_buf_impl_t *db)
  * distributed between all sublists and uses this assumption when
  * deciding which sublist to evict from and how much to evict from it.
  */
-unsigned int
+static unsigned int
 dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
 {
        dmu_buf_impl_t *db = obj;
@@ -603,18 +709,34 @@ dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
         * Also, the low order bits of the hash value are thought to be
         * distributed evenly. Otherwise, in the case that the multilist
         * has a power of two number of sublists, each sublists' usage
-        * would not be evenly distributed.
+        * would not be evenly distributed. In this context full 64-bit
+        * division would be a waste of time, so limit it to 32 bits.
         */
-       return (dbuf_hash(db->db_objset, db->db.db_object,
+       return ((unsigned int)dbuf_hash(db->db_objset, db->db.db_object,
            db->db_level, db->db_blkid) %
            multilist_get_num_sublists(ml));
 }
 
+/*
+ * The target size of the dbuf cache can grow with the ARC target,
+ * unless limited by the tunable dbuf_cache_max_bytes.
+ */
 static inline unsigned long
 dbuf_cache_target_bytes(void)
 {
-       return MIN(dbuf_cache_max_bytes,
-           arc_target_bytes() >> dbuf_cache_shift);
+       return (MIN(dbuf_cache_max_bytes,
+           arc_target_bytes() >> dbuf_cache_shift));
+}
+
+/*
+ * The target size of the dbuf metadata cache can grow with the ARC target,
+ * unless limited by the tunable dbuf_metadata_cache_max_bytes.
+ */
+static inline unsigned long
+dbuf_metadata_cache_target_bytes(void)
+{
+       return (MIN(dbuf_metadata_cache_max_bytes,
+           arc_target_bytes() >> dbuf_metadata_cache_shift));
 }
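
Worked example, assuming the default shifts and dbuf_cache_max_bytes left
uncapped: with arc_target_bytes() at 4 GiB, dbuf_cache_target_bytes()
returns 4 GiB >> 5 = 128 MiB and dbuf_metadata_cache_target_bytes()
returns 4 GiB >> 6 = 64 MiB, so both targets track the ARC target as it
grows or shrinks.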
 
 static inline uint64_t
@@ -646,9 +768,9 @@ dbuf_cache_above_lowater(void)
 static void
 dbuf_evict_one(void)
 {
-       int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache);
+       int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);
        multilist_sublist_t *mls = multilist_sublist_lock(
-           dbuf_caches[DB_DBUF_CACHE].cache, idx);
+           &dbuf_caches[DB_DBUF_CACHE].cache, idx);
 
        ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
 
@@ -663,12 +785,12 @@ dbuf_evict_one(void)
        if (db != NULL) {
                multilist_sublist_remove(mls, db);
                multilist_sublist_unlock(mls);
+               uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db);
                (void) zfs_refcount_remove_many(
-                   &dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db);
+                   &dbuf_caches[DB_DBUF_CACHE].size, size, db);
                DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
                DBUF_STAT_BUMPDOWN(cache_count);
-               DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
-                   db->db.db_size);
+               DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
                ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
                db->db_caching_status = DB_NO_CACHE;
                dbuf_destroy(db);
@@ -685,10 +807,10 @@ dbuf_evict_one(void)
  * of the dbuf cache is at or below the maximum size. Once the dbuf is aged
  * out of the cache it is destroyed and becomes eligible for arc eviction.
  */
-/* ARGSUSED */
-static void
+static __attribute__((noreturn)) void
 dbuf_evict_thread(void *unused)
 {
+       (void) unused;
        callb_cpr_t cpr;
 
        CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);
@@ -697,7 +819,7 @@ dbuf_evict_thread(void *unused)
        while (!dbuf_evict_thread_exit) {
                while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
                        CALLB_CPR_SAFE_BEGIN(&cpr);
-                       (void) cv_timedwait_sig_hires(&dbuf_evict_cv,
+                       (void) cv_timedwait_idle_hires(&dbuf_evict_cv,
                            &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
                        CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
                }
@@ -724,7 +846,7 @@ dbuf_evict_thread(void *unused)
 /*
  * Wake up the dbuf eviction thread if the dbuf cache is at its max size.
  * If the dbuf cache is at its high water mark, then evict a dbuf from the
- * dbuf cache using the callers context.
+ * dbuf cache using the caller's context.
  */
 static void
 dbuf_evict_notify(uint64_t size)
@@ -745,83 +867,102 @@ static int
 dbuf_kstat_update(kstat_t *ksp, int rw)
 {
        dbuf_stats_t *ds = ksp->ks_data;
+       dbuf_hash_table_t *h = &dbuf_hash_table;
 
-       if (rw == KSTAT_WRITE) {
+       if (rw == KSTAT_WRITE)
                return (SET_ERROR(EACCES));
-       } else {
-               ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count(
-                   &dbuf_caches[DB_DBUF_METADATA_CACHE].size);
-               ds->cache_size_bytes.value.ui64 =
-                   zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size);
-               ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes();
-               ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes();
-               ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes();
-               ds->hash_elements.value.ui64 = dbuf_hash_count;
-       }
 
+       ds->cache_count.value.ui64 =
+           wmsum_value(&dbuf_sums.cache_count);
+       ds->cache_size_bytes.value.ui64 =
+           zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size);
+       ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes();
+       ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes();
+       ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes();
+       ds->cache_total_evicts.value.ui64 =
+           wmsum_value(&dbuf_sums.cache_total_evicts);
+       for (int i = 0; i < DN_MAX_LEVELS; i++) {
+               ds->cache_levels[i].value.ui64 =
+                   wmsum_value(&dbuf_sums.cache_levels[i]);
+               ds->cache_levels_bytes[i].value.ui64 =
+                   wmsum_value(&dbuf_sums.cache_levels_bytes[i]);
+       }
+       ds->hash_hits.value.ui64 =
+           wmsum_value(&dbuf_sums.hash_hits);
+       ds->hash_misses.value.ui64 =
+           wmsum_value(&dbuf_sums.hash_misses);
+       ds->hash_collisions.value.ui64 =
+           wmsum_value(&dbuf_sums.hash_collisions);
+       ds->hash_chains.value.ui64 =
+           wmsum_value(&dbuf_sums.hash_chains);
+       ds->hash_insert_race.value.ui64 =
+           wmsum_value(&dbuf_sums.hash_insert_race);
+       ds->hash_table_count.value.ui64 = h->hash_table_mask + 1;
+       ds->hash_mutex_count.value.ui64 = h->hash_mutex_mask + 1;
+       ds->metadata_cache_count.value.ui64 =
+           wmsum_value(&dbuf_sums.metadata_cache_count);
+       ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count(
+           &dbuf_caches[DB_DBUF_METADATA_CACHE].size);
+       ds->metadata_cache_overflow.value.ui64 =
+           wmsum_value(&dbuf_sums.metadata_cache_overflow);
        return (0);
 }
 
 void
 dbuf_init(void)
 {
-       uint64_t hsize = 1ULL << 16;
+       uint64_t hmsize, hsize = 1ULL << 16;
        dbuf_hash_table_t *h = &dbuf_hash_table;
-       int i;
 
        /*
-        * The hash table is big enough to fill all of physical memory
+        * The hash table is big enough to fill one eighth of physical memory
         * with an average block size of zfs_arc_average_blocksize (default 8K).
         * By default, the table will take up
         * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers).
         */
-       while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE)
+       while (hsize * zfs_arc_average_blocksize < arc_all_memory() / 8)
                hsize <<= 1;
 
-retry:
-       h->hash_table_mask = hsize - 1;
-#if defined(_KERNEL)
+       h->hash_table = NULL;
+       while (h->hash_table == NULL) {
+               h->hash_table_mask = hsize - 1;
+
+               h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
+               if (h->hash_table == NULL)
+                       hsize >>= 1;
+
+               ASSERT3U(hsize, >=, 1ULL << 10);
+       }
+
        /*
-        * Large allocations which do not require contiguous pages
-        * should be using vmem_alloc() in the linux kernel
+        * The hash table buckets are protected by an array of mutexes where
+        * each mutex is responsible for protecting 128 buckets.  A minimum
+        * array size of 8192 is targeted to avoid contention.
         */
-       h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
-#else
-       h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
-#endif
-       if (h->hash_table == NULL) {
-               /* XXX - we should really return an error instead of assert */
-               ASSERT(hsize > (1ULL << 10));
-               hsize >>= 1;
-               goto retry;
+       if (dbuf_mutex_cache_shift == 0)
+               hmsize = MAX(hsize >> 7, 1ULL << 13);
+       else
+               hmsize = 1ULL << MIN(dbuf_mutex_cache_shift, 24);
+
+       h->hash_mutexes = NULL;
+       while (h->hash_mutexes == NULL) {
+               h->hash_mutex_mask = hmsize - 1;
+
+               h->hash_mutexes = vmem_zalloc(hmsize * sizeof (kmutex_t),
+                   KM_SLEEP);
+               if (h->hash_mutexes == NULL)
+                       hmsize >>= 1;
        }
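
To make the sizing concrete, assume arc_all_memory() reports 64 GiB and
zfs_arc_average_blocksize keeps its 8K default: hsize doubles from 2^16
until hsize * 8K reaches 64 GiB / 8 = 8 GiB, stopping at 2^20 buckets
(an 8 MiB pointer array on 64-bit kernels), and with
dbuf_mutex_cache_shift left at 0 the mutex array gets
hmsize = MAX(2^20 >> 7, 2^13) = 8192 entries.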
 
        dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
            sizeof (dmu_buf_impl_t),
            0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
 
-       for (i = 0; i < DBUF_MUTEXES; i++)
+       for (int i = 0; i < hmsize; i++)
                mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
 
        dbuf_stats_init(h);
 
-       /*
-        * Setup the parameters for the dbuf caches. We set the sizes of the
-        * dbuf cache and the metadata cache to 1/32nd and 1/16th (default)
-        * of the target size of the ARC. If the values has been specified as
-        * a module option and they're not greater than the target size of the
-        * ARC, then we honor that value.
-        */
-       if (dbuf_cache_max_bytes == 0 ||
-           dbuf_cache_max_bytes >= arc_target_bytes()) {
-               dbuf_cache_max_bytes = arc_target_bytes() >> dbuf_cache_shift;
-       }
-       if (dbuf_metadata_cache_max_bytes == 0 ||
-           dbuf_metadata_cache_max_bytes >= arc_target_bytes()) {
-               dbuf_metadata_cache_max_bytes =
-                   arc_target_bytes() >> dbuf_metadata_cache_shift;
-       }
-
        /*
         * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
         * configuration is not required.
@@ -829,8 +970,8 @@ retry:
        dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0);
 
        for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
-               dbuf_caches[dcs].cache =
-                   multilist_create(sizeof (dmu_buf_impl_t),
+               multilist_create(&dbuf_caches[dcs].cache,
+                   sizeof (dmu_buf_impl_t),
                    offsetof(dmu_buf_impl_t, db_cache_link),
                    dbuf_cache_multilist_index_func);
                zfs_refcount_create(&dbuf_caches[dcs].size);
@@ -842,11 +983,25 @@ retry:
        dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
            NULL, 0, &p0, TS_RUN, minclsyspri);
 
+       wmsum_init(&dbuf_sums.cache_count, 0);
+       wmsum_init(&dbuf_sums.cache_total_evicts, 0);
+       for (int i = 0; i < DN_MAX_LEVELS; i++) {
+               wmsum_init(&dbuf_sums.cache_levels[i], 0);
+               wmsum_init(&dbuf_sums.cache_levels_bytes[i], 0);
+       }
+       wmsum_init(&dbuf_sums.hash_hits, 0);
+       wmsum_init(&dbuf_sums.hash_misses, 0);
+       wmsum_init(&dbuf_sums.hash_collisions, 0);
+       wmsum_init(&dbuf_sums.hash_chains, 0);
+       wmsum_init(&dbuf_sums.hash_insert_race, 0);
+       wmsum_init(&dbuf_sums.metadata_cache_count, 0);
+       wmsum_init(&dbuf_sums.metadata_cache_overflow, 0);
+
        dbuf_ksp = kstat_create("zfs", 0, "dbufstats", "misc",
            KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t),
            KSTAT_FLAG_VIRTUAL);
        if (dbuf_ksp != NULL) {
-               for (i = 0; i < DN_MAX_LEVELS; i++) {
+               for (int i = 0; i < DN_MAX_LEVELS; i++) {
                        snprintf(dbuf_stats.cache_levels[i].name,
                            KSTAT_STRLEN, "cache_level_%d", i);
                        dbuf_stats.cache_levels[i].data_type =
@@ -866,21 +1021,16 @@ void
 dbuf_fini(void)
 {
        dbuf_hash_table_t *h = &dbuf_hash_table;
-       int i;
 
        dbuf_stats_destroy();
 
-       for (i = 0; i < DBUF_MUTEXES; i++)
+       for (int i = 0; i < (h->hash_mutex_mask + 1); i++)
                mutex_destroy(&h->hash_mutexes[i]);
-#if defined(_KERNEL)
-       /*
-        * Large allocations which do not require contiguous pages
-        * should be using vmem_free() in the linux kernel
-        */
+
        vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
-#else
-       kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
-#endif
+       vmem_free(h->hash_mutexes, (h->hash_mutex_mask + 1) *
+           sizeof (kmutex_t));
+
        kmem_cache_destroy(dbuf_kmem_cache);
        taskq_destroy(dbu_evict_taskq);
 
@@ -897,13 +1047,27 @@ dbuf_fini(void)
 
        for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
                zfs_refcount_destroy(&dbuf_caches[dcs].size);
-               multilist_destroy(dbuf_caches[dcs].cache);
+               multilist_destroy(&dbuf_caches[dcs].cache);
        }
 
        if (dbuf_ksp != NULL) {
                kstat_delete(dbuf_ksp);
                dbuf_ksp = NULL;
        }
+
+       wmsum_fini(&dbuf_sums.cache_count);
+       wmsum_fini(&dbuf_sums.cache_total_evicts);
+       for (int i = 0; i < DN_MAX_LEVELS; i++) {
+               wmsum_fini(&dbuf_sums.cache_levels[i]);
+               wmsum_fini(&dbuf_sums.cache_levels_bytes[i]);
+       }
+       wmsum_fini(&dbuf_sums.hash_hits);
+       wmsum_fini(&dbuf_sums.hash_misses);
+       wmsum_fini(&dbuf_sums.hash_collisions);
+       wmsum_fini(&dbuf_sums.hash_chains);
+       wmsum_fini(&dbuf_sums.hash_insert_race);
+       wmsum_fini(&dbuf_sums.metadata_cache_count);
+       wmsum_fini(&dbuf_sums.metadata_cache_overflow);
 }
 
 /*
@@ -1007,7 +1171,7 @@ dbuf_verify(dmu_buf_impl_t *db)
        if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
            (db->db_buf == NULL || db->db_buf->b_data) &&
            db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
-           db->db_state != DB_FILL && !dn->dn_free_txg) {
+           db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg)) {
                /*
                 * If the blkptr isn't set but they have nonzero data,
                 * it had better be dirty, otherwise we'll lose that
@@ -1053,7 +1217,7 @@ dbuf_verify(dmu_buf_impl_t *db)
                                        ASSERT0(bp->blk_pad[1]);
                                        ASSERT(!BP_IS_EMBEDDED(bp));
                                        ASSERT(BP_IS_HOLE(bp));
-                                       ASSERT0(bp->blk_phys_birth);
+                                       ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
                                }
                        }
                }
@@ -1086,40 +1250,6 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
        db->db.db_data = buf->b_data;
 }
 
-static arc_buf_t *
-dbuf_alloc_arcbuf_from_arcbuf(dmu_buf_impl_t *db, arc_buf_t *data)
-{
-       objset_t *os = db->db_objset;
-       spa_t *spa = os->os_spa;
-       arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
-       enum zio_compress compress_type;
-       int psize, lsize;
-
-       psize = arc_buf_size(data);
-       lsize = arc_buf_lsize(data);
-       compress_type = arc_get_compression(data);
-
-       if (arc_is_encrypted(data)) {
-               boolean_t byteorder;
-               uint8_t salt[ZIO_DATA_SALT_LEN];
-               uint8_t iv[ZIO_DATA_IV_LEN];
-               uint8_t mac[ZIO_DATA_MAC_LEN];
-               dnode_t *dn = DB_DNODE(db);
-
-               arc_get_raw_params(data, &byteorder, salt, iv, mac);
-               data = arc_alloc_raw_buf(spa, db, dmu_objset_id(os),
-                   byteorder, salt, iv, mac, dn->dn_type, psize, lsize,
-                   compress_type);
-       } else if (compress_type != ZIO_COMPRESS_OFF) {
-               ASSERT3U(type, ==, ARC_BUFC_DATA);
-               data = arc_alloc_compressed_buf(spa, db,
-                   psize, lsize, compress_type);
-       } else {
-               data = arc_alloc_buf(spa, db, type, psize);
-       }
-       return (data);
-}
-
 static arc_buf_t *
 dbuf_alloc_arcbuf(dmu_buf_impl_t *db)
 {
@@ -1144,7 +1274,7 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db)
 
                mutex_exit(&db->db_mtx);
                abuf = arc_loan_buf(spa, B_FALSE, blksz);
-               bcopy(db->db.db_data, abuf->b_data, blksz);
+               memcpy(abuf->b_data, db->db.db_data, blksz);
        } else {
                abuf = db->db_buf;
                arc_loan_inuse_buf(abuf, db);
@@ -1206,7 +1336,7 @@ dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset)
  * used when modifying or reading db_blkptr.
  */
 db_lock_type_t
-dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, void *tag)
+dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, const void *tag)
 {
        enum db_lock_type ret = DLT_NONE;
        if (db->db_parent != NULL) {
@@ -1231,7 +1361,7 @@ dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, void *tag)
  * panic if we didn't pass the lock type in.
  */
 void
-dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, void *tag)
+dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, const void *tag)
 {
        if (type == DLT_PARENT)
                rw_exit(&db->db_parent->db_rwlock);
@@ -1243,6 +1373,7 @@ static void
 dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
     arc_buf_t *buf, void *vdb)
 {
+       (void) zb, (void) bp;
        dmu_buf_impl_t *db = vdb;
 
        mutex_enter(&db->db_mtx);
@@ -1264,7 +1395,7 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
                /* freed in flight */
                ASSERT(zio == NULL || zio->io_error == 0);
                arc_release(buf, db);
-               bzero(buf->b_data, db->db.db_size);
+               memset(buf->b_data, 0, db->db.db_size);
                arc_buf_freeze(buf);
                db->db_freed_in_flight = FALSE;
                dbuf_set_data(db, buf);
@@ -1303,16 +1434,16 @@ dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
        db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP);
        arc_space_consume(max_bonuslen, ARC_SPACE_BONUS);
        if (bonuslen < max_bonuslen)
-               bzero(db->db.db_data, max_bonuslen);
+               memset(db->db.db_data, 0, max_bonuslen);
        if (bonuslen)
-               bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
+               memcpy(db->db.db_data, DN_BONUS(dn->dn_phys), bonuslen);
        db->db_state = DB_CACHED;
        DTRACE_SET_STATE(db, "bonus buffer filled");
        return (0);
 }
 
 static void
-dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn)
+dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *dbbp)
 {
        blkptr_t *bps = db->db.db_data;
        uint32_t indbs = 1ULL << dn->dn_indblkshift;
@@ -1321,12 +1452,12 @@ dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn)
        for (int i = 0; i < n_bps; i++) {
                blkptr_t *bp = &bps[i];
 
-               ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, indbs);
-               BP_SET_LSIZE(bp, BP_GET_LEVEL(db->db_blkptr) == 1 ?
-                   dn->dn_datablksz : BP_GET_LSIZE(db->db_blkptr));
-               BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr));
-               BP_SET_LEVEL(bp, BP_GET_LEVEL(db->db_blkptr) - 1);
-               BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0);
+               ASSERT3U(BP_GET_LSIZE(dbbp), ==, indbs);
+               BP_SET_LSIZE(bp, BP_GET_LEVEL(dbbp) == 1 ?
+                   dn->dn_datablksz : BP_GET_LSIZE(dbbp));
+               BP_SET_TYPE(bp, BP_GET_TYPE(dbbp));
+               BP_SET_LEVEL(bp, BP_GET_LEVEL(dbbp) - 1);
+               BP_SET_BIRTH(bp, BP_GET_LOGICAL_BIRTH(dbbp), 0);
        }
 }
 
@@ -1336,30 +1467,27 @@ dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn)
  * was taken, ENOENT if no action was taken.
  */
 static int
-dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
+dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
 {
        ASSERT(MUTEX_HELD(&db->db_mtx));
 
-       int is_hole = db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr);
+       int is_hole = bp == NULL || BP_IS_HOLE(bp);
        /*
         * For level 0 blocks only, if the above check fails:
         * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
         * processes the delete record and clears the bp while we are waiting
         * for the dn_mtx (resulting in a "no" from block_freed).
         */
-       if (!is_hole && db->db_level == 0) {
-               is_hole = dnode_block_freed(dn, db->db_blkid) ||
-                   BP_IS_HOLE(db->db_blkptr);
-       }
+       if (!is_hole && db->db_level == 0)
+               is_hole = dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(bp);
 
        if (is_hole) {
                dbuf_set_data(db, dbuf_alloc_arcbuf(db));
-               bzero(db->db.db_data, db->db.db_size);
+               memset(db->db.db_data, 0, db->db.db_size);
 
-               if (db->db_blkptr != NULL && db->db_level > 0 &&
-                   BP_IS_HOLE(db->db_blkptr) &&
-                   db->db_blkptr->blk_birth != 0) {
-                       dbuf_handle_indirect_hole(db, dn);
+               if (bp != NULL && db->db_level > 0 && BP_IS_HOLE(bp) &&
+                   BP_GET_LOGICAL_BIRTH(bp) != 0) {
+                       dbuf_handle_indirect_hole(db, dn, bp);
                }
                db->db_state = DB_CACHED;
                DTRACE_SET_STATE(db, "hole read satisfied");
@@ -1391,8 +1519,8 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
 
        ASSERT(MUTEX_HELD(&db->db_mtx));
 
-       if (!os->os_encrypted || os->os_raw_receive ||
-           (flags & DB_RF_NO_DECRYPT) != 0)
+       if ((flags & DB_RF_NO_DECRYPT) != 0 ||
+           !os->os_encrypted || os->os_raw_receive)
                return (0);
 
        DB_DNODE_ENTER(db);
@@ -1430,21 +1558,19 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
  */
 static int
 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
-    db_lock_type_t dblt, void *tag)
+    db_lock_type_t dblt, const void *tag)
 {
        dnode_t *dn;
        zbookmark_phys_t zb;
        uint32_t aflags = ARC_FLAG_NOWAIT;
        int err, zio_flags;
-       boolean_t bonus_read;
+       blkptr_t bp, *bpp;
 
-       err = zio_flags = 0;
-       bonus_read = B_FALSE;
        DB_DNODE_ENTER(db);
        dn = DB_DNODE(db);
        ASSERT(!zfs_refcount_is_zero(&db->db_holds));
        ASSERT(MUTEX_HELD(&db->db_mtx));
-       ASSERT(db->db_state == DB_UNCACHED);
+       ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
        ASSERT(db->db_buf == NULL);
        ASSERT(db->db_parent == NULL ||
            RW_LOCK_HELD(&db->db_parent->db_rwlock));
@@ -1454,16 +1580,44 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
                goto early_unlock;
        }
 
-       err = dbuf_read_hole(db, dn, flags);
+       if (db->db_state == DB_UNCACHED) {
+               if (db->db_blkptr == NULL) {
+                       bpp = NULL;
+               } else {
+                       bp = *db->db_blkptr;
+                       bpp = &bp;
+               }
+       } else {
+               dbuf_dirty_record_t *dr;
+
+               ASSERT3S(db->db_state, ==, DB_NOFILL);
+
+               /*
+                * Block cloning: If we have a pending block clone,
+                * we don't want to read the underlying block, but the content
+                * of the block being cloned, so we have the most recent data.
+                */
+               dr = list_head(&db->db_dirty_records);
+               if (dr == NULL || !dr->dt.dl.dr_brtwrite) {
+                       err = EIO;
+                       goto early_unlock;
+               }
+               bp = dr->dt.dl.dr_overridden_by;
+               bpp = &bp;
+       }
+
+       err = dbuf_read_hole(db, dn, bpp);
        if (err == 0)
                goto early_unlock;
 
+       ASSERT(bpp != NULL);
+
        /*
         * Any attempt to read a redacted block should result in an error. This
         * will never happen under normal conditions, but can be useful for
         * debugging purposes.
         */
-       if (BP_IS_REDACTED(db->db_blkptr)) {
+       if (BP_IS_REDACTED(bpp)) {
                ASSERT(dsl_dataset_feature_is_active(
                    db->db_objset->os_dsl_dataset,
                    SPA_FEATURE_REDACTED_DATASETS));
@@ -1478,10 +1632,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
         * All bps of an encrypted os should have the encryption bit set.
         * If this is not true it indicates tampering and we report an error.
         */
-       if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) {
-               spa_log_error(db->db_objset->os_spa, &zb);
-               zfs_panic_recover("unencrypted block in encrypted "
-                   "object set %llu", dmu_objset_id(db->db_objset));
+       if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) {
+               spa_log_error(db->db_objset->os_spa, &zb,
+                   BP_GET_LOGICAL_BIRTH(bpp));
                err = SET_ERROR(EIO);
                goto early_unlock;
        }
@@ -1496,7 +1649,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
        DTRACE_SET_STATE(db, "read issued");
        mutex_exit(&db->db_mtx);
 
-       if (DBUF_IS_L2CACHEABLE(db))
+       if (!DBUF_IS_CACHEABLE(db))
+               aflags |= ARC_FLAG_UNCACHED;
+       else if (dbuf_is_l2cacheable(db))
                aflags |= ARC_FLAG_L2CACHE;
 
        dbuf_add_ref(db, NULL);
@@ -1507,15 +1662,14 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
        if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr))
                zio_flags |= ZIO_FLAG_RAW;
        /*
-        * The zio layer will copy the provided blkptr later, but we need to
-        * do this now so that we can release the parent's rwlock. We have to
-        * do that now so that if dbuf_read_done is called synchronously (on
+        * The zio layer will copy the provided blkptr later, but we have our
+        * own copy so that we can release the parent's rwlock. We have to
+        * do that so that if dbuf_read_done is called synchronously (on
         * an l1 cache hit) we don't acquire the db_mtx while holding the
         * parent's rwlock, which would be a lock ordering violation.
         */
-       blkptr_t bp = *db->db_blkptr;
        dmu_buf_unlock_parent(db, dblt, tag);
-       (void) arc_read(zio, db->db_objset->os_spa, &bp,
+       (void) arc_read(zio, db->db_objset->os_spa, bpp,
            dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
            &aflags, &zb);
        return (err);
@@ -1567,11 +1721,37 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
                int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
                dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP);
                arc_space_consume(bonuslen, ARC_SPACE_BONUS);
-               bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen);
+               memcpy(dr->dt.dl.dr_data, db->db.db_data, bonuslen);
        } else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) {
-               arc_buf_t *buf = dbuf_alloc_arcbuf_from_arcbuf(db, db->db_buf);
-               dr->dt.dl.dr_data = buf;
-               bcopy(db->db.db_data, buf->b_data, arc_buf_size(buf));
+               dnode_t *dn = DB_DNODE(db);
+               int size = arc_buf_size(db->db_buf);
+               arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+               spa_t *spa = db->db_objset->os_spa;
+               enum zio_compress compress_type =
+                   arc_get_compression(db->db_buf);
+               uint8_t complevel = arc_get_complevel(db->db_buf);
+
+               if (arc_is_encrypted(db->db_buf)) {
+                       boolean_t byteorder;
+                       uint8_t salt[ZIO_DATA_SALT_LEN];
+                       uint8_t iv[ZIO_DATA_IV_LEN];
+                       uint8_t mac[ZIO_DATA_MAC_LEN];
+
+                       arc_get_raw_params(db->db_buf, &byteorder, salt,
+                           iv, mac);
+                       dr->dt.dl.dr_data = arc_alloc_raw_buf(spa, db,
+                           dmu_objset_id(dn->dn_objset), byteorder, salt, iv,
+                           mac, dn->dn_type, size, arc_buf_lsize(db->db_buf),
+                           compress_type, complevel);
+               } else if (compress_type != ZIO_COMPRESS_OFF) {
+                       ASSERT3U(type, ==, ARC_BUFC_DATA);
+                       dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
+                           size, arc_buf_lsize(db->db_buf), compress_type,
+                           complevel);
+               } else {
+                       dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
+               }
+               memcpy(dr->dt.dl.dr_data->b_data, db->db.db_data, size);
        } else {
                db->db_buf = NULL;
                dbuf_clear_data(db);
@@ -1591,20 +1771,18 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
         */
        ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 
-       if (db->db_state == DB_NOFILL)
-               return (SET_ERROR(EIO));
-
        DB_DNODE_ENTER(db);
        dn = DB_DNODE(db);
 
        prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
-           (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
-           DBUF_IS_CACHEABLE(db);
+           (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL;
 
        mutex_enter(&db->db_mtx);
+       if (flags & DB_RF_PARTIAL_FIRST)
+               db->db_partial_read = B_TRUE;
+       else if (!(flags & DB_RF_PARTIAL_MORE))
+               db->db_partial_read = B_FALSE;
        if (db->db_state == DB_CACHED) {
-               spa_t *spa = dn->dn_objset->os_spa;
-
                /*
                 * Ensure that this block's dnode has been decrypted if
                 * the caller has requested decrypted data.
@@ -1623,6 +1801,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
                    (arc_is_encrypted(db->db_buf) ||
                    arc_is_unauthenticated(db->db_buf) ||
                    arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
+                       spa_t *spa = dn->dn_objset->os_spa;
                        zbookmark_phys_t zb;
 
                        SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
@@ -1634,18 +1813,18 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
                mutex_exit(&db->db_mtx);
                if (err == 0 && prefetch) {
                        dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
-                           flags & DB_RF_HAVESTRUCT);
+                           B_FALSE, flags & DB_RF_HAVESTRUCT);
                }
                DB_DNODE_EXIT(db);
                DBUF_STAT_BUMP(hash_hits);
-       } else if (db->db_state == DB_UNCACHED) {
-               spa_t *spa = dn->dn_objset->os_spa;
+       } else if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL) {
                boolean_t need_wait = B_FALSE;
 
                db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
 
-               if (zio == NULL &&
-                   db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
+               if (zio == NULL && (db->db_state == DB_NOFILL ||
+                   (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
+                       spa_t *spa = dn->dn_objset->os_spa;
                        zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
                        need_wait = B_TRUE;
                }
@@ -1656,6 +1835,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
                 */
                if (!err && prefetch) {
                        dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
+                           db->db_state != DB_CACHED,
                            flags & DB_RF_HAVESTRUCT);
                }
 
@@ -1685,7 +1865,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
                mutex_exit(&db->db_mtx);
                if (prefetch) {
                        dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
-                           flags & DB_RF_HAVESTRUCT);
+                           B_TRUE, flags & DB_RF_HAVESTRUCT);
                }
                DB_DNODE_EXIT(db);
                DBUF_STAT_BUMP(hash_misses);
@@ -1758,8 +1938,13 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
        if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
                zio_free(db->db_objset->os_spa, txg, bp);
 
+       if (dr->dt.dl.dr_brtwrite) {
+               ASSERT0P(dr->dt.dl.dr_data);
+               dr->dt.dl.dr_data = db->db_buf;
+       }
        dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
        dr->dt.dl.dr_nopwrite = B_FALSE;
+       dr->dt.dl.dr_brtwrite = B_FALSE;
        dr->dt.dl.dr_has_raw_params = B_FALSE;
 
        /*
@@ -1770,7 +1955,8 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
         * the buf thawed to save the effort of freezing &
         * immediately re-thawing it.
         */
-       arc_release(dr->dt.dl.dr_data, db);
+       if (dr->dt.dl.dr_data)
+               arc_release(dr->dt.dl.dr_data, db);
 }
 
 /*
@@ -1791,7 +1977,8 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
        if (end_blkid > dn->dn_maxblkid &&
            !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID))
                end_blkid = dn->dn_maxblkid;
-       dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
+       dprintf_dnode(dn, "start=%llu end=%llu\n", (u_longlong_t)start_blkid,
+           (u_longlong_t)end_blkid);
 
        db_search = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
        db_search->db_level = 0;
@@ -1867,7 +2054,7 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
                        ASSERT(db->db.db_data != NULL);
                        arc_release(db->db_buf, db);
                        rw_enter(&db->db_rwlock, RW_WRITER);
-                       bzero(db->db.db_data, db->db.db_size);
+                       memset(db->db.db_data, 0, db->db.db_size);
                        rw_exit(&db->db_rwlock);
                        arc_buf_freeze(db->db_buf);
                }
@@ -1875,8 +2062,8 @@ dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
                mutex_exit(&db->db_mtx);
        }
 
-       kmem_free(db_search, sizeof (dmu_buf_impl_t));
        mutex_exit(&dn->dn_dbufs_mtx);
+       kmem_free(db_search, sizeof (dmu_buf_impl_t));
 }
 
 void
@@ -1904,10 +2091,10 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
 
        /* copy old block data to the new block */
        old_buf = db->db_buf;
-       bcopy(old_buf->b_data, buf->b_data, MIN(osize, size));
+       memcpy(buf->b_data, old_buf->b_data, MIN(osize, size));
        /* zero the remainder */
        if (size > osize)
-               bzero((uint8_t *)buf->b_data + osize, size - osize);
+               memset((uint8_t *)buf->b_data + osize, 0, size - osize);
 
        mutex_enter(&db->db_mtx);
        dbuf_set_data(db, buf);
@@ -1967,6 +2154,75 @@ dbuf_redirty(dbuf_dirty_record_t *dr)
        }
 }
 
+dbuf_dirty_record_t *
+dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
+{
+       rw_enter(&dn->dn_struct_rwlock, RW_READER);
+       IMPLY(dn->dn_objset->os_raw_receive, dn->dn_maxblkid >= blkid);
+       dnode_new_blkid(dn, blkid, tx, B_TRUE, B_FALSE);
+       ASSERT(dn->dn_maxblkid >= blkid);
+
+       dbuf_dirty_record_t *dr = kmem_zalloc(sizeof (*dr), KM_SLEEP);
+       list_link_init(&dr->dr_dirty_node);
+       list_link_init(&dr->dr_dbuf_node);
+       dr->dr_dnode = dn;
+       dr->dr_txg = tx->tx_txg;
+       dr->dt.dll.dr_blkid = blkid;
+       dr->dr_accounted = dn->dn_datablksz;
+
+       /*
+        * There should not be any dbuf for the block that we're dirtying.
+        * Otherwise the buffer contents could be inconsistent between the
+        * dbuf and the lightweight dirty record.
+        */
+       ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid,
+           NULL));
+
+       mutex_enter(&dn->dn_mtx);
+       int txgoff = tx->tx_txg & TXG_MASK;
+       if (dn->dn_free_ranges[txgoff] != NULL) {
+               range_tree_clear(dn->dn_free_ranges[txgoff], blkid, 1);
+       }
+
+       if (dn->dn_nlevels == 1) {
+               ASSERT3U(blkid, <, dn->dn_nblkptr);
+               list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
+               mutex_exit(&dn->dn_mtx);
+               rw_exit(&dn->dn_struct_rwlock);
+               dnode_setdirty(dn, tx);
+       } else {
+               mutex_exit(&dn->dn_mtx);
+
+               int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+               dmu_buf_impl_t *parent_db = dbuf_hold_level(dn,
+                   1, blkid >> epbs, FTAG);
+               rw_exit(&dn->dn_struct_rwlock);
+               if (parent_db == NULL) {
+                       kmem_free(dr, sizeof (*dr));
+                       return (NULL);
+               }
+               int err = dbuf_read(parent_db, NULL,
+                   (DB_RF_NOPREFETCH | DB_RF_CANFAIL));
+               if (err != 0) {
+                       dbuf_rele(parent_db, FTAG);
+                       kmem_free(dr, sizeof (*dr));
+                       return (NULL);
+               }
+
+               dbuf_dirty_record_t *parent_dr = dbuf_dirty(parent_db, tx);
+               dbuf_rele(parent_db, FTAG);
+               mutex_enter(&parent_dr->dt.di.dr_mtx);
+               ASSERT3U(parent_dr->dr_txg, ==, tx->tx_txg);
+               list_insert_tail(&parent_dr->dt.di.dr_children, dr);
+               mutex_exit(&parent_dr->dt.di.dr_mtx);
+               dr->dr_parent = parent_dr;
+       }
+
+       dmu_objset_willuse_space(dn->dn_objset, dr->dr_accounted, tx);
+
+       return (dr);
+}
+
 dbuf_dirty_record_t *
 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
@@ -1987,7 +2243,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
         * objects may be dirtied in syncing context, but only if they
         * were already pre-dirtied in open context.
         */
-#ifdef DEBUG
+#ifdef ZFS_DEBUG
        if (dn->dn_objset->os_dsl_dataset != NULL) {
                rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
                    RW_READER, FTAG);
@@ -2060,7 +2316,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
         */
        os = dn->dn_objset;
        VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa));
-#ifdef DEBUG
+#ifdef ZFS_DEBUG
        if (dn->dn_objset->os_dsl_dataset != NULL)
                rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG);
        ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
@@ -2072,7 +2328,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 
        dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
-       if (db->db_blkid != DMU_BONUS_BLKID) {
+       if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {
                dmu_objset_willuse_space(os, db->db.db_size, tx);
        }
 
@@ -2084,6 +2340,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
        dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
        list_link_init(&dr->dr_dirty_node);
        list_link_init(&dr->dr_dbuf_node);
+       dr->dr_dnode = dn;
        if (db->db_level == 0) {
                void *data_old = db->db_buf;
 
@@ -2114,8 +2371,9 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
                    sizeof (dbuf_dirty_record_t),
                    offsetof(dbuf_dirty_record_t, dr_dirty_node));
        }
-       if (db->db_blkid != DMU_BONUS_BLKID)
+       if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {
                dr->dr_accounted = db->db.db_size;
+       }
        dr->dr_dbuf = db;
        dr->dr_txg = tx->tx_txg;
        list_insert_before(&db->db_dirty_records, dr_next, dr);
@@ -2249,7 +2507,7 @@ dbuf_undirty_bonus(dbuf_dirty_record_t *dr)
        dmu_buf_impl_t *db = dr->dr_dbuf;
 
        if (dr->dt.dl.dr_data != db->db.db_data) {
-               struct dnode *dn = DB_DNODE(db);
+               struct dnode *dn = dr->dr_dnode;
                int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
 
                kmem_free(dr->dt.dl.dr_data, max_bonuslen);
@@ -2271,12 +2529,11 @@ dbuf_undirty_bonus(dbuf_dirty_record_t *dr)
  * Undirty a buffer in the transaction group referenced by the given
  * transaction.  Return whether this evicted the dbuf.
  */
-static boolean_t
+boolean_t
 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
-       dnode_t *dn;
        uint64_t txg = tx->tx_txg;
-       dbuf_dirty_record_t *dr;
+       boolean_t brtwrite;
 
        ASSERT(txg != 0);
 
@@ -2296,13 +2553,22 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
        /*
         * If this buffer is not dirty, we're done.
         */
-       dr = dbuf_find_dirty_eq(db, txg);
+       dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, txg);
        if (dr == NULL)
                return (B_FALSE);
        ASSERT(dr->dr_dbuf == db);
 
-       DB_DNODE_ENTER(db);
-       dn = DB_DNODE(db);
+       brtwrite = dr->dt.dl.dr_brtwrite;
+       if (brtwrite) {
+               /*
+                * We are freeing a block that we cloned in the same
+                * transaction group.
+                */
+               brt_pending_remove(dmu_objset_spa(db->db_objset),
+                   &dr->dt.dl.dr_overridden_by, tx);
+       }
+
+       dnode_t *dn = dr->dr_dnode;
 
        dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
@@ -2330,9 +2596,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
                list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
                mutex_exit(&dn->dn_mtx);
        }
-       DB_DNODE_EXIT(db);
 
-       if (db->db_state != DB_NOFILL) {
+       if (db->db_state != DB_NOFILL && !brtwrite) {
                dbuf_unoverride(dr);
 
                ASSERT(db->db_buf != NULL);
@@ -2347,7 +2612,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
        db->db_dirtycnt -= 1;
 
        if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
-               ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
+               ASSERT(db->db_state == DB_NOFILL || brtwrite ||
+                   arc_released(db->db_buf));
                dbuf_destroy(db);
                return (B_TRUE);
        }
@@ -2359,6 +2625,7 @@ static void
 dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
 {
        dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+       boolean_t undirty = B_FALSE;
 
        ASSERT(tx->tx_txg != 0);
        ASSERT(!zfs_refcount_is_zero(&db->db_holds));
@@ -2371,7 +2638,7 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
         */
        mutex_enter(&db->db_mtx);
 
-       if (db->db_state == DB_CACHED) {
+       if (db->db_state == DB_CACHED || db->db_state == DB_NOFILL) {
                dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
                /*
                 * It's possible that it is already dirty but not cached,
@@ -2379,10 +2646,21 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
                 * go through dmu_buf_will_dirty().
                 */
                if (dr != NULL) {
-                       /* This dbuf is already dirty and cached. */
-                       dbuf_redirty(dr);
-                       mutex_exit(&db->db_mtx);
-                       return;
+                       if (dr->dt.dl.dr_brtwrite) {
+                               /*
+                                * Block cloning: If we are dirtying a cloned
+                                * block, we cannot simply redirty it, because
+                                * this dr has no data associated with it.
+                                * We will go through a full undirtying below,
+                                * before dirtying it again.
+                                */
+                               undirty = B_TRUE;
+                       } else {
+                               /* This dbuf is already dirty and cached. */
+                               dbuf_redirty(dr);
+                               mutex_exit(&db->db_mtx);
+                               return;
+                       }
                }
        }
        mutex_exit(&db->db_mtx);
@@ -2391,7 +2669,20 @@ dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
        if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
                flags |= DB_RF_HAVESTRUCT;
        DB_DNODE_EXIT(db);
+
+       /*
+        * Block cloning: Do the dbuf_read() before undirtying the dbuf, as we
+        * want to make sure dbuf_read() will read the pending cloned block and
+        * not the underlying block that is being replaced. dbuf_undirty() will
+        * do dbuf_unoverride(), so we will end up with the cloned block's
+        * contents without the overridden BP.
+        */
        (void) dbuf_read(db, NULL, flags);
+       if (undirty) {
+               mutex_enter(&db->db_mtx);
+               VERIFY(!dbuf_undirty(db, tx));
+               mutex_exit(&db->db_mtx);
+       }
        (void) dbuf_dirty(db, tx);
 }
 
@@ -2414,18 +2705,52 @@ dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
        return (dr != NULL);
 }
 
+void
+dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+       /*
+        * Block cloning: We are going to clone into this block, so undirty
+        * any modifications made to it so far in this txg, including
+        * earlier writes and clones into this block.
+        */
+       mutex_enter(&db->db_mtx);
+       DBUF_VERIFY(db);
+       VERIFY(!dbuf_undirty(db, tx));
+       ASSERT0P(dbuf_find_dirty_eq(db, tx->tx_txg));
+       if (db->db_buf != NULL) {
+               arc_buf_destroy(db->db_buf, db);
+               db->db_buf = NULL;
+               dbuf_clear_data(db);
+       }
+
+       db->db_state = DB_NOFILL;
+       DTRACE_SET_STATE(db, "allocating NOFILL buffer for clone");
+
+       DBUF_VERIFY(db);
+       mutex_exit(&db->db_mtx);
+
+       dbuf_noread(db);
+       (void) dbuf_dirty(db, tx);
+}
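
/*
 * For illustration only (not part of the patch): a minimal sketch of how a
 * block-cloning caller might drive dmu_buf_will_clone(). The helper name is
 * hypothetical and locking is simplified; the real caller also registers the
 * clone with the BRT (see brt_pending_remove() in dbuf_undirty() above).
 */
static void
clone_one_block_sketch(dmu_buf_t *dbuf, const blkptr_t *bp, dmu_tx_t *tx)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
	dbuf_dirty_record_t *dr;

	/* Undirty this txg's changes and put the dbuf in DB_NOFILL. */
	dmu_buf_will_clone(dbuf, tx);

	/* Override the fresh dirty record with the cloned block pointer. */
	mutex_enter(&db->db_mtx);
	dr = list_head(&db->db_dirty_records);
	ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
	dr->dt.dl.dr_overridden_by = *bp;
	dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
	dr->dt.dl.dr_brtwrite = B_TRUE;
	mutex_exit(&db->db_mtx);
}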
+
 void
 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
        dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
+       mutex_enter(&db->db_mtx);
        db->db_state = DB_NOFILL;
        DTRACE_SET_STATE(db, "allocating NOFILL buffer");
-       dmu_buf_will_fill(db_fake, tx);
+       mutex_exit(&db->db_mtx);
+
+       dbuf_noread(db);
+       (void) dbuf_dirty(db, tx);
 }
 
 void
-dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
+dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
 {
        dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
@@ -2437,6 +2762,25 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
        ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
            dmu_tx_private_ok(tx));
 
+       mutex_enter(&db->db_mtx);
+       if (db->db_state == DB_NOFILL) {
+               /*
+                * Block cloning: We will be completely overwriting a block
+                * cloned in this transaction group, so let's undirty the
+                * pending clone and mark the block as uncached. This will be
+                * as if the clone had never happened.  But if the fill can
+                * fail, we need a way to fall back to the cloned data.
+                */
+               if (canfail && dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) {
+                       mutex_exit(&db->db_mtx);
+                       dmu_buf_will_dirty(db_fake, tx);
+                       return;
+               }
+               VERIFY(!dbuf_undirty(db, tx));
+               db->db_state = DB_UNCACHED;
+       }
+       mutex_exit(&db->db_mtx);
+
        dbuf_noread(db);
        (void) dbuf_dirty(db, tx);
 }
@@ -2472,9 +2816,9 @@ dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
 
        dr->dt.dl.dr_has_raw_params = B_TRUE;
        dr->dt.dl.dr_byteorder = byteorder;
-       bcopy(salt, dr->dt.dl.dr_salt, ZIO_DATA_SALT_LEN);
-       bcopy(iv, dr->dt.dl.dr_iv, ZIO_DATA_IV_LEN);
-       bcopy(mac, dr->dt.dl.dr_mac, ZIO_DATA_MAC_LEN);
+       memcpy(dr->dt.dl.dr_salt, salt, ZIO_DATA_SALT_LEN);
+       memcpy(dr->dt.dl.dr_iv, iv, ZIO_DATA_IV_LEN);
+       memcpy(dr->dt.dl.dr_mac, mac, ZIO_DATA_MAC_LEN);
 }
 
 static void
@@ -2484,39 +2828,49 @@ dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx)
        dbuf_dirty_record_t *dr;
 
        dr = list_head(&db->db_dirty_records);
+       ASSERT3P(dr, !=, NULL);
        ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
        dl = &dr->dt.dl;
        dl->dr_overridden_by = *bp;
        dl->dr_override_state = DR_OVERRIDDEN;
-       dl->dr_overridden_by.blk_birth = dr->dr_txg;
+       BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);
 }
 
-/* ARGSUSED */
-void
-dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx)
+boolean_t
+dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)
 {
        dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
-       dbuf_states_t old_state;
        mutex_enter(&db->db_mtx);
        DBUF_VERIFY(db);
 
-       old_state = db->db_state;
-       db->db_state = DB_CACHED;
-       if (old_state == DB_FILL) {
+       if (db->db_state == DB_FILL) {
                if (db->db_level == 0 && db->db_freed_in_flight) {
                        ASSERT(db->db_blkid != DMU_BONUS_BLKID);
                        /* we were freed while filling */
                        /* XXX dbuf_undirty? */
-                       bzero(db->db.db_data, db->db.db_size);
+                       memset(db->db.db_data, 0, db->db.db_size);
                        db->db_freed_in_flight = FALSE;
+                       db->db_state = DB_CACHED;
                        DTRACE_SET_STATE(db,
                            "fill done handling freed in flight");
+                       failed = B_FALSE;
+               } else if (failed) {
+                       VERIFY(!dbuf_undirty(db, tx));
+                       db->db_buf = NULL;
+                       dbuf_clear_data(db);
+                       DTRACE_SET_STATE(db, "fill failed");
                } else {
+                       db->db_state = DB_CACHED;
                        DTRACE_SET_STATE(db, "fill done");
                }
                cv_broadcast(&db->db_changed);
+       } else {
+               db->db_state = DB_CACHED;
+               failed = B_FALSE;
        }
        mutex_exit(&db->db_mtx);
+       return (failed);
 }
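
/*
 * For illustration only (not part of the patch): the intended pairing of the
 * new canfail/failed parameters. A caller that can fault mid-copy opens the
 * fill with canfail = B_TRUE and reports the outcome to dmu_buf_fill_done(),
 * which reverts to the pre-fill (possibly cloned) contents on failure. The
 * copy helper is a stand-in assumption.
 */
static int
fill_buf_sketch(dmu_buf_t *dbuf, const void *src, dmu_tx_t *tx)
{
	int err;

	dmu_buf_will_fill(dbuf, tx, B_TRUE);
	err = copy_into_dbuf_sketch(dbuf, src);	/* hypothetical helper */
	if (dmu_buf_fill_done(dbuf, tx, err != 0))
		return (SET_ERROR(EFAULT));
	return (0);
}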
 
 void
@@ -2545,6 +2899,7 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
        dmu_buf_will_not_fill(dbuf, tx);
 
        dr = list_head(&db->db_dirty_records);
+       ASSERT3P(dr, !=, NULL);
        ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
        dl = &dr->dt.dl;
        encode_embedded_bp_compressed(&dl->dr_overridden_by,
@@ -2555,7 +2910,7 @@ dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
        BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
 
        dl->dr_override_state = DR_OVERRIDDEN;
-       dl->dr_overridden_by.blk_birth = dr->dr_txg;
+       BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);
 }
 
 void
@@ -2606,7 +2961,8 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
        while (db->db_state == DB_READ || db->db_state == DB_FILL)
                cv_wait(&db->db_changed, &db->db_mtx);
 
-       ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
+       ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED ||
+           db->db_state == DB_NOFILL);
 
        if (db->db_state == DB_CACHED &&
            zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
@@ -2619,13 +2975,11 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
                ASSERT(!arc_is_encrypted(buf));
                mutex_exit(&db->db_mtx);
                (void) dbuf_dirty(db, tx);
-               bcopy(buf->b_data, db->db.db_data, db->db.db_size);
+               memcpy(db->db.db_data, buf->b_data, db->db.db_size);
                arc_buf_destroy(buf, db);
-               xuio_stat_wbuf_copied();
                return;
        }
 
-       xuio_stat_wbuf_nocopy();
        if (db->db_state == DB_CACHED) {
                dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
 
@@ -2645,6 +2999,15 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
                        arc_buf_destroy(db->db_buf, db);
                }
                db->db_buf = NULL;
+       } else if (db->db_state == DB_NOFILL) {
+               /*
+                * We will be completely replacing the cloned block.  In case
+                * it was cloned in this transaction group, let's undirty the
+                * pending clone and mark the block as uncached. This will be
+                * as if the clone had never happened.
+                */
+               VERIFY(!dbuf_undirty(db, tx));
+               db->db_state = DB_UNCACHED;
        }
        ASSERT(db->db_buf == NULL);
        dbuf_set_data(db, buf);
@@ -2652,7 +3015,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
        DTRACE_SET_STATE(db, "filling assigned arcbuf");
        mutex_exit(&db->db_mtx);
        (void) dbuf_dirty(db, tx);
-       dmu_buf_fill_done(&db->db, tx);
+       dmu_buf_fill_done(&db->db, tx, B_FALSE);
 }
 
 void
@@ -2687,7 +3050,9 @@ dbuf_destroy(dmu_buf_impl_t *db)
                ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
                    db->db_caching_status == DB_DBUF_METADATA_CACHE);
 
-               multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
+               multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
+
+               ASSERT0(dmu_buf_user_size(&db->db));
                (void) zfs_refcount_remove_many(
                    &dbuf_caches[db->db_caching_status].size,
                    db->db.db_size, db);
@@ -2760,9 +3125,6 @@ dbuf_destroy(dmu_buf_impl_t *db)
        ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
        ASSERT(!multilist_link_active(&db->db_cache_link));
 
-       kmem_cache_free(dbuf_kmem_cache, db);
-       arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
-
        /*
         * If this dbuf is referenced from an indirect dbuf,
         * decrement the ref count on the indirect dbuf.
@@ -2771,6 +3133,9 @@ dbuf_destroy(dmu_buf_impl_t *db)
                mutex_enter(&parent->db_mtx);
                dbuf_rele_and_unlock(parent, db, B_TRUE);
        }
+
+       kmem_cache_free(dbuf_kmem_cache, db);
+       arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
 }
 
 /*
@@ -2872,7 +3237,7 @@ dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
 
 static dmu_buf_impl_t *
 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
-    dmu_buf_impl_t *parent, blkptr_t *blkptr)
+    dmu_buf_impl_t *parent, blkptr_t *blkptr, uint64_t hash)
 {
        objset_t *os = dn->dn_objset;
        dmu_buf_impl_t *db, *odb;
@@ -2893,6 +3258,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
        db->db_dnode_handle = dn->dn_handle;
        db->db_parent = parent;
        db->db_blkptr = blkptr;
+       db->db_hash = hash;
 
        db->db_user = NULL;
        db->db_user_immediate_evict = FALSE;
@@ -2933,8 +3299,8 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
        db->db_state = DB_EVICTING; /* not worth logging this state change */
        if ((odb = dbuf_hash_insert(db)) != NULL) {
                /* someone else inserted it first */
-               kmem_cache_free(dbuf_kmem_cache, db);
                mutex_exit(&dn->dn_dbufs_mtx);
+               kmem_cache_free(dbuf_kmem_cache, db);
                DBUF_STAT_BUMP(hash_insert_race);
                return (odb);
        }
@@ -2976,6 +3342,7 @@ dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid,
 
        err = dbuf_findbp(dn, level, blkid, B_FALSE, &dbp, &bp2);
        if (err == 0) {
+               ASSERT3P(bp2, !=, NULL);
                *bp = *bp2;
                if (dbp != NULL)
                        dbuf_rele(dbp, NULL);
@@ -2997,8 +3364,33 @@ typedef struct dbuf_prefetch_arg {
        zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
        zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
        arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
+       dbuf_prefetch_fn dpa_cb; /* prefetch completion callback */
+       void *dpa_arg; /* prefetch completion arg */
 } dbuf_prefetch_arg_t;
 
+static void
+dbuf_prefetch_fini(dbuf_prefetch_arg_t *dpa, boolean_t io_done)
+{
+       if (dpa->dpa_cb != NULL) {
+               dpa->dpa_cb(dpa->dpa_arg, dpa->dpa_zb.zb_level,
+                   dpa->dpa_zb.zb_blkid, io_done);
+       }
+       kmem_free(dpa, sizeof (*dpa));
+}
+
+static void
+dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb,
+    const blkptr_t *iobp, arc_buf_t *abuf, void *private)
+{
+       (void) zio, (void) zb, (void) iobp;
+       dbuf_prefetch_arg_t *dpa = private;
+
+       if (abuf != NULL)
+               arc_buf_destroy(abuf, private);
+
+       dbuf_prefetch_fini(dpa, B_TRUE);
+}
+
 /*
  * Actually issue the prefetch read for the block given.
  */
@@ -3011,11 +3403,12 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
            SPA_FEATURE_REDACTED_DATASETS));
 
        if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
-               return;
+               return (dbuf_prefetch_fini(dpa, B_FALSE));
 
        int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
        arc_flags_t aflags =
-           dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+           dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
+           ARC_FLAG_NO_BUF;
 
        /* dnodes are always read as raw and then converted later */
        if (BP_GET_TYPE(bp) == DMU_OT_DNODE && BP_IS_PROTECTED(bp) &&
@@ -3025,7 +3418,8 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
        ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
        ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
        ASSERT(dpa->dpa_zio != NULL);
-       (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
+       (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp,
+           dbuf_issue_final_prefetch_done, dpa,
            dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb);
 }
 
@@ -3038,6 +3432,7 @@ static void
 dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
     const blkptr_t *iobp, arc_buf_t *abuf, void *private)
 {
+       (void) zb, (void) iobp;
        dbuf_prefetch_arg_t *dpa = private;
 
        ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
@@ -3045,7 +3440,7 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
 
        if (abuf == NULL) {
                ASSERT(zio == NULL || zio->io_error != 0);
-               kmem_free(dpa, sizeof (*dpa));
+               dbuf_prefetch_fini(dpa, B_TRUE);
                return;
        }
        ASSERT(zio == NULL || zio->io_error == 0);
@@ -3078,11 +3473,10 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
                dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
                    dpa->dpa_curlevel, curblkid, FTAG);
                if (db == NULL) {
-                       kmem_free(dpa, sizeof (*dpa));
                        arc_buf_destroy(abuf, private);
+                       dbuf_prefetch_fini(dpa, B_TRUE);
                        return;
                }
-
                (void) dbuf_read(db, NULL,
                    DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
                dbuf_rele(db, FTAG);
@@ -3094,16 +3488,17 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
        blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
            P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
 
-       ASSERT(!BP_IS_REDACTED(bp) ||
+       ASSERT(!BP_IS_REDACTED(bp) || (dpa->dpa_dnode &&
            dsl_dataset_feature_is_active(
            dpa->dpa_dnode->dn_objset->os_dsl_dataset,
-           SPA_FEATURE_REDACTED_DATASETS));
+           SPA_FEATURE_REDACTED_DATASETS)));
        if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {
-               kmem_free(dpa, sizeof (*dpa));
+               arc_buf_destroy(abuf, private);
+               dbuf_prefetch_fini(dpa, B_TRUE);
+               return;
        } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
                ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
                dbuf_issue_final_prefetch(dpa, bp);
-               kmem_free(dpa, sizeof (*dpa));
        } else {
                arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
                zbookmark_phys_t zb;
@@ -3118,7 +3513,8 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
                    dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
 
                (void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
-                   bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
+                   bp, dbuf_prefetch_indirect_done, dpa,
+                   ZIO_PRIORITY_SYNC_READ,
                    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
                    &iter_aflags, &zb);
        }
@@ -3133,9 +3529,10 @@ dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
  * complete. Note that the prefetch might fail if the dataset is encrypted and
  * the encryption key is unmapped before the IO completes.
  */
-void
-dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
-    arc_flags_t aflags)
+int
+dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid,
+    zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb,
+    void *arg)
 {
        blkptr_t bp;
        int epbs, nlevels, curlevel;
@@ -3145,10 +3542,10 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
        ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 
        if (blkid > dn->dn_maxblkid)
-               return;
+               goto no_issue;
 
        if (level == 0 && dnode_block_freed(dn, blkid))
-               return;
+               goto no_issue;
 
        /*
         * This dnode hasn't been written to disk yet, so there's nothing to
@@ -3156,21 +3553,21 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
         */
        nlevels = dn->dn_phys->dn_nlevels;
        if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
-               return;
+               goto no_issue;
 
        epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
        if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
-               return;
+               goto no_issue;
 
        dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
-           level, blkid);
+           level, blkid, NULL);
        if (db != NULL) {
                mutex_exit(&db->db_mtx);
                /*
                 * This dbuf already exists.  It is either CACHED, or
                 * (we assume) about to be read or filled.
                 */
-               return;
+               goto no_issue;
        }
 
        /*
@@ -3206,7 +3603,7 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
            dsl_dataset_feature_is_active(dn->dn_objset->os_dsl_dataset,
            SPA_FEATURE_REDACTED_DATASETS));
        if (BP_IS_HOLE(&bp) || BP_IS_REDACTED(&bp))
-               return;
+               goto no_issue;
 
        ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
 
@@ -3224,9 +3621,12 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
        dpa->dpa_dnode = dn;
        dpa->dpa_epbs = epbs;
        dpa->dpa_zio = pio;
+       dpa->dpa_cb = cb;
+       dpa->dpa_arg = arg;
 
-       /* flag if L2ARC eligible, l2arc_noprefetch then decides */
-       if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
+       if (!DNODE_LEVEL_IS_CACHEABLE(dn, level))
+               dpa->dpa_aflags |= ARC_FLAG_UNCACHED;
+       else if (dnode_level_is_l2cacheable(&bp, dn, level))
                dpa->dpa_aflags |= ARC_FLAG_L2CACHE;
 
        /*
@@ -3239,19 +3639,19 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
        if (curlevel == level) {
                ASSERT3U(curblkid, ==, blkid);
                dbuf_issue_final_prefetch(dpa, &bp);
-               kmem_free(dpa, sizeof (*dpa));
        } else {
                arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
                zbookmark_phys_t zb;
 
                /* flag if L2ARC eligible, l2arc_noprefetch then decides */
-               if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
+               if (dnode_level_is_l2cacheable(&bp, dn, level))
                        iter_aflags |= ARC_FLAG_L2CACHE;
 
                SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
                    dn->dn_object, curlevel, curblkid);
                (void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
-                   &bp, dbuf_prefetch_indirect_done, dpa, prio,
+                   &bp, dbuf_prefetch_indirect_done, dpa,
+                   ZIO_PRIORITY_SYNC_READ,
                    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
                    &iter_aflags, &zb);
        }
@@ -3260,6 +3660,19 @@ dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
         * dpa may have already been freed.
         */
        zio_nowait(pio);
+       return (1);
+no_issue:
+       if (cb != NULL)
+               cb(arg, level, blkid, B_FALSE);
+       return (0);
+}
+
+int
+dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
+    arc_flags_t aflags)
+{
+       return (dbuf_prefetch_impl(dn, level, blkid, prio, aflags, NULL, NULL));
 }
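
/*
 * For illustration only (not part of the patch): using the completion
 * callback added to dbuf_prefetch_impl(). The callback fires exactly once
 * per call -- immediately with io_done = B_FALSE when nothing is issued via
 * the no_issue path, or from the final prefetch completion otherwise -- and
 * the return value says whether an I/O was issued. The progress struct and
 * its caller are assumptions; the signature follows dpa_cb above.
 */
typedef struct prefetch_progress {
	uint64_t pp_done;
} prefetch_progress_t;

static void
prefetch_done_sketch(void *arg, uint64_t level, uint64_t blkid,
    boolean_t io_done)
{
	(void) level, (void) blkid, (void) io_done;
	atomic_inc_64(&((prefetch_progress_t *)arg)->pp_done);
}

static int
prefetch_range_sketch(dnode_t *dn, uint64_t start, uint64_t end,
    prefetch_progress_t *pp)
{
	int issued = 0;

	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
	for (uint64_t blkid = start; blkid < end; blkid++) {
		issued += dbuf_prefetch_impl(dn, 0, blkid,
		    ZIO_PRIORITY_ASYNC_READ, 0, prefetch_done_sketch, pp);
	}
	return (issued);
}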
 
 /*
@@ -3274,12 +3687,32 @@ noinline static void
 dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db)
 {
        dbuf_dirty_record_t *dr = db->db_data_pending;
-       arc_buf_t *newdata, *data = dr->dt.dl.dr_data;
+       arc_buf_t *data = dr->dt.dl.dr_data;
+       enum zio_compress compress_type = arc_get_compression(data);
+       uint8_t complevel = arc_get_complevel(data);
+
+       if (arc_is_encrypted(data)) {
+               boolean_t byteorder;
+               uint8_t salt[ZIO_DATA_SALT_LEN];
+               uint8_t iv[ZIO_DATA_IV_LEN];
+               uint8_t mac[ZIO_DATA_MAC_LEN];
+
+               arc_get_raw_params(data, &byteorder, salt, iv, mac);
+               dbuf_set_data(db, arc_alloc_raw_buf(dn->dn_objset->os_spa, db,
+                   dmu_objset_id(dn->dn_objset), byteorder, salt, iv, mac,
+                   dn->dn_type, arc_buf_size(data), arc_buf_lsize(data),
+                   compress_type, complevel));
+       } else if (compress_type != ZIO_COMPRESS_OFF) {
+               dbuf_set_data(db, arc_alloc_compressed_buf(
+                   dn->dn_objset->os_spa, db, arc_buf_size(data),
+                   arc_buf_lsize(data), compress_type, complevel));
+       } else {
+               dbuf_set_data(db, arc_alloc_buf(dn->dn_objset->os_spa, db,
+                   DBUF_GET_BUFC_TYPE(db), db->db.db_size));
+       }
 
-       newdata = dbuf_alloc_arcbuf_from_arcbuf(db, data);
-       dbuf_set_data(db, newdata);
        rw_enter(&db->db_rwlock, RW_WRITER);
-       bcopy(data->b_data, db->db.db_data, arc_buf_size(data));
+       memcpy(db->db.db_data, data->b_data, arc_buf_size(data));
        rw_exit(&db->db_rwlock);
 }
 
@@ -3290,9 +3723,10 @@ dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db)
 int
 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
     boolean_t fail_sparse, boolean_t fail_uncached,
-    void *tag, dmu_buf_impl_t **dbp)
+    const void *tag, dmu_buf_impl_t **dbp)
 {
        dmu_buf_impl_t *db, *parent = NULL;
+       uint64_t hv;
 
        /* If the pool has been created, verify the tx_sync_lock is not held */
        spa_t *spa = dn->dn_objset->os_spa;
@@ -3308,7 +3742,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
        *dbp = NULL;
 
        /* dbuf_find() returns with db_mtx held */
-       db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);
+       db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid, &hv);
 
        if (db == NULL) {
                blkptr_t *bp = NULL;
@@ -3330,7 +3764,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
                }
                if (err && err != ENOENT)
                        return (err);
-               db = dbuf_create(dn, level, blkid, parent, bp);
+               db = dbuf_create(dn, level, blkid, parent, bp, hv);
        }
 
        if (fail_uncached && db->db_state != DB_CACHED) {
@@ -3354,8 +3788,10 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
            dn->dn_object != DMU_META_DNODE_OBJECT &&
            db->db_state == DB_CACHED && db->db_data_pending) {
                dbuf_dirty_record_t *dr = db->db_data_pending;
-               if (dr->dt.dl.dr_data == db->db_buf)
+               if (dr->dt.dl.dr_data == db->db_buf) {
+                       ASSERT3P(db->db_buf, !=, NULL);
                        dbuf_hold_copy(dn, db);
+               }
        }
 
        if (multilist_link_active(&db->db_cache_link)) {
@@ -3363,18 +3799,18 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
                ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
                    db->db_caching_status == DB_DBUF_METADATA_CACHE);
 
-               multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
+               multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
+
+               uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db);
                (void) zfs_refcount_remove_many(
-                   &dbuf_caches[db->db_caching_status].size,
-                   db->db.db_size, db);
+                   &dbuf_caches[db->db_caching_status].size, size, db);
 
                if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
                        DBUF_STAT_BUMPDOWN(metadata_cache_count);
                } else {
                        DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
                        DBUF_STAT_BUMPDOWN(cache_count);
-                       DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
-                           db->db.db_size);
+                       DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
                }
                db->db_caching_status = DB_NO_CACHE;
        }
@@ -3395,13 +3831,13 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
 }
 
 dmu_buf_impl_t *
-dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
+dbuf_hold(dnode_t *dn, uint64_t blkid, const void *tag)
 {
        return (dbuf_hold_level(dn, 0, blkid, tag));
 }
 
 dmu_buf_impl_t *
-dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
+dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, const void *tag)
 {
        dmu_buf_impl_t *db;
        int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
@@ -3414,7 +3850,8 @@ dbuf_create_bonus(dnode_t *dn)
        ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 
        ASSERT(dn->dn_bonus == NULL);
-       dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
+       dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL,
+           dbuf_hash(dn->dn_objset, dn->dn_object, 0, DMU_BONUS_BLKID));
 }
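
/*
 * For illustration only (not part of the patch): the lookup/create pattern
 * the new hash parameter enables. dbuf_find() reports the bucket hash it
 * computed, so on a miss dbuf_create() can reuse it instead of rehashing,
 * exactly as dbuf_hold_impl() does above.
 */
static dmu_buf_impl_t *
find_or_create_sketch(dnode_t *dn, uint8_t level, uint64_t blkid,
    dmu_buf_impl_t *parent, blkptr_t *bp)
{
	uint64_t hv;
	dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
	    level, blkid, &hv);	/* returns with db_mtx held on a hit */

	if (db == NULL)
		db = dbuf_create(dn, level, blkid, parent, bp, hv);
	return (db);
}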
 
 int
@@ -3442,7 +3879,7 @@ dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
 
 #pragma weak dmu_buf_add_ref = dbuf_add_ref
 void
-dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
+dbuf_add_ref(dmu_buf_impl_t *db, const void *tag)
 {
        int64_t holds = zfs_refcount_add(&db->db_holds, tag);
        VERIFY3S(holds, >, 1);
@@ -3451,7 +3888,7 @@ dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
 #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
 boolean_t
 dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
-    void *tag)
+    const void *tag)
 {
        dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
        dmu_buf_impl_t *found_db;
@@ -3460,7 +3897,7 @@ dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
        if (blkid == DMU_BONUS_BLKID)
                found_db = dbuf_find_bonus(os, obj);
        else
-               found_db = dbuf_find(os, obj, 0, blkid);
+               found_db = dbuf_find(os, obj, 0, blkid, NULL);
 
        if (found_db != NULL) {
                if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
@@ -3480,14 +3917,14 @@ dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
  * dnode's parent dbuf evicting its dnode handles.
  */
 void
-dbuf_rele(dmu_buf_impl_t *db, void *tag)
+dbuf_rele(dmu_buf_impl_t *db, const void *tag)
 {
        mutex_enter(&db->db_mtx);
        dbuf_rele_and_unlock(db, tag, B_FALSE);
 }
 
 void
-dmu_buf_rele(dmu_buf_t *db, void *tag)
+dmu_buf_rele(dmu_buf_t *db, const void *tag)
 {
        dbuf_rele((dmu_buf_impl_t *)db, tag);
 }
@@ -3506,7 +3943,7 @@ dmu_buf_rele(dmu_buf_t *db, void *tag)
  *
  */
 void
-dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting)
+dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting)
 {
        int64_t holds;
        uint64_t size;
@@ -3580,59 +4017,39 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting)
                         * This dbuf has anonymous data associated with it.
                         */
                        dbuf_destroy(db);
-               } else {
-                       boolean_t do_arc_evict = B_FALSE;
-                       blkptr_t bp;
-                       spa_t *spa = dmu_objset_spa(db->db_objset);
-
-                       if (!DBUF_IS_CACHEABLE(db) &&
-                           db->db_blkptr != NULL &&
-                           !BP_IS_HOLE(db->db_blkptr) &&
-                           !BP_IS_EMBEDDED(db->db_blkptr)) {
-                               do_arc_evict = B_TRUE;
-                               bp = *db->db_blkptr;
-                       }
-
-                       if (!DBUF_IS_CACHEABLE(db) ||
-                           db->db_pending_evict) {
-                               dbuf_destroy(db);
-                       } else if (!multilist_link_active(&db->db_cache_link)) {
-                               ASSERT3U(db->db_caching_status, ==,
-                                   DB_NO_CACHE);
-
-                               dbuf_cached_state_t dcs =
-                                   dbuf_include_in_metadata_cache(db) ?
-                                   DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
-                               db->db_caching_status = dcs;
-
-                               multilist_insert(dbuf_caches[dcs].cache, db);
-                               size = zfs_refcount_add_many(
-                                   &dbuf_caches[dcs].size,
-                                   db->db.db_size, db);
-
-                               if (dcs == DB_DBUF_METADATA_CACHE) {
-                                       DBUF_STAT_BUMP(metadata_cache_count);
-                                       DBUF_STAT_MAX(
-                                           metadata_cache_size_bytes_max,
-                                           size);
-                               } else {
-                                       DBUF_STAT_BUMP(
-                                           cache_levels[db->db_level]);
-                                       DBUF_STAT_BUMP(cache_count);
-                                       DBUF_STAT_INCR(
-                                           cache_levels_bytes[db->db_level],
-                                           db->db.db_size);
-                                       DBUF_STAT_MAX(cache_size_bytes_max,
-                                           size);
-                               }
-                               mutex_exit(&db->db_mtx);
+               } else if (!(DBUF_IS_CACHEABLE(db) || db->db_partial_read) ||
+                   db->db_pending_evict) {
+                       dbuf_destroy(db);
+               } else if (!multilist_link_active(&db->db_cache_link)) {
+                       ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
+
+                       dbuf_cached_state_t dcs =
+                           dbuf_include_in_metadata_cache(db) ?
+                           DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
+                       db->db_caching_status = dcs;
+
+                       multilist_insert(&dbuf_caches[dcs].cache, db);
+                       uint64_t db_size = db->db.db_size +
+                           dmu_buf_user_size(&db->db);
+                       size = zfs_refcount_add_many(
+                           &dbuf_caches[dcs].size, db_size, db);
+                       uint8_t db_level = db->db_level;
+                       mutex_exit(&db->db_mtx);
 
-                               if (dcs == DB_DBUF_CACHE && !evicting)
-                                       dbuf_evict_notify(size);
+                       if (dcs == DB_DBUF_METADATA_CACHE) {
+                               DBUF_STAT_BUMP(metadata_cache_count);
+                               DBUF_STAT_MAX(metadata_cache_size_bytes_max,
+                                   size);
+                       } else {
+                               DBUF_STAT_BUMP(cache_count);
+                               DBUF_STAT_MAX(cache_size_bytes_max, size);
+                               DBUF_STAT_BUMP(cache_levels[db_level]);
+                               DBUF_STAT_INCR(cache_levels_bytes[db_level],
+                                   db_size);
                        }
 
-                       if (do_arc_evict)
-                               arc_freed(spa, &bp);
+                       if (dcs == DB_DBUF_CACHE && !evicting)
+                               dbuf_evict_notify(size);
                }
        } else {
                mutex_exit(&db->db_mtx);
@@ -3709,8 +4126,37 @@ dmu_buf_get_user(dmu_buf_t *db_fake)
        return (db->db_user);
 }
 
+uint64_t
+dmu_buf_user_size(dmu_buf_t *db_fake)
+{
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+       if (db->db_user == NULL)
+               return (0);
+       return (atomic_load_64(&db->db_user->dbu_size));
+}
+
+void
+dmu_buf_add_user_size(dmu_buf_t *db_fake, uint64_t nadd)
+{
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+       ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
+       ASSERT3P(db->db_user, !=, NULL);
+       ASSERT3U(atomic_load_64(&db->db_user->dbu_size), <, UINT64_MAX - nadd);
+       atomic_add_64(&db->db_user->dbu_size, nadd);
+}
+
+void
+dmu_buf_sub_user_size(dmu_buf_t *db_fake, uint64_t nsub)
+{
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+       ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
+       ASSERT3P(db->db_user, !=, NULL);
+       ASSERT3U(atomic_load_64(&db->db_user->dbu_size), >=, nsub);
+       atomic_sub_64(&db->db_user->dbu_size, nsub);
+}
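
/*
 * For illustration only (not part of the patch): a dbuf user keeping its
 * dbu_size current so the dbuf caches account for externally allocated
 * memory. Per the ASSERTs above, the size may only change while the dbuf
 * is held (DB_NO_CACHE). The helper is hypothetical.
 */
static void
user_resize_sketch(dmu_buf_t *dbuf, uint64_t oldsz, uint64_t newsz)
{
	if (newsz > oldsz)
		dmu_buf_add_user_size(dbuf, newsz - oldsz);
	else if (newsz < oldsz)
		dmu_buf_sub_user_size(dbuf, oldsz - newsz);
}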
+
 void
-dmu_buf_user_evict_wait()
+dmu_buf_user_evict_wait(void)
 {
        taskq_wait(dbu_evict_taskq);
 }
@@ -3729,21 +4175,6 @@ dmu_buf_get_objset(dmu_buf_t *db)
        return (dbi->db_objset);
 }
 
-dnode_t *
-dmu_buf_dnode_enter(dmu_buf_t *db)
-{
-       dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
-       DB_DNODE_ENTER(dbi);
-       return (DB_DNODE(dbi));
-}
-
-void
-dmu_buf_dnode_exit(dmu_buf_t *db)
-{
-       dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
-       DB_DNODE_EXIT(dbi);
-}
-
 static void
 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
 {
@@ -3797,15 +4228,13 @@ dbuf_sync_bonus(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 
        ASSERT0(db->db_level);
        ASSERT(MUTEX_HELD(&db->db_mtx));
-       ASSERT(DB_DNODE_HELD(db));
        ASSERT(db->db_blkid == DMU_BONUS_BLKID);
        ASSERT(data != NULL);
 
-       dnode_t *dn = DB_DNODE(db);
+       dnode_t *dn = dr->dr_dnode;
        ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=,
            DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
-       bcopy(data, DN_BONUS(dn->dn_phys), DN_MAX_BONUS_LEN(dn->dn_phys));
-       DB_DNODE_EXIT(db);
+       memcpy(DN_BONUS(dn->dn_phys), data, DN_MAX_BONUS_LEN(dn->dn_phys));
 
        dbuf_sync_leaf_verify_bonus_dnode(dr);
 
@@ -3864,8 +4293,7 @@ noinline static void
 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 {
        dmu_buf_impl_t *db = dr->dr_dbuf;
-       dnode_t *dn;
-       zio_t *zio;
+       dnode_t *dn = dr->dr_dnode;
 
        ASSERT(dmu_tx_is_syncing(tx));
 
@@ -3885,12 +4313,9 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
        ASSERT3U(db->db_state, ==, DB_CACHED);
        ASSERT(db->db_buf != NULL);
 
-       DB_DNODE_ENTER(db);
-       dn = DB_DNODE(db);
        /* Indirect block size must match what the dnode thinks it is. */
        ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
        dbuf_check_blkptr(dn, db);
-       DB_DNODE_EXIT(db);
 
        /* Provide the pending dirty record to child dbufs */
        db->db_data_pending = dr;
@@ -3899,7 +4324,7 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 
        dbuf_write(dr, db->db_buf, tx);
 
-       zio = dr->dr_zio;
+       zio_t *zio = dr->dr_zio;
        mutex_enter(&dr->dt.di.dr_mtx);
        dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
        ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
@@ -3924,7 +4349,7 @@ static void
 dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr)
 {
 #ifdef ZFS_DEBUG
-       dnode_t *dn = DB_DNODE(dr->dr_dbuf);
+       dnode_t *dn = dr->dr_dnode;
 
        /*
         * Encrypted bonus buffers can have data past their bonuslen.
@@ -3947,6 +4372,128 @@ dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr)
 #endif
 }
 
+static blkptr_t *
+dbuf_lightweight_bp(dbuf_dirty_record_t *dr)
+{
+       /* This must be a lightweight dirty record. */
+       ASSERT3P(dr->dr_dbuf, ==, NULL);
+       dnode_t *dn = dr->dr_dnode;
+
+       if (dn->dn_phys->dn_nlevels == 1) {
+               VERIFY3U(dr->dt.dll.dr_blkid, <, dn->dn_phys->dn_nblkptr);
+               return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]);
+       } else {
+               dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf;
+               int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+               VERIFY3U(parent_db->db_level, ==, 1);
+               VERIFY3P(parent_db->db_dnode_handle->dnh_dnode, ==, dn);
+               VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid);
+               blkptr_t *bp = parent_db->db.db_data;
+               return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]);
+       }
+}
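
/*
 * A worked example of the indexing in dbuf_lightweight_bp(), with assumed
 * geometry: dn_indblkshift = 17 (128K indirect blocks) and
 * SPA_BLKPTRSHIFT = 7 (128-byte blkptrs) give epbs = 10, so each L1
 * indirect holds 1 << 10 = 1024 block pointers. For dr_blkid = 2500, the
 * parent L1 dbuf is blkid 2500 >> 10 = 2 and the entry within its
 * db.db_data array is 2500 & 1023 = 452.
 */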
+
+static void
+dbuf_lightweight_ready(zio_t *zio)
+{
+       dbuf_dirty_record_t *dr = zio->io_private;
+       blkptr_t *bp = zio->io_bp;
+
+       if (zio->io_error != 0)
+               return;
+
+       dnode_t *dn = dr->dr_dnode;
+
+       blkptr_t *bp_orig = dbuf_lightweight_bp(dr);
+       spa_t *spa = dmu_objset_spa(dn->dn_objset);
+       int64_t delta = bp_get_dsize_sync(spa, bp) -
+           bp_get_dsize_sync(spa, bp_orig);
+       dnode_diduse_space(dn, delta);
+
+       uint64_t blkid = dr->dt.dll.dr_blkid;
+       mutex_enter(&dn->dn_mtx);
+       if (blkid > dn->dn_phys->dn_maxblkid) {
+               ASSERT0(dn->dn_objset->os_raw_receive);
+               dn->dn_phys->dn_maxblkid = blkid;
+       }
+       mutex_exit(&dn->dn_mtx);
+
+       if (!BP_IS_EMBEDDED(bp)) {
+               uint64_t fill = BP_IS_HOLE(bp) ? 0 : 1;
+               BP_SET_FILL(bp, fill);
+       }
+
+       dmu_buf_impl_t *parent_db;
+       EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
+       if (dr->dr_parent == NULL) {
+               parent_db = dn->dn_dbuf;
+       } else {
+               parent_db = dr->dr_parent->dr_dbuf;
+       }
+       rw_enter(&parent_db->db_rwlock, RW_WRITER);
+       *bp_orig = *bp;
+       rw_exit(&parent_db->db_rwlock);
+}
+
+static void
+dbuf_lightweight_done(zio_t *zio)
+{
+       dbuf_dirty_record_t *dr = zio->io_private;
+
+       VERIFY0(zio->io_error);
+
+       objset_t *os = dr->dr_dnode->dn_objset;
+       dmu_tx_t *tx = os->os_synctx;
+
+       if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
+               ASSERT(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
+       } else {
+               dsl_dataset_t *ds = os->os_dsl_dataset;
+               (void) dsl_dataset_block_kill(ds, &zio->io_bp_orig, tx, B_TRUE);
+               dsl_dataset_block_born(ds, zio->io_bp, tx);
+       }
+
+       dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
+           zio->io_txg);
+
+       abd_free(dr->dt.dll.dr_abd);
+       kmem_free(dr, sizeof (*dr));
+}
+
+noinline static void
+dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
+{
+       dnode_t *dn = dr->dr_dnode;
+       zio_t *pio;
+       if (dn->dn_phys->dn_nlevels == 1) {
+               pio = dn->dn_zio;
+       } else {
+               pio = dr->dr_parent->dr_zio;
+       }
+
+       zbookmark_phys_t zb = {
+               .zb_objset = dmu_objset_id(dn->dn_objset),
+               .zb_object = dn->dn_object,
+               .zb_level = 0,
+               .zb_blkid = dr->dt.dll.dr_blkid,
+       };
+
+       /*
+        * See comment in dbuf_write().  This is so that zio->io_bp_orig
+        * will have the old BP in dbuf_lightweight_done().
+        */
+       dr->dr_bp_copy = *dbuf_lightweight_bp(dr);
+
+       dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset),
+           dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd,
+           dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd),
+           &dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL,
+           dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE,
+           ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);
+
+       zio_nowait(dr->dr_zio);
+}
+
 /*
  * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is
 * critical that we not allow the compiler to inline this function into
@@ -3957,7 +4504,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 {
        arc_buf_t **datap = &dr->dt.dl.dr_data;
        dmu_buf_impl_t *db = dr->dr_dbuf;
-       dnode_t *dn;
+       dnode_t *dn = dr->dr_dnode;
        objset_t *os;
        uint64_t txg = tx->tx_txg;
 
@@ -3976,14 +4523,20 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
        } else if (db->db_state == DB_FILL) {
                /* This buffer was freed and is now being re-filled */
                ASSERT(db->db.db_data != dr->dt.dl.dr_data);
+       } else if (db->db_state == DB_READ) {
+               /*
+                * This buffer has a clone we need to write, and an in-flight
+                * read on the BP we're about to clone. It's safe to issue the
+                * write here because the read has already been issued and the
+                * contents won't change.
+                */
+               ASSERT(dr->dt.dl.dr_brtwrite &&
+                   dr->dt.dl.dr_override_state == DR_OVERRIDDEN);
        } else {
                ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
        }
        DBUF_VERIFY(db);
 
-       DB_DNODE_ENTER(db);
-       dn = DB_DNODE(db);
-
        if (db->db_blkid == DMU_SPILL_BLKID) {
                mutex_enter(&dn->dn_mtx);
                if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
@@ -4035,7 +4588,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
        while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
                ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
                cv_wait(&db->db_changed, &db->db_mtx);
-               ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
        }
 
        /*
@@ -4061,8 +4613,31 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
                 * objects only modified in the syncing context (e.g.
                 * DNODE blocks).
                 */
-               *datap = dbuf_alloc_arcbuf_from_arcbuf(db, db->db_buf);
-               bcopy(db->db.db_data, (*datap)->b_data, arc_buf_size(*datap));
+               int psize = arc_buf_size(*datap);
+               int lsize = arc_buf_lsize(*datap);
+               arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+               enum zio_compress compress_type = arc_get_compression(*datap);
+               uint8_t complevel = arc_get_complevel(*datap);
+
+               if (arc_is_encrypted(*datap)) {
+                       boolean_t byteorder;
+                       uint8_t salt[ZIO_DATA_SALT_LEN];
+                       uint8_t iv[ZIO_DATA_IV_LEN];
+                       uint8_t mac[ZIO_DATA_MAC_LEN];
+
+                       arc_get_raw_params(*datap, &byteorder, salt, iv, mac);
+                       *datap = arc_alloc_raw_buf(os->os_spa, db,
+                           dmu_objset_id(os), byteorder, salt, iv, mac,
+                           dn->dn_type, psize, lsize, compress_type,
+                           complevel);
+               } else if (compress_type != ZIO_COMPRESS_OFF) {
+                       ASSERT3U(type, ==, ARC_BUFC_DATA);
+                       *datap = arc_alloc_compressed_buf(os->os_spa, db,
+                           psize, lsize, compress_type, complevel);
+               } else {
+                       *datap = arc_alloc_buf(os->os_spa, db, type, psize);
+               }
+               memcpy((*datap)->b_data, db->db.db_data, psize);
        }
        db->db_data_pending = dr;
 
@@ -4073,20 +4648,15 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
        ASSERT(!list_link_active(&dr->dr_dirty_node));
        if (dn->dn_object == DMU_META_DNODE_OBJECT) {
                list_insert_tail(&dn->dn_dirty_records[txg & TXG_MASK], dr);
-               DB_DNODE_EXIT(db);
        } else {
-               /*
-                * Although zio_nowait() does not "wait for an IO", it does
-                * initiate the IO. If this is an empty write it seems plausible
-                * that the IO could actually be completed before the nowait
-                * returns. We need to DB_DNODE_EXIT() first in case
-                * zio_nowait() invalidates the dbuf.
-                */
-               DB_DNODE_EXIT(db);
                zio_nowait(dr->dr_zio);
        }
 }
 
+/*
+ * Syncs out a range of dirty records for indirect or leaf dbufs.  May be
+ * called recursively from dbuf_sync_indirect().
+ */
 void
 dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
 {
@@ -4105,22 +4675,26 @@ dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
                            DMU_META_DNODE_OBJECT);
                        break;
                }
-               if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
-                   dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
-                       VERIFY3U(dr->dr_dbuf->db_level, ==, level);
-               }
                list_remove(list, dr);
-               if (dr->dr_dbuf->db_level > 0)
-                       dbuf_sync_indirect(dr, tx);
-               else
-                       dbuf_sync_leaf(dr, tx);
+               if (dr->dr_dbuf == NULL) {
+                       dbuf_sync_lightweight(dr, tx);
+               } else {
+                       if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+                           dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
+                               VERIFY3U(dr->dr_dbuf->db_level, ==, level);
+                       }
+                       if (dr->dr_dbuf->db_level > 0)
+                               dbuf_sync_indirect(dr, tx);
+                       else
+                               dbuf_sync_leaf(dr, tx);
+               }
        }
 }
 
-/* ARGSUSED */
 static void
 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
+       (void) buf;
        dmu_buf_impl_t *db = vdb;
        dnode_t *dn;
        blkptr_t *bp = zio->io_bp;
@@ -4139,7 +4713,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
        dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
        zio->io_prev_space_delta = delta;
 
-       if (bp->blk_birth != 0) {
+       if (BP_GET_LOGICAL_BIRTH(bp) != 0) {
                ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
                    BP_GET_TYPE(bp) == dn->dn_type) ||
                    (db->db_blkid == DMU_SPILL_BLKID &&
@@ -4176,6 +4750,20 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
                                i += DNODE_MIN_SIZE;
                                if (dnp->dn_type != DMU_OT_NONE) {
                                        fill++;
+                                       for (int j = 0; j < dnp->dn_nblkptr;
+                                           j++) {
+                                               (void) zfs_blkptr_verify(spa,
+                                                   &dnp->dn_blkptr[j],
+                                                   BLK_CONFIG_SKIP,
+                                                   BLK_VERIFY_HALT);
+                                       }
+                                       if (dnp->dn_flags &
+                                           DNODE_FLAG_SPILL_BLKPTR) {
+                                               (void) zfs_blkptr_verify(spa,
+                                                   DN_SPILL_BLKPTR(dnp),
+                                                   BLK_CONFIG_SKIP,
+                                                   BLK_VERIFY_HALT);
+                                       }
                                        i += dnp->dn_extra_slots *
                                            DNODE_MIN_SIZE;
                                }
@@ -4193,6 +4781,8 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
                for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
                        if (BP_IS_HOLE(ibp))
                                continue;
+                       (void) zfs_blkptr_verify(spa, ibp,
+                           BLK_CONFIG_SKIP, BLK_VERIFY_HALT);
                        fill += BP_GET_FILL(ibp);
                }
        }
@@ -4208,7 +4798,6 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
        dmu_buf_unlock_parent(db, dblt, FTAG);
 }
 
-/* ARGSUSED */
 /*
  * This function gets called just prior to running through the compression
  * stage of the zio pipeline. If we're an indirect block comprised of only
@@ -4219,6 +4808,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 static void
 dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
+       (void) zio, (void) buf;
        dmu_buf_impl_t *db = vdb;
        dnode_t *dn;
        blkptr_t *bp;
@@ -4247,53 +4837,21 @@ dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
                 * zero out.
                 */
                rw_enter(&db->db_rwlock, RW_WRITER);
-               bzero(db->db.db_data, db->db.db_size);
+               memset(db->db.db_data, 0, db->db.db_size);
                rw_exit(&db->db_rwlock);
        }
        DB_DNODE_EXIT(db);
 }
 
-/*
- * The SPA will call this callback several times for each zio - once
- * for every physical child i/o (zio->io_phys_children times).  This
- * allows the DMU to monitor the progress of each logical i/o.  For example,
- * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
- * block.  There may be a long delay before all copies/fragments are completed,
- * so this callback allows us to retire dirty space gradually, as the physical
- * i/os complete.
- */
-/* ARGSUSED */
-static void
-dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
-{
-       dmu_buf_impl_t *db = arg;
-       objset_t *os = db->db_objset;
-       dsl_pool_t *dp = dmu_objset_pool(os);
-       dbuf_dirty_record_t *dr;
-       int delta = 0;
-
-       dr = db->db_data_pending;
-       ASSERT3U(dr->dr_txg, ==, zio->io_txg);
-
-       /*
-        * The callback will be called io_phys_children times.  Retire one
-        * portion of our dirty space each time we are called.  Any rounding
-        * error will be cleaned up by dbuf_write_done().
-        */
-       delta = dr->dr_accounted / zio->io_phys_children;
-       dsl_pool_undirty_space(dp, delta, zio->io_txg);
-}
-
-/* ARGSUSED */
 static void
 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
+       (void) buf;
        dmu_buf_impl_t *db = vdb;
        blkptr_t *bp_orig = &zio->io_bp_orig;
        blkptr_t *bp = db->db_blkptr;
        objset_t *os = db->db_objset;
        dmu_tx_t *tx = os->os_synctx;
-       dbuf_dirty_record_t *dr;
 
        ASSERT0(zio->io_error);
        ASSERT(db->db_blkptr == bp);
@@ -4314,7 +4872,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 
        DBUF_VERIFY(db);
 
-       dr = db->db_data_pending;
+       dbuf_dirty_record_t *dr = db->db_data_pending;
+       dnode_t *dn = dr->dr_dnode;
        ASSERT(!list_link_active(&dr->dr_dirty_node));
        ASSERT(dr->dr_dbuf == db);
        ASSERT(list_next(&db->db_dirty_records, dr) == NULL);
@@ -4322,14 +4881,9 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 
 #ifdef ZFS_DEBUG
        if (db->db_blkid == DMU_SPILL_BLKID) {
-               dnode_t *dn;
-
-               DB_DNODE_ENTER(db);
-               dn = DB_DNODE(db);
                ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
                ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
                    db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
-               DB_DNODE_EXIT(db);
        }
 #endif
 
@@ -4337,14 +4891,12 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
                ASSERT(db->db_blkid != DMU_BONUS_BLKID);
                ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
                if (db->db_state != DB_NOFILL) {
-                       if (dr->dt.dl.dr_data != db->db_buf)
+                       if (dr->dt.dl.dr_data != NULL &&
+                           dr->dt.dl.dr_data != db->db_buf) {
                                arc_buf_destroy(dr->dt.dl.dr_data, db);
+                       }
                }
        } else {
-               dnode_t *dn;
-
-               DB_DNODE_ENTER(db);
-               dn = DB_DNODE(db);
                ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
                ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
                if (!BP_IS_HOLE(db->db_blkptr)) {
@@ -4355,7 +4907,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
                        ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
                            db->db.db_size);
                }
-               DB_DNODE_EXIT(db);
                mutex_destroy(&dr->dt.di.dr_mtx);
                list_destroy(&dr->dt.di.dr_children);
        }
@@ -4366,27 +4917,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
        db->db_data_pending = NULL;
        dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
 
-       /*
-        * If we didn't do a physical write in this ZIO and we
-        * still ended up here, it means that the space of the
-        * dbuf that we just released (and undirtied) above hasn't
-        * been marked as undirtied in the pool's accounting.
-        *
-        * Thus, we undirty that space in the pool's view of the
-        * world here. For physical writes this type of update
-        * happens in dbuf_write_physdone().
-        *
-        * If we did a physical write, cleanup any rounding errors
-        * that came up due to writing multiple copies of a block
-        * on disk [see dbuf_write_physdone()].
-        */
-       if (zio->io_phys_children == 0) {
-               dsl_pool_undirty_space(dmu_objset_pool(os),
-                   dr->dr_accounted, zio->io_txg);
-       } else {
-               dsl_pool_undirty_space(dmu_objset_pool(os),
-                   dr->dr_accounted % zio->io_phys_children, zio->io_txg);
-       }
+       dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
+           zio->io_txg);
 
        kmem_free(dr, sizeof (dbuf_dirty_record_t));
 }
@@ -4430,7 +4962,7 @@ dbuf_write_override_done(zio_t *zio)
        dbuf_write_done(zio, NULL, db);
 
        if (zio->io_abd != NULL)
-               abd_put(zio->io_abd);
+               abd_free(zio->io_abd);
 }
 
 typedef struct dbuf_remap_impl_callback_arg {
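
Note: the abd_put() call becomes abd_free(); newer OpenZFS folded the two
together so that a single function releases both allocated and borrowed
ABDs. A hedged fragment of the borrow/release pattern (kernel-side API,
shown for shape only, not compilable outside the ZFS tree):

        abd_t *abd = abd_get_from_buf(buf, size); /* borrow an existing buffer */
        /* ... attach abd to a zio, as in dbuf_write_override_done() above ... */
        abd_free(abd);  /* replaces the old abd_put() for borrowed ABDs */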
@@ -4468,7 +5000,7 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
        ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
 
        drica.drica_os = dn->dn_objset;
-       drica.drica_blk_birth = bp->blk_birth;
+       drica.drica_blk_birth = BP_GET_LOGICAL_BIRTH(bp);
        drica.drica_tx = tx;
        if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
            &drica)) {
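
Note: here and below, direct reads of bp->blk_birth give way to
BP_GET_LOGICAL_BIRTH(). The macro definitions are not part of this file;
a plausible sketch, assuming they simply wrap the existing field:

        #define BP_GET_LOGICAL_BIRTH(bp)        ((bp)->blk_birth)
        #define BP_SET_LOGICAL_BIRTH(bp, x)     ((bp)->blk_birth = (x))

Funneling every access through accessors means a later change can separate
logical from physical birth times without revisiting call sites like this
one.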
@@ -4483,7 +5015,8 @@ dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx)
                if (dn->dn_objset != spa_meta_objset(spa)) {
                        dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset);
                        if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
-                           bp->blk_birth > ds->ds_dir->dd_origin_txg) {
+                           BP_GET_LOGICAL_BIRTH(bp) >
+                           ds->ds_dir->dd_origin_txg) {
                                ASSERT(!BP_IS_EMBEDDED(bp));
                                ASSERT(dsl_dir_is_clone(ds->ds_dir));
                                ASSERT(spa_feature_is_enabled(spa,
@@ -4543,12 +5076,15 @@ dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
 }
 
 
-/* Issue I/O to commit a dirty buffer to disk. */
+/*
+ * Populate dr->dr_zio with a zio to commit a dirty buffer to disk.
+ * The caller is responsible for issuing zio_[no]wait() on dr->dr_zio.
+ */
 static void
 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 {
        dmu_buf_impl_t *db = dr->dr_dbuf;
-       dnode_t *dn;
+       dnode_t *dn = dr->dr_dnode;
        objset_t *os;
        dmu_buf_impl_t *parent = db->db_parent;
        uint64_t txg = tx->tx_txg;
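
Note: the rewritten comment makes the contract explicit: dbuf_write() only
constructs the zio and parks it in dr->dr_zio. A hypothetical caller sketch
(dbuf_sync_leaf() is the usual sync-path caller; variable names assumed):

        dbuf_write(dr, *datap, tx);     /* builds dr->dr_zio, does not issue it */
        zio_nowait(dr->dr_zio);         /* the caller fires the write */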
@@ -4559,8 +5095,6 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 
        ASSERT(dmu_tx_is_syncing(tx));
 
-       DB_DNODE_ENTER(db);
-       dn = DB_DNODE(db);
        os = dn->dn_objset;
 
        if (db->db_state != DB_NOFILL) {
@@ -4604,7 +5138,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
        }
 
        ASSERT(db->db_level == 0 || data == db->db_buf);
-       ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
+       ASSERT3U(BP_GET_LOGICAL_BIRTH(db->db_blkptr), <=, txg);
        ASSERT(pio);
 
        SET_BOOKMARK(&zb, os->os_dsl_dataset ?
@@ -4616,7 +5150,6 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
        wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
 
        dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
-       DB_DNODE_EXIT(db);
 
        /*
         * We copy the blkptr now (rather than when we instantiate the dirty
@@ -4637,20 +5170,21 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 
                dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy,
                    contents, db->db.db_size, db->db.db_size, &zp,
-                   dbuf_write_override_ready, NULL, NULL,
+                   dbuf_write_override_ready, NULL,
                    dbuf_write_override_done,
                    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
                mutex_enter(&db->db_mtx);
                dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
                zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
-                   dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
+                   dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite,
+                   dr->dt.dl.dr_brtwrite);
                mutex_exit(&db->db_mtx);
        } else if (db->db_state == DB_NOFILL) {
                ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
                    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
                dr->dr_zio = zio_write(pio, os->os_spa, txg,
                    &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
-                   dbuf_write_nofill_ready, NULL, NULL,
+                   dbuf_write_nofill_ready, NULL,
                    dbuf_write_nofill_done, db,
                    ZIO_PRIORITY_ASYNC_WRITE,
                    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
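
Note: zio_write_override() picks up a block-cloning flag in this hunk. A
sketch of the widened prototype as the call implies it (the authoritative
declaration lives in the zio headers, not in this diff):

        void zio_write_override(zio_t *zio, blkptr_t *bp, int copies,
            boolean_t nopwrite, boolean_t brtwrite);

dr_brtwrite presumably marks dirty records produced by block cloning,
letting the override path commit the cloned block pointer without writing
new data.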
@@ -4667,11 +5201,10 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
                        children_ready_cb = dbuf_write_children_ready;
 
                dr->dr_zio = arc_write(pio, os->os_spa, txg,
-                   &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db),
-                   &zp, dbuf_write_ready,
-                   children_ready_cb, dbuf_write_physdone,
-                   dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
-                   ZIO_FLAG_MUSTSUCCEED, &zb);
+                   &dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db),
+                   dbuf_is_l2cacheable(db), &zp, dbuf_write_ready,
+                   children_ready_cb, dbuf_write_done, db,
+                   ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
        }
 }
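
Note: two changes land in this arc_write() call: the physdone callback
disappears (matching the accounting simplification in dbuf_write_done()
above), and the single L2ARC flag becomes two cache hints. A sketch of the
prototype this call shape suggests (parameter names are assumptions):

        zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
            arc_buf_t *buf, boolean_t uncached, boolean_t l2arc,
            const zio_prop_t *zp, arc_write_done_func_t *ready,
            arc_write_done_func_t *children_ready,
            arc_write_done_func_t *done, void *priv,
            zio_priority_t priority, int zio_flags,
            const zbookmark_phys_t *zb);

Passing !DBUF_IS_CACHEABLE(db) as the uncached hint lets the ARC drop the
buffer right after the write completes instead of keeping it cached.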
 
@@ -4689,6 +5222,7 @@ EXPORT_SYMBOL(dbuf_dirty);
 EXPORT_SYMBOL(dmu_buf_set_crypt_params);
 EXPORT_SYMBOL(dmu_buf_will_dirty);
 EXPORT_SYMBOL(dmu_buf_is_dirty);
+EXPORT_SYMBOL(dmu_buf_will_clone);
 EXPORT_SYMBOL(dmu_buf_will_not_fill);
 EXPORT_SYMBOL(dmu_buf_will_fill);
 EXPORT_SYMBOL(dmu_buf_fill_done);
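
Note: dmu_buf_will_clone is exported alongside the existing will_dirty/
will_fill entry points; judging by the brtwrite plumbing above, it is the
dbuf-layer hook block cloning uses to announce that a buffer's next sync
comes from a cloned block pointer rather than fresh data. A hypothetical
consumer fragment (signature assumed to mirror dmu_buf_will_dirty()):

        dmu_buf_will_clone(db, tx);     /* contents will be replaced by a clone */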
@@ -4711,25 +5245,23 @@ EXPORT_SYMBOL(dmu_buf_set_user_ie);
 EXPORT_SYMBOL(dmu_buf_get_user);
 EXPORT_SYMBOL(dmu_buf_get_blkptr);
 
-/* BEGIN CSTYLED */
-ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, U64, ZMOD_RW,
        "Maximum size in bytes of the dbuf cache.");
 
 ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW,
-       "Percentage over dbuf_cache_max_bytes when dbufs must be evicted "
-       "directly.");
+       "Percentage over dbuf_cache_max_bytes for direct dbuf eviction.");
 
 ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, lowater_pct, UINT, ZMOD_RW,
-       "Percentage below dbuf_cache_max_bytes when the evict thread stops "
-       "evicting dbufs.");
+       "Percentage below dbuf_cache_max_bytes when dbuf eviction stops.");
+
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, U64, ZMOD_RW,
+       "Maximum size in bytes of dbuf metadata cache.");
 
-ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, ULONG, ZMOD_RW,
-       "Maximum size in bytes of the dbuf metadata cache.");
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, UINT, ZMOD_RW,
+       "Set size of dbuf cache to log2 fraction of arc size.");
 
-ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, INT, ZMOD_RW,
-       "Set the size of the dbuf cache to a log2 fraction of arc size.");
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, UINT, ZMOD_RW,
+       "Set size of dbuf metadata cache to log2 fraction of arc size.");
 
-ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, INT, ZMOD_RW,
-       "Set the size of the dbuf metadata cache to a log2 fraction of arc "
-       "size.");
-/* END CSTYLED */
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD,
+       "Set size of dbuf cache mutex array as log2 shift.");