git.proxmox.com Git - mirror_zfs-debian.git/blobdiff - module/zfs/dbuf.c
Imported Upstream version 0.6.4.2
[mirror_zfs-debian.git] / module / zfs / dbuf.c
index c8a52617178e573f12f4426b48f80dee302b45a1..ed6a8fd2a4dc1b59f75321625759cbe9580835dc 100644 (file)
@@ -21,7 +21,7 @@
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  */
 
 #include <sys/dmu_zfetch.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
+#include <sys/zfeature.h>
+#include <sys/blkptr.h>
+#include <sys/range_tree.h>
+#include <sys/trace_dbuf.h>
 
 struct dbuf_hold_impl_data {
        /* Function arguments */
@@ -208,8 +212,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db)
 }
 
 /*
- * Remove an entry from the hash table.  This operation will
- * fail if there are any existing holds on the db.
+ * Remove an entry from the hash table.  It must be in the EVICTING state.
  */
 static void
 dbuf_hash_remove(dmu_buf_impl_t *db)
@@ -223,7 +226,7 @@ dbuf_hash_remove(dmu_buf_impl_t *db)
        idx = hv & h->hash_table_mask;
 
        /*
-        * We musn't hold db_mtx to maintin lock ordering:
+        * We mustn't hold db_mtx to maintain lock ordering:
         * DBUF_HASH_MUTEX > db_mtx.
         */
        ASSERT(refcount_is_zero(&db->db_holds));
@@ -263,7 +266,10 @@ dbuf_evict_user(dmu_buf_impl_t *db)
 boolean_t
 dbuf_is_metadata(dmu_buf_impl_t *db)
 {
-       if (db->db_level > 0) {
+       /*
+        * Consider indirect blocks and spill blocks to be metadata.
+        */
+       if (db->db_level > 0 || db->db_blkid == DMU_SPILL_BLKID) {
                return (B_TRUE);
        } else {
                boolean_t is_metadata;
@@ -309,7 +315,7 @@ retry:
         * Large allocations which do not require contiguous pages
         * should be using vmem_alloc() in the linux kernel
         */
-       h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_PUSHPAGE);
+       h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
 #else
        h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
 #endif
@@ -481,7 +487,6 @@ static void
 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
 {
        ASSERT(MUTEX_HELD(&db->db_mtx));
-       ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
        db->db_buf = buf;
        if (buf != NULL) {
                ASSERT(buf->b_data != NULL);
@@ -508,10 +513,9 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db)
        mutex_enter(&db->db_mtx);
        if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
                int blksz = db->db.db_size;
-               spa_t *spa;
+               spa_t *spa = db->db_objset->os_spa;
 
                mutex_exit(&db->db_mtx);
-               DB_GET_SPA(&spa, db);
                abuf = arc_loan_buf(spa, blksz);
                bcopy(db->db.db_data, abuf->b_data, blksz);
        } else {
@@ -568,13 +572,13 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
        dbuf_rele_and_unlock(db, NULL);
 }
 
-static void
+static int
 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
 {
        dnode_t *dn;
-       spa_t *spa;
-       zbookmark_t zb;
+       zbookmark_phys_t zb;
        uint32_t aflags = ARC_NOWAIT;
+       int err;
 
        DB_DNODE_ENTER(db);
        dn = DB_DNODE(db);
@@ -599,7 +603,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
                dbuf_update_data(db);
                db->db_state = DB_CACHED;
                mutex_exit(&db->db_mtx);
-               return;
+               return (0);
        }
 
        /*
@@ -612,17 +616,16 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
            BP_IS_HOLE(db->db_blkptr)))) {
                arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 
-               dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
-                   db->db.db_size, db, type));
                DB_DNODE_EXIT(db);
+               dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
+                   db->db.db_size, db, type));
                bzero(db->db.db_data, db->db.db_size);
                db->db_state = DB_CACHED;
                *flags |= DB_RF_CACHED;
                mutex_exit(&db->db_mtx);
-               return;
+               return (0);
        }
 
-       spa = dn->dn_objset->os_spa;
        DB_DNODE_EXIT(db);
 
        db->db_state = DB_READ;
@@ -639,20 +642,22 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
 
        dbuf_add_ref(db, NULL);
 
-       (void) arc_read(zio, spa, db->db_blkptr,
+       err = arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
            dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
            (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
            &aflags, &zb);
        if (aflags & ARC_CACHED)
                *flags |= DB_RF_CACHED;
+
+       return (SET_ERROR(err));
 }
 
 int
 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 {
        int err = 0;
-       int havepzio = (zio != NULL);
-       int prefetch;
+       boolean_t havepzio = (zio != NULL);
+       boolean_t prefetch;
        dnode_t *dn;
 
        /*
@@ -687,11 +692,12 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 
                if (zio == NULL)
                        zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
-               dbuf_read_impl(db, zio, &flags);
+
+               err = dbuf_read_impl(db, zio, &flags);
 
                /* dbuf_read_impl has dropped db_mtx for us */
 
-               if (prefetch)
+               if (!err && prefetch)
                        dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
                            db->db.db_size, flags & DB_RF_CACHED);
 
@@ -699,7 +705,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
                        rw_exit(&dn->dn_struct_rwlock);
                DB_DNODE_EXIT(db);
 
-               if (!havepzio)
+               if (!err && !havepzio)
                        err = zio_wait(zio);
        } else {
                /*
@@ -725,6 +731,8 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
                            db->db_state == DB_FILL) {
                                ASSERT(db->db_state == DB_READ ||
                                    (flags & DB_RF_HAVESTRUCT) == 0);
+                               DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
+                                   db, zio_t *, zio);
                                cv_wait(&db->db_changed, &db->db_mtx);
                        }
                        if (db->db_state == DB_UNCACHED)
@@ -747,11 +755,10 @@ dbuf_noread(dmu_buf_impl_t *db)
                cv_wait(&db->db_changed, &db->db_mtx);
        if (db->db_state == DB_UNCACHED) {
                arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
-               spa_t *spa;
+               spa_t *spa = db->db_objset->os_spa;
 
                ASSERT(db->db_buf == NULL);
                ASSERT(db->db.db_data == NULL);
-               DB_GET_SPA(&spa, db);
                dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
                db->db_state = DB_FILL;
        } else if (db->db_state == DB_NOFILL) {
@@ -806,9 +813,8 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
        } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
                int size = db->db.db_size;
                arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
-               spa_t *spa;
+               spa_t *spa = db->db_objset->os_spa;
 
-               DB_GET_SPA(&spa, db);
                dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
                bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
        } else {
@@ -834,12 +840,9 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
        ASSERT(db->db_data_pending != dr);
 
        /* free this block */
-       if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) {
-               spa_t *spa;
+       if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
+               zio_free(db->db_objset->os_spa, txg, bp);
 
-               DB_GET_SPA(&spa, db);
-               zio_free(spa, txg, bp);
-       }
        dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
        dr->dt.dl.dr_nopwrite = B_FALSE;
 
@@ -857,9 +860,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
 /*
  * Evict (if its unreferenced) or clear (if its referenced) any level-0
  * data blocks in the free range, so that any future readers will find
- * empty blocks.  Also, if we happen across any level-1 dbufs in the
- * range that have not already been marked dirty, mark them dirty so
- * they stay in memory.
+ * empty blocks.
  *
  * This is a no-op if the dataset is in the middle of an incremental
  * receive; see comment below for details.
@@ -869,18 +870,16 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
 {
        dmu_buf_impl_t *db, *db_next;
        uint64_t txg = tx->tx_txg;
-       int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-       uint64_t first_l1 = start >> epbs;
-       uint64_t last_l1 = end >> epbs;
+       boolean_t freespill =
+           (start == DMU_SPILL_BLKID || end == DMU_SPILL_BLKID);
 
-       if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) {
+       if (end > dn->dn_maxblkid && !freespill)
                end = dn->dn_maxblkid;
-               last_l1 = end >> epbs;
-       }
        dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
 
        mutex_enter(&dn->dn_dbufs_mtx);
-       if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz) {
+       if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz &&
+           !freespill) {
                /* There can't be any dbufs in this range; no need to search. */
                mutex_exit(&dn->dn_dbufs_mtx);
                return;
@@ -899,24 +898,14 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
                db_next = list_next(&dn->dn_dbufs, db);
                ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
-               if (db->db_level == 1 &&
-                   db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
-                       mutex_enter(&db->db_mtx);
-                       if (db->db_last_dirty &&
-                           db->db_last_dirty->dr_txg < txg) {
-                               dbuf_add_ref(db, FTAG);
-                               mutex_exit(&db->db_mtx);
-                               dbuf_will_dirty(db, tx);
-                               dbuf_rele(db, FTAG);
-                       } else {
-                               mutex_exit(&db->db_mtx);
-                       }
-               }
-
+               /* Skip indirect blocks. */
                if (db->db_level != 0)
                        continue;
-               dprintf_dbuf(db, "found buf %s\n", "");
-               if (db->db_blkid < start || db->db_blkid > end)
+               /* Skip direct blocks outside the range. */
+               if (!freespill && (db->db_blkid < start || db->db_blkid > end))
+                       continue;
+               /* Skip all direct blocks, only free spill blocks. */
+               if (freespill && (db->db_blkid != DMU_SPILL_BLKID))
                        continue;
 
                /* found a level 0 buffer in the range */
@@ -992,24 +981,29 @@ dbuf_block_freeable(dmu_buf_impl_t *db)
         * We don't need any locking to protect db_blkptr:
         * If it's syncing, then db_last_dirty will be set
         * so we'll ignore db_blkptr.
+        *
+        * This logic ensures that only block births for
+        * filled blocks are considered.
         */
        ASSERT(MUTEX_HELD(&db->db_mtx));
-       if (db->db_last_dirty)
+       if (db->db_last_dirty && (db->db_blkptr == NULL ||
+           !BP_IS_HOLE(db->db_blkptr))) {
                birth_txg = db->db_last_dirty->dr_txg;
-       else if (db->db_blkptr)
+       } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
                birth_txg = db->db_blkptr->blk_birth;
+       }
 
        /*
-        * If we don't exist or are in a snapshot, we can't be freed.
+        * If this block doesn't exist or is in a snapshot, it can't be freed.
         * Don't pass the bp to dsl_dataset_block_freeable() since we
         * are holding the db_mtx lock and might deadlock if we are
         * prefetching a dedup-ed block.
         */
-       if (birth_txg)
+       if (birth_txg != 0)
                return (ds == NULL ||
                    dsl_dataset_block_freeable(ds, NULL, birth_txg));
        else
-               return (FALSE);
+               return (B_FALSE);
 }
 
 void
@@ -1029,7 +1023,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
        ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 
        /*
-        * This call to dbuf_will_dirty() with the dn_struct_rwlock held
+        * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
         * is OK, because there can be no other references to the db
         * when we are changing its size, so no concurrent DB_FILL can
         * be happening.
@@ -1038,7 +1032,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
         * XXX we should be doing a dbuf_read, checking the return
         * value and returning that up to our callers
         */
-       dbuf_will_dirty(db, tx);
+       dmu_buf_will_dirty(&db->db, tx);
 
        /* create the data buffer for the new block */
        buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
@@ -1068,9 +1062,8 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
 void
 dbuf_release_bp(dmu_buf_impl_t *db)
 {
-       objset_t *os;
+       ASSERTV(objset_t *os = db->db_objset);
 
-       DB_GET_OBJSET(&os, db);
        ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
        ASSERT(arc_released(os->os_phys_buf) ||
            list_link_active(&os->os_dsl_dataset->ds_synced_link));
@@ -1133,7 +1126,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
                dn->dn_dirtyctx =
                    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
                ASSERT(dn->dn_dirtyctx_firstset == NULL);
-               dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_PUSHPAGE);
+               dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
        }
        mutex_exit(&dn->dn_mtx);
 
@@ -1210,7 +1203,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
         * to make a copy of it so that the changes we make in this
         * transaction group won't leak out when we sync the older txg.
         */
-       dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_PUSHPAGE);
+       dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
        list_link_init(&dr->dr_dirty_node);
        if (db->db_level == 0) {
                void *data_old = db->db_buf;
@@ -1257,7 +1250,10 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
        if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
            db->db_blkid != DMU_SPILL_BLKID) {
                mutex_enter(&dn->dn_mtx);
-               dnode_clear_range(dn, db->db_blkid, 1, tx);
+               if (dn->dn_free_ranges[txgoff] != NULL) {
+                       range_tree_clear(dn->dn_free_ranges[txgoff],
+                           db->db_blkid, 1);
+               }
                mutex_exit(&dn->dn_mtx);
                db->db_freed_in_flight = FALSE;
        }
@@ -1388,14 +1384,6 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
        DB_DNODE_ENTER(db);
        dn = DB_DNODE(db);
 
-       /*
-        * Note:  This code will probably work even if there are concurrent
-        * holders, but it is untested in that scenerio, as the ZPL and
-        * ztest have additional locking (the range locks) that prevents
-        * that type of concurrent access.
-        */
-       ASSERT3U(refcount_count(&db->db_holds), ==, db->db_dirtycnt);
-
        dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
        ASSERT(db->db.db_size != 0);
@@ -1453,10 +1441,10 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
        return (B_FALSE);
 }
 
-#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
 void
-dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
        int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
 
        ASSERT(tx->tx_txg != 0);
@@ -1519,6 +1507,38 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
        mutex_exit(&db->db_mtx);
 }
 
+void
+dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
+    bp_embedded_type_t etype, enum zio_compress comp,
+    int uncompressed_size, int compressed_size, int byteorder,
+    dmu_tx_t *tx)
+{
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
+       struct dirty_leaf *dl;
+       dmu_object_type_t type;
+
+       DB_DNODE_ENTER(db);
+       type = DB_DNODE(db)->dn_type;
+       DB_DNODE_EXIT(db);
+
+       ASSERT0(db->db_level);
+       ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+
+       dmu_buf_will_not_fill(dbuf, tx);
+
+       ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
+       dl = &db->db_last_dirty->dt.dl;
+       encode_embedded_bp_compressed(&dl->dr_overridden_by,
+           data, comp, uncompressed_size, compressed_size);
+       BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
+       BP_SET_TYPE(&dl->dr_overridden_by, type);
+       BP_SET_LEVEL(&dl->dr_overridden_by, 0);
+       BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
+
+       dl->dr_override_state = DR_OVERRIDDEN;
+       dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
+}
+
 /*
  * Directly assign a provided arc buf to a given dbuf if it's not referenced
  * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
@@ -1579,7 +1599,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
        db->db_state = DB_FILL;
        mutex_exit(&db->db_mtx);
        (void) dbuf_dirty(db, tx);
-       dbuf_fill_done(db, tx);
+       dmu_buf_fill_done(&db->db, tx);
 }
 
 /*
@@ -1588,12 +1608,15 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
  * when we are not holding the dn_dbufs_mtx, we can't clear the
  * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
  * in this case.  For callers from the DMU we will usually see:
- *     dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
+ *     dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy()
  * For the arc callback, we will usually see:
  *     dbuf_do_evict()->dbuf_clear();dbuf_destroy()
  * Sometimes, though, we will get a mix of these two:
- *     DMU: dbuf_clear()->arc_buf_evict()
+ *     DMU: dbuf_clear()->arc_clear_callback()
  *     ARC: dbuf_do_evict()->dbuf_destroy()
+ *
+ * This routine will dissociate the dbuf from the arc, by calling
+ * arc_clear_callback(), but will not evict the data from the ARC.
  */
 void
 dbuf_clear(dmu_buf_impl_t *db)
@@ -1601,7 +1624,7 @@ dbuf_clear(dmu_buf_impl_t *db)
        dnode_t *dn;
        dmu_buf_impl_t *parent = db->db_parent;
        dmu_buf_impl_t *dndb;
-       int dbuf_gone = FALSE;
+       boolean_t dbuf_gone = B_FALSE;
 
        ASSERT(MUTEX_HELD(&db->db_mtx));
        ASSERT(refcount_is_zero(&db->db_holds));
@@ -1629,7 +1652,7 @@ dbuf_clear(dmu_buf_impl_t *db)
        dndb = dn->dn_dbuf;
        if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
                list_remove(&dn->dn_dbufs, db);
-               (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
+               atomic_dec_32(&dn->dn_dbufs_count);
                membar_producer();
                DB_DNODE_EXIT(db);
                /*
@@ -1647,7 +1670,7 @@ dbuf_clear(dmu_buf_impl_t *db)
        }
 
        if (db->db_buf)
-               dbuf_gone = arc_buf_evict(db->db_buf);
+               dbuf_gone = arc_clear_callback(db->db_buf);
 
        if (!dbuf_gone)
                mutex_exit(&db->db_mtx);
@@ -1746,7 +1769,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
        ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
        ASSERT(dn->dn_type != DMU_OT_NONE);
 
-       db = kmem_cache_alloc(dbuf_cache, KM_PUSHPAGE);
+       db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
 
        db->db_objset = os;
        db->db.db_object = dn->dn_object;
@@ -1814,7 +1837,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
        ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
            refcount_count(&dn->dn_holds) > 0);
        (void) refcount_add(&dn->dn_holds, db);
-       (void) atomic_inc_32_nv(&dn->dn_dbufs_count);
+       atomic_inc_32(&dn->dn_dbufs_count);
 
        dprintf_dbuf(db, "db=%p\n", db);
 
@@ -1824,8 +1847,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
 static int
 dbuf_do_evict(void *private)
 {
-       arc_buf_t *buf = private;
-       dmu_buf_impl_t *db = buf->b_private;
+       dmu_buf_impl_t *db = private;
 
        if (!MUTEX_HELD(&db->db_mtx))
                mutex_enter(&db->db_mtx);
@@ -1861,7 +1883,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
                        dn = DB_DNODE(db);
                        mutex_enter(&dn->dn_dbufs_mtx);
                        list_remove(&dn->dn_dbufs, db);
-                       (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
+                       atomic_dec_32(&dn->dn_dbufs_count);
                        mutex_exit(&dn->dn_dbufs_mtx);
                        DB_DNODE_EXIT(db);
                        /*
@@ -1912,10 +1934,10 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
        }
 
        if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp, NULL) == 0) {
-               if (bp && !BP_IS_HOLE(bp)) {
+               if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
                        dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
                        uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
-                       zbookmark_t zb;
+                       zbookmark_phys_t zb;
 
                        SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
                            dn->dn_object, 0, blkid);
@@ -2042,7 +2064,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
        int error;
 
        dh = kmem_zalloc(sizeof (struct dbuf_hold_impl_data) *
-           DBUF_HOLD_IMPL_MAX_DEPTH, KM_PUSHPAGE);
+           DBUF_HOLD_IMPL_MAX_DEPTH, KM_SLEEP);
        __dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse, tag, dbp, 0);
 
        error = __dbuf_hold_impl(dh);
@@ -2137,7 +2159,6 @@ dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
  * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
  * dnode's parent dbuf evicting its dnode handles.
  */
-#pragma weak dmu_buf_rele = dbuf_rele
 void
 dbuf_rele(dmu_buf_impl_t *db, void *tag)
 {
@@ -2145,6 +2166,12 @@ dbuf_rele(dmu_buf_impl_t *db, void *tag)
        dbuf_rele_and_unlock(db, tag);
 }
 
+void
+dmu_buf_rele(dmu_buf_t *db, void *tag)
+{
+       dbuf_rele((dmu_buf_impl_t *)db, tag);
+}
+
 /*
  * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
  * db_dirtycnt and db_holds to be updated atomically.
@@ -2178,21 +2205,60 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
 
        if (holds == 0) {
                if (db->db_blkid == DMU_BONUS_BLKID) {
-                       mutex_exit(&db->db_mtx);
+                       dnode_t *dn;
 
                        /*
-                        * If the dnode moves here, we cannot cross this barrier
-                        * until the move completes.
+                        * If the dnode moves here, we cannot cross this
+                        * barrier until the move completes.
                         */
                        DB_DNODE_ENTER(db);
-                       (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
+
+                       dn = DB_DNODE(db);
+                       atomic_dec_32(&dn->dn_dbufs_count);
+
+                       /*
+                        * Decrementing the dbuf count means that the bonus
+                        * buffer's dnode hold is no longer discounted in
+                        * dnode_move(). The dnode cannot move until after
+                        * the dnode_rele_and_unlock() below.
+                        */
                        DB_DNODE_EXIT(db);
+
+                       /*
+                        * Do not reference db after its lock is dropped.
+                        * Another thread may evict it.
+                        */
+                       mutex_exit(&db->db_mtx);
+
                        /*
-                        * The bonus buffer's dnode hold is no longer discounted
-                        * in dnode_move(). The dnode cannot move until after
-                        * the dnode_rele().
+                        * If the dnode has been freed, evict the bonus
+                        * buffer immediately.  The data in the bonus
+                        * buffer is no longer relevant and this prevents
+                        * a stale bonus buffer from being associated
+                        * with this dnode_t should the dnode_t be reused
+                        * prior to being destroyed.
                         */
-                       dnode_rele(DB_DNODE(db), db);
+                       mutex_enter(&dn->dn_mtx);
+                       if (dn->dn_type == DMU_OT_NONE ||
+                           dn->dn_free_txg != 0) {
+                               /*
+                                * Drop dn_mtx.  It is a leaf lock and
+                                * cannot be held when dnode_evict_bonus()
+                                * acquires other locks in order to
+                                * perform the eviction.
+                                *
+                                * Freed dnodes cannot be reused until the
+                                * last hold is released.  Since this bonus
+                                * buffer has a hold, the dnode will remain
+                                * in the free state, even without dn_mtx
+                                * held, until the dnode_rele_and_unlock()
+                                * below.
+                                */
+                               mutex_exit(&dn->dn_mtx);
+                               dnode_evict_bonus(dn);
+                               mutex_enter(&dn->dn_mtx);
+                       }
+                       dnode_rele_and_unlock(dn, db);
                } else if (db->db_buf == NULL) {
                        /*
                         * This is a special case: we never associated this
@@ -2227,11 +2293,23 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
                         * block on-disk. If so, then we simply evict
                         * ourselves.
                         */
-                       if (!DBUF_IS_CACHEABLE(db) ||
-                           arc_buf_eviction_needed(db->db_buf))
+                       if (!DBUF_IS_CACHEABLE(db)) {
+                               if (db->db_blkptr != NULL &&
+                                   !BP_IS_HOLE(db->db_blkptr) &&
+                                   !BP_IS_EMBEDDED(db->db_blkptr)) {
+                                       spa_t *spa =
+                                           dmu_objset_spa(db->db_objset);
+                                       blkptr_t bp = *db->db_blkptr;
+                                       dbuf_clear(db);
+                                       arc_freed(spa, &bp);
+                               } else {
+                                       dbuf_clear(db);
+                               }
+                       } else if (arc_buf_eviction_needed(db->db_buf)) {
                                dbuf_clear(db);
-                       else
+                       } else {
                                mutex_exit(&db->db_mtx);
+                       }
                }
        } else {
                mutex_exit(&db->db_mtx);
@@ -2597,7 +2675,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
        uint64_t fill = 0;
        int i;
 
-       ASSERT(db->db_blkptr == bp);
+       ASSERT3P(db->db_blkptr, ==, bp);
 
        DB_DNODE_ENTER(db);
        dn = DB_DNODE(db);
@@ -2605,18 +2683,15 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
        dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
        zio->io_prev_space_delta = delta;
 
-       if (BP_IS_HOLE(bp)) {
-               ASSERT(bp->blk_fill == 0);
-               DB_DNODE_EXIT(db);
-               return;
+       if (bp->blk_birth != 0) {
+               ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
+                   BP_GET_TYPE(bp) == dn->dn_type) ||
+                   (db->db_blkid == DMU_SPILL_BLKID &&
+                   BP_GET_TYPE(bp) == dn->dn_bonustype) ||
+                   BP_IS_EMBEDDED(bp));
+               ASSERT(BP_GET_LEVEL(bp) == db->db_level);
        }
 
-       ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
-           BP_GET_TYPE(bp) == dn->dn_type) ||
-           (db->db_blkid == DMU_SPILL_BLKID &&
-           BP_GET_TYPE(bp) == dn->dn_bonustype));
-       ASSERT(BP_GET_LEVEL(bp) == db->db_level);
-
        mutex_enter(&db->db_mtx);
 
 #ifdef ZFS_DEBUG
@@ -2642,7 +2717,11 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
                                        fill++;
                        }
                } else {
-                       fill = 1;
+                       if (BP_IS_HOLE(bp)) {
+                               fill = 0;
+                       } else {
+                               fill = 1;
+                       }
                }
        } else {
                blkptr_t *ibp = db->db.db_data;
@@ -2650,12 +2729,13 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
                for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
                        if (BP_IS_HOLE(ibp))
                                continue;
-                       fill += ibp->blk_fill;
+                       fill += BP_GET_FILL(ibp);
                }
        }
        DB_DNODE_EXIT(db);
 
-       bp->blk_fill = fill;
+       if (!BP_IS_EMBEDDED(bp))
+               bp->blk_fill = fill;
 
        mutex_exit(&db->db_mtx);
 }
@@ -2697,9 +2777,10 @@ static void
 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
        dmu_buf_impl_t *db = vdb;
-       blkptr_t *bp = zio->io_bp;
        blkptr_t *bp_orig = &zio->io_bp_orig;
-       uint64_t txg = zio->io_txg;
+       blkptr_t *bp = db->db_blkptr;
+       objset_t *os = db->db_objset;
+       dmu_tx_t *tx = os->os_synctx;
        dbuf_dirty_record_t **drp, *dr;
 
        ASSERT0(zio->io_error);
@@ -2712,14 +2793,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
        if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
                ASSERT(BP_EQUAL(bp, bp_orig));
        } else {
-               objset_t *os;
-               dsl_dataset_t *ds;
-               dmu_tx_t *tx;
-
-               DB_GET_OBJSET(&os, db);
-               ds = os->os_dsl_dataset;
-               tx = os->os_synctx;
-
+               dsl_dataset_t *ds = os->os_dsl_dataset;
                (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
                dsl_dataset_block_born(ds, bp, tx);
        }
@@ -2732,7 +2806,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
        while ((dr = *drp) != db->db_data_pending)
                drp = &dr->dr_next;
        ASSERT(!list_link_active(&dr->dr_dirty_node));
-       ASSERT(dr->dr_txg == txg);
        ASSERT(dr->dr_dbuf == db);
        ASSERT(dr->dr_next == NULL);
        *drp = dr->dr_next;
@@ -2766,15 +2839,16 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
                DB_DNODE_ENTER(db);
                dn = DB_DNODE(db);
                ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
-               ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+               ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
                if (!BP_IS_HOLE(db->db_blkptr)) {
                        ASSERTV(int epbs = dn->dn_phys->dn_indblkshift -
                            SPA_BLKPTRSHIFT);
+                       ASSERT3U(db->db_blkid, <=,
+                           dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
                        ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
                            db->db.db_size);
-                       ASSERT3U(dn->dn_phys->dn_maxblkid
-                           >> (db->db_level * epbs), >=, db->db_blkid);
-                       arc_set_callback(db->db_buf, dbuf_do_evict, db);
+                       if (!arc_released(db->db_buf))
+                               arc_set_callback(db->db_buf, dbuf_do_evict, db);
                }
                DB_DNODE_EXIT(db);
                mutex_destroy(&dr->dt.di.dr_mtx);
@@ -2786,8 +2860,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
        ASSERT(db->db_dirtycnt > 0);
        db->db_dirtycnt -= 1;
        db->db_data_pending = NULL;
-
-       dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
+       dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
 }
 
 static void
@@ -2838,7 +2911,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
        objset_t *os;
        dmu_buf_impl_t *parent = db->db_parent;
        uint64_t txg = tx->tx_txg;
-       zbookmark_t zb;
+       zbookmark_phys_t zb;
        zio_prop_t zp;
        zio_t *zio;
        int wp_flag = 0;
@@ -2901,10 +2974,16 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
        dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
        DB_DNODE_EXIT(db);
 
-       if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
-               ASSERT(db->db_state != DB_NOFILL);
+       if (db->db_level == 0 &&
+           dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+               /*
+                * The BP for this block has been provided by open context
+                * (by dmu_sync() or dmu_buf_write_embedded()).
+                */
+               void *contents = (data != NULL) ? data->b_data : NULL;
+
                dr->dr_zio = zio_write(zio, os->os_spa, txg,
-                   db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
+                   db->db_blkptr, contents, db->db.db_size, &zp,
                    dbuf_write_override_ready, NULL, dbuf_write_override_done,
                    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
                mutex_enter(&db->db_mtx);
@@ -2965,4 +3044,5 @@ EXPORT_SYMBOL(dmu_buf_set_user_ie);
 EXPORT_SYMBOL(dmu_buf_update_user);
 EXPORT_SYMBOL(dmu_buf_get_user);
 EXPORT_SYMBOL(dmu_buf_freeable);
+EXPORT_SYMBOL(dmu_buf_get_blkptr);
 #endif