Imported Upstream version 0.6.4.2

[mirror_zfs-debian.git] / module / zfs / dbuf.c
diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c

index c8a52617178e573f12f4426b48f80dee302b45a1..ed6a8fd2a4dc1b59f75321625759cbe9580835dc 100644 (file)
--- a/module/zfs/dbuf.c
+++ b/module/zfs/dbuf.c
@@ -21,7 +21,7 @@
  /*
   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
   * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
   * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
   */
  
@@ -40,6 +40,10 @@
  #include <sys/dmu_zfetch.h>
  #include <sys/sa.h>
  #include <sys/sa_impl.h>
+#include <sys/zfeature.h>
+#include <sys/blkptr.h>
+#include <sys/range_tree.h>
+#include <sys/trace_dbuf.h>
  
  struct dbuf_hold_impl_data {
         /* Function arguments */
@@ -208,8 +212,7 @@ dbuf_hash_insert(dmu_buf_impl_t *db)
  }
  
  /*
- * Remove an entry from the hash table.  This operation will
- * fail if there are any existing holds on the db.
+ * Remove an entry from the hash table.  It must be in the EVICTING state.
   */
  static void
  dbuf_hash_remove(dmu_buf_impl_t *db)
@@ -223,7 +226,7 @@ dbuf_hash_remove(dmu_buf_impl_t *db)
         idx = hv & h->hash_table_mask;
  
         /*
-        * We musn't hold db_mtx to maintin lock ordering:
+        * We mustn't hold db_mtx to maintain lock ordering:
          * DBUF_HASH_MUTEX > db_mtx.
          */
         ASSERT(refcount_is_zero(&db->db_holds));
@@ -263,7 +266,10 @@ dbuf_evict_user(dmu_buf_impl_t *db)
  boolean_t
  dbuf_is_metadata(dmu_buf_impl_t *db)
  {
-       if (db->db_level > 0) {
+       /*
+        * Consider indirect blocks and spill blocks to be metadata.
+        */
+       if (db->db_level > 0 || db->db_blkid == DMU_SPILL_BLKID) {
                 return (B_TRUE);
         } else {
                 boolean_t is_metadata;
@@ -309,7 +315,7 @@ retry:
          * Large allocations which do not require contiguous pages
          * should be using vmem_alloc() in the linux kernel
          */
-       h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_PUSHPAGE);
+       h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
  #else
         h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
  #endif
@@ -481,7 +487,6 @@ static void
  dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
  {
         ASSERT(MUTEX_HELD(&db->db_mtx));
-       ASSERT(db->db_buf == NULL || !arc_has_callback(db->db_buf));
         db->db_buf = buf;
         if (buf != NULL) {
                 ASSERT(buf->b_data != NULL);
@@ -508,10 +513,9 @@ dbuf_loan_arcbuf(dmu_buf_impl_t *db)
         mutex_enter(&db->db_mtx);
         if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
                 int blksz = db->db.db_size;
-               spa_t *spa;
+               spa_t *spa = db->db_objset->os_spa;
  
                 mutex_exit(&db->db_mtx);
-               DB_GET_SPA(&spa, db);
                 abuf = arc_loan_buf(spa, blksz);
                 bcopy(db->db.db_data, abuf->b_data, blksz);
         } else {
@@ -568,13 +572,13 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
         dbuf_rele_and_unlock(db, NULL);
  }
  
-static void
+static int
  dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
  {
         dnode_t *dn;
-       spa_t *spa;
-       zbookmark_t zb;
+       zbookmark_phys_t zb;
         uint32_t aflags = ARC_NOWAIT;
+       int err;
  
         DB_DNODE_ENTER(db);
         dn = DB_DNODE(db);
@@ -599,7 +603,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
                 dbuf_update_data(db);
                 db->db_state = DB_CACHED;
                 mutex_exit(&db->db_mtx);
-               return;
+               return (0);
         }
  
         /*
@@ -612,17 +616,16 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
             BP_IS_HOLE(db->db_blkptr)))) {
                 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
  
-               dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
-                   db->db.db_size, db, type));
                 DB_DNODE_EXIT(db);
+               dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
+                   db->db.db_size, db, type));
                 bzero(db->db.db_data, db->db.db_size);
                 db->db_state = DB_CACHED;
                 *flags |= DB_RF_CACHED;
                 mutex_exit(&db->db_mtx);
-               return;
+               return (0);
         }
  
-       spa = dn->dn_objset->os_spa;
         DB_DNODE_EXIT(db);
  
         db->db_state = DB_READ;
@@ -639,20 +642,22 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
  
         dbuf_add_ref(db, NULL);
  
-       (void) arc_read(zio, spa, db->db_blkptr,
+       err = arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
             dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
             (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
             &aflags, &zb);
         if (aflags & ARC_CACHED)
                 *flags |= DB_RF_CACHED;
+
+       return (SET_ERROR(err));
  }
  
  int
  dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
  {
         int err = 0;
-       int havepzio = (zio != NULL);
-       int prefetch;
+       boolean_t havepzio = (zio != NULL);
+       boolean_t prefetch;
         dnode_t *dn;
  
         /*
@@ -687,11 +692,12 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
  
                 if (zio == NULL)
                         zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
-               dbuf_read_impl(db, zio, &flags);
+
+               err = dbuf_read_impl(db, zio, &flags);
  
                 /* dbuf_read_impl has dropped db_mtx for us */
  
-               if (prefetch)
+               if (!err && prefetch)
                         dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
                             db->db.db_size, flags & DB_RF_CACHED);
  
@@ -699,7 +705,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
                         rw_exit(&dn->dn_struct_rwlock);
                 DB_DNODE_EXIT(db);
  
-               if (!havepzio)
+               if (!err && !havepzio)
                         err = zio_wait(zio);
         } else {
                 /*
@@ -725,6 +731,8 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
                             db->db_state == DB_FILL) {
                                 ASSERT(db->db_state == DB_READ ||
                                     (flags & DB_RF_HAVESTRUCT) == 0);
+                               DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
+                                   db, zio_t *, zio);
                                 cv_wait(&db->db_changed, &db->db_mtx);
                         }
                         if (db->db_state == DB_UNCACHED)
@@ -747,11 +755,10 @@ dbuf_noread(dmu_buf_impl_t *db)
                 cv_wait(&db->db_changed, &db->db_mtx);
         if (db->db_state == DB_UNCACHED) {
                 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
-               spa_t *spa;
+               spa_t *spa = db->db_objset->os_spa;
  
                 ASSERT(db->db_buf == NULL);
                 ASSERT(db->db.db_data == NULL);
-               DB_GET_SPA(&spa, db);
                 dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
                 db->db_state = DB_FILL;
         } else if (db->db_state == DB_NOFILL) {
@@ -806,9 +813,8 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
         } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
                 int size = db->db.db_size;
                 arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
-               spa_t *spa;
+               spa_t *spa = db->db_objset->os_spa;
  
-               DB_GET_SPA(&spa, db);
                 dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
                 bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
         } else {
@@ -834,12 +840,9 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
         ASSERT(db->db_data_pending != dr);
  
         /* free this block */
-       if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) {
-               spa_t *spa;
+       if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
+               zio_free(db->db_objset->os_spa, txg, bp);
  
-               DB_GET_SPA(&spa, db);
-               zio_free(spa, txg, bp);
-       }
         dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
         dr->dt.dl.dr_nopwrite = B_FALSE;
  
@@ -857,9 +860,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
  /*
   * Evict (if its unreferenced) or clear (if its referenced) any level-0
   * data blocks in the free range, so that any future readers will find
- * empty blocks.  Also, if we happen across any level-1 dbufs in the
- * range that have not already been marked dirty, mark them dirty so
- * they stay in memory.
+ * empty blocks.
   *
   * This is a no-op if the dataset is in the middle of an incremental
   * receive; see comment below for details.
@@ -869,18 +870,16 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
  {
         dmu_buf_impl_t *db, *db_next;
         uint64_t txg = tx->tx_txg;
-       int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
-       uint64_t first_l1 = start >> epbs;
-       uint64_t last_l1 = end >> epbs;
+       boolean_t freespill =
+           (start == DMU_SPILL_BLKID || end == DMU_SPILL_BLKID);
  
-       if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID)) {
+       if (end > dn->dn_maxblkid && !freespill)
                 end = dn->dn_maxblkid;
-               last_l1 = end >> epbs;
-       }
         dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
  
         mutex_enter(&dn->dn_dbufs_mtx);
-       if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz) {
+       if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz &&
+           !freespill) {
                 /* There can't be any dbufs in this range; no need to search. */
                 mutex_exit(&dn->dn_dbufs_mtx);
                 return;
@@ -899,24 +898,14 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
                 db_next = list_next(&dn->dn_dbufs, db);
                 ASSERT(db->db_blkid != DMU_BONUS_BLKID);
  
-               if (db->db_level == 1 &&
-                   db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
-                       mutex_enter(&db->db_mtx);
-                       if (db->db_last_dirty &&
-                           db->db_last_dirty->dr_txg < txg) {
-                               dbuf_add_ref(db, FTAG);
-                               mutex_exit(&db->db_mtx);
-                               dbuf_will_dirty(db, tx);
-                               dbuf_rele(db, FTAG);
-                       } else {
-                               mutex_exit(&db->db_mtx);
-                       }
-               }
-
+               /* Skip indirect blocks. */
                 if (db->db_level != 0)
                         continue;
-               dprintf_dbuf(db, "found buf %s\n", "");
-               if (db->db_blkid < start || db->db_blkid > end)
+               /* Skip direct blocks outside the range. */
+               if (!freespill && (db->db_blkid < start || db->db_blkid > end))
+                       continue;
+               /* Skip all direct blocks, only free spill blocks. */
+               if (freespill && (db->db_blkid != DMU_SPILL_BLKID))
                         continue;
  
                 /* found a level 0 buffer in the range */
@@ -992,24 +981,29 @@ dbuf_block_freeable(dmu_buf_impl_t *db)
          * We don't need any locking to protect db_blkptr:
          * If it's syncing, then db_last_dirty will be set
          * so we'll ignore db_blkptr.
+        *
+        * This logic ensures that only block births for
+        * filled blocks are considered.
          */
         ASSERT(MUTEX_HELD(&db->db_mtx));
-       if (db->db_last_dirty)
+       if (db->db_last_dirty && (db->db_blkptr == NULL ||
+           !BP_IS_HOLE(db->db_blkptr))) {
                 birth_txg = db->db_last_dirty->dr_txg;
-       else if (db->db_blkptr)
+       } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
                 birth_txg = db->db_blkptr->blk_birth;
+       }
  
         /*
-        * If we don't exist or are in a snapshot, we can't be freed.
+        * If this block doesn't exist or is in a snapshot, it can't be freed.
          * Don't pass the bp to dsl_dataset_block_freeable() since we
          * are holding the db_mtx lock and might deadlock if we are
          * prefetching a dedup-ed block.
          */
-       if (birth_txg)
+       if (birth_txg != 0)
                 return (ds == NULL ||
                     dsl_dataset_block_freeable(ds, NULL, birth_txg));
         else
-               return (FALSE);
+               return (B_FALSE);
  }
  
  void
@@ -1029,7 +1023,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
         ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
  
         /*
-        * This call to dbuf_will_dirty() with the dn_struct_rwlock held
+        * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
          * is OK, because there can be no other references to the db
          * when we are changing its size, so no concurrent DB_FILL can
          * be happening.
@@ -1038,7 +1032,7 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
          * XXX we should be doing a dbuf_read, checking the return
          * value and returning that up to our callers
          */
-       dbuf_will_dirty(db, tx);
+       dmu_buf_will_dirty(&db->db, tx);
  
         /* create the data buffer for the new block */
         buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
@@ -1068,9 +1062,8 @@ dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
  void
  dbuf_release_bp(dmu_buf_impl_t *db)
  {
-       objset_t *os;
+       ASSERTV(objset_t *os = db->db_objset);
  
-       DB_GET_OBJSET(&os, db);
         ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
         ASSERT(arc_released(os->os_phys_buf) ||
             list_link_active(&os->os_dsl_dataset->ds_synced_link));
@@ -1133,7 +1126,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
                 dn->dn_dirtyctx =
                     (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
                 ASSERT(dn->dn_dirtyctx_firstset == NULL);
-               dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_PUSHPAGE);
+               dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
         }
         mutex_exit(&dn->dn_mtx);
  
@@ -1210,7 +1203,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
          * to make a copy of it so that the changes we make in this
          * transaction group won't leak out when we sync the older txg.
          */
-       dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_PUSHPAGE);
+       dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
         list_link_init(&dr->dr_dirty_node);
         if (db->db_level == 0) {
                 void *data_old = db->db_buf;
@@ -1257,7 +1250,10 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
         if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
             db->db_blkid != DMU_SPILL_BLKID) {
                 mutex_enter(&dn->dn_mtx);
-               dnode_clear_range(dn, db->db_blkid, 1, tx);
+               if (dn->dn_free_ranges[txgoff] != NULL) {
+                       range_tree_clear(dn->dn_free_ranges[txgoff],
+                           db->db_blkid, 1);
+               }
                 mutex_exit(&dn->dn_mtx);
                 db->db_freed_in_flight = FALSE;
         }
@@ -1388,14 +1384,6 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
         DB_DNODE_ENTER(db);
         dn = DB_DNODE(db);
  
-       /*
-        * Note:  This code will probably work even if there are concurrent
-        * holders, but it is untested in that scenerio, as the ZPL and
-        * ztest have additional locking (the range locks) that prevents
-        * that type of concurrent access.
-        */
-       ASSERT3U(refcount_count(&db->db_holds), ==, db->db_dirtycnt);
-
         dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
  
         ASSERT(db->db.db_size != 0);
@@ -1453,10 +1441,10 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
         return (B_FALSE);
  }
  
-#pragma weak dmu_buf_will_dirty = dbuf_will_dirty
  void
-dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
+dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
  {
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
         int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
  
         ASSERT(tx->tx_txg != 0);
@@ -1519,6 +1507,38 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
         mutex_exit(&db->db_mtx);
  }
  
+void
+dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
+    bp_embedded_type_t etype, enum zio_compress comp,
+    int uncompressed_size, int compressed_size, int byteorder,
+    dmu_tx_t *tx)
+{
+       dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
+       struct dirty_leaf *dl;
+       dmu_object_type_t type;
+
+       DB_DNODE_ENTER(db);
+       type = DB_DNODE(db)->dn_type;
+       DB_DNODE_EXIT(db);
+
+       ASSERT0(db->db_level);
+       ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+
+       dmu_buf_will_not_fill(dbuf, tx);
+
+       ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
+       dl = &db->db_last_dirty->dt.dl;
+       encode_embedded_bp_compressed(&dl->dr_overridden_by,
+           data, comp, uncompressed_size, compressed_size);
+       BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
+       BP_SET_TYPE(&dl->dr_overridden_by, type);
+       BP_SET_LEVEL(&dl->dr_overridden_by, 0);
+       BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
+
+       dl->dr_override_state = DR_OVERRIDDEN;
+       dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
+}
+
  /*
   * Directly assign a provided arc buf to a given dbuf if it's not referenced
   * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
@@ -1579,7 +1599,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
         db->db_state = DB_FILL;
         mutex_exit(&db->db_mtx);
         (void) dbuf_dirty(db, tx);
-       dbuf_fill_done(db, tx);
+       dmu_buf_fill_done(&db->db, tx);
  }
  
  /*
@@ -1588,12 +1608,15 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
   * when we are not holding the dn_dbufs_mtx, we can't clear the
   * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
   * in this case.  For callers from the DMU we will usually see:
- *     dbuf_clear()->arc_buf_evict()->dbuf_do_evict()->dbuf_destroy()
+ *     dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy()
   * For the arc callback, we will usually see:
   *     dbuf_do_evict()->dbuf_clear();dbuf_destroy()
   * Sometimes, though, we will get a mix of these two:
- *     DMU: dbuf_clear()->arc_buf_evict()
+ *     DMU: dbuf_clear()->arc_clear_callback()
   *     ARC: dbuf_do_evict()->dbuf_destroy()
+ *
+ * This routine will dissociate the dbuf from the arc, by calling
+ * arc_clear_callback(), but will not evict the data from the ARC.
   */
  void
  dbuf_clear(dmu_buf_impl_t *db)
@@ -1601,7 +1624,7 @@ dbuf_clear(dmu_buf_impl_t *db)
         dnode_t *dn;
         dmu_buf_impl_t *parent = db->db_parent;
         dmu_buf_impl_t *dndb;
-       int dbuf_gone = FALSE;
+       boolean_t dbuf_gone = B_FALSE;
  
         ASSERT(MUTEX_HELD(&db->db_mtx));
         ASSERT(refcount_is_zero(&db->db_holds));
@@ -1629,7 +1652,7 @@ dbuf_clear(dmu_buf_impl_t *db)
         dndb = dn->dn_dbuf;
         if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
                 list_remove(&dn->dn_dbufs, db);
-               (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
+               atomic_dec_32(&dn->dn_dbufs_count);
                 membar_producer();
                 DB_DNODE_EXIT(db);
                 /*
@@ -1647,7 +1670,7 @@ dbuf_clear(dmu_buf_impl_t *db)
         }
  
         if (db->db_buf)
-               dbuf_gone = arc_buf_evict(db->db_buf);
+               dbuf_gone = arc_clear_callback(db->db_buf);
  
         if (!dbuf_gone)
                 mutex_exit(&db->db_mtx);
@@ -1746,7 +1769,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
         ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
         ASSERT(dn->dn_type != DMU_OT_NONE);
  
-       db = kmem_cache_alloc(dbuf_cache, KM_PUSHPAGE);
+       db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
  
         db->db_objset = os;
         db->db.db_object = dn->dn_object;
@@ -1814,7 +1837,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
         ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
             refcount_count(&dn->dn_holds) > 0);
         (void) refcount_add(&dn->dn_holds, db);
-       (void) atomic_inc_32_nv(&dn->dn_dbufs_count);
+       atomic_inc_32(&dn->dn_dbufs_count);
  
         dprintf_dbuf(db, "db=%p\n", db);
  
@@ -1824,8 +1847,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
  static int
  dbuf_do_evict(void *private)
  {
-       arc_buf_t *buf = private;
-       dmu_buf_impl_t *db = buf->b_private;
+       dmu_buf_impl_t *db = private;
  
         if (!MUTEX_HELD(&db->db_mtx))
                 mutex_enter(&db->db_mtx);
@@ -1861,7 +1883,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
                         dn = DB_DNODE(db);
                         mutex_enter(&dn->dn_dbufs_mtx);
                         list_remove(&dn->dn_dbufs, db);
-                       (void) atomic_dec_32_nv(&dn->dn_dbufs_count);
+                       atomic_dec_32(&dn->dn_dbufs_count);
                         mutex_exit(&dn->dn_dbufs_mtx);
                         DB_DNODE_EXIT(db);
                         /*
@@ -1912,10 +1934,10 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
         }
  
         if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp, NULL) == 0) {
-               if (bp && !BP_IS_HOLE(bp)) {
+               if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
                         dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
                         uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
-                       zbookmark_t zb;
+                       zbookmark_phys_t zb;
  
                         SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
                             dn->dn_object, 0, blkid);
@@ -2042,7 +2064,7 @@ dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
         int error;
  
         dh = kmem_zalloc(sizeof (struct dbuf_hold_impl_data) *
-           DBUF_HOLD_IMPL_MAX_DEPTH, KM_PUSHPAGE);
+           DBUF_HOLD_IMPL_MAX_DEPTH, KM_SLEEP);
         __dbuf_hold_impl_init(dh, dn, level, blkid, fail_sparse, tag, dbp, 0);
  
         error = __dbuf_hold_impl(dh);
@@ -2137,7 +2159,6 @@ dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
   * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
   * dnode's parent dbuf evicting its dnode handles.
   */
-#pragma weak dmu_buf_rele = dbuf_rele
  void
  dbuf_rele(dmu_buf_impl_t *db, void *tag)
  {
@@ -2145,6 +2166,12 @@ dbuf_rele(dmu_buf_impl_t *db, void *tag)
         dbuf_rele_and_unlock(db, tag);
  }
  
+void
+dmu_buf_rele(dmu_buf_t *db, void *tag)
+{
+       dbuf_rele((dmu_buf_impl_t *)db, tag);
+}
+
  /*
   * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
   * db_dirtycnt and db_holds to be updated atomically.
@@ -2178,21 +2205,60 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
  
         if (holds == 0) {
                 if (db->db_blkid == DMU_BONUS_BLKID) {
-                       mutex_exit(&db->db_mtx);
+                       dnode_t *dn;
  
                         /*
-                        * If the dnode moves here, we cannot cross this barrier
-                        * until the move completes.
+                        * If the dnode moves here, we cannot cross this
+                        * barrier until the move completes.
                          */
                         DB_DNODE_ENTER(db);
-                       (void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
+
+                       dn = DB_DNODE(db);
+                       atomic_dec_32(&dn->dn_dbufs_count);
+
+                       /*
+                        * Decrementing the dbuf count means that the bonus
+                        * buffer's dnode hold is no longer discounted in
+                        * dnode_move(). The dnode cannot move until after
+                        * the dnode_rele_and_unlock() below.
+                        */
                         DB_DNODE_EXIT(db);
+
+                       /*
+                        * Do not reference db after its lock is dropped.
+                        * Another thread may evict it.
+                        */
+                       mutex_exit(&db->db_mtx);
+
                         /*
-                        * The bonus buffer's dnode hold is no longer discounted
-                        * in dnode_move(). The dnode cannot move until after
-                        * the dnode_rele().
+                        * If the dnode has been freed, evict the bonus
+                        * buffer immediately.  The data in the bonus
+                        * buffer is no longer relevant and this prevents
+                        * a stale bonus buffer from being associated
+                        * with this dnode_t should the dnode_t be reused
+                        * prior to being destroyed.
                          */
-                       dnode_rele(DB_DNODE(db), db);
+                       mutex_enter(&dn->dn_mtx);
+                       if (dn->dn_type == DMU_OT_NONE ||
+                           dn->dn_free_txg != 0) {
+                               /*
+                                * Drop dn_mtx.  It is a leaf lock and
+                                * cannot be held when dnode_evict_bonus()
+                                * acquires other locks in order to
+                                * perform the eviction.
+                                *
+                                * Freed dnodes cannot be reused until the
+                                * last hold is released.  Since this bonus
+                                * buffer has a hold, the dnode will remain
+                                * in the free state, even without dn_mtx
+                                * held, until the dnode_rele_and_unlock()
+                                * below.
+                                */
+                               mutex_exit(&dn->dn_mtx);
+                               dnode_evict_bonus(dn);
+                               mutex_enter(&dn->dn_mtx);
+                       }
+                       dnode_rele_and_unlock(dn, db);
                 } else if (db->db_buf == NULL) {
                         /*
                          * This is a special case: we never associated this
@@ -2227,11 +2293,23 @@ dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
                          * block on-disk. If so, then we simply evict
                          * ourselves.
                          */
-                       if (!DBUF_IS_CACHEABLE(db) ||
-                           arc_buf_eviction_needed(db->db_buf))
+                       if (!DBUF_IS_CACHEABLE(db)) {
+                               if (db->db_blkptr != NULL &&
+                                   !BP_IS_HOLE(db->db_blkptr) &&
+                                   !BP_IS_EMBEDDED(db->db_blkptr)) {
+                                       spa_t *spa =
+                                           dmu_objset_spa(db->db_objset);
+                                       blkptr_t bp = *db->db_blkptr;
+                                       dbuf_clear(db);
+                                       arc_freed(spa, &bp);
+                               } else {
+                                       dbuf_clear(db);
+                               }
+                       } else if (arc_buf_eviction_needed(db->db_buf)) {
                                 dbuf_clear(db);
-                       else
+                       } else {
                                 mutex_exit(&db->db_mtx);
+                       }
                 }
         } else {
                 mutex_exit(&db->db_mtx);
@@ -2597,7 +2675,7 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
         uint64_t fill = 0;
         int i;
  
-       ASSERT(db->db_blkptr == bp);
+       ASSERT3P(db->db_blkptr, ==, bp);
  
         DB_DNODE_ENTER(db);
         dn = DB_DNODE(db);
@@ -2605,18 +2683,15 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
         dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
         zio->io_prev_space_delta = delta;
  
-       if (BP_IS_HOLE(bp)) {
-               ASSERT(bp->blk_fill == 0);
-               DB_DNODE_EXIT(db);
-               return;
+       if (bp->blk_birth != 0) {
+               ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
+                   BP_GET_TYPE(bp) == dn->dn_type) ||
+                   (db->db_blkid == DMU_SPILL_BLKID &&
+                   BP_GET_TYPE(bp) == dn->dn_bonustype) ||
+                   BP_IS_EMBEDDED(bp));
+               ASSERT(BP_GET_LEVEL(bp) == db->db_level);
         }
  
-       ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
-           BP_GET_TYPE(bp) == dn->dn_type) ||
-           (db->db_blkid == DMU_SPILL_BLKID &&
-           BP_GET_TYPE(bp) == dn->dn_bonustype));
-       ASSERT(BP_GET_LEVEL(bp) == db->db_level);
-
         mutex_enter(&db->db_mtx);
  
  #ifdef ZFS_DEBUG
@@ -2642,7 +2717,11 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
                                         fill++;
                         }
                 } else {
-                       fill = 1;
+                       if (BP_IS_HOLE(bp)) {
+                               fill = 0;
+                       } else {
+                               fill = 1;
+                       }
                 }
         } else {
                 blkptr_t *ibp = db->db.db_data;
@@ -2650,12 +2729,13 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
                 for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
                         if (BP_IS_HOLE(ibp))
                                 continue;
-                       fill += ibp->blk_fill;
+                       fill += BP_GET_FILL(ibp);
                 }
         }
         DB_DNODE_EXIT(db);
  
-       bp->blk_fill = fill;
+       if (!BP_IS_EMBEDDED(bp))
+               bp->blk_fill = fill;
  
         mutex_exit(&db->db_mtx);
  }
@@ -2697,9 +2777,10 @@ static void
  dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
  {
         dmu_buf_impl_t *db = vdb;
-       blkptr_t *bp = zio->io_bp;
         blkptr_t *bp_orig = &zio->io_bp_orig;
-       uint64_t txg = zio->io_txg;
+       blkptr_t *bp = db->db_blkptr;
+       objset_t *os = db->db_objset;
+       dmu_tx_t *tx = os->os_synctx;
         dbuf_dirty_record_t **drp, *dr;
  
         ASSERT0(zio->io_error);
@@ -2712,14 +2793,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
         if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
                 ASSERT(BP_EQUAL(bp, bp_orig));
         } else {
-               objset_t *os;
-               dsl_dataset_t *ds;
-               dmu_tx_t *tx;
-
-               DB_GET_OBJSET(&os, db);
-               ds = os->os_dsl_dataset;
-               tx = os->os_synctx;
-
+               dsl_dataset_t *ds = os->os_dsl_dataset;
                 (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
                 dsl_dataset_block_born(ds, bp, tx);
         }
@@ -2732,7 +2806,6 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
         while ((dr = *drp) != db->db_data_pending)
                 drp = &dr->dr_next;
         ASSERT(!list_link_active(&dr->dr_dirty_node));
-       ASSERT(dr->dr_txg == txg);
         ASSERT(dr->dr_dbuf == db);
         ASSERT(dr->dr_next == NULL);
         *drp = dr->dr_next;
@@ -2766,15 +2839,16 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
                 DB_DNODE_ENTER(db);
                 dn = DB_DNODE(db);
                 ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
-               ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+               ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
                 if (!BP_IS_HOLE(db->db_blkptr)) {
                         ASSERTV(int epbs = dn->dn_phys->dn_indblkshift -
                             SPA_BLKPTRSHIFT);
+                       ASSERT3U(db->db_blkid, <=,
+                           dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
                         ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
                             db->db.db_size);
-                       ASSERT3U(dn->dn_phys->dn_maxblkid
-                           >> (db->db_level * epbs), >=, db->db_blkid);
-                       arc_set_callback(db->db_buf, dbuf_do_evict, db);
+                       if (!arc_released(db->db_buf))
+                               arc_set_callback(db->db_buf, dbuf_do_evict, db);
                 }
                 DB_DNODE_EXIT(db);
                 mutex_destroy(&dr->dt.di.dr_mtx);
@@ -2786,8 +2860,7 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
         ASSERT(db->db_dirtycnt > 0);
         db->db_dirtycnt -= 1;
         db->db_data_pending = NULL;
-
-       dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
+       dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
  }
  
  static void
@@ -2838,7 +2911,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
         objset_t *os;
         dmu_buf_impl_t *parent = db->db_parent;
         uint64_t txg = tx->tx_txg;
-       zbookmark_t zb;
+       zbookmark_phys_t zb;
         zio_prop_t zp;
         zio_t *zio;
         int wp_flag = 0;
@@ -2901,10 +2974,16 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
         dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
         DB_DNODE_EXIT(db);
  
-       if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
-               ASSERT(db->db_state != DB_NOFILL);
+       if (db->db_level == 0 &&
+           dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
+               /*
+                * The BP for this block has been provided by open context
+                * (by dmu_sync() or dmu_buf_write_embedded()).
+                */
+               void *contents = (data != NULL) ? data->b_data : NULL;
+
                 dr->dr_zio = zio_write(zio, os->os_spa, txg,
-                   db->db_blkptr, data->b_data, arc_buf_size(data), &zp,
+                   db->db_blkptr, contents, db->db.db_size, &zp,
                     dbuf_write_override_ready, NULL, dbuf_write_override_done,
                     dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
                 mutex_enter(&db->db_mtx);
@@ -2965,4 +3044,5 @@ EXPORT_SYMBOL(dmu_buf_set_user_ie);
  EXPORT_SYMBOL(dmu_buf_update_user);
  EXPORT_SYMBOL(dmu_buf_get_user);
  EXPORT_SYMBOL(dmu_buf_freeable);
+EXPORT_SYMBOL(dmu_buf_get_blkptr);
  #endif