git.proxmox.com Git - mirror_zfs.git/blobdiff - module/zfs/dnode.c
Fix typos in module/zfs/
[mirror_zfs.git] / module / zfs / dnode.c
index 4e2a733830b1ff5dc0530670850bef38931ff1d8..4ee192ed5e9520dd98471a14d0d93b38f31afe4c 100644 (file)
@@ -55,7 +55,6 @@ dnode_stats_t dnode_stats = {
        { "dnode_hold_free_lock_retry",         KSTAT_DATA_UINT64 },
        { "dnode_hold_free_overflow",           KSTAT_DATA_UINT64 },
        { "dnode_hold_free_refcount",           KSTAT_DATA_UINT64 },
-       { "dnode_hold_free_txg",                KSTAT_DATA_UINT64 },
        { "dnode_free_interior_lock_retry",     KSTAT_DATA_UINT64 },
        { "dnode_allocate",                     KSTAT_DATA_UINT64 },
        { "dnode_reallocate",                   KSTAT_DATA_UINT64 },
@@ -125,8 +124,8 @@ dnode_cons(void *arg, void *unused, int kmflag)
         * Every dbuf has a reference, and dropping a tracked reference is
         * O(number of references), so don't track dn_holds.
         */
-       refcount_create_untracked(&dn->dn_holds);
-       refcount_create(&dn->dn_tx_holds);
+       zfs_refcount_create_untracked(&dn->dn_holds);
+       zfs_refcount_create(&dn->dn_tx_holds);
        list_link_init(&dn->dn_link);
 
        bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
@@ -184,8 +183,8 @@ dnode_dest(void *arg, void *unused)
        mutex_destroy(&dn->dn_mtx);
        mutex_destroy(&dn->dn_dbufs_mtx);
        cv_destroy(&dn->dn_notxholds);
-       refcount_destroy(&dn->dn_holds);
-       refcount_destroy(&dn->dn_tx_holds);
+       zfs_refcount_destroy(&dn->dn_holds);
+       zfs_refcount_destroy(&dn->dn_tx_holds);
        ASSERT(!list_link_active(&dn->dn_link));
 
        for (i = 0; i < TXG_SIZE; i++) {
@@ -384,12 +383,20 @@ dnode_buf_byteswap(void *vbuf, size_t size)
 void
 dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
 {
-       ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
+       ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
 
        dnode_setdirty(dn, tx);
        rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
        ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
            (dn->dn_nblkptr-1) * sizeof (blkptr_t));
+
+       if (newsize < dn->dn_bonuslen) {
+               /* clear any data after the end of the new size */
+               size_t diff = dn->dn_bonuslen - newsize;
+               char *data_end = ((char *)dn->dn_bonus->db.db_data) + newsize;
+               bzero(data_end, diff);
+       }
+
        dn->dn_bonuslen = newsize;
        if (newsize == 0)
                dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
@@ -401,7 +408,7 @@ dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
 void
 dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
 {
-       ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
+       ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
        dnode_setdirty(dn, tx);
        rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
        dn->dn_bonustype = newtype;
@@ -412,10 +419,10 @@ dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
 void
 dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
 {
-       ASSERT3U(refcount_count(&dn->dn_holds), >=, 1);
+       ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
        ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
        dnode_setdirty(dn, tx);
-       dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK;
+       dn->dn_rm_spillblk[tx->tx_txg & TXG_MASK] = DN_KILL_SPILLBLK;
        dn->dn_have_spill = B_FALSE;
 }
 
@@ -605,8 +612,8 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
        ASSERT0(dn->dn_allocated_txg);
        ASSERT0(dn->dn_assigned_txg);
        ASSERT0(dn->dn_dirty_txg);
-       ASSERT(refcount_is_zero(&dn->dn_tx_holds));
-       ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
+       ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
+       ASSERT3U(zfs_refcount_count(&dn->dn_holds), <=, 1);
        ASSERT(avl_is_empty(&dn->dn_dbufs));
 
        for (i = 0; i < TXG_SIZE; i++) {
@@ -660,7 +667,8 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
 
 void
 dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
-    dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
+    dmu_object_type_t bonustype, int bonuslen, int dn_slots,
+    boolean_t keep_spill, dmu_tx_t *tx)
 {
        int nblkptr;
 
@@ -690,14 +698,15 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
        dnode_setdirty(dn, tx);
        if (dn->dn_datablksz != blocksize) {
                /* change blocksize */
-               ASSERT(dn->dn_maxblkid == 0 &&
-                   (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
-                   dnode_block_freed(dn, 0)));
+               ASSERT0(dn->dn_maxblkid);
+               ASSERT(BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) ||
+                   dnode_block_freed(dn, 0));
+
                dnode_setdblksz(dn, blocksize);
-               dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize;
+               dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = blocksize;
        }
        if (dn->dn_bonuslen != bonuslen)
-               dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen;
+               dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = bonuslen;
 
        if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
                nblkptr = 1;
@@ -706,13 +715,14 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
                    1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
                    SPA_BLKPTRSHIFT));
        if (dn->dn_bonustype != bonustype)
-               dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
+               dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = bonustype;
        if (dn->dn_nblkptr != nblkptr)
-               dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr;
-       if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+               dn->dn_next_nblkptr[tx->tx_txg & TXG_MASK] = nblkptr;
+       if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR && !keep_spill) {
                dbuf_rm_spill(dn, tx);
                dnode_rm_spill(dn, tx);
        }
+
        rw_exit(&dn->dn_struct_rwlock);
 
        /* change type */
@@ -749,7 +759,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
        ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
        ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
        ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
-       ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock));
+       ASSERT(!MUTEX_HELD(&odn->dn_zfetch.zf_lock));
 
        /* Copy fields. */
        ndn->dn_objset = odn->dn_objset;
@@ -800,8 +810,8 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
        ndn->dn_dirty_txg = odn->dn_dirty_txg;
        ndn->dn_dirtyctx = odn->dn_dirtyctx;
        ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
-       ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
-       refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
+       ASSERT(zfs_refcount_count(&odn->dn_tx_holds) == 0);
+       zfs_refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
        ASSERT(avl_is_empty(&ndn->dn_dbufs));
        avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
        ndn->dn_dbufs_count = odn->dn_dbufs_count;
@@ -840,7 +850,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
            offsetof(dmu_buf_impl_t, db_link));
        odn->dn_dbufs_count = 0;
        odn->dn_bonus = NULL;
-       odn->dn_zfetch.zf_dnode = NULL;
+       dmu_zfetch_fini(&odn->dn_zfetch);
 
        /*
         * Set the low bit of the objset pointer to ensure that dnode_move()
@@ -993,7 +1003,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
         * hold before the dbuf is removed, the hold is discounted, and the
         * removal is blocked until the move completes.
         */
-       refcount = refcount_count(&odn->dn_holds);
+       refcount = zfs_refcount_count(&odn->dn_holds);
        ASSERT(refcount >= 0);
        dbufs = odn->dn_dbufs_count;
 
@@ -1021,7 +1031,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
 
        list_link_replace(&odn->dn_link, &ndn->dn_link);
        /* If the dnode was safe to move, the refcount cannot have changed. */
-       ASSERT(refcount == refcount_count(&ndn->dn_holds));
+       ASSERT(refcount == zfs_refcount_count(&ndn->dn_holds));
        ASSERT(dbufs == ndn->dn_dbufs_count);
        zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
        mutex_exit(&os->os_lock);
@@ -1106,6 +1116,7 @@ dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
                } else if (DN_SLOT_IS_PTR(dn)) {
                        mutex_enter(&dn->dn_mtx);
                        boolean_t can_free = (dn->dn_type == DMU_OT_NONE &&
+                           zfs_refcount_is_zero(&dn->dn_holds) &&
                            !DNODE_IS_DIRTY(dn));
                        mutex_exit(&dn->dn_mtx);
 
@@ -1152,8 +1163,10 @@ dnode_free_interior_slots(dnode_t *dn)
 
        ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
 
-       while (!dnode_slots_tryenter(children, idx, slots))
+       while (!dnode_slots_tryenter(children, idx, slots)) {
                DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
+               cond_resched();
+       }
 
        dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
        dnode_slots_rele(children, idx, slots);
@@ -1170,7 +1183,7 @@ dnode_special_close(dnode_handle_t *dnh)
         * has a hold on this dnode while we are trying to evict this
         * dnode.
         */
-       while (refcount_count(&dn->dn_holds) > 0)
+       while (zfs_refcount_count(&dn->dn_holds) > 0)
                delay(1);
        ASSERT(dn->dn_dbuf == NULL ||
            dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
@@ -1225,8 +1238,8 @@ dnode_buf_evict_async(void *dbu)
                 * it wouldn't be eligible for eviction and this function
                 * would not have been called.
                 */
-               ASSERT(refcount_is_zero(&dn->dn_holds));
-               ASSERT(refcount_is_zero(&dn->dn_tx_holds));
+               ASSERT(zfs_refcount_is_zero(&dn->dn_holds));
+               ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
 
                dnode_destroy(dn); /* implicit zrl_remove() for first slot */
                zrl_destroy(&dnh->dnh_zrlock);
@@ -1249,12 +1262,18 @@ dnode_buf_evict_async(void *dbu)
  * as an extra dnode slot by an large dnode, in which case it returns
  * ENOENT.
  *
+ * If the DNODE_DRY_RUN flag is set, we don't actually hold the dnode, just
+ * return whether the hold would succeed or not. tag and dnp should be set to
+ * NULL in this case.
+ *
  * errors:
  * EINVAL - Invalid object number or flags.
  * ENOSPC - Hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE)
  * EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE)
+ *        - Refers to a freeing dnode (DNODE_MUST_BE_FREE)
  *        - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED)
  * ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED)
+ *        - The requested dnode is being freed (DNODE_MUST_BE_ALLOCATED)
  * EIO    - I/O error when reading the meta dnode dbuf.
  *
  * succeeds even for free dnodes.
@@ -1275,6 +1294,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
 
        ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
        ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0));
+       IMPLY(flag & DNODE_DRY_RUN, (tag == NULL) && (dnp == NULL));
 
        /*
         * If you are holding the spa config lock as writer, you shouldn't
@@ -1304,8 +1324,11 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
                if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
                        return (SET_ERROR(EEXIST));
                DNODE_VERIFY(dn);
-               (void) refcount_add(&dn->dn_holds, tag);
-               *dnp = dn;
+               /* Don't actually hold if dry run, just return 0 */
+               if (!(flag & DNODE_DRY_RUN)) {
+                       (void) zfs_refcount_add(&dn->dn_holds, tag);
+                       *dnp = dn;
+               }
                return (0);
        }
 
@@ -1323,7 +1346,6 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
        }
 
        blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
-
        db = dbuf_hold(mdn, blk, FTAG);
        if (drop_struct_lock)
                rw_exit(&mdn->dn_struct_rwlock);
@@ -1398,34 +1420,30 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
        }
 
        ASSERT(dnc->dnc_count == epb);
-       dn = DN_SLOT_UNINIT;
 
        if (flag & DNODE_MUST_BE_ALLOCATED) {
                slots = 1;
 
-               while (dn == DN_SLOT_UNINIT) {
-                       dnode_slots_hold(dnc, idx, slots);
-                       dnh = &dnc->dnc_children[idx];
-
-                       if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
-                               dn = dnh->dnh_dnode;
-                               break;
-                       } else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) {
-                               DNODE_STAT_BUMP(dnode_hold_alloc_interior);
-                               dnode_slots_rele(dnc, idx, slots);
-                               dbuf_rele(db, FTAG);
-                               return (SET_ERROR(EEXIST));
-                       } else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) {
-                               DNODE_STAT_BUMP(dnode_hold_alloc_misses);
-                               dnode_slots_rele(dnc, idx, slots);
-                               dbuf_rele(db, FTAG);
-                               return (SET_ERROR(ENOENT));
-                       }
+               dnode_slots_hold(dnc, idx, slots);
+               dnh = &dnc->dnc_children[idx];
 
+               if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+                       dn = dnh->dnh_dnode;
+               } else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) {
+                       DNODE_STAT_BUMP(dnode_hold_alloc_interior);
+                       dnode_slots_rele(dnc, idx, slots);
+                       dbuf_rele(db, FTAG);
+                       return (SET_ERROR(EEXIST));
+               } else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) {
+                       DNODE_STAT_BUMP(dnode_hold_alloc_misses);
+                       dnode_slots_rele(dnc, idx, slots);
+                       dbuf_rele(db, FTAG);
+                       return (SET_ERROR(ENOENT));
+               } else {
                        dnode_slots_rele(dnc, idx, slots);
-                       if (!dnode_slots_tryenter(dnc, idx, slots)) {
+                       while (!dnode_slots_tryenter(dnc, idx, slots)) {
                                DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry);
-                               continue;
+                               cond_resched();
                        }
 
                        /*
@@ -1443,7 +1461,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
                }
 
                mutex_enter(&dn->dn_mtx);
-               if (dn->dn_type == DMU_OT_NONE) {
+               if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg != 0) {
                        DNODE_STAT_BUMP(dnode_hold_alloc_type_none);
                        mutex_exit(&dn->dn_mtx);
                        dnode_slots_rele(dnc, idx, slots);
@@ -1451,6 +1469,14 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
                        return (SET_ERROR(ENOENT));
                }
 
+               /* Don't actually hold if dry run, just return 0 */
+               if (flag & DNODE_DRY_RUN) {
+                       mutex_exit(&dn->dn_mtx);
+                       dnode_slots_rele(dnc, idx, slots);
+                       dbuf_rele(db, FTAG);
+                       return (0);
+               }
+
                DNODE_STAT_BUMP(dnode_hold_alloc_hits);
        } else if (flag & DNODE_MUST_BE_FREE) {
 
@@ -1460,49 +1486,47 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
                        return (SET_ERROR(ENOSPC));
                }
 
-               while (dn == DN_SLOT_UNINIT) {
-                       dnode_slots_hold(dnc, idx, slots);
-
-                       if (!dnode_check_slots_free(dnc, idx, slots)) {
-                               DNODE_STAT_BUMP(dnode_hold_free_misses);
-                               dnode_slots_rele(dnc, idx, slots);
-                               dbuf_rele(db, FTAG);
-                               return (SET_ERROR(ENOSPC));
-                       }
+               dnode_slots_hold(dnc, idx, slots);
 
+               if (!dnode_check_slots_free(dnc, idx, slots)) {
+                       DNODE_STAT_BUMP(dnode_hold_free_misses);
                        dnode_slots_rele(dnc, idx, slots);
-                       if (!dnode_slots_tryenter(dnc, idx, slots)) {
-                               DNODE_STAT_BUMP(dnode_hold_free_lock_retry);
-                               continue;
-                       }
+                       dbuf_rele(db, FTAG);
+                       return (SET_ERROR(ENOSPC));
+               }
 
-                       if (!dnode_check_slots_free(dnc, idx, slots)) {
-                               DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
-                               dnode_slots_rele(dnc, idx, slots);
-                               dbuf_rele(db, FTAG);
-                               return (SET_ERROR(ENOSPC));
-                       }
+               dnode_slots_rele(dnc, idx, slots);
+               while (!dnode_slots_tryenter(dnc, idx, slots)) {
+                       DNODE_STAT_BUMP(dnode_hold_free_lock_retry);
+                       cond_resched();
+               }
 
-                       /*
-                        * Allocated but otherwise free dnodes which would
-                        * be in the interior of a multi-slot dnodes need
-                        * to be freed.  Single slot dnodes can be safely
-                        * re-purposed as a performance optimization.
-                        */
-                       if (slots > 1)
-                               dnode_reclaim_slots(dnc, idx + 1, slots - 1);
+               if (!dnode_check_slots_free(dnc, idx, slots)) {
+                       DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
+                       dnode_slots_rele(dnc, idx, slots);
+                       dbuf_rele(db, FTAG);
+                       return (SET_ERROR(ENOSPC));
+               }
 
-                       dnh = &dnc->dnc_children[idx];
-                       if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
-                               dn = dnh->dnh_dnode;
-                       } else {
-                               dn = dnode_create(os, dn_block + idx, db,
-                                   object, dnh);
-                       }
+               /*
+                * Allocated but otherwise free dnodes which would
+                * be in the interior of a multi-slot dnodes need
+                * to be freed.  Single slot dnodes can be safely
+                * re-purposed as a performance optimization.
+                */
+               if (slots > 1)
+                       dnode_reclaim_slots(dnc, idx + 1, slots - 1);
+
+               dnh = &dnc->dnc_children[idx];
+               if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+                       dn = dnh->dnh_dnode;
+               } else {
+                       dn = dnode_create(os, dn_block + idx, db,
+                           object, dnh);
                }
 
                mutex_enter(&dn->dn_mtx);
-               if (!refcount_is_zero(&dn->dn_holds)) {
+               if (!zfs_refcount_is_zero(&dn->dn_holds) || dn->dn_free_txg) {
                        DNODE_STAT_BUMP(dnode_hold_free_refcount);
                        mutex_exit(&dn->dn_mtx);
                        dnode_slots_rele(dnc, idx, slots);
@@ -1510,6 +1534,14 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
                        return (SET_ERROR(EEXIST));
                }
 
+               /* Don't actually hold if dry run, just return 0 */
+               if (flag & DNODE_DRY_RUN) {
+                       mutex_exit(&dn->dn_mtx);
+                       dnode_slots_rele(dnc, idx, slots);
+                       dbuf_rele(db, FTAG);
+                       return (0);
+               }
+
                dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR);
                DNODE_STAT_BUMP(dnode_hold_free_hits);
        } else {
@@ -1517,17 +1549,9 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
                return (SET_ERROR(EINVAL));
        }
 
-       if (dn->dn_free_txg) {
-               DNODE_STAT_BUMP(dnode_hold_free_txg);
-               type = dn->dn_type;
-               mutex_exit(&dn->dn_mtx);
-               dnode_slots_rele(dnc, idx, slots);
-               dbuf_rele(db, FTAG);
-               return (SET_ERROR((flag & DNODE_MUST_BE_ALLOCATED) ?
-                   ENOENT : EEXIST));
-       }
+       ASSERT0(dn->dn_free_txg);
 
-       if (refcount_add(&dn->dn_holds, tag) == 1)
+       if (zfs_refcount_add(&dn->dn_holds, tag) == 1)
                dbuf_add_ref(db, dnh);
 
        mutex_exit(&dn->dn_mtx);
@@ -1563,11 +1587,11 @@ boolean_t
 dnode_add_ref(dnode_t *dn, void *tag)
 {
        mutex_enter(&dn->dn_mtx);
-       if (refcount_is_zero(&dn->dn_holds)) {
+       if (zfs_refcount_is_zero(&dn->dn_holds)) {
                mutex_exit(&dn->dn_mtx);
                return (FALSE);
        }
-       VERIFY(1 < refcount_add(&dn->dn_holds, tag));
+       VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag));
        mutex_exit(&dn->dn_mtx);
        return (TRUE);
 }
@@ -1587,7 +1611,7 @@ dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting)
        dmu_buf_impl_t *db = dn->dn_dbuf;
        dnode_handle_t *dnh = dn->dn_handle;
 
-       refs = refcount_remove(&dn->dn_holds, tag);
+       refs = zfs_refcount_remove(&dn->dn_holds, tag);
        mutex_exit(&dn->dn_mtx);
 
        /*
@@ -1616,6 +1640,16 @@ dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting)
        }
 }
 
+/*
+ * Test whether we can create a dnode at the specified location.
+ */
+int
+dnode_try_claim(objset_t *os, uint64_t object, int slots)
+{
+       return (dnode_hold_impl(os, object, DNODE_MUST_BE_FREE | DNODE_DRY_RUN,
+           slots, NULL, NULL));
+}
+
 void
 dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
 {
@@ -1652,12 +1686,12 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
                return;
        }
 
-       ASSERT(!refcount_is_zero(&dn->dn_holds) ||
+       ASSERT(!zfs_refcount_is_zero(&dn->dn_holds) ||
            !avl_is_empty(&dn->dn_dbufs));
        ASSERT(dn->dn_datablksz != 0);
-       ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]);
-       ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]);
-       ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]);
+       ASSERT0(dn->dn_next_bonuslen[txg & TXG_MASK]);
+       ASSERT0(dn->dn_next_blksz[txg & TXG_MASK]);
+       ASSERT0(dn->dn_next_bonustype[txg & TXG_MASK]);
 
        dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
            dn->dn_object, txg);
@@ -1740,10 +1774,11 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
 
        /* resize the old block */
        err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
-       if (err == 0)
+       if (err == 0) {
                dbuf_new_size(db, size, tx);
-       else if (err != ENOENT)
+       } else if (err != ENOENT) {
                goto fail;
+       }
 
        dnode_setdblksz(dn, size);
        dnode_setdirty(dn, tx);
@@ -1752,7 +1787,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
                dn->dn_indblkshift = ibs;
                dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
        }
-       /* rele after we have fixed the blocksize in the dnode */
+       /* release after we have fixed the blocksize in the dnode */
        if (db)
                dbuf_rele(db, FTAG);
 
@@ -1829,7 +1864,8 @@ out:
 
 /* read-holding callers must not rely on the lock being continuously held */
 void
-dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
+dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read,
+    boolean_t force)
 {
        int epbs, new_nlevels;
        uint64_t sz;
@@ -1854,14 +1890,25 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
                }
        }
 
-       if (blkid <= dn->dn_maxblkid)
+       /*
+        * Raw sends (indicated by the force flag) require that we take the
+        * given blkid even if the value is lower than the current value.
+        */
+       if (!force && blkid <= dn->dn_maxblkid)
                goto out;
 
+       /*
+        * We use the (otherwise unused) top bit of dn_next_maxblkid[txgoff]
+        * to indicate that this field is set. This allows us to set the
+        * maxblkid to 0 on an existing object in dnode_sync().
+        */
        dn->dn_maxblkid = blkid;
-       dn->dn_next_maxblkid[tx->tx_txg & TXG_MASK] = blkid;
+       dn->dn_next_maxblkid[tx->tx_txg & TXG_MASK] =
+           blkid | DMU_NEXT_MAXBLKID_SET;
 
        /*
         * Compute the number of levels necessary to support the new maxblkid.
+        * Raw sends will ensure nlevels is set correctly for us.
         */
        new_nlevels = 1;
        epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
@@ -1871,8 +1918,12 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
 
        ASSERT3U(new_nlevels, <=, DN_MAX_LEVELS);
 
-       if (new_nlevels > dn->dn_nlevels)
-               dnode_set_nlevels_impl(dn, new_nlevels, tx);
+       if (!force) {
+               if (new_nlevels > dn->dn_nlevels)
+                       dnode_set_nlevels_impl(dn, new_nlevels, tx);
+       } else {
+               ASSERT3U(dn->dn_nlevels, >=, new_nlevels);
+       }
 
 out:
        if (have_read)
@@ -1949,7 +2000,8 @@ dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
        for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) {
                if (db->db_level != 1 || db->db_blkid >= end_blkid)
                        break;
-               ASSERT(db->db_dirtycnt > 0);
+               if (db->db_state != DB_EVICTING)
+                       ASSERT(db->db_dirtycnt > 0);
        }
 #endif
        mutex_exit(&dn->dn_dbufs_mtx);
@@ -1964,7 +2016,6 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
        int trunc = FALSE;
        int epbs;
 
-       rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
        blksz = dn->dn_datablksz;
        blkshift = dn->dn_datablkshift;
        epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
@@ -1981,7 +2032,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
                head = P2NPHASE(off, blksz);
                blkoff = P2PHASE(off, blksz);
                if ((off >> blkshift) > dn->dn_maxblkid)
-                       goto out;
+                       return;
        } else {
                ASSERT(dn->dn_maxblkid == 0);
                if (off == 0 && len >= blksz) {
@@ -1990,12 +2041,15 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
                         */
                        blkid = 0;
                        nblks = 1;
-                       if (dn->dn_nlevels > 1)
+                       if (dn->dn_nlevels > 1) {
+                               rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
                                dnode_dirty_l1(dn, 0, tx);
+                               rw_exit(&dn->dn_struct_rwlock);
+                       }
                        goto done;
                } else if (off >= blksz) {
                        /* Freeing past end-of-data */
-                       goto out;
+                       return;
                } else {
                        /* Freeing part of the block. */
                        head = blksz - off;
@@ -2005,19 +2059,26 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
        }
        /* zero out any partial block data at the start of the range */
        if (head) {
+               int res;
                ASSERT3U(blkoff + head, ==, blksz);
                if (len < head)
                        head = len;
-               if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
-                   TRUE, FALSE, FTAG, &db) == 0) {
+               rw_enter(&dn->dn_struct_rwlock, RW_READER);
+               res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
+                   TRUE, FALSE, FTAG, &db);
+               rw_exit(&dn->dn_struct_rwlock);
+               if (res == 0) {
                        caddr_t data;
+                       boolean_t dirty;
 
+                       db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER,
+                           FTAG);
                        /* don't dirty if it isn't on disk and isn't dirty */
-                       if (db->db_last_dirty ||
-                           (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
-                               rw_exit(&dn->dn_struct_rwlock);
+                       dirty = db->db_last_dirty ||
+                           (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr));
+                       dmu_buf_unlock_parent(db, dblt, FTAG);
+                       if (dirty) {
                                dmu_buf_will_dirty(&db->db, tx);
-                               rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
                                data = db->db.db_data;
                                bzero(data + blkoff, head);
                        }
@@ -2029,11 +2090,11 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 
        /* If the range was less than one block, we're done */
        if (len == 0)
-               goto out;
+               return;
 
        /* If the remaining range is past end of file, we're done */
        if ((off >> blkshift) > dn->dn_maxblkid)
-               goto out;
+               return;
 
        ASSERT(ISP2(blksz));
        if (trunc)
@@ -2044,16 +2105,23 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
        ASSERT0(P2PHASE(off, blksz));
        /* zero out any partial block data at the end of the range */
        if (tail) {
+               int res;
                if (len < tail)
                        tail = len;
-               if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
-                   TRUE, FALSE, FTAG, &db) == 0) {
+               rw_enter(&dn->dn_struct_rwlock, RW_READER);
+               res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
+                   TRUE, FALSE, FTAG, &db);
+               rw_exit(&dn->dn_struct_rwlock);
+               if (res == 0) {
+                       boolean_t dirty;
                        /* don't dirty if not on disk and not dirty */
-                       if (db->db_last_dirty ||
-                           (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
-                               rw_exit(&dn->dn_struct_rwlock);
+                       db_lock_type_t type = dmu_buf_lock_parent(db, RW_READER,
+                           FTAG);
+                       dirty = db->db_last_dirty ||
+                           (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr));
+                       dmu_buf_unlock_parent(db, type, FTAG);
+                       if (dirty) {
                                dmu_buf_will_dirty(&db->db, tx);
-                               rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
                                bzero(db->db.db_data, tail);
                        }
                        dbuf_rele(db, FTAG);
@@ -2063,7 +2131,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 
        /* If the range did not include a full block, we are done */
        if (len == 0)
-               goto out;
+               return;
 
        ASSERT(IS_P2ALIGNED(off, blksz));
        ASSERT(trunc || IS_P2ALIGNED(len, blksz));
@@ -2093,6 +2161,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
         *    amount of space if we copy the freed BPs into deadlists.
         */
        if (dn->dn_nlevels > 1) {
+               rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
                uint64_t first, last;
 
                first = blkid >> epbs;
@@ -2137,6 +2206,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 
                        dnode_dirty_l1(dn, i, tx);
                }
+               rw_exit(&dn->dn_struct_rwlock);
        }
 
 done:
@@ -2159,9 +2229,6 @@ done:
 
        dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
        dnode_setdirty(dn, tx);
-out:
-
-       rw_exit(&dn->dn_struct_rwlock);
 }
 
 static boolean_t
@@ -2270,6 +2337,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
        boolean_t hole;
        int i, inc, error, span;
 
+       ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
+
        hole = ((flags & DNODE_FIND_HOLE) != 0);
        inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
        ASSERT(txg == 0 || !hole);
@@ -2302,9 +2371,9 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
                        return (error);
                }
                data = db->db.db_data;
+               rw_enter(&db->db_rwlock, RW_READER);
        }
 
-
        if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
            db->db_blkptr->blk_birth <= txg ||
            BP_IS_HOLE(db->db_blkptr))) {
@@ -2377,8 +2446,10 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
                        error = SET_ERROR(ESRCH);
        }
 
-       if (db)
+       if (db != NULL) {
+               rw_exit(&db->db_rwlock);
                dbuf_rele(db, FTAG);
+       }
 
        return (error);
 }
@@ -2464,3 +2535,13 @@ out:
 
        return (error);
 }
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(dnode_hold);
+EXPORT_SYMBOL(dnode_rele);
+EXPORT_SYMBOL(dnode_set_nlevels);
+EXPORT_SYMBOL(dnode_set_blksz);
+EXPORT_SYMBOL(dnode_free_range);
+EXPORT_SYMBOL(dnode_evict_dbufs);
+EXPORT_SYMBOL(dnode_evict_bonus);
+#endif