]> git.proxmox.com Git - mirror_zfs.git/blobdiff - module/zfs/dnode.c
OpenZFS 7500 - Simplify dbuf_free_range by removing dn_unlisted_l0_blkid
[mirror_zfs.git] / module / zfs / dnode.c
index 2858bbfb492ea74f81e241cab6fb84f9f94f931e..be12ac0fe7169076914ad9490a3214d83923c2de 100644 (file)
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
@@ -69,19 +69,13 @@ dbuf_compare(const void *x1, const void *x2)
        const dmu_buf_impl_t *d1 = x1;
        const dmu_buf_impl_t *d2 = x2;
 
-       if (d1->db_level < d2->db_level) {
-               return (-1);
-       }
-       if (d1->db_level > d2->db_level) {
-               return (1);
-       }
+       int cmp = AVL_CMP(d1->db_level, d2->db_level);
+       if (likely(cmp))
+               return (cmp);
 
-       if (d1->db_blkid < d2->db_blkid) {
-               return (-1);
-       }
-       if (d1->db_blkid > d2->db_blkid) {
-               return (1);
-       }
+       cmp = AVL_CMP(d1->db_blkid, d2->db_blkid);
+       if (likely(cmp))
+               return (cmp);
 
        if (d1->db_state == DB_SEARCH) {
                ASSERT3S(d2->db_state, !=, DB_SEARCH);
@@ -91,13 +85,7 @@ dbuf_compare(const void *x1, const void *x2)
                return (1);
        }
 
-       if ((uintptr_t)d1 < (uintptr_t)d2) {
-               return (-1);
-       }
-       if ((uintptr_t)d1 > (uintptr_t)d2) {
-               return (1);
-       }
-       return (0);
+       return (AVL_PCMP(d1, d2));
 }
 
 /* ARGSUSED */
@@ -107,7 +95,7 @@ dnode_cons(void *arg, void *unused, int kmflag)
        dnode_t *dn = arg;
        int i;
 
-       rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
+       rw_init(&dn->dn_struct_rwlock, NULL, RW_NOLOCKDEP, NULL);
        mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
@@ -153,7 +141,6 @@ dnode_cons(void *arg, void *unused, int kmflag)
        dn->dn_id_flags = 0;
 
        dn->dn_dbufs_count = 0;
-       dn->dn_unlisted_l0_blkid = 0;
        avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
            offsetof(dmu_buf_impl_t, db_link));
 
@@ -206,7 +193,6 @@ dnode_dest(void *arg, void *unused)
        ASSERT0(dn->dn_id_flags);
 
        ASSERT0(dn->dn_dbufs_count);
-       ASSERT0(dn->dn_unlisted_l0_blkid);
        avl_destroy(&dn->dn_dbufs);
 }
 
@@ -248,6 +234,7 @@ dnode_verify(dnode_t *dn)
        }
        if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
                int i;
+               int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
                ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
                if (dn->dn_datablkshift) {
                        ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
@@ -258,12 +245,12 @@ dnode_verify(dnode_t *dn)
                ASSERT(DMU_OT_IS_VALID(dn->dn_type));
                ASSERT3U(dn->dn_nblkptr, >=, 1);
                ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
-               ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+               ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen);
                ASSERT3U(dn->dn_datablksz, ==,
                    dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
                ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
                ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
-                   dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+                   dn->dn_bonuslen, <=, max_bonuslen);
                for (i = 0; i < TXG_SIZE; i++) {
                        ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
                }
@@ -294,6 +281,7 @@ dnode_byteswap(dnode_phys_t *dnp)
 
        dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
        dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
+       dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots);
        dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
        dnp->dn_used = BSWAP_64(dnp->dn_used);
 
@@ -320,7 +308,8 @@ dnode_byteswap(dnode_phys_t *dnp)
                 * dnode buffer).
                 */
                int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
-               size_t len = DN_MAX_BONUSLEN - off;
+               int slots = dnp->dn_extra_slots + 1;
+               size_t len = DN_SLOTS_TO_BONUSLEN(slots) - off;
                dmu_object_byteswap_t byteswap;
                ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
                byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype);
@@ -329,23 +318,24 @@ dnode_byteswap(dnode_phys_t *dnp)
 
        /* Swap SPILL block if we have one */
        if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
-               byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t));
-
+               byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t));
 }
 
 void
 dnode_buf_byteswap(void *vbuf, size_t size)
 {
-       dnode_phys_t *buf = vbuf;
-       int i;
+       int i = 0;
 
        ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
        ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
 
-       size >>= DNODE_SHIFT;
-       for (i = 0; i < size; i++) {
-               dnode_byteswap(buf);
-               buf++;
+       while (i < size) {
+               dnode_phys_t *dnp = vbuf + i;
+               dnode_byteswap(dnp);
+
+               i += DNODE_MIN_SIZE;
+               if (dnp->dn_type != DMU_OT_NONE)
+                       i += dnp->dn_extra_slots * DNODE_MIN_SIZE;
        }
 }
 
@@ -356,7 +346,7 @@ dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
 
        dnode_setdirty(dn, tx);
        rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
-       ASSERT3U(newsize, <=, DN_MAX_BONUSLEN -
+       ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
            (dn->dn_nblkptr-1) * sizeof (blkptr_t));
        dn->dn_bonuslen = newsize;
        if (newsize == 0)
@@ -434,6 +424,7 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
        dn->dn_compress = dnp->dn_compress;
        dn->dn_bonustype = dnp->dn_bonustype;
        dn->dn_bonuslen = dnp->dn_bonuslen;
+       dn->dn_num_slots = dnp->dn_extra_slots + 1;
        dn->dn_maxblkid = dnp->dn_maxblkid;
        dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
        dn->dn_id_flags = 0;
@@ -470,7 +461,7 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
        dnh->dnh_dnode = dn;
        mutex_exit(&os->os_lock);
 
-       arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
+       arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE);
        return (dn);
 }
 
@@ -509,7 +500,7 @@ dnode_destroy(dnode_t *dn)
        }
        if (dn->dn_bonus != NULL) {
                mutex_enter(&dn->dn_bonus->db_mtx);
-               dbuf_evict(dn->dn_bonus);
+               dbuf_destroy(dn->dn_bonus);
                dn->dn_bonus = NULL;
        }
        dn->dn_zio = NULL;
@@ -522,11 +513,10 @@ dnode_destroy(dnode_t *dn)
        dn->dn_newuid = 0;
        dn->dn_newgid = 0;
        dn->dn_id_flags = 0;
-       dn->dn_unlisted_l0_blkid = 0;
 
-       dmu_zfetch_rele(&dn->dn_zfetch);
+       dmu_zfetch_fini(&dn->dn_zfetch);
        kmem_cache_free(dnode_cache, dn);
-       arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
+       arc_space_return(sizeof (dnode_t), ARC_SPACE_DNODE);
 
        if (complete_os_eviction)
                dmu_objset_evict_done(os);
@@ -534,10 +524,13 @@ dnode_destroy(dnode_t *dn)
 
 void
 dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
-    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+    dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
 {
        int i;
 
+       ASSERT3U(dn_slots, >, 0);
+       ASSERT3U(dn_slots << DNODE_SHIFT, <=,
+           spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)));
        ASSERT3U(blocksize, <=,
            spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
        if (blocksize == 0)
@@ -550,8 +543,8 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
 
        ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
 
-       dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset,
-           dn->dn_object, tx->tx_txg, blocksize, ibs);
+       dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d dn_slots=%d\n",
+           dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots);
 
        ASSERT(dn->dn_type == DMU_OT_NONE);
        ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
@@ -562,7 +555,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
            (bonustype == DMU_OT_SA && bonuslen == 0) ||
            (bonustype != DMU_OT_NONE && bonuslen != 0));
        ASSERT(DMU_OT_IS_VALID(bonustype));
-       ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+       ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots));
        ASSERT(dn->dn_type == DMU_OT_NONE);
        ASSERT0(dn->dn_maxblkid);
        ASSERT0(dn->dn_allocated_txg);
@@ -588,11 +581,15 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
        dnode_setdblksz(dn, blocksize);
        dn->dn_indblkshift = ibs;
        dn->dn_nlevels = 1;
+       dn->dn_num_slots = dn_slots;
        if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
                dn->dn_nblkptr = 1;
-       else
-               dn->dn_nblkptr = 1 +
-                   ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+       else {
+               dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR,
+                   1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
+                   SPA_BLKPTRSHIFT));
+       }
+
        dn->dn_bonustype = bonustype;
        dn->dn_bonuslen = bonuslen;
        dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
@@ -617,7 +614,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
 
 void
 dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
-    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+    dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
 {
        int nblkptr;
 
@@ -631,7 +628,10 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
            (bonustype != DMU_OT_NONE && bonuslen != 0) ||
            (bonustype == DMU_OT_SA && bonuslen == 0));
        ASSERT(DMU_OT_IS_VALID(bonustype));
-       ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+       ASSERT3U(bonuslen, <=,
+           DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
+
+       dn_slots = dn_slots > 0 ? dn_slots : DNODE_MIN_SLOTS;
 
        /* clean up any unreferenced dbufs */
        dnode_evict_dbufs(dn);
@@ -654,7 +654,9 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
        if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
                nblkptr = 1;
        else
-               nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+               nblkptr = MIN(DN_MAX_NBLKPTR,
+                   1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
+                   SPA_BLKPTRSHIFT));
        if (dn->dn_bonustype != bonustype)
                dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
        if (dn->dn_nblkptr != nblkptr)
@@ -672,6 +674,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
        mutex_enter(&dn->dn_mtx);
        dn->dn_bonustype = bonustype;
        dn->dn_bonuslen = bonuslen;
+       dn->dn_num_slots = dn_slots;
        dn->dn_nblkptr = nblkptr;
        dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
        dn->dn_compress = ZIO_COMPRESS_INHERIT;
@@ -680,7 +683,8 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
        /* fix up the bonus db_size */
        if (dn->dn_bonus) {
                dn->dn_bonus->db.db_size =
-                   DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+                   DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
+                   (dn->dn_nblkptr-1) * sizeof (blkptr_t);
                ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
        }
 
@@ -759,7 +763,6 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
        ASSERT(avl_is_empty(&ndn->dn_dbufs));
        avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
        ndn->dn_dbufs_count = odn->dn_dbufs_count;
-       ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid;
        ndn->dn_bonus = odn->dn_bonus;
        ndn->dn_have_spill = odn->dn_have_spill;
        ndn->dn_zio = odn->dn_zio;
@@ -773,8 +776,6 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
        dmu_zfetch_init(&ndn->dn_zfetch, NULL);
        list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
        ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
-       ndn->dn_zfetch.zf_stream_cnt = odn->dn_zfetch.zf_stream_cnt;
-       ndn->dn_zfetch.zf_alloc_fail = odn->dn_zfetch.zf_alloc_fail;
 
        /*
         * Update back pointers. Updating the handle fixes the back pointer of
@@ -794,7 +795,6 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
        avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
            offsetof(dmu_buf_impl_t, db_link));
        odn->dn_dbufs_count = 0;
-       odn->dn_unlisted_l0_blkid = 0;
        odn->dn_bonus = NULL;
        odn->dn_zfetch.zf_dnode = NULL;
 
@@ -1016,7 +1016,7 @@ dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
 }
 
 static void
-dnode_buf_pageout(void *dbu)
+dnode_buf_evict_async(void *dbu)
 {
        dnode_children_t *children_dnodes = dbu;
        int i;
@@ -1054,25 +1054,152 @@ dnode_buf_pageout(void *dbu)
            children_dnodes->dnc_count * sizeof (dnode_handle_t));
 }
 
+/*
+ * Return true if the given index is interior to a dnode already
+ * allocated in the block. That is, the index is neither free nor
+ * allocated, but is consumed by a large dnode.
+ *
+ * The dnode_phys_t buffer may not be in sync with the in-core dnode
+ * structure, so we try to check the dnode structure first and fall back
+ * to the dnode_phys_t buffer if it doesn't exist.
+ */
+static boolean_t
+dnode_is_consumed(dmu_buf_impl_t *db, int idx)
+{
+       dnode_handle_t *dnh;
+       dmu_object_type_t ot;
+       dnode_children_t *children_dnodes;
+       dnode_phys_t *dn_block;
+       int skip;
+       int i;
+
+       children_dnodes = dmu_buf_get_user(&db->db);
+       dn_block = (dnode_phys_t *)db->db.db_data;
+
+       for (i = 0; i < idx; i += skip) {
+               dnh = &children_dnodes->dnc_children[i];
+
+               zrl_add(&dnh->dnh_zrlock);
+               if (dnh->dnh_dnode != NULL) {
+                       ot = dnh->dnh_dnode->dn_type;
+                       skip = dnh->dnh_dnode->dn_num_slots;
+               } else {
+                       ot = dn_block[i].dn_type;
+                       skip = dn_block[i].dn_extra_slots + 1;
+               }
+               zrl_remove(&dnh->dnh_zrlock);
+
+               if (ot == DMU_OT_NONE)
+                       skip = 1;
+       }
+
+       return (i > idx);
+}
+
+/*
+ * Return true if the given index in the dnode block is a valid
+ * allocated dnode. That is, the index is not consumed by a large
+ * dnode and is not free.
+ *
+ * The dnode_phys_t buffer may not be in sync with the in-core dnode
+ * structure, so we try to check the dnode structure first and fall back
+ * to the dnode_phys_t buffer if it doesn't exist.
+ */
+static boolean_t
+dnode_is_allocated(dmu_buf_impl_t *db, int idx)
+{
+       dnode_handle_t *dnh;
+       dmu_object_type_t ot;
+       dnode_children_t *children_dnodes;
+       dnode_phys_t *dn_block;
+
+       if (dnode_is_consumed(db, idx))
+               return (B_FALSE);
+
+       children_dnodes = dmu_buf_get_user(&db->db);
+       dn_block = (dnode_phys_t *)db->db.db_data;
+
+       dnh = &children_dnodes->dnc_children[idx];
+
+       zrl_add(&dnh->dnh_zrlock);
+       if (dnh->dnh_dnode != NULL)
+               ot = dnh->dnh_dnode->dn_type;
+       else
+               ot = dn_block[idx].dn_type;
+       zrl_remove(&dnh->dnh_zrlock);
+
+       return (ot != DMU_OT_NONE);
+}
+
+/*
+ * Return true if the given range of indices in the dnode block are
+ * free. That is, the starting index is not consumed by a large dnode
+ * and none of the indices are allocated.
+ *
+ * The dnode_phys_t buffer may not be in sync with the in-core dnode
+ * structure, so we try to check the dnode structure first and fall back
+ * to the dnode_phys_t buffer if it doesn't exist.
+ */
+static boolean_t
+dnode_is_free(dmu_buf_impl_t *db, int idx, int slots)
+{
+       dnode_handle_t *dnh;
+       dmu_object_type_t ot;
+       dnode_children_t *children_dnodes;
+       dnode_phys_t *dn_block;
+       int i;
+
+       if (idx + slots > DNODES_PER_BLOCK)
+               return (B_FALSE);
+
+       children_dnodes = dmu_buf_get_user(&db->db);
+       dn_block = (dnode_phys_t *)db->db.db_data;
+
+       if (dnode_is_consumed(db, idx))
+               return (B_FALSE);
+
+       for (i = idx; i < idx + slots; i++) {
+               dnh = &children_dnodes->dnc_children[i];
+
+               zrl_add(&dnh->dnh_zrlock);
+               if (dnh->dnh_dnode != NULL)
+                       ot = dnh->dnh_dnode->dn_type;
+               else
+                       ot = dn_block[i].dn_type;
+               zrl_remove(&dnh->dnh_zrlock);
+
+               if (ot != DMU_OT_NONE)
+                       return (B_FALSE);
+       }
+
+       return (B_TRUE);
+}
+
 /*
  * errors:
  * EINVAL - invalid object number.
+ * ENOSPC - hole too small to fulfill "slots" request
+ * ENOENT - the requested dnode is not allocated
  * EIO - i/o error.
  * succeeds even for free dnodes.
  */
 int
-dnode_hold_impl(objset_t *os, uint64_t object, int flag,
+dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
     void *tag, dnode_t **dnp)
 {
-       int epb, idx, err;
+       int epb, idx, err, i;
        int drop_struct_lock = FALSE;
        int type;
        uint64_t blk;
        dnode_t *mdn, *dn;
        dmu_buf_impl_t *db;
        dnode_children_t *children_dnodes;
+       dnode_phys_t *dn_block_begin;
        dnode_handle_t *dnh;
 
+       ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
+       ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0));
+
        /*
         * If you are holding the spa config lock as writer, you shouldn't
         * be asking the DMU to do *anything* unless it's the root pool
@@ -1112,7 +1239,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
                drop_struct_lock = TRUE;
        }
 
-       blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t));
+       blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
 
        db = dbuf_hold(mdn, blk, FTAG);
        if (drop_struct_lock)
@@ -1128,12 +1255,9 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
        ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
        epb = db->db.db_size >> DNODE_SHIFT;
 
-       idx = object & (epb-1);
-
        ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
        children_dnodes = dmu_buf_get_user(&db->db);
        if (children_dnodes == NULL) {
-               int i;
                dnode_children_t *winner;
                children_dnodes = kmem_zalloc(sizeof (dnode_children_t) +
                    epb * sizeof (dnode_handle_t), KM_SLEEP);
@@ -1142,8 +1266,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
                for (i = 0; i < epb; i++) {
                        zrl_init(&dnh[i].dnh_zrlock);
                }
-               dmu_buf_init_user(&children_dnodes->dnc_dbu,
-                   dnode_buf_pageout, NULL);
+               dmu_buf_init_user(&children_dnodes->dnc_dbu, NULL,
+                   dnode_buf_evict_async, NULL);
                winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu);
                if (winner != NULL) {
 
@@ -1158,21 +1282,28 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
        }
        ASSERT(children_dnodes->dnc_count == epb);
 
+       idx = object & (epb - 1);
+       dn_block_begin = (dnode_phys_t *)db->db.db_data;
+
+       if ((flag & DNODE_MUST_BE_FREE) && !dnode_is_free(db, idx, slots)) {
+               dbuf_rele(db, FTAG);
+               return (ENOSPC);
+       } else if ((flag & DNODE_MUST_BE_ALLOCATED) &&
+           !dnode_is_allocated(db, idx)) {
+               dbuf_rele(db, FTAG);
+               return (ENOENT);
+       }
+
        dnh = &children_dnodes->dnc_children[idx];
        zrl_add(&dnh->dnh_zrlock);
        dn = dnh->dnh_dnode;
-       if (dn == NULL) {
-               dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
-
-               dn = dnode_create(os, phys, db, object, dnh);
-       }
+       if (dn == NULL)
+               dn = dnode_create(os, dn_block_begin + idx, db, object, dnh);
 
        mutex_enter(&dn->dn_mtx);
        type = dn->dn_type;
        if (dn->dn_free_txg ||
-           ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
-           ((flag & DNODE_MUST_BE_FREE) &&
-           (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
+           ((flag & DNODE_MUST_BE_FREE) && !refcount_is_zero(&dn->dn_holds))) {
                mutex_exit(&dn->dn_mtx);
                zrl_remove(&dnh->dnh_zrlock);
                dbuf_rele(db, FTAG);
@@ -1200,7 +1331,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
 int
 dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
 {
-       return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
+       return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag,
+           dnp));
 }
 
 /*
@@ -1409,7 +1541,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
                goto fail;
 
        /* resize the old block */
-       err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db);
+       err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
        if (err == 0)
                dbuf_new_size(db, size, tx);
        else if (err != ENOENT)
@@ -1476,6 +1608,8 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
            sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
                new_nlevels++;
 
+       ASSERT3U(new_nlevels, <=, DN_MAX_LEVELS);
+
        if (new_nlevels > dn->dn_nlevels) {
                int old_nlevels = dn->dn_nlevels;
                dmu_buf_impl_t *db;
@@ -1582,8 +1716,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
                ASSERT3U(blkoff + head, ==, blksz);
                if (len < head)
                        head = len;
-               if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE,
-                   FTAG, &db) == 0) {
+               if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
+                   TRUE, FALSE, FTAG, &db) == 0) {
                        caddr_t data;
 
                        /* don't dirty if it isn't on disk and isn't dirty */
@@ -1620,8 +1754,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
        if (tail) {
                if (len < tail)
                        tail = len;
-               if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
-                   TRUE, FTAG, &db) == 0) {
+               if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
+                   TRUE, FALSE, FTAG, &db) == 0) {
                        /* don't dirty if not on disk and not dirty */
                        if (db->db_last_dirty ||
                            (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
@@ -1853,7 +1987,7 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
  */
 static int
 dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
-       int lvl, uint64_t blkfill, uint64_t txg)
+    int lvl, uint64_t blkfill, uint64_t txg)
 {
        dmu_buf_impl_t *db = NULL;
        void *data = NULL;
@@ -1863,9 +1997,6 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
        boolean_t hole;
        int i, inc, error, span;
 
-       dprintf("probing object %llu offset %llx level %d of %u\n",
-           dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
-
        hole = ((flags & DNODE_FIND_HOLE) != 0);
        inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
        ASSERT(txg == 0 || !hole);
@@ -1875,8 +2006,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
                epb = dn->dn_phys->dn_nblkptr;
                data = dn->dn_phys->dn_blkptr;
        } else {
-               uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl);
-               error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db);
+               uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
+               error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
                if (error) {
                        if (error != ENOENT)
                                return (error);
@@ -1910,17 +2041,21 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
                error = SET_ERROR(ESRCH);
        } else if (lvl == 0) {
                dnode_phys_t *dnp = data;
-               span = DNODE_SHIFT;
+
                ASSERT(dn->dn_type == DMU_OT_DNODE);
+               ASSERT(!(flags & DNODE_FIND_BACKWARDS));
 
-               for (i = (*offset >> span) & (blkfill - 1);
-                   i >= 0 && i < blkfill; i += inc) {
+               for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1);
+                   i < blkfill; i += dnp[i].dn_extra_slots + 1) {
                        if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
                                break;
-                       *offset += (1ULL << span) * inc;
                }
-               if (i < 0 || i == blkfill)
+
+               if (i == blkfill)
                        error = SET_ERROR(ESRCH);
+
+               *offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) +
+                   (i << DNODE_SHIFT);
        } else {
                blkptr_t *bp = data;
                uint64_t start = *offset;
@@ -1933,7 +2068,14 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
                else
                        minfill++;
 
-               *offset = *offset >> span;
+               if (span >= 8 * sizeof (*offset)) {
+                       /* This only happens on the highest indirection level */
+                       ASSERT3U((lvl - 1), ==, dn->dn_phys->dn_nlevels - 1);
+                       *offset = 0;
+               } else {
+                       *offset = *offset >> span;
+               }
+
                for (i = BF64_GET(*offset, 0, epbs);
                    i >= 0 && i < epb; i += inc) {
                        if (BP_GET_FILL(&bp[i]) >= minfill &&
@@ -1943,7 +2085,13 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
                        if (inc > 0 || *offset > 0)
                                *offset += inc;
                }
-               *offset = *offset << span;
+
+               if (span >= 8 * sizeof (*offset)) {
+                       *offset = start;
+               } else {
+                       *offset = *offset << span;
+               }
+
                if (inc < 0) {
                        /* traversing backwards; position offset at the end */
                        ASSERT3U(*offset, <=, start);