OpenZFS 7968 - multi-threaded spa_sync()

diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 815696f702180ffb4080c5617fee923cc9633150..fd4acbbaa50177699bba44602c3c6cb36f354676 100644
@@ -20,7 +20,8 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -62,6 +63,31 @@ int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
 static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
 #endif /* _KERNEL */
 
+static int
+dbuf_compare(const void *x1, const void *x2)
+{
+       const dmu_buf_impl_t *d1 = x1;
+       const dmu_buf_impl_t *d2 = x2;
+
+       int cmp = AVL_CMP(d1->db_level, d2->db_level);
+       if (likely(cmp))
+               return (cmp);
+
+       cmp = AVL_CMP(d1->db_blkid, d2->db_blkid);
+       if (likely(cmp))
+               return (cmp);
+
+       if (d1->db_state == DB_SEARCH) {
+               ASSERT3S(d2->db_state, !=, DB_SEARCH);
+               return (-1);
+       } else if (d2->db_state == DB_SEARCH) {
+               ASSERT3S(d1->db_state, !=, DB_SEARCH);
+               return (1);
+       }
+
+       return (AVL_PCMP(d1, d2));
+}
+
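The comparator above keys the new dn_dbufs AVL tree on (db_level, db_blkid) and breaks any remaining tie by address. The DB_SEARCH state marks a stack-allocated key dbuf used only to position avl_find() among resident dbufs sharing the same level and blkid, so it compares unequal to every real dbuf. A minimal standalone sketch of the same contract, assuming AVL_CMP()/AVL_PCMP() carry their usual branch-free three-way definitions from sys/avl.h (an assumption; the header is authoritative):

#include <stdint.h>

/* Assumed expansions of the ZFS helpers from sys/avl.h. */
#define AVL_CMP(a, b)	(((a) > (b)) - ((a) < (b)))
#define AVL_PCMP(a, b)	\
	(((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b)))

struct node {
	uint64_t level;
	uint64_t blkid;
};

/* Order by (level, blkid); fall back to address order for exact ties. */
static int
node_compare(const void *x1, const void *x2)
{
	const struct node *n1 = x1;
	const struct node *n2 = x2;
	int cmp;

	cmp = AVL_CMP(n1->level, n2->level);
	if (cmp != 0)
		return (cmp);

	cmp = AVL_CMP(n1->blkid, n2->blkid);
	if (cmp != 0)
		return (cmp);

	return (AVL_PCMP(n1, n2));
}

The likely() hints reflect that most comparisons are decided by the first two keys; only a DB_SEARCH sentinel or an exact self-comparison reaches the tail.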
 /* ARGSUSED */
 static int
 dnode_cons(void *arg, void *unused, int kmflag)
@@ -69,7 +95,7 @@ dnode_cons(void *arg, void *unused, int kmflag)
        dnode_t *dn = arg;
        int i;
 
-       rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
+       rw_init(&dn->dn_struct_rwlock, NULL, RW_NOLOCKDEP, NULL);
        mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
@@ -115,8 +141,7 @@ dnode_cons(void *arg, void *unused, int kmflag)
        dn->dn_id_flags = 0;
 
        dn->dn_dbufs_count = 0;
-       dn->dn_unlisted_l0_blkid = 0;
-       list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
+       avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
            offsetof(dmu_buf_impl_t, db_link));
 
        dn->dn_moved = 0;
@@ -168,8 +193,7 @@ dnode_dest(void *arg, void *unused)
        ASSERT0(dn->dn_id_flags);
 
        ASSERT0(dn->dn_dbufs_count);
-       ASSERT0(dn->dn_unlisted_l0_blkid);
-       list_destroy(&dn->dn_dbufs);
+       avl_destroy(&dn->dn_dbufs);
 }
 
 void
@@ -210,6 +234,7 @@ dnode_verify(dnode_t *dn)
        }
        if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
                int i;
+               int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
                ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
                if (dn->dn_datablkshift) {
                        ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
@@ -220,12 +245,12 @@ dnode_verify(dnode_t *dn)
                ASSERT(DMU_OT_IS_VALID(dn->dn_type));
                ASSERT3U(dn->dn_nblkptr, >=, 1);
                ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
-               ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+               ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen);
                ASSERT3U(dn->dn_datablksz, ==,
                    dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
                ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
                ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
-                   dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+                   dn->dn_bonuslen, <=, max_bonuslen);
                for (i = 0; i < TXG_SIZE; i++) {
                        ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
                }
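The max_bonuslen checks above scale the old DN_MAX_BONUSLEN limit by the dnode's slot count. A hedged sketch of the arithmetic, assuming the large-dnode macro shapes (sys/dnode.h is authoritative): each slot is 512 bytes, 64 bytes go to the fixed dnode_phys_t header, and one 128-byte blkptr_t is always reserved:

/* Assumed values; see sys/dnode.h and sys/spa.h for the real definitions. */
#define DNODE_SHIFT		9	/* 512-byte minimum dnode */
#define DNODE_CORE_SIZE		64	/* fixed dnode_phys_t header */
#define SPA_BLKPTRSHIFT		7	/* 128-byte blkptr_t */

#define DN_BONUS_SIZE(dnsize)	\
	((dnsize) - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
#define DN_SLOTS_TO_BONUSLEN(slots)	DN_BONUS_SIZE((slots) << DNODE_SHIFT)

/* One slot keeps the classic 320-byte limit; two slots yield 832 bytes. */
_Static_assert(DN_SLOTS_TO_BONUSLEN(1) == 320, "legacy DN_MAX_BONUSLEN");
_Static_assert(DN_SLOTS_TO_BONUSLEN(2) == 832, "one extra 512-byte slot");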
@@ -256,6 +281,7 @@ dnode_byteswap(dnode_phys_t *dnp)
 
        dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
        dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
+       dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots);
        dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
        dnp->dn_used = BSWAP_64(dnp->dn_used);
 
@@ -282,7 +308,8 @@ dnode_byteswap(dnode_phys_t *dnp)
                 * dnode buffer).
                 */
                int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
-               size_t len = DN_MAX_BONUSLEN - off;
+               int slots = dnp->dn_extra_slots + 1;
+               size_t len = DN_SLOTS_TO_BONUSLEN(slots) - off;
                dmu_object_byteswap_t byteswap;
                ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
                byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype);
@@ -291,23 +318,24 @@ dnode_byteswap(dnode_phys_t *dnp)
 
        /* Swap SPILL block if we have one */
        if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
-               byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t));
-
+               byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t));
 }
 
 void
 dnode_buf_byteswap(void *vbuf, size_t size)
 {
-       dnode_phys_t *buf = vbuf;
-       int i;
+       int i = 0;
 
        ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
        ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
 
-       size >>= DNODE_SHIFT;
-       for (i = 0; i < size; i++) {
-               dnode_byteswap(buf);
-               buf++;
+       while (i < size) {
+               dnode_phys_t *dnp = vbuf + i;
+               dnode_byteswap(dnp);
+
+               i += DNODE_MIN_SIZE;
+               if (dnp->dn_type != DMU_OT_NONE)
+                       i += dnp->dn_extra_slots * DNODE_MIN_SIZE;
        }
 }
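dnode_buf_byteswap() now walks the block with a variable stride: a free slot advances one DNODE_MIN_SIZE step, while an allocated dnode additionally skips the dn_extra_slots it consumes, so the swap never lands inside a large dnode. A self-contained sketch of the same walk (the struct here is a stand-in; only dn_type and dn_extra_slots matter):

#include <stddef.h>
#include <stdint.h>

#define DNODE_MIN_SIZE	512
#define DMU_OT_NONE	0

struct mini_dnode_phys {
	uint8_t dn_type;
	uint8_t dn_extra_slots;
	uint8_t pad[510];		/* rest of the 512-byte slot */
};

/* Count allocated dnodes in a block, honoring the variable stride. */
static int
count_dnodes(const void *vbuf, size_t size)
{
	size_t i = 0;
	int n = 0;

	while (i < size) {
		const struct mini_dnode_phys *dnp =
		    (const struct mini_dnode_phys *)((const char *)vbuf + i);

		i += DNODE_MIN_SIZE;	/* a free slot is one stride */
		if (dnp->dn_type != DMU_OT_NONE) {
			n++;
			/* skip the extra slots a large dnode consumes */
			i += (size_t)dnp->dn_extra_slots * DNODE_MIN_SIZE;
		}
	}
	return (n);
}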
 
@@ -318,7 +346,7 @@ dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
 
        dnode_setdirty(dn, tx);
        rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
-       ASSERT3U(newsize, <=, DN_MAX_BONUSLEN -
+       ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
            (dn->dn_nblkptr-1) * sizeof (blkptr_t));
        dn->dn_bonuslen = newsize;
        if (newsize == 0)
@@ -366,8 +394,9 @@ static dnode_t *
 dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
     uint64_t object, dnode_handle_t *dnh)
 {
-       dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
+       dnode_t *dn;
 
+       dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
        ASSERT(!POINTER_IS_VALID(dn->dn_objset));
        dn->dn_moved = 0;
 
@@ -395,6 +424,7 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
        dn->dn_compress = dnp->dn_compress;
        dn->dn_bonustype = dnp->dn_bonustype;
        dn->dn_bonuslen = dnp->dn_bonuslen;
+       dn->dn_num_slots = dnp->dn_extra_slots + 1;
        dn->dn_maxblkid = dnp->dn_maxblkid;
        dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
        dn->dn_id_flags = 0;
@@ -404,16 +434,34 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
        ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
 
        mutex_enter(&os->os_lock);
-       list_insert_head(&os->os_dnodes, dn);
+       if (dnh->dnh_dnode != NULL) {
+               /* Lost the allocation race. */
+               mutex_exit(&os->os_lock);
+               kmem_cache_free(dnode_cache, dn);
+               return (dnh->dnh_dnode);
+       }
+
+       /*
+        * Exclude special dnodes from os_dnodes so an empty os_dnodes
+        * signifies that the special dnodes have no references from
+        * their children (the entries in os_dnodes).  This allows
+        * dnode_destroy() to easily determine if the last child has
+        * been removed and then complete eviction of the objset.
+        */
+       if (!DMU_OBJECT_IS_SPECIAL(object))
+               list_insert_head(&os->os_dnodes, dn);
        membar_producer();
+
        /*
-        * Everything else must be valid before assigning dn_objset makes the
-        * dnode eligible for dnode_move().
+        * Everything else must be valid before assigning dn_objset
+        * makes the dnode eligible for dnode_move().
         */
        dn->dn_objset = os;
+
+       dnh->dnh_dnode = dn;
        mutex_exit(&os->os_lock);
 
-       arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
+       arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE);
        return (dn);
 }
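dnode_create() now resolves concurrent creators under os_lock: the first thread to publish dnh_dnode wins, and the loser frees its freshly constructed dnode and adopts the winner's. A generic sketch of that publish-or-adopt pattern using POSIX threads (the names and slot type are illustrative, not the ZFS API):

#include <pthread.h>
#include <stddef.h>

typedef struct slot {
	pthread_mutex_t	s_lock;
	void		*s_obj;
} slot_t;

/*
 * Publish `mine`, or discard it and adopt what a racing thread already
 * published.  `mine` must be fully initialized before the call, which
 * is roughly the role the membar_producer() above plays for dn_objset.
 */
static void *
publish_or_adopt(slot_t *s, void *mine, void (*discard)(void *))
{
	void *winner;

	pthread_mutex_lock(&s->s_lock);
	if (s->s_obj != NULL) {
		winner = s->s_obj;	/* lost the allocation race */
		pthread_mutex_unlock(&s->s_lock);
		discard(mine);
		return (winner);
	}
	s->s_obj = mine;		/* won; publish for later racers */
	pthread_mutex_unlock(&s->s_lock);
	return (mine);
}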
 
@@ -424,12 +472,18 @@ static void
 dnode_destroy(dnode_t *dn)
 {
        objset_t *os = dn->dn_objset;
+       boolean_t complete_os_eviction = B_FALSE;
 
        ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
 
        mutex_enter(&os->os_lock);
        POINTER_INVALIDATE(&dn->dn_objset);
-       list_remove(&os->os_dnodes, dn);
+       if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+               list_remove(&os->os_dnodes, dn);
+               complete_os_eviction =
+                   list_is_empty(&os->os_dnodes) &&
+                   list_link_active(&os->os_evicting_node);
+       }
        mutex_exit(&os->os_lock);
 
        /* the dnode can no longer move, so we can release the handle */
@@ -446,7 +500,7 @@ dnode_destroy(dnode_t *dn)
        }
        if (dn->dn_bonus != NULL) {
                mutex_enter(&dn->dn_bonus->db_mtx);
-               dbuf_evict(dn->dn_bonus);
+               dbuf_destroy(dn->dn_bonus);
                dn->dn_bonus = NULL;
        }
        dn->dn_zio = NULL;
@@ -459,23 +513,28 @@ dnode_destroy(dnode_t *dn)
        dn->dn_newuid = 0;
        dn->dn_newgid = 0;
        dn->dn_id_flags = 0;
-       dn->dn_unlisted_l0_blkid = 0;
 
-       dmu_zfetch_rele(&dn->dn_zfetch);
+       dmu_zfetch_fini(&dn->dn_zfetch);
        kmem_cache_free(dnode_cache, dn);
-       arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
+       arc_space_return(sizeof (dnode_t), ARC_SPACE_DNODE);
+
+       if (complete_os_eviction)
+               dmu_objset_evict_done(os);
 }
 
 void
 dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
-    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+    dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
 {
        int i;
 
+       ASSERT3U(dn_slots, >, 0);
+       ASSERT3U(dn_slots << DNODE_SHIFT, <=,
+           spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)));
+       ASSERT3U(blocksize, <=,
+           spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
        if (blocksize == 0)
                blocksize = 1 << zfs_default_bs;
-       else if (blocksize > SPA_MAXBLOCKSIZE)
-               blocksize = SPA_MAXBLOCKSIZE;
        else
                blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
 
@@ -484,8 +543,8 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
 
        ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
 
-       dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset,
-           dn->dn_object, tx->tx_txg, blocksize, ibs);
+       dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d dn_slots=%d\n",
+           dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots);
 
        ASSERT(dn->dn_type == DMU_OT_NONE);
        ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
@@ -496,14 +555,14 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
            (bonustype == DMU_OT_SA && bonuslen == 0) ||
            (bonustype != DMU_OT_NONE && bonuslen != 0));
        ASSERT(DMU_OT_IS_VALID(bonustype));
-       ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+       ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots));
        ASSERT(dn->dn_type == DMU_OT_NONE);
        ASSERT0(dn->dn_maxblkid);
        ASSERT0(dn->dn_allocated_txg);
        ASSERT0(dn->dn_assigned_txg);
        ASSERT(refcount_is_zero(&dn->dn_tx_holds));
        ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
-       ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+       ASSERT(avl_is_empty(&dn->dn_dbufs));
 
        for (i = 0; i < TXG_SIZE; i++) {
                ASSERT0(dn->dn_next_nblkptr[i]);
@@ -522,11 +581,15 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
        dnode_setdblksz(dn, blocksize);
        dn->dn_indblkshift = ibs;
        dn->dn_nlevels = 1;
+       dn->dn_num_slots = dn_slots;
        if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
                dn->dn_nblkptr = 1;
-       else
-               dn->dn_nblkptr = 1 +
-                   ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+       else {
+               dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR,
+                   1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
+                   SPA_BLKPTRSHIFT));
+       }
+
        dn->dn_bonustype = bonustype;
        dn->dn_bonuslen = bonuslen;
        dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
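A worked example of the clamped dn_nblkptr computation above, reusing the assumed 512-byte-slot arithmetic (a sketch, not the dnode.h definitions):

#define DN_MAX_NBLKPTR		3
#define SPA_BLKPTRSHIFT		7	/* 128-byte blkptr_t */
#define DN_SLOTS_TO_BONUSLEN(slots)	((slots) * 512 - 64 - 128)	/* assumed */

static int
nblkptr_for(int dn_slots, int bonuslen)
{
	int n = 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
	    SPA_BLKPTRSHIFT);

	return (n < DN_MAX_NBLKPTR ? n : DN_MAX_NBLKPTR);	/* MIN() */
}

/*
 * nblkptr_for(1, 0)   == 3 -- empty bonus leaves the classic 3 blkptrs
 * nblkptr_for(1, 320) == 1 -- a full bonus leaves only the first blkptr
 * nblkptr_for(2, 0)   would be 7 unclamped; the MIN() caps it at
 *                     DN_MAX_NBLKPTR, which is why the clamp arrives
 *                     together with multi-slot dnodes
 */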
@@ -551,12 +614,13 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
 
 void
 dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
-    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+    dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
 {
        int nblkptr;
 
        ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
-       ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
+       ASSERT3U(blocksize, <=,
+           spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
        ASSERT0(blocksize % SPA_MINBLOCKSIZE);
        ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
        ASSERT(tx->tx_txg != 0);
@@ -564,7 +628,10 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
            (bonustype != DMU_OT_NONE && bonuslen != 0) ||
            (bonustype == DMU_OT_SA && bonuslen == 0));
        ASSERT(DMU_OT_IS_VALID(bonustype));
-       ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+       ASSERT3U(bonuslen, <=,
+           DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
+
+       dn_slots = dn_slots > 0 ? dn_slots : DNODE_MIN_SLOTS;
 
        /* clean up any unreferenced dbufs */
        dnode_evict_dbufs(dn);
@@ -587,7 +654,9 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
        if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
                nblkptr = 1;
        else
-               nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+               nblkptr = MIN(DN_MAX_NBLKPTR,
+                   1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
+                   SPA_BLKPTRSHIFT));
        if (dn->dn_bonustype != bonustype)
                dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
        if (dn->dn_nblkptr != nblkptr)
@@ -605,6 +674,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
        mutex_enter(&dn->dn_mtx);
        dn->dn_bonustype = bonustype;
        dn->dn_bonuslen = bonuslen;
+       dn->dn_num_slots = dn_slots;
        dn->dn_nblkptr = nblkptr;
        dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
        dn->dn_compress = ZIO_COMPRESS_INHERIT;
@@ -613,7 +683,8 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
        /* fix up the bonus db_size */
        if (dn->dn_bonus) {
                dn->dn_bonus->db.db_size =
-                   DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+                   DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
+                   (dn->dn_nblkptr-1) * sizeof (blkptr_t);
                ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
        }
 
@@ -689,10 +760,9 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
        ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
        ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
        refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
-       ASSERT(list_is_empty(&ndn->dn_dbufs));
-       list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs);
+       ASSERT(avl_is_empty(&ndn->dn_dbufs));
+       avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
        ndn->dn_dbufs_count = odn->dn_dbufs_count;
-       ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid;
        ndn->dn_bonus = odn->dn_bonus;
        ndn->dn_have_spill = odn->dn_have_spill;
        ndn->dn_zio = odn->dn_zio;
@@ -706,8 +776,6 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
        dmu_zfetch_init(&ndn->dn_zfetch, NULL);
        list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
        ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
-       ndn->dn_zfetch.zf_stream_cnt = odn->dn_zfetch.zf_stream_cnt;
-       ndn->dn_zfetch.zf_alloc_fail = odn->dn_zfetch.zf_alloc_fail;
 
        /*
         * Update back pointers. Updating the handle fixes the back pointer of
@@ -724,10 +792,9 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
         */
        odn->dn_dbuf = NULL;
        odn->dn_handle = NULL;
-       list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t),
+       avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
            offsetof(dmu_buf_impl_t, db_link));
        odn->dn_dbufs_count = 0;
-       odn->dn_unlisted_l0_blkid = 0;
        odn->dn_bonus = NULL;
        odn->dn_zfetch.zf_dnode = NULL;
 
@@ -929,33 +996,32 @@ dnode_special_close(dnode_handle_t *dnh)
         */
        while (refcount_count(&dn->dn_holds) > 0)
                delay(1);
+       ASSERT(dn->dn_dbuf == NULL ||
+           dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
        zrl_add(&dnh->dnh_zrlock);
        dnode_destroy(dn); /* implicit zrl_remove() */
        zrl_destroy(&dnh->dnh_zrlock);
        dnh->dnh_dnode = NULL;
 }
 
-dnode_t *
+void
 dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
     dnode_handle_t *dnh)
 {
-       dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh);
-       dnh->dnh_dnode = dn;
+       dnode_t *dn;
+
+       dn = dnode_create(os, dnp, NULL, object, dnh);
        zrl_init(&dnh->dnh_zrlock);
        DNODE_VERIFY(dn);
-       return (dn);
 }
 
 static void
-dnode_buf_pageout(dmu_buf_t *db, void *arg)
+dnode_buf_evict_async(void *dbu)
 {
-       dnode_children_t *children_dnodes = arg;
+       dnode_children_t *children_dnodes = dbu;
        int i;
-       int epb = db->db_size >> DNODE_SHIFT;
-
-       ASSERT(epb == children_dnodes->dnc_count);
 
-       for (i = 0; i < epb; i++) {
+       for (i = 0; i < children_dnodes->dnc_count; i++) {
                dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
                dnode_t *dn;
 
@@ -985,28 +1051,155 @@ dnode_buf_pageout(dmu_buf_t *db, void *arg)
                dnh->dnh_dnode = NULL;
        }
        kmem_free(children_dnodes, sizeof (dnode_children_t) +
-           (epb - 1) * sizeof (dnode_handle_t));
+           children_dnodes->dnc_count * sizeof (dnode_handle_t));
+}
+
+/*
+ * Return true if the given index is interior to a dnode already
+ * allocated in the block. That is, the index is neither free nor
+ * allocated, but is consumed by a large dnode.
+ *
+ * The dnode_phys_t buffer may not be in sync with the in-core dnode
+ * structure, so we try to check the dnode structure first and fall back
+ * to the dnode_phys_t buffer if it doesn't exist.
+ */
+static boolean_t
+dnode_is_consumed(dmu_buf_impl_t *db, int idx)
+{
+       dnode_handle_t *dnh;
+       dmu_object_type_t ot;
+       dnode_children_t *children_dnodes;
+       dnode_phys_t *dn_block;
+       int skip;
+       int i;
+
+       children_dnodes = dmu_buf_get_user(&db->db);
+       dn_block = (dnode_phys_t *)db->db.db_data;
+
+       for (i = 0; i < idx; i += skip) {
+               dnh = &children_dnodes->dnc_children[i];
+
+               zrl_add(&dnh->dnh_zrlock);
+               if (dnh->dnh_dnode != NULL) {
+                       ot = dnh->dnh_dnode->dn_type;
+                       skip = dnh->dnh_dnode->dn_num_slots;
+               } else {
+                       ot = dn_block[i].dn_type;
+                       skip = dn_block[i].dn_extra_slots + 1;
+               }
+               zrl_remove(&dnh->dnh_zrlock);
+
+               if (ot == DMU_OT_NONE)
+                       skip = 1;
+       }
+
+       return (i > idx);
+}
+
+/*
+ * Return true if the given index in the dnode block is a valid
+ * allocated dnode. That is, the index is not consumed by a large
+ * dnode and is not free.
+ *
+ * The dnode_phys_t buffer may not be in sync with the in-core dnode
+ * structure, so we try to check the dnode structure first and fall back
+ * to the dnode_phys_t buffer if it doesn't exist.
+ */
+static boolean_t
+dnode_is_allocated(dmu_buf_impl_t *db, int idx)
+{
+       dnode_handle_t *dnh;
+       dmu_object_type_t ot;
+       dnode_children_t *children_dnodes;
+       dnode_phys_t *dn_block;
+
+       if (dnode_is_consumed(db, idx))
+               return (B_FALSE);
+
+       children_dnodes = dmu_buf_get_user(&db->db);
+       dn_block = (dnode_phys_t *)db->db.db_data;
+
+       dnh = &children_dnodes->dnc_children[idx];
+
+       zrl_add(&dnh->dnh_zrlock);
+       if (dnh->dnh_dnode != NULL)
+               ot = dnh->dnh_dnode->dn_type;
+       else
+               ot = dn_block[idx].dn_type;
+       zrl_remove(&dnh->dnh_zrlock);
+
+       return (ot != DMU_OT_NONE);
+}
+
+/*
+ * Return true if the given range of indices in the dnode block are
+ * free. That is, the starting index is not consumed by a large dnode
+ * and none of the indices are allocated.
+ *
+ * The dnode_phys_t buffer may not be in sync with the in-core dnode
+ * structure, so we try to check the dnode structure first and fall back
+ * to the dnode_phys_t buffer if it doesn't exist.
+ */
+static boolean_t
+dnode_is_free(dmu_buf_impl_t *db, int idx, int slots)
+{
+       dnode_handle_t *dnh;
+       dmu_object_type_t ot;
+       dnode_children_t *children_dnodes;
+       dnode_phys_t *dn_block;
+       int i;
+
+       if (idx + slots > DNODES_PER_BLOCK)
+               return (B_FALSE);
+
+       children_dnodes = dmu_buf_get_user(&db->db);
+       dn_block = (dnode_phys_t *)db->db.db_data;
+
+       if (dnode_is_consumed(db, idx))
+               return (B_FALSE);
+
+       for (i = idx; i < idx + slots; i++) {
+               dnh = &children_dnodes->dnc_children[i];
+
+               zrl_add(&dnh->dnh_zrlock);
+               if (dnh->dnh_dnode != NULL)
+                       ot = dnh->dnh_dnode->dn_type;
+               else
+                       ot = dn_block[i].dn_type;
+               zrl_remove(&dnh->dnh_zrlock);
+
+               if (ot != DMU_OT_NONE)
+                       return (B_FALSE);
+       }
+
+       return (B_TRUE);
 }
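Together, dnode_is_consumed(), dnode_is_allocated(), and dnode_is_free() classify every slot of a dnode block as free, allocated, or consumed (interior to a large dnode). A worked example for one hypothetical layout:

/*
 * Block layout: a 2-slot dnode at index 0 (dn_extra_slots == 1), a
 * 1-slot dnode at index 2, free slots from index 3 onward.
 *
 *   idx 0: allocated   dnode_is_allocated(db, 0) -> B_TRUE
 *   idx 1: consumed    dnode_is_consumed(db, 1)  -> B_TRUE
 *                      dnode_is_allocated(db, 1) -> B_FALSE
 *   idx 2: allocated   dnode_is_allocated(db, 2) -> B_TRUE
 *   idx 3: free        dnode_is_free(db, 3, 2)   -> B_TRUE (3 and 4 empty)
 *
 * dnode_is_free(db, 1, 1) -> B_FALSE: the starting index is interior,
 * so the dnode_is_consumed() check rejects it before any slot is read.
 */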
 
 /*
  * errors:
  * EINVAL - invalid object number.
+ * ENOSPC - hole too small to fulfill "slots" request
+ * ENOENT - the requested dnode is not allocated
  * EIO - i/o error.
  * succeeds even for free dnodes.
  */
 int
-dnode_hold_impl(objset_t *os, uint64_t object, int flag,
+dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
     void *tag, dnode_t **dnp)
 {
-       int epb, idx, err;
+       int epb, idx, err, i;
        int drop_struct_lock = FALSE;
        int type;
        uint64_t blk;
        dnode_t *mdn, *dn;
        dmu_buf_impl_t *db;
        dnode_children_t *children_dnodes;
+       dnode_phys_t *dn_block_begin;
        dnode_handle_t *dnh;
 
+       ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
+       ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0));
+
        /*
         * If you are holding the spa config lock as writer, you shouldn't
         * be asking the DMU to do *anything* unless it's the root pool
@@ -1046,7 +1239,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
                drop_struct_lock = TRUE;
        }
 
-       blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t));
+       blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
 
        db = dbuf_hold(mdn, blk, FTAG);
        if (drop_struct_lock)
@@ -1062,65 +1255,64 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
        ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
        epb = db->db.db_size >> DNODE_SHIFT;
 
-       idx = object & (epb-1);
-
        ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
        children_dnodes = dmu_buf_get_user(&db->db);
        if (children_dnodes == NULL) {
-               int i;
                dnode_children_t *winner;
-               children_dnodes = kmem_alloc(sizeof (dnode_children_t) +
-                   (epb - 1) * sizeof (dnode_handle_t), KM_SLEEP);
+               children_dnodes = kmem_zalloc(sizeof (dnode_children_t) +
+                   epb * sizeof (dnode_handle_t), KM_SLEEP);
                children_dnodes->dnc_count = epb;
                dnh = &children_dnodes->dnc_children[0];
                for (i = 0; i < epb; i++) {
                        zrl_init(&dnh[i].dnh_zrlock);
-                       dnh[i].dnh_dnode = NULL;
                }
-               if ((winner = dmu_buf_set_user(&db->db, children_dnodes, NULL,
-                   dnode_buf_pageout))) {
+               dmu_buf_init_user(&children_dnodes->dnc_dbu, NULL,
+                   dnode_buf_evict_async, NULL);
+               winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu);
+               if (winner != NULL) {
 
                        for (i = 0; i < epb; i++) {
                                zrl_destroy(&dnh[i].dnh_zrlock);
                        }
 
                        kmem_free(children_dnodes, sizeof (dnode_children_t) +
-                           (epb - 1) * sizeof (dnode_handle_t));
+                           epb * sizeof (dnode_handle_t));
                        children_dnodes = winner;
                }
        }
        ASSERT(children_dnodes->dnc_count == epb);
 
-       dnh = &children_dnodes->dnc_children[idx];
-       zrl_add(&dnh->dnh_zrlock);
-       if ((dn = dnh->dnh_dnode) == NULL) {
-               dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
-               dnode_t *winner;
+       idx = object & (epb - 1);
+       dn_block_begin = (dnode_phys_t *)db->db.db_data;
 
-               dn = dnode_create(os, phys, db, object, dnh);
-               winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn);
-               if (winner != NULL) {
-                       zrl_add(&dnh->dnh_zrlock);
-                       dnode_destroy(dn); /* implicit zrl_remove() */
-                       dn = winner;
-               }
+       if ((flag & DNODE_MUST_BE_FREE) && !dnode_is_free(db, idx, slots)) {
+               dbuf_rele(db, FTAG);
+               return (ENOSPC);
+       } else if ((flag & DNODE_MUST_BE_ALLOCATED) &&
+           !dnode_is_allocated(db, idx)) {
+               dbuf_rele(db, FTAG);
+               return (ENOENT);
        }
 
+       dnh = &children_dnodes->dnc_children[idx];
+       zrl_add(&dnh->dnh_zrlock);
+       dn = dnh->dnh_dnode;
+       if (dn == NULL)
+               dn = dnode_create(os, dn_block_begin + idx, db, object, dnh);
+
        mutex_enter(&dn->dn_mtx);
        type = dn->dn_type;
        if (dn->dn_free_txg ||
-           ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
-           ((flag & DNODE_MUST_BE_FREE) &&
-           (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
+           ((flag & DNODE_MUST_BE_FREE) && !refcount_is_zero(&dn->dn_holds))) {
                mutex_exit(&dn->dn_mtx);
                zrl_remove(&dnh->dnh_zrlock);
                dbuf_rele(db, FTAG);
                return (type == DMU_OT_NONE ? ENOENT : EEXIST);
        }
-       mutex_exit(&dn->dn_mtx);
-
        if (refcount_add(&dn->dn_holds, tag) == 1)
                dbuf_add_ref(db, dnh);
+       mutex_exit(&dn->dn_mtx);
+
        /* Now we can rely on the hold to prevent the dnode from moving. */
        zrl_remove(&dnh->dnh_zrlock);
 
@@ -1139,7 +1331,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
 int
 dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
 {
-       return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
+       return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag,
+           dnp));
 }
 
 /*
@@ -1228,17 +1421,19 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
         */
        dmu_objset_userquota_get_ids(dn, B_TRUE, tx);
 
-       mutex_enter(&os->os_lock);
+       multilist_t *dirtylist = os->os_dirty_dnodes[txg & TXG_MASK];
+       multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn);
 
        /*
         * If we are already marked dirty, we're done.
         */
        if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
-               mutex_exit(&os->os_lock);
+               multilist_sublist_unlock(mls);
                return;
        }
 
-       ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
+       ASSERT(!refcount_is_zero(&dn->dn_holds) ||
+           !avl_is_empty(&dn->dn_dbufs));
        ASSERT(dn->dn_datablksz != 0);
        ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]);
        ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]);
@@ -1247,13 +1442,9 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
        dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
            dn->dn_object, txg);
 
-       if (dn->dn_free_txg > 0 && dn->dn_free_txg <= txg) {
-               list_insert_tail(&os->os_free_dnodes[txg&TXG_MASK], dn);
-       } else {
-               list_insert_tail(&os->os_dirty_dnodes[txg&TXG_MASK], dn);
-       }
+       multilist_sublist_insert_head(mls, dn);
 
-       mutex_exit(&os->os_lock);
+       multilist_sublist_unlock(mls);
 
        /*
         * The dnode maintains a hold on its containing dbuf as
@@ -1274,13 +1465,6 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
 void
 dnode_free(dnode_t *dn, dmu_tx_t *tx)
 {
-       int txgoff = tx->tx_txg & TXG_MASK;
-
-       dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg);
-
-       /* we should be the only holder... hopefully */
-       /* ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */
-
        mutex_enter(&dn->dn_mtx);
        if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
                mutex_exit(&dn->dn_mtx);
@@ -1289,19 +1473,7 @@ dnode_free(dnode_t *dn, dmu_tx_t *tx)
        dn->dn_free_txg = tx->tx_txg;
        mutex_exit(&dn->dn_mtx);
 
-       /*
-        * If the dnode is already dirty, it needs to be moved from
-        * the dirty list to the free list.
-        */
-       mutex_enter(&dn->dn_objset->os_lock);
-       if (list_link_active(&dn->dn_dirty_link[txgoff])) {
-               list_remove(&dn->dn_objset->os_dirty_dnodes[txgoff], dn);
-               list_insert_tail(&dn->dn_objset->os_free_dnodes[txgoff], dn);
-               mutex_exit(&dn->dn_objset->os_lock);
-       } else {
-               mutex_exit(&dn->dn_objset->os_lock);
-               dnode_setdirty(dn, tx);
-       }
+       dnode_setdirty(dn, tx);
 }
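This hunk and the dnode_free() simplification above are the heart of the multi-threaded spa_sync() change: the per-txg dirty-dnode list becomes a multilist, so dnode_setdirty() locks only the sublist its dnode hashes to rather than the objset-wide os_lock, and freed dnodes simply stay on the dirty list instead of migrating to a separate os_free_dnodes list. A minimal sketch of the multilist idea, with an assumed sublist count and hash (the real multilist API sizes and hashes differently):

#include <pthread.h>
#include <stdint.h>

#define NUM_SUBLISTS	8	/* illustrative only */

struct entry {
	uint64_t	e_object;
	struct entry	*e_next;
};

typedef struct sublist {
	pthread_mutex_t	sl_lock;
	struct entry	*sl_head;
} sublist_t;

static sublist_t dirty[NUM_SUBLISTS];

static void
multilist_demo_init(void)
{
	for (int i = 0; i < NUM_SUBLISTS; i++)
		pthread_mutex_init(&dirty[i].sl_lock, NULL);
}

/* Lock only the sublist this object hashes to, then insert at the head. */
static void
multilist_demo_insert(struct entry *e)
{
	sublist_t *sl = &dirty[e->e_object % NUM_SUBLISTS];

	pthread_mutex_lock(&sl->sl_lock);
	e->e_next = sl->sl_head;
	sl->sl_head = e;
	pthread_mutex_unlock(&sl->sl_lock);
}

Because each sublist carries its own lock, sync threads can drain different sublists of the same txg in parallel, which is what lets spa_sync() fan dnode syncing out across tasks.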
 
 /*
@@ -1311,13 +1483,12 @@ dnode_free(dnode_t *dn, dmu_tx_t *tx)
 int
 dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
 {
-       dmu_buf_impl_t *db, *db_next;
+       dmu_buf_impl_t *db;
        int err;
 
+       ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
        if (size == 0)
                size = SPA_MINBLOCKSIZE;
-       if (size > SPA_MAXBLOCKSIZE)
-               size = SPA_MAXBLOCKSIZE;
        else
                size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
 
@@ -1334,9 +1505,8 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
                goto fail;
 
        mutex_enter(&dn->dn_dbufs_mtx);
-       for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
-               db_next = list_next(&dn->dn_dbufs, db);
-
+       for (db = avl_first(&dn->dn_dbufs); db != NULL;
+           db = AVL_NEXT(&dn->dn_dbufs, db)) {
                if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
                    db->db_blkid != DMU_SPILL_BLKID) {
                        mutex_exit(&dn->dn_dbufs_mtx);
@@ -1349,7 +1519,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
                goto fail;
 
        /* resize the old block */
-       err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db);
+       err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
        if (err == 0)
                dbuf_new_size(db, size, tx);
        else if (err != ENOENT)
@@ -1416,6 +1586,8 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
            sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
                new_nlevels++;
 
+       ASSERT3U(new_nlevels, <=, DN_MAX_LEVELS);
+
        if (new_nlevels > dn->dn_nlevels) {
                int old_nlevels = dn->dn_nlevels;
                dmu_buf_impl_t *db;
@@ -1457,6 +1629,16 @@ out:
                rw_downgrade(&dn->dn_struct_rwlock);
 }
 
+static void
+dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
+{
+       dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
+       if (db != NULL) {
+               dmu_buf_will_dirty(&db->db, tx);
+               dbuf_rele(db, FTAG);
+       }
+}
+
 void
 dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
 {
@@ -1512,8 +1694,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
                ASSERT3U(blkoff + head, ==, blksz);
                if (len < head)
                        head = len;
-               if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE,
-                   FTAG, &db) == 0) {
+               if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
+                   TRUE, FALSE, FTAG, &db) == 0) {
                        caddr_t data;
 
                        /* don't dirty if it isn't on disk and isn't dirty */
@@ -1550,8 +1732,8 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
        if (tail) {
                if (len < tail)
                        tail = len;
-               if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
-                   TRUE, FTAG, &db) == 0) {
+               if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
+                   TRUE, FALSE, FTAG, &db) == 0) {
                        /* don't dirty if not on disk and not dirty */
                        if (db->db_last_dirty ||
                            (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
@@ -1577,27 +1759,68 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
                nblks += 1;
 
        /*
-        * Dirty the first and last indirect blocks, as they (and/or their
-        * parents) will need to be written out if they were only
-        * partially freed.  Interior indirect blocks will be themselves freed,
-        * by free_children(), so they need not be dirtied.  Note that these
-        * interior blocks have already been prefetched by dmu_tx_hold_free().
+        * Dirty all the indirect blocks in this range.  Note that only
+        * the first and last indirect blocks can actually be written
+        * (if they were partially freed) -- they must be dirtied, even if
+        * they do not exist on disk yet.  The interior blocks will
+        * be freed by free_children(), so they will not actually be written.
+        * Even though these interior blocks will not be written, we
+        * dirty them for two reasons:
+        *
+        *  - It ensures that the indirect blocks remain in memory until
+        *    syncing context.  (They have already been prefetched by
+        *    dmu_tx_hold_free(), so we don't have to worry about reading
+        *    them serially here.)
+        *
+        *  - The dirty space accounting will put pressure on the txg sync
+        *    mechanism to begin syncing, and to delay transactions if there
+        *    is a large amount of freeing.  Even though these indirect
+        *    blocks will not be written, we could need to write the same
+        *    amount of space if we copy the freed BPs into deadlists.
         */
        if (dn->dn_nlevels > 1) {
-               uint64_t first, last;
+               uint64_t first, last, i, ibyte;
+               int shift, err;
 
                first = blkid >> epbs;
-               if ((db = dbuf_hold_level(dn, 1, first, FTAG))) {
-                       dmu_buf_will_dirty(&db->db, tx);
-                       dbuf_rele(db, FTAG);
-               }
+               dnode_dirty_l1(dn, first, tx);
                if (trunc)
                        last = dn->dn_maxblkid >> epbs;
                else
                        last = (blkid + nblks - 1) >> epbs;
-               if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) {
-                       dmu_buf_will_dirty(&db->db, tx);
-                       dbuf_rele(db, FTAG);
+               if (last != first)
+                       dnode_dirty_l1(dn, last, tx);
+
+               shift = dn->dn_datablkshift + dn->dn_indblkshift -
+                   SPA_BLKPTRSHIFT;
+               for (i = first + 1; i < last; i++) {
+                       /*
+                        * Set i to the blockid of the next non-hole
+                        * level-1 indirect block at or after i.  Note
+                        * that dnode_next_offset() operates in terms of
+                        * level-0-equivalent bytes.
+                        */
+                       ibyte = i << shift;
+                       err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
+                           &ibyte, 2, 1, 0);
+                       i = ibyte >> shift;
+                       if (i >= last)
+                               break;
+
+                       /*
+                        * Normally we should not see an error, either
+                        * from dnode_next_offset() or dbuf_hold_level()
+                        * (except for ESRCH from dnode_next_offset).
+                        * If there is an i/o error, then when we read
+                        * this block in syncing context, it will use
+                        * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
+                        * to the "failmode" property.  dnode_next_offset()
+                        * doesn't have a flag to indicate MUSTSUCCEED.
+                        */
+                       if (err != 0)
+                               break;
+
+                       dnode_dirty_l1(dn, i, tx);
                }
        }
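Worked numbers for the level-1 walk above, assuming 128K data blocks (dn_datablkshift == 17) and 128K indirect blocks (dn_indblkshift == 17, with 128-byte block pointers so SPA_BLKPTRSHIFT == 7):

/*
 * shift = dn_datablkshift + dn_indblkshift - SPA_BLKPTRSHIFT
 *       = 17 + 17 - 7 = 27
 *
 * Each level-1 indirect block therefore maps 2^27 bytes (128 MiB) of
 * level-0 offset.  For L1 blkid i == 5, ibyte = 5ULL << 27 is the
 * 640 MiB mark; after dnode_next_offset() advances ibyte past any
 * holes, i = ibyte >> 27 recovers the next L1 blkid to dirty.
 */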
 
@@ -1703,25 +1926,6 @@ dnode_diduse_space(dnode_t *dn, int64_t delta)
        mutex_exit(&dn->dn_mtx);
 }
 
-/*
- * Call when we think we're going to write/free space in open context to track
- * the amount of memory in use by the currently open txg.
- */
-void
-dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
-{
-       objset_t *os = dn->dn_objset;
-       dsl_dataset_t *ds = os->os_dsl_dataset;
-       int64_t aspace = spa_get_asize(os->os_spa, space);
-
-       if (ds != NULL) {
-               dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
-               dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
-       }
-
-       dmu_tx_willuse_space(tx, aspace);
-}
-
 /*
  * Scans a block at the indicated "level" looking for a hole or data,
  * depending on 'flags'.
@@ -1742,7 +1946,7 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
  */
 static int
 dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
-       int lvl, uint64_t blkfill, uint64_t txg)
+    int lvl, uint64_t blkfill, uint64_t txg)
 {
        dmu_buf_impl_t *db = NULL;
        void *data = NULL;
@@ -1752,9 +1956,6 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
        boolean_t hole;
        int i, inc, error, span;
 
-       dprintf("probing object %llu offset %llx level %d of %u\n",
-           dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
-
        hole = ((flags & DNODE_FIND_HOLE) != 0);
        inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
        ASSERT(txg == 0 || !hole);
@@ -1764,8 +1965,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
                epb = dn->dn_phys->dn_nblkptr;
                data = dn->dn_phys->dn_blkptr;
        } else {
-               uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl);
-               error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db);
+               uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
+               error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
                if (error) {
                        if (error != ENOENT)
                                return (error);
@@ -1799,17 +2000,21 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
                error = SET_ERROR(ESRCH);
        } else if (lvl == 0) {
                dnode_phys_t *dnp = data;
-               span = DNODE_SHIFT;
+
                ASSERT(dn->dn_type == DMU_OT_DNODE);
+               ASSERT(!(flags & DNODE_FIND_BACKWARDS));
 
-               for (i = (*offset >> span) & (blkfill - 1);
-                   i >= 0 && i < blkfill; i += inc) {
+               for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1);
+                   i < blkfill; i += dnp[i].dn_extra_slots + 1) {
                        if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
                                break;
-                       *offset += (1ULL << span) * inc;
                }
-               if (i < 0 || i == blkfill)
+
+               if (i == blkfill)
                        error = SET_ERROR(ESRCH);
+
+               *offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) +
+                   (i << DNODE_SHIFT);
        } else {
                blkptr_t *bp = data;
                uint64_t start = *offset;
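In the level-0 branch above (scanning the metadnode's own blocks for allocated dnodes or holes), the iterator now strides by dn_extra_slots + 1 so it can never land inside a multi-slot dnode. That is also why the new ASSERT forbids DNODE_FIND_BACKWARDS at this level: walking backwards, there is no cheap way to tell whether slot i - 1 is free or the interior of a large dnode. A worked trace with hypothetical slot contents:

/*
 * blkfill == 32 (a 16K dnode block), searching for allocated dnodes:
 *
 *   i = 0: 2-slot dnode (dn_extra_slots == 1) -> match, or step to i = 2
 *   i = 2: free slot                          -> step to i = 3
 *   i = 3: 1-slot dnode                       -> match, or step to i = 4
 *
 * On a match, *offset is rebuilt as the DNODE_BLOCK_SIZE-aligned base
 * plus (i << DNODE_SHIFT); reaching i == blkfill yields ESRCH.
 */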
@@ -1822,7 +2027,14 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
                else
                        minfill++;
 
-               *offset = *offset >> span;
+               if (span >= 8 * sizeof (*offset)) {
+                       /* This only happens on the highest indirection level */
+                       ASSERT3U((lvl - 1), ==, dn->dn_phys->dn_nlevels - 1);
+                       *offset = 0;
+               } else {
+                       *offset = *offset >> span;
+               }
+
                for (i = BF64_GET(*offset, 0, epbs);
                    i >= 0 && i < epb; i += inc) {
                        if (BP_GET_FILL(&bp[i]) >= minfill &&
@@ -1832,7 +2044,13 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
                        if (inc > 0 || *offset > 0)
                                *offset += inc;
                }
-               *offset = *offset << span;
+
+               if (span >= 8 * sizeof (*offset)) {
+                       *offset = start;
+               } else {
+                       *offset = *offset << span;
+               }
+
                if (inc < 0) {
                        /* traversing backwards; position offset at the end */
                        ASSERT3U(*offset, <=, start);
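One note on the two span guards added above: shifting a 64-bit offset by 64 or more bits is undefined behavior in C. Assuming span grows as (lvl - 1) * epbs + dn_datablkshift (the assignment sits just above this hunk, outside the shown context), a worked example:

/*
 * epbs == 10 (128K indirect blocks), dn_datablkshift == 17:
 *
 *   lvl 1: span = 17    lvl 5: span = 57
 *   lvl 2: span = 27    lvl 6: span = 67  -> >= 64, guard taken
 *
 * Only the single top-level indirect block can reach this, matching
 * ASSERT3U((lvl - 1), ==, dn->dn_phys->dn_nlevels - 1); the guards
 * scan that block starting from offset 0 and restore the caller's
 * offset afterward instead of performing the undefined shift.
 */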