New upstream version 0.7.11

diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index 3a8a5e32edd6e17109dad51e3db0e771f577db2f..d465b545a9935fa033d376ed73086be101307184 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -20,7 +20,8 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/dmu_zfetch.h>
-
-static int free_range_compar(const void *node1, const void *node2);
-
+#include <sys/range_tree.h>
+#include <sys/trace_dnode.h>
+
+dnode_stats_t dnode_stats = {
+       { "dnode_hold_dbuf_hold",               KSTAT_DATA_UINT64 },
+       { "dnode_hold_dbuf_read",               KSTAT_DATA_UINT64 },
+       { "dnode_hold_alloc_hits",              KSTAT_DATA_UINT64 },
+       { "dnode_hold_alloc_misses",            KSTAT_DATA_UINT64 },
+       { "dnode_hold_alloc_interior",          KSTAT_DATA_UINT64 },
+       { "dnode_hold_alloc_lock_retry",        KSTAT_DATA_UINT64 },
+       { "dnode_hold_alloc_lock_misses",       KSTAT_DATA_UINT64 },
+       { "dnode_hold_alloc_type_none",         KSTAT_DATA_UINT64 },
+       { "dnode_hold_free_hits",               KSTAT_DATA_UINT64 },
+       { "dnode_hold_free_misses",             KSTAT_DATA_UINT64 },
+       { "dnode_hold_free_lock_misses",        KSTAT_DATA_UINT64 },
+       { "dnode_hold_free_lock_retry",         KSTAT_DATA_UINT64 },
+       { "dnode_hold_free_overflow",           KSTAT_DATA_UINT64 },
+       { "dnode_hold_free_refcount",           KSTAT_DATA_UINT64 },
+       { "dnode_hold_free_txg",                KSTAT_DATA_UINT64 },
+       { "dnode_free_interior_lock_retry",     KSTAT_DATA_UINT64 },
+       { "dnode_allocate",                     KSTAT_DATA_UINT64 },
+       { "dnode_reallocate",                   KSTAT_DATA_UINT64 },
+       { "dnode_buf_evict",                    KSTAT_DATA_UINT64 },
+       { "dnode_alloc_next_chunk",             KSTAT_DATA_UINT64 },
+       { "dnode_alloc_race",                   KSTAT_DATA_UINT64 },
+       { "dnode_alloc_next_block",             KSTAT_DATA_UINT64 },
+       { "dnode_move_invalid",                 KSTAT_DATA_UINT64 },
+       { "dnode_move_recheck1",                KSTAT_DATA_UINT64 },
+       { "dnode_move_recheck2",                KSTAT_DATA_UINT64 },
+       { "dnode_move_special",                 KSTAT_DATA_UINT64 },
+       { "dnode_move_handle",                  KSTAT_DATA_UINT64 },
+       { "dnode_move_rwlock",                  KSTAT_DATA_UINT64 },
+       { "dnode_move_active",                  KSTAT_DATA_UINT64 },
+};
+
+static kstat_t *dnode_ksp;
 static kmem_cache_t *dnode_cache;
-/*
- * Define DNODE_STATS to turn on statistic gathering. By default, it is only
- * turned on when DEBUG is also defined.
- */
-#ifdef DEBUG
-#define        DNODE_STATS
-#endif /* DEBUG */
-
-#ifdef DNODE_STATS
-#define        DNODE_STAT_ADD(stat)                    ((stat)++)
-#else
-#define        DNODE_STAT_ADD(stat)                    /* nothing */
-#endif /* DNODE_STATS */
 
 ASSERTV(static dnode_phys_t dnode_phys_zero);
 
@@ -62,6 +83,31 @@ int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
 static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
 #endif /* _KERNEL */
 
+static int
+dbuf_compare(const void *x1, const void *x2)
+{
+       const dmu_buf_impl_t *d1 = x1;
+       const dmu_buf_impl_t *d2 = x2;
+
+       int cmp = AVL_CMP(d1->db_level, d2->db_level);
+       if (likely(cmp))
+               return (cmp);
+
+       cmp = AVL_CMP(d1->db_blkid, d2->db_blkid);
+       if (likely(cmp))
+               return (cmp);
+
+       if (d1->db_state == DB_SEARCH) {
+               ASSERT3S(d2->db_state, !=, DB_SEARCH);
+               return (-1);
+       } else if (d2->db_state == DB_SEARCH) {
+               ASSERT3S(d1->db_state, !=, DB_SEARCH);
+               return (1);
+       }
+
+       return (AVL_PCMP(d1, d2));
+}
+
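[Annotation] The DB_SEARCH arms above let callers probe dn_dbufs with a stack sentinel that compares deterministically against any real dbuf sharing the same level and blkid. A sketch of the lookup idiom this comparator enables (names assumed from the 0.7-era dbuf.c; illustrative only, not part of the patch):

        dmu_buf_impl_t db_search;
        dmu_buf_impl_t *db;
        avl_index_t where;

        db_search.db_level = 0;
        db_search.db_blkid = start_blkid;
        db_search.db_state = DB_SEARCH;

        mutex_enter(&dn->dn_dbufs_mtx);
        db = avl_find(&dn->dn_dbufs, &db_search, &where);
        ASSERT3P(db, ==, NULL); /* a DB_SEARCH key never equals a real dbuf */
        db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
        mutex_exit(&dn->dn_dbufs_mtx);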
 /* ARGSUSED */
 static int
 dnode_cons(void *arg, void *unused, int kmflag)
@@ -69,12 +115,16 @@ dnode_cons(void *arg, void *unused, int kmflag)
        dnode_t *dn = arg;
        int i;
 
-       rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
+       rw_init(&dn->dn_struct_rwlock, NULL, RW_NOLOCKDEP, NULL);
        mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
 
-       refcount_create(&dn->dn_holds);
+       /*
+        * Every dbuf has a reference, and dropping a tracked reference is
+        * O(number of references), so don't track dn_holds.
+        */
+       refcount_create_untracked(&dn->dn_holds);
        refcount_create(&dn->dn_tx_holds);
        list_link_init(&dn->dn_link);
 
@@ -88,9 +138,7 @@ dnode_cons(void *arg, void *unused, int kmflag)
 
        for (i = 0; i < TXG_SIZE; i++) {
                list_link_init(&dn->dn_dirty_link[i]);
-               avl_create(&dn->dn_ranges[i], free_range_compar,
-                   sizeof (free_range_t),
-                   offsetof(struct free_range, fr_node));
+               dn->dn_free_ranges[i] = NULL;
                list_create(&dn->dn_dirty_records[i],
                    sizeof (dbuf_dirty_record_t),
                    offsetof(dbuf_dirty_record_t, dr_dirty_node));
@@ -113,7 +161,7 @@ dnode_cons(void *arg, void *unused, int kmflag)
        dn->dn_id_flags = 0;
 
        dn->dn_dbufs_count = 0;
-       list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
+       avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
            offsetof(dmu_buf_impl_t, db_link));
 
        dn->dn_moved = 0;
@@ -137,35 +185,35 @@ dnode_dest(void *arg, void *unused)
 
        for (i = 0; i < TXG_SIZE; i++) {
                ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
-               avl_destroy(&dn->dn_ranges[i]);
+               ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
                list_destroy(&dn->dn_dirty_records[i]);
-               ASSERT3U(dn->dn_next_nblkptr[i], ==, 0);
-               ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
-               ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
-               ASSERT3U(dn->dn_next_bonustype[i], ==, 0);
-               ASSERT3U(dn->dn_rm_spillblk[i], ==, 0);
-               ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
-               ASSERT3U(dn->dn_next_blksz[i], ==, 0);
-       }
-
-       ASSERT3U(dn->dn_allocated_txg, ==, 0);
-       ASSERT3U(dn->dn_free_txg, ==, 0);
-       ASSERT3U(dn->dn_assigned_txg, ==, 0);
-       ASSERT3U(dn->dn_dirtyctx, ==, 0);
+               ASSERT0(dn->dn_next_nblkptr[i]);
+               ASSERT0(dn->dn_next_nlevels[i]);
+               ASSERT0(dn->dn_next_indblkshift[i]);
+               ASSERT0(dn->dn_next_bonustype[i]);
+               ASSERT0(dn->dn_rm_spillblk[i]);
+               ASSERT0(dn->dn_next_bonuslen[i]);
+               ASSERT0(dn->dn_next_blksz[i]);
+       }
+
+       ASSERT0(dn->dn_allocated_txg);
+       ASSERT0(dn->dn_free_txg);
+       ASSERT0(dn->dn_assigned_txg);
+       ASSERT0(dn->dn_dirtyctx);
        ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
        ASSERT3P(dn->dn_bonus, ==, NULL);
        ASSERT(!dn->dn_have_spill);
        ASSERT3P(dn->dn_zio, ==, NULL);
-       ASSERT3U(dn->dn_oldused, ==, 0);
-       ASSERT3U(dn->dn_oldflags, ==, 0);
-       ASSERT3U(dn->dn_olduid, ==, 0);
-       ASSERT3U(dn->dn_oldgid, ==, 0);
-       ASSERT3U(dn->dn_newuid, ==, 0);
-       ASSERT3U(dn->dn_newgid, ==, 0);
-       ASSERT3U(dn->dn_id_flags, ==, 0);
-
-       ASSERT3U(dn->dn_dbufs_count, ==, 0);
-       list_destroy(&dn->dn_dbufs);
+       ASSERT0(dn->dn_oldused);
+       ASSERT0(dn->dn_oldflags);
+       ASSERT0(dn->dn_olduid);
+       ASSERT0(dn->dn_oldgid);
+       ASSERT0(dn->dn_newuid);
+       ASSERT0(dn->dn_newgid);
+       ASSERT0(dn->dn_id_flags);
+
+       ASSERT0(dn->dn_dbufs_count);
+       avl_destroy(&dn->dn_dbufs);
 }
 
 void
@@ -173,13 +221,26 @@ dnode_init(void)
 {
        ASSERT(dnode_cache == NULL);
        dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t),
-           0, dnode_cons, dnode_dest, NULL, NULL, NULL, KMC_KMEM);
+           0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
        kmem_cache_set_move(dnode_cache, dnode_move);
+
+       dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc",
+           KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t),
+           KSTAT_FLAG_VIRTUAL);
+       if (dnode_ksp != NULL) {
+               dnode_ksp->ks_data = &dnode_stats;
+               kstat_install(dnode_ksp);
+       }
 }
 
 void
 dnode_fini(void)
 {
+       if (dnode_ksp != NULL) {
+               kstat_delete(dnode_ksp);
+               dnode_ksp = NULL;
+       }
+
        kmem_cache_destroy(dnode_cache);
        dnode_cache = NULL;
 }
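[Annotation] The DNODE_STAT_BUMP() calls added throughout this patch increment the kstat array installed above; on Linux the counters are readable from /proc/spl/kstat/zfs/dnodestats. The macro side lives in sys/dnode.h and is roughly an atomic add (sketch, modulo exact formatting):

        extern dnode_stats_t dnode_stats;

        #define DNODE_STAT_INCR(stat, val) \
                atomic_add_64(&dnode_stats.stat.value.ui64, (val))
        #define DNODE_STAT_BUMP(stat) \
                DNODE_STAT_INCR(stat, 1)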
@@ -206,6 +267,7 @@ dnode_verify(dnode_t *dn)
        }
        if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
                int i;
+               int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
                ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
                if (dn->dn_datablkshift) {
                        ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
@@ -216,12 +278,12 @@ dnode_verify(dnode_t *dn)
                ASSERT(DMU_OT_IS_VALID(dn->dn_type));
                ASSERT3U(dn->dn_nblkptr, >=, 1);
                ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
-               ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+               ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen);
                ASSERT3U(dn->dn_datablksz, ==,
                    dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
                ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
                ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
-                   dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+                   dn->dn_bonuslen, <=, max_bonuslen);
                for (i = 0; i < TXG_SIZE; i++) {
                        ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
                }
@@ -252,6 +314,7 @@ dnode_byteswap(dnode_phys_t *dnp)
 
        dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
        dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
+       dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots);
        dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
        dnp->dn_used = BSWAP_64(dnp->dn_used);
 
@@ -278,7 +341,8 @@ dnode_byteswap(dnode_phys_t *dnp)
                 * dnode buffer).
                 */
                int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
-               size_t len = DN_MAX_BONUSLEN - off;
+               int slots = dnp->dn_extra_slots + 1;
+               size_t len = DN_SLOTS_TO_BONUSLEN(slots) - off;
                dmu_object_byteswap_t byteswap;
                ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
                byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype);
@@ -287,37 +351,25 @@ dnode_byteswap(dnode_phys_t *dnp)
 
        /* Swap SPILL block if we have one */
        if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
-               byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t));
-
+               byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t));
 }
 
 void
 dnode_buf_byteswap(void *vbuf, size_t size)
 {
-       dnode_phys_t *buf = vbuf;
-       int i;
+       int i = 0;
 
        ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
        ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
 
-       size >>= DNODE_SHIFT;
-       for (i = 0; i < size; i++) {
-               dnode_byteswap(buf);
-               buf++;
-       }
-}
-
-static int
-free_range_compar(const void *node1, const void *node2)
-{
-       const free_range_t *rp1 = node1;
-       const free_range_t *rp2 = node2;
+       while (i < size) {
+               dnode_phys_t *dnp = (void *)(((char *)vbuf) + i);
+               dnode_byteswap(dnp);
 
-       if (rp1->fr_blkid < rp2->fr_blkid)
-               return (-1);
-       else if (rp1->fr_blkid > rp2->fr_blkid)
-               return (1);
-       else return (0);
+               i += DNODE_MIN_SIZE;
+               if (dnp->dn_type != DMU_OT_NONE)
+                       i += dnp->dn_extra_slots * DNODE_MIN_SIZE;
+       }
 }
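[Annotation] The rewritten dnode_buf_byteswap() walks the buffer in DNODE_MIN_SIZE (512-byte) strides and skips the extra slots claimed by large dnodes. The bonus-space macros this patch uses everywhere reduce to simple slot arithmetic; from the 0.7-era sys/dnode.h, approximately:

        #define DNODE_SHIFT             9       /* 512 bytes */
        #define DNODE_MIN_SIZE          (1 << DNODE_SHIFT)
        #define DNODE_CORE_SIZE         64      /* dnode sans blkptrs/bonus */

        #define DN_BONUS_SIZE(dnsize) \
                ((dnsize) - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT))
        #define DN_SLOTS_TO_BONUSLEN(slots) \
                DN_BONUS_SIZE((slots) << DNODE_SHIFT)

So a classic one-slot dnode keeps the historical 320-byte bonus maximum (512 - 64 - 128), and each extra slot adds a further 512 bytes of bonus space.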
 
 void
@@ -327,7 +379,7 @@ dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
 
        dnode_setdirty(dn, tx);
        rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
-       ASSERT3U(newsize, <=, DN_MAX_BONUSLEN -
+       ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
            (dn->dn_nblkptr-1) * sizeof (blkptr_t));
        dn->dn_bonuslen = newsize;
        if (newsize == 0)
@@ -361,22 +413,23 @@ dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
 static void
 dnode_setdblksz(dnode_t *dn, int size)
 {
-       ASSERT3U(P2PHASE(size, SPA_MINBLOCKSIZE), ==, 0);
+       ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
        ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
        ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
        ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
            1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
        dn->dn_datablksz = size;
        dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
-       dn->dn_datablkshift = ISP2(size) ? highbit(size - 1) : 0;
+       dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
 }
 
 static dnode_t *
 dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
     uint64_t object, dnode_handle_t *dnh)
 {
-       dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_PUSHPAGE);
+       dnode_t *dn;
 
+       dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
        ASSERT(!POINTER_IS_VALID(dn->dn_objset));
        dn->dn_moved = 0;
 
@@ -404,6 +457,7 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
        dn->dn_compress = dnp->dn_compress;
        dn->dn_bonustype = dnp->dn_bonustype;
        dn->dn_bonuslen = dnp->dn_bonuslen;
+       dn->dn_num_slots = dnp->dn_extra_slots + 1;
        dn->dn_maxblkid = dnp->dn_maxblkid;
        dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
        dn->dn_id_flags = 0;
@@ -411,18 +465,33 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
        dmu_zfetch_init(&dn->dn_zfetch, dn);
 
        ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
+       ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
+       ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode));
 
        mutex_enter(&os->os_lock);
-       list_insert_head(&os->os_dnodes, dn);
+
+       /*
+        * Exclude special dnodes from os_dnodes so an empty os_dnodes
+        * signifies that the special dnodes have no references from
+        * their children (the entries in os_dnodes).  This allows
+        * dnode_destroy() to easily determine if the last child has
+        * been removed and then complete eviction of the objset.
+        */
+       if (!DMU_OBJECT_IS_SPECIAL(object))
+               list_insert_head(&os->os_dnodes, dn);
        membar_producer();
+
        /*
-        * Everything else must be valid before assigning dn_objset makes the
-        * dnode eligible for dnode_move().
+        * Everything else must be valid before assigning dn_objset
+        * makes the dnode eligible for dnode_move().
         */
        dn->dn_objset = os;
+
+       dnh->dnh_dnode = dn;
        mutex_exit(&os->os_lock);
 
-       arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
+       arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE);
+
        return (dn);
 }
 
@@ -433,16 +502,23 @@ static void
 dnode_destroy(dnode_t *dn)
 {
        objset_t *os = dn->dn_objset;
+       boolean_t complete_os_eviction = B_FALSE;
 
        ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
 
        mutex_enter(&os->os_lock);
        POINTER_INVALIDATE(&dn->dn_objset);
-       list_remove(&os->os_dnodes, dn);
+       if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+               list_remove(&os->os_dnodes, dn);
+               complete_os_eviction =
+                   list_is_empty(&os->os_dnodes) &&
+                   list_link_active(&os->os_evicting_node);
+       }
        mutex_exit(&os->os_lock);
 
        /* the dnode can no longer move, so we can release the handle */
-       zrl_remove(&dn->dn_handle->dnh_zrlock);
+       if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock))
+               zrl_remove(&dn->dn_handle->dnh_zrlock);
 
        dn->dn_allocated_txg = 0;
        dn->dn_free_txg = 0;
@@ -455,7 +531,7 @@ dnode_destroy(dnode_t *dn)
        }
        if (dn->dn_bonus != NULL) {
                mutex_enter(&dn->dn_bonus->db_mtx);
-               dbuf_evict(dn->dn_bonus);
+               dbuf_destroy(dn->dn_bonus);
                dn->dn_bonus = NULL;
        }
        dn->dn_zio = NULL;
@@ -469,21 +545,27 @@ dnode_destroy(dnode_t *dn)
        dn->dn_newgid = 0;
        dn->dn_id_flags = 0;
 
-       dmu_zfetch_rele(&dn->dn_zfetch);
+       dmu_zfetch_fini(&dn->dn_zfetch);
        kmem_cache_free(dnode_cache, dn);
-       arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
+       arc_space_return(sizeof (dnode_t), ARC_SPACE_DNODE);
+
+       if (complete_os_eviction)
+               dmu_objset_evict_done(os);
 }
 
 void
 dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
-    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+    dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
 {
        int i;
 
+       ASSERT3U(dn_slots, >, 0);
+       ASSERT3U(dn_slots << DNODE_SHIFT, <=,
+           spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)));
+       ASSERT3U(blocksize, <=,
+           spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
        if (blocksize == 0)
                blocksize = 1 << zfs_default_bs;
-       else if (blocksize > SPA_MAXBLOCKSIZE)
-               blocksize = SPA_MAXBLOCKSIZE;
        else
                blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
 
@@ -492,8 +574,9 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
 
        ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
 
-       dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset,
-           dn->dn_object, tx->tx_txg, blocksize, ibs);
+       dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d dn_slots=%d\n",
+           dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots);
+       DNODE_STAT_BUMP(dnode_allocate);
 
        ASSERT(dn->dn_type == DMU_OT_NONE);
        ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
@@ -504,37 +587,41 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
            (bonustype == DMU_OT_SA && bonuslen == 0) ||
            (bonustype != DMU_OT_NONE && bonuslen != 0));
        ASSERT(DMU_OT_IS_VALID(bonustype));
-       ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+       ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots));
        ASSERT(dn->dn_type == DMU_OT_NONE);
-       ASSERT3U(dn->dn_maxblkid, ==, 0);
-       ASSERT3U(dn->dn_allocated_txg, ==, 0);
-       ASSERT3U(dn->dn_assigned_txg, ==, 0);
+       ASSERT0(dn->dn_maxblkid);
+       ASSERT0(dn->dn_allocated_txg);
+       ASSERT0(dn->dn_assigned_txg);
        ASSERT(refcount_is_zero(&dn->dn_tx_holds));
        ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
-       ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+       ASSERT(avl_is_empty(&dn->dn_dbufs));
 
        for (i = 0; i < TXG_SIZE; i++) {
-               ASSERT3U(dn->dn_next_nblkptr[i], ==, 0);
-               ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
-               ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
-               ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
-               ASSERT3U(dn->dn_next_bonustype[i], ==, 0);
-               ASSERT3U(dn->dn_rm_spillblk[i], ==, 0);
-               ASSERT3U(dn->dn_next_blksz[i], ==, 0);
+               ASSERT0(dn->dn_next_nblkptr[i]);
+               ASSERT0(dn->dn_next_nlevels[i]);
+               ASSERT0(dn->dn_next_indblkshift[i]);
+               ASSERT0(dn->dn_next_bonuslen[i]);
+               ASSERT0(dn->dn_next_bonustype[i]);
+               ASSERT0(dn->dn_rm_spillblk[i]);
+               ASSERT0(dn->dn_next_blksz[i]);
                ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
                ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
-               ASSERT3U(avl_numnodes(&dn->dn_ranges[i]), ==, 0);
+               ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
        }
 
        dn->dn_type = ot;
        dnode_setdblksz(dn, blocksize);
        dn->dn_indblkshift = ibs;
        dn->dn_nlevels = 1;
+       dn->dn_num_slots = dn_slots;
        if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
                dn->dn_nblkptr = 1;
-       else
-               dn->dn_nblkptr = 1 +
-                   ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+       else {
+               dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR,
+                   1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
+                   SPA_BLKPTRSHIFT));
+       }
+
        dn->dn_bonustype = bonustype;
        dn->dn_bonuslen = bonuslen;
        dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
@@ -559,20 +646,26 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
 
 void
 dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
-    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+    dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
 {
        int nblkptr;
 
        ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
-       ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
-       ASSERT3U(blocksize % SPA_MINBLOCKSIZE, ==, 0);
+       ASSERT3U(blocksize, <=,
+           spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
+       ASSERT0(blocksize % SPA_MINBLOCKSIZE);
        ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
        ASSERT(tx->tx_txg != 0);
        ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
            (bonustype != DMU_OT_NONE && bonuslen != 0) ||
            (bonustype == DMU_OT_SA && bonuslen == 0));
        ASSERT(DMU_OT_IS_VALID(bonustype));
-       ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+       ASSERT3U(bonuslen, <=,
+           DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
+       ASSERT3U(bonuslen, <=, DN_BONUS_SIZE(dn_slots << DNODE_SHIFT));
+
+       dnode_free_interior_slots(dn);
+       DNODE_STAT_BUMP(dnode_reallocate);
 
        /* clean up any unreferenced dbufs */
        dnode_evict_dbufs(dn);
@@ -595,7 +688,9 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
        if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
                nblkptr = 1;
        else
-               nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+               nblkptr = MIN(DN_MAX_NBLKPTR,
+                   1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
+                   SPA_BLKPTRSHIFT));
        if (dn->dn_bonustype != bonustype)
                dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
        if (dn->dn_nblkptr != nblkptr)
@@ -613,6 +708,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
        mutex_enter(&dn->dn_mtx);
        dn->dn_bonustype = bonustype;
        dn->dn_bonuslen = bonuslen;
+       dn->dn_num_slots = dn_slots;
        dn->dn_nblkptr = nblkptr;
        dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
        dn->dn_compress = ZIO_COMPRESS_INHERIT;
@@ -621,7 +717,8 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
        /* fix up the bonus db_size */
        if (dn->dn_bonus) {
                dn->dn_bonus->db.db_size =
-                   DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+                   DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
+                   (dn->dn_nblkptr-1) * sizeof (blkptr_t);
                ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
        }
 
@@ -630,18 +727,6 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
 }
 
 #ifdef _KERNEL
-#ifdef DNODE_STATS
-static struct {
-       uint64_t dms_dnode_invalid;
-       uint64_t dms_dnode_recheck1;
-       uint64_t dms_dnode_recheck2;
-       uint64_t dms_dnode_special;
-       uint64_t dms_dnode_handle;
-       uint64_t dms_dnode_rwlock;
-       uint64_t dms_dnode_active;
-} dnode_move_stats;
-#endif /* DNODE_STATS */
-
 static void
 dnode_move_impl(dnode_t *odn, dnode_t *ndn)
 {
@@ -670,6 +755,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
        ndn->dn_datablkszsec = odn->dn_datablkszsec;
        ndn->dn_datablksz = odn->dn_datablksz;
        ndn->dn_maxblkid = odn->dn_maxblkid;
+       ndn->dn_num_slots = odn->dn_num_slots;
        bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
            sizeof (odn->dn_next_nblkptr));
        bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
@@ -688,7 +774,8 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
                list_move_tail(&ndn->dn_dirty_records[i],
                    &odn->dn_dirty_records[i]);
        }
-       bcopy(&odn->dn_ranges[0], &ndn->dn_ranges[0], sizeof (odn->dn_ranges));
+       bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0],
+           sizeof (odn->dn_free_ranges));
        ndn->dn_allocated_txg = odn->dn_allocated_txg;
        ndn->dn_free_txg = odn->dn_free_txg;
        ndn->dn_assigned_txg = odn->dn_assigned_txg;
@@ -696,8 +783,8 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
        ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
        ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
        refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
-       ASSERT(list_is_empty(&ndn->dn_dbufs));
-       list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs);
+       ASSERT(avl_is_empty(&ndn->dn_dbufs));
+       avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
        ndn->dn_dbufs_count = odn->dn_dbufs_count;
        ndn->dn_bonus = odn->dn_bonus;
        ndn->dn_have_spill = odn->dn_have_spill;
@@ -712,8 +799,6 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
        dmu_zfetch_init(&ndn->dn_zfetch, NULL);
        list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
        ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
-       ndn->dn_zfetch.zf_stream_cnt = odn->dn_zfetch.zf_stream_cnt;
-       ndn->dn_zfetch.zf_alloc_fail = odn->dn_zfetch.zf_alloc_fail;
 
        /*
         * Update back pointers. Updating the handle fixes the back pointer of
@@ -730,7 +815,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
         */
        odn->dn_dbuf = NULL;
        odn->dn_handle = NULL;
-       list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t),
+       avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
            offsetof(dmu_buf_impl_t, db_link));
        odn->dn_dbufs_count = 0;
        odn->dn_bonus = NULL;
@@ -749,8 +834,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
                list_create(&odn->dn_dirty_records[i],
                    sizeof (dbuf_dirty_record_t),
                    offsetof(dbuf_dirty_record_t, dr_dirty_node));
-               odn->dn_ranges[i].avl_root = NULL;
-               odn->dn_ranges[i].avl_numnodes = 0;
+               odn->dn_free_ranges[i] = NULL;
                odn->dn_next_nlevels[i] = 0;
                odn->dn_next_indblkshift[i] = 0;
                odn->dn_next_bonustype[i] = 0;
@@ -800,7 +884,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
         */
        os = odn->dn_objset;
        if (!POINTER_IS_VALID(os)) {
-               DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid);
+               DNODE_STAT_BUMP(dnode_move_invalid);
                return (KMEM_CBRC_DONT_KNOW);
        }
 
@@ -810,7 +894,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
        rw_enter(&os_lock, RW_WRITER);
        if (os != odn->dn_objset) {
                rw_exit(&os_lock);
-               DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1);
+               DNODE_STAT_BUMP(dnode_move_recheck1);
                return (KMEM_CBRC_DONT_KNOW);
        }
 
@@ -828,7 +912,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
        if (os != odn->dn_objset) {
                mutex_exit(&os->os_lock);
                rw_exit(&os_lock);
-               DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2);
+               DNODE_STAT_BUMP(dnode_move_recheck2);
                return (KMEM_CBRC_DONT_KNOW);
        }
 
@@ -841,7 +925,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
        rw_exit(&os_lock);
        if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
                mutex_exit(&os->os_lock);
-               DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special);
+               DNODE_STAT_BUMP(dnode_move_special);
                return (KMEM_CBRC_NO);
        }
        ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
@@ -856,7 +940,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
         */
        if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
                mutex_exit(&os->os_lock);
-               DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle);
+               DNODE_STAT_BUMP(dnode_move_handle);
                return (KMEM_CBRC_LATER);
        }
 
@@ -872,7 +956,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
        if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
                zrl_exit(&odn->dn_handle->dnh_zrlock);
                mutex_exit(&os->os_lock);
-               DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock);
+               DNODE_STAT_BUMP(dnode_move_rwlock);
                return (KMEM_CBRC_LATER);
        }
 
@@ -898,7 +982,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
                rw_exit(&odn->dn_struct_rwlock);
                zrl_exit(&odn->dn_handle->dnh_zrlock);
                mutex_exit(&os->os_lock);
-               DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active);
+               DNODE_STAT_BUMP(dnode_move_active);
                return (KMEM_CBRC_LATER);
        }
 
@@ -922,6 +1006,132 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
 }
 #endif /* _KERNEL */
 
+static void
+dnode_slots_hold(dnode_children_t *children, int idx, int slots)
+{
+       ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+       for (int i = idx; i < idx + slots; i++) {
+               dnode_handle_t *dnh = &children->dnc_children[i];
+               zrl_add(&dnh->dnh_zrlock);
+       }
+}
+
+static void
+dnode_slots_rele(dnode_children_t *children, int idx, int slots)
+{
+       ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+       for (int i = idx; i < idx + slots; i++) {
+               dnode_handle_t *dnh = &children->dnc_children[i];
+
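+               /*
+                * Slots locked via dnode_slots_tryenter() are unlocked
+                * with zrl_exit(); slots held via dnode_slots_hold() only
+                * carry a reference, which zrl_remove() drops.
+                */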
+               if (zrl_is_locked(&dnh->dnh_zrlock))
+                       zrl_exit(&dnh->dnh_zrlock);
+               else
+                       zrl_remove(&dnh->dnh_zrlock);
+       }
+}
+
+static int
+dnode_slots_tryenter(dnode_children_t *children, int idx, int slots)
+{
+       ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+       for (int i = idx; i < idx + slots; i++) {
+               dnode_handle_t *dnh = &children->dnc_children[i];
+
+               if (!zrl_tryenter(&dnh->dnh_zrlock)) {
+                       for (int j = idx; j < i; j++) {
+                               dnh = &children->dnc_children[j];
+                               zrl_exit(&dnh->dnh_zrlock);
+                       }
+
+                       return (0);
+               }
+       }
+
+       return (1);
+}
+
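[Annotation] Because dnode_slots_tryenter() backs out every lock it managed to take as soon as one slot is contested, callers can treat the whole slot run as a single lock and simply retry, bumping a kstat per pass; this is the pattern used several times later in this patch:

        while (!dnode_slots_tryenter(dnc, idx, slots))
                DNODE_STAT_BUMP(dnode_hold_free_lock_retry);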
+static void
+dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr)
+{
+       ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+       for (int i = idx; i < idx + slots; i++) {
+               dnode_handle_t *dnh = &children->dnc_children[i];
+               dnh->dnh_dnode = ptr;
+       }
+}
+
+static boolean_t
+dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
+{
+       ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+       for (int i = idx; i < idx + slots; i++) {
+               dnode_handle_t *dnh = &children->dnc_children[i];
+               dnode_t *dn = dnh->dnh_dnode;
+
+               if (dn == DN_SLOT_FREE) {
+                       continue;
+               } else if (DN_SLOT_IS_PTR(dn)) {
+                       mutex_enter(&dn->dn_mtx);
+                       dmu_object_type_t type = dn->dn_type;
+                       mutex_exit(&dn->dn_mtx);
+
+                       if (type != DMU_OT_NONE)
+                               return (B_FALSE);
+
+                       continue;
+               } else {
+                       return (B_FALSE);
+               }
+       }
+
+       return (B_TRUE);
+}
+
+static void
+dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
+{
+       ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+       for (int i = idx; i < idx + slots; i++) {
+               dnode_handle_t *dnh = &children->dnc_children[i];
+
+               ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
+
+               if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+                       ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
+                       dnode_destroy(dnh->dnh_dnode);
+                       dnh->dnh_dnode = DN_SLOT_FREE;
+               }
+       }
+}
+
+void
+dnode_free_interior_slots(dnode_t *dn)
+{
+       dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db);
+       int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT;
+       int idx = (dn->dn_object & (epb - 1)) + 1;
+       int slots = dn->dn_num_slots - 1;
+
+       if (slots == 0)
+               return;
+
+       ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+       while (!dnode_slots_tryenter(children, idx, slots))
+               DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
+
+       dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
+       dnode_slots_rele(children, idx, slots);
+}
+
 void
 dnode_special_close(dnode_handle_t *dnh)
 {
@@ -929,40 +1139,44 @@ dnode_special_close(dnode_handle_t *dnh)
 
        /*
         * Wait for final references to the dnode to clear.  This can
-        * only happen if the arc is asyncronously evicting state that
+        * only happen if the arc is asynchronously evicting state that
         * has a hold on this dnode while we are trying to evict this
         * dnode.
         */
        while (refcount_count(&dn->dn_holds) > 0)
                delay(1);
+       ASSERT(dn->dn_dbuf == NULL ||
+           dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
        zrl_add(&dnh->dnh_zrlock);
        dnode_destroy(dn); /* implicit zrl_remove() */
        zrl_destroy(&dnh->dnh_zrlock);
        dnh->dnh_dnode = NULL;
 }
 
-dnode_t *
+void
 dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
     dnode_handle_t *dnh)
 {
-       dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh);
-       dnh->dnh_dnode = dn;
+       dnode_t *dn;
+
        zrl_init(&dnh->dnh_zrlock);
+       zrl_tryenter(&dnh->dnh_zrlock);
+
+       dn = dnode_create(os, dnp, NULL, object, dnh);
        DNODE_VERIFY(dn);
-       return (dn);
+
+       zrl_exit(&dnh->dnh_zrlock);
 }
 
 static void
-dnode_buf_pageout(dmu_buf_t *db, void *arg)
+dnode_buf_evict_async(void *dbu)
 {
-       dnode_children_t *children_dnodes = arg;
-       int i;
-       int epb = db->db_size >> DNODE_SHIFT;
+       dnode_children_t *dnc = dbu;
 
-       ASSERT(epb == children_dnodes->dnc_count);
+       DNODE_STAT_BUMP(dnode_buf_evict);
 
-       for (i = 0; i < epb; i++) {
-               dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
+       for (int i = 0; i < dnc->dnc_count; i++) {
+               dnode_handle_t *dnh = &dnc->dnc_children[i];
                dnode_t *dn;
 
                /*
@@ -970,8 +1184,9 @@ dnode_buf_pageout(dmu_buf_t *db, void *arg)
                 * another valid address, so there is no need here to guard
                 * against changes to or from NULL.
                 */
-               if (dnh->dnh_dnode == NULL) {
+               if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
                        zrl_destroy(&dnh->dnh_zrlock);
+                       dnh->dnh_dnode = DN_SLOT_UNINIT;
                        continue;
                }
 
@@ -986,22 +1201,27 @@ dnode_buf_pageout(dmu_buf_t *db, void *arg)
                ASSERT(refcount_is_zero(&dn->dn_holds));
                ASSERT(refcount_is_zero(&dn->dn_tx_holds));
 
-               dnode_destroy(dn); /* implicit zrl_remove() */
+               dnode_destroy(dn); /* implicit zrl_remove() for first slot */
                zrl_destroy(&dnh->dnh_zrlock);
-               dnh->dnh_dnode = NULL;
+               dnh->dnh_dnode = DN_SLOT_UNINIT;
        }
-       kmem_free(children_dnodes, sizeof (dnode_children_t) +
-           (epb - 1) * sizeof (dnode_handle_t));
+       kmem_free(dnc, sizeof (dnode_children_t) +
+           dnc->dnc_count * sizeof (dnode_handle_t));
 }
 
 /*
  * errors:
- * EINVAL - invalid object number.
- * EIO - i/o error.
+ * EINVAL - Invalid object number or flags.
+ * ENOSPC - Hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE)
+ * EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE)
+ *        - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED)
+ * ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED)
+ * EIO    - I/O error when reading the meta dnode dbuf.
+ *
  * succeeds even for free dnodes.
  */
 int
-dnode_hold_impl(objset_t *os, uint64_t object, int flag,
+dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
     void *tag, dnode_t **dnp)
 {
        int epb, idx, err;
@@ -1010,9 +1230,13 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
        uint64_t blk;
        dnode_t *mdn, *dn;
        dmu_buf_impl_t *db;
-       dnode_children_t *children_dnodes;
+       dnode_children_t *dnc;
+       dnode_phys_t *dn_block;
        dnode_handle_t *dnh;
 
+       ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
+       ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0));
+
        /*
         * If you are holding the spa config lock as writer, you shouldn't
         * be asking the DMU to do *anything* unless it's the root pool
@@ -1027,12 +1251,12 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
                dn = (object == DMU_USERUSED_OBJECT) ?
                    DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os);
                if (dn == NULL)
-                       return (ENOENT);
+                       return (SET_ERROR(ENOENT));
                type = dn->dn_type;
                if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE)
-                       return (ENOENT);
+                       return (SET_ERROR(ENOENT));
                if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
-                       return (EEXIST);
+                       return (SET_ERROR(EEXIST));
                DNODE_VERIFY(dn);
                (void) refcount_add(&dn->dn_holds, tag);
                *dnp = dn;
@@ -1040,7 +1264,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
        }
 
        if (object == 0 || object >= DN_MAX_OBJECT)
-               return (EINVAL);
+               return (SET_ERROR(EINVAL));
 
        mdn = DMU_META_DNODE(os);
        ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);
@@ -1052,15 +1276,18 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
                drop_struct_lock = TRUE;
        }
 
-       blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t));
+       blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
 
        db = dbuf_hold(mdn, blk, FTAG);
        if (drop_struct_lock)
                rw_exit(&mdn->dn_struct_rwlock);
-       if (db == NULL)
-               return (EIO);
+       if (db == NULL) {
+               DNODE_STAT_BUMP(dnode_hold_dbuf_hold);
+               return (SET_ERROR(EIO));
+       }
        err = dbuf_read(db, NULL, DB_RF_CANFAIL);
        if (err) {
+               DNODE_STAT_BUMP(dnode_hold_dbuf_read);
                dbuf_rele(db, FTAG);
                return (err);
        }
@@ -1068,63 +1295,193 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
        ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
        epb = db->db.db_size >> DNODE_SHIFT;
 
-       idx = object & (epb-1);
+       idx = object & (epb - 1);
+       dn_block = (dnode_phys_t *)db->db.db_data;
 
        ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
-       children_dnodes = dmu_buf_get_user(&db->db);
-       if (children_dnodes == NULL) {
-               int i;
+       dnc = dmu_buf_get_user(&db->db);
+       dnh = NULL;
+       if (dnc == NULL) {
                dnode_children_t *winner;
-               children_dnodes = kmem_alloc(sizeof (dnode_children_t) +
-                   (epb - 1) * sizeof (dnode_handle_t),
-                   KM_PUSHPAGE | KM_NODEBUG);
-               children_dnodes->dnc_count = epb;
-               dnh = &children_dnodes->dnc_children[0];
-               for (i = 0; i < epb; i++) {
+               int skip = 0;
+
+               dnc = kmem_zalloc(sizeof (dnode_children_t) +
+                   epb * sizeof (dnode_handle_t), KM_SLEEP);
+               dnc->dnc_count = epb;
+               dnh = &dnc->dnc_children[0];
+
+               /* Initialize dnode slot status from dnode_phys_t */
+               for (int i = 0; i < epb; i++) {
                        zrl_init(&dnh[i].dnh_zrlock);
-                       dnh[i].dnh_dnode = NULL;
+
+                       if (skip) {
+                               skip--;
+                               continue;
+                       }
+
+                       if (dn_block[i].dn_type != DMU_OT_NONE) {
+                               int interior = dn_block[i].dn_extra_slots;
+
+                               dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED);
+                               dnode_set_slots(dnc, i + 1, interior,
+                                   DN_SLOT_INTERIOR);
+                               skip = interior;
+                       } else {
+                               dnh[i].dnh_dnode = DN_SLOT_FREE;
+                               skip = 0;
+                       }
                }
-               if ((winner = dmu_buf_set_user(&db->db, children_dnodes, NULL,
-                   dnode_buf_pageout))) {
-                       kmem_free(children_dnodes, sizeof (dnode_children_t) +
-                           (epb - 1) * sizeof (dnode_handle_t));
-                       children_dnodes = winner;
+
+               dmu_buf_init_user(&dnc->dnc_dbu, NULL,
+                   dnode_buf_evict_async, NULL);
+               winner = dmu_buf_set_user(&db->db, &dnc->dnc_dbu);
+               if (winner != NULL) {
+
+                       for (int i = 0; i < epb; i++)
+                               zrl_destroy(&dnh[i].dnh_zrlock);
+
+                       kmem_free(dnc, sizeof (dnode_children_t) +
+                           epb * sizeof (dnode_handle_t));
+                       dnc = winner;
                }
        }
-       ASSERT(children_dnodes->dnc_count == epb);
 
-       dnh = &children_dnodes->dnc_children[idx];
-       zrl_add(&dnh->dnh_zrlock);
-       if ((dn = dnh->dnh_dnode) == NULL) {
-               dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
-               dnode_t *winner;
+       ASSERT(dnc->dnc_count == epb);
+       dn = DN_SLOT_UNINIT;
 
-               dn = dnode_create(os, phys, db, object, dnh);
-               winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn);
-               if (winner != NULL) {
-                       zrl_add(&dnh->dnh_zrlock);
-                       dnode_destroy(dn); /* implicit zrl_remove() */
-                       dn = winner;
+       if (flag & DNODE_MUST_BE_ALLOCATED) {
+               slots = 1;
+
+               while (dn == DN_SLOT_UNINIT) {
+                       dnode_slots_hold(dnc, idx, slots);
+                       dnh = &dnc->dnc_children[idx];
+
+                       if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+                               dn = dnh->dnh_dnode;
+                               break;
+                       } else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) {
+                               DNODE_STAT_BUMP(dnode_hold_alloc_interior);
+                               dnode_slots_rele(dnc, idx, slots);
+                               dbuf_rele(db, FTAG);
+                               return (SET_ERROR(EEXIST));
+                       } else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) {
+                               DNODE_STAT_BUMP(dnode_hold_alloc_misses);
+                               dnode_slots_rele(dnc, idx, slots);
+                               dbuf_rele(db, FTAG);
+                               return (SET_ERROR(ENOENT));
+                       }
+
+                       dnode_slots_rele(dnc, idx, slots);
+                       if (!dnode_slots_tryenter(dnc, idx, slots)) {
+                               DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry);
+                               continue;
+                       }
+
+                       /*
+                        * Someone else won the race and called dnode_create()
+                        * after we checked DN_SLOT_IS_PTR() above but before
+                        * we acquired the lock.
+                        */
+                       if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+                               DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses);
+                               dn = dnh->dnh_dnode;
+                       } else {
+                               dn = dnode_create(os, dn_block + idx, db,
+                                   object, dnh);
+                       }
+               }
+
+               mutex_enter(&dn->dn_mtx);
+               if (dn->dn_type == DMU_OT_NONE) {
+                       DNODE_STAT_BUMP(dnode_hold_alloc_type_none);
+                       mutex_exit(&dn->dn_mtx);
+                       dnode_slots_rele(dnc, idx, slots);
+                       dbuf_rele(db, FTAG);
+                       return (SET_ERROR(ENOENT));
+               }
+
+               DNODE_STAT_BUMP(dnode_hold_alloc_hits);
+       } else if (flag & DNODE_MUST_BE_FREE) {
+
+               if (idx + slots - 1 >= DNODES_PER_BLOCK) {
+                       DNODE_STAT_BUMP(dnode_hold_free_overflow);
+                       dbuf_rele(db, FTAG);
+                       return (SET_ERROR(ENOSPC));
+               }
+
+               while (dn == DN_SLOT_UNINIT) {
+                       dnode_slots_hold(dnc, idx, slots);
+
+                       if (!dnode_check_slots_free(dnc, idx, slots)) {
+                               DNODE_STAT_BUMP(dnode_hold_free_misses);
+                               dnode_slots_rele(dnc, idx, slots);
+                               dbuf_rele(db, FTAG);
+                               return (SET_ERROR(ENOSPC));
+                       }
+
+                       dnode_slots_rele(dnc, idx, slots);
+                       if (!dnode_slots_tryenter(dnc, idx, slots)) {
+                               DNODE_STAT_BUMP(dnode_hold_free_lock_retry);
+                               continue;
+                       }
+
+                       if (!dnode_check_slots_free(dnc, idx, slots)) {
+                               DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
+                               dnode_slots_rele(dnc, idx, slots);
+                               dbuf_rele(db, FTAG);
+                               return (SET_ERROR(ENOSPC));
+                       }
+
+                       /*
+                        * Allocated but otherwise free dnodes which would
+                        * be in the interior of a multi-slot dnode need
+                        * to be freed.  Single slot dnodes can be safely
+                        * re-purposed as a performance optimization.
+                        */
+                       if (slots > 1)
+                               dnode_reclaim_slots(dnc, idx + 1, slots - 1);
+
+                       dnh = &dnc->dnc_children[idx];
+                       if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+                               dn = dnh->dnh_dnode;
+                       } else {
+                               dn = dnode_create(os, dn_block + idx, db,
+                                   object, dnh);
+                       }
                }
+
+               mutex_enter(&dn->dn_mtx);
+               if (!refcount_is_zero(&dn->dn_holds)) {
+                       DNODE_STAT_BUMP(dnode_hold_free_refcount);
+                       mutex_exit(&dn->dn_mtx);
+                       dnode_slots_rele(dnc, idx, slots);
+                       dbuf_rele(db, FTAG);
+                       return (SET_ERROR(EEXIST));
+               }
+
+               dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR);
+               DNODE_STAT_BUMP(dnode_hold_free_hits);
+       } else {
+               dbuf_rele(db, FTAG);
+               return (SET_ERROR(EINVAL));
        }
 
-       mutex_enter(&dn->dn_mtx);
-       type = dn->dn_type;
-       if (dn->dn_free_txg ||
-           ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
-           ((flag & DNODE_MUST_BE_FREE) &&
-           (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
+       if (dn->dn_free_txg) {
+               DNODE_STAT_BUMP(dnode_hold_free_txg);
+               type = dn->dn_type;
                mutex_exit(&dn->dn_mtx);
-               zrl_remove(&dnh->dnh_zrlock);
+               dnode_slots_rele(dnc, idx, slots);
                dbuf_rele(db, FTAG);
                return (type == DMU_OT_NONE ? ENOENT : EEXIST);
        }
-       mutex_exit(&dn->dn_mtx);
 
        if (refcount_add(&dn->dn_holds, tag) == 1)
                dbuf_add_ref(db, dnh);
+
+       mutex_exit(&dn->dn_mtx);
+
        /* Now we can rely on the hold to prevent the dnode from moving. */
-       zrl_remove(&dnh->dnh_zrlock);
+       dnode_slots_rele(dnc, idx, slots);
 
        DNODE_VERIFY(dn);
        ASSERT3P(dn->dn_dbuf, ==, db);
@@ -1141,7 +1498,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
 int
 dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
 {
-       return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
+       return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag,
+           dnp));
 }
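[Annotation] With the extra "slots" argument, the DNODE_MUST_BE_FREE path becomes the allocation primitive for multi-slot (large) dnodes. A condensed sketch of a dmu_object_alloc_dnsize()-style caller, with names and error handling simplified for illustration:

        int dn_slots = dnodesize >> DNODE_SHIFT;        /* e.g. 1024 >> 9 = 2 */
        dnode_t *dn;
        int err;

        err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots,
            FTAG, &dn);
        if (err == 0) {
                /* the run of slots is now reserved and held */
                dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen,
                    dn_slots, tx);
                dnode_rele(dn, FTAG);
        }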
 
 /*
@@ -1164,13 +1522,19 @@ dnode_add_ref(dnode_t *dn, void *tag)
 
 void
 dnode_rele(dnode_t *dn, void *tag)
+{
+       mutex_enter(&dn->dn_mtx);
+       dnode_rele_and_unlock(dn, tag);
+}
+
+void
+dnode_rele_and_unlock(dnode_t *dn, void *tag)
 {
        uint64_t refs;
        /* Get while the hold prevents the dnode from moving. */
        dmu_buf_impl_t *db = dn->dn_dbuf;
        dnode_handle_t *dnh = dn->dn_handle;
 
-       mutex_enter(&dn->dn_mtx);
        refs = refcount_remove(&dn->dn_holds, tag);
        mutex_exit(&dn->dn_mtx);
 
@@ -1224,32 +1588,30 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
         */
        dmu_objset_userquota_get_ids(dn, B_TRUE, tx);
 
-       mutex_enter(&os->os_lock);
+       multilist_t *dirtylist = os->os_dirty_dnodes[txg & TXG_MASK];
+       multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn);
 
        /*
         * If we are already marked dirty, we're done.
         */
        if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
-               mutex_exit(&os->os_lock);
+               multilist_sublist_unlock(mls);
                return;
        }
 
-       ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
+       ASSERT(!refcount_is_zero(&dn->dn_holds) ||
+           !avl_is_empty(&dn->dn_dbufs));
        ASSERT(dn->dn_datablksz != 0);
-       ASSERT3U(dn->dn_next_bonuslen[txg&TXG_MASK], ==, 0);
-       ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0);
-       ASSERT3U(dn->dn_next_bonustype[txg&TXG_MASK], ==, 0);
+       ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]);
+       ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]);
+       ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]);
 
        dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
            dn->dn_object, txg);
 
-       if (dn->dn_free_txg > 0 && dn->dn_free_txg <= txg) {
-               list_insert_tail(&os->os_free_dnodes[txg&TXG_MASK], dn);
-       } else {
-               list_insert_tail(&os->os_dirty_dnodes[txg&TXG_MASK], dn);
-       }
+       multilist_sublist_insert_head(mls, dn);
 
-       mutex_exit(&os->os_lock);
+       multilist_sublist_unlock(mls);
 
        /*
         * The dnode maintains a hold on its containing dbuf as
@@ -1270,13 +1632,6 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
 void
 dnode_free(dnode_t *dn, dmu_tx_t *tx)
 {
-       int txgoff = tx->tx_txg & TXG_MASK;
-
-       dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg);
-
-       /* we should be the only holder... hopefully */
-       /* ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */
-
        mutex_enter(&dn->dn_mtx);
        if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
                mutex_exit(&dn->dn_mtx);
@@ -1285,19 +1640,7 @@ dnode_free(dnode_t *dn, dmu_tx_t *tx)
        dn->dn_free_txg = tx->tx_txg;
        mutex_exit(&dn->dn_mtx);
 
-       /*
-        * If the dnode is already dirty, it needs to be moved from
-        * the dirty list to the free list.
-        */
-       mutex_enter(&dn->dn_objset->os_lock);
-       if (list_link_active(&dn->dn_dirty_link[txgoff])) {
-               list_remove(&dn->dn_objset->os_dirty_dnodes[txgoff], dn);
-               list_insert_tail(&dn->dn_objset->os_free_dnodes[txgoff], dn);
-               mutex_exit(&dn->dn_objset->os_lock);
-       } else {
-               mutex_exit(&dn->dn_objset->os_lock);
-               dnode_setdirty(dn, tx);
-       }
+       dnode_setdirty(dn, tx);
 }
 
 /*
@@ -1307,13 +1650,12 @@ dnode_free(dnode_t *dn, dmu_tx_t *tx)
 int
 dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
 {
-       dmu_buf_impl_t *db, *db_next;
+       dmu_buf_impl_t *db;
        int err;
 
+       ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
        if (size == 0)
                size = SPA_MINBLOCKSIZE;
-       if (size > SPA_MAXBLOCKSIZE)
-               size = SPA_MAXBLOCKSIZE;
        else
                size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
 
@@ -1326,13 +1668,12 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
        rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 
        /* Check for any allocated blocks beyond the first */
-       if (dn->dn_phys->dn_maxblkid != 0)
+       if (dn->dn_maxblkid != 0)
                goto fail;
 
        mutex_enter(&dn->dn_dbufs_mtx);
-       for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
-               db_next = list_next(&dn->dn_dbufs, db);
-
+       for (db = avl_first(&dn->dn_dbufs); db != NULL;
+           db = AVL_NEXT(&dn->dn_dbufs, db)) {
                if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
                    db->db_blkid != DMU_SPILL_BLKID) {
                        mutex_exit(&dn->dn_dbufs_mtx);
@@ -1345,7 +1686,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
                goto fail;
 
        /* resize the old block */
-       err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db);
+       err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
        if (err == 0)
                dbuf_new_size(db, size, tx);
        else if (err != ENOENT)
@@ -1367,7 +1708,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
 
 fail:
        rw_exit(&dn->dn_struct_rwlock);
-       return (ENOTSUP);
+       return (SET_ERROR(ENOTSUP));
 }
 
 /* read-holding callers must not rely on the lock being continuously held */
@@ -1412,6 +1753,8 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
            sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
                new_nlevels++;
 
+       ASSERT3U(new_nlevels, <=, DN_MAX_LEVELS);
+
        if (new_nlevels > dn->dn_nlevels) {
                int old_nlevels = dn->dn_nlevels;
                dmu_buf_impl_t *db;
@@ -1453,56 +1796,13 @@ out:
                rw_downgrade(&dn->dn_struct_rwlock);
 }
 
-void
-dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+static void
+dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
 {
-       avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
-       avl_index_t where;
-       free_range_t *rp;
-       free_range_t rp_tofind;
-       uint64_t endblk = blkid + nblks;
-
-       ASSERT(MUTEX_HELD(&dn->dn_mtx));
-       ASSERT(nblks <= UINT64_MAX - blkid); /* no overflow */
-
-       dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
-           blkid, nblks, tx->tx_txg);
-       rp_tofind.fr_blkid = blkid;
-       rp = avl_find(tree, &rp_tofind, &where);
-       if (rp == NULL)
-               rp = avl_nearest(tree, where, AVL_BEFORE);
-       if (rp == NULL)
-               rp = avl_nearest(tree, where, AVL_AFTER);
-
-       while (rp && (rp->fr_blkid <= blkid + nblks)) {
-               uint64_t fr_endblk = rp->fr_blkid + rp->fr_nblks;
-               free_range_t *nrp = AVL_NEXT(tree, rp);
-
-               if (blkid <= rp->fr_blkid && endblk >= fr_endblk) {
-                       /* clear this entire range */
-                       avl_remove(tree, rp);
-                       kmem_free(rp, sizeof (free_range_t));
-               } else if (blkid <= rp->fr_blkid &&
-                   endblk > rp->fr_blkid && endblk < fr_endblk) {
-                       /* clear the beginning of this range */
-                       rp->fr_blkid = endblk;
-                       rp->fr_nblks = fr_endblk - endblk;
-               } else if (blkid > rp->fr_blkid && blkid < fr_endblk &&
-                   endblk >= fr_endblk) {
-                       /* clear the end of this range */
-                       rp->fr_nblks = blkid - rp->fr_blkid;
-               } else if (blkid > rp->fr_blkid && endblk < fr_endblk) {
-                       /* clear a chunk out of this range */
-                       free_range_t *new_rp =
-                           kmem_alloc(sizeof (free_range_t), KM_PUSHPAGE);
-
-                       new_rp->fr_blkid = endblk;
-                       new_rp->fr_nblks = fr_endblk - endblk;
-                       avl_insert_here(tree, new_rp, rp, AVL_AFTER);
-                       rp->fr_nblks = blkid - rp->fr_blkid;
-               }
-               /* there may be no overlap */
-               rp = nrp;
+       dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
+       if (db != NULL) {
+               dmu_buf_will_dirty(&db->db, tx);
+               dbuf_rele(db, FTAG);
        }
 }
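
The deleted dnode_clear_range() hand-rolled the four ways a clear request can overlap an existing free segment; range_tree_clear() (used below in dnode_free_range()) performs the same case analysis internally. A standalone sketch of those cases, for a segment [s, e) clipped by a request [b, x):

#include <stdio.h>
#include <stdint.h>

/* Sketch of the overlap cases the removed AVL code enumerated. */
static void
clip_sketch(uint64_t s, uint64_t e, uint64_t b, uint64_t x)
{
	if (x <= s || b >= e)
		printf("no overlap: [%ju, %ju) untouched\n",
		    (uintmax_t)s, (uintmax_t)e);
	else if (b <= s && x >= e)
		printf("covered: segment removed entirely\n");
	else if (b <= s)
		printf("front clipped: [%ju, %ju) remains\n",
		    (uintmax_t)x, (uintmax_t)e);
	else if (x >= e)
		printf("back clipped: [%ju, %ju) remains\n",
		    (uintmax_t)s, (uintmax_t)b);
	else
		printf("split: [%ju, %ju) and [%ju, %ju) remain\n",
		    (uintmax_t)s, (uintmax_t)b, (uintmax_t)x, (uintmax_t)e);
}
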
 
@@ -1520,7 +1820,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
        blkshift = dn->dn_datablkshift;
        epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 
-       if (len == -1ULL) {
+       if (len == DMU_OBJECT_END) {
                len = UINT64_MAX - off;
                trunc = TRUE;
        }
@@ -1536,7 +1836,13 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
        } else {
                ASSERT(dn->dn_maxblkid == 0);
                if (off == 0 && len >= blksz) {
-                       /* Freeing the whole block; fast-track this request */
+                       /*
+                        * Freeing the whole block; fast-track this request.
+                        * Note that we won't dirty any indirect blocks,
+                        * which is fine because we will be freeing the entire
+                        * file and thus all indirect blocks will be freed
+                        * by free_children().
+                        */
                        blkid = 0;
                        nblks = 1;
                        goto done;
@@ -1555,15 +1861,15 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
                ASSERT3U(blkoff + head, ==, blksz);
                if (len < head)
                        head = len;
-               if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE,
-                   FTAG, &db) == 0) {
+               if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
+                   TRUE, FALSE, FTAG, &db) == 0) {
                        caddr_t data;
 
                        /* don't dirty if it isn't on disk and isn't dirty */
                        if (db->db_last_dirty ||
                            (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
                                rw_exit(&dn->dn_struct_rwlock);
-                               dbuf_will_dirty(db, tx);
+                               dmu_buf_will_dirty(&db->db, tx);
                                rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
                                data = db->db.db_data;
                                bzero(data + blkoff, head);
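
dbuf_whichblock() takes a level argument in this version (and dbuf_hold_impl() gains an extra boolean, which appears throughout this diff as the added FALSE; it reads as a fail-if-uncached-style flag). A sketch of what whichblock computes, with worked numbers; the real function also special-cases objects whose dn_datablkshift is 0:

/*
 * Sketch: which level-'level' block covers byte 'offset'?  Each
 * indirect level widens the span by 2^epbs.
 */
static uint64_t
whichblock_sketch(const dnode_t *dn, int level, uint64_t offset)
{
	const int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;

	return (offset >> (dn->dn_datablkshift + level * epbs));
}

/*
 * Example: 128K data blocks (shift 17), 128K indirects (epbs = 10),
 * offset = 1 GiB = 2^30:
 *   level 0: 2^30 >> 17 = 8192  (data block id, as the old API gave)
 *   level 1: 2^30 >> 27 = 8     (L1 indirect covering that block)
 */
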
@@ -1588,18 +1894,18 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
        else
                tail = P2PHASE(len, blksz);
 
-       ASSERT3U(P2PHASE(off, blksz), ==, 0);
+       ASSERT0(P2PHASE(off, blksz));
        /* zero out any partial block data at the end of the range */
        if (tail) {
                if (len < tail)
                        tail = len;
-               if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
-                   TRUE, FTAG, &db) == 0) {
+               if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
+                   TRUE, FALSE, FTAG, &db) == 0) {
                        /* don't dirty if not on disk and not dirty */
                        if (db->db_last_dirty ||
                            (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
                                rw_exit(&dn->dn_struct_rwlock);
-                               dbuf_will_dirty(db, tx);
+                               dmu_buf_will_dirty(&db->db, tx);
                                rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
                                bzero(db->db.db_data, tail);
                        }
@@ -1620,74 +1926,93 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
                nblks += 1;
 
        /*
-        * Read in and mark all the level-1 indirects dirty,
-        * so that they will stay in memory until syncing phase.
-        * Always dirty the first and last indirect to make sure
-        * we dirty all the partial indirects.
+        * Dirty all the indirect blocks in this range.  Note that only
+        * the first and last indirect blocks can actually be written
+        * (if they were partially freed) -- they must be dirtied, even if
+        * they do not exist on disk yet.  The interior blocks will
+        * be freed by free_children(), so they will not actually be written.
+        * Even though these interior blocks will not be written, we
+        * dirty them for two reasons:
+        *
+        *  - It ensures that the indirect blocks remain in memory until
+        *    syncing context.  (They have already been prefetched by
+        *    dmu_tx_hold_free(), so we don't have to worry about reading
+        *    them serially here.)
+        *
+        *  - The dirty space accounting will put pressure on the txg sync
+        *    mechanism to begin syncing, and to delay transactions if there
+        *    is a large amount of freeing.  Even though these indirect
+        *    blocks will not be written, we could need to write the same
+        *    amount of space if we copy the freed BPs into deadlists.
         */
        if (dn->dn_nlevels > 1) {
-               uint64_t i, first, last;
-               int shift = epbs + dn->dn_datablkshift;
+               uint64_t first, last, i, ibyte;
+               int shift, err;
 
                first = blkid >> epbs;
-               if ((db = dbuf_hold_level(dn, 1, first, FTAG))) {
-                       dbuf_will_dirty(db, tx);
-                       dbuf_rele(db, FTAG);
-               }
+               dnode_dirty_l1(dn, first, tx);
                if (trunc)
                        last = dn->dn_maxblkid >> epbs;
                else
                        last = (blkid + nblks - 1) >> epbs;
-               if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) {
-                       dbuf_will_dirty(db, tx);
-                       dbuf_rele(db, FTAG);
-               }
-               for (i = first + 1; i < last; i++) {
-                       uint64_t ibyte = i << shift;
-                       int err;
+               if (last != first)
+                       dnode_dirty_l1(dn, last, tx);
 
-                       err = dnode_next_offset(dn,
-                           DNODE_FIND_HAVELOCK, &ibyte, 1, 1, 0);
+               shift = dn->dn_datablkshift + dn->dn_indblkshift -
+                   SPA_BLKPTRSHIFT;
+               for (i = first + 1; i < last; i++) {
+                       /*
+                        * Set i to the blockid of the next non-hole
+                        * level-1 indirect block at or after i.  Note
+                        * that dnode_next_offset() operates in terms of
+                        * level-0-equivalent bytes.
+                        */
+                       ibyte = i << shift;
+                       err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
+                           &ibyte, 2, 1, 0);
                        i = ibyte >> shift;
-                       if (err == ESRCH || i >= last)
+                       if (i >= last)
                                break;
-                       ASSERT(err == 0);
-                       db = dbuf_hold_level(dn, 1, i, FTAG);
-                       if (db) {
-                               dbuf_will_dirty(db, tx);
-                               dbuf_rele(db, FTAG);
-                       }
+
+                       /*
+                        * Normally we should not see an error, either
+                        * from dnode_next_offset() or dbuf_hold_level()
+                        * (except for ESRCH from dnode_next_offset).
+                        * If there is an i/o error, then when we read
+                        * this block in syncing context, it will use
+                        * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
+                        * to the "failmode" property.  dnode_next_offset()
+                        * doesn't have a flag to indicate MUSTSUCCEED.
+                        */
+                       if (err != 0)
+                               break;
+
+                       dnode_dirty_l1(dn, i, tx);
                }
        }
+
 done:
        /*
         * Add this range to the dnode range list.
         * We will finish up this free operation in the syncing phase.
         */
        mutex_enter(&dn->dn_mtx);
-       dnode_clear_range(dn, blkid, nblks, tx);
        {
-               free_range_t *rp, *found;
-               avl_index_t where;
-               avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
-
-               /* Add new range to dn_ranges */
-               rp = kmem_alloc(sizeof (free_range_t), KM_PUSHPAGE);
-               rp->fr_blkid = blkid;
-               rp->fr_nblks = nblks;
-               found = avl_find(tree, rp, &where);
-               ASSERT(found == NULL);
-               avl_insert(tree, rp, where);
-               dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
-                   blkid, nblks, tx->tx_txg);
+       int txgoff = tx->tx_txg & TXG_MASK;
+       if (dn->dn_free_ranges[txgoff] == NULL) {
+               dn->dn_free_ranges[txgoff] =
+                   range_tree_create(NULL, NULL, &dn->dn_mtx);
+       }
+       range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
+       range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
        }
+       dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
+           blkid, nblks, tx->tx_txg);
        mutex_exit(&dn->dn_mtx);
 
        dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
        dnode_setdirty(dn, tx);
 out:
-       if (trunc && dn->dn_maxblkid >= (off >> blkshift))
-               dn->dn_maxblkid = (off >> blkshift ? (off >> blkshift) - 1 : 0);
 
        rw_exit(&dn->dn_struct_rwlock);
 }
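
A note on the arithmetic in the L1-dirtying loop above: dnode_next_offset() speaks level-0-equivalent bytes, so the loop converts L1 block ids to bytes and back with shift = dn_datablkshift + (dn_indblkshift - SPA_BLKPTRSHIFT). Worked numbers under the usual 128K/128K assumption:

/*
 * dn_datablkshift = 17, dn_indblkshift = 17, SPA_BLKPTRSHIFT = 7:
 * shift = 17 + 10 = 27, so one L1 block spans 2^27 bytes (128 MiB).
 *
 *   ibyte = i << shift;     L1 block id -> first byte it covers
 *   dnode_next_offset(dn, DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0);
 *   i = ibyte >> shift;     byte offset -> owning L1 block id
 *
 * minlvl = 2 makes the search scan level-2 blkptrs, whose entries
 * reference L1 blocks, so 'i' lands on the next L1 block that exists.
 */
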
@@ -1710,7 +2035,6 @@ dnode_spill_freed(dnode_t *dn)
 uint64_t
 dnode_block_freed(dnode_t *dn, uint64_t blkid)
 {
-       free_range_t range_tofind;
        void *dp = spa_get_dsl(dn->dn_objset->os_spa);
        int i;
 
@@ -1730,20 +2054,10 @@ dnode_block_freed(dnode_t *dn, uint64_t blkid)
        if (blkid == DMU_SPILL_BLKID)
                return (dnode_spill_freed(dn));
 
-       range_tofind.fr_blkid = blkid;
        mutex_enter(&dn->dn_mtx);
        for (i = 0; i < TXG_SIZE; i++) {
-               free_range_t *range_found;
-               avl_index_t idx;
-
-               range_found = avl_find(&dn->dn_ranges[i], &range_tofind, &idx);
-               if (range_found) {
-                       ASSERT(range_found->fr_nblks > 0);
-                       break;
-               }
-               range_found = avl_nearest(&dn->dn_ranges[i], idx, AVL_BEFORE);
-               if (range_found &&
-                   range_found->fr_blkid + range_found->fr_nblks > blkid)
+               if (dn->dn_free_ranges[i] != NULL &&
+                   range_tree_contains(dn->dn_free_ranges[i], blkid, 1))
                        break;
        }
        mutex_exit(&dn->dn_mtx);
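
The per-txg free_range_t AVL trees are gone; dn_free_ranges[] now holds lazily created range trees, queried above with range_tree_contains(). The clear-then-add pairing in dnode_free_range() matters because, as I read the range_tree code, range_tree_add() asserts the new segment overlaps nothing while range_tree_clear() tolerates partial or absent overlap. A behavioral sketch with illustrative block ids, assuming a kmutex_t 'lock' is held as range trees require:

range_tree_t *rt = range_tree_create(NULL, NULL, &lock);

range_tree_add(rt, 100, 50);	/* tree covers [100, 150)        */
range_tree_clear(rt, 120, 10);	/* now [100, 120) and [130, 150) */
range_tree_clear(rt, 0, 50);	/* no overlap: harmless no-op    */
range_tree_add(rt, 120, 10);	/* whole again: [100, 150)       */
ASSERT(range_tree_contains(rt, 140, 5));
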
@@ -1770,7 +2084,7 @@ dnode_diduse_space(dnode_t *dn, int64_t delta)
        space += delta;
        if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) {
                ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
-               ASSERT3U(P2PHASE(space, 1<<DEV_BSHIFT), ==, 0);
+               ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT));
                dn->dn_phys->dn_used = space >> DEV_BSHIFT;
        } else {
                dn->dn_phys->dn_used = space;
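
On pools predating SPA_VERSION_DNODE_BYTES, dn_used stores 512-byte sectors (DEV_BSHIFT is 9), hence the alignment assertion before the shift. Worked numbers:

/*
 * space = 3 MiB = 3145728 bytes:
 *   P2PHASE(3145728, 1 << 9) == 0     (sector aligned; assert holds)
 *   dn_used = 3145728 >> 9  == 6144   sectors
 * Newer pools set DNODE_FLAG_USED_BYTES and store bytes directly.
 */
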
@@ -1780,34 +2094,16 @@ dnode_diduse_space(dnode_t *dn, int64_t delta)
 }
 
 /*
- * Call when we think we're going to write/free space in open context.
- * Be conservative (ie. OK to write less than this or free more than
- * this, but don't write more or free less).
- */
-void
-dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
-{
-       objset_t *os = dn->dn_objset;
-       dsl_dataset_t *ds = os->os_dsl_dataset;
-
-       if (space > 0)
-               space = spa_get_asize(os->os_spa, space);
-
-       if (ds)
-               dsl_dir_willuse_space(ds->ds_dir, space, tx);
-
-       dmu_tx_willuse_space(tx, space);
-}
-
-/*
- * This function scans a block at the indicated "level" looking for
- * a hole or data (depending on 'flags').  If level > 0, then we are
- * scanning an indirect block looking at its pointers.  If level == 0,
- * then we are looking at a block of dnodes.  If we don't find what we
- * are looking for in the block, we return ESRCH.  Otherwise, return
- * with *offset pointing to the beginning (if searching forwards) or
- * end (if searching backwards) of the range covered by the block
- * pointer we matched on (or dnode).
+ * Scans a block at the indicated "level" looking for a hole or data,
+ * depending on 'flags'.
+ *
+ * If level > 0, then we are scanning an indirect block looking at its
+ * pointers.  If level == 0, then we are looking at a block of dnodes.
+ *
+ * If we don't find what we are looking for in the block, we return ESRCH.
+ * Otherwise, return with *offset pointing to the beginning (if searching
+ * forwards) or end (if searching backwards) of the range covered by the
+ * block pointer we matched on (or dnode).
  *
  * The basic search algorithm used below by dnode_next_offset() is to
  * use this function to search up the block tree (widen the search) until
@@ -1817,7 +2113,7 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
  */
 static int
 dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
-       int lvl, uint64_t blkfill, uint64_t txg)
+    int lvl, uint64_t blkfill, uint64_t txg)
 {
        dmu_buf_impl_t *db = NULL;
        void *data = NULL;
@@ -1827,9 +2123,6 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
        boolean_t hole;
        int i, inc, error, span;
 
-       dprintf("probing object %llu offset %llx level %d of %u\n",
-           dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
-
        hole = ((flags & DNODE_FIND_HOLE) != 0);
        inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
        ASSERT(txg == 0 || !hole);
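
The header comment above describes the caller's strategy; for orientation, the widen-then-descend driver in dnode_next_offset() has roughly this shape (a sketch from reading the code further below, not a verbatim quote):

/* Widen: climb levels while the current one reports ESRCH. */
for (lvl = minlvl; lvl < maxlvl; lvl++) {
	error = dnode_next_offset_level(dn, flags, offset,
	    lvl, blkfill, txg);
	if (error != ESRCH)
		break;
}

/* Zero in: walk back down, refining *offset at each level. */
while (error == 0 && --lvl >= minlvl) {
	error = dnode_next_offset_level(dn, flags, offset,
	    lvl, blkfill, txg);
}
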
@@ -1839,8 +2132,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
                epb = dn->dn_phys->dn_nblkptr;
                data = dn->dn_phys->dn_blkptr;
        } else {
-               uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl);
-               error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db);
+               uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
+               error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
                if (error) {
                        if (error != ENOENT)
                                return (error);
@@ -1853,7 +2146,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
                         * at the pointer to this block in its parent, and it's
                         * going to be unallocated, so we will skip over it.
                         */
-                       return (ESRCH);
+                       return (SET_ERROR(ESRCH));
                }
                error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT);
                if (error) {
@@ -1863,26 +2156,32 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
                data = db->db.db_data;
        }
 
-       if (db && txg &&
-           (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg)) {
+
+       if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
+           db->db_blkptr->blk_birth <= txg ||
+           BP_IS_HOLE(db->db_blkptr))) {
                /*
                 * This can only happen when we are searching up the tree
                 * and these conditions mean that we need to keep climbing.
                 */
-               error = ESRCH;
+               error = SET_ERROR(ESRCH);
        } else if (lvl == 0) {
                dnode_phys_t *dnp = data;
-               span = DNODE_SHIFT;
+
                ASSERT(dn->dn_type == DMU_OT_DNODE);
+               ASSERT(!(flags & DNODE_FIND_BACKWARDS));
 
-               for (i = (*offset >> span) & (blkfill - 1);
-                   i >= 0 && i < blkfill; i += inc) {
+               for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1);
+                   i < blkfill; i += dnp[i].dn_extra_slots + 1) {
                        if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
                                break;
-                       *offset += (1ULL << span) * inc;
                }
-               if (i < 0 || i == blkfill)
-                       error = ESRCH;
+
+               if (i == blkfill)
+                       error = SET_ERROR(ESRCH);
+
+               *offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) +
+                   (i << DNODE_SHIFT);
        } else {
                blkptr_t *bp = data;
                uint64_t start = *offset;
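
In the lvl == 0 branch above, stepping by dnp[i].dn_extra_slots + 1 is the large-dnode change: a dnode may span several consecutive 512-byte slots, and the walk must never land inside one. Worked numbers:

/*
 * DNODE_SHIFT = 9, so one slot is 512 bytes; a 16K metadnode block
 * holds blkfill = 32 slots.  A 1K dnode has dn_extra_slots = 1 and
 * occupies slots [i, i + 1], so 'i += dn_extra_slots + 1' skips it
 * whole.  Walking backwards cannot tell a dnode's first slot from an
 * interior one, hence the new DNODE_FIND_BACKWARDS assertion.
 */
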
@@ -1895,17 +2194,30 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
                else
                        minfill++;
 
-               *offset = *offset >> span;
+               if (span >= 8 * sizeof (*offset)) {
+                       /* This only happens on the highest indirection level */
+                       ASSERT3U((lvl - 1), ==, dn->dn_phys->dn_nlevels - 1);
+                       *offset = 0;
+               } else {
+                       *offset = *offset >> span;
+               }
+
                for (i = BF64_GET(*offset, 0, epbs);
                    i >= 0 && i < epb; i += inc) {
-                       if (bp[i].blk_fill >= minfill &&
-                           bp[i].blk_fill <= maxfill &&
+                       if (BP_GET_FILL(&bp[i]) >= minfill &&
+                           BP_GET_FILL(&bp[i]) <= maxfill &&
                            (hole || bp[i].blk_birth > txg))
                                break;
                        if (inc > 0 || *offset > 0)
                                *offset += inc;
                }
-               *offset = *offset << span;
+
+               if (span >= 8 * sizeof (*offset)) {
+                       *offset = start;
+               } else {
+                       *offset = *offset << span;
+               }
+
                if (inc < 0) {
                        /* traversing backwards; position offset at the end */
                        ASSERT3U(*offset, <=, start);
@@ -1914,7 +2226,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
                        *offset = start;
                }
                if (i < 0 || i >= epb)
-                       error = ESRCH;
+                       error = SET_ERROR(ESRCH);
        }
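
The span guard added above sidesteps undefined behavior: in C, shifting a 64-bit value by 64 or more bits is undefined, and at the top indirection level span can reach 8 * sizeof (*offset) = 64. A minimal illustration of the idiom:

#include <stdint.h>

/* Shift that tolerates counts >= the operand width (sketch). */
static uint64_t
shr_safe(uint64_t x, int span)
{
	return (span >= 64 ? 0 : x >> span);	/* x >> 64 is UB in C */
}
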
 
        if (db)
@@ -1958,7 +2270,7 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
                rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
        if (dn->dn_phys->dn_nlevels == 0) {
-               error = ESRCH;
+               error = SET_ERROR(ESRCH);
                goto out;
        }
 
@@ -1967,7 +2279,7 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
                        if (flags & DNODE_FIND_HOLE)
                                *offset = dn->dn_datablksz;
                } else {
-                       error = ESRCH;
+                       error = SET_ERROR(ESRCH);
                }
                goto out;
        }
@@ -1986,9 +2298,18 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
                    flags, offset, lvl, blkfill, txg);
        }
 
+       /*
+        * There's always a "virtual hole" at the end of the object, even
+        * if all BP's which physically exist are non-holes.
+        */
+       if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 &&
+           minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) {
+               error = 0;
+       }
+
        if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
            initial_offset < *offset : initial_offset > *offset))
-               error = ESRCH;
+               error = SET_ERROR(ESRCH);
 out:
        if (!(flags & DNODE_FIND_HAVELOCK))
                rw_exit(&dn->dn_struct_rwlock);
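
The "virtual hole" rule above is what lets hole-seeking terminate cleanly at end of object. From the consumer side, the visible effect (assuming, as I read it, that this feeds the dmu_offset_next()/SEEK_HOLE path) is:

#define	_GNU_SOURCE	/* SEEK_HOLE on glibc */
#include <unistd.h>

/*
 * Sketch: with the virtual hole, SEEK_HOLE on a file with no real
 * holes reports EOF instead of propagating a search failure.
 */
off_t
next_hole(int fd, off_t from)
{
	return (lseek(fd, from, SEEK_HOLE));	/* == size if dense */
}
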