OpenZFS 7500 - Simplify dbuf_free_range by removing dn_unlisted_l0_blkid
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
index c01d724a914faa131c0f032dff03f5badf4cfc2a..be12ac0fe7169076914ad9490a3214d83923c2de 100644
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -20,7 +20,8 @@
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #include <sys/zfs_context.h>
@@ -35,8 +36,8 @@
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/dmu_zfetch.h>
-
-static int free_range_compar(const void *node1, const void *node2);
+#include <sys/range_tree.h>
+#include <sys/trace_dnode.h>
 
 static kmem_cache_t *dnode_cache;
 /*
@@ -62,6 +63,31 @@ int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
 static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
 #endif /* _KERNEL */
 
+static int
+dbuf_compare(const void *x1, const void *x2)
+{
+       const dmu_buf_impl_t *d1 = x1;
+       const dmu_buf_impl_t *d2 = x2;
+
+       int cmp = AVL_CMP(d1->db_level, d2->db_level);
+       if (likely(cmp))
+               return (cmp);
+
+       cmp = AVL_CMP(d1->db_blkid, d2->db_blkid);
+       if (likely(cmp))
+               return (cmp);
+
+       if (d1->db_state == DB_SEARCH) {
+               ASSERT3S(d2->db_state, !=, DB_SEARCH);
+               return (-1);
+       } else if (d2->db_state == DB_SEARCH) {
+               ASSERT3S(d1->db_state, !=, DB_SEARCH);
+               return (1);
+       }
+
+       return (AVL_PCMP(d1, d2));
+}
+
 /* ARGSUSED */
 static int
 dnode_cons(void *arg, void *unused, int kmflag)
@@ -69,7 +95,7 @@ dnode_cons(void *arg, void *unused, int kmflag)
        dnode_t *dn = arg;
        int i;
 
-       rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
+       rw_init(&dn->dn_struct_rwlock, NULL, RW_NOLOCKDEP, NULL);
        mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
@@ -92,9 +118,7 @@ dnode_cons(void *arg, void *unused, int kmflag)
 
        for (i = 0; i < TXG_SIZE; i++) {
                list_link_init(&dn->dn_dirty_link[i]);
-               avl_create(&dn->dn_ranges[i], free_range_compar,
-                   sizeof (free_range_t),
-                   offsetof(struct free_range, fr_node));
+               dn->dn_free_ranges[i] = NULL;
                list_create(&dn->dn_dirty_records[i],
                    sizeof (dbuf_dirty_record_t),
                    offsetof(dbuf_dirty_record_t, dr_dirty_node));
@@ -117,7 +141,7 @@ dnode_cons(void *arg, void *unused, int kmflag)
        dn->dn_id_flags = 0;
 
        dn->dn_dbufs_count = 0;
-       list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
+       avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
            offsetof(dmu_buf_impl_t, db_link));
 
        dn->dn_moved = 0;
@@ -141,7 +165,7 @@ dnode_dest(void *arg, void *unused)
 
        for (i = 0; i < TXG_SIZE; i++) {
                ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
-               avl_destroy(&dn->dn_ranges[i]);
+               ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
                list_destroy(&dn->dn_dirty_records[i]);
                ASSERT0(dn->dn_next_nblkptr[i]);
                ASSERT0(dn->dn_next_nlevels[i]);
@@ -169,7 +193,7 @@ dnode_dest(void *arg, void *unused)
        ASSERT0(dn->dn_id_flags);
 
        ASSERT0(dn->dn_dbufs_count);
-       list_destroy(&dn->dn_dbufs);
+       avl_destroy(&dn->dn_dbufs);
 }
 
 void
@@ -177,7 +201,7 @@ dnode_init(void)
 {
        ASSERT(dnode_cache == NULL);
        dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t),
-           0, dnode_cons, dnode_dest, NULL, NULL, NULL, KMC_KMEM);
+           0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
        kmem_cache_set_move(dnode_cache, dnode_move);
 }
 
@@ -210,6 +234,7 @@ dnode_verify(dnode_t *dn)
        }
        if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
                int i;
+               int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
                ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
                if (dn->dn_datablkshift) {
                        ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
@@ -220,12 +245,12 @@ dnode_verify(dnode_t *dn)
                ASSERT(DMU_OT_IS_VALID(dn->dn_type));
                ASSERT3U(dn->dn_nblkptr, >=, 1);
                ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
-               ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+               ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen);
                ASSERT3U(dn->dn_datablksz, ==,
                    dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
                ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
                ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
-                   dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+                   dn->dn_bonuslen, <=, max_bonuslen);
                for (i = 0; i < TXG_SIZE; i++) {
                        ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
                }
@@ -256,6 +281,7 @@ dnode_byteswap(dnode_phys_t *dnp)
 
        dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
        dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
+       dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots);
        dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
        dnp->dn_used = BSWAP_64(dnp->dn_used);
 
@@ -282,7 +308,8 @@ dnode_byteswap(dnode_phys_t *dnp)
                 * dnode buffer).
                 */
                int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
-               size_t len = DN_MAX_BONUSLEN - off;
+               int slots = dnp->dn_extra_slots + 1;
+               size_t len = DN_SLOTS_TO_BONUSLEN(slots) - off;
                dmu_object_byteswap_t byteswap;
                ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
                byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype);
@@ -291,37 +318,25 @@ dnode_byteswap(dnode_phys_t *dnp)
 
        /* Swap SPILL block if we have one */
        if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
-               byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t));
-
+               byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t));
 }
 
 void
 dnode_buf_byteswap(void *vbuf, size_t size)
 {
-       dnode_phys_t *buf = vbuf;
-       int i;
+       int i = 0;
 
        ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
        ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
 
-       size >>= DNODE_SHIFT;
-       for (i = 0; i < size; i++) {
-               dnode_byteswap(buf);
-               buf++;
-       }
-}
+       while (i < size) {
+               dnode_phys_t *dnp = vbuf + i;
+               dnode_byteswap(dnp);
 
-static int
-free_range_compar(const void *node1, const void *node2)
-{
-       const free_range_t *rp1 = node1;
-       const free_range_t *rp2 = node2;
-
-       if (rp1->fr_blkid < rp2->fr_blkid)
-               return (-1);
-       else if (rp1->fr_blkid > rp2->fr_blkid)
-               return (1);
-       else return (0);
+               i += DNODE_MIN_SIZE;
+               if (dnp->dn_type != DMU_OT_NONE)
+                       i += dnp->dn_extra_slots * DNODE_MIN_SIZE;
+       }
 }
 
 void
@@ -331,7 +346,7 @@ dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
 
        dnode_setdirty(dn, tx);
        rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
-       ASSERT3U(newsize, <=, DN_MAX_BONUSLEN -
+       ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
            (dn->dn_nblkptr-1) * sizeof (blkptr_t));
        dn->dn_bonuslen = newsize;
        if (newsize == 0)
@@ -372,15 +387,16 @@ dnode_setdblksz(dnode_t *dn, int size)
            1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
        dn->dn_datablksz = size;
        dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
-       dn->dn_datablkshift = ISP2(size) ? highbit(size - 1) : 0;
+       dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
 }
 
 static dnode_t *
 dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
     uint64_t object, dnode_handle_t *dnh)
 {
-       dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_PUSHPAGE);
+       dnode_t *dn;
 
+       dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
        ASSERT(!POINTER_IS_VALID(dn->dn_objset));
        dn->dn_moved = 0;
 
@@ -408,6 +424,7 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
        dn->dn_compress = dnp->dn_compress;
        dn->dn_bonustype = dnp->dn_bonustype;
        dn->dn_bonuslen = dnp->dn_bonuslen;
+       dn->dn_num_slots = dnp->dn_extra_slots + 1;
        dn->dn_maxblkid = dnp->dn_maxblkid;
        dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
        dn->dn_id_flags = 0;
@@ -417,16 +434,34 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
        ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
 
        mutex_enter(&os->os_lock);
-       list_insert_head(&os->os_dnodes, dn);
+       if (dnh->dnh_dnode != NULL) {
+               /* Lost the allocation race. */
+               mutex_exit(&os->os_lock);
+               kmem_cache_free(dnode_cache, dn);
+               return (dnh->dnh_dnode);
+       }
+
+       /*
+        * Exclude special dnodes from os_dnodes so an empty os_dnodes
+        * signifies that the special dnodes have no references from
+        * their children (the entries in os_dnodes).  This allows
+        * dnode_destroy() to easily determine if the last child has
+        * been removed and then complete eviction of the objset.
+        */
+       if (!DMU_OBJECT_IS_SPECIAL(object))
+               list_insert_head(&os->os_dnodes, dn);
        membar_producer();
+
        /*
-        * Everything else must be valid before assigning dn_objset makes the
-        * dnode eligible for dnode_move().
+        * Everything else must be valid before assigning dn_objset
+        * makes the dnode eligible for dnode_move().
         */
        dn->dn_objset = os;
+
+       dnh->dnh_dnode = dn;
        mutex_exit(&os->os_lock);
 
-       arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
+       arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE);
        return (dn);
 }
 
@@ -437,12 +472,18 @@ static void
 dnode_destroy(dnode_t *dn)
 {
        objset_t *os = dn->dn_objset;
+       boolean_t complete_os_eviction = B_FALSE;
 
        ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
 
        mutex_enter(&os->os_lock);
        POINTER_INVALIDATE(&dn->dn_objset);
-       list_remove(&os->os_dnodes, dn);
+       if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+               list_remove(&os->os_dnodes, dn);
+               complete_os_eviction =
+                   list_is_empty(&os->os_dnodes) &&
+                   list_link_active(&os->os_evicting_node);
+       }
        mutex_exit(&os->os_lock);
 
        /* the dnode can no longer move, so we can release the handle */
@@ -459,7 +500,7 @@ dnode_destroy(dnode_t *dn)
        }
        if (dn->dn_bonus != NULL) {
                mutex_enter(&dn->dn_bonus->db_mtx);
-               dbuf_evict(dn->dn_bonus);
+               dbuf_destroy(dn->dn_bonus);
                dn->dn_bonus = NULL;
        }
        dn->dn_zio = NULL;
@@ -473,21 +514,27 @@ dnode_destroy(dnode_t *dn)
        dn->dn_newgid = 0;
        dn->dn_id_flags = 0;
 
-       dmu_zfetch_rele(&dn->dn_zfetch);
+       dmu_zfetch_fini(&dn->dn_zfetch);
        kmem_cache_free(dnode_cache, dn);
-       arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
+       arc_space_return(sizeof (dnode_t), ARC_SPACE_DNODE);
+
+       if (complete_os_eviction)
+               dmu_objset_evict_done(os);
 }
 
 void
 dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
-    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+    dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
 {
        int i;
 
+       ASSERT3U(dn_slots, >, 0);
+       ASSERT3U(dn_slots << DNODE_SHIFT, <=,
+           spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)));
+       ASSERT3U(blocksize, <=,
+           spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
        if (blocksize == 0)
                blocksize = 1 << zfs_default_bs;
-       else if (blocksize > SPA_MAXBLOCKSIZE)
-               blocksize = SPA_MAXBLOCKSIZE;
        else
                blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
 
@@ -496,8 +543,8 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
 
        ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
 
-       dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset,
-           dn->dn_object, tx->tx_txg, blocksize, ibs);
+       dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d dn_slots=%d\n",
+           dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots);
 
        ASSERT(dn->dn_type == DMU_OT_NONE);
        ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
@@ -508,14 +555,14 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
            (bonustype == DMU_OT_SA && bonuslen == 0) ||
            (bonustype != DMU_OT_NONE && bonuslen != 0));
        ASSERT(DMU_OT_IS_VALID(bonustype));
-       ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+       ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots));
        ASSERT(dn->dn_type == DMU_OT_NONE);
        ASSERT0(dn->dn_maxblkid);
        ASSERT0(dn->dn_allocated_txg);
        ASSERT0(dn->dn_assigned_txg);
        ASSERT(refcount_is_zero(&dn->dn_tx_holds));
        ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
-       ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+       ASSERT(avl_is_empty(&dn->dn_dbufs));
 
        for (i = 0; i < TXG_SIZE; i++) {
                ASSERT0(dn->dn_next_nblkptr[i]);
@@ -527,18 +574,22 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
                ASSERT0(dn->dn_next_blksz[i]);
                ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
                ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
-               ASSERT0(avl_numnodes(&dn->dn_ranges[i]));
+               ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
        }
 
        dn->dn_type = ot;
        dnode_setdblksz(dn, blocksize);
        dn->dn_indblkshift = ibs;
        dn->dn_nlevels = 1;
+       dn->dn_num_slots = dn_slots;
        if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
                dn->dn_nblkptr = 1;
-       else
-               dn->dn_nblkptr = 1 +
-                   ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+       else {
+               dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR,
+                   1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
+                   SPA_BLKPTRSHIFT));
+       }
+
        dn->dn_bonustype = bonustype;
        dn->dn_bonuslen = bonuslen;
        dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
@@ -563,12 +614,13 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
 
 void
 dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
-    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+    dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
 {
        int nblkptr;
 
        ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
-       ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
+       ASSERT3U(blocksize, <=,
+           spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
        ASSERT0(blocksize % SPA_MINBLOCKSIZE);
        ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
        ASSERT(tx->tx_txg != 0);
@@ -576,7 +628,10 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
            (bonustype != DMU_OT_NONE && bonuslen != 0) ||
            (bonustype == DMU_OT_SA && bonuslen == 0));
        ASSERT(DMU_OT_IS_VALID(bonustype));
-       ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+       ASSERT3U(bonuslen, <=,
+           DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
+
+       dn_slots = dn_slots > 0 ? dn_slots : DNODE_MIN_SLOTS;
 
        /* clean up any unreferenced dbufs */
        dnode_evict_dbufs(dn);
@@ -599,7 +654,9 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
        if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
                nblkptr = 1;
        else
-               nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+               nblkptr = MIN(DN_MAX_NBLKPTR,
+                   1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
+                   SPA_BLKPTRSHIFT));
        if (dn->dn_bonustype != bonustype)
                dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
        if (dn->dn_nblkptr != nblkptr)
@@ -617,6 +674,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
        mutex_enter(&dn->dn_mtx);
        dn->dn_bonustype = bonustype;
        dn->dn_bonuslen = bonuslen;
+       dn->dn_num_slots = dn_slots;
        dn->dn_nblkptr = nblkptr;
        dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
        dn->dn_compress = ZIO_COMPRESS_INHERIT;
@@ -625,7 +683,8 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
        /* fix up the bonus db_size */
        if (dn->dn_bonus) {
                dn->dn_bonus->db.db_size =
-                   DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+                   DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
+                   (dn->dn_nblkptr-1) * sizeof (blkptr_t);
                ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
        }
 
@@ -692,7 +751,8 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
                list_move_tail(&ndn->dn_dirty_records[i],
                    &odn->dn_dirty_records[i]);
        }
-       bcopy(&odn->dn_ranges[0], &ndn->dn_ranges[0], sizeof (odn->dn_ranges));
+       bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0],
+           sizeof (odn->dn_free_ranges));
        ndn->dn_allocated_txg = odn->dn_allocated_txg;
        ndn->dn_free_txg = odn->dn_free_txg;
        ndn->dn_assigned_txg = odn->dn_assigned_txg;
@@ -700,8 +760,8 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
        ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
        ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
        refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
-       ASSERT(list_is_empty(&ndn->dn_dbufs));
-       list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs);
+       ASSERT(avl_is_empty(&ndn->dn_dbufs));
+       avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
        ndn->dn_dbufs_count = odn->dn_dbufs_count;
        ndn->dn_bonus = odn->dn_bonus;
        ndn->dn_have_spill = odn->dn_have_spill;
@@ -716,8 +776,6 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
        dmu_zfetch_init(&ndn->dn_zfetch, NULL);
        list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
        ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
-       ndn->dn_zfetch.zf_stream_cnt = odn->dn_zfetch.zf_stream_cnt;
-       ndn->dn_zfetch.zf_alloc_fail = odn->dn_zfetch.zf_alloc_fail;
 
        /*
         * Update back pointers. Updating the handle fixes the back pointer of
@@ -734,7 +792,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
         */
        odn->dn_dbuf = NULL;
        odn->dn_handle = NULL;
-       list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t),
+       avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
            offsetof(dmu_buf_impl_t, db_link));
        odn->dn_dbufs_count = 0;
        odn->dn_bonus = NULL;
@@ -753,8 +811,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
                list_create(&odn->dn_dirty_records[i],
                    sizeof (dbuf_dirty_record_t),
                    offsetof(dbuf_dirty_record_t, dr_dirty_node));
-               odn->dn_ranges[i].avl_root = NULL;
-               odn->dn_ranges[i].avl_numnodes = 0;
+               odn->dn_free_ranges[i] = NULL;
                odn->dn_next_nlevels[i] = 0;
                odn->dn_next_indblkshift[i] = 0;
                odn->dn_next_bonustype[i] = 0;
@@ -939,33 +996,32 @@ dnode_special_close(dnode_handle_t *dnh)
         */
        while (refcount_count(&dn->dn_holds) > 0)
                delay(1);
+       ASSERT(dn->dn_dbuf == NULL ||
+           dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
        zrl_add(&dnh->dnh_zrlock);
        dnode_destroy(dn); /* implicit zrl_remove() */
        zrl_destroy(&dnh->dnh_zrlock);
        dnh->dnh_dnode = NULL;
 }
 
-dnode_t *
+void
 dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
     dnode_handle_t *dnh)
 {
-       dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh);
-       dnh->dnh_dnode = dn;
+       dnode_t *dn;
+
+       dn = dnode_create(os, dnp, NULL, object, dnh);
        zrl_init(&dnh->dnh_zrlock);
        DNODE_VERIFY(dn);
-       return (dn);
 }
 
 static void
-dnode_buf_pageout(dmu_buf_t *db, void *arg)
+dnode_buf_evict_async(void *dbu)
 {
-       dnode_children_t *children_dnodes = arg;
+       dnode_children_t *children_dnodes = dbu;
        int i;
-       int epb = db->db_size >> DNODE_SHIFT;
-
-       ASSERT(epb == children_dnodes->dnc_count);
 
-       for (i = 0; i < epb; i++) {
+       for (i = 0; i < children_dnodes->dnc_count; i++) {
                dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
                dnode_t *dn;
 
@@ -995,28 +1051,155 @@ dnode_buf_pageout(dmu_buf_t *db, void *arg)
                dnh->dnh_dnode = NULL;
        }
        kmem_free(children_dnodes, sizeof (dnode_children_t) +
-           (epb - 1) * sizeof (dnode_handle_t));
+           children_dnodes->dnc_count * sizeof (dnode_handle_t));
+}
+
+/*
+ * Return true if the given index is interior to a dnode already
+ * allocated in the block. That is, the index is neither free nor
+ * allocated, but is consumed by a large dnode.
+ *
+ * The dnode_phys_t buffer may not be in sync with the in-core dnode
+ * structure, so we try to check the dnode structure first and fall back
+ * to the dnode_phys_t buffer if it doesn't exist.
+ */
+static boolean_t
+dnode_is_consumed(dmu_buf_impl_t *db, int idx)
+{
+       dnode_handle_t *dnh;
+       dmu_object_type_t ot;
+       dnode_children_t *children_dnodes;
+       dnode_phys_t *dn_block;
+       int skip;
+       int i;
+
+       children_dnodes = dmu_buf_get_user(&db->db);
+       dn_block = (dnode_phys_t *)db->db.db_data;
+
+       for (i = 0; i < idx; i += skip) {
+               dnh = &children_dnodes->dnc_children[i];
+
+               zrl_add(&dnh->dnh_zrlock);
+               if (dnh->dnh_dnode != NULL) {
+                       ot = dnh->dnh_dnode->dn_type;
+                       skip = dnh->dnh_dnode->dn_num_slots;
+               } else {
+                       ot = dn_block[i].dn_type;
+                       skip = dn_block[i].dn_extra_slots + 1;
+               }
+               zrl_remove(&dnh->dnh_zrlock);
+
+               if (ot == DMU_OT_NONE)
+                       skip = 1;
+       }
+
+       return (i > idx);
+}
+
+/*
+ * Return true if the given index in the dnode block is a valid
+ * allocated dnode. That is, the index is not consumed by a large
+ * dnode and is not free.
+ *
+ * The dnode_phys_t buffer may not be in sync with the in-core dnode
+ * structure, so we try to check the dnode structure first and fall back
+ * to the dnode_phys_t buffer if it doesn't exist.
+ */
+static boolean_t
+dnode_is_allocated(dmu_buf_impl_t *db, int idx)
+{
+       dnode_handle_t *dnh;
+       dmu_object_type_t ot;
+       dnode_children_t *children_dnodes;
+       dnode_phys_t *dn_block;
+
+       if (dnode_is_consumed(db, idx))
+               return (B_FALSE);
+
+       children_dnodes = dmu_buf_get_user(&db->db);
+       dn_block = (dnode_phys_t *)db->db.db_data;
+
+       dnh = &children_dnodes->dnc_children[idx];
+
+       zrl_add(&dnh->dnh_zrlock);
+       if (dnh->dnh_dnode != NULL)
+               ot = dnh->dnh_dnode->dn_type;
+       else
+               ot = dn_block[idx].dn_type;
+       zrl_remove(&dnh->dnh_zrlock);
+
+       return (ot != DMU_OT_NONE);
+}
+
+/*
+ * Return true if the given range of indices in the dnode block are
+ * free. That is, the starting index is not consumed by a large dnode
+ * and none of the indices are allocated.
+ *
+ * The dnode_phys_t buffer may not be in sync with the in-core dnode
+ * structure, so we try to check the dnode structure first and fall back
+ * to the dnode_phys_t buffer if it doesn't exist.
+ */
+static boolean_t
+dnode_is_free(dmu_buf_impl_t *db, int idx, int slots)
+{
+       dnode_handle_t *dnh;
+       dmu_object_type_t ot;
+       dnode_children_t *children_dnodes;
+       dnode_phys_t *dn_block;
+       int i;
+
+       if (idx + slots > DNODES_PER_BLOCK)
+               return (B_FALSE);
+
+       children_dnodes = dmu_buf_get_user(&db->db);
+       dn_block = (dnode_phys_t *)db->db.db_data;
+
+       if (dnode_is_consumed(db, idx))
+               return (B_FALSE);
+
+       for (i = idx; i < idx + slots; i++) {
+               dnh = &children_dnodes->dnc_children[i];
+
+               zrl_add(&dnh->dnh_zrlock);
+               if (dnh->dnh_dnode != NULL)
+                       ot = dnh->dnh_dnode->dn_type;
+               else
+                       ot = dn_block[i].dn_type;
+               zrl_remove(&dnh->dnh_zrlock);
+
+               if (ot != DMU_OT_NONE)
+                       return (B_FALSE);
+       }
+
+       return (B_TRUE);
 }
 
 /*
  * errors:
  * EINVAL - invalid object number.
+ * ENOSPC - hole too small to fulfill "slots" request
+ * ENOENT - the requested dnode is not allocated
  * EIO - i/o error.
  * succeeds even for free dnodes.
  */
 int
-dnode_hold_impl(objset_t *os, uint64_t object, int flag,
+dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
     void *tag, dnode_t **dnp)
 {
-       int epb, idx, err;
+       int epb, idx, err, i;
        int drop_struct_lock = FALSE;
        int type;
        uint64_t blk;
        dnode_t *mdn, *dn;
        dmu_buf_impl_t *db;
        dnode_children_t *children_dnodes;
+       dnode_phys_t *dn_block_begin;
        dnode_handle_t *dnh;
 
+       ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
+       ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0));
+
        /*
         * If you are holding the spa config lock as writer, you shouldn't
         * be asking the DMU to do *anything* unless it's the root pool
@@ -1056,7 +1239,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
                drop_struct_lock = TRUE;
        }
 
-       blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t));
+       blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
 
        db = dbuf_hold(mdn, blk, FTAG);
        if (drop_struct_lock)
@@ -1072,61 +1255,64 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
        ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
        epb = db->db.db_size >> DNODE_SHIFT;
 
-       idx = object & (epb-1);
-
        ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
        children_dnodes = dmu_buf_get_user(&db->db);
        if (children_dnodes == NULL) {
-               int i;
                dnode_children_t *winner;
-               children_dnodes = kmem_alloc(sizeof (dnode_children_t) +
-                   (epb - 1) * sizeof (dnode_handle_t),
-                   KM_PUSHPAGE | KM_NODEBUG);
+               children_dnodes = kmem_zalloc(sizeof (dnode_children_t) +
+                   epb * sizeof (dnode_handle_t), KM_SLEEP);
                children_dnodes->dnc_count = epb;
                dnh = &children_dnodes->dnc_children[0];
                for (i = 0; i < epb; i++) {
                        zrl_init(&dnh[i].dnh_zrlock);
-                       dnh[i].dnh_dnode = NULL;
                }
-               if ((winner = dmu_buf_set_user(&db->db, children_dnodes, NULL,
-                   dnode_buf_pageout))) {
+               dmu_buf_init_user(&children_dnodes->dnc_dbu, NULL,
+                   dnode_buf_evict_async, NULL);
+               winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu);
+               if (winner != NULL) {
+
+                       for (i = 0; i < epb; i++) {
+                               zrl_destroy(&dnh[i].dnh_zrlock);
+                       }
+
                        kmem_free(children_dnodes, sizeof (dnode_children_t) +
-                           (epb - 1) * sizeof (dnode_handle_t));
+                           epb * sizeof (dnode_handle_t));
                        children_dnodes = winner;
                }
        }
        ASSERT(children_dnodes->dnc_count == epb);
 
-       dnh = &children_dnodes->dnc_children[idx];
-       zrl_add(&dnh->dnh_zrlock);
-       if ((dn = dnh->dnh_dnode) == NULL) {
-               dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
-               dnode_t *winner;
+       idx = object & (epb - 1);
+       dn_block_begin = (dnode_phys_t *)db->db.db_data;
 
-               dn = dnode_create(os, phys, db, object, dnh);
-               winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn);
-               if (winner != NULL) {
-                       zrl_add(&dnh->dnh_zrlock);
-                       dnode_destroy(dn); /* implicit zrl_remove() */
-                       dn = winner;
-               }
+       if ((flag & DNODE_MUST_BE_FREE) && !dnode_is_free(db, idx, slots)) {
+               dbuf_rele(db, FTAG);
+               return (ENOSPC);
+       } else if ((flag & DNODE_MUST_BE_ALLOCATED) &&
+           !dnode_is_allocated(db, idx)) {
+               dbuf_rele(db, FTAG);
+               return (ENOENT);
        }
 
+       dnh = &children_dnodes->dnc_children[idx];
+       zrl_add(&dnh->dnh_zrlock);
+       dn = dnh->dnh_dnode;
+       if (dn == NULL)
+               dn = dnode_create(os, dn_block_begin + idx, db, object, dnh);
+
        mutex_enter(&dn->dn_mtx);
        type = dn->dn_type;
        if (dn->dn_free_txg ||
-           ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
-           ((flag & DNODE_MUST_BE_FREE) &&
-           (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
+           ((flag & DNODE_MUST_BE_FREE) && !refcount_is_zero(&dn->dn_holds))) {
                mutex_exit(&dn->dn_mtx);
                zrl_remove(&dnh->dnh_zrlock);
                dbuf_rele(db, FTAG);
                return (type == DMU_OT_NONE ? ENOENT : EEXIST);
        }
-       mutex_exit(&dn->dn_mtx);
-
        if (refcount_add(&dn->dn_holds, tag) == 1)
                dbuf_add_ref(db, dnh);
+       mutex_exit(&dn->dn_mtx);
+
        /* Now we can rely on the hold to prevent the dnode from moving. */
        zrl_remove(&dnh->dnh_zrlock);
 
@@ -1145,7 +1331,8 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag,
 int
 dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
 {
-       return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
+       return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag,
+           dnp));
 }
 
 /*
@@ -1168,13 +1355,19 @@ dnode_add_ref(dnode_t *dn, void *tag)
 
 void
 dnode_rele(dnode_t *dn, void *tag)
+{
+       mutex_enter(&dn->dn_mtx);
+       dnode_rele_and_unlock(dn, tag);
+}
+
+void
+dnode_rele_and_unlock(dnode_t *dn, void *tag)
 {
        uint64_t refs;
        /* Get while the hold prevents the dnode from moving. */
        dmu_buf_impl_t *db = dn->dn_dbuf;
        dnode_handle_t *dnh = dn->dn_handle;
 
-       mutex_enter(&dn->dn_mtx);
        refs = refcount_remove(&dn->dn_holds, tag);
        mutex_exit(&dn->dn_mtx);
 
@@ -1238,7 +1431,8 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
                return;
        }
 
-       ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
+       ASSERT(!refcount_is_zero(&dn->dn_holds) ||
+           !avl_is_empty(&dn->dn_dbufs));
        ASSERT(dn->dn_datablksz != 0);
        ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]);
        ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]);
@@ -1311,13 +1505,12 @@ dnode_free(dnode_t *dn, dmu_tx_t *tx)
 int
 dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
 {
-       dmu_buf_impl_t *db, *db_next;
+       dmu_buf_impl_t *db;
        int err;
 
+       ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
        if (size == 0)
                size = SPA_MINBLOCKSIZE;
-       if (size > SPA_MAXBLOCKSIZE)
-               size = SPA_MAXBLOCKSIZE;
        else
                size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
 
@@ -1330,13 +1523,12 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
        rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 
        /* Check for any allocated blocks beyond the first */
-       if (dn->dn_phys->dn_maxblkid != 0)
+       if (dn->dn_maxblkid != 0)
                goto fail;
 
        mutex_enter(&dn->dn_dbufs_mtx);
-       for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
-               db_next = list_next(&dn->dn_dbufs, db);
-
+       for (db = avl_first(&dn->dn_dbufs); db != NULL;
+           db = AVL_NEXT(&dn->dn_dbufs, db)) {
                if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
                    db->db_blkid != DMU_SPILL_BLKID) {
                        mutex_exit(&dn->dn_dbufs_mtx);
@@ -1349,7 +1541,7 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
                goto fail;
 
        /* resize the old block */
-       err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db);
+       err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
        if (err == 0)
                dbuf_new_size(db, size, tx);
        else if (err != ENOENT)
@@ -1416,6 +1608,8 @@ dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read)
            sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
                new_nlevels++;
 
+       ASSERT3U(new_nlevels, <=, DN_MAX_LEVELS);
+
        if (new_nlevels > dn->dn_nlevels) {
                int old_nlevels = dn->dn_nlevels;
                dmu_buf_impl_t *db;
@@ -1457,56 +1651,13 @@ out:
                rw_downgrade(&dn->dn_struct_rwlock);
 }
 
-void
-dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+static void
+dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
 {
-       avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
-       avl_index_t where;
-       free_range_t *rp;
-       free_range_t rp_tofind;
-       uint64_t endblk = blkid + nblks;
-
-       ASSERT(MUTEX_HELD(&dn->dn_mtx));
-       ASSERT(nblks <= UINT64_MAX - blkid); /* no overflow */
-
-       dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
-           blkid, nblks, tx->tx_txg);
-       rp_tofind.fr_blkid = blkid;
-       rp = avl_find(tree, &rp_tofind, &where);
-       if (rp == NULL)
-               rp = avl_nearest(tree, where, AVL_BEFORE);
-       if (rp == NULL)
-               rp = avl_nearest(tree, where, AVL_AFTER);
-
-       while (rp && (rp->fr_blkid <= blkid + nblks)) {
-               uint64_t fr_endblk = rp->fr_blkid + rp->fr_nblks;
-               free_range_t *nrp = AVL_NEXT(tree, rp);
-
-               if (blkid <= rp->fr_blkid && endblk >= fr_endblk) {
-                       /* clear this entire range */
-                       avl_remove(tree, rp);
-                       kmem_free(rp, sizeof (free_range_t));
-               } else if (blkid <= rp->fr_blkid &&
-                   endblk > rp->fr_blkid && endblk < fr_endblk) {
-                       /* clear the beginning of this range */
-                       rp->fr_blkid = endblk;
-                       rp->fr_nblks = fr_endblk - endblk;
-               } else if (blkid > rp->fr_blkid && blkid < fr_endblk &&
-                   endblk >= fr_endblk) {
-                       /* clear the end of this range */
-                       rp->fr_nblks = blkid - rp->fr_blkid;
-               } else if (blkid > rp->fr_blkid && endblk < fr_endblk) {
-                       /* clear a chunk out of this range */
-                       free_range_t *new_rp =
-                           kmem_alloc(sizeof (free_range_t), KM_PUSHPAGE);
-
-                       new_rp->fr_blkid = endblk;
-                       new_rp->fr_nblks = fr_endblk - endblk;
-                       avl_insert_here(tree, new_rp, rp, AVL_AFTER);
-                       rp->fr_nblks = blkid - rp->fr_blkid;
-               }
-               /* there may be no overlap */
-               rp = nrp;
+       dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
+       if (db != NULL) {
+               dmu_buf_will_dirty(&db->db, tx);
+               dbuf_rele(db, FTAG);
        }
 }
 
@@ -1524,7 +1675,7 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
        blkshift = dn->dn_datablkshift;
        epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 
-       if (len == -1ULL) {
+       if (len == DMU_OBJECT_END) {
                len = UINT64_MAX - off;
                trunc = TRUE;
        }
@@ -1540,7 +1691,13 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
        } else {
                ASSERT(dn->dn_maxblkid == 0);
                if (off == 0 && len >= blksz) {
-                       /* Freeing the whole block; fast-track this request */
+                       /*
+                        * Freeing the whole block; fast-track this request.
+                        * Note that we won't dirty any indirect blocks,
+                        * which is fine because we will be freeing the entire
+                        * file and thus all indirect blocks will be freed
+                        * by free_children().
+                        */
                        blkid = 0;
                        nblks = 1;
                        goto done;
@@ -1559,15 +1716,15 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
                ASSERT3U(blkoff + head, ==, blksz);
                if (len < head)
                        head = len;
-               if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE,
-                   FTAG, &db) == 0) {
+               if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
+                   TRUE, FALSE, FTAG, &db) == 0) {
                        caddr_t data;
 
                        /* don't dirty if it isn't on disk and isn't dirty */
                        if (db->db_last_dirty ||
                            (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
                                rw_exit(&dn->dn_struct_rwlock);
-                               dbuf_will_dirty(db, tx);
+                               dmu_buf_will_dirty(&db->db, tx);
                                rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
                                data = db->db.db_data;
                                bzero(data + blkoff, head);
@@ -1597,13 +1754,13 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
        if (tail) {
                if (len < tail)
                        tail = len;
-               if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
-                   TRUE, FTAG, &db) == 0) {
+               if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
+                   TRUE, FALSE, FTAG, &db) == 0) {
                        /* don't dirty if not on disk and not dirty */
                        if (db->db_last_dirty ||
                            (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
                                rw_exit(&dn->dn_struct_rwlock);
-                               dbuf_will_dirty(db, tx);
+                               dmu_buf_will_dirty(&db->db, tx);
                                rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
                                bzero(db->db.db_data, tail);
                        }
@@ -1624,74 +1781,93 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
                nblks += 1;
 
        /*
-        * Read in and mark all the level-1 indirects dirty,
-        * so that they will stay in memory until syncing phase.
-        * Always dirty the first and last indirect to make sure
-        * we dirty all the partial indirects.
+        * Dirty all the indirect blocks in this range.  Note that only
+        * the first and last indirect blocks can actually be written
+        * (if they were partially freed) -- they must be dirtied, even if
+        * they do not exist on disk yet.  The interior blocks will
+        * be freed by free_children(), so they will not actually be written.
+        * Even though these interior blocks will not be written, we
+        * dirty them for two reasons:
+        *
+        *  - It ensures that the indirect blocks remain in memory until
+        *    syncing context.  (They have already been prefetched by
+        *    dmu_tx_hold_free(), so we don't have to worry about reading
+        *    them serially here.)
+        *
+        *  - The dirty space accounting will put pressure on the txg sync
+        *    mechanism to begin syncing, and to delay transactions if there
+        *    is a large amount of freeing.  Even though these indirect
+        *    blocks will not be written, we could need to write the same
+        *    amount of space if we copy the freed BPs into deadlists.
         */
        if (dn->dn_nlevels > 1) {
-               uint64_t i, first, last;
-               int shift = epbs + dn->dn_datablkshift;
+               uint64_t first, last, i, ibyte;
+               int shift, err;
 
                first = blkid >> epbs;
-               if ((db = dbuf_hold_level(dn, 1, first, FTAG))) {
-                       dbuf_will_dirty(db, tx);
-                       dbuf_rele(db, FTAG);
-               }
+               dnode_dirty_l1(dn, first, tx);
                if (trunc)
                        last = dn->dn_maxblkid >> epbs;
                else
                        last = (blkid + nblks - 1) >> epbs;
-               if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) {
-                       dbuf_will_dirty(db, tx);
-                       dbuf_rele(db, FTAG);
-               }
-               for (i = first + 1; i < last; i++) {
-                       uint64_t ibyte = i << shift;
-                       int err;
+               if (last != first)
+                       dnode_dirty_l1(dn, last, tx);
 
-                       err = dnode_next_offset(dn,
-                           DNODE_FIND_HAVELOCK, &ibyte, 1, 1, 0);
+               shift = dn->dn_datablkshift + dn->dn_indblkshift -
+                   SPA_BLKPTRSHIFT;
+               for (i = first + 1; i < last; i++) {
+                       /*
+                        * Set i to the blockid of the next non-hole
+                        * level-1 indirect block at or after i.  Note
+                        * that dnode_next_offset() operates in terms of
+                        * level-0-equivalent bytes.
+                        */
+                       ibyte = i << shift;
+                       err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
+                           &ibyte, 2, 1, 0);
                        i = ibyte >> shift;
-                       if (err == ESRCH || i >= last)
+                       if (i >= last)
                                break;
-                       ASSERT(err == 0);
-                       db = dbuf_hold_level(dn, 1, i, FTAG);
-                       if (db) {
-                               dbuf_will_dirty(db, tx);
-                               dbuf_rele(db, FTAG);
-                       }
+
+                       /*
+                        * Normally we should not see an error, either
+                        * from dnode_next_offset() or dbuf_hold_level()
+                        * (except for ESRCH from dnode_next_offset).
+                        * If there is an i/o error, then when we read
+                        * this block in syncing context, it will use
+                        * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
+                        * to the "failmode" property.  dnode_next_offset()
+                        * doesn't have a flag to indicate MUSTSUCCEED.
+                        */
+                       if (err != 0)
+                               break;
+
+                       dnode_dirty_l1(dn, i, tx);
                }
        }
+
 done:
        /*
         * Add this range to the dnode range list.
         * We will finish up this free operation in the syncing phase.
         */
        mutex_enter(&dn->dn_mtx);
-       dnode_clear_range(dn, blkid, nblks, tx);
        {
-               free_range_t *rp, *found;
-               avl_index_t where;
-               avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
-
-               /* Add new range to dn_ranges */
-               rp = kmem_alloc(sizeof (free_range_t), KM_PUSHPAGE);
-               rp->fr_blkid = blkid;
-               rp->fr_nblks = nblks;
-               found = avl_find(tree, rp, &where);
-               ASSERT(found == NULL);
-               avl_insert(tree, rp, where);
-               dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
-                   blkid, nblks, tx->tx_txg);
+       int txgoff = tx->tx_txg & TXG_MASK;
+       if (dn->dn_free_ranges[txgoff] == NULL) {
+               dn->dn_free_ranges[txgoff] =
+                   range_tree_create(NULL, NULL, &dn->dn_mtx);
+       }
+       range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
+       range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
        }
+       dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
+           blkid, nblks, tx->tx_txg);
        mutex_exit(&dn->dn_mtx);
 
        dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
        dnode_setdirty(dn, tx);
 out:
-       if (trunc && dn->dn_maxblkid >= (off >> blkshift))
-               dn->dn_maxblkid = (off >> blkshift ? (off >> blkshift) - 1 : 0);
 
        rw_exit(&dn->dn_struct_rwlock);
 }
@@ -1714,7 +1890,6 @@ dnode_spill_freed(dnode_t *dn)
 uint64_t
 dnode_block_freed(dnode_t *dn, uint64_t blkid)
 {
-       free_range_t range_tofind;
        void *dp = spa_get_dsl(dn->dn_objset->os_spa);
        int i;
 
@@ -1734,20 +1909,10 @@ dnode_block_freed(dnode_t *dn, uint64_t blkid)
        if (blkid == DMU_SPILL_BLKID)
                return (dnode_spill_freed(dn));
 
-       range_tofind.fr_blkid = blkid;
        mutex_enter(&dn->dn_mtx);
        for (i = 0; i < TXG_SIZE; i++) {
-               free_range_t *range_found;
-               avl_index_t idx;
-
-               range_found = avl_find(&dn->dn_ranges[i], &range_tofind, &idx);
-               if (range_found) {
-                       ASSERT(range_found->fr_nblks > 0);
-                       break;
-               }
-               range_found = avl_nearest(&dn->dn_ranges[i], idx, AVL_BEFORE);
-               if (range_found &&
-                   range_found->fr_blkid + range_found->fr_nblks > blkid)
+               if (dn->dn_free_ranges[i] != NULL &&
+                   range_tree_contains(dn->dn_free_ranges[i], blkid, 1))
                        break;
        }
        mutex_exit(&dn->dn_mtx);
@@ -1784,23 +1949,22 @@ dnode_diduse_space(dnode_t *dn, int64_t delta)
 }
 
 /*
- * Call when we think we're going to write/free space in open context.
- * Be conservative (ie. OK to write less than this or free more than
- * this, but don't write more or free less).
+ * Call when we think we're going to write/free space in open context to track
+ * the amount of memory in use by the currently open txg.
  */
 void
 dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
 {
        objset_t *os = dn->dn_objset;
        dsl_dataset_t *ds = os->os_dsl_dataset;
+       int64_t aspace = spa_get_asize(os->os_spa, space);
 
-       if (space > 0)
-               space = spa_get_asize(os->os_spa, space);
-
-       if (ds)
-               dsl_dir_willuse_space(ds->ds_dir, space, tx);
+       if (ds != NULL) {
+               dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
+               dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
+       }
 
-       dmu_tx_willuse_space(tx, space);
+       dmu_tx_willuse_space(tx, aspace);
 }
 
 /*
@@ -1823,7 +1987,7 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
  */
 static int
 dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
-       int lvl, uint64_t blkfill, uint64_t txg)
+    int lvl, uint64_t blkfill, uint64_t txg)
 {
        dmu_buf_impl_t *db = NULL;
        void *data = NULL;
@@ -1833,9 +1997,6 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
        boolean_t hole;
        int i, inc, error, span;
 
-       dprintf("probing object %llu offset %llx level %d of %u\n",
-           dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
-
        hole = ((flags & DNODE_FIND_HOLE) != 0);
        inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
        ASSERT(txg == 0 || !hole);
@@ -1845,8 +2006,8 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
                epb = dn->dn_phys->dn_nblkptr;
                data = dn->dn_phys->dn_blkptr;
        } else {
-               uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl);
-               error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db);
+               uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
+               error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
                if (error) {
                        if (error != ENOENT)
                                return (error);
@@ -1869,8 +2030,10 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
                data = db->db.db_data;
        }
 
-       if (db && txg &&
-           (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg)) {
+
+       if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
+           db->db_blkptr->blk_birth <= txg ||
+           BP_IS_HOLE(db->db_blkptr))) {
                /*
                 * This can only happen when we are searching up the tree
                 * and these conditions mean that we need to keep climbing.
@@ -1878,17 +2041,21 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
                error = SET_ERROR(ESRCH);
        } else if (lvl == 0) {
                dnode_phys_t *dnp = data;
-               span = DNODE_SHIFT;
+
                ASSERT(dn->dn_type == DMU_OT_DNODE);
+               ASSERT(!(flags & DNODE_FIND_BACKWARDS));
 
-               for (i = (*offset >> span) & (blkfill - 1);
-                   i >= 0 && i < blkfill; i += inc) {
+               for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1);
+                   i < blkfill; i += dnp[i].dn_extra_slots + 1) {
                        if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
                                break;
-                       *offset += (1ULL << span) * inc;
                }
-               if (i < 0 || i == blkfill)
+
+               if (i == blkfill)
                        error = SET_ERROR(ESRCH);
+
+               *offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) +
+                   (i << DNODE_SHIFT);
        } else {
                blkptr_t *bp = data;
                uint64_t start = *offset;
@@ -1901,17 +2068,30 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
                else
                        minfill++;
 
-               *offset = *offset >> span;
+               if (span >= 8 * sizeof (*offset)) {
+                       /* This only happens on the highest indirection level */
+                       ASSERT3U((lvl - 1), ==, dn->dn_phys->dn_nlevels - 1);
+                       *offset = 0;
+               } else {
+                       *offset = *offset >> span;
+               }
+
                for (i = BF64_GET(*offset, 0, epbs);
                    i >= 0 && i < epb; i += inc) {
-                       if (bp[i].blk_fill >= minfill &&
-                           bp[i].blk_fill <= maxfill &&
+                       if (BP_GET_FILL(&bp[i]) >= minfill &&
+                           BP_GET_FILL(&bp[i]) <= maxfill &&
                            (hole || bp[i].blk_birth > txg))
                                break;
                        if (inc > 0 || *offset > 0)
                                *offset += inc;
                }
-               *offset = *offset << span;
+
+               if (span >= 8 * sizeof (*offset)) {
+                       *offset = start;
+               } else {
+                       *offset = *offset << span;
+               }
+
                if (inc < 0) {
                        /* traversing backwards; position offset at the end */
                        ASSERT3U(*offset, <=, start);
@@ -1992,6 +2172,15 @@ dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset,
                    flags, offset, lvl, blkfill, txg);
        }
 
+       /*
+        * There's always a "virtual hole" at the end of the object, even
+        * if all BP's which physically exist are non-holes.
+        */
+       if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 &&
+           minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) {
+               error = 0;
+       }
+
        if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
            initial_offset < *offset : initial_offset > *offset))
                error = SET_ERROR(ESRCH);