git.proxmox.com Git - mirror_zfs.git/commitdiff
Improved dnode allocation and dnode_hold_impl()
author Olaf Faaland <faaland1@llnl.gov>
Tue, 5 Sep 2017 23:15:04 +0000 (16:15 -0700)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Tue, 5 Sep 2017 23:15:04 +0000 (16:15 -0700)
Refactor dmu_object_alloc_dnsize() and dnode_hold_impl() to simplify the
code, fix errors introduced by commit dbeb879 (PR #6117) interacting
badly with large dnodes, and improve performance.

* When allocating a new dnode in dmu_object_alloc_dnsize(), update the
percpu object ID for the core's metadnode chunk immediately.  This
eliminates most lock contention when taking the hold and creating the
dnode.

* Correct detection of the chunk boundary to work properly with large
dnodes.

* Separate the dnode_hold_impl() code for the FREE case from the code for
the ALLOCATED case to make it easier to read.

* Fully populate the dnode handle array immediately after reading a
block of the metadnode from disk.  Subsequently the dnode handle array
provides enough information to determine which dnode slots are in use
and which are free.

* Add several kstats to allow the behavior of the code to be examined.

* Verify dnode packing in large_dnode_008_pos.ksh.  Since the test
performs only creates, it should leave very few holes in the metadnode.

* Add test large_dnode_009_pos.ksh, which performs concurrent creates
and deletes, to complement the existing test, which does only creates.

With the above fixes, there is very little contention in a test of about
200,000 racing dnode allocations produced by tests 'large_dnode_008_pos'
and 'large_dnode_009_pos'.

name                            type data
dnode_hold_dbuf_hold            4    0
dnode_hold_dbuf_read            4    0
dnode_hold_alloc_hits           4    3804690
dnode_hold_alloc_misses         4    216
dnode_hold_alloc_interior       4    3
dnode_hold_alloc_lock_retry     4    0
dnode_hold_alloc_lock_misses    4    0
dnode_hold_alloc_type_none      4    0
dnode_hold_free_hits            4    203105
dnode_hold_free_misses          4    4
dnode_hold_free_lock_misses     4    0
dnode_hold_free_lock_retry      4    0
dnode_hold_free_overflow        4    0
dnode_hold_free_refcount        4    57
dnode_hold_free_txg             4    0
dnode_allocate                  4    203154
dnode_reallocate                4    0
dnode_buf_evict                 4    23918
dnode_alloc_next_chunk          4    4887
dnode_alloc_race                4    0
dnode_alloc_next_block          4    18

The performance is slightly improved for concurrent creates with
16+ threads, and unchanged for low thread counts.

Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Olaf Faaland <faaland1@llnl.gov>
Closes #5396
Closes #6522
Closes #6414
Closes #6564

cmd/zdb/zdb.c
include/sys/dnode.h
module/zfs/dbuf_stats.c
module/zfs/dmu_object.c
module/zfs/dnode.c
tests/runfiles/linux.run
tests/zfs-tests/tests/functional/features/large_dnode/Makefile.am
tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_008_pos.ksh
tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_009_pos.ksh [new file with mode: 0755]

index 88b1040738e987a110075eaf15dacdb6af8be665..ae8d00f154427bd5a8b2b51bea8d638a1a14fed6 100644 (file)
@@ -1934,7 +1934,8 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = {
 };
 
 static void
-dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
+dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header,
+    uint64_t *dnode_slots_used)
 {
        dmu_buf_t *db = NULL;
        dmu_object_info_t doi;
@@ -1984,6 +1985,9 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header)
                }
        }
 
+       if (dnode_slots_used)
+               *dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE;
+
        zdb_nicenum(doi.doi_metadata_block_size, iblk);
        zdb_nicenum(doi.doi_data_block_size, dblk);
        zdb_nicenum(doi.doi_max_offset, lsize);
@@ -2104,6 +2108,9 @@ dump_dir(objset_t *os)
        int verbosity = dump_opt['d'];
        int print_header = 1;
        int i, error;
+       uint64_t total_slots_used = 0;
+       uint64_t max_slot_used = 0;
+       uint64_t dnode_slots;
 
        dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
        dmu_objset_fast_stat(os, &dds);
@@ -2144,7 +2151,7 @@ dump_dir(objset_t *os)
        if (zopt_objects != 0) {
                for (i = 0; i < zopt_objects; i++)
                        dump_object(os, zopt_object[i], verbosity,
-                           &print_header);
+                           &print_header, NULL);
                (void) printf("\n");
                return;
        }
@@ -2161,24 +2168,39 @@ dump_dir(objset_t *os)
        if (BP_IS_HOLE(os->os_rootbp))
                return;
 
-       dump_object(os, 0, verbosity, &print_header);
+       dump_object(os, 0, verbosity, &print_header, NULL);
        object_count = 0;
        if (DMU_USERUSED_DNODE(os) != NULL &&
            DMU_USERUSED_DNODE(os)->dn_type != 0) {
-               dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header);
-               dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header);
+               dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header,
+                   NULL);
+               dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header,
+                   NULL);
        }
 
        object = 0;
        while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
-               dump_object(os, object, verbosity, &print_header);
+               dump_object(os, object, verbosity, &print_header, &dnode_slots);
                object_count++;
+               total_slots_used += dnode_slots;
+               max_slot_used = object + dnode_slots - 1;
        }
 
        ASSERT3U(object_count, ==, usedobjs);
 
        (void) printf("\n");
 
+       (void) printf("    Dnode slots:\n");
+       (void) printf("\tTotal used:    %10llu\n",
+           (u_longlong_t)total_slots_used);
+       (void) printf("\tMax used:      %10llu\n",
+           (u_longlong_t)max_slot_used);
+       (void) printf("\tPercent empty: %10lf\n",
+           (double)(max_slot_used - total_slots_used)*100 /
+           (double)max_slot_used);
+
+       (void) printf("\n");
+
        if (error != ESRCH) {
                (void) fprintf(stderr, "dmu_object_next() = %d\n", error);
                abort();
@@ -2642,7 +2664,7 @@ dump_path_impl(objset_t *os, uint64_t obj, char *name)
                        return (dump_path_impl(os, child_obj, s + 1));
                /*FALLTHROUGH*/
        case DMU_OT_PLAIN_FILE_CONTENTS:
-               dump_object(os, child_obj, dump_opt['v'], &header);
+               dump_object(os, child_obj, dump_opt['v'], &header, NULL);
                return (0);
        default:
                (void) fprintf(stderr, "object %llu has non-file/directory "
index 5d589a95c5fdeaf29f51d86bc9ddb36bdb5970e3..e5e39b18c9e92b5319debef4518515ffec15c490 100644 (file)
@@ -98,6 +98,13 @@ extern "C" {
 #define        DN_ZERO_BONUSLEN        (DN_BONUS_SIZE(DNODE_MAX_SIZE) + 1)
 #define        DN_KILL_SPILLBLK (1)
 
+#define        DN_SLOT_UNINIT          ((void *)NULL)  /* Uninitialized */
+#define        DN_SLOT_FREE            ((void *)1UL)   /* Free slot */
+#define        DN_SLOT_ALLOCATED       ((void *)2UL)   /* Allocated slot */
+#define        DN_SLOT_INTERIOR        ((void *)3UL)   /* Interior allocated slot */
+#define        DN_SLOT_IS_PTR(dn)      ((void *)dn > DN_SLOT_INTERIOR)
+#define        DN_SLOT_IS_VALID(dn)    ((void *)dn != NULL)
+
 #define        DNODES_PER_BLOCK_SHIFT  (DNODE_BLOCK_SHIFT - DNODE_SHIFT)
 #define        DNODES_PER_BLOCK        (1ULL << DNODES_PER_BLOCK_SHIFT)
 
@@ -419,6 +426,135 @@ void dnode_evict_bonus(dnode_t *dn);
        ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL ||         \
        (_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA)
 
+/*
+ * Used for dnodestats kstat.
+ */
+typedef struct dnode_stats {
+       /*
+        * Number of failed attempts to hold a meta dnode dbuf.
+        */
+       kstat_named_t dnode_hold_dbuf_hold;
+       /*
+        * Number of failed attempts to read a meta dnode dbuf.
+        */
+       kstat_named_t dnode_hold_dbuf_read;
+       /*
+        * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was able
+        * to hold the requested object number which was allocated.  This is
+        * the common case when looking up any allocated object number.
+        */
+       kstat_named_t dnode_hold_alloc_hits;
+       /*
+        * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not
+        * able to hold the requested object number because it was unallocated.
+        */
+       kstat_named_t dnode_hold_alloc_misses;
+       /*
+        * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not
+        * able to hold the requested object number because the object number
+        * refers to an interior large dnode slot.
+        */
+       kstat_named_t dnode_hold_alloc_interior;
+       /*
+        * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) needed
+        * to retry acquiring slot zrl locks due to contention.
+        */
+       kstat_named_t dnode_hold_alloc_lock_retry;
+       /*
+        * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) did not
+        * need to create the dnode because another thread did so after
+        * dropping the read lock but before acquiring the write lock.
+        */
+       kstat_named_t dnode_hold_alloc_lock_misses;
+       /*
+        * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) found
+        * a free dnode instantiated by dnode_create() but not yet allocated
+        * by dnode_allocate().
+        */
+       kstat_named_t dnode_hold_alloc_type_none;
+       /*
+        * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was able
+        * to hold the requested range of free dnode slots.
+        */
+       kstat_named_t dnode_hold_free_hits;
+       /*
+        * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not
+        * able to hold the requested range of free dnode slots because
+        * at least one slot was allocated.
+        */
+       kstat_named_t dnode_hold_free_misses;
+       /*
+        * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not
+        * able to hold the requested range of free dnode slots because
+        * after acquiring the zrl lock at least one slot was allocated.
+        */
+       kstat_named_t dnode_hold_free_lock_misses;
+       /*
+        * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) needed
+        * to retry acquiring slot zrl locks due to contention.
+        */
+       kstat_named_t dnode_hold_free_lock_retry;
+       /*
+        * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested
+        * a range of dnode slots which were held by another thread.
+        */
+       kstat_named_t dnode_hold_free_refcount;
+       /*
+        * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested
+        * a range of dnode slots which would overflow the dnode_phys_t.
+        */
+       kstat_named_t dnode_hold_free_overflow;
+       /*
+        * Number of times a dnode_hold(...) was attempted on a dnode
+        * which had already been unlinked in an earlier txg.
+        */
+       kstat_named_t dnode_hold_free_txg;
+       /*
+        * Number of new dnodes allocated by dnode_allocate().
+        */
+       kstat_named_t dnode_allocate;
+       /*
+        * Number of dnodes re-allocated by dnode_reallocate().
+        */
+       kstat_named_t dnode_reallocate;
+       /*
+        * Number of meta dnode dbufs evicted.
+        */
+       kstat_named_t dnode_buf_evict;
+       /*
+        * Number of times dmu_object_alloc*() reached the end of the existing
+        * object ID chunk and advanced to a new one.
+        */
+       kstat_named_t dnode_alloc_next_chunk;
+       /*
+        * Number of times multiple threads attempted to allocate a dnode
+        * from the same block of free dnodes.
+        */
+       kstat_named_t dnode_alloc_race;
+       /*
+        * Number of times dmu_object_alloc*() was forced to advance to the
+        * next meta dnode dbuf due to an error from dmu_object_next().
+        */
+       kstat_named_t dnode_alloc_next_block;
+       /*
+        * Statistics for tracking dnodes which have been moved.
+        */
+       kstat_named_t dnode_move_invalid;
+       kstat_named_t dnode_move_recheck1;
+       kstat_named_t dnode_move_recheck2;
+       kstat_named_t dnode_move_special;
+       kstat_named_t dnode_move_handle;
+       kstat_named_t dnode_move_rwlock;
+       kstat_named_t dnode_move_active;
+} dnode_stats_t;
+
+extern dnode_stats_t dnode_stats;
+
+#define        DNODE_STAT_INCR(stat, val) \
+    atomic_add_64(&dnode_stats.stat.value.ui64, (val));
+#define        DNODE_STAT_BUMP(stat) \
+    DNODE_STAT_INCR(stat, 1);
+
 #ifdef ZFS_DEBUG
 
 /*
index 4eec72a3684683b28841502e053b6375ab1a4a48..985bbd3e9be4b073c8db49c8e78811680d99c892 100644 (file)
@@ -72,8 +72,7 @@ __dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
        if (db->db_buf)
                arc_buf_info(db->db_buf, &abi, zfs_dbuf_state_index);
 
-       if (dn)
-               __dmu_object_info_from_dnode(dn, &doi);
+       __dmu_object_info_from_dnode(dn, &doi);
 
        nwritten = snprintf(buf, size,
            "%-16s %-8llu %-8lld %-8lld %-8lld %-8llu %-8llu %-5d %-5d %-5lu | "
index 38ce6746e2d4673f37721d1640bfd9ba6c1475d5..e7412b7509f446fb3ac6d5727fcd7230e9143c9c 100644 (file)
@@ -93,7 +93,10 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
                 * If we finished a chunk of dnodes, get a new one from
                 * the global allocator.
                 */
-               if (P2PHASE(object, dnodes_per_chunk) == 0) {
+               if ((P2PHASE(object, dnodes_per_chunk) == 0) ||
+                   (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) <
+                   dn_slots)) {
+                       DNODE_STAT_BUMP(dnode_alloc_next_chunk);
                        mutex_enter(&os->os_obj_lock);
                        ASSERT0(P2PHASE(os->os_obj_next_chunk,
                            dnodes_per_chunk));
@@ -157,6 +160,13 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
                        mutex_exit(&os->os_obj_lock);
                }
 
+               /*
+                * The value of (*cpuobj) before adding dn_slots is the object
+                * ID assigned to us.  The value afterwards is the object ID
+                * assigned to whoever wants to do an allocation next.
+                */
+               object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots;
+
                /*
                 * XXX We should check for an i/o error here and return
                 * up to our caller.  Actually we should pre-read it in
@@ -177,21 +187,20 @@ dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize,
                                rw_exit(&dn->dn_struct_rwlock);
                                dmu_tx_add_new_object(tx, dn);
                                dnode_rele(dn, FTAG);
-
-                               (void) atomic_swap_64(cpuobj,
-                                   object + dn_slots);
                                return (object);
                        }
                        rw_exit(&dn->dn_struct_rwlock);
                        dnode_rele(dn, FTAG);
+                       DNODE_STAT_BUMP(dnode_alloc_race);
                }
 
+               /*
+                * Skip to next known valid starting point on error.  This
+                * is the start of the next block of dnodes.
+                */
                if (dmu_object_next(os, &object, B_TRUE, 0) != 0) {
-                       /*
-                        * Skip to next known valid starting point for a
-                        * dnode.
-                        */
                        object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK);
+                       DNODE_STAT_BUMP(dnode_alloc_next_block);
                }
                (void) atomic_swap_64(cpuobj, object);
        }
@@ -304,24 +313,37 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
        if (*objectp == 0) {
                start_obj = 1;
        } else if (ds && ds->ds_feature_inuse[SPA_FEATURE_LARGE_DNODE]) {
+               uint64_t i = *objectp + 1;
+               uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1);
+               dmu_object_info_t doi;
+
                /*
-                * For large_dnode datasets, scan from the beginning of the
-                * dnode block to find the starting offset. This is needed
-                * because objectp could be part of a large dnode so we can't
-                * assume it's a hole even if dmu_object_info() returns ENOENT.
+                * Scan through the remaining meta dnode block.  The contents
+                * of each slot in the block are known so it can be quickly
+                * checked.  If the block is exhausted without a match then
+                * hand off to dnode_next_offset() for further scanning.
                 */
-               int epb = DNODE_BLOCK_SIZE >> DNODE_SHIFT;
-               int skip;
-               uint64_t i;
-
-               for (i = *objectp & ~(epb - 1); i <= *objectp; i += skip) {
-                       dmu_object_info_t doi;
-
+               while (i <= last_obj) {
                        error = dmu_object_info(os, i, &doi);
-                       if (error != 0)
-                               skip = 1;
-                       else
-                               skip = doi.doi_dnodesize >> DNODE_SHIFT;
+                       if (error == ENOENT) {
+                               if (hole) {
+                                       *objectp = i;
+                                       return (0);
+                               } else {
+                                       i++;
+                               }
+                       } else if (error == EEXIST) {
+                               i++;
+                       } else if (error == 0) {
+                               if (hole) {
+                                       i += doi.doi_dnodesize >> DNODE_SHIFT;
+                               } else {
+                                       *objectp = i;
+                                       return (0);
+                               }
+                       } else {
+                               return (error);
+                       }
                }
 
                start_obj = i;
index 8b3ec3aab0ae70f2fdc646f691026c54364bd800..08e57b2748e12fa1e01b5c4faf64702cd7f974ed 100644 (file)
 #include <sys/range_tree.h>
 #include <sys/trace_dnode.h>
 
+dnode_stats_t dnode_stats = {
+       { "dnode_hold_dbuf_hold",               KSTAT_DATA_UINT64 },
+       { "dnode_hold_dbuf_read",               KSTAT_DATA_UINT64 },
+       { "dnode_hold_alloc_hits",              KSTAT_DATA_UINT64 },
+       { "dnode_hold_alloc_misses",            KSTAT_DATA_UINT64 },
+       { "dnode_hold_alloc_interior",          KSTAT_DATA_UINT64 },
+       { "dnode_hold_alloc_lock_retry",        KSTAT_DATA_UINT64 },
+       { "dnode_hold_alloc_lock_misses",       KSTAT_DATA_UINT64 },
+       { "dnode_hold_alloc_type_none",         KSTAT_DATA_UINT64 },
+       { "dnode_hold_free_hits",               KSTAT_DATA_UINT64 },
+       { "dnode_hold_free_misses",             KSTAT_DATA_UINT64 },
+       { "dnode_hold_free_lock_misses",        KSTAT_DATA_UINT64 },
+       { "dnode_hold_free_lock_retry",         KSTAT_DATA_UINT64 },
+       { "dnode_hold_free_overflow",           KSTAT_DATA_UINT64 },
+       { "dnode_hold_free_refcount",           KSTAT_DATA_UINT64 },
+       { "dnode_hold_free_txg",                KSTAT_DATA_UINT64 },
+       { "dnode_allocate",                     KSTAT_DATA_UINT64 },
+       { "dnode_reallocate",                   KSTAT_DATA_UINT64 },
+       { "dnode_buf_evict",                    KSTAT_DATA_UINT64 },
+       { "dnode_alloc_next_chunk",             KSTAT_DATA_UINT64 },
+       { "dnode_alloc_race",                   KSTAT_DATA_UINT64 },
+       { "dnode_alloc_next_block",             KSTAT_DATA_UINT64 },
+       { "dnode_move_invalid",                 KSTAT_DATA_UINT64 },
+       { "dnode_move_recheck1",                KSTAT_DATA_UINT64 },
+       { "dnode_move_recheck2",                KSTAT_DATA_UINT64 },
+       { "dnode_move_special",                 KSTAT_DATA_UINT64 },
+       { "dnode_move_handle",                  KSTAT_DATA_UINT64 },
+       { "dnode_move_rwlock",                  KSTAT_DATA_UINT64 },
+       { "dnode_move_active",                  KSTAT_DATA_UINT64 },
+};
+
+static kstat_t *dnode_ksp;
 static kmem_cache_t *dnode_cache;
-/*
- * Define DNODE_STATS to turn on statistic gathering. By default, it is only
- * turned on when DEBUG is also defined.
- */
-#ifdef DEBUG
-#define        DNODE_STATS
-#endif /* DEBUG */
-
-#ifdef DNODE_STATS
-#define        DNODE_STAT_ADD(stat)                    ((stat)++)
-#else
-#define        DNODE_STAT_ADD(stat)                    /* nothing */
-#endif /* DNODE_STATS */
 
 ASSERTV(static dnode_phys_t dnode_phys_zero);
 
@@ -203,11 +222,24 @@ dnode_init(void)
        dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t),
            0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
        kmem_cache_set_move(dnode_cache, dnode_move);
+
+       dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc",
+           KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t),
+           KSTAT_FLAG_VIRTUAL);
+       if (dnode_ksp != NULL) {
+               dnode_ksp->ks_data = &dnode_stats;
+               kstat_install(dnode_ksp);
+       }
 }
 
 void
 dnode_fini(void)
 {
+       if (dnode_ksp != NULL) {
+               kstat_delete(dnode_ksp);
+               dnode_ksp = NULL;
+       }
+
        kmem_cache_destroy(dnode_cache);
        dnode_cache = NULL;
 }
@@ -391,7 +423,7 @@ dnode_setdblksz(dnode_t *dn, int size)
 }
 
 static dnode_t *
-dnode_create(objset_t *os, dnode_phys_t *dnp, int slots, dmu_buf_impl_t *db,
+dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
     uint64_t object, dnode_handle_t *dnh)
 {
        dnode_t *dn;
@@ -424,26 +456,18 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, int slots, dmu_buf_impl_t *db,
        dn->dn_compress = dnp->dn_compress;
        dn->dn_bonustype = dnp->dn_bonustype;
        dn->dn_bonuslen = dnp->dn_bonuslen;
+       dn->dn_num_slots = dnp->dn_extra_slots + 1;
        dn->dn_maxblkid = dnp->dn_maxblkid;
        dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
        dn->dn_id_flags = 0;
 
-       if (slots && dn->dn_type == DMU_OT_NONE)
-               dn->dn_num_slots = slots;
-       else
-               dn->dn_num_slots = dnp->dn_extra_slots + 1;
-
        dmu_zfetch_init(&dn->dn_zfetch, dn);
 
        ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
+       ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
+       ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode));
 
        mutex_enter(&os->os_lock);
-       if (dnh->dnh_dnode != NULL) {
-               /* Lost the allocation race. */
-               mutex_exit(&os->os_lock);
-               kmem_cache_free(dnode_cache, dn);
-               return (dnh->dnh_dnode);
-       }
 
        /*
         * Exclude special dnodes from os_dnodes so an empty os_dnodes
@@ -466,6 +490,7 @@ dnode_create(objset_t *os, dnode_phys_t *dnp, int slots, dmu_buf_impl_t *db,
        mutex_exit(&os->os_lock);
 
        arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE);
+
        return (dn);
 }
 
@@ -549,6 +574,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
 
        dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d dn_slots=%d\n",
            dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots);
+       DNODE_STAT_BUMP(dnode_allocate);
 
        ASSERT(dn->dn_type == DMU_OT_NONE);
        ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
@@ -636,6 +662,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
            DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
 
        dn_slots = dn_slots > 0 ? dn_slots : DNODE_MIN_SLOTS;
+       DNODE_STAT_BUMP(dnode_reallocate);
 
        /* clean up any unreferenced dbufs */
        dnode_evict_dbufs(dn);
@@ -697,18 +724,6 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
 }
 
 #ifdef _KERNEL
-#ifdef DNODE_STATS
-static struct {
-       uint64_t dms_dnode_invalid;
-       uint64_t dms_dnode_recheck1;
-       uint64_t dms_dnode_recheck2;
-       uint64_t dms_dnode_special;
-       uint64_t dms_dnode_handle;
-       uint64_t dms_dnode_rwlock;
-       uint64_t dms_dnode_active;
-} dnode_move_stats;
-#endif /* DNODE_STATS */
-
 static void
 dnode_move_impl(dnode_t *odn, dnode_t *ndn)
 {
@@ -866,7 +881,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
         */
        os = odn->dn_objset;
        if (!POINTER_IS_VALID(os)) {
-               DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid);
+               DNODE_STAT_BUMP(dnode_move_invalid);
                return (KMEM_CBRC_DONT_KNOW);
        }
 
@@ -876,7 +891,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
        rw_enter(&os_lock, RW_WRITER);
        if (os != odn->dn_objset) {
                rw_exit(&os_lock);
-               DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1);
+               DNODE_STAT_BUMP(dnode_move_recheck1);
                return (KMEM_CBRC_DONT_KNOW);
        }
 
@@ -894,7 +909,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
        if (os != odn->dn_objset) {
                mutex_exit(&os->os_lock);
                rw_exit(&os_lock);
-               DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2);
+               DNODE_STAT_BUMP(dnode_move_recheck2);
                return (KMEM_CBRC_DONT_KNOW);
        }
 
@@ -907,7 +922,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
        rw_exit(&os_lock);
        if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
                mutex_exit(&os->os_lock);
-               DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special);
+               DNODE_STAT_BUMP(dnode_move_special);
                return (KMEM_CBRC_NO);
        }
        ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
@@ -922,7 +937,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
         */
        if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
                mutex_exit(&os->os_lock);
-               DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle);
+               DNODE_STAT_BUMP(dnode_move_handle);
                return (KMEM_CBRC_LATER);
        }
 
@@ -938,7 +953,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
        if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
                zrl_exit(&odn->dn_handle->dnh_zrlock);
                mutex_exit(&os->os_lock);
-               DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock);
+               DNODE_STAT_BUMP(dnode_move_rwlock);
                return (KMEM_CBRC_LATER);
        }
 
@@ -964,7 +979,7 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
                rw_exit(&odn->dn_struct_rwlock);
                zrl_exit(&odn->dn_handle->dnh_zrlock);
                mutex_exit(&os->os_lock);
-               DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active);
+               DNODE_STAT_BUMP(dnode_move_active);
                return (KMEM_CBRC_LATER);
        }
 
@@ -988,6 +1003,78 @@ dnode_move(void *buf, void *newbuf, size_t size, void *arg)
 }
 #endif /* _KERNEL */
 
+static void
+dnode_slots_hold(dnode_children_t *children, int idx, int slots)
+{
+       ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+       for (int i = idx; i < idx + slots; i++) {
+               dnode_handle_t *dnh = &children->dnc_children[i];
+               zrl_add(&dnh->dnh_zrlock);
+       }
+}
+
+static void
+dnode_slots_rele(dnode_children_t *children, int idx, int slots)
+{
+       ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+       for (int i = idx; i < idx + slots; i++) {
+               dnode_handle_t *dnh = &children->dnc_children[i];
+
+               if (zrl_is_locked(&dnh->dnh_zrlock))
+                       zrl_exit(&dnh->dnh_zrlock);
+               else
+                       zrl_remove(&dnh->dnh_zrlock);
+       }
+}
+
+static int
+dnode_slots_tryenter(dnode_children_t *children, int idx, int slots)
+{
+       ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+       for (int i = idx; i < idx + slots; i++) {
+               dnode_handle_t *dnh = &children->dnc_children[i];
+
+               if (!zrl_tryenter(&dnh->dnh_zrlock)) {
+                       for (int j = idx; j < i; j++) {
+                               dnh = &children->dnc_children[j];
+                               zrl_exit(&dnh->dnh_zrlock);
+                       }
+
+                       return (0);
+               }
+       }
+
+       return (1);
+}
+
+static void
+dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr)
+{
+       ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+       for (int i = idx; i < idx + slots; i++) {
+               dnode_handle_t *dnh = &children->dnc_children[i];
+               dnh->dnh_dnode = ptr;
+       }
+}
+
+static boolean_t
+dnode_check_slots(dnode_children_t *children, int idx, int slots, void *ptr)
+{
+       ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+       for (int i = idx; i < idx + slots; i++) {
+               dnode_handle_t *dnh = &children->dnc_children[i];
+               if (dnh->dnh_dnode != ptr)
+                       return (B_FALSE);
+       }
+
+       return (B_TRUE);
+}
+
 void
 dnode_special_close(dnode_handle_t *dnh)
 {
@@ -995,7 +1082,7 @@ dnode_special_close(dnode_handle_t *dnh)
 
        /*
         * Wait for final references to the dnode to clear.  This can
-        * only happen if the arc is asyncronously evicting state that
+        * only happen if the arc is asynchronously evicting state that
         * has a hold on this dnode while we are trying to evict this
         * dnode.
         */
@@ -1015,19 +1102,24 @@ dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
 {
        dnode_t *dn;
 
-       dn = dnode_create(os, dnp, 0, NULL, object, dnh);
        zrl_init(&dnh->dnh_zrlock);
+       zrl_tryenter(&dnh->dnh_zrlock);
+
+       dn = dnode_create(os, dnp, NULL, object, dnh);
        DNODE_VERIFY(dn);
+
+       zrl_exit(&dnh->dnh_zrlock);
 }
 
 static void
 dnode_buf_evict_async(void *dbu)
 {
-       dnode_children_t *children_dnodes = dbu;
-       int i;
+       dnode_children_t *dnc = dbu;
+
+       DNODE_STAT_BUMP(dnode_buf_evict);
 
-       for (i = 0; i < children_dnodes->dnc_count; i++) {
-               dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
+       for (int i = 0; i < dnc->dnc_count; i++) {
+               dnode_handle_t *dnh = &dnc->dnc_children[i];
                dnode_t *dn;
 
                /*
@@ -1035,8 +1127,9 @@ dnode_buf_evict_async(void *dbu)
                 * another valid address, so there is no need here to guard
                 * against changes to or from NULL.
                 */
-               if (dnh->dnh_dnode == NULL) {
+               if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
                        zrl_destroy(&dnh->dnh_zrlock);
+                       dnh->dnh_dnode = DN_SLOT_UNINIT;
                        continue;
                }
 
@@ -1051,128 +1144,12 @@ dnode_buf_evict_async(void *dbu)
                ASSERT(refcount_is_zero(&dn->dn_holds));
                ASSERT(refcount_is_zero(&dn->dn_tx_holds));
 
-               dnode_destroy(dn); /* implicit zrl_remove() */
+               dnode_destroy(dn); /* implicit zrl_remove() for first slot */
                zrl_destroy(&dnh->dnh_zrlock);
-               dnh->dnh_dnode = NULL;
-       }
-       kmem_free(children_dnodes, sizeof (dnode_children_t) +
-           children_dnodes->dnc_count * sizeof (dnode_handle_t));
-}
-
-/*
- * Return true if the given index is interior to a dnode already
- * allocated in the block. That is, the index is neither free nor
- * allocated, but is consumed by a large dnode.
- *
- * The dnode_phys_t buffer may not be in sync with the in-core dnode
- * structure, so we try to check the dnode structure first and fall back
- * to the dnode_phys_t buffer it doesn't exist.  When an in-code dnode
- * exists we can always trust dn->dn_num_slots to be accurate, even for
- * a held dnode which has not yet been fully allocated.
- */
-static boolean_t
-dnode_is_consumed(dnode_children_t *children, dnode_phys_t *dn_block, int idx)
-{
-       int skip, i;
-
-       for (i = 0; i < idx; i += skip) {
-               dnode_handle_t *dnh = &children->dnc_children[i];
-
-               if (dnh->dnh_dnode != NULL) {
-                       skip = dnh->dnh_dnode->dn_num_slots;
-               } else {
-                       if (dn_block[i].dn_type != DMU_OT_NONE)
-                               skip = dn_block[i].dn_extra_slots + 1;
-                       else
-                               skip = 1;
-               }
-       }
-
-       return (i > idx);
-}
-
-/*
- * Return true if the given index in the dnode block is a valid
- * allocated dnode. That is, the index is not consumed by a large
- * dnode and is not free.
- *
- * The dnode_phys_t buffer may not be in sync with the in-core dnode
- * structure, so we try to check the dnode structure first and fall back
- * to the dnode_phys_t buffer it doesn't exist.
- */
-static boolean_t
-dnode_is_allocated(dnode_children_t *children, dnode_phys_t *dn_block, int idx)
-{
-       dnode_handle_t *dnh;
-       dmu_object_type_t ot;
-
-       if (dnode_is_consumed(children, dn_block, idx))
-               return (B_FALSE);
-
-       dnh = &children->dnc_children[idx];
-       if (dnh->dnh_dnode != NULL)
-               ot = dnh->dnh_dnode->dn_type;
-       else
-               ot = dn_block[idx].dn_type;
-
-       return (ot != DMU_OT_NONE);
-}
-
-/*
- * Return true if the given range of indices in the dnode block are
- * free. That is, the starting index is not consumed by a large dnode
- * and none of the indices are allocated.
- *
- * The dnode_phys_t buffer may not be in sync with the in-core dnode
- * structure, so we try to check the dnode structure first and fall back
- * to the dnode_phys_t buffer it doesn't exist.
- */
-static boolean_t
-dnode_is_free(dnode_children_t *children, dnode_phys_t *dn_block, int idx,
-    int slots)
-{
-       if (idx + slots > DNODES_PER_BLOCK)
-               return (B_FALSE);
-
-       if (dnode_is_consumed(children, dn_block, idx))
-               return (B_FALSE);
-
-       for (int i = idx; i < idx + slots; i++) {
-               dnode_handle_t *dnh = &children->dnc_children[i];
-               dmu_object_type_t ot;
-
-               if (dnh->dnh_dnode != NULL) {
-                       if (dnh->dnh_dnode->dn_num_slots > 1)
-                               return (B_FALSE);
-
-                       ot = dnh->dnh_dnode->dn_type;
-               } else {
-                       ot = dn_block[i].dn_type;
-               }
-
-               if (ot != DMU_OT_NONE)
-                       return (B_FALSE);
-       }
-
-       return (B_TRUE);
-}
-
-static void
-dnode_hold_slots(dnode_children_t *children, int idx, int slots)
-{
-       for (int i = idx; i < MIN(idx + slots, DNODES_PER_BLOCK); i++) {
-               dnode_handle_t *dnh = &children->dnc_children[i];
-               zrl_add(&dnh->dnh_zrlock);
-       }
-}
-
-static void
-dnode_rele_slots(dnode_children_t *children, int idx, int slots)
-{
-       for (int i = idx; i < MIN(idx + slots, DNODES_PER_BLOCK); i++) {
-               dnode_handle_t *dnh = &children->dnc_children[i];
-               zrl_remove(&dnh->dnh_zrlock);
+               dnh->dnh_dnode = DN_SLOT_UNINIT;
        }
+       kmem_free(dnc, sizeof (dnode_children_t) +
+           dnc->dnc_count * sizeof (dnode_handle_t));
 }
 
 /*
@@ -1189,24 +1166,27 @@ dnode_rele_slots(dnode_children_t *children, int idx, int slots)
  * ENOENT.
  *
  * errors:
- * EINVAL - invalid object number.
- * ENOSPC - hole too small to fulfill "slots" request
- * ENOENT - the requested dnode is not allocated
- * EIO - i/o error.
+ * EINVAL - Invalid object number or flags.
+ * ENOSPC - Hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE)
+ * EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE)
+ *        - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED)
+ * ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED)
+ * EIO    - I/O error when reading the meta dnode dbuf.
+ *
  * succeeds even for free dnodes.
  */
 int
 dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
     void *tag, dnode_t **dnp)
 {
-       int epb, idx, err, i;
+       int epb, idx, err;
        int drop_struct_lock = FALSE;
        int type;
        uint64_t blk;
        dnode_t *mdn, *dn;
        dmu_buf_impl_t *db;
-       dnode_children_t *children_dnodes;
-       dnode_phys_t *dn_block_begin;
+       dnode_children_t *dnc;
+       dnode_phys_t *dn_block;
        dnode_handle_t *dnh;
 
        ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
@@ -1256,8 +1236,10 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
        db = dbuf_hold(mdn, blk, FTAG);
        if (drop_struct_lock)
                rw_exit(&mdn->dn_struct_rwlock);
-       if (db == NULL)
+       if (db == NULL) {
+               DNODE_STAT_BUMP(dnode_hold_dbuf_hold);
                return (SET_ERROR(EIO));
+       }
 
        /*
         * We do not need to decrypt to read the dnode so it doesn't matter
@@ -1265,6 +1247,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
         */
        err = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_NO_DECRYPT);
        if (err) {
+               DNODE_STAT_BUMP(dnode_hold_dbuf_read);
                dbuf_rele(db, FTAG);
                return (err);
        }
@@ -1272,72 +1255,179 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
        ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
        epb = db->db.db_size >> DNODE_SHIFT;
 
+       idx = object & (epb - 1);
+       dn_block = (dnode_phys_t *)db->db.db_data;
+
        ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
-       children_dnodes = dmu_buf_get_user(&db->db);
-       if (children_dnodes == NULL) {
+       dnc = dmu_buf_get_user(&db->db);
+       dnh = NULL;
+       if (dnc == NULL) {
                dnode_children_t *winner;
-               children_dnodes = kmem_zalloc(sizeof (dnode_children_t) +
+               int skip = 0;
+
+               dnc = kmem_zalloc(sizeof (dnode_children_t) +
                    epb * sizeof (dnode_handle_t), KM_SLEEP);
-               children_dnodes->dnc_count = epb;
-               dnh = &children_dnodes->dnc_children[0];
-               for (i = 0; i < epb; i++) {
+               dnc->dnc_count = epb;
+               dnh = &dnc->dnc_children[0];
+
+               /* Initialize dnode slot status from dnode_phys_t */
+               for (int i = 0; i < epb; i++) {
                        zrl_init(&dnh[i].dnh_zrlock);
+
+                       if (skip) {
+                               skip--;
+                               continue;
+                       }
+
+                       if (dn_block[i].dn_type != DMU_OT_NONE) {
+                               int interior = dn_block[i].dn_extra_slots;
+
+                               dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED);
+                               dnode_set_slots(dnc, i + 1, interior,
+                                   DN_SLOT_INTERIOR);
+                               skip = interior;
+                       } else {
+                               dnh[i].dnh_dnode = DN_SLOT_FREE;
+                               skip = 0;
+                       }
                }
-               dmu_buf_init_user(&children_dnodes->dnc_dbu, NULL,
+
+               dmu_buf_init_user(&dnc->dnc_dbu, NULL,
                    dnode_buf_evict_async, NULL);
-               winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu);
+               winner = dmu_buf_set_user(&db->db, &dnc->dnc_dbu);
                if (winner != NULL) {
 
-                       for (i = 0; i < epb; i++) {
+                       for (int i = 0; i < epb; i++)
                                zrl_destroy(&dnh[i].dnh_zrlock);
-                       }
 
-                       kmem_free(children_dnodes, sizeof (dnode_children_t) +
+                       kmem_free(dnc, sizeof (dnode_children_t) +
                            epb * sizeof (dnode_handle_t));
-                       children_dnodes = winner;
+                       dnc = winner;
                }
        }
-       ASSERT(children_dnodes->dnc_count == epb);
 
-       idx = object & (epb - 1);
-       dn_block_begin = (dnode_phys_t *)db->db.db_data;
+       ASSERT(dnc->dnc_count == epb);
+       dn = DN_SLOT_UNINIT;
 
-       dnode_hold_slots(children_dnodes, idx, slots);
+       if (flag & DNODE_MUST_BE_ALLOCATED) {
+               slots = 1;
 
-       if ((flag & DNODE_MUST_BE_FREE) &&
-           !dnode_is_free(children_dnodes, dn_block_begin, idx, slots)) {
-               dnode_rele_slots(children_dnodes, idx, slots);
-               dbuf_rele(db, FTAG);
-               return (SET_ERROR(ENOSPC));
-       } else if ((flag & DNODE_MUST_BE_ALLOCATED) &&
-           !dnode_is_allocated(children_dnodes, dn_block_begin, idx)) {
-               dnode_rele_slots(children_dnodes, idx, slots);
+               while (dn == DN_SLOT_UNINIT) {
+                       dnode_slots_hold(dnc, idx, slots);
+                       dnh = &dnc->dnc_children[idx];
+
+                       if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+                               dn = dnh->dnh_dnode;
+                               break;
+                       } else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) {
+                               DNODE_STAT_BUMP(dnode_hold_alloc_interior);
+                               dnode_slots_rele(dnc, idx, slots);
+                               dbuf_rele(db, FTAG);
+                               return (SET_ERROR(EEXIST));
+                       } else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) {
+                               DNODE_STAT_BUMP(dnode_hold_alloc_misses);
+                               dnode_slots_rele(dnc, idx, slots);
+                               dbuf_rele(db, FTAG);
+                               return (SET_ERROR(ENOENT));
+                       }
+
+                       dnode_slots_rele(dnc, idx, slots);
+                       if (!dnode_slots_tryenter(dnc, idx, slots)) {
+                               DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry);
+                               continue;
+                       }
+
+                       /*
+                        * Someone else won the race and called dnode_create()
+                        * after we checked DN_SLOT_IS_PTR() above but before
+                        * we acquired the lock.
+                        */
+                       if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+                               DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses);
+                               dn = dnh->dnh_dnode;
+                       } else {
+                               dn = dnode_create(os, dn_block + idx, db,
+                                   object, dnh);
+                       }
+               }
+
+               mutex_enter(&dn->dn_mtx);
+               if (dn->dn_type == DMU_OT_NONE) {
+                       DNODE_STAT_BUMP(dnode_hold_alloc_type_none);
+                       mutex_exit(&dn->dn_mtx);
+                       dnode_slots_rele(dnc, idx, slots);
+                       dbuf_rele(db, FTAG);
+                       return (SET_ERROR(ENOENT));
+               }
+
+               DNODE_STAT_BUMP(dnode_hold_alloc_hits);
+       } else if (flag & DNODE_MUST_BE_FREE) {
+
+               if (idx + slots - 1 >= DNODES_PER_BLOCK) {
+                       DNODE_STAT_BUMP(dnode_hold_free_overflow);
+                       dbuf_rele(db, FTAG);
+                       return (SET_ERROR(ENOSPC));
+               }
+
+               while (dn == DN_SLOT_UNINIT) {
+                       dnode_slots_hold(dnc, idx, slots);
+
+                       if (!dnode_check_slots(dnc, idx, slots, DN_SLOT_FREE)) {
+                               DNODE_STAT_BUMP(dnode_hold_free_misses);
+                               dnode_slots_rele(dnc, idx, slots);
+                               dbuf_rele(db, FTAG);
+                               return (SET_ERROR(ENOSPC));
+                       }
+
+                       dnode_slots_rele(dnc, idx, slots);
+                       if (!dnode_slots_tryenter(dnc, idx, slots)) {
+                               DNODE_STAT_BUMP(dnode_hold_free_lock_retry);
+                               continue;
+                       }
+
+                       if (!dnode_check_slots(dnc, idx, slots, DN_SLOT_FREE)) {
+                               DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
+                               dnode_slots_rele(dnc, idx, slots);
+                               dbuf_rele(db, FTAG);
+                               return (SET_ERROR(ENOSPC));
+                       }
+
+                       dnh = &dnc->dnc_children[idx];
+                       dn = dnode_create(os, dn_block + idx, db, object, dnh);
+               }
+
+               mutex_enter(&dn->dn_mtx);
+               if (!refcount_is_zero(&dn->dn_holds)) {
+                       DNODE_STAT_BUMP(dnode_hold_free_refcount);
+                       mutex_exit(&dn->dn_mtx);
+                       dnode_slots_rele(dnc, idx, slots);
+                       dbuf_rele(db, FTAG);
+                       return (SET_ERROR(EEXIST));
+               }
+
+               dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR);
+               DNODE_STAT_BUMP(dnode_hold_free_hits);
+       } else {
                dbuf_rele(db, FTAG);
-               return (SET_ERROR(ENOENT));
+               return (SET_ERROR(EINVAL));
        }
 
-       dnh = &children_dnodes->dnc_children[idx];
-       dn = dnh->dnh_dnode;
-       if (dn == NULL)
-               dn = dnode_create(os, dn_block_begin + idx, slots, db,
-                   object, dnh);
-
-       mutex_enter(&dn->dn_mtx);
-       type = dn->dn_type;
-       if (dn->dn_free_txg ||
-           ((flag & DNODE_MUST_BE_FREE) && !refcount_is_zero(&dn->dn_holds))) {
+       if (dn->dn_free_txg) {
+               DNODE_STAT_BUMP(dnode_hold_free_txg);
+               type = dn->dn_type;
                mutex_exit(&dn->dn_mtx);
-               dnode_rele_slots(children_dnodes, idx, slots);
+               dnode_slots_rele(dnc, idx, slots);
                dbuf_rele(db, FTAG);
                return (SET_ERROR(type == DMU_OT_NONE ? ENOENT : EEXIST));
        }
+
        if (refcount_add(&dn->dn_holds, tag) == 1)
                dbuf_add_ref(db, dnh);
 
        mutex_exit(&dn->dn_mtx);
 
        /* Now we can rely on the hold to prevent the dnode from moving. */
-       dnode_rele_slots(children_dnodes, idx, slots);
+       dnode_slots_rele(dnc, idx, slots);
 
        DNODE_VERIFY(dn);
        ASSERT3P(dn->dn_dbuf, ==, db);
index 422e0a773d39153d6404bf32be0af2fa26acf1cb..b8e5efd2145bdbbc10add9ca6b5acc43c7d31f2d 100644 (file)
@@ -384,7 +384,7 @@ tests = ['async_destroy_001_pos']
 [tests/functional/features/large_dnode]
 tests = ['large_dnode_001_pos', 'large_dnode_002_pos', 'large_dnode_003_pos',
          'large_dnode_004_neg', 'large_dnode_005_pos', 'large_dnode_006_pos',
-         'large_dnode_007_neg', 'large_dnode_008_pos']
+         'large_dnode_007_neg', 'large_dnode_008_pos', 'large_dnode_009_pos']
 
 [tests/functional/grow_pool]
 tests = ['grow_pool_001_pos']
index 69ec5e18a0ac68b03ee0232f6e159cfa3ed26c9c..13ba3ab33d9ebc4831a6c412def156c960608791 100644 (file)
@@ -9,4 +9,5 @@ dist_pkgdata_SCRIPTS = \
        large_dnode_005_pos.ksh \
        large_dnode_006_pos.ksh \
        large_dnode_007_neg.ksh \
-       large_dnode_008_pos.ksh
+       large_dnode_008_pos.ksh \
+       large_dnode_009_pos.ksh
index 1f900b5efbe175b609faf43be4c1494e05be3605..eac292cbe064cda91647e8468eae5e8673d6b8a5 100755 (executable)
@@ -42,6 +42,21 @@ function cleanup
        datasetexists $TEST_FS && log_must zfs destroy $TEST_FS
 }
 
+function verify_dnode_packing
+{
+       zdb -dd $TEST_FS | grep -A 3 'Dnode slots' | awk '
+               /Total used:/ {total_used=$NF}
+               /Max used:/ {max_used=$NF}
+               /Percent empty:/ {print total_used, max_used, int($NF)}
+       ' | while read total_used max_used pct_empty
+       do
+               log_note "total_used $total_used max_used $max_used pct_empty $pct_empty"
+               if [ $pct_empty -gt 5 ]; then
+                       log_fail "Holes in dnode array: pct empty $pct_empty > 5"
+               fi
+       done
+}
+
 log_onexit cleanup
 log_assert "xattrtest runs concurrently on dataset with large dnodes"
 
@@ -52,9 +67,11 @@ log_must zfs set xattr=sa $TEST_FS
 for ((i=0; i < 100; i++)); do
        dir="/$TEST_FS/dir.$i"
        log_must mkdir "$dir"
-       log_must eval "xattrtest -R -r -y -x 1 -f 1024 -k -p $dir &"
+       log_must eval "xattrtest -R -r -y -x 1 -f 1024 -k -p $dir >/dev/null 2>&1 &"
 done
 
 log_must wait
 
+verify_dnode_packing
+
 log_pass
diff --git a/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_009_pos.ksh b/tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_009_pos.ksh
new file mode 100755 (executable)
index 0000000..fa746c5
--- /dev/null
@@ -0,0 +1,71 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
+# Use is subject to license terms.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Run many xattrtests on a dataset with large dnodes and xattr=sa to
+# stress concurrent allocation and freeing (creates and deletes) of large dnodes.
+#
+
+TEST_FS=$TESTPOOL/large_dnode
+
+verify_runnable "both"
+
+function cleanup
+{
+       datasetexists $TEST_FS && log_must zfs destroy $TEST_FS
+}
+
+log_onexit cleanup
+log_assert "xattrtest runs concurrently on dataset with large dnodes"
+
+log_must zfs create $TEST_FS
+log_must zfs set dnsize=auto $TEST_FS
+log_must zfs set xattr=sa $TEST_FS
+
+for ((i=0; i < 100; i++)); do
+       dir="/$TEST_FS/dir.$i"
+       log_must mkdir "$dir"
+
+       do_unlink=""
+       if [ $((RANDOM % 2)) -eq 0 ]; then
+               do_unlink="-k -f 1024"
+       else
+               do_unlink="-f $((RANDOM % 1024))"
+       fi
+       log_must eval "xattrtest -R -r -y -x 1 $do_unlink -p $dir >/dev/null 2>&1 &"
+done
+
+log_must wait
+
+log_must zpool export $TESTPOOL
+log_must zpool import $TESTPOOL
+log_must ls -lR "/$TEST_FS/" >/dev/null 2>&1
+log_must zdb -d $TESTPOOL
+log_pass