*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>
-
-static int free_range_compar(const void *node1, const void *node2);
-
+#include <sys/range_tree.h>
+#include <sys/trace_dnode.h>
+
+dnode_stats_t dnode_stats = {
+ { "dnode_hold_dbuf_hold", KSTAT_DATA_UINT64 },
+ { "dnode_hold_dbuf_read", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_hits", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_misses", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_interior", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_lock_retry", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_lock_misses", KSTAT_DATA_UINT64 },
+ { "dnode_hold_alloc_type_none", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_hits", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_misses", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_lock_misses", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_lock_retry", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_overflow", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_refcount", KSTAT_DATA_UINT64 },
+ { "dnode_hold_free_txg", KSTAT_DATA_UINT64 },
+ { "dnode_free_interior_lock_retry", KSTAT_DATA_UINT64 },
+ { "dnode_allocate", KSTAT_DATA_UINT64 },
+ { "dnode_reallocate", KSTAT_DATA_UINT64 },
+ { "dnode_buf_evict", KSTAT_DATA_UINT64 },
+ { "dnode_alloc_next_chunk", KSTAT_DATA_UINT64 },
+ { "dnode_alloc_race", KSTAT_DATA_UINT64 },
+ { "dnode_alloc_next_block", KSTAT_DATA_UINT64 },
+ { "dnode_move_invalid", KSTAT_DATA_UINT64 },
+ { "dnode_move_recheck1", KSTAT_DATA_UINT64 },
+ { "dnode_move_recheck2", KSTAT_DATA_UINT64 },
+ { "dnode_move_special", KSTAT_DATA_UINT64 },
+ { "dnode_move_handle", KSTAT_DATA_UINT64 },
+ { "dnode_move_rwlock", KSTAT_DATA_UINT64 },
+ { "dnode_move_active", KSTAT_DATA_UINT64 },
+};
+
+static kstat_t *dnode_ksp;
static kmem_cache_t *dnode_cache;
-/*
- * Define DNODE_STATS to turn on statistic gathering. By default, it is only
- * turned on when DEBUG is also defined.
- */
-#ifdef DEBUG
-#define DNODE_STATS
-#endif /* DEBUG */
-
-#ifdef DNODE_STATS
-#define DNODE_STAT_ADD(stat) ((stat)++)
-#else
-#define DNODE_STAT_ADD(stat) /* nothing */
-#endif /* DNODE_STATS */
ASSERTV(static dnode_phys_t dnode_phys_zero);
static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
#endif /* _KERNEL */
+static int
+dbuf_compare(const void *x1, const void *x2)
+{
+ const dmu_buf_impl_t *d1 = x1;
+ const dmu_buf_impl_t *d2 = x2;
+
+ int cmp = AVL_CMP(d1->db_level, d2->db_level);
+ if (likely(cmp))
+ return (cmp);
+
+ cmp = AVL_CMP(d1->db_blkid, d2->db_blkid);
+ if (likely(cmp))
+ return (cmp);
+
+ if (d1->db_state == DB_SEARCH) {
+ ASSERT3S(d2->db_state, !=, DB_SEARCH);
+ return (-1);
+ } else if (d2->db_state == DB_SEARCH) {
+ ASSERT3S(d1->db_state, !=, DB_SEARCH);
+ return (1);
+ }
+
+ return (AVL_PCMP(d1, d2));
+}
+
/* ARGSUSED */
static int
dnode_cons(void *arg, void *unused, int kmflag)
dnode_t *dn = arg;
int i;
- rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
+ rw_init(&dn->dn_struct_rwlock, NULL, RW_NOLOCKDEP, NULL);
mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
- refcount_create(&dn->dn_holds);
+ /*
+ * Every dbuf has a reference, and dropping a tracked reference is
+ * O(number of references), so don't track dn_holds.
+ */
+ refcount_create_untracked(&dn->dn_holds);
refcount_create(&dn->dn_tx_holds);
list_link_init(&dn->dn_link);
for (i = 0; i < TXG_SIZE; i++) {
list_link_init(&dn->dn_dirty_link[i]);
- avl_create(&dn->dn_ranges[i], free_range_compar,
- sizeof (free_range_t),
- offsetof(struct free_range, fr_node));
+ dn->dn_free_ranges[i] = NULL;
list_create(&dn->dn_dirty_records[i],
sizeof (dbuf_dirty_record_t),
offsetof(dbuf_dirty_record_t, dr_dirty_node));
dn->dn_id_flags = 0;
dn->dn_dbufs_count = 0;
- list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
+ avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
offsetof(dmu_buf_impl_t, db_link));
dn->dn_moved = 0;
for (i = 0; i < TXG_SIZE; i++) {
ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
- avl_destroy(&dn->dn_ranges[i]);
+ ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
list_destroy(&dn->dn_dirty_records[i]);
- ASSERT3U(dn->dn_next_nblkptr[i], ==, 0);
- ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
- ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
- ASSERT3U(dn->dn_next_bonustype[i], ==, 0);
- ASSERT3U(dn->dn_rm_spillblk[i], ==, 0);
- ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
- ASSERT3U(dn->dn_next_blksz[i], ==, 0);
- }
-
- ASSERT3U(dn->dn_allocated_txg, ==, 0);
- ASSERT3U(dn->dn_free_txg, ==, 0);
- ASSERT3U(dn->dn_assigned_txg, ==, 0);
- ASSERT3U(dn->dn_dirtyctx, ==, 0);
+ ASSERT0(dn->dn_next_nblkptr[i]);
+ ASSERT0(dn->dn_next_nlevels[i]);
+ ASSERT0(dn->dn_next_indblkshift[i]);
+ ASSERT0(dn->dn_next_bonustype[i]);
+ ASSERT0(dn->dn_rm_spillblk[i]);
+ ASSERT0(dn->dn_next_bonuslen[i]);
+ ASSERT0(dn->dn_next_blksz[i]);
+ }
+
+ ASSERT0(dn->dn_allocated_txg);
+ ASSERT0(dn->dn_free_txg);
+ ASSERT0(dn->dn_assigned_txg);
+ ASSERT0(dn->dn_dirtyctx);
ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL);
ASSERT3P(dn->dn_bonus, ==, NULL);
ASSERT(!dn->dn_have_spill);
ASSERT3P(dn->dn_zio, ==, NULL);
- ASSERT3U(dn->dn_oldused, ==, 0);
- ASSERT3U(dn->dn_oldflags, ==, 0);
- ASSERT3U(dn->dn_olduid, ==, 0);
- ASSERT3U(dn->dn_oldgid, ==, 0);
- ASSERT3U(dn->dn_newuid, ==, 0);
- ASSERT3U(dn->dn_newgid, ==, 0);
- ASSERT3U(dn->dn_id_flags, ==, 0);
-
- ASSERT3U(dn->dn_dbufs_count, ==, 0);
- list_destroy(&dn->dn_dbufs);
+ ASSERT0(dn->dn_oldused);
+ ASSERT0(dn->dn_oldflags);
+ ASSERT0(dn->dn_olduid);
+ ASSERT0(dn->dn_oldgid);
+ ASSERT0(dn->dn_newuid);
+ ASSERT0(dn->dn_newgid);
+ ASSERT0(dn->dn_id_flags);
+
+ ASSERT0(dn->dn_dbufs_count);
+ avl_destroy(&dn->dn_dbufs);
}
void
{
ASSERT(dnode_cache == NULL);
dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t),
- 0, dnode_cons, dnode_dest, NULL, NULL, NULL, KMC_KMEM);
+ 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
kmem_cache_set_move(dnode_cache, dnode_move);
+
+ dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc",
+ KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+ if (dnode_ksp != NULL) {
+ dnode_ksp->ks_data = &dnode_stats;
+ kstat_install(dnode_ksp);
+ }
}
void
dnode_fini(void)
{
+ if (dnode_ksp != NULL) {
+ kstat_delete(dnode_ksp);
+ dnode_ksp = NULL;
+ }
+
kmem_cache_destroy(dnode_cache);
dnode_cache = NULL;
}
}
if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
int i;
+ int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
if (dn->dn_datablkshift) {
ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
ASSERT(DMU_OT_IS_VALID(dn->dn_type));
ASSERT3U(dn->dn_nblkptr, >=, 1);
ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
- ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+ ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen);
ASSERT3U(dn->dn_datablksz, ==,
dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
- dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+ dn->dn_bonuslen, <=, max_bonuslen);
for (i = 0; i < TXG_SIZE; i++) {
ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
}
dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
+ dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots);
dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
dnp->dn_used = BSWAP_64(dnp->dn_used);
* dnode buffer).
*/
int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
- size_t len = DN_MAX_BONUSLEN - off;
+ int slots = dnp->dn_extra_slots + 1;
+ size_t len = DN_SLOTS_TO_BONUSLEN(slots) - off;
dmu_object_byteswap_t byteswap;
ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype);
/* Swap SPILL block if we have one */
if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
- byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t));
-
+ byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t));
}
void
dnode_buf_byteswap(void *vbuf, size_t size)
{
- dnode_phys_t *buf = vbuf;
- int i;
+ int i = 0;
ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
- size >>= DNODE_SHIFT;
- for (i = 0; i < size; i++) {
- dnode_byteswap(buf);
- buf++;
- }
-}
-
-static int
-free_range_compar(const void *node1, const void *node2)
-{
- const free_range_t *rp1 = node1;
- const free_range_t *rp2 = node2;
+ while (i < size) {
+ dnode_phys_t *dnp = (void *)(((char *)vbuf) + i);
+ dnode_byteswap(dnp);
- if (rp1->fr_blkid < rp2->fr_blkid)
- return (-1);
- else if (rp1->fr_blkid > rp2->fr_blkid)
- return (1);
- else return (0);
+ i += DNODE_MIN_SIZE;
+ if (dnp->dn_type != DMU_OT_NONE)
+ i += dnp->dn_extra_slots * DNODE_MIN_SIZE;
+ }
}
void
dnode_setdirty(dn, tx);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- ASSERT3U(newsize, <=, DN_MAX_BONUSLEN -
+ ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
(dn->dn_nblkptr-1) * sizeof (blkptr_t));
dn->dn_bonuslen = newsize;
if (newsize == 0)
static void
dnode_setdblksz(dnode_t *dn, int size)
{
- ASSERT3U(P2PHASE(size, SPA_MINBLOCKSIZE), ==, 0);
+ ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
dn->dn_datablksz = size;
dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
- dn->dn_datablkshift = ISP2(size) ? highbit(size - 1) : 0;
+ dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
}
static dnode_t *
dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
uint64_t object, dnode_handle_t *dnh)
{
- dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_PUSHPAGE);
+ dnode_t *dn;
+ dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
ASSERT(!POINTER_IS_VALID(dn->dn_objset));
dn->dn_moved = 0;
dn->dn_compress = dnp->dn_compress;
dn->dn_bonustype = dnp->dn_bonustype;
dn->dn_bonuslen = dnp->dn_bonuslen;
+ dn->dn_num_slots = dnp->dn_extra_slots + 1;
dn->dn_maxblkid = dnp->dn_maxblkid;
dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
dn->dn_id_flags = 0;
dmu_zfetch_init(&dn->dn_zfetch, dn);
ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
+ ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
+ ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode));
mutex_enter(&os->os_lock);
- list_insert_head(&os->os_dnodes, dn);
+
+ /*
+ * Exclude special dnodes from os_dnodes so an empty os_dnodes
+ * signifies that the special dnodes have no references from
+ * their children (the entries in os_dnodes). This allows
+ * dnode_destroy() to easily determine if the last child has
+ * been removed and then complete eviction of the objset.
+ */
+ if (!DMU_OBJECT_IS_SPECIAL(object))
+ list_insert_head(&os->os_dnodes, dn);
membar_producer();
+
/*
- * Everything else must be valid before assigning dn_objset makes the
- * dnode eligible for dnode_move().
+ * Everything else must be valid before assigning dn_objset
+ * makes the dnode eligible for dnode_move().
*/
dn->dn_objset = os;
+
+ dnh->dnh_dnode = dn;
mutex_exit(&os->os_lock);
- arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
+ arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE);
+
return (dn);
}
dnode_destroy(dnode_t *dn)
{
objset_t *os = dn->dn_objset;
+ boolean_t complete_os_eviction = B_FALSE;
ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
mutex_enter(&os->os_lock);
POINTER_INVALIDATE(&dn->dn_objset);
- list_remove(&os->os_dnodes, dn);
+ if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+ list_remove(&os->os_dnodes, dn);
+ complete_os_eviction =
+ list_is_empty(&os->os_dnodes) &&
+ list_link_active(&os->os_evicting_node);
+ }
mutex_exit(&os->os_lock);
/* the dnode can no longer move, so we can release the handle */
- zrl_remove(&dn->dn_handle->dnh_zrlock);
+ if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock))
+ zrl_remove(&dn->dn_handle->dnh_zrlock);
dn->dn_allocated_txg = 0;
dn->dn_free_txg = 0;
}
if (dn->dn_bonus != NULL) {
mutex_enter(&dn->dn_bonus->db_mtx);
- dbuf_evict(dn->dn_bonus);
+ dbuf_destroy(dn->dn_bonus);
dn->dn_bonus = NULL;
}
dn->dn_zio = NULL;
dn->dn_newgid = 0;
dn->dn_id_flags = 0;
- dmu_zfetch_rele(&dn->dn_zfetch);
+ dmu_zfetch_fini(&dn->dn_zfetch);
kmem_cache_free(dnode_cache, dn);
- arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
+ arc_space_return(sizeof (dnode_t), ARC_SPACE_DNODE);
+
+ if (complete_os_eviction)
+ dmu_objset_evict_done(os);
}
void
dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+ dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
{
int i;
+ ASSERT3U(dn_slots, >, 0);
+ ASSERT3U(dn_slots << DNODE_SHIFT, <=,
+ spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)));
+ ASSERT3U(blocksize, <=,
+ spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
if (blocksize == 0)
blocksize = 1 << zfs_default_bs;
- else if (blocksize > SPA_MAXBLOCKSIZE)
- blocksize = SPA_MAXBLOCKSIZE;
else
blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
- dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset,
- dn->dn_object, tx->tx_txg, blocksize, ibs);
+ dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d dn_slots=%d\n",
+ dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots);
+ DNODE_STAT_BUMP(dnode_allocate);
ASSERT(dn->dn_type == DMU_OT_NONE);
ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
(bonustype == DMU_OT_SA && bonuslen == 0) ||
(bonustype != DMU_OT_NONE && bonuslen != 0));
ASSERT(DMU_OT_IS_VALID(bonustype));
- ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+ ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots));
ASSERT(dn->dn_type == DMU_OT_NONE);
- ASSERT3U(dn->dn_maxblkid, ==, 0);
- ASSERT3U(dn->dn_allocated_txg, ==, 0);
- ASSERT3U(dn->dn_assigned_txg, ==, 0);
+ ASSERT0(dn->dn_maxblkid);
+ ASSERT0(dn->dn_allocated_txg);
+ ASSERT0(dn->dn_assigned_txg);
ASSERT(refcount_is_zero(&dn->dn_tx_holds));
ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
- ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+ ASSERT(avl_is_empty(&dn->dn_dbufs));
for (i = 0; i < TXG_SIZE; i++) {
- ASSERT3U(dn->dn_next_nblkptr[i], ==, 0);
- ASSERT3U(dn->dn_next_nlevels[i], ==, 0);
- ASSERT3U(dn->dn_next_indblkshift[i], ==, 0);
- ASSERT3U(dn->dn_next_bonuslen[i], ==, 0);
- ASSERT3U(dn->dn_next_bonustype[i], ==, 0);
- ASSERT3U(dn->dn_rm_spillblk[i], ==, 0);
- ASSERT3U(dn->dn_next_blksz[i], ==, 0);
+ ASSERT0(dn->dn_next_nblkptr[i]);
+ ASSERT0(dn->dn_next_nlevels[i]);
+ ASSERT0(dn->dn_next_indblkshift[i]);
+ ASSERT0(dn->dn_next_bonuslen[i]);
+ ASSERT0(dn->dn_next_bonustype[i]);
+ ASSERT0(dn->dn_rm_spillblk[i]);
+ ASSERT0(dn->dn_next_blksz[i]);
ASSERT(!list_link_active(&dn->dn_dirty_link[i]));
ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL);
- ASSERT3U(avl_numnodes(&dn->dn_ranges[i]), ==, 0);
+ ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
}
dn->dn_type = ot;
dnode_setdblksz(dn, blocksize);
dn->dn_indblkshift = ibs;
dn->dn_nlevels = 1;
+ dn->dn_num_slots = dn_slots;
if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
dn->dn_nblkptr = 1;
- else
- dn->dn_nblkptr = 1 +
- ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+ else {
+ dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR,
+ 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
+ SPA_BLKPTRSHIFT));
+ }
+
dn->dn_bonustype = bonustype;
dn->dn_bonuslen = bonuslen;
dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
void
dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+ dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
{
int nblkptr;
ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
- ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
- ASSERT3U(blocksize % SPA_MINBLOCKSIZE, ==, 0);
+ ASSERT3U(blocksize, <=,
+ spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
+ ASSERT0(blocksize % SPA_MINBLOCKSIZE);
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
ASSERT(tx->tx_txg != 0);
ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
(bonustype != DMU_OT_NONE && bonuslen != 0) ||
(bonustype == DMU_OT_SA && bonuslen == 0));
ASSERT(DMU_OT_IS_VALID(bonustype));
- ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+ ASSERT3U(bonuslen, <=,
+ DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
+ ASSERT3U(bonuslen, <=, DN_BONUS_SIZE(dn_slots << DNODE_SHIFT));
+
+ dnode_free_interior_slots(dn);
+ DNODE_STAT_BUMP(dnode_reallocate);
/* clean up any unreferenced dbufs */
dnode_evict_dbufs(dn);
if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
nblkptr = 1;
else
- nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+ nblkptr = MIN(DN_MAX_NBLKPTR,
+ 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
+ SPA_BLKPTRSHIFT));
if (dn->dn_bonustype != bonustype)
dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
if (dn->dn_nblkptr != nblkptr)
mutex_enter(&dn->dn_mtx);
dn->dn_bonustype = bonustype;
dn->dn_bonuslen = bonuslen;
+ dn->dn_num_slots = dn_slots;
dn->dn_nblkptr = nblkptr;
dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
dn->dn_compress = ZIO_COMPRESS_INHERIT;
/* fix up the bonus db_size */
if (dn->dn_bonus) {
dn->dn_bonus->db.db_size =
- DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+ DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
+ (dn->dn_nblkptr-1) * sizeof (blkptr_t);
ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
}
}
#ifdef _KERNEL
-#ifdef DNODE_STATS
-static struct {
- uint64_t dms_dnode_invalid;
- uint64_t dms_dnode_recheck1;
- uint64_t dms_dnode_recheck2;
- uint64_t dms_dnode_special;
- uint64_t dms_dnode_handle;
- uint64_t dms_dnode_rwlock;
- uint64_t dms_dnode_active;
-} dnode_move_stats;
-#endif /* DNODE_STATS */
-
static void
dnode_move_impl(dnode_t *odn, dnode_t *ndn)
{
ndn->dn_datablkszsec = odn->dn_datablkszsec;
ndn->dn_datablksz = odn->dn_datablksz;
ndn->dn_maxblkid = odn->dn_maxblkid;
+ ndn->dn_num_slots = odn->dn_num_slots;
bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
sizeof (odn->dn_next_nblkptr));
bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
list_move_tail(&ndn->dn_dirty_records[i],
&odn->dn_dirty_records[i]);
}
- bcopy(&odn->dn_ranges[0], &ndn->dn_ranges[0], sizeof (odn->dn_ranges));
+ bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0],
+ sizeof (odn->dn_free_ranges));
ndn->dn_allocated_txg = odn->dn_allocated_txg;
ndn->dn_free_txg = odn->dn_free_txg;
ndn->dn_assigned_txg = odn->dn_assigned_txg;
ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
- ASSERT(list_is_empty(&ndn->dn_dbufs));
- list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs);
+ ASSERT(avl_is_empty(&ndn->dn_dbufs));
+ avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
ndn->dn_dbufs_count = odn->dn_dbufs_count;
ndn->dn_bonus = odn->dn_bonus;
ndn->dn_have_spill = odn->dn_have_spill;
dmu_zfetch_init(&ndn->dn_zfetch, NULL);
list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
- ndn->dn_zfetch.zf_stream_cnt = odn->dn_zfetch.zf_stream_cnt;
- ndn->dn_zfetch.zf_alloc_fail = odn->dn_zfetch.zf_alloc_fail;
/*
* Update back pointers. Updating the handle fixes the back pointer of
*/
odn->dn_dbuf = NULL;
odn->dn_handle = NULL;
- list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t),
+ avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
offsetof(dmu_buf_impl_t, db_link));
odn->dn_dbufs_count = 0;
odn->dn_bonus = NULL;
list_create(&odn->dn_dirty_records[i],
sizeof (dbuf_dirty_record_t),
offsetof(dbuf_dirty_record_t, dr_dirty_node));
- odn->dn_ranges[i].avl_root = NULL;
- odn->dn_ranges[i].avl_numnodes = 0;
+ odn->dn_free_ranges[i] = NULL;
odn->dn_next_nlevels[i] = 0;
odn->dn_next_indblkshift[i] = 0;
odn->dn_next_bonustype[i] = 0;
*/
os = odn->dn_objset;
if (!POINTER_IS_VALID(os)) {
- DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid);
+ DNODE_STAT_BUMP(dnode_move_invalid);
return (KMEM_CBRC_DONT_KNOW);
}
rw_enter(&os_lock, RW_WRITER);
if (os != odn->dn_objset) {
rw_exit(&os_lock);
- DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1);
+ DNODE_STAT_BUMP(dnode_move_recheck1);
return (KMEM_CBRC_DONT_KNOW);
}
if (os != odn->dn_objset) {
mutex_exit(&os->os_lock);
rw_exit(&os_lock);
- DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2);
+ DNODE_STAT_BUMP(dnode_move_recheck2);
return (KMEM_CBRC_DONT_KNOW);
}
rw_exit(&os_lock);
if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) {
mutex_exit(&os->os_lock);
- DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special);
+ DNODE_STAT_BUMP(dnode_move_special);
return (KMEM_CBRC_NO);
}
ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */
*/
if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) {
mutex_exit(&os->os_lock);
- DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle);
+ DNODE_STAT_BUMP(dnode_move_handle);
return (KMEM_CBRC_LATER);
}
if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) {
zrl_exit(&odn->dn_handle->dnh_zrlock);
mutex_exit(&os->os_lock);
- DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock);
+ DNODE_STAT_BUMP(dnode_move_rwlock);
return (KMEM_CBRC_LATER);
}
rw_exit(&odn->dn_struct_rwlock);
zrl_exit(&odn->dn_handle->dnh_zrlock);
mutex_exit(&os->os_lock);
- DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active);
+ DNODE_STAT_BUMP(dnode_move_active);
return (KMEM_CBRC_LATER);
}
}
#endif /* _KERNEL */
+static void
+dnode_slots_hold(dnode_children_t *children, int idx, int slots)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+ zrl_add(&dnh->dnh_zrlock);
+ }
+}
+
+static void
+dnode_slots_rele(dnode_children_t *children, int idx, int slots)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+
+ if (zrl_is_locked(&dnh->dnh_zrlock))
+ zrl_exit(&dnh->dnh_zrlock);
+ else
+ zrl_remove(&dnh->dnh_zrlock);
+ }
+}
+
+static int
+dnode_slots_tryenter(dnode_children_t *children, int idx, int slots)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+
+ if (!zrl_tryenter(&dnh->dnh_zrlock)) {
+ for (int j = idx; j < i; j++) {
+ dnh = &children->dnc_children[j];
+ zrl_exit(&dnh->dnh_zrlock);
+ }
+
+ return (0);
+ }
+ }
+
+ return (1);
+}
+
+static void
+dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+ dnh->dnh_dnode = ptr;
+ }
+}
+
+static boolean_t
+dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+ dnode_t *dn = dnh->dnh_dnode;
+
+ if (dn == DN_SLOT_FREE) {
+ continue;
+ } else if (DN_SLOT_IS_PTR(dn)) {
+ mutex_enter(&dn->dn_mtx);
+ dmu_object_type_t type = dn->dn_type;
+ mutex_exit(&dn->dn_mtx);
+
+ if (type != DMU_OT_NONE)
+ return (B_FALSE);
+
+ continue;
+ } else {
+ return (B_FALSE);
+ }
+
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
+}
+
+static void
+dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
+{
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ for (int i = idx; i < idx + slots; i++) {
+ dnode_handle_t *dnh = &children->dnc_children[i];
+
+ ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
+
+ if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
+ dnode_destroy(dnh->dnh_dnode);
+ dnh->dnh_dnode = DN_SLOT_FREE;
+ }
+ }
+}
+
+void
+dnode_free_interior_slots(dnode_t *dn)
+{
+ dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db);
+ int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT;
+ int idx = (dn->dn_object & (epb - 1)) + 1;
+ int slots = dn->dn_num_slots - 1;
+
+ if (slots == 0)
+ return;
+
+ ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+ while (!dnode_slots_tryenter(children, idx, slots))
+ DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
+
+ dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
+ dnode_slots_rele(children, idx, slots);
+}
+
void
dnode_special_close(dnode_handle_t *dnh)
{
/*
* Wait for final references to the dnode to clear. This can
- * only happen if the arc is asyncronously evicting state that
+ * only happen if the arc is asynchronously evicting state that
* has a hold on this dnode while we are trying to evict this
* dnode.
*/
while (refcount_count(&dn->dn_holds) > 0)
delay(1);
+ ASSERT(dn->dn_dbuf == NULL ||
+ dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
zrl_add(&dnh->dnh_zrlock);
dnode_destroy(dn); /* implicit zrl_remove() */
zrl_destroy(&dnh->dnh_zrlock);
dnh->dnh_dnode = NULL;
}
-dnode_t *
+void
dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
dnode_handle_t *dnh)
{
- dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh);
- dnh->dnh_dnode = dn;
+ dnode_t *dn;
+
zrl_init(&dnh->dnh_zrlock);
+ zrl_tryenter(&dnh->dnh_zrlock);
+
+ dn = dnode_create(os, dnp, NULL, object, dnh);
DNODE_VERIFY(dn);
- return (dn);
+
+ zrl_exit(&dnh->dnh_zrlock);
}
static void
-dnode_buf_pageout(dmu_buf_t *db, void *arg)
+dnode_buf_evict_async(void *dbu)
{
- dnode_children_t *children_dnodes = arg;
- int i;
- int epb = db->db_size >> DNODE_SHIFT;
+ dnode_children_t *dnc = dbu;
- ASSERT(epb == children_dnodes->dnc_count);
+ DNODE_STAT_BUMP(dnode_buf_evict);
- for (i = 0; i < epb; i++) {
- dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
+ for (int i = 0; i < dnc->dnc_count; i++) {
+ dnode_handle_t *dnh = &dnc->dnc_children[i];
dnode_t *dn;
/*
* another valid address, so there is no need here to guard
* against changes to or from NULL.
*/
- if (dnh->dnh_dnode == NULL) {
+ if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
zrl_destroy(&dnh->dnh_zrlock);
+ dnh->dnh_dnode = DN_SLOT_UNINIT;
continue;
}
ASSERT(refcount_is_zero(&dn->dn_holds));
ASSERT(refcount_is_zero(&dn->dn_tx_holds));
- dnode_destroy(dn); /* implicit zrl_remove() */
+ dnode_destroy(dn); /* implicit zrl_remove() for first slot */
zrl_destroy(&dnh->dnh_zrlock);
- dnh->dnh_dnode = NULL;
+ dnh->dnh_dnode = DN_SLOT_UNINIT;
}
- kmem_free(children_dnodes, sizeof (dnode_children_t) +
- (epb - 1) * sizeof (dnode_handle_t));
+ kmem_free(dnc, sizeof (dnode_children_t) +
+ dnc->dnc_count * sizeof (dnode_handle_t));
}
/*
* errors:
- * EINVAL - invalid object number.
- * EIO - i/o error.
+ * EINVAL - Invalid object number or flags.
+ * ENOSPC - Hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE)
+ * EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE)
+ * - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED)
+ * ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED)
+ * EIO - I/O error when reading the meta dnode dbuf.
+ *
* succeeds even for free dnodes.
*/
int
-dnode_hold_impl(objset_t *os, uint64_t object, int flag,
+dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
void *tag, dnode_t **dnp)
{
int epb, idx, err;
uint64_t blk;
dnode_t *mdn, *dn;
dmu_buf_impl_t *db;
- dnode_children_t *children_dnodes;
+ dnode_children_t *dnc;
+ dnode_phys_t *dn_block;
dnode_handle_t *dnh;
+ ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
+ ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0));
+
/*
* If you are holding the spa config lock as writer, you shouldn't
* be asking the DMU to do *anything* unless it's the root pool
dn = (object == DMU_USERUSED_OBJECT) ?
DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os);
if (dn == NULL)
- return (ENOENT);
+ return (SET_ERROR(ENOENT));
type = dn->dn_type;
if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE)
- return (ENOENT);
+ return (SET_ERROR(ENOENT));
if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)
- return (EEXIST);
+ return (SET_ERROR(EEXIST));
DNODE_VERIFY(dn);
(void) refcount_add(&dn->dn_holds, tag);
*dnp = dn;
}
if (object == 0 || object >= DN_MAX_OBJECT)
- return (EINVAL);
+ return (SET_ERROR(EINVAL));
mdn = DMU_META_DNODE(os);
ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT);
drop_struct_lock = TRUE;
}
- blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t));
+ blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
db = dbuf_hold(mdn, blk, FTAG);
if (drop_struct_lock)
rw_exit(&mdn->dn_struct_rwlock);
- if (db == NULL)
- return (EIO);
+ if (db == NULL) {
+ DNODE_STAT_BUMP(dnode_hold_dbuf_hold);
+ return (SET_ERROR(EIO));
+ }
err = dbuf_read(db, NULL, DB_RF_CANFAIL);
if (err) {
+ DNODE_STAT_BUMP(dnode_hold_dbuf_read);
dbuf_rele(db, FTAG);
return (err);
}
ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
epb = db->db.db_size >> DNODE_SHIFT;
- idx = object & (epb-1);
+ idx = object & (epb - 1);
+ dn_block = (dnode_phys_t *)db->db.db_data;
ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
- children_dnodes = dmu_buf_get_user(&db->db);
- if (children_dnodes == NULL) {
- int i;
+ dnc = dmu_buf_get_user(&db->db);
+ dnh = NULL;
+ if (dnc == NULL) {
dnode_children_t *winner;
- children_dnodes = kmem_alloc(sizeof (dnode_children_t) +
- (epb - 1) * sizeof (dnode_handle_t),
- KM_PUSHPAGE | KM_NODEBUG);
- children_dnodes->dnc_count = epb;
- dnh = &children_dnodes->dnc_children[0];
- for (i = 0; i < epb; i++) {
+ int skip = 0;
+
+ dnc = kmem_zalloc(sizeof (dnode_children_t) +
+ epb * sizeof (dnode_handle_t), KM_SLEEP);
+ dnc->dnc_count = epb;
+ dnh = &dnc->dnc_children[0];
+
+ /* Initialize dnode slot status from dnode_phys_t */
+ for (int i = 0; i < epb; i++) {
zrl_init(&dnh[i].dnh_zrlock);
- dnh[i].dnh_dnode = NULL;
+
+ if (skip) {
+ skip--;
+ continue;
+ }
+
+ if (dn_block[i].dn_type != DMU_OT_NONE) {
+ int interior = dn_block[i].dn_extra_slots;
+
+ dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED);
+ dnode_set_slots(dnc, i + 1, interior,
+ DN_SLOT_INTERIOR);
+ skip = interior;
+ } else {
+ dnh[i].dnh_dnode = DN_SLOT_FREE;
+ skip = 0;
+ }
}
- if ((winner = dmu_buf_set_user(&db->db, children_dnodes, NULL,
- dnode_buf_pageout))) {
- kmem_free(children_dnodes, sizeof (dnode_children_t) +
- (epb - 1) * sizeof (dnode_handle_t));
- children_dnodes = winner;
+
+ dmu_buf_init_user(&dnc->dnc_dbu, NULL,
+ dnode_buf_evict_async, NULL);
+ winner = dmu_buf_set_user(&db->db, &dnc->dnc_dbu);
+ if (winner != NULL) {
+
+ for (int i = 0; i < epb; i++)
+ zrl_destroy(&dnh[i].dnh_zrlock);
+
+ kmem_free(dnc, sizeof (dnode_children_t) +
+ epb * sizeof (dnode_handle_t));
+ dnc = winner;
}
}
- ASSERT(children_dnodes->dnc_count == epb);
- dnh = &children_dnodes->dnc_children[idx];
- zrl_add(&dnh->dnh_zrlock);
- if ((dn = dnh->dnh_dnode) == NULL) {
- dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
- dnode_t *winner;
+ ASSERT(dnc->dnc_count == epb);
+ dn = DN_SLOT_UNINIT;
- dn = dnode_create(os, phys, db, object, dnh);
- winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn);
- if (winner != NULL) {
- zrl_add(&dnh->dnh_zrlock);
- dnode_destroy(dn); /* implicit zrl_remove() */
- dn = winner;
+ if (flag & DNODE_MUST_BE_ALLOCATED) {
+ slots = 1;
+
+ while (dn == DN_SLOT_UNINIT) {
+ dnode_slots_hold(dnc, idx, slots);
+ dnh = &dnc->dnc_children[idx];
+
+ if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ dn = dnh->dnh_dnode;
+ break;
+ } else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) {
+ DNODE_STAT_BUMP(dnode_hold_alloc_interior);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(EEXIST));
+ } else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) {
+ DNODE_STAT_BUMP(dnode_hold_alloc_misses);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(ENOENT));
+ }
+
+ dnode_slots_rele(dnc, idx, slots);
+ if (!dnode_slots_tryenter(dnc, idx, slots)) {
+ DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry);
+ continue;
+ }
+
+ /*
+ * Someone else won the race and called dnode_create()
+ * after we checked DN_SLOT_IS_PTR() above but before
+ * we acquired the lock.
+ */
+ if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses);
+ dn = dnh->dnh_dnode;
+ } else {
+ dn = dnode_create(os, dn_block + idx, db,
+ object, dnh);
+ }
+ }
+
+ mutex_enter(&dn->dn_mtx);
+ if (dn->dn_type == DMU_OT_NONE) {
+ DNODE_STAT_BUMP(dnode_hold_alloc_type_none);
+ mutex_exit(&dn->dn_mtx);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(ENOENT));
+ }
+
+ DNODE_STAT_BUMP(dnode_hold_alloc_hits);
+ } else if (flag & DNODE_MUST_BE_FREE) {
+
+ if (idx + slots - 1 >= DNODES_PER_BLOCK) {
+ DNODE_STAT_BUMP(dnode_hold_free_overflow);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ while (dn == DN_SLOT_UNINIT) {
+ dnode_slots_hold(dnc, idx, slots);
+
+ if (!dnode_check_slots_free(dnc, idx, slots)) {
+ DNODE_STAT_BUMP(dnode_hold_free_misses);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ dnode_slots_rele(dnc, idx, slots);
+ if (!dnode_slots_tryenter(dnc, idx, slots)) {
+ DNODE_STAT_BUMP(dnode_hold_free_lock_retry);
+ continue;
+ }
+
+ if (!dnode_check_slots_free(dnc, idx, slots)) {
+ DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(ENOSPC));
+ }
+
+ /*
+ * Allocated but otherwise free dnodes which would
+ * be in the interior of a multi-slot dnodes need
+ * to be freed. Single slot dnodes can be safely
+ * re-purposed as a performance optimization.
+ */
+ if (slots > 1)
+ dnode_reclaim_slots(dnc, idx + 1, slots - 1);
+
+ dnh = &dnc->dnc_children[idx];
+ if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+ dn = dnh->dnh_dnode;
+ } else {
+ dn = dnode_create(os, dn_block + idx, db,
+ object, dnh);
+ }
}
+
+ mutex_enter(&dn->dn_mtx);
+ if (!refcount_is_zero(&dn->dn_holds)) {
+ DNODE_STAT_BUMP(dnode_hold_free_refcount);
+ mutex_exit(&dn->dn_mtx);
+ dnode_slots_rele(dnc, idx, slots);
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(EEXIST));
+ }
+
+ dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR);
+ DNODE_STAT_BUMP(dnode_hold_free_hits);
+ } else {
+ dbuf_rele(db, FTAG);
+ return (SET_ERROR(EINVAL));
}
- mutex_enter(&dn->dn_mtx);
- type = dn->dn_type;
- if (dn->dn_free_txg ||
- ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
- ((flag & DNODE_MUST_BE_FREE) &&
- (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
+ if (dn->dn_free_txg) {
+ DNODE_STAT_BUMP(dnode_hold_free_txg);
+ type = dn->dn_type;
mutex_exit(&dn->dn_mtx);
- zrl_remove(&dnh->dnh_zrlock);
+ dnode_slots_rele(dnc, idx, slots);
dbuf_rele(db, FTAG);
return (type == DMU_OT_NONE ? ENOENT : EEXIST);
}
- mutex_exit(&dn->dn_mtx);
if (refcount_add(&dn->dn_holds, tag) == 1)
dbuf_add_ref(db, dnh);
+
+ mutex_exit(&dn->dn_mtx);
+
/* Now we can rely on the hold to prevent the dnode from moving. */
- zrl_remove(&dnh->dnh_zrlock);
+ dnode_slots_rele(dnc, idx, slots);
DNODE_VERIFY(dn);
ASSERT3P(dn->dn_dbuf, ==, db);
int
dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
{
- return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
+ return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag,
+ dnp));
}
/*
void
dnode_rele(dnode_t *dn, void *tag)
+{
+ mutex_enter(&dn->dn_mtx);
+ dnode_rele_and_unlock(dn, tag);
+}
+
+void
+dnode_rele_and_unlock(dnode_t *dn, void *tag)
{
uint64_t refs;
/* Get while the hold prevents the dnode from moving. */
dmu_buf_impl_t *db = dn->dn_dbuf;
dnode_handle_t *dnh = dn->dn_handle;
- mutex_enter(&dn->dn_mtx);
refs = refcount_remove(&dn->dn_holds, tag);
mutex_exit(&dn->dn_mtx);
*/
dmu_objset_userquota_get_ids(dn, B_TRUE, tx);
- mutex_enter(&os->os_lock);
+ multilist_t *dirtylist = os->os_dirty_dnodes[txg & TXG_MASK];
+ multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn);
/*
* If we are already marked dirty, we're done.
*/
if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
- mutex_exit(&os->os_lock);
+ multilist_sublist_unlock(mls);
return;
}
- ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
+ ASSERT(!refcount_is_zero(&dn->dn_holds) ||
+ !avl_is_empty(&dn->dn_dbufs));
ASSERT(dn->dn_datablksz != 0);
- ASSERT3U(dn->dn_next_bonuslen[txg&TXG_MASK], ==, 0);
- ASSERT3U(dn->dn_next_blksz[txg&TXG_MASK], ==, 0);
- ASSERT3U(dn->dn_next_bonustype[txg&TXG_MASK], ==, 0);
+ ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]);
+ ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]);
+ ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]);
dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
dn->dn_object, txg);
- if (dn->dn_free_txg > 0 && dn->dn_free_txg <= txg) {
- list_insert_tail(&os->os_free_dnodes[txg&TXG_MASK], dn);
- } else {
- list_insert_tail(&os->os_dirty_dnodes[txg&TXG_MASK], dn);
- }
+ multilist_sublist_insert_head(mls, dn);
- mutex_exit(&os->os_lock);
+ multilist_sublist_unlock(mls);
/*
* The dnode maintains a hold on its containing dbuf as
void
dnode_free(dnode_t *dn, dmu_tx_t *tx)
{
- int txgoff = tx->tx_txg & TXG_MASK;
-
- dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg);
-
- /* we should be the only holder... hopefully */
- /* ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */
-
mutex_enter(&dn->dn_mtx);
if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
mutex_exit(&dn->dn_mtx);
dn->dn_free_txg = tx->tx_txg;
mutex_exit(&dn->dn_mtx);
- /*
- * If the dnode is already dirty, it needs to be moved from
- * the dirty list to the free list.
- */
- mutex_enter(&dn->dn_objset->os_lock);
- if (list_link_active(&dn->dn_dirty_link[txgoff])) {
- list_remove(&dn->dn_objset->os_dirty_dnodes[txgoff], dn);
- list_insert_tail(&dn->dn_objset->os_free_dnodes[txgoff], dn);
- mutex_exit(&dn->dn_objset->os_lock);
- } else {
- mutex_exit(&dn->dn_objset->os_lock);
- dnode_setdirty(dn, tx);
- }
+ dnode_setdirty(dn, tx);
}
/*
int
dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
{
- dmu_buf_impl_t *db, *db_next;
+ dmu_buf_impl_t *db;
int err;
+ ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
if (size == 0)
size = SPA_MINBLOCKSIZE;
- if (size > SPA_MAXBLOCKSIZE)
- size = SPA_MAXBLOCKSIZE;
else
size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
/* Check for any allocated blocks beyond the first */
- if (dn->dn_phys->dn_maxblkid != 0)
+ if (dn->dn_maxblkid != 0)
goto fail;
mutex_enter(&dn->dn_dbufs_mtx);
- for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
- db_next = list_next(&dn->dn_dbufs, db);
-
+ for (db = avl_first(&dn->dn_dbufs); db != NULL;
+ db = AVL_NEXT(&dn->dn_dbufs, db)) {
if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
db->db_blkid != DMU_SPILL_BLKID) {
mutex_exit(&dn->dn_dbufs_mtx);
goto fail;
/* resize the old block */
- err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db);
+ err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
if (err == 0)
dbuf_new_size(db, size, tx);
else if (err != ENOENT)
fail:
rw_exit(&dn->dn_struct_rwlock);
- return (ENOTSUP);
+ return (SET_ERROR(ENOTSUP));
}
/* read-holding callers must not rely on the lock being continuously held */
sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
new_nlevels++;
+ ASSERT3U(new_nlevels, <=, DN_MAX_LEVELS);
+
if (new_nlevels > dn->dn_nlevels) {
int old_nlevels = dn->dn_nlevels;
dmu_buf_impl_t *db;
rw_downgrade(&dn->dn_struct_rwlock);
}
-void
-dnode_clear_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+static void
+dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
{
- avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
- avl_index_t where;
- free_range_t *rp;
- free_range_t rp_tofind;
- uint64_t endblk = blkid + nblks;
-
- ASSERT(MUTEX_HELD(&dn->dn_mtx));
- ASSERT(nblks <= UINT64_MAX - blkid); /* no overflow */
-
- dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
- blkid, nblks, tx->tx_txg);
- rp_tofind.fr_blkid = blkid;
- rp = avl_find(tree, &rp_tofind, &where);
- if (rp == NULL)
- rp = avl_nearest(tree, where, AVL_BEFORE);
- if (rp == NULL)
- rp = avl_nearest(tree, where, AVL_AFTER);
-
- while (rp && (rp->fr_blkid <= blkid + nblks)) {
- uint64_t fr_endblk = rp->fr_blkid + rp->fr_nblks;
- free_range_t *nrp = AVL_NEXT(tree, rp);
-
- if (blkid <= rp->fr_blkid && endblk >= fr_endblk) {
- /* clear this entire range */
- avl_remove(tree, rp);
- kmem_free(rp, sizeof (free_range_t));
- } else if (blkid <= rp->fr_blkid &&
- endblk > rp->fr_blkid && endblk < fr_endblk) {
- /* clear the beginning of this range */
- rp->fr_blkid = endblk;
- rp->fr_nblks = fr_endblk - endblk;
- } else if (blkid > rp->fr_blkid && blkid < fr_endblk &&
- endblk >= fr_endblk) {
- /* clear the end of this range */
- rp->fr_nblks = blkid - rp->fr_blkid;
- } else if (blkid > rp->fr_blkid && endblk < fr_endblk) {
- /* clear a chunk out of this range */
- free_range_t *new_rp =
- kmem_alloc(sizeof (free_range_t), KM_PUSHPAGE);
-
- new_rp->fr_blkid = endblk;
- new_rp->fr_nblks = fr_endblk - endblk;
- avl_insert_here(tree, new_rp, rp, AVL_AFTER);
- rp->fr_nblks = blkid - rp->fr_blkid;
- }
- /* there may be no overlap */
- rp = nrp;
+ dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
+ if (db != NULL) {
+ dmu_buf_will_dirty(&db->db, tx);
+ dbuf_rele(db, FTAG);
}
}
blkshift = dn->dn_datablkshift;
epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
- if (len == -1ULL) {
+ if (len == DMU_OBJECT_END) {
len = UINT64_MAX - off;
trunc = TRUE;
}
} else {
ASSERT(dn->dn_maxblkid == 0);
if (off == 0 && len >= blksz) {
- /* Freeing the whole block; fast-track this request */
+ /*
+ * Freeing the whole block; fast-track this request.
+ * Note that we won't dirty any indirect blocks,
+ * which is fine because we will be freeing the entire
+ * file and thus all indirect blocks will be freed
+ * by free_children().
+ */
blkid = 0;
nblks = 1;
goto done;
ASSERT3U(blkoff + head, ==, blksz);
if (len < head)
head = len;
- if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE,
- FTAG, &db) == 0) {
+ if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
+ TRUE, FALSE, FTAG, &db) == 0) {
caddr_t data;
/* don't dirty if it isn't on disk and isn't dirty */
if (db->db_last_dirty ||
(db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
rw_exit(&dn->dn_struct_rwlock);
- dbuf_will_dirty(db, tx);
+ dmu_buf_will_dirty(&db->db, tx);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
data = db->db.db_data;
bzero(data + blkoff, head);
else
tail = P2PHASE(len, blksz);
- ASSERT3U(P2PHASE(off, blksz), ==, 0);
+ ASSERT0(P2PHASE(off, blksz));
/* zero out any partial block data at the end of the range */
if (tail) {
if (len < tail)
tail = len;
- if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
- TRUE, FTAG, &db) == 0) {
+ if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
+ TRUE, FALSE, FTAG, &db) == 0) {
/* don't dirty if not on disk and not dirty */
if (db->db_last_dirty ||
(db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
rw_exit(&dn->dn_struct_rwlock);
- dbuf_will_dirty(db, tx);
+ dmu_buf_will_dirty(&db->db, tx);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
bzero(db->db.db_data, tail);
}
nblks += 1;
/*
- * Read in and mark all the level-1 indirects dirty,
- * so that they will stay in memory until syncing phase.
- * Always dirty the first and last indirect to make sure
- * we dirty all the partial indirects.
+ * Dirty all the indirect blocks in this range. Note that only
+ * the first and last indirect blocks can actually be written
+ * (if they were partially freed) -- they must be dirtied, even if
+ * they do not exist on disk yet. The interior blocks will
+ * be freed by free_children(), so they will not actually be written.
+ * Even though these interior blocks will not be written, we
+ * dirty them for two reasons:
+ *
+ * - It ensures that the indirect blocks remain in memory until
+ * syncing context. (They have already been prefetched by
+ * dmu_tx_hold_free(), so we don't have to worry about reading
+ * them serially here.)
+ *
+ * - The dirty space accounting will put pressure on the txg sync
+ * mechanism to begin syncing, and to delay transactions if there
+ * is a large amount of freeing. Even though these indirect
+ * blocks will not be written, we could need to write the same
+ * amount of space if we copy the freed BPs into deadlists.
*/
if (dn->dn_nlevels > 1) {
- uint64_t i, first, last;
- int shift = epbs + dn->dn_datablkshift;
+ uint64_t first, last, i, ibyte;
+ int shift, err;
first = blkid >> epbs;
- if ((db = dbuf_hold_level(dn, 1, first, FTAG))) {
- dbuf_will_dirty(db, tx);
- dbuf_rele(db, FTAG);
- }
+ dnode_dirty_l1(dn, first, tx);
if (trunc)
last = dn->dn_maxblkid >> epbs;
else
last = (blkid + nblks - 1) >> epbs;
- if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) {
- dbuf_will_dirty(db, tx);
- dbuf_rele(db, FTAG);
- }
- for (i = first + 1; i < last; i++) {
- uint64_t ibyte = i << shift;
- int err;
+ if (last != first)
+ dnode_dirty_l1(dn, last, tx);
- err = dnode_next_offset(dn,
- DNODE_FIND_HAVELOCK, &ibyte, 1, 1, 0);
+ shift = dn->dn_datablkshift + dn->dn_indblkshift -
+ SPA_BLKPTRSHIFT;
+ for (i = first + 1; i < last; i++) {
+ /*
+ * Set i to the blockid of the next non-hole
+ * level-1 indirect block at or after i. Note
+ * that dnode_next_offset() operates in terms of
+ * level-0-equivalent bytes.
+ */
+ ibyte = i << shift;
+ err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
+ &ibyte, 2, 1, 0);
i = ibyte >> shift;
- if (err == ESRCH || i >= last)
+ if (i >= last)
break;
- ASSERT(err == 0);
- db = dbuf_hold_level(dn, 1, i, FTAG);
- if (db) {
- dbuf_will_dirty(db, tx);
- dbuf_rele(db, FTAG);
- }
+
+ /*
+ * Normally we should not see an error, either
+ * from dnode_next_offset() or dbuf_hold_level()
+ * (except for ESRCH from dnode_next_offset).
+ * If there is an i/o error, then when we read
+ * this block in syncing context, it will use
+ * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
+ * to the "failmode" property. dnode_next_offset()
+ * doesn't have a flag to indicate MUSTSUCCEED.
+ */
+ if (err != 0)
+ break;
+
+ dnode_dirty_l1(dn, i, tx);
}
}
+
done:
/*
* Add this range to the dnode range list.
* We will finish up this free operation in the syncing phase.
*/
mutex_enter(&dn->dn_mtx);
- dnode_clear_range(dn, blkid, nblks, tx);
{
- free_range_t *rp, *found;
- avl_index_t where;
- avl_tree_t *tree = &dn->dn_ranges[tx->tx_txg&TXG_MASK];
-
- /* Add new range to dn_ranges */
- rp = kmem_alloc(sizeof (free_range_t), KM_PUSHPAGE);
- rp->fr_blkid = blkid;
- rp->fr_nblks = nblks;
- found = avl_find(tree, rp, &where);
- ASSERT(found == NULL);
- avl_insert(tree, rp, where);
- dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
- blkid, nblks, tx->tx_txg);
+ int txgoff = tx->tx_txg & TXG_MASK;
+ if (dn->dn_free_ranges[txgoff] == NULL) {
+ dn->dn_free_ranges[txgoff] =
+ range_tree_create(NULL, NULL, &dn->dn_mtx);
+ }
+ range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
+ range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
}
+ dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
+ blkid, nblks, tx->tx_txg);
mutex_exit(&dn->dn_mtx);
dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
dnode_setdirty(dn, tx);
out:
- if (trunc && dn->dn_maxblkid >= (off >> blkshift))
- dn->dn_maxblkid = (off >> blkshift ? (off >> blkshift) - 1 : 0);
rw_exit(&dn->dn_struct_rwlock);
}
uint64_t
dnode_block_freed(dnode_t *dn, uint64_t blkid)
{
- free_range_t range_tofind;
void *dp = spa_get_dsl(dn->dn_objset->os_spa);
int i;
if (blkid == DMU_SPILL_BLKID)
return (dnode_spill_freed(dn));
- range_tofind.fr_blkid = blkid;
mutex_enter(&dn->dn_mtx);
for (i = 0; i < TXG_SIZE; i++) {
- free_range_t *range_found;
- avl_index_t idx;
-
- range_found = avl_find(&dn->dn_ranges[i], &range_tofind, &idx);
- if (range_found) {
- ASSERT(range_found->fr_nblks > 0);
- break;
- }
- range_found = avl_nearest(&dn->dn_ranges[i], idx, AVL_BEFORE);
- if (range_found &&
- range_found->fr_blkid + range_found->fr_nblks > blkid)
+ if (dn->dn_free_ranges[i] != NULL &&
+ range_tree_contains(dn->dn_free_ranges[i], blkid, 1))
break;
}
mutex_exit(&dn->dn_mtx);
space += delta;
if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) {
ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0);
- ASSERT3U(P2PHASE(space, 1<<DEV_BSHIFT), ==, 0);
+ ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT));
dn->dn_phys->dn_used = space >> DEV_BSHIFT;
} else {
dn->dn_phys->dn_used = space;
}
/*
- * Call when we think we're going to write/free space in open context.
- * Be conservative (ie. OK to write less than this or free more than
- * this, but don't write more or free less).
- */
-void
-dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
-{
- objset_t *os = dn->dn_objset;
- dsl_dataset_t *ds = os->os_dsl_dataset;
-
- if (space > 0)
- space = spa_get_asize(os->os_spa, space);
-
- if (ds)
- dsl_dir_willuse_space(ds->ds_dir, space, tx);
-
- dmu_tx_willuse_space(tx, space);
-}
-
-/*
- * This function scans a block at the indicated "level" looking for
- * a hole or data (depending on 'flags'). If level > 0, then we are
- * scanning an indirect block looking at its pointers. If level == 0,
- * then we are looking at a block of dnodes. If we don't find what we
- * are looking for in the block, we return ESRCH. Otherwise, return
- * with *offset pointing to the beginning (if searching forwards) or
- * end (if searching backwards) of the range covered by the block
- * pointer we matched on (or dnode).
+ * Scans a block at the indicated "level" looking for a hole or data,
+ * depending on 'flags'.
+ *
+ * If level > 0, then we are scanning an indirect block looking at its
+ * pointers. If level == 0, then we are looking at a block of dnodes.
+ *
+ * If we don't find what we are looking for in the block, we return ESRCH.
+ * Otherwise, return with *offset pointing to the beginning (if searching
+ * forwards) or end (if searching backwards) of the range covered by the
+ * block pointer we matched on (or dnode).
*
* The basic search algorithm used below by dnode_next_offset() is to
* use this function to search up the block tree (widen the search) until
*/
static int
dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
- int lvl, uint64_t blkfill, uint64_t txg)
+ int lvl, uint64_t blkfill, uint64_t txg)
{
dmu_buf_impl_t *db = NULL;
void *data = NULL;
boolean_t hole;
int i, inc, error, span;
- dprintf("probing object %llu offset %llx level %d of %u\n",
- dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
-
hole = ((flags & DNODE_FIND_HOLE) != 0);
inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
ASSERT(txg == 0 || !hole);
epb = dn->dn_phys->dn_nblkptr;
data = dn->dn_phys->dn_blkptr;
} else {
- uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl);
- error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db);
+ uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
+ error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
if (error) {
if (error != ENOENT)
return (error);
* at the pointer to this block in its parent, and its
* going to be unallocated, so we will skip over it.
*/
- return (ESRCH);
+ return (SET_ERROR(ESRCH));
}
error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT);
if (error) {
data = db->db.db_data;
}
- if (db && txg &&
- (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg)) {
+
+ if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
+ db->db_blkptr->blk_birth <= txg ||
+ BP_IS_HOLE(db->db_blkptr))) {
/*
* This can only happen when we are searching up the tree
* and these conditions mean that we need to keep climbing.
*/
- error = ESRCH;
+ error = SET_ERROR(ESRCH);
} else if (lvl == 0) {
dnode_phys_t *dnp = data;
- span = DNODE_SHIFT;
+
ASSERT(dn->dn_type == DMU_OT_DNODE);
+ ASSERT(!(flags & DNODE_FIND_BACKWARDS));
- for (i = (*offset >> span) & (blkfill - 1);
- i >= 0 && i < blkfill; i += inc) {
+ for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1);
+ i < blkfill; i += dnp[i].dn_extra_slots + 1) {
if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
break;
- *offset += (1ULL << span) * inc;
}
- if (i < 0 || i == blkfill)
- error = ESRCH;
+
+ if (i == blkfill)
+ error = SET_ERROR(ESRCH);
+
+ *offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) +
+ (i << DNODE_SHIFT);
} else {
blkptr_t *bp = data;
uint64_t start = *offset;
else
minfill++;
- *offset = *offset >> span;
+ if (span >= 8 * sizeof (*offset)) {
+ /* This only happens on the highest indirection level */
+ ASSERT3U((lvl - 1), ==, dn->dn_phys->dn_nlevels - 1);
+ *offset = 0;
+ } else {
+ *offset = *offset >> span;
+ }
+
for (i = BF64_GET(*offset, 0, epbs);
i >= 0 && i < epb; i += inc) {
- if (bp[i].blk_fill >= minfill &&
- bp[i].blk_fill <= maxfill &&
+ if (BP_GET_FILL(&bp[i]) >= minfill &&
+ BP_GET_FILL(&bp[i]) <= maxfill &&
(hole || bp[i].blk_birth > txg))
break;
if (inc > 0 || *offset > 0)
*offset += inc;
}
- *offset = *offset << span;
+
+ if (span >= 8 * sizeof (*offset)) {
+ *offset = start;
+ } else {
+ *offset = *offset << span;
+ }
+
if (inc < 0) {
/* traversing backwards; position offset at the end */
ASSERT3U(*offset, <=, start);
*offset = start;
}
if (i < 0 || i >= epb)
- error = ESRCH;
+ error = SET_ERROR(ESRCH);
}
if (db)
rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_phys->dn_nlevels == 0) {
- error = ESRCH;
+ error = SET_ERROR(ESRCH);
goto out;
}
if (flags & DNODE_FIND_HOLE)
*offset = dn->dn_datablksz;
} else {
- error = ESRCH;
+ error = SET_ERROR(ESRCH);
}
goto out;
}
flags, offset, lvl, blkfill, txg);
}
+ /*
+ * There's always a "virtual hole" at the end of the object, even
+ * if all BP's which physically exist are non-holes.
+ */
+ if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 &&
+ minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) {
+ error = 0;
+ }
+
if (error == 0 && (flags & DNODE_FIND_BACKWARDS ?
initial_offset < *offset : initial_offset > *offset))
- error = ESRCH;
+ error = SET_ERROR(ESRCH);
out:
if (!(flags & DNODE_FIND_HAVELOCK))
rw_exit(&dn->dn_struct_rwlock);