*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
#include <sys/zfs_context.h>
static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
#endif /* _KERNEL */
+static int
+dbuf_compare(const void *x1, const void *x2)
+{
+ const dmu_buf_impl_t *d1 = x1;
+ const dmu_buf_impl_t *d2 = x2;
+
+ int cmp = AVL_CMP(d1->db_level, d2->db_level);
+ if (likely(cmp))
+ return (cmp);
+
+ cmp = AVL_CMP(d1->db_blkid, d2->db_blkid);
+ if (likely(cmp))
+ return (cmp);
+
+ if (d1->db_state == DB_SEARCH) {
+ ASSERT3S(d2->db_state, !=, DB_SEARCH);
+ return (-1);
+ } else if (d2->db_state == DB_SEARCH) {
+ ASSERT3S(d1->db_state, !=, DB_SEARCH);
+ return (1);
+ }
+
+ return (AVL_PCMP(d1, d2));
+}
+
/* ARGSUSED */
static int
dnode_cons(void *arg, void *unused, int kmflag)
dnode_t *dn = arg;
int i;
- rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL);
+ rw_init(&dn->dn_struct_rwlock, NULL, RW_NOLOCKDEP, NULL);
mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
dn->dn_id_flags = 0;
dn->dn_dbufs_count = 0;
- dn->dn_unlisted_l0_blkid = 0;
- list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
+ avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
offsetof(dmu_buf_impl_t, db_link));
dn->dn_moved = 0;
ASSERT0(dn->dn_id_flags);
ASSERT0(dn->dn_dbufs_count);
- ASSERT0(dn->dn_unlisted_l0_blkid);
- list_destroy(&dn->dn_dbufs);
+ avl_destroy(&dn->dn_dbufs);
}
void
}
if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
int i;
+ int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
if (dn->dn_datablkshift) {
ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
ASSERT(DMU_OT_IS_VALID(dn->dn_type));
ASSERT3U(dn->dn_nblkptr, >=, 1);
ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
- ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+ ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen);
ASSERT3U(dn->dn_datablksz, ==,
dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
- dn->dn_bonuslen, <=, DN_MAX_BONUSLEN);
+ dn->dn_bonuslen, <=, max_bonuslen);
for (i = 0; i < TXG_SIZE; i++) {
ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
}
dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
+ dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots);
dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
dnp->dn_used = BSWAP_64(dnp->dn_used);
* dnode buffer).
*/
int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
- size_t len = DN_MAX_BONUSLEN - off;
+ int slots = dnp->dn_extra_slots + 1;
+ size_t len = DN_SLOTS_TO_BONUSLEN(slots) - off;
dmu_object_byteswap_t byteswap;
ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype);
/* Swap SPILL block if we have one */
if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
- byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t));
-
+ byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t));
}
void
dnode_buf_byteswap(void *vbuf, size_t size)
{
- dnode_phys_t *buf = vbuf;
- int i;
+ int i = 0;
ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
- size >>= DNODE_SHIFT;
- for (i = 0; i < size; i++) {
- dnode_byteswap(buf);
- buf++;
+ while (i < size) {
+ dnode_phys_t *dnp = vbuf + i;
+ dnode_byteswap(dnp);
+
+ i += DNODE_MIN_SIZE;
+ if (dnp->dn_type != DMU_OT_NONE)
+ i += dnp->dn_extra_slots * DNODE_MIN_SIZE;
}
}
dnode_setdirty(dn, tx);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
- ASSERT3U(newsize, <=, DN_MAX_BONUSLEN -
+ ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
(dn->dn_nblkptr-1) * sizeof (blkptr_t));
dn->dn_bonuslen = newsize;
if (newsize == 0)
dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
uint64_t object, dnode_handle_t *dnh)
{
- dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
+ dnode_t *dn;
+ dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
ASSERT(!POINTER_IS_VALID(dn->dn_objset));
dn->dn_moved = 0;
dn->dn_compress = dnp->dn_compress;
dn->dn_bonustype = dnp->dn_bonustype;
dn->dn_bonuslen = dnp->dn_bonuslen;
+ dn->dn_num_slots = dnp->dn_extra_slots + 1;
dn->dn_maxblkid = dnp->dn_maxblkid;
dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
dn->dn_id_flags = 0;
ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
mutex_enter(&os->os_lock);
- list_insert_head(&os->os_dnodes, dn);
+ if (dnh->dnh_dnode != NULL) {
+ /* Lost the allocation race. */
+ mutex_exit(&os->os_lock);
+ kmem_cache_free(dnode_cache, dn);
+ return (dnh->dnh_dnode);
+ }
+
+ /*
+ * Exclude special dnodes from os_dnodes so an empty os_dnodes
+ * signifies that the special dnodes have no references from
+ * their children (the entries in os_dnodes). This allows
+ * dnode_destroy() to easily determine if the last child has
+ * been removed and then complete eviction of the objset.
+ */
+ if (!DMU_OBJECT_IS_SPECIAL(object))
+ list_insert_head(&os->os_dnodes, dn);
membar_producer();
+
/*
- * Everything else must be valid before assigning dn_objset makes the
- * dnode eligible for dnode_move().
+ * Everything else must be valid before assigning dn_objset
+ * makes the dnode eligible for dnode_move().
*/
dn->dn_objset = os;
+
+ dnh->dnh_dnode = dn;
mutex_exit(&os->os_lock);
- arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
+ arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE);
return (dn);
}
dnode_destroy(dnode_t *dn)
{
objset_t *os = dn->dn_objset;
+ boolean_t complete_os_eviction = B_FALSE;
ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
mutex_enter(&os->os_lock);
POINTER_INVALIDATE(&dn->dn_objset);
- list_remove(&os->os_dnodes, dn);
+ if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
+ list_remove(&os->os_dnodes, dn);
+ complete_os_eviction =
+ list_is_empty(&os->os_dnodes) &&
+ list_link_active(&os->os_evicting_node);
+ }
mutex_exit(&os->os_lock);
/* the dnode can no longer move, so we can release the handle */
}
if (dn->dn_bonus != NULL) {
mutex_enter(&dn->dn_bonus->db_mtx);
- dbuf_evict(dn->dn_bonus);
+ dbuf_destroy(dn->dn_bonus);
dn->dn_bonus = NULL;
}
dn->dn_zio = NULL;
dn->dn_newuid = 0;
dn->dn_newgid = 0;
dn->dn_id_flags = 0;
- dn->dn_unlisted_l0_blkid = 0;
- dmu_zfetch_rele(&dn->dn_zfetch);
+ dmu_zfetch_fini(&dn->dn_zfetch);
kmem_cache_free(dnode_cache, dn);
- arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
+ arc_space_return(sizeof (dnode_t), ARC_SPACE_DNODE);
+
+ if (complete_os_eviction)
+ dmu_objset_evict_done(os);
}
void
dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+ dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
{
int i;
+ ASSERT3U(dn_slots, >, 0);
+ ASSERT3U(dn_slots << DNODE_SHIFT, <=,
+ spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)));
+ ASSERT3U(blocksize, <=,
+ spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
if (blocksize == 0)
blocksize = 1 << zfs_default_bs;
- else if (blocksize > SPA_MAXBLOCKSIZE)
- blocksize = SPA_MAXBLOCKSIZE;
else
blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE);
ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
- dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset,
- dn->dn_object, tx->tx_txg, blocksize, ibs);
+ dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d dn_slots=%d\n",
+ dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots);
ASSERT(dn->dn_type == DMU_OT_NONE);
ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
(bonustype == DMU_OT_SA && bonuslen == 0) ||
(bonustype != DMU_OT_NONE && bonuslen != 0));
ASSERT(DMU_OT_IS_VALID(bonustype));
- ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+ ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots));
ASSERT(dn->dn_type == DMU_OT_NONE);
ASSERT0(dn->dn_maxblkid);
ASSERT0(dn->dn_allocated_txg);
ASSERT0(dn->dn_assigned_txg);
ASSERT(refcount_is_zero(&dn->dn_tx_holds));
ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
- ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
+ ASSERT(avl_is_empty(&dn->dn_dbufs));
for (i = 0; i < TXG_SIZE; i++) {
ASSERT0(dn->dn_next_nblkptr[i]);
dnode_setdblksz(dn, blocksize);
dn->dn_indblkshift = ibs;
dn->dn_nlevels = 1;
+ dn->dn_num_slots = dn_slots;
if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
dn->dn_nblkptr = 1;
- else
- dn->dn_nblkptr = 1 +
- ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+ else {
+ dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR,
+ 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
+ SPA_BLKPTRSHIFT));
+ }
+
dn->dn_bonustype = bonustype;
dn->dn_bonuslen = bonuslen;
dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
void
dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
- dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+ dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
{
int nblkptr;
ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE);
- ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE);
+ ASSERT3U(blocksize, <=,
+ spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
ASSERT0(blocksize % SPA_MINBLOCKSIZE);
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));
ASSERT(tx->tx_txg != 0);
(bonustype != DMU_OT_NONE && bonuslen != 0) ||
(bonustype == DMU_OT_SA && bonuslen == 0));
ASSERT(DMU_OT_IS_VALID(bonustype));
- ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN);
+ ASSERT3U(bonuslen, <=,
+ DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
+
+ dn_slots = dn_slots > 0 ? dn_slots : DNODE_MIN_SLOTS;
/* clean up any unreferenced dbufs */
dnode_evict_dbufs(dn);
if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */
nblkptr = 1;
else
- nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT);
+ nblkptr = MIN(DN_MAX_NBLKPTR,
+ 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >>
+ SPA_BLKPTRSHIFT));
if (dn->dn_bonustype != bonustype)
dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype;
if (dn->dn_nblkptr != nblkptr)
mutex_enter(&dn->dn_mtx);
dn->dn_bonustype = bonustype;
dn->dn_bonuslen = bonuslen;
+ dn->dn_num_slots = dn_slots;
dn->dn_nblkptr = nblkptr;
dn->dn_checksum = ZIO_CHECKSUM_INHERIT;
dn->dn_compress = ZIO_COMPRESS_INHERIT;
/* fix up the bonus db_size */
if (dn->dn_bonus) {
dn->dn_bonus->db.db_size =
- DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+ DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
+ (dn->dn_nblkptr-1) * sizeof (blkptr_t);
ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size);
}
ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
- ASSERT(list_is_empty(&ndn->dn_dbufs));
- list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs);
+ ASSERT(avl_is_empty(&ndn->dn_dbufs));
+ avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
ndn->dn_dbufs_count = odn->dn_dbufs_count;
- ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid;
ndn->dn_bonus = odn->dn_bonus;
ndn->dn_have_spill = odn->dn_have_spill;
ndn->dn_zio = odn->dn_zio;
dmu_zfetch_init(&ndn->dn_zfetch, NULL);
list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
- ndn->dn_zfetch.zf_stream_cnt = odn->dn_zfetch.zf_stream_cnt;
- ndn->dn_zfetch.zf_alloc_fail = odn->dn_zfetch.zf_alloc_fail;
/*
* Update back pointers. Updating the handle fixes the back pointer of
*/
odn->dn_dbuf = NULL;
odn->dn_handle = NULL;
- list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t),
+ avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
offsetof(dmu_buf_impl_t, db_link));
odn->dn_dbufs_count = 0;
- odn->dn_unlisted_l0_blkid = 0;
odn->dn_bonus = NULL;
odn->dn_zfetch.zf_dnode = NULL;
*/
while (refcount_count(&dn->dn_holds) > 0)
delay(1);
+ ASSERT(dn->dn_dbuf == NULL ||
+ dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
zrl_add(&dnh->dnh_zrlock);
dnode_destroy(dn); /* implicit zrl_remove() */
zrl_destroy(&dnh->dnh_zrlock);
dnh->dnh_dnode = NULL;
}
-dnode_t *
+void
dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
dnode_handle_t *dnh)
{
- dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh);
- dnh->dnh_dnode = dn;
+ dnode_t *dn;
+
+ dn = dnode_create(os, dnp, NULL, object, dnh);
zrl_init(&dnh->dnh_zrlock);
DNODE_VERIFY(dn);
- return (dn);
}
static void
-dnode_buf_pageout(dmu_buf_t *db, void *arg)
+dnode_buf_evict_async(void *dbu)
{
- dnode_children_t *children_dnodes = arg;
+ dnode_children_t *children_dnodes = dbu;
int i;
- int epb = db->db_size >> DNODE_SHIFT;
-
- ASSERT(epb == children_dnodes->dnc_count);
- for (i = 0; i < epb; i++) {
+ for (i = 0; i < children_dnodes->dnc_count; i++) {
dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
dnode_t *dn;
dnh->dnh_dnode = NULL;
}
kmem_free(children_dnodes, sizeof (dnode_children_t) +
- (epb - 1) * sizeof (dnode_handle_t));
+ children_dnodes->dnc_count * sizeof (dnode_handle_t));
+}
+
+/*
+ * Return true if the given index is interior to a dnode already
+ * allocated in the block. That is, the index is neither free nor
+ * allocated, but is consumed by a large dnode.
+ *
+ * The dnode_phys_t buffer may not be in sync with the in-core dnode
+ * structure, so we try to check the dnode structure first and fall back
+ * to the dnode_phys_t buffer it doesn't exist.
+ */
+static boolean_t
+dnode_is_consumed(dmu_buf_impl_t *db, int idx)
+{
+ dnode_handle_t *dnh;
+ dmu_object_type_t ot;
+ dnode_children_t *children_dnodes;
+ dnode_phys_t *dn_block;
+ int skip;
+ int i;
+
+ children_dnodes = dmu_buf_get_user(&db->db);
+ dn_block = (dnode_phys_t *)db->db.db_data;
+
+ for (i = 0; i < idx; i += skip) {
+ dnh = &children_dnodes->dnc_children[i];
+
+ zrl_add(&dnh->dnh_zrlock);
+ if (dnh->dnh_dnode != NULL) {
+ ot = dnh->dnh_dnode->dn_type;
+ skip = dnh->dnh_dnode->dn_num_slots;
+ } else {
+ ot = dn_block[i].dn_type;
+ skip = dn_block[i].dn_extra_slots + 1;
+ }
+ zrl_remove(&dnh->dnh_zrlock);
+
+ if (ot == DMU_OT_NONE)
+ skip = 1;
+ }
+
+ return (i > idx);
+}
+
+/*
+ * Return true if the given index in the dnode block is a valid
+ * allocated dnode. That is, the index is not consumed by a large
+ * dnode and is not free.
+ *
+ * The dnode_phys_t buffer may not be in sync with the in-core dnode
+ * structure, so we try to check the dnode structure first and fall back
+ * to the dnode_phys_t buffer it doesn't exist.
+ */
+static boolean_t
+dnode_is_allocated(dmu_buf_impl_t *db, int idx)
+{
+ dnode_handle_t *dnh;
+ dmu_object_type_t ot;
+ dnode_children_t *children_dnodes;
+ dnode_phys_t *dn_block;
+
+ if (dnode_is_consumed(db, idx))
+ return (B_FALSE);
+
+ children_dnodes = dmu_buf_get_user(&db->db);
+ dn_block = (dnode_phys_t *)db->db.db_data;
+
+ dnh = &children_dnodes->dnc_children[idx];
+
+ zrl_add(&dnh->dnh_zrlock);
+ if (dnh->dnh_dnode != NULL)
+ ot = dnh->dnh_dnode->dn_type;
+ else
+ ot = dn_block[idx].dn_type;
+ zrl_remove(&dnh->dnh_zrlock);
+
+ return (ot != DMU_OT_NONE);
+}
+
+/*
+ * Return true if the given range of indices in the dnode block are
+ * free. That is, the starting index is not consumed by a large dnode
+ * and none of the indices are allocated.
+ *
+ * The dnode_phys_t buffer may not be in sync with the in-core dnode
+ * structure, so we try to check the dnode structure first and fall back
+ * to the dnode_phys_t buffer it doesn't exist.
+ */
+static boolean_t
+dnode_is_free(dmu_buf_impl_t *db, int idx, int slots)
+{
+ dnode_handle_t *dnh;
+ dmu_object_type_t ot;
+ dnode_children_t *children_dnodes;
+ dnode_phys_t *dn_block;
+ int i;
+
+ if (idx + slots > DNODES_PER_BLOCK)
+ return (B_FALSE);
+
+ children_dnodes = dmu_buf_get_user(&db->db);
+ dn_block = (dnode_phys_t *)db->db.db_data;
+
+ if (dnode_is_consumed(db, idx))
+ return (B_FALSE);
+
+ for (i = idx; i < idx + slots; i++) {
+ dnh = &children_dnodes->dnc_children[i];
+
+ zrl_add(&dnh->dnh_zrlock);
+ if (dnh->dnh_dnode != NULL)
+ ot = dnh->dnh_dnode->dn_type;
+ else
+ ot = dn_block[i].dn_type;
+ zrl_remove(&dnh->dnh_zrlock);
+
+ if (ot != DMU_OT_NONE)
+ return (B_FALSE);
+ }
+
+ return (B_TRUE);
}
/*
* errors:
* EINVAL - invalid object number.
+ * ENOSPC - hole too small to fulfill "slots" request
+ * ENOENT - the requested dnode is not allocated
* EIO - i/o error.
* succeeds even for free dnodes.
*/
int
-dnode_hold_impl(objset_t *os, uint64_t object, int flag,
+dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
void *tag, dnode_t **dnp)
{
- int epb, idx, err;
+ int epb, idx, err, i;
int drop_struct_lock = FALSE;
int type;
uint64_t blk;
dnode_t *mdn, *dn;
dmu_buf_impl_t *db;
dnode_children_t *children_dnodes;
+ dnode_phys_t *dn_block_begin;
dnode_handle_t *dnh;
+ ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0));
+ ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0));
+
/*
* If you are holding the spa config lock as writer, you shouldn't
* be asking the DMU to do *anything* unless it's the root pool
drop_struct_lock = TRUE;
}
- blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t));
+ blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t));
db = dbuf_hold(mdn, blk, FTAG);
if (drop_struct_lock)
ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
epb = db->db.db_size >> DNODE_SHIFT;
- idx = object & (epb-1);
-
ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
children_dnodes = dmu_buf_get_user(&db->db);
if (children_dnodes == NULL) {
- int i;
dnode_children_t *winner;
- children_dnodes = kmem_alloc(sizeof (dnode_children_t) +
- (epb - 1) * sizeof (dnode_handle_t), KM_SLEEP);
+ children_dnodes = kmem_zalloc(sizeof (dnode_children_t) +
+ epb * sizeof (dnode_handle_t), KM_SLEEP);
children_dnodes->dnc_count = epb;
dnh = &children_dnodes->dnc_children[0];
for (i = 0; i < epb; i++) {
zrl_init(&dnh[i].dnh_zrlock);
- dnh[i].dnh_dnode = NULL;
}
- if ((winner = dmu_buf_set_user(&db->db, children_dnodes, NULL,
- dnode_buf_pageout))) {
+ dmu_buf_init_user(&children_dnodes->dnc_dbu, NULL,
+ dnode_buf_evict_async, NULL);
+ winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu);
+ if (winner != NULL) {
for (i = 0; i < epb; i++) {
zrl_destroy(&dnh[i].dnh_zrlock);
}
kmem_free(children_dnodes, sizeof (dnode_children_t) +
- (epb - 1) * sizeof (dnode_handle_t));
+ epb * sizeof (dnode_handle_t));
children_dnodes = winner;
}
}
ASSERT(children_dnodes->dnc_count == epb);
- dnh = &children_dnodes->dnc_children[idx];
- zrl_add(&dnh->dnh_zrlock);
- if ((dn = dnh->dnh_dnode) == NULL) {
- dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
- dnode_t *winner;
+ idx = object & (epb - 1);
+ dn_block_begin = (dnode_phys_t *)db->db.db_data;
- dn = dnode_create(os, phys, db, object, dnh);
- winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn);
- if (winner != NULL) {
- zrl_add(&dnh->dnh_zrlock);
- dnode_destroy(dn); /* implicit zrl_remove() */
- dn = winner;
- }
+ if ((flag & DNODE_MUST_BE_FREE) && !dnode_is_free(db, idx, slots)) {
+ dbuf_rele(db, FTAG);
+ return (ENOSPC);
+ } else if ((flag & DNODE_MUST_BE_ALLOCATED) &&
+ !dnode_is_allocated(db, idx)) {
+ dbuf_rele(db, FTAG);
+ return (ENOENT);
}
+ dnh = &children_dnodes->dnc_children[idx];
+ zrl_add(&dnh->dnh_zrlock);
+ dn = dnh->dnh_dnode;
+ if (dn == NULL)
+ dn = dnode_create(os, dn_block_begin + idx, db, object, dnh);
+
mutex_enter(&dn->dn_mtx);
type = dn->dn_type;
if (dn->dn_free_txg ||
- ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
- ((flag & DNODE_MUST_BE_FREE) &&
- (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
+ ((flag & DNODE_MUST_BE_FREE) && !refcount_is_zero(&dn->dn_holds))) {
mutex_exit(&dn->dn_mtx);
zrl_remove(&dnh->dnh_zrlock);
dbuf_rele(db, FTAG);
return (type == DMU_OT_NONE ? ENOENT : EEXIST);
}
- mutex_exit(&dn->dn_mtx);
-
if (refcount_add(&dn->dn_holds, tag) == 1)
dbuf_add_ref(db, dnh);
+ mutex_exit(&dn->dn_mtx);
+
/* Now we can rely on the hold to prevent the dnode from moving. */
zrl_remove(&dnh->dnh_zrlock);
int
dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
{
- return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp));
+ return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag,
+ dnp));
}
/*
*/
dmu_objset_userquota_get_ids(dn, B_TRUE, tx);
- mutex_enter(&os->os_lock);
+ multilist_t *dirtylist = os->os_dirty_dnodes[txg & TXG_MASK];
+ multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn);
/*
* If we are already marked dirty, we're done.
*/
if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) {
- mutex_exit(&os->os_lock);
+ multilist_sublist_unlock(mls);
return;
}
- ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
+ ASSERT(!refcount_is_zero(&dn->dn_holds) ||
+ !avl_is_empty(&dn->dn_dbufs));
ASSERT(dn->dn_datablksz != 0);
ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]);
ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]);
dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
dn->dn_object, txg);
- if (dn->dn_free_txg > 0 && dn->dn_free_txg <= txg) {
- list_insert_tail(&os->os_free_dnodes[txg&TXG_MASK], dn);
- } else {
- list_insert_tail(&os->os_dirty_dnodes[txg&TXG_MASK], dn);
- }
+ multilist_sublist_insert_head(mls, dn);
- mutex_exit(&os->os_lock);
+ multilist_sublist_unlock(mls);
/*
* The dnode maintains a hold on its containing dbuf as
void
dnode_free(dnode_t *dn, dmu_tx_t *tx)
{
- int txgoff = tx->tx_txg & TXG_MASK;
-
- dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg);
-
- /* we should be the only holder... hopefully */
- /* ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */
-
mutex_enter(&dn->dn_mtx);
if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) {
mutex_exit(&dn->dn_mtx);
dn->dn_free_txg = tx->tx_txg;
mutex_exit(&dn->dn_mtx);
- /*
- * If the dnode is already dirty, it needs to be moved from
- * the dirty list to the free list.
- */
- mutex_enter(&dn->dn_objset->os_lock);
- if (list_link_active(&dn->dn_dirty_link[txgoff])) {
- list_remove(&dn->dn_objset->os_dirty_dnodes[txgoff], dn);
- list_insert_tail(&dn->dn_objset->os_free_dnodes[txgoff], dn);
- mutex_exit(&dn->dn_objset->os_lock);
- } else {
- mutex_exit(&dn->dn_objset->os_lock);
- dnode_setdirty(dn, tx);
- }
+ dnode_setdirty(dn, tx);
}
/*
int
dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
{
- dmu_buf_impl_t *db, *db_next;
+ dmu_buf_impl_t *db;
int err;
+ ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
if (size == 0)
size = SPA_MINBLOCKSIZE;
- if (size > SPA_MAXBLOCKSIZE)
- size = SPA_MAXBLOCKSIZE;
else
size = P2ROUNDUP(size, SPA_MINBLOCKSIZE);
goto fail;
mutex_enter(&dn->dn_dbufs_mtx);
- for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
- db_next = list_next(&dn->dn_dbufs, db);
-
+ for (db = avl_first(&dn->dn_dbufs); db != NULL;
+ db = AVL_NEXT(&dn->dn_dbufs, db)) {
if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
db->db_blkid != DMU_SPILL_BLKID) {
mutex_exit(&dn->dn_dbufs_mtx);
goto fail;
/* resize the old block */
- err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db);
+ err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
if (err == 0)
dbuf_new_size(db, size, tx);
else if (err != ENOENT)
sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs)
new_nlevels++;
+ ASSERT3U(new_nlevels, <=, DN_MAX_LEVELS);
+
if (new_nlevels > dn->dn_nlevels) {
int old_nlevels = dn->dn_nlevels;
dmu_buf_impl_t *db;
rw_downgrade(&dn->dn_struct_rwlock);
}
+static void
+dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG);
+ if (db != NULL) {
+ dmu_buf_will_dirty(&db->db, tx);
+ dbuf_rele(db, FTAG);
+ }
+}
+
void
dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
{
ASSERT3U(blkoff + head, ==, blksz);
if (len < head)
head = len;
- if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE,
- FTAG, &db) == 0) {
+ if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
+ TRUE, FALSE, FTAG, &db) == 0) {
caddr_t data;
/* don't dirty if it isn't on disk and isn't dirty */
if (tail) {
if (len < tail)
tail = len;
- if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len),
- TRUE, FTAG, &db) == 0) {
+ if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
+ TRUE, FALSE, FTAG, &db) == 0) {
/* don't dirty if not on disk and not dirty */
if (db->db_last_dirty ||
(db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) {
nblks += 1;
/*
- * Dirty the first and last indirect blocks, as they (and/or their
- * parents) will need to be written out if they were only
- * partially freed. Interior indirect blocks will be themselves freed,
- * by free_children(), so they need not be dirtied. Note that these
- * interior blocks have already been prefetched by dmu_tx_hold_free().
+ * Dirty all the indirect blocks in this range. Note that only
+ * the first and last indirect blocks can actually be written
+ * (if they were partially freed) -- they must be dirtied, even if
+ * they do not exist on disk yet. The interior blocks will
+ * be freed by free_children(), so they will not actually be written.
+ * Even though these interior blocks will not be written, we
+ * dirty them for two reasons:
+ *
+ * - It ensures that the indirect blocks remain in memory until
+ * syncing context. (They have already been prefetched by
+ * dmu_tx_hold_free(), so we don't have to worry about reading
+ * them serially here.)
+ *
+ * - The dirty space accounting will put pressure on the txg sync
+ * mechanism to begin syncing, and to delay transactions if there
+ * is a large amount of freeing. Even though these indirect
+ * blocks will not be written, we could need to write the same
+ * amount of space if we copy the freed BPs into deadlists.
*/
if (dn->dn_nlevels > 1) {
- uint64_t first, last;
+ uint64_t first, last, i, ibyte;
+ int shift, err;
first = blkid >> epbs;
- if ((db = dbuf_hold_level(dn, 1, first, FTAG))) {
- dmu_buf_will_dirty(&db->db, tx);
- dbuf_rele(db, FTAG);
- }
+ dnode_dirty_l1(dn, first, tx);
if (trunc)
last = dn->dn_maxblkid >> epbs;
else
last = (blkid + nblks - 1) >> epbs;
- if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) {
- dmu_buf_will_dirty(&db->db, tx);
- dbuf_rele(db, FTAG);
+ if (last != first)
+ dnode_dirty_l1(dn, last, tx);
+
+ shift = dn->dn_datablkshift + dn->dn_indblkshift -
+ SPA_BLKPTRSHIFT;
+ for (i = first + 1; i < last; i++) {
+ /*
+ * Set i to the blockid of the next non-hole
+ * level-1 indirect block at or after i. Note
+ * that dnode_next_offset() operates in terms of
+ * level-0-equivalent bytes.
+ */
+ ibyte = i << shift;
+ err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
+ &ibyte, 2, 1, 0);
+ i = ibyte >> shift;
+ if (i >= last)
+ break;
+
+ /*
+ * Normally we should not see an error, either
+ * from dnode_next_offset() or dbuf_hold_level()
+ * (except for ESRCH from dnode_next_offset).
+ * If there is an i/o error, then when we read
+ * this block in syncing context, it will use
+ * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according
+ * to the "failmode" property. dnode_next_offset()
+ * doesn't have a flag to indicate MUSTSUCCEED.
+ */
+ if (err != 0)
+ break;
+
+ dnode_dirty_l1(dn, i, tx);
}
}
mutex_exit(&dn->dn_mtx);
}
-/*
- * Call when we think we're going to write/free space in open context to track
- * the amount of memory in use by the currently open txg.
- */
-void
-dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx)
-{
- objset_t *os = dn->dn_objset;
- dsl_dataset_t *ds = os->os_dsl_dataset;
- int64_t aspace = spa_get_asize(os->os_spa, space);
-
- if (ds != NULL) {
- dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
- dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
- }
-
- dmu_tx_willuse_space(tx, aspace);
-}
-
/*
* Scans a block at the indicated "level" looking for a hole or data,
* depending on 'flags'.
*/
static int
dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset,
- int lvl, uint64_t blkfill, uint64_t txg)
+ int lvl, uint64_t blkfill, uint64_t txg)
{
dmu_buf_impl_t *db = NULL;
void *data = NULL;
boolean_t hole;
int i, inc, error, span;
- dprintf("probing object %llu offset %llx level %d of %u\n",
- dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels);
-
hole = ((flags & DNODE_FIND_HOLE) != 0);
inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1;
ASSERT(txg == 0 || !hole);
epb = dn->dn_phys->dn_nblkptr;
data = dn->dn_phys->dn_blkptr;
} else {
- uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl);
- error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db);
+ uint64_t blkid = dbuf_whichblock(dn, lvl, *offset);
+ error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db);
if (error) {
if (error != ENOENT)
return (error);
error = SET_ERROR(ESRCH);
} else if (lvl == 0) {
dnode_phys_t *dnp = data;
- span = DNODE_SHIFT;
+
ASSERT(dn->dn_type == DMU_OT_DNODE);
+ ASSERT(!(flags & DNODE_FIND_BACKWARDS));
- for (i = (*offset >> span) & (blkfill - 1);
- i >= 0 && i < blkfill; i += inc) {
+ for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1);
+ i < blkfill; i += dnp[i].dn_extra_slots + 1) {
if ((dnp[i].dn_type == DMU_OT_NONE) == hole)
break;
- *offset += (1ULL << span) * inc;
}
- if (i < 0 || i == blkfill)
+
+ if (i == blkfill)
error = SET_ERROR(ESRCH);
+
+ *offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) +
+ (i << DNODE_SHIFT);
} else {
blkptr_t *bp = data;
uint64_t start = *offset;
else
minfill++;
- *offset = *offset >> span;
+ if (span >= 8 * sizeof (*offset)) {
+ /* This only happens on the highest indirection level */
+ ASSERT3U((lvl - 1), ==, dn->dn_phys->dn_nlevels - 1);
+ *offset = 0;
+ } else {
+ *offset = *offset >> span;
+ }
+
for (i = BF64_GET(*offset, 0, epbs);
i >= 0 && i < epb; i += inc) {
if (BP_GET_FILL(&bp[i]) >= minfill &&
if (inc > 0 || *offset > 0)
*offset += inc;
}
- *offset = *offset << span;
+
+ if (span >= 8 * sizeof (*offset)) {
+ *offset = start;
+ } else {
+ *offset = *offset << span;
+ }
+
if (inc < 0) {
/* traversing backwards; position offset at the end */
ASSERT3U(*offset, <=, start);