* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
#include <sys/zio.h>
#include <sys/dmu_zfetch.h>
#include <sys/range_tree.h>
-#include <sys/trace_defs.h>
+#include <sys/trace_zfs.h>
#include <sys/zfs_project.h>
dnode_stats_t dnode_stats = {
{ "dnode_move_active", KSTAT_DATA_UINT64 },
};
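+/*
+ * Write-mostly sums backing the dnodestats kstat above; they are folded
+ * into dnode_stats by dnode_kstats_update() whenever the kstat is read.
+ */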
+dnode_sums_t dnode_sums;
+
static kstat_t *dnode_ksp;
static kmem_cache_t *dnode_cache;
-ASSERTV(static dnode_phys_t dnode_phys_zero);
+static dnode_phys_t dnode_phys_zero __maybe_unused;
int zfs_default_bs = SPA_MINBLOCKSHIFT;
int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
if (likely(cmp))
return (cmp);
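+ /*
+ * A DB_MARKER dbuf is a placeholder used to remember a position in
+ * dn_dbufs; it encodes its sort position in db_parent, so compare
+ * through that pointer to keep the ordering total.
+ */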
+ if (d1->db_state == DB_MARKER) {
+ ASSERT3S(d2->db_state, !=, DB_MARKER);
+ return (TREE_PCMP(d1->db_parent, d2));
+ } else if (d2->db_state == DB_MARKER) {
+ ASSERT3S(d1->db_state, !=, DB_MARKER);
+ return (TREE_PCMP(d1, d2->db_parent));
+ }
+
if (d1->db_state == DB_SEARCH) {
ASSERT3S(d2->db_state, !=, DB_SEARCH);
return (-1);
return (TREE_PCMP(d1, d2));
}
-/* ARGSUSED */
static int
dnode_cons(void *arg, void *unused, int kmflag)
{
+ (void) unused, (void) kmflag;
dnode_t *dn = arg;
- int i;
rw_init(&dn->dn_struct_rwlock, NULL, RW_NOLOCKDEP, NULL);
mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL);
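+ /* signaled by dnode_rele_and_unlock() once the last dnode hold is dropped */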
+ cv_init(&dn->dn_nodnholds, NULL, CV_DEFAULT, NULL);
/*
* Every dbuf has a reference, and dropping a tracked reference is
zfs_refcount_create(&dn->dn_tx_holds);
list_link_init(&dn->dn_link);
- bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr));
- bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels));
- bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift));
- bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype));
- bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk));
- bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen));
- bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz));
- bzero(&dn->dn_next_maxblkid[0], sizeof (dn->dn_next_maxblkid));
-
- for (i = 0; i < TXG_SIZE; i++) {
+ memset(dn->dn_next_type, 0, sizeof (dn->dn_next_type));
+ memset(dn->dn_next_nblkptr, 0, sizeof (dn->dn_next_nblkptr));
+ memset(dn->dn_next_nlevels, 0, sizeof (dn->dn_next_nlevels));
+ memset(dn->dn_next_indblkshift, 0, sizeof (dn->dn_next_indblkshift));
+ memset(dn->dn_next_bonustype, 0, sizeof (dn->dn_next_bonustype));
+ memset(dn->dn_rm_spillblk, 0, sizeof (dn->dn_rm_spillblk));
+ memset(dn->dn_next_bonuslen, 0, sizeof (dn->dn_next_bonuslen));
+ memset(dn->dn_next_blksz, 0, sizeof (dn->dn_next_blksz));
+ memset(dn->dn_next_maxblkid, 0, sizeof (dn->dn_next_maxblkid));
+
+ for (int i = 0; i < TXG_SIZE; i++) {
multilist_link_init(&dn->dn_dirty_link[i]);
dn->dn_free_ranges[i] = NULL;
list_create(&dn->dn_dirty_records[i],
return (0);
}
-/* ARGSUSED */
static void
dnode_dest(void *arg, void *unused)
{
- int i;
+ (void) unused;
dnode_t *dn = arg;
rw_destroy(&dn->dn_struct_rwlock);
mutex_destroy(&dn->dn_mtx);
mutex_destroy(&dn->dn_dbufs_mtx);
cv_destroy(&dn->dn_notxholds);
+ cv_destroy(&dn->dn_nodnholds);
zfs_refcount_destroy(&dn->dn_holds);
zfs_refcount_destroy(&dn->dn_tx_holds);
ASSERT(!list_link_active(&dn->dn_link));
- for (i = 0; i < TXG_SIZE; i++) {
+ for (int i = 0; i < TXG_SIZE; i++) {
ASSERT(!multilist_link_active(&dn->dn_dirty_link[i]));
ASSERT3P(dn->dn_free_ranges[i], ==, NULL);
list_destroy(&dn->dn_dirty_records[i]);
avl_destroy(&dn->dn_dbufs);
}
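+/*
+ * Fold the per-CPU wmsum counters into the named kstat values on each read.
+ * The kstat is read-only; attempts to write it fail with EACCES.
+ */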
+static int
+dnode_kstats_update(kstat_t *ksp, int rw)
+{
+ dnode_stats_t *ds = ksp->ks_data;
+
+ if (rw == KSTAT_WRITE)
+ return (EACCES);
+ ds->dnode_hold_dbuf_hold.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_hold_dbuf_hold);
+ ds->dnode_hold_dbuf_read.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_hold_dbuf_read);
+ ds->dnode_hold_alloc_hits.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_hold_alloc_hits);
+ ds->dnode_hold_alloc_misses.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_hold_alloc_misses);
+ ds->dnode_hold_alloc_interior.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_hold_alloc_interior);
+ ds->dnode_hold_alloc_lock_retry.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_hold_alloc_lock_retry);
+ ds->dnode_hold_alloc_lock_misses.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_hold_alloc_lock_misses);
+ ds->dnode_hold_alloc_type_none.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_hold_alloc_type_none);
+ ds->dnode_hold_free_hits.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_hold_free_hits);
+ ds->dnode_hold_free_misses.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_hold_free_misses);
+ ds->dnode_hold_free_lock_misses.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_hold_free_lock_misses);
+ ds->dnode_hold_free_lock_retry.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_hold_free_lock_retry);
+ ds->dnode_hold_free_refcount.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_hold_free_refcount);
+ ds->dnode_hold_free_overflow.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_hold_free_overflow);
+ ds->dnode_free_interior_lock_retry.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_free_interior_lock_retry);
+ ds->dnode_allocate.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_allocate);
+ ds->dnode_reallocate.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_reallocate);
+ ds->dnode_buf_evict.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_buf_evict);
+ ds->dnode_alloc_next_chunk.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_alloc_next_chunk);
+ ds->dnode_alloc_race.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_alloc_race);
+ ds->dnode_alloc_next_block.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_alloc_next_block);
+ ds->dnode_move_invalid.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_move_invalid);
+ ds->dnode_move_recheck1.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_move_recheck1);
+ ds->dnode_move_recheck2.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_move_recheck2);
+ ds->dnode_move_special.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_move_special);
+ ds->dnode_move_handle.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_move_handle);
+ ds->dnode_move_rwlock.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_move_rwlock);
+ ds->dnode_move_active.value.ui64 =
+ wmsum_value(&dnode_sums.dnode_move_active);
+ return (0);
+}
+
void
dnode_init(void)
{
0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0);
kmem_cache_set_move(dnode_cache, dnode_move);
+ wmsum_init(&dnode_sums.dnode_hold_dbuf_hold, 0);
+ wmsum_init(&dnode_sums.dnode_hold_dbuf_read, 0);
+ wmsum_init(&dnode_sums.dnode_hold_alloc_hits, 0);
+ wmsum_init(&dnode_sums.dnode_hold_alloc_misses, 0);
+ wmsum_init(&dnode_sums.dnode_hold_alloc_interior, 0);
+ wmsum_init(&dnode_sums.dnode_hold_alloc_lock_retry, 0);
+ wmsum_init(&dnode_sums.dnode_hold_alloc_lock_misses, 0);
+ wmsum_init(&dnode_sums.dnode_hold_alloc_type_none, 0);
+ wmsum_init(&dnode_sums.dnode_hold_free_hits, 0);
+ wmsum_init(&dnode_sums.dnode_hold_free_misses, 0);
+ wmsum_init(&dnode_sums.dnode_hold_free_lock_misses, 0);
+ wmsum_init(&dnode_sums.dnode_hold_free_lock_retry, 0);
+ wmsum_init(&dnode_sums.dnode_hold_free_refcount, 0);
+ wmsum_init(&dnode_sums.dnode_hold_free_overflow, 0);
+ wmsum_init(&dnode_sums.dnode_free_interior_lock_retry, 0);
+ wmsum_init(&dnode_sums.dnode_allocate, 0);
+ wmsum_init(&dnode_sums.dnode_reallocate, 0);
+ wmsum_init(&dnode_sums.dnode_buf_evict, 0);
+ wmsum_init(&dnode_sums.dnode_alloc_next_chunk, 0);
+ wmsum_init(&dnode_sums.dnode_alloc_race, 0);
+ wmsum_init(&dnode_sums.dnode_alloc_next_block, 0);
+ wmsum_init(&dnode_sums.dnode_move_invalid, 0);
+ wmsum_init(&dnode_sums.dnode_move_recheck1, 0);
+ wmsum_init(&dnode_sums.dnode_move_recheck2, 0);
+ wmsum_init(&dnode_sums.dnode_move_special, 0);
+ wmsum_init(&dnode_sums.dnode_move_handle, 0);
+ wmsum_init(&dnode_sums.dnode_move_rwlock, 0);
+ wmsum_init(&dnode_sums.dnode_move_active, 0);
+
dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc",
KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL);
if (dnode_ksp != NULL) {
dnode_ksp->ks_data = &dnode_stats;
+ dnode_ksp->ks_update = dnode_kstats_update;
kstat_install(dnode_ksp);
}
}
dnode_ksp = NULL;
}
+ wmsum_fini(&dnode_sums.dnode_hold_dbuf_hold);
+ wmsum_fini(&dnode_sums.dnode_hold_dbuf_read);
+ wmsum_fini(&dnode_sums.dnode_hold_alloc_hits);
+ wmsum_fini(&dnode_sums.dnode_hold_alloc_misses);
+ wmsum_fini(&dnode_sums.dnode_hold_alloc_interior);
+ wmsum_fini(&dnode_sums.dnode_hold_alloc_lock_retry);
+ wmsum_fini(&dnode_sums.dnode_hold_alloc_lock_misses);
+ wmsum_fini(&dnode_sums.dnode_hold_alloc_type_none);
+ wmsum_fini(&dnode_sums.dnode_hold_free_hits);
+ wmsum_fini(&dnode_sums.dnode_hold_free_misses);
+ wmsum_fini(&dnode_sums.dnode_hold_free_lock_misses);
+ wmsum_fini(&dnode_sums.dnode_hold_free_lock_retry);
+ wmsum_fini(&dnode_sums.dnode_hold_free_refcount);
+ wmsum_fini(&dnode_sums.dnode_hold_free_overflow);
+ wmsum_fini(&dnode_sums.dnode_free_interior_lock_retry);
+ wmsum_fini(&dnode_sums.dnode_allocate);
+ wmsum_fini(&dnode_sums.dnode_reallocate);
+ wmsum_fini(&dnode_sums.dnode_buf_evict);
+ wmsum_fini(&dnode_sums.dnode_alloc_next_chunk);
+ wmsum_fini(&dnode_sums.dnode_alloc_race);
+ wmsum_fini(&dnode_sums.dnode_alloc_next_block);
+ wmsum_fini(&dnode_sums.dnode_move_invalid);
+ wmsum_fini(&dnode_sums.dnode_move_recheck1);
+ wmsum_fini(&dnode_sums.dnode_move_recheck2);
+ wmsum_fini(&dnode_sums.dnode_move_special);
+ wmsum_fini(&dnode_sums.dnode_move_handle);
+ wmsum_fini(&dnode_sums.dnode_move_rwlock);
+ wmsum_fini(&dnode_sums.dnode_move_active);
+
kmem_cache_destroy(dnode_cache);
dnode_cache = NULL;
}
int i;
if (dnp->dn_type == DMU_OT_NONE) {
- bzero(dnp, sizeof (dnode_phys_t));
+ memset(dnp, 0, sizeof (dnode_phys_t));
return;
}
* dnode dnode is smaller than a regular dnode.
*/
if (dnp->dn_bonuslen != 0) {
- /*
- * Note that the bonus length calculated here may be
- * longer than the actual bonus buffer. This is because
- * we always put the bonus buffer after the last block
- * pointer (instead of packing it against the end of the
- * dnode buffer).
- */
- int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
- int slots = dnp->dn_extra_slots + 1;
- size_t len = DN_SLOTS_TO_BONUSLEN(slots) - off;
dmu_object_byteswap_t byteswap;
ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype);
- dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len);
+ dmu_ot_byteswap[byteswap].ob_func(DN_BONUS(dnp),
+ DN_MAX_BONUS_LEN(dnp));
}
/* Swap SPILL block if we have one */
/* clear any data after the end of the new size */
size_t diff = dn->dn_bonuslen - newsize;
char *data_end = ((char *)dn->dn_bonus->db.db_data) + newsize;
- bzero(data_end, diff);
+ memset(data_end, 0, diff);
}
dn->dn_bonuslen = newsize;
dn->dn_dirty_txg = 0;
dn->dn_dirtyctx = 0;
- if (dn->dn_dirtyctx_firstset != NULL) {
- kmem_free(dn->dn_dirtyctx_firstset, 1);
- dn->dn_dirtyctx_firstset = NULL;
- }
+ dn->dn_dirtyctx_firstset = NULL;
if (dn->dn_bonus != NULL) {
mutex_enter(&dn->dn_bonus->db_mtx);
dbuf_destroy(dn->dn_bonus);
ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT);
dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d dn_slots=%d\n",
- dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots);
+ dn->dn_objset, (u_longlong_t)dn->dn_object,
+ (u_longlong_t)tx->tx_txg, blocksize, ibs, dn_slots);
DNODE_STAT_BUMP(dnode_allocate);
ASSERT(dn->dn_type == DMU_OT_NONE);
- ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0);
+ ASSERT0(memcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)));
ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE);
ASSERT(ot != DMU_OT_NONE);
ASSERT(DMU_OT_IS_VALID(ot));
ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) ||
(bonustype == DMU_OT_SA && bonuslen == 0) ||
+ (bonustype == DMU_OTN_UINT64_METADATA && bonuslen == 0) ||
(bonustype != DMU_OT_NONE && bonuslen != 0));
ASSERT(DMU_OT_IS_VALID(bonustype));
ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots));
ASSERT0(dn->dn_maxblkid);
ASSERT0(dn->dn_allocated_txg);
ASSERT0(dn->dn_assigned_txg);
- ASSERT0(dn->dn_dirty_txg);
ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
ASSERT3U(zfs_refcount_count(&dn->dn_holds), <=, 1);
ASSERT(avl_is_empty(&dn->dn_dbufs));
dn->dn_dirtyctx = 0;
dn->dn_free_txg = 0;
- if (dn->dn_dirtyctx_firstset) {
- kmem_free(dn->dn_dirtyctx_firstset, 1);
- dn->dn_dirtyctx_firstset = NULL;
- }
+ dn->dn_dirtyctx_firstset = NULL;
+ dn->dn_dirty_txg = 0;
dn->dn_allocated_txg = tx->tx_txg;
dn->dn_id_flags = 0;
static void
dnode_move_impl(dnode_t *odn, dnode_t *ndn)
{
- int i;
-
ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock));
ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx));
ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx));
- ASSERT(!MUTEX_HELD(&odn->dn_zfetch.zf_lock));
/* Copy fields. */
ndn->dn_objset = odn->dn_objset;
ndn->dn_datablksz = odn->dn_datablksz;
ndn->dn_maxblkid = odn->dn_maxblkid;
ndn->dn_num_slots = odn->dn_num_slots;
- bcopy(&odn->dn_next_type[0], &ndn->dn_next_type[0],
+ memcpy(ndn->dn_next_type, odn->dn_next_type,
sizeof (odn->dn_next_type));
- bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0],
+ memcpy(ndn->dn_next_nblkptr, odn->dn_next_nblkptr,
sizeof (odn->dn_next_nblkptr));
- bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0],
+ memcpy(ndn->dn_next_nlevels, odn->dn_next_nlevels,
sizeof (odn->dn_next_nlevels));
- bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0],
+ memcpy(ndn->dn_next_indblkshift, odn->dn_next_indblkshift,
sizeof (odn->dn_next_indblkshift));
- bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0],
+ memcpy(ndn->dn_next_bonustype, odn->dn_next_bonustype,
sizeof (odn->dn_next_bonustype));
- bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0],
+ memcpy(ndn->dn_rm_spillblk, odn->dn_rm_spillblk,
sizeof (odn->dn_rm_spillblk));
- bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0],
+ memcpy(ndn->dn_next_bonuslen, odn->dn_next_bonuslen,
sizeof (odn->dn_next_bonuslen));
- bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0],
+ memcpy(ndn->dn_next_blksz, odn->dn_next_blksz,
sizeof (odn->dn_next_blksz));
- bcopy(&odn->dn_next_maxblkid[0], &ndn->dn_next_maxblkid[0],
+ memcpy(ndn->dn_next_maxblkid, odn->dn_next_maxblkid,
sizeof (odn->dn_next_maxblkid));
- for (i = 0; i < TXG_SIZE; i++) {
+ for (int i = 0; i < TXG_SIZE; i++) {
list_move_tail(&ndn->dn_dirty_records[i],
&odn->dn_dirty_records[i]);
}
- bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0],
+ memcpy(ndn->dn_free_ranges, odn->dn_free_ranges,
sizeof (odn->dn_free_ranges));
ndn->dn_allocated_txg = odn->dn_allocated_txg;
ndn->dn_free_txg = odn->dn_free_txg;
ndn->dn_newgid = odn->dn_newgid;
ndn->dn_newprojid = odn->dn_newprojid;
ndn->dn_id_flags = odn->dn_id_flags;
- dmu_zfetch_init(&ndn->dn_zfetch, NULL);
- list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream);
- ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode;
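+ /* prefetch streams are not migrated; the new dnode starts a fresh zfetch */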
+ dmu_zfetch_init(&ndn->dn_zfetch, ndn);
/*
* Update back pointers. Updating the handle fixes the back pointer of
*/
ASSERT(ndn->dn_handle->dnh_dnode == odn);
ndn->dn_handle->dnh_dnode = ndn;
- if (ndn->dn_zfetch.zf_dnode == odn) {
- ndn->dn_zfetch.zf_dnode = ndn;
- }
/*
* Invalidate the original dnode by clearing all of its back pointers.
/*
* Satisfy the destructor.
*/
- for (i = 0; i < TXG_SIZE; i++) {
+ for (int i = 0; i < TXG_SIZE; i++) {
list_create(&odn->dn_dirty_records[i],
sizeof (dbuf_dirty_record_t),
offsetof(dbuf_dirty_record_t, dr_dirty_node));
odn->dn_moved = (uint8_t)-1;
}
-/*ARGSUSED*/
static kmem_cbrc_t
dnode_move(void *buf, void *newbuf, size_t size, void *arg)
{
*/
refcount = zfs_refcount_count(&odn->dn_holds);
ASSERT(refcount >= 0);
- dbufs = odn->dn_dbufs_count;
+ dbufs = DN_DBUFS_COUNT(odn);
/* We can't have more dbufs than dnode holds. */
ASSERT3U(dbufs, <=, refcount);
list_link_replace(&odn->dn_link, &ndn->dn_link);
/* If the dnode was safe to move, the refcount cannot have changed. */
ASSERT(refcount == zfs_refcount_count(&ndn->dn_holds));
- ASSERT(dbufs == ndn->dn_dbufs_count);
+ ASSERT(dbufs == DN_DBUFS_COUNT(ndn));
zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */
mutex_exit(&os->os_lock);
return (B_TRUE);
}
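+/*
+ * Reclaim any free (DMU_OT_NONE) dnode_t structures in the given slot range
+ * and return how many were destroyed, so the caller can adjust the dbuf's
+ * accounted user size.
+ */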
-static void
+static uint_t
dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
{
+ uint_t reclaimed = 0;
+
ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
for (int i = idx; i < idx + slots; i++) {
ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
dnode_destroy(dnh->dnh_dnode);
dnh->dnh_dnode = DN_SLOT_FREE;
+ reclaimed++;
}
}
+
+ return (reclaimed);
}
void
while (!dnode_slots_tryenter(children, idx, slots)) {
DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
- cond_resched();
+ kpreempt(KPREEMPT_SYNC);
}
dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
dnode_t *dn = dnh->dnh_dnode;
/*
- * Wait for final references to the dnode to clear. This can
- * only happen if the arc is asynchronously evicting state that
- * has a hold on this dnode while we are trying to evict this
- * dnode.
+ * Ensure dnode_rele_and_unlock() has released dn_mtx after the final
+ * zfs_refcount_remove(), before tearing the dnode down.
*/
- while (zfs_refcount_count(&dn->dn_holds) > 0)
- delay(1);
+ mutex_enter(&dn->dn_mtx);
+ if (zfs_refcount_count(&dn->dn_holds) > 0)
+ cv_wait(&dn->dn_nodnholds, &dn->dn_mtx);
+ mutex_exit(&dn->dn_mtx);
+ ASSERT3U(zfs_refcount_count(&dn->dn_holds), ==, 0);
+
ASSERT(dn->dn_dbuf == NULL ||
dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
zrl_add(&dnh->dnh_zrlock);
dnode_t *dn;
zrl_init(&dnh->dnh_zrlock);
- zrl_tryenter(&dnh->dnh_zrlock);
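+ /* the handle was just initialized and is still private, so this cannot fail */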
+ VERIFY3U(1, ==, zrl_tryenter(&dnh->dnh_zrlock));
dn = dnode_create(os, dnp, NULL, object, dnh);
DNODE_VERIFY(dn);
*/
int
dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
- void *tag, dnode_t **dnp)
+ const void *tag, dnode_t **dnp)
{
int epb, idx, err;
int drop_struct_lock = FALSE;
* We do not need to decrypt to read the dnode so it doesn't matter
* if we get the encrypted or decrypted version.
*/
- err = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_NO_DECRYPT);
+ err = dbuf_read(db, NULL, DB_RF_CANFAIL |
+ DB_RF_NO_DECRYPT | DB_RF_NOPREFETCH);
if (err) {
DNODE_STAT_BUMP(dnode_hold_dbuf_read);
dbuf_rele(db, FTAG);
dnode_slots_rele(dnc, idx, slots);
while (!dnode_slots_tryenter(dnc, idx, slots)) {
DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry);
- cond_resched();
+ kpreempt(KPREEMPT_SYNC);
}
/*
} else {
dn = dnode_create(os, dn_block + idx, db,
object, dnh);
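+ /* charge the new in-core dnode_t to the dbuf's user size */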
+ dmu_buf_add_user_size(&db->db,
+ sizeof (dnode_t));
}
}
dnode_slots_rele(dnc, idx, slots);
while (!dnode_slots_tryenter(dnc, idx, slots)) {
DNODE_STAT_BUMP(dnode_hold_free_lock_retry);
- cond_resched();
+ kpreempt(KPREEMPT_SYNC);
}
if (!dnode_check_slots_free(dnc, idx, slots)) {
* to be freed. Single slot dnodes can be safely
* re-purposed as a performance optimization.
*/
- if (slots > 1)
- dnode_reclaim_slots(dnc, idx + 1, slots - 1);
+ if (slots > 1) {
+ uint_t reclaimed =
+ dnode_reclaim_slots(dnc, idx + 1, slots - 1);
+ if (reclaimed > 0)
+ dmu_buf_sub_user_size(&db->db,
+ reclaimed * sizeof (dnode_t));
+ }
dnh = &dnc->dnc_children[idx];
if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
} else {
dn = dnode_create(os, dn_block + idx, db,
object, dnh);
+ dmu_buf_add_user_size(&db->db, sizeof (dnode_t));
}
mutex_enter(&dn->dn_mtx);
dnode_slots_rele(dnc, idx, slots);
DNODE_VERIFY(dn);
+ ASSERT3P(dnp, !=, NULL);
ASSERT3P(dn->dn_dbuf, ==, db);
ASSERT3U(dn->dn_object, ==, object);
dbuf_rele(db, FTAG);
* Return held dnode if the object is allocated, NULL if not.
*/
int
-dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp)
+dnode_hold(objset_t *os, uint64_t object, const void *tag, dnode_t **dnp)
{
return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag,
dnp));
* new reference.
*/
boolean_t
-dnode_add_ref(dnode_t *dn, void *tag)
+dnode_add_ref(dnode_t *dn, const void *tag)
{
mutex_enter(&dn->dn_mtx);
if (zfs_refcount_is_zero(&dn->dn_holds)) {
}
void
-dnode_rele(dnode_t *dn, void *tag)
+dnode_rele(dnode_t *dn, const void *tag)
{
mutex_enter(&dn->dn_mtx);
dnode_rele_and_unlock(dn, tag, B_FALSE);
}
void
-dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting)
+dnode_rele_and_unlock(dnode_t *dn, const void *tag, boolean_t evicting)
{
uint64_t refs;
/* Get while the hold prevents the dnode from moving. */
dnode_handle_t *dnh = dn->dn_handle;
refs = zfs_refcount_remove(&dn->dn_holds, tag);
+ if (refs == 0)
+ cv_broadcast(&dn->dn_nodnholds);
mutex_exit(&dn->dn_mtx);
+ /* dnode could get destroyed at this point, so don't use it anymore */
/*
* It's unsafe to release the last hold on a dnode by dnode_rele() or
* other direct or indirect hold on the dnode must first drop the dnode
* handle.
*/
+#ifdef ZFS_DEBUG
ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread);
+#endif
/* NOTE: the DNODE_DNODE does not have a dn_dbuf */
if (refs == 0 && db != NULL) {
slots, NULL, NULL));
}
+/*
+ * Checks if the dnode itself is dirty, or is carrying any uncommitted records.
+ * It is important to check both conditions, as some operations (e.g. appending
+ * to a file) can dirty both as a single logical unit, but they are not synced
+ * out atomically, so checking one and not the other can result in an object
+ * appearing to be clean mid-way through a commit.
+ *
+ * Do not change this lightly! If you get it wrong, dmu_offset_next() can
+ * detect a hole where there is really data, leading to silent corruption.
+ */
+boolean_t
+dnode_is_dirty(dnode_t *dn)
+{
+ mutex_enter(&dn->dn_mtx);
+
+ for (int i = 0; i < TXG_SIZE; i++) {
+ if (multilist_link_active(&dn->dn_dirty_link[i]) ||
+ !list_is_empty(&dn->dn_dirty_records[i])) {
+ mutex_exit(&dn->dn_mtx);
+ return (B_TRUE);
+ }
+ }
+
+ mutex_exit(&dn->dn_mtx);
+
+ return (B_FALSE);
+}
+
void
dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
{
*/
dmu_objset_userquota_get_ids(dn, B_TRUE, tx);
- multilist_t *dirtylist = os->os_dirty_dnodes[txg & TXG_MASK];
+ multilist_t *dirtylist = &os->os_dirty_dnodes[txg & TXG_MASK];
multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn);
/*
ASSERT0(dn->dn_next_bonustype[txg & TXG_MASK]);
dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n",
- dn->dn_object, txg);
+ (u_longlong_t)dn->dn_object, (u_longlong_t)txg);
multilist_sublist_insert_head(mls, dn);
if (ibs == dn->dn_indblkshift)
ibs = 0;
- if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0)
+ if (size == dn->dn_datablksz && ibs == 0)
return (0);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
if (ibs && dn->dn_nlevels != 1)
goto fail;
- /* resize the old block */
- err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
- if (err == 0) {
- dbuf_new_size(db, size, tx);
- } else if (err != ENOENT) {
- goto fail;
- }
-
- dnode_setdblksz(dn, size);
dnode_setdirty(dn, tx);
- dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size;
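+ /* resize block 0 only if the data block size is actually changing */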
+ if (size != dn->dn_datablksz) {
+ /* resize the old block */
+ err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db);
+ if (err == 0) {
+ dbuf_new_size(db, size, tx);
+ } else if (err != ENOENT) {
+ goto fail;
+ }
+
+ dnode_setdblksz(dn, size);
+ dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = size;
+ if (db)
+ dbuf_rele(db, FTAG);
+ }
if (ibs) {
dn->dn_indblkshift = ibs;
- dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs;
+ dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs;
}
- /* release after we have fixed the blocksize in the dnode */
- if (db)
- dbuf_rele(db, FTAG);
rw_exit(&dn->dn_struct_rwlock);
return (0);
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+ ASSERT3U(new_nlevels, >, dn->dn_nlevels);
dn->dn_nlevels = new_nlevels;
ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]);
list = &dn->dn_dirty_records[txgoff];
for (dr = list_head(list); dr; dr = dr_next) {
dr_next = list_next(&dn->dn_dirty_records[txgoff], dr);
- if (dr->dr_dbuf->db_level != new_nlevels-1 &&
+
+ IMPLY(dr->dr_dbuf == NULL, old_nlevels == 1);
+ if (dr->dr_dbuf == NULL ||
+ (dr->dr_dbuf->db_level == old_nlevels - 1 &&
dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
- dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
- ASSERT(dr->dr_dbuf->db_level == old_nlevels-1);
+ dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID)) {
list_remove(&dn->dn_dirty_records[txgoff], dr);
list_insert_tail(&new->dt.di.dr_children, dr);
dr->dr_parent = new;
dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
dmu_tx_t *tx)
{
- dmu_buf_impl_t db_search;
+ dmu_buf_impl_t *db_search;
dmu_buf_impl_t *db;
avl_index_t where;
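+ /* keep the large search key off the kernel stack */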
+ db_search = kmem_zalloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
+
mutex_enter(&dn->dn_dbufs_mtx);
- db_search.db_level = 1;
- db_search.db_blkid = start_blkid + 1;
- db_search.db_state = DB_SEARCH;
+ db_search->db_level = 1;
+ db_search->db_blkid = start_blkid + 1;
+ db_search->db_state = DB_SEARCH;
for (;;) {
- db = avl_find(&dn->dn_dbufs, &db_search, &where);
+ db = avl_find(&dn->dn_dbufs, db_search, &where);
if (db == NULL)
db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
/*
* Set up the next blkid we want to search for.
*/
- db_search.db_blkid = db->db_blkid + 1;
+ db_search->db_blkid = db->db_blkid + 1;
ASSERT3U(db->db_blkid, >=, start_blkid);
/*
/*
* Walk all the in-core level-1 dbufs and verify they have been dirtied.
*/
- db_search.db_level = 1;
- db_search.db_blkid = start_blkid + 1;
- db_search.db_state = DB_SEARCH;
- db = avl_find(&dn->dn_dbufs, &db_search, &where);
+ db_search->db_level = 1;
+ db_search->db_blkid = start_blkid + 1;
+ db_search->db_state = DB_SEARCH;
+ db = avl_find(&dn->dn_dbufs, db_search, &where);
if (db == NULL)
db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) {
ASSERT(db->db_dirtycnt > 0);
}
#endif
+ kmem_free(db_search, sizeof (dmu_buf_impl_t));
mutex_exit(&dn->dn_dbufs_mtx);
}
void
-dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
+dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, const void *tag)
+{
+ /*
+ * Don't set dirtyctx to SYNC if we're just modifying this as we
+ * initialize the objset.
+ */
+ if (dn->dn_dirtyctx == DN_UNDIRTIED) {
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
+
+ if (ds != NULL) {
+ rrw_enter(&ds->ds_bp_rwlock, RW_READER, tag);
+ }
+ if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
+ if (dmu_tx_is_syncing(tx))
+ dn->dn_dirtyctx = DN_DIRTY_SYNC;
+ else
+ dn->dn_dirtyctx = DN_DIRTY_OPEN;
+ dn->dn_dirtyctx_firstset = tag;
+ }
+ if (ds != NULL) {
+ rrw_exit(&ds->ds_bp_rwlock, tag);
+ }
+ }
+}
+
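+/*
+ * Zero "len" bytes starting at "blkoff" within the block containing "off",
+ * dirtying the dbuf only if it is already dirty or backed by an on-disk
+ * block.
+ */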
+static void
+dnode_partial_zero(dnode_t *dn, uint64_t off, uint64_t blkoff, uint64_t len,
+ dmu_tx_t *tx)
{
dmu_buf_impl_t *db;
+ int res;
+
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off), TRUE, FALSE,
+ FTAG, &db);
+ rw_exit(&dn->dn_struct_rwlock);
+ if (res == 0) {
+ db_lock_type_t dblt;
+ boolean_t dirty;
+
+ dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
+ /* don't dirty if not on disk and not dirty */
+ dirty = !list_is_empty(&db->db_dirty_records) ||
+ (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr));
+ dmu_buf_unlock_parent(db, dblt, FTAG);
+ if (dirty) {
+ caddr_t data;
+
+ dmu_buf_will_dirty(&db->db, tx);
+ data = db->db.db_data;
+ memset(data + blkoff, 0, len);
+ }
+ dbuf_rele(db, FTAG);
+ }
+}
+
+void
+dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx)
+{
uint64_t blkoff, blkid, nblks;
int blksz, blkshift, head, tail;
int trunc = FALSE;
}
/* zero out any partial block data at the start of the range */
if (head) {
- int res;
ASSERT3U(blkoff + head, ==, blksz);
if (len < head)
head = len;
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off),
- TRUE, FALSE, FTAG, &db);
- rw_exit(&dn->dn_struct_rwlock);
- if (res == 0) {
- caddr_t data;
- boolean_t dirty;
-
- db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER,
- FTAG);
- /* don't dirty if it isn't on disk and isn't dirty */
- dirty = db->db_last_dirty ||
- (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr));
- dmu_buf_unlock_parent(db, dblt, FTAG);
- if (dirty) {
- dmu_buf_will_dirty(&db->db, tx);
- data = db->db.db_data;
- bzero(data + blkoff, head);
- }
- dbuf_rele(db, FTAG);
- }
+ dnode_partial_zero(dn, off, blkoff, head, tx);
off += head;
len -= head;
}
ASSERT0(P2PHASE(off, blksz));
/* zero out any partial block data at the end of the range */
if (tail) {
- int res;
if (len < tail)
tail = len;
- rw_enter(&dn->dn_struct_rwlock, RW_READER);
- res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len),
- TRUE, FALSE, FTAG, &db);
- rw_exit(&dn->dn_struct_rwlock);
- if (res == 0) {
- boolean_t dirty;
- /* don't dirty if not on disk and not dirty */
- db_lock_type_t type = dmu_buf_lock_parent(db, RW_READER,
- FTAG);
- dirty = db->db_last_dirty ||
- (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr));
- dmu_buf_unlock_parent(db, type, FTAG);
- if (dirty) {
- dmu_buf_will_dirty(&db->db, tx);
- bzero(db->db.db_data, tail);
- }
- dbuf_rele(db, FTAG);
- }
+ dnode_partial_zero(dn, off + len, 0, tail, tx);
len -= tail;
}
*/
mutex_enter(&dn->dn_mtx);
{
- int txgoff = tx->tx_txg & TXG_MASK;
- if (dn->dn_free_ranges[txgoff] == NULL) {
- dn->dn_free_ranges[txgoff] = range_tree_create(NULL,
- RANGE_SEG64, NULL, 0, 0);
- }
- range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
- range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
+ int txgoff = tx->tx_txg & TXG_MASK;
+ if (dn->dn_free_ranges[txgoff] == NULL) {
+ dn->dn_free_ranges[txgoff] = range_tree_create(NULL,
+ RANGE_SEG64, NULL, 0, 0);
+ }
+ range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks);
+ range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks);
}
dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n",
- blkid, nblks, tx->tx_txg);
+ (u_longlong_t)blkid, (u_longlong_t)nblks,
+ (u_longlong_t)tx->tx_txg);
mutex_exit(&dn->dn_mtx);
dbuf_free_range(dn, blkid, blkid + nblks - 1, tx);
uint64_t
dnode_block_freed(dnode_t *dn, uint64_t blkid)
{
- void *dp = spa_get_dsl(dn->dn_objset->os_spa);
int i;
if (blkid == DMU_BONUS_BLKID)
return (FALSE);
- /*
- * If we're in the process of opening the pool, dp will not be
- * set yet, but there shouldn't be anything dirty.
- */
- if (dp == NULL)
- return (FALSE);
-
if (dn->dn_free_txg)
return (TRUE);
return (SET_ERROR(ESRCH));
}
error = dbuf_read(db, NULL,
- DB_RF_CANFAIL | DB_RF_HAVESTRUCT | DB_RF_NO_DECRYPT);
+ DB_RF_CANFAIL | DB_RF_HAVESTRUCT |
+ DB_RF_NO_DECRYPT | DB_RF_NOPREFETCH);
if (error) {
dbuf_rele(db, FTAG);
return (error);
}
if (db != NULL && txg != 0 && (db->db_blkptr == NULL ||
- db->db_blkptr->blk_birth <= txg ||
+ BP_GET_LOGICAL_BIRTH(db->db_blkptr) <= txg ||
BP_IS_HOLE(db->db_blkptr))) {
/*
* This can only happen when we are searching up the tree
i >= 0 && i < epb; i += inc) {
if (BP_GET_FILL(&bp[i]) >= minfill &&
BP_GET_FILL(&bp[i]) <= maxfill &&
- (hole || bp[i].blk_birth > txg))
+ (hole || BP_GET_LOGICAL_BIRTH(&bp[i]) > txg))
break;
if (inc > 0 || *offset > 0)
*offset += inc;
if (inc < 0) {
/* traversing backwards; position offset at the end */
- ASSERT3U(*offset, <=, start);
- *offset = MIN(*offset + (1ULL << span) - 1, start);
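+ /*
+ * If span reaches the full 64-bit width of *offset, the shift
+ * below would be undefined; leave *offset unchanged in that case.
+ */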
+ if (span < 8 * sizeof (*offset))
+ *offset = MIN(*offset + (1ULL << span) - 1,
+ start);
} else if (*offset < start) {
*offset = start;
}
EXPORT_SYMBOL(dnode_evict_dbufs);
EXPORT_SYMBOL(dnode_evict_bonus);
#endif
+
+ZFS_MODULE_PARAM(zfs, zfs_, default_bs, INT, ZMOD_RW,
+ "Default dnode block shift");
+ZFS_MODULE_PARAM(zfs, zfs_, default_ibs, INT, ZMOD_RW,
+ "Default dnode indirect block shift");