* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
* Copyright (c) 2019, Klara Inc.
* Copyright (c) 2019, Allan Jude
+ * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
*/
#include <sys/zfs_context.h>
#include <sys/trace_zfs.h>
#include <sys/callb.h>
#include <sys/abd.h>
+#include <sys/brt.h>
#include <sys/vdev.h>
#include <cityhash.h>
#include <sys/spa_impl.h>
continue; \
}
-static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags);
dbuf_cache_t dbuf_caches[DB_CACHE_MAX];
/* Size limits for the caches */
-static unsigned long dbuf_cache_max_bytes = ULONG_MAX;
-static unsigned long dbuf_metadata_cache_max_bytes = ULONG_MAX;
+static uint64_t dbuf_cache_max_bytes = UINT64_MAX;
+static uint64_t dbuf_metadata_cache_max_bytes = UINT64_MAX;
/* Set the default sizes of the caches to log2 fraction of arc size */
static uint_t dbuf_cache_shift = 5;
(dbuf)->db_blkid == (blkid))
dmu_buf_impl_t *
-dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
+dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid,
+ uint64_t *hash_out)
{
dbuf_hash_table_t *h = &dbuf_hash_table;
uint64_t hv;
}
}
mutex_exit(DBUF_HASH_MUTEX(h, idx));
+ if (hash_out != NULL)
+ *hash_out = hv;
return (NULL);
}
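+
+/*
+ * Callers that may go on to create the dbuf pass a hash_out pointer so
+ * the hash computed during this lookup can be reused by dbuf_create(),
+ * as dbuf_hold_impl() does below:
+ *
+ *	uint64_t hv;
+ *	db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid, &hv);
+ *	if (db == NULL)
+ *		db = dbuf_create(dn, level, blkid, parent, bp, hv);
+ */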
objset_t *os = db->db_objset;
uint64_t obj = db->db.db_object;
int level = db->db_level;
- uint64_t blkid, hv, idx;
+ uint64_t blkid, idx;
dmu_buf_impl_t *dbf;
uint32_t i;
blkid = db->db_blkid;
- hv = dbuf_hash(os, obj, level, blkid);
- idx = hv & h->hash_table_mask;
+ ASSERT3U(dbuf_hash(os, obj, level, blkid), ==, db->db_hash);
+ idx = db->db_hash & h->hash_table_mask;
mutex_enter(DBUF_HASH_MUTEX(h, idx));
for (dbf = h->hash_table[idx], i = 0; dbf != NULL;
dbuf_hash_remove(dmu_buf_impl_t *db)
{
dbuf_hash_table_t *h = &dbuf_hash_table;
- uint64_t hv, idx;
+ uint64_t idx;
dmu_buf_impl_t *dbf, **dbp;
- hv = dbuf_hash(db->db_objset, db->db.db_object,
- db->db_level, db->db_blkid);
- idx = hv & h->hash_table_mask;
+ ASSERT3U(dbuf_hash(db->db_objset, db->db.db_object, db->db_level,
+ db->db_blkid), ==, db->db_hash);
+ idx = db->db_hash & h->hash_table_mask;
/*
* We mustn't hold db_mtx to maintain lock ordering:
*dbu->dbu_clear_on_evict_dbufp = NULL;
#endif
+ if (db->db_caching_status != DB_NO_CACHE) {
+ /*
+ * This is a cached dbuf, so the size of the user data is
+ * included in its cached amount. We adjust it here because the
+ * user data has already been detached from the dbuf, and the
+ * sync functions are not supposed to touch it (the dbuf might
+ * not exist anymore by the time the sync functions run).
+ */
+ uint64_t size = dbu->dbu_size;
+ (void) zfs_refcount_remove_many(
+ &dbuf_caches[db->db_caching_status].size, size, db);
+ if (db->db_caching_status == DB_DBUF_CACHE)
+ DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
+ }
+
/*
* There are two eviction callbacks - one that we call synchronously
* and one that we invoke via a taskq. The async one is useful for
boolean_t
dbuf_is_l2cacheable(dmu_buf_impl_t *db)
{
- vdev_t *vd = NULL;
- zfs_cache_type_t cache = db->db_objset->os_secondary_cache;
- blkptr_t *bp = db->db_blkptr;
-
- if (bp != NULL && !BP_IS_HOLE(bp)) {
+ if (db->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||
+ (db->db_objset->os_secondary_cache == ZFS_CACHE_METADATA &&
+ dbuf_is_metadata(db))) {
+ if (l2arc_exclude_special == 0)
+ return (B_TRUE);
+
+ blkptr_t *bp = db->db_blkptr;
+ if (bp == NULL || BP_IS_HOLE(bp))
+ return (B_FALSE);
uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);
vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev;
+ vdev_t *vd = NULL;
if (vdev < rvd->vdev_children)
vd = rvd->vdev_child[vdev];
- if (cache == ZFS_CACHE_ALL ||
- (dbuf_is_metadata(db) && cache == ZFS_CACHE_METADATA)) {
- if (vd == NULL)
- return (B_TRUE);
+ if (vd == NULL)
+ return (B_TRUE);
- if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
- vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) ||
- l2arc_exclude_special == 0)
- return (B_TRUE);
- }
+ if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
+ vd->vdev_alloc_bias != VDEV_BIAS_DEDUP)
+ return (B_TRUE);
}
-
return (B_FALSE);
}
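+
+/*
+ * Note on the ordering above: the os_secondary_cache property gates
+ * eligibility first; l2arc_exclude_special == 0 then short-circuits to
+ * eligible, and only otherwise is the first DVA resolved to check
+ * whether the block sits on a special or dedup vdev, which are excluded.
+ */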
static inline boolean_t
dnode_level_is_l2cacheable(blkptr_t *bp, dnode_t *dn, int64_t level)
{
- vdev_t *vd = NULL;
- zfs_cache_type_t cache = dn->dn_objset->os_secondary_cache;
-
- if (bp != NULL && !BP_IS_HOLE(bp)) {
+ if (dn->dn_objset->os_secondary_cache == ZFS_CACHE_ALL ||
+ (dn->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA &&
+ (level > 0 ||
+ DMU_OT_IS_METADATA(dn->dn_handle->dnh_dnode->dn_type)))) {
+ if (l2arc_exclude_special == 0)
+ return (B_TRUE);
+
+ if (bp == NULL || BP_IS_HOLE(bp))
+ return (B_FALSE);
uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);
vdev_t *rvd = dn->dn_objset->os_spa->spa_root_vdev;
+ vdev_t *vd = NULL;
if (vdev < rvd->vdev_children)
vd = rvd->vdev_child[vdev];
- if (cache == ZFS_CACHE_ALL || ((level > 0 ||
- DMU_OT_IS_METADATA(dn->dn_handle->dnh_dnode->dn_type)) &&
- cache == ZFS_CACHE_METADATA)) {
- if (vd == NULL)
- return (B_TRUE);
+ if (vd == NULL)
+ return (B_TRUE);
- if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
- vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) ||
- l2arc_exclude_special == 0)
- return (B_TRUE);
- }
+ if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
+ vd->vdev_alloc_bias != VDEV_BIAS_DEDUP)
+ return (B_TRUE);
}
-
return (B_FALSE);
}
if (db != NULL) {
multilist_sublist_remove(mls, db);
multilist_sublist_unlock(mls);
+ uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db);
(void) zfs_refcount_remove_many(
- &dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db);
+ &dbuf_caches[DB_DBUF_CACHE].size, size, db);
DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
DBUF_STAT_BUMPDOWN(cache_count);
- DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
- db->db.db_size);
+ DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
db->db_caching_status = DB_NO_CACHE;
dbuf_destroy(db);
if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
(db->db_buf == NULL || db->db_buf->b_data) &&
db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
- db->db_state != DB_FILL && !dn->dn_free_txg) {
+ db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg)) {
/*
* If the blkptr isn't set but they have nonzero data,
* it had better be dirty, otherwise we'll lose that
ASSERT0(bp->blk_pad[1]);
ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT(BP_IS_HOLE(bp));
- ASSERT0(bp->blk_phys_birth);
+ ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
}
}
}
}
static void
-dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn)
+dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *dbbp)
{
blkptr_t *bps = db->db.db_data;
uint32_t indbs = 1ULL << dn->dn_indblkshift;
for (int i = 0; i < n_bps; i++) {
blkptr_t *bp = &bps[i];
- ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, indbs);
- BP_SET_LSIZE(bp, BP_GET_LEVEL(db->db_blkptr) == 1 ?
- dn->dn_datablksz : BP_GET_LSIZE(db->db_blkptr));
- BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr));
- BP_SET_LEVEL(bp, BP_GET_LEVEL(db->db_blkptr) - 1);
- BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0);
+ ASSERT3U(BP_GET_LSIZE(dbbp), ==, indbs);
+ BP_SET_LSIZE(bp, BP_GET_LEVEL(dbbp) == 1 ?
+ dn->dn_datablksz : BP_GET_LSIZE(dbbp));
+ BP_SET_TYPE(bp, BP_GET_TYPE(dbbp));
+ BP_SET_LEVEL(bp, BP_GET_LEVEL(dbbp) - 1);
+ BP_SET_BIRTH(bp, BP_GET_LOGICAL_BIRTH(dbbp), 0);
}
}
* was taken, ENOENT if no action was taken.
*/
static int
-dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn)
+dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
{
ASSERT(MUTEX_HELD(&db->db_mtx));
- int is_hole = db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr);
+ int is_hole = bp == NULL || BP_IS_HOLE(bp);
/*
* For level 0 blocks only, if the above check fails:
* Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
* processes the delete record and clears the bp while we are waiting
* for the dn_mtx (resulting in a "no" from block_freed).
*/
- if (!is_hole && db->db_level == 0) {
- is_hole = dnode_block_freed(dn, db->db_blkid) ||
- BP_IS_HOLE(db->db_blkptr);
- }
+ if (!is_hole && db->db_level == 0)
+ is_hole = dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(bp);
if (is_hole) {
dbuf_set_data(db, dbuf_alloc_arcbuf(db));
memset(db->db.db_data, 0, db->db.db_size);
- if (db->db_blkptr != NULL && db->db_level > 0 &&
- BP_IS_HOLE(db->db_blkptr) &&
- db->db_blkptr->blk_birth != 0) {
- dbuf_handle_indirect_hole(db, dn);
+ if (bp != NULL && db->db_level > 0 && BP_IS_HOLE(bp) &&
+ BP_GET_LOGICAL_BIRTH(bp) != 0) {
+ dbuf_handle_indirect_hole(db, dn, bp);
}
db->db_state = DB_CACHED;
DTRACE_SET_STATE(db, "hole read satisfied");
ASSERT(MUTEX_HELD(&db->db_mtx));
- if (!os->os_encrypted || os->os_raw_receive ||
- (flags & DB_RF_NO_DECRYPT) != 0)
+ if ((flags & DB_RF_NO_DECRYPT) != 0 ||
+ !os->os_encrypted || os->os_raw_receive)
return (0);
DB_DNODE_ENTER(db);
zbookmark_phys_t zb;
uint32_t aflags = ARC_FLAG_NOWAIT;
int err, zio_flags;
+ blkptr_t bp, *bpp;
- err = zio_flags = 0;
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(db->db_state == DB_UNCACHED);
+ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
ASSERT(db->db_buf == NULL);
ASSERT(db->db_parent == NULL ||
RW_LOCK_HELD(&db->db_parent->db_rwlock));
goto early_unlock;
}
- err = dbuf_read_hole(db, dn);
+ if (db->db_state == DB_UNCACHED) {
+ if (db->db_blkptr == NULL) {
+ bpp = NULL;
+ } else {
+ bp = *db->db_blkptr;
+ bpp = &bp;
+ }
+ } else {
+ dbuf_dirty_record_t *dr;
+
+ ASSERT3S(db->db_state, ==, DB_NOFILL);
+
+ /*
+ * Block cloning: If we have a pending block clone,
+ * we don't want to read the underlying block, but the contents
+ * of the block being cloned, so that we have the most recent data.
+ */
+ dr = list_head(&db->db_dirty_records);
+ if (dr == NULL || !dr->dt.dl.dr_brtwrite) {
+ err = SET_ERROR(EIO);
+ goto early_unlock;
+ }
+ bp = dr->dt.dl.dr_overridden_by;
+ bpp = &bp;
+ }
+
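+ /*
+ * At this point bpp is either NULL (an uncached dbuf with no
+ * blkptr), a copy of db->db_blkptr, or a copy of the pending
+ * clone's overriding BP.
+ */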
+ err = dbuf_read_hole(db, dn, bpp);
if (err == 0)
goto early_unlock;
+ ASSERT(bpp != NULL);
+
/*
* Any attempt to read a redacted block should result in an error. This
* will never happen under normal conditions, but can be useful for
* debugging purposes.
*/
- if (BP_IS_REDACTED(db->db_blkptr)) {
+ if (BP_IS_REDACTED(bpp)) {
ASSERT(dsl_dataset_feature_is_active(
db->db_objset->os_dsl_dataset,
SPA_FEATURE_REDACTED_DATASETS));
* All bps of an encrypted os should have the encryption bit set.
* If this is not true it indicates tampering and we report an error.
*/
- if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) {
- spa_log_error(db->db_objset->os_spa, &zb);
- zfs_panic_recover("unencrypted block in encrypted "
- "object set %llu", dmu_objset_id(db->db_objset));
+ if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) {
+ spa_log_error(db->db_objset->os_spa, &zb,
+ BP_GET_LOGICAL_BIRTH(bpp));
err = SET_ERROR(EIO);
goto early_unlock;
}
DTRACE_SET_STATE(db, "read issued");
mutex_exit(&db->db_mtx);
- if (dbuf_is_l2cacheable(db))
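+ /*
+ * An uncacheable dbuf is read with ARC_FLAG_UNCACHED, so the ARC
+ * evicts the buffer once its last user drops it instead of caching
+ * it; otherwise flag L2ARC eligibility as before.
+ */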
+ if (!DBUF_IS_CACHEABLE(db))
+ aflags |= ARC_FLAG_UNCACHED;
+ else if (dbuf_is_l2cacheable(db))
aflags |= ARC_FLAG_L2CACHE;
dbuf_add_ref(db, NULL);
if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr))
zio_flags |= ZIO_FLAG_RAW;
/*
- * The zio layer will copy the provided blkptr later, but we need to
- * do this now so that we can release the parent's rwlock. We have to
- * do that now so that if dbuf_read_done is called synchronously (on
+ * The zio layer will copy the provided blkptr later, but we have our
+ * own copy so that we can release the parent's rwlock. We have to
+ * do that so that if dbuf_read_done is called synchronously (on
* an l1 cache hit) we don't acquire the db_mtx while holding the
* parent's rwlock, which would be a lock ordering violation.
*/
- blkptr_t bp = *db->db_blkptr;
dmu_buf_unlock_parent(db, dblt, tag);
- (void) arc_read(zio, db->db_objset->os_spa, &bp,
+ (void) arc_read(zio, db->db_objset->os_spa, bpp,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
&aflags, &zb);
return (err);
*/
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
- if (db->db_state == DB_NOFILL)
- return (SET_ERROR(EIO));
-
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
- (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
- DBUF_IS_CACHEABLE(db);
+ (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL;
mutex_enter(&db->db_mtx);
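+ /*
+ * db_partial_read tracks a sequence of partial-block reads:
+ * DB_RF_PARTIAL_FIRST marks the first piece and DB_RF_PARTIAL_MORE
+ * the following ones, so the flag stays set across the sequence.
+ * dbuf_rele_and_unlock() keeps such dbufs cached even when
+ * DBUF_IS_CACHEABLE() is false, anticipating the rest of the read.
+ */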
+ if (flags & DB_RF_PARTIAL_FIRST)
+ db->db_partial_read = B_TRUE;
+ else if (!(flags & DB_RF_PARTIAL_MORE))
+ db->db_partial_read = B_FALSE;
if (db->db_state == DB_CACHED) {
- spa_t *spa = dn->dn_objset->os_spa;
-
/*
* Ensure that this block's dnode has been decrypted if
* the caller has requested decrypted data.
(arc_is_encrypted(db->db_buf) ||
arc_is_unauthenticated(db->db_buf) ||
arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
+ spa_t *spa = dn->dn_objset->os_spa;
zbookmark_phys_t zb;
SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
}
DB_DNODE_EXIT(db);
DBUF_STAT_BUMP(hash_hits);
- } else if (db->db_state == DB_UNCACHED) {
- spa_t *spa = dn->dn_objset->os_spa;
+ } else if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL) {
boolean_t need_wait = B_FALSE;
db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
- if (zio == NULL &&
- db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
+ if (zio == NULL && (db->db_state == DB_NOFILL ||
+ (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
+ spa_t *spa = dn->dn_objset->os_spa;
zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
need_wait = B_TRUE;
}
if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
zio_free(db->db_objset->os_spa, txg, bp);
+ if (dr->dt.dl.dr_brtwrite) {
+ ASSERT0P(dr->dt.dl.dr_data);
+ dr->dt.dl.dr_data = db->db_buf;
+ }
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
dr->dt.dl.dr_nopwrite = B_FALSE;
+ dr->dt.dl.dr_brtwrite = B_FALSE;
dr->dt.dl.dr_has_raw_params = B_FALSE;
/*
* the buf thawed to save the effort of freezing &
* immediately re-thawing it.
*/
- arc_release(dr->dt.dl.dr_data, db);
+ if (dr->dt.dl.dr_data)
+ arc_release(dr->dt.dl.dr_data, db);
}
/*
* Otherwise the buffer contents could be inconsistent between the
* dbuf and the lightweight dirty record.
*/
- ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid));
+ ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid,
+ NULL));
mutex_enter(&dn->dn_mtx);
int txgoff = tx->tx_txg & TXG_MASK;
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
- if (db->db_blkid != DMU_BONUS_BLKID) {
+ if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {
dmu_objset_willuse_space(os, db->db.db_size, tx);
}
sizeof (dbuf_dirty_record_t),
offsetof(dbuf_dirty_record_t, dr_dirty_node));
}
- if (db->db_blkid != DMU_BONUS_BLKID)
+ if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {
dr->dr_accounted = db->db.db_size;
+ }
dr->dr_dbuf = db;
dr->dr_txg = tx->tx_txg;
list_insert_before(&db->db_dirty_records, dr_next, dr);
* Undirty a buffer in the transaction group referenced by the given
* transaction. Return whether this evicted the dbuf.
*/
-static boolean_t
+boolean_t
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
uint64_t txg = tx->tx_txg;
+ boolean_t brtwrite;
ASSERT(txg != 0);
return (B_FALSE);
ASSERT(dr->dr_dbuf == db);
+ brtwrite = dr->dt.dl.dr_brtwrite;
+ if (brtwrite) {
+ /*
+ * We are freeing a block that we cloned in the same
+ * transaction group.
+ */
+ brt_pending_remove(dmu_objset_spa(db->db_objset),
+ &dr->dt.dl.dr_overridden_by, tx);
+ }
+
dnode_t *dn = dr->dr_dnode;
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
mutex_exit(&dn->dn_mtx);
}
- if (db->db_state != DB_NOFILL) {
+ if (db->db_state != DB_NOFILL && !brtwrite) {
dbuf_unoverride(dr);
ASSERT(db->db_buf != NULL);
db->db_dirtycnt -= 1;
if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
- ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
+ ASSERT(db->db_state == DB_NOFILL || brtwrite ||
+ arc_released(db->db_buf));
dbuf_destroy(db);
return (B_TRUE);
}
dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ boolean_t undirty = B_FALSE;
ASSERT(tx->tx_txg != 0);
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
*/
mutex_enter(&db->db_mtx);
- if (db->db_state == DB_CACHED) {
+ if (db->db_state == DB_CACHED || db->db_state == DB_NOFILL) {
dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
/*
* It's possible that it is already dirty but not cached,
* go through dmu_buf_will_dirty().
*/
if (dr != NULL) {
- /* This dbuf is already dirty and cached. */
- dbuf_redirty(dr);
- mutex_exit(&db->db_mtx);
- return;
+ if (dr->dt.dl.dr_brtwrite) {
+ /*
+ * Block cloning: If we are dirtying a cloned
+ * block, we cannot simply redirty it, because
+ * this dr has no data associated with it.
+ * We will go through a full undirtying below,
+ * before dirtying it again.
+ */
+ undirty = B_TRUE;
+ } else {
+ /* This dbuf is already dirty and cached. */
+ dbuf_redirty(dr);
+ mutex_exit(&db->db_mtx);
+ return;
+ }
}
}
mutex_exit(&db->db_mtx);
if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
flags |= DB_RF_HAVESTRUCT;
DB_DNODE_EXIT(db);
+
+ /*
+ * Block cloning: Do the dbuf_read() before undirtying the dbuf, as we
+ * want to make sure dbuf_read() will read the pending cloned block and
+ * not the underlying block that is being replaced. dbuf_undirty() will
+ * do dbuf_unoverride(), so we will end up with the cloned block's
+ * contents and no overridden BP.
+ */
(void) dbuf_read(db, NULL, flags);
+ if (undirty) {
+ mutex_enter(&db->db_mtx);
+ VERIFY(!dbuf_undirty(db, tx));
+ mutex_exit(&db->db_mtx);
+ }
(void) dbuf_dirty(db, tx);
}
return (dr != NULL);
}
+void
+dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ /*
+ * Block cloning: We are going to clone into this block, so undirty
+ * modifications done to this block so far in this txg. This includes
+ * writes and clones into this block.
+ */
+ mutex_enter(&db->db_mtx);
+ DBUF_VERIFY(db);
+ VERIFY(!dbuf_undirty(db, tx));
+ ASSERT0P(dbuf_find_dirty_eq(db, tx->tx_txg));
+ if (db->db_buf != NULL) {
+ arc_buf_destroy(db->db_buf, db);
+ db->db_buf = NULL;
+ dbuf_clear_data(db);
+ }
+
+ db->db_state = DB_NOFILL;
+ DTRACE_SET_STATE(db, "allocating NOFILL buffer for clone");
+
+ DBUF_VERIFY(db);
+ mutex_exit(&db->db_mtx);
+
+ dbuf_noread(db);
+ (void) dbuf_dirty(db, tx);
+}
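+
+/*
+ * A sketch of the expected caller (the block cloning code in dmu.c, not
+ * part of this file): after dmu_buf_will_clone(), the new dirty record
+ * is marked as a BRT write and overridden with the cloned BP, roughly:
+ *
+ *	dmu_buf_will_clone(db, tx);
+ *	mutex_enter(&db->db_mtx);
+ *	dr = list_head(&db->db_dirty_records);
+ *	dr->dt.dl.dr_brtwrite = B_TRUE;
+ *	dr->dt.dl.dr_overridden_by = *bp;
+ *	dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
+ *	mutex_exit(&db->db_mtx);
+ */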
+
void
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ mutex_enter(&db->db_mtx);
db->db_state = DB_NOFILL;
DTRACE_SET_STATE(db, "allocating NOFILL buffer");
- dmu_buf_will_fill(db_fake, tx);
+ mutex_exit(&db->db_mtx);
+
+ dbuf_noread(db);
+ (void) dbuf_dirty(db, tx);
}
void
-dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
+dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
dmu_tx_private_ok(tx));
+ mutex_enter(&db->db_mtx);
+ if (db->db_state == DB_NOFILL) {
+ /*
+ * Block cloning: We will be completely overwriting a block
+ * cloned in this transaction group, so let's undirty the
+ * pending clone and mark the block as uncached. This will be
+ * as if the clone was never done. But if the fill can fail
+ * we should have a way to return back to the cloned data.
+ */
+ if (canfail && dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) {
+ mutex_exit(&db->db_mtx);
+ dmu_buf_will_dirty(db_fake, tx);
+ return;
+ }
+ VERIFY(!dbuf_undirty(db, tx));
+ db->db_state = DB_UNCACHED;
+ }
+ mutex_exit(&db->db_mtx);
+
dbuf_noread(db);
(void) dbuf_dirty(db, tx);
}
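+
+/*
+ * A canfail fill pairs with a later dmu_buf_fill_done(db, tx, failed):
+ * when the copy into the buffer can fault partway through (for example,
+ * a copy from user memory), the dbuf is dirtied via dmu_buf_will_dirty()
+ * (which reads the pending cloned block) rather than having the clone
+ * undirtied outright, so a failed fill can fall back to the cloned
+ * contents.
+ */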
dbuf_dirty_record_t *dr;
dr = list_head(&db->db_dirty_records);
+ ASSERT3P(dr, !=, NULL);
ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
dl = &dr->dt.dl;
dl->dr_overridden_by = *bp;
dl->dr_override_state = DR_OVERRIDDEN;
- dl->dr_overridden_by.blk_birth = dr->dr_txg;
+ BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);
}
-void
-dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx)
+boolean_t
+dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)
{
(void) tx;
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
- dbuf_states_t old_state;
mutex_enter(&db->db_mtx);
DBUF_VERIFY(db);
- old_state = db->db_state;
- db->db_state = DB_CACHED;
- if (old_state == DB_FILL) {
+ if (db->db_state == DB_FILL) {
if (db->db_level == 0 && db->db_freed_in_flight) {
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
/* we were freed while filling */
/* XXX dbuf_undirty? */
memset(db->db.db_data, 0, db->db.db_size);
db->db_freed_in_flight = FALSE;
+ db->db_state = DB_CACHED;
DTRACE_SET_STATE(db,
"fill done handling freed in flight");
+ failed = B_FALSE;
+ } else if (failed) {
+ VERIFY(!dbuf_undirty(db, tx));
+ db->db_buf = NULL;
+ dbuf_clear_data(db);
+ DTRACE_SET_STATE(db, "fill failed");
} else {
+ db->db_state = DB_CACHED;
DTRACE_SET_STATE(db, "fill done");
}
cv_broadcast(&db->db_changed);
+ } else {
+ db->db_state = DB_CACHED;
+ failed = B_FALSE;
}
mutex_exit(&db->db_mtx);
+ return (failed);
}
void
dmu_buf_will_not_fill(dbuf, tx);
dr = list_head(&db->db_dirty_records);
+ ASSERT3P(dr, !=, NULL);
ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
dl = &dr->dt.dl;
encode_embedded_bp_compressed(&dl->dr_overridden_by,
BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
dl->dr_override_state = DR_OVERRIDDEN;
- dl->dr_overridden_by.blk_birth = dr->dr_txg;
+ BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);
}
void
while (db->db_state == DB_READ || db->db_state == DB_FILL)
cv_wait(&db->db_changed, &db->db_mtx);
- ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
+ ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED ||
+ db->db_state == DB_NOFILL);
if (db->db_state == DB_CACHED &&
zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
arc_buf_destroy(db->db_buf, db);
}
db->db_buf = NULL;
+ } else if (db->db_state == DB_NOFILL) {
+ /*
+ * We will be completely replacing the cloned block. In case
+ * it was cloned in this transaction group, let's undirty the
+ * pending clone and mark the block as uncached. This will be
+ * as if the clone was never done.
+ */
+ VERIFY(!dbuf_undirty(db, tx));
+ db->db_state = DB_UNCACHED;
}
ASSERT(db->db_buf == NULL);
dbuf_set_data(db, buf);
DTRACE_SET_STATE(db, "filling assigned arcbuf");
mutex_exit(&db->db_mtx);
(void) dbuf_dirty(db, tx);
- dmu_buf_fill_done(&db->db, tx);
+ dmu_buf_fill_done(&db->db, tx, B_FALSE);
}
void
db->db_caching_status == DB_DBUF_METADATA_CACHE);
multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
+
+ ASSERT0(dmu_buf_user_size(&db->db));
(void) zfs_refcount_remove_many(
&dbuf_caches[db->db_caching_status].size,
db->db.db_size, db);
static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
- dmu_buf_impl_t *parent, blkptr_t *blkptr)
+ dmu_buf_impl_t *parent, blkptr_t *blkptr, uint64_t hash)
{
objset_t *os = dn->dn_objset;
dmu_buf_impl_t *db, *odb;
db->db_dnode_handle = dn->dn_handle;
db->db_parent = parent;
db->db_blkptr = blkptr;
+ db->db_hash = hash;
db->db_user = NULL;
db->db_user_immediate_evict = FALSE;
err = dbuf_findbp(dn, level, blkid, B_FALSE, &dbp, &bp2);
if (err == 0) {
+ ASSERT3P(bp2, !=, NULL);
*bp = *bp2;
if (dbp != NULL)
dbuf_rele(dbp, NULL);
blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
- ASSERT(!BP_IS_REDACTED(bp) ||
+ ASSERT(!BP_IS_REDACTED(bp) || (dpa->dpa_dnode &&
dsl_dataset_feature_is_active(
dpa->dpa_dnode->dn_objset->os_dsl_dataset,
- SPA_FEATURE_REDACTED_DATASETS));
+ SPA_FEATURE_REDACTED_DATASETS)));
if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {
arc_buf_destroy(abuf, private);
dbuf_prefetch_fini(dpa, B_TRUE);
goto no_issue;
dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
- level, blkid);
+ level, blkid, NULL);
if (db != NULL) {
mutex_exit(&db->db_mtx);
/*
dpa->dpa_cb = cb;
dpa->dpa_arg = arg;
- /* flag if L2ARC eligible, l2arc_noprefetch then decides */
- if (dnode_level_is_l2cacheable(&bp, dn, level))
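+ /*
+ * As with regular dbuf reads: prefetch uncacheable levels with
+ * ARC_FLAG_UNCACHED; otherwise flag L2ARC eligibility and let
+ * l2arc_noprefetch decide.
+ */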
+ if (!DNODE_LEVEL_IS_CACHEABLE(dn, level))
+ dpa->dpa_aflags |= ARC_FLAG_UNCACHED;
+ else if (dnode_level_is_l2cacheable(&bp, dn, level))
dpa->dpa_aflags |= ARC_FLAG_L2CACHE;
/*
const void *tag, dmu_buf_impl_t **dbp)
{
dmu_buf_impl_t *db, *parent = NULL;
+ uint64_t hv;
/* If the pool has been created, verify the tx_sync_lock is not held */
spa_t *spa = dn->dn_objset->os_spa;
*dbp = NULL;
/* dbuf_find() returns with db_mtx held */
- db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);
+ db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid, &hv);
if (db == NULL) {
blkptr_t *bp = NULL;
}
if (err && err != ENOENT)
return (err);
- db = dbuf_create(dn, level, blkid, parent, bp);
+ db = dbuf_create(dn, level, blkid, parent, bp, hv);
}
if (fail_uncached && db->db_state != DB_CACHED) {
dn->dn_object != DMU_META_DNODE_OBJECT &&
db->db_state == DB_CACHED && db->db_data_pending) {
dbuf_dirty_record_t *dr = db->db_data_pending;
- if (dr->dt.dl.dr_data == db->db_buf)
+ if (dr->dt.dl.dr_data == db->db_buf) {
+ ASSERT3P(db->db_buf, !=, NULL);
dbuf_hold_copy(dn, db);
+ }
}
if (multilist_link_active(&db->db_cache_link)) {
db->db_caching_status == DB_DBUF_METADATA_CACHE);
multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
+
+ uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db);
(void) zfs_refcount_remove_many(
- &dbuf_caches[db->db_caching_status].size,
- db->db.db_size, db);
+ &dbuf_caches[db->db_caching_status].size, size, db);
if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
DBUF_STAT_BUMPDOWN(metadata_cache_count);
} else {
DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
DBUF_STAT_BUMPDOWN(cache_count);
- DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
- db->db.db_size);
+ DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
}
db->db_caching_status = DB_NO_CACHE;
}
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
ASSERT(dn->dn_bonus == NULL);
- dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
+ dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL,
+ dbuf_hash(dn->dn_objset, dn->dn_object, 0, DMU_BONUS_BLKID));
}
int
if (blkid == DMU_BONUS_BLKID)
found_db = dbuf_find_bonus(os, obj);
else
- found_db = dbuf_find(os, obj, 0, blkid);
+ found_db = dbuf_find(os, obj, 0, blkid, NULL);
if (found_db != NULL) {
if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
* This dbuf has anonymous data associated with it.
*/
dbuf_destroy(db);
- } else {
- boolean_t do_arc_evict = B_FALSE;
- blkptr_t bp;
- spa_t *spa = dmu_objset_spa(db->db_objset);
-
- if (!DBUF_IS_CACHEABLE(db) &&
- db->db_blkptr != NULL &&
- !BP_IS_HOLE(db->db_blkptr) &&
- !BP_IS_EMBEDDED(db->db_blkptr)) {
- do_arc_evict = B_TRUE;
- bp = *db->db_blkptr;
- }
-
- if (!DBUF_IS_CACHEABLE(db) ||
- db->db_pending_evict) {
- dbuf_destroy(db);
- } else if (!multilist_link_active(&db->db_cache_link)) {
- ASSERT3U(db->db_caching_status, ==,
- DB_NO_CACHE);
-
- dbuf_cached_state_t dcs =
- dbuf_include_in_metadata_cache(db) ?
- DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
- db->db_caching_status = dcs;
-
- multilist_insert(&dbuf_caches[dcs].cache, db);
- uint64_t db_size = db->db.db_size;
- size = zfs_refcount_add_many(
- &dbuf_caches[dcs].size, db_size, db);
- uint8_t db_level = db->db_level;
- mutex_exit(&db->db_mtx);
-
- if (dcs == DB_DBUF_METADATA_CACHE) {
- DBUF_STAT_BUMP(metadata_cache_count);
- DBUF_STAT_MAX(
- metadata_cache_size_bytes_max,
- size);
- } else {
- DBUF_STAT_BUMP(cache_count);
- DBUF_STAT_MAX(cache_size_bytes_max,
- size);
- DBUF_STAT_BUMP(cache_levels[db_level]);
- DBUF_STAT_INCR(
- cache_levels_bytes[db_level],
- db_size);
- }
+ } else if (!(DBUF_IS_CACHEABLE(db) || db->db_partial_read) ||
+ db->db_pending_evict) {
+ dbuf_destroy(db);
+ } else if (!multilist_link_active(&db->db_cache_link)) {
+ ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
+
+ dbuf_cached_state_t dcs =
+ dbuf_include_in_metadata_cache(db) ?
+ DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
+ db->db_caching_status = dcs;
+
+ multilist_insert(&dbuf_caches[dcs].cache, db);
+ uint64_t db_size = db->db.db_size +
+ dmu_buf_user_size(&db->db);
+ size = zfs_refcount_add_many(
+ &dbuf_caches[dcs].size, db_size, db);
+ uint8_t db_level = db->db_level;
+ mutex_exit(&db->db_mtx);
- if (dcs == DB_DBUF_CACHE && !evicting)
- dbuf_evict_notify(size);
+ if (dcs == DB_DBUF_METADATA_CACHE) {
+ DBUF_STAT_BUMP(metadata_cache_count);
+ DBUF_STAT_MAX(metadata_cache_size_bytes_max,
+ size);
+ } else {
+ DBUF_STAT_BUMP(cache_count);
+ DBUF_STAT_MAX(cache_size_bytes_max, size);
+ DBUF_STAT_BUMP(cache_levels[db_level]);
+ DBUF_STAT_INCR(cache_levels_bytes[db_level],
+ db_size);
}
- if (do_arc_evict)
- arc_freed(spa, &bp);
+ if (dcs == DB_DBUF_CACHE && !evicting)
+ dbuf_evict_notify(size);
}
} else {
mutex_exit(&db->db_mtx);
return (db->db_user);
}
+uint64_t
+dmu_buf_user_size(dmu_buf_t *db_fake)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ if (db->db_user == NULL)
+ return (0);
+ return (atomic_load_64(&db->db_user->dbu_size));
+}
+
+void
+dmu_buf_add_user_size(dmu_buf_t *db_fake, uint64_t nadd)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
+ ASSERT3P(db->db_user, !=, NULL);
+ ASSERT3U(atomic_load_64(&db->db_user->dbu_size), <, UINT64_MAX - nadd);
+ atomic_add_64(&db->db_user->dbu_size, nadd);
+}
+
+void
+dmu_buf_sub_user_size(dmu_buf_t *db_fake, uint64_t nsub)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
+ ASSERT3P(db->db_user, !=, NULL);
+ ASSERT3U(atomic_load_64(&db->db_user->dbu_size), >=, nsub);
+ atomic_sub_64(&db->db_user->dbu_size, nsub);
+}
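+
+/*
+ * Illustrative (hypothetical) use of the accounting above: a dbuf user
+ * attaching an auxiliary structure charges its size so the dbuf cache
+ * sees the buffer's true footprint. As the ASSERTs enforce, the size
+ * may only be adjusted while the dbuf is not cached (DB_NO_CACHE):
+ *
+ *	dmu_buf_init_user(&udata->dbu, evict_cb, NULL, NULL);
+ *	VERIFY3P(dmu_buf_set_user(db, &udata->dbu), ==, NULL);
+ *	dmu_buf_add_user_size(db, sizeof (*udata));
+ */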
+
void
dmu_buf_user_evict_wait(void)
{
return (dbi->db_objset);
}
-dnode_t *
-dmu_buf_dnode_enter(dmu_buf_t *db)
-{
- dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
- DB_DNODE_ENTER(dbi);
- return (DB_DNODE(dbi));
-}
-
-void
-dmu_buf_dnode_exit(dmu_buf_t *db)
-{
- dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
- DB_DNODE_EXIT(dbi);
-}
-
static void
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
{
rw_exit(&parent_db->db_rwlock);
}
-static void
-dbuf_lightweight_physdone(zio_t *zio)
-{
- dbuf_dirty_record_t *dr = zio->io_private;
- dsl_pool_t *dp = spa_get_dsl(zio->io_spa);
- ASSERT3U(dr->dr_txg, ==, zio->io_txg);
-
- /*
- * The callback will be called io_phys_children times. Retire one
- * portion of our dirty space each time we are called. Any rounding
- * error will be cleaned up by dbuf_lightweight_done().
- */
- int delta = dr->dr_accounted / zio->io_phys_children;
- dsl_pool_undirty_space(dp, delta, zio->io_txg);
-}
-
static void
dbuf_lightweight_done(zio_t *zio)
{
dsl_dataset_block_born(ds, zio->io_bp, tx);
}
- /*
- * See comment in dbuf_write_done().
- */
- if (zio->io_phys_children == 0) {
- dsl_pool_undirty_space(dmu_objset_pool(os),
- dr->dr_accounted, zio->io_txg);
- } else {
- dsl_pool_undirty_space(dmu_objset_pool(os),
- dr->dr_accounted % zio->io_phys_children, zio->io_txg);
- }
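+ /*
+ * With the per-physical-child physdone callback gone, the entire
+ * dr_accounted amount is retired here in one step.
+ */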
+ dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
+ zio->io_txg);
abd_free(dr->dt.dll.dr_abd);
kmem_free(dr, sizeof (*dr));
dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd,
dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd),
&dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL,
- dbuf_lightweight_physdone, dbuf_lightweight_done, dr,
- ZIO_PRIORITY_ASYNC_WRITE,
+ dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);
zio_nowait(dr->dr_zio);
} else if (db->db_state == DB_FILL) {
/* This buffer was freed and is now being re-filled */
ASSERT(db->db.db_data != dr->dt.dl.dr_data);
+ } else if (db->db_state == DB_READ) {
+ /*
+ * This buffer has a clone we need to write, and an in-flight
+ * read on the BP we're about to clone. It's safe to issue the
+ * write here because the read has already been issued and the
+ * contents won't change.
+ */
+ ASSERT(dr->dt.dl.dr_brtwrite &&
+ dr->dt.dl.dr_override_state == DR_OVERRIDDEN);
} else {
ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
}
while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
cv_wait(&db->db_changed, &db->db_mtx);
- ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
}
/*
}
}
+/*
+ * Syncs out a range of dirty records for indirect or leaf dbufs. May be
+ * called recursively from dbuf_sync_indirect().
+ */
void
dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
{
dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
zio->io_prev_space_delta = delta;
- if (bp->blk_birth != 0) {
+ if (BP_GET_LOGICAL_BIRTH(bp) != 0) {
ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
BP_GET_TYPE(bp) == dn->dn_type) ||
(db->db_blkid == DMU_SPILL_BLKID &&
i += DNODE_MIN_SIZE;
if (dnp->dn_type != DMU_OT_NONE) {
fill++;
+ for (int j = 0; j < dnp->dn_nblkptr;
+ j++) {
+ (void) zfs_blkptr_verify(spa,
+ &dnp->dn_blkptr[j],
+ BLK_CONFIG_SKIP,
+ BLK_VERIFY_HALT);
+ }
+ if (dnp->dn_flags &
+ DNODE_FLAG_SPILL_BLKPTR) {
+ (void) zfs_blkptr_verify(spa,
+ DN_SPILL_BLKPTR(dnp),
+ BLK_CONFIG_SKIP,
+ BLK_VERIFY_HALT);
+ }
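+ /*
+ * The zfs_blkptr_verify() calls above (BLK_CONFIG_SKIP:
+ * skip checks needing the config lock; BLK_VERIFY_HALT:
+ * treat a damaged BP as fatal) catch bad block pointers
+ * before this dnode block is written out.
+ */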
i += dnp->dn_extra_slots *
DNODE_MIN_SIZE;
}
for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
if (BP_IS_HOLE(ibp))
continue;
+ (void) zfs_blkptr_verify(spa, ibp,
+ BLK_CONFIG_SKIP, BLK_VERIFY_HALT);
fill += BP_GET_FILL(ibp);
}
}
DB_DNODE_EXIT(db);
}
-/*
- * The SPA will call this callback several times for each zio - once
- * for every physical child i/o (zio->io_phys_children times). This
- * allows the DMU to monitor the progress of each logical i/o. For example,
- * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
- * block. There may be a long delay before all copies/fragments are completed,
- * so this callback allows us to retire dirty space gradually, as the physical
- * i/os complete.
- */
-static void
-dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
-{
- (void) buf;
- dmu_buf_impl_t *db = arg;
- objset_t *os = db->db_objset;
- dsl_pool_t *dp = dmu_objset_pool(os);
- dbuf_dirty_record_t *dr;
- int delta = 0;
-
- dr = db->db_data_pending;
- ASSERT3U(dr->dr_txg, ==, zio->io_txg);
-
- /*
- * The callback will be called io_phys_children times. Retire one
- * portion of our dirty space each time we are called. Any rounding
- * error will be cleaned up by dbuf_write_done().
- */
- delta = dr->dr_accounted / zio->io_phys_children;
- dsl_pool_undirty_space(dp, delta, zio->io_txg);
-}
-
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
if (db->db_state != DB_NOFILL) {
- if (dr->dt.dl.dr_data != db->db_buf)
+ if (dr->dt.dl.dr_data != NULL &&
+ dr->dt.dl.dr_data != db->db_buf) {
arc_buf_destroy(dr->dt.dl.dr_data, db);
+ }
}
} else {
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
db->db_data_pending = NULL;
dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
- /*
- * If we didn't do a physical write in this ZIO and we
- * still ended up here, it means that the space of the
- * dbuf that we just released (and undirtied) above hasn't
- * been marked as undirtied in the pool's accounting.
- *
- * Thus, we undirty that space in the pool's view of the
- * world here. For physical writes this type of update
- * happens in dbuf_write_physdone().
- *
- * If we did a physical write, cleanup any rounding errors
- * that came up due to writing multiple copies of a block
- * on disk [see dbuf_write_physdone()].
- */
- if (zio->io_phys_children == 0) {
- dsl_pool_undirty_space(dmu_objset_pool(os),
- dr->dr_accounted, zio->io_txg);
- } else {
- dsl_pool_undirty_space(dmu_objset_pool(os),
- dr->dr_accounted % zio->io_phys_children, zio->io_txg);
- }
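+ /* As in dbuf_lightweight_done(), retire all of dr_accounted at once. */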
+ dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
+ zio->io_txg);
kmem_free(dr, sizeof (dbuf_dirty_record_t));
}
ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
drica.drica_os = dn->dn_objset;
- drica.drica_blk_birth = bp->blk_birth;
+ drica.drica_blk_birth = BP_GET_LOGICAL_BIRTH(bp);
drica.drica_tx = tx;
if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
&drica)) {
if (dn->dn_objset != spa_meta_objset(spa)) {
dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset);
if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
- bp->blk_birth > ds->ds_dir->dd_origin_txg) {
+ BP_GET_LOGICAL_BIRTH(bp) >
+ ds->ds_dir->dd_origin_txg) {
ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT(dsl_dir_is_clone(ds->ds_dir));
ASSERT(spa_feature_is_enabled(spa,
}
-/* Issue I/O to commit a dirty buffer to disk. */
+/*
+ * Populate dr->dr_zio with a zio to commit a dirty buffer to disk.
+ * Caller is responsible for issuing the zio_[no]wait(dr->dr_zio).
+ */
static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
}
ASSERT(db->db_level == 0 || data == db->db_buf);
- ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
+ ASSERT3U(BP_GET_LOGICAL_BIRTH(db->db_blkptr), <=, txg);
ASSERT(pio);
SET_BOOKMARK(&zb, os->os_dsl_dataset ?
dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy,
contents, db->db.db_size, db->db.db_size, &zp,
- dbuf_write_override_ready, NULL, NULL,
+ dbuf_write_override_ready, NULL,
dbuf_write_override_done,
dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
mutex_enter(&db->db_mtx);
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
- dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
+ dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite,
+ dr->dt.dl.dr_brtwrite);
mutex_exit(&db->db_mtx);
} else if (db->db_state == DB_NOFILL) {
ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
dr->dr_zio = zio_write(pio, os->os_spa, txg,
&dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
- dbuf_write_nofill_ready, NULL, NULL,
+ dbuf_write_nofill_ready, NULL,
dbuf_write_nofill_done, db,
ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
children_ready_cb = dbuf_write_children_ready;
dr->dr_zio = arc_write(pio, os->os_spa, txg,
- &dr->dr_bp_copy, data, dbuf_is_l2cacheable(db),
- &zp, dbuf_write_ready,
- children_ready_cb, dbuf_write_physdone,
- dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
- ZIO_FLAG_MUSTSUCCEED, &zb);
+ &dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db),
+ dbuf_is_l2cacheable(db), &zp, dbuf_write_ready,
+ children_ready_cb, dbuf_write_done, db,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
}
}
EXPORT_SYMBOL(dmu_buf_set_crypt_params);
EXPORT_SYMBOL(dmu_buf_will_dirty);
EXPORT_SYMBOL(dmu_buf_is_dirty);
+EXPORT_SYMBOL(dmu_buf_will_clone);
EXPORT_SYMBOL(dmu_buf_will_not_fill);
EXPORT_SYMBOL(dmu_buf_will_fill);
EXPORT_SYMBOL(dmu_buf_fill_done);
EXPORT_SYMBOL(dmu_buf_get_user);
EXPORT_SYMBOL(dmu_buf_get_blkptr);
-ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, U64, ZMOD_RW,
"Maximum size in bytes of the dbuf cache.");
ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW,
ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, lowater_pct, UINT, ZMOD_RW,
"Percentage below dbuf_cache_max_bytes when dbuf eviction stops.");
-ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, U64, ZMOD_RW,
"Maximum size in bytes of dbuf metadata cache.");
ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, UINT, ZMOD_RW,