* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
- * or http://www.opensolaris.org/os/licensing.
+ * or https://opensource.org/licenses/CDDL-1.0.
* See the License for the specific language governing permissions
* and limitations under the License.
*
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ * Copyright (c) 2021, 2022 by Pawel Jakub Dawidek
*/
#include <sys/zfs_context.h>
#include <sys/trace_zfs.h>
#include <sys/callb.h>
#include <sys/abd.h>
+#include <sys/brt.h>
#include <sys/vdev.h>
#include <cityhash.h>
#include <sys/spa_impl.h>
+#include <sys/wmsum.h>
+#include <sys/vdev_impl.h>
-kstat_t *dbuf_ksp;
+static kstat_t *dbuf_ksp;
typedef struct dbuf_stats {
/*
* already created and in the dbuf hash table.
*/
kstat_named_t hash_insert_race;
+ /*
+	 * Number of entries in the dbuf hash table and in its mutex array.
+ */
+ kstat_named_t hash_table_count;
+ kstat_named_t hash_mutex_count;
/*
* Statistics about the size of the metadata dbuf cache.
*/
{ "hash_chains", KSTAT_DATA_UINT64 },
{ "hash_chain_max", KSTAT_DATA_UINT64 },
{ "hash_insert_race", KSTAT_DATA_UINT64 },
+ { "hash_table_count", KSTAT_DATA_UINT64 },
+ { "hash_mutex_count", KSTAT_DATA_UINT64 },
{ "metadata_cache_count", KSTAT_DATA_UINT64 },
{ "metadata_cache_size_bytes", KSTAT_DATA_UINT64 },
{ "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 },
{ "metadata_cache_overflow", KSTAT_DATA_UINT64 }
};
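+/*
+ * Scalable per-CPU sums backing the dbuf kstats above. wmsum counters
+ * are cheap to update from hot code paths and are only folded into
+ * dbuf_stats when the kstat is read (see dbuf_kstat_update()).
+ */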
+struct {
+ wmsum_t cache_count;
+ wmsum_t cache_total_evicts;
+ wmsum_t cache_levels[DN_MAX_LEVELS];
+ wmsum_t cache_levels_bytes[DN_MAX_LEVELS];
+ wmsum_t hash_hits;
+ wmsum_t hash_misses;
+ wmsum_t hash_collisions;
+ wmsum_t hash_chains;
+ wmsum_t hash_insert_race;
+ wmsum_t metadata_cache_count;
+ wmsum_t metadata_cache_overflow;
+} dbuf_sums;
+
#define DBUF_STAT_INCR(stat, val) \
- atomic_add_64(&dbuf_stats.stat.value.ui64, (val));
+ wmsum_add(&dbuf_sums.stat, val);
#define DBUF_STAT_DECR(stat, val) \
DBUF_STAT_INCR(stat, -(val));
#define DBUF_STAT_BUMP(stat) \
continue; \
}
-static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags);
-extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
- dmu_buf_evict_func_t *evict_func_sync,
- dmu_buf_evict_func_t *evict_func_async,
- dmu_buf_t **clear_on_evict_dbufp);
-
/*
* Global data structures and functions for the dbuf cache.
*/
* by those caches' matching enum values (from dbuf_cached_state_t).
*/
typedef struct dbuf_cache {
- multilist_t *cache;
- zfs_refcount_t size;
+ multilist_t cache;
+ zfs_refcount_t size ____cacheline_aligned;
} dbuf_cache_t;
dbuf_cache_t dbuf_caches[DB_CACHE_MAX];
/* Size limits for the caches */
-unsigned long dbuf_cache_max_bytes = ULONG_MAX;
-unsigned long dbuf_metadata_cache_max_bytes = ULONG_MAX;
+static uint64_t dbuf_cache_max_bytes = UINT64_MAX;
+static uint64_t dbuf_metadata_cache_max_bytes = UINT64_MAX;
/* Set the default sizes of the caches to log2 fraction of arc size */
-int dbuf_cache_shift = 5;
-int dbuf_metadata_cache_shift = 6;
+static uint_t dbuf_cache_shift = 5;
+static uint_t dbuf_metadata_cache_shift = 6;
+
+/* Set the dbuf hash mutex count as log2 shift (dynamic by default) */
+static uint_t dbuf_mutex_cache_shift = 0;
static unsigned long dbuf_cache_target_bytes(void);
static unsigned long dbuf_metadata_cache_target_bytes(void);
/*
* The percentage above and below the maximum cache size.
*/
-uint_t dbuf_cache_hiwater_pct = 10;
-uint_t dbuf_cache_lowater_pct = 10;
+static uint_t dbuf_cache_hiwater_pct = 10;
+static uint_t dbuf_cache_lowater_pct = 10;
-/* ARGSUSED */
static int
dbuf_cons(void *vdb, void *unused, int kmflag)
{
+ (void) unused, (void) kmflag;
dmu_buf_impl_t *db = vdb;
- bzero(db, sizeof (dmu_buf_impl_t));
+ memset(db, 0, sizeof (dmu_buf_impl_t));
mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
rw_init(&db->db_rwlock, NULL, RW_DEFAULT, NULL);
return (0);
}
-/* ARGSUSED */
static void
dbuf_dest(void *vdb, void *unused)
{
+ (void) unused;
dmu_buf_impl_t *db = vdb;
mutex_destroy(&db->db_mtx);
rw_destroy(&db->db_rwlock);
*/
static dbuf_hash_table_t dbuf_hash_table;
-static uint64_t dbuf_hash_count;
-
/*
* We use Cityhash for this. It's fast, and has good hash properties without
* requiring any large static buffers.
(dbuf)->db_blkid == (blkid))
dmu_buf_impl_t *
-dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
+dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid,
+ uint64_t *hash_out)
{
dbuf_hash_table_t *h = &dbuf_hash_table;
uint64_t hv;
}
}
mutex_exit(DBUF_HASH_MUTEX(h, idx));
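+	/*
+	 * Return the computed hash so callers can reuse it, e.g. when
+	 * passing it on to dbuf_create().
+	 */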
+ if (hash_out != NULL)
+ *hash_out = hv;
return (NULL);
}
objset_t *os = db->db_objset;
uint64_t obj = db->db.db_object;
int level = db->db_level;
- uint64_t blkid, hv, idx;
+ uint64_t blkid, idx;
dmu_buf_impl_t *dbf;
uint32_t i;
blkid = db->db_blkid;
- hv = dbuf_hash(os, obj, level, blkid);
- idx = hv & h->hash_table_mask;
+ ASSERT3U(dbuf_hash(os, obj, level, blkid), ==, db->db_hash);
+ idx = db->db_hash & h->hash_table_mask;
mutex_enter(DBUF_HASH_MUTEX(h, idx));
for (dbf = h->hash_table[idx], i = 0; dbf != NULL;
db->db_hash_next = h->hash_table[idx];
h->hash_table[idx] = db;
mutex_exit(DBUF_HASH_MUTEX(h, idx));
- atomic_inc_64(&dbuf_hash_count);
- DBUF_STAT_MAX(hash_elements_max, dbuf_hash_count);
+ uint64_t he = atomic_inc_64_nv(&dbuf_stats.hash_elements.value.ui64);
+ DBUF_STAT_MAX(hash_elements_max, he);
return (NULL);
}
dbuf_hash_remove(dmu_buf_impl_t *db)
{
dbuf_hash_table_t *h = &dbuf_hash_table;
- uint64_t hv, idx;
+ uint64_t idx;
dmu_buf_impl_t *dbf, **dbp;
- hv = dbuf_hash(db->db_objset, db->db.db_object,
- db->db_level, db->db_blkid);
- idx = hv & h->hash_table_mask;
+ ASSERT3U(dbuf_hash(db->db_objset, db->db.db_object, db->db_level,
+ db->db_blkid), ==, db->db_hash);
+ idx = db->db_hash & h->hash_table_mask;
/*
* We mustn't hold db_mtx to maintain lock ordering:
h->hash_table[idx]->db_hash_next == NULL)
DBUF_STAT_BUMPDOWN(hash_chains);
mutex_exit(DBUF_HASH_MUTEX(h, idx));
- atomic_dec_64(&dbuf_hash_count);
+ atomic_dec_64(&dbuf_stats.hash_elements.value.ui64);
}
typedef enum {
*dbu->dbu_clear_on_evict_dbufp = NULL;
#endif
+ if (db->db_caching_status != DB_NO_CACHE) {
+ /*
+ * This is a cached dbuf, so the size of the user data is
+ * included in its cached amount. We adjust it here because the
+ * user data has already been detached from the dbuf, and the
+ * sync functions are not supposed to touch it (the dbuf might
+	 * not exist anymore by the time the sync functions run).
+ */
+ uint64_t size = dbu->dbu_size;
+ (void) zfs_refcount_remove_many(
+ &dbuf_caches[db->db_caching_status].size, size, db);
+ if (db->db_caching_status == DB_DBUF_CACHE)
+ DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
+ }
+
/*
* There are two eviction callbacks - one that we call synchronously
* and one that we invoke via a taskq. The async one is useful for
}
}
+/*
+ * We want to exclude buffers that are on a special allocation class
+ * vdev from the L2ARC.
+ */
+boolean_t
+dbuf_is_l2cacheable(dmu_buf_impl_t *db)
+{
+ if (db->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||
+ (db->db_objset->os_secondary_cache ==
+ ZFS_CACHE_METADATA && dbuf_is_metadata(db))) {
+ if (l2arc_exclude_special == 0)
+ return (B_TRUE);
+
+ blkptr_t *bp = db->db_blkptr;
+ if (bp == NULL || BP_IS_HOLE(bp))
+ return (B_FALSE);
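+		/* Look up the vdev referenced by the block's first DVA. */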
+ uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);
+ vdev_t *rvd = db->db_objset->os_spa->spa_root_vdev;
+ vdev_t *vd = NULL;
+
+ if (vdev < rvd->vdev_children)
+ vd = rvd->vdev_child[vdev];
+
+ if (vd == NULL)
+ return (B_TRUE);
+
+ if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
+ vd->vdev_alloc_bias != VDEV_BIAS_DEDUP)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
+static inline boolean_t
+dnode_level_is_l2cacheable(blkptr_t *bp, dnode_t *dn, int64_t level)
+{
+ if (dn->dn_objset->os_secondary_cache == ZFS_CACHE_ALL ||
+ (dn->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA &&
+ (level > 0 ||
+ DMU_OT_IS_METADATA(dn->dn_handle->dnh_dnode->dn_type)))) {
+ if (l2arc_exclude_special == 0)
+ return (B_TRUE);
+
+ if (bp == NULL || BP_IS_HOLE(bp))
+ return (B_FALSE);
+ uint64_t vdev = DVA_GET_VDEV(bp->blk_dva);
+ vdev_t *rvd = dn->dn_objset->os_spa->spa_root_vdev;
+ vdev_t *vd = NULL;
+
+ if (vdev < rvd->vdev_children)
+ vd = rvd->vdev_child[vdev];
+
+ if (vd == NULL)
+ return (B_TRUE);
+
+ if (vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL &&
+ vd->vdev_alloc_bias != VDEV_BIAS_DEDUP)
+ return (B_TRUE);
+ }
+ return (B_FALSE);
+}
+
/*
* This function *must* return indices evenly distributed between all
* Also, the low order bits of the hash value are thought to be
* distributed evenly. Otherwise, in the case that the multilist
* has a power of two number of sublists, each sublists' usage
- * would not be evenly distributed.
+ * would not be evenly distributed. In this context a full 64-bit
+ * division would be a waste of time, so limit it to 32 bits.
*/
- return (dbuf_hash(db->db_objset, db->db.db_object,
+ return ((unsigned int)dbuf_hash(db->db_objset, db->db.db_object,
db->db_level, db->db_blkid) %
multilist_get_num_sublists(ml));
}
static void
dbuf_evict_one(void)
{
- int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache);
+ int idx = multilist_get_random_index(&dbuf_caches[DB_DBUF_CACHE].cache);
multilist_sublist_t *mls = multilist_sublist_lock(
- dbuf_caches[DB_DBUF_CACHE].cache, idx);
+ &dbuf_caches[DB_DBUF_CACHE].cache, idx);
ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
if (db != NULL) {
multilist_sublist_remove(mls, db);
multilist_sublist_unlock(mls);
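+		/* The cached size includes any user data attached to the dbuf. */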
+ uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db);
(void) zfs_refcount_remove_many(
- &dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db);
+ &dbuf_caches[DB_DBUF_CACHE].size, size, db);
DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
DBUF_STAT_BUMPDOWN(cache_count);
- DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
- db->db.db_size);
+ DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
db->db_caching_status = DB_NO_CACHE;
dbuf_destroy(db);
* of the dbuf cache is at or below the maximum size. Once the dbuf is aged
* out of the cache it is destroyed and becomes eligible for arc eviction.
*/
-/* ARGSUSED */
-static void
+static __attribute__((noreturn)) void
dbuf_evict_thread(void *unused)
{
+ (void) unused;
callb_cpr_t cpr;
CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);
while (!dbuf_evict_thread_exit) {
while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
CALLB_CPR_SAFE_BEGIN(&cpr);
- (void) cv_timedwait_sig_hires(&dbuf_evict_cv,
+ (void) cv_timedwait_idle_hires(&dbuf_evict_cv,
&dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
}
/*
* Wake up the dbuf eviction thread if the dbuf cache is at its max size.
* If the dbuf cache is at its high water mark, then evict a dbuf from the
- * dbuf cache using the callers context.
+ * dbuf cache using the caller's context.
*/
static void
dbuf_evict_notify(uint64_t size)
dbuf_kstat_update(kstat_t *ksp, int rw)
{
dbuf_stats_t *ds = ksp->ks_data;
+ dbuf_hash_table_t *h = &dbuf_hash_table;
- if (rw == KSTAT_WRITE) {
+ if (rw == KSTAT_WRITE)
return (SET_ERROR(EACCES));
- } else {
- ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count(
- &dbuf_caches[DB_DBUF_METADATA_CACHE].size);
- ds->cache_size_bytes.value.ui64 =
- zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size);
- ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes();
- ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes();
- ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes();
- ds->hash_elements.value.ui64 = dbuf_hash_count;
- }
+ ds->cache_count.value.ui64 =
+ wmsum_value(&dbuf_sums.cache_count);
+ ds->cache_size_bytes.value.ui64 =
+ zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size);
+ ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes();
+ ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes();
+ ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes();
+ ds->cache_total_evicts.value.ui64 =
+ wmsum_value(&dbuf_sums.cache_total_evicts);
+ for (int i = 0; i < DN_MAX_LEVELS; i++) {
+ ds->cache_levels[i].value.ui64 =
+ wmsum_value(&dbuf_sums.cache_levels[i]);
+ ds->cache_levels_bytes[i].value.ui64 =
+ wmsum_value(&dbuf_sums.cache_levels_bytes[i]);
+ }
+ ds->hash_hits.value.ui64 =
+ wmsum_value(&dbuf_sums.hash_hits);
+ ds->hash_misses.value.ui64 =
+ wmsum_value(&dbuf_sums.hash_misses);
+ ds->hash_collisions.value.ui64 =
+ wmsum_value(&dbuf_sums.hash_collisions);
+ ds->hash_chains.value.ui64 =
+ wmsum_value(&dbuf_sums.hash_chains);
+ ds->hash_insert_race.value.ui64 =
+ wmsum_value(&dbuf_sums.hash_insert_race);
+ ds->hash_table_count.value.ui64 = h->hash_table_mask + 1;
+ ds->hash_mutex_count.value.ui64 = h->hash_mutex_mask + 1;
+ ds->metadata_cache_count.value.ui64 =
+ wmsum_value(&dbuf_sums.metadata_cache_count);
+ ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count(
+ &dbuf_caches[DB_DBUF_METADATA_CACHE].size);
+ ds->metadata_cache_overflow.value.ui64 =
+ wmsum_value(&dbuf_sums.metadata_cache_overflow);
return (0);
}
void
dbuf_init(void)
{
- uint64_t hsize = 1ULL << 16;
+ uint64_t hmsize, hsize = 1ULL << 16;
dbuf_hash_table_t *h = &dbuf_hash_table;
- int i;
/*
- * The hash table is big enough to fill all of physical memory
+ * The hash table is big enough to fill one eighth of physical memory
* with an average block size of zfs_arc_average_blocksize (default 8K).
* By default, the table will take up
	 * totalmem / 8 * sizeof(void*) / 8K (128KB per GB with 8-byte pointers).
*/
- while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE)
+ while (hsize * zfs_arc_average_blocksize < arc_all_memory() / 8)
hsize <<= 1;
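+
+	/*
+	 * For example, with 64 GiB of memory and the default 8K average
+	 * block size the loop stops at 2^20 buckets, i.e. an 8 MiB table
+	 * of 8-byte pointers.
+	 */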
-retry:
- h->hash_table_mask = hsize - 1;
-#if defined(_KERNEL)
+ h->hash_table = NULL;
+ while (h->hash_table == NULL) {
+ h->hash_table_mask = hsize - 1;
+
+ h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
+ if (h->hash_table == NULL)
+ hsize >>= 1;
+
+ ASSERT3U(hsize, >=, 1ULL << 10);
+ }
+
/*
- * Large allocations which do not require contiguous pages
- * should be using vmem_alloc() in the linux kernel
+ * The hash table buckets are protected by an array of mutexes where
+	 * each mutex is responsible for protecting 128 buckets. A minimum
+ * array size of 8192 is targeted to avoid contention.
*/
- h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP);
-#else
- h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
-#endif
- if (h->hash_table == NULL) {
- /* XXX - we should really return an error instead of assert */
- ASSERT(hsize > (1ULL << 10));
- hsize >>= 1;
- goto retry;
+ if (dbuf_mutex_cache_shift == 0)
+ hmsize = MAX(hsize >> 7, 1ULL << 13);
+ else
+ hmsize = 1ULL << MIN(dbuf_mutex_cache_shift, 24);
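+
+	/*
+	 * Continuing the example above, 2^20 buckets >> 7 gives 2^13, so
+	 * the 8192-mutex floor applies until the table grows past 2^20
+	 * buckets (i.e. on systems with more than 64 GiB of memory).
+	 */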
+
+ h->hash_mutexes = NULL;
+ while (h->hash_mutexes == NULL) {
+ h->hash_mutex_mask = hmsize - 1;
+
+ h->hash_mutexes = vmem_zalloc(hmsize * sizeof (kmutex_t),
+ KM_SLEEP);
+ if (h->hash_mutexes == NULL)
+ hmsize >>= 1;
}
dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
sizeof (dmu_buf_impl_t),
0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
- for (i = 0; i < DBUF_MUTEXES; i++)
+ for (int i = 0; i < hmsize; i++)
mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
dbuf_stats_init(h);
dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0);
for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
- dbuf_caches[dcs].cache =
- multilist_create(sizeof (dmu_buf_impl_t),
+ multilist_create(&dbuf_caches[dcs].cache,
+ sizeof (dmu_buf_impl_t),
offsetof(dmu_buf_impl_t, db_cache_link),
dbuf_cache_multilist_index_func);
zfs_refcount_create(&dbuf_caches[dcs].size);
dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
NULL, 0, &p0, TS_RUN, minclsyspri);
+ wmsum_init(&dbuf_sums.cache_count, 0);
+ wmsum_init(&dbuf_sums.cache_total_evicts, 0);
+ for (int i = 0; i < DN_MAX_LEVELS; i++) {
+ wmsum_init(&dbuf_sums.cache_levels[i], 0);
+ wmsum_init(&dbuf_sums.cache_levels_bytes[i], 0);
+ }
+ wmsum_init(&dbuf_sums.hash_hits, 0);
+ wmsum_init(&dbuf_sums.hash_misses, 0);
+ wmsum_init(&dbuf_sums.hash_collisions, 0);
+ wmsum_init(&dbuf_sums.hash_chains, 0);
+ wmsum_init(&dbuf_sums.hash_insert_race, 0);
+ wmsum_init(&dbuf_sums.metadata_cache_count, 0);
+ wmsum_init(&dbuf_sums.metadata_cache_overflow, 0);
+
dbuf_ksp = kstat_create("zfs", 0, "dbufstats", "misc",
KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t),
KSTAT_FLAG_VIRTUAL);
if (dbuf_ksp != NULL) {
- for (i = 0; i < DN_MAX_LEVELS; i++) {
+ for (int i = 0; i < DN_MAX_LEVELS; i++) {
snprintf(dbuf_stats.cache_levels[i].name,
KSTAT_STRLEN, "cache_level_%d", i);
dbuf_stats.cache_levels[i].data_type =
dbuf_fini(void)
{
dbuf_hash_table_t *h = &dbuf_hash_table;
- int i;
dbuf_stats_destroy();
- for (i = 0; i < DBUF_MUTEXES; i++)
+ for (int i = 0; i < (h->hash_mutex_mask + 1); i++)
mutex_destroy(&h->hash_mutexes[i]);
-#if defined(_KERNEL)
- /*
- * Large allocations which do not require contiguous pages
- * should be using vmem_free() in the linux kernel
- */
+
vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
-#else
- kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
-#endif
+ vmem_free(h->hash_mutexes, (h->hash_mutex_mask + 1) *
+ sizeof (kmutex_t));
+
kmem_cache_destroy(dbuf_kmem_cache);
taskq_destroy(dbu_evict_taskq);
for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
zfs_refcount_destroy(&dbuf_caches[dcs].size);
- multilist_destroy(dbuf_caches[dcs].cache);
+ multilist_destroy(&dbuf_caches[dcs].cache);
}
if (dbuf_ksp != NULL) {
kstat_delete(dbuf_ksp);
dbuf_ksp = NULL;
}
+
+ wmsum_fini(&dbuf_sums.cache_count);
+ wmsum_fini(&dbuf_sums.cache_total_evicts);
+ for (int i = 0; i < DN_MAX_LEVELS; i++) {
+ wmsum_fini(&dbuf_sums.cache_levels[i]);
+ wmsum_fini(&dbuf_sums.cache_levels_bytes[i]);
+ }
+ wmsum_fini(&dbuf_sums.hash_hits);
+ wmsum_fini(&dbuf_sums.hash_misses);
+ wmsum_fini(&dbuf_sums.hash_collisions);
+ wmsum_fini(&dbuf_sums.hash_chains);
+ wmsum_fini(&dbuf_sums.hash_insert_race);
+ wmsum_fini(&dbuf_sums.metadata_cache_count);
+ wmsum_fini(&dbuf_sums.metadata_cache_overflow);
}
/*
if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
(db->db_buf == NULL || db->db_buf->b_data) &&
db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
- db->db_state != DB_FILL && !dn->dn_free_txg) {
+ db->db_state != DB_FILL && (dn == NULL || !dn->dn_free_txg)) {
/*
* If the blkptr isn't set but they have nonzero data,
* it had better be dirty, otherwise we'll lose that
ASSERT0(bp->blk_pad[1]);
ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT(BP_IS_HOLE(bp));
- ASSERT0(bp->blk_phys_birth);
+ ASSERT0(BP_GET_PHYSICAL_BIRTH(bp));
}
}
}
db->db.db_data = buf->b_data;
}
-static arc_buf_t *
-dbuf_alloc_arcbuf_from_arcbuf(dmu_buf_impl_t *db, arc_buf_t *data)
-{
- objset_t *os = db->db_objset;
- spa_t *spa = os->os_spa;
- arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- enum zio_compress compress_type;
- int psize, lsize;
-
- psize = arc_buf_size(data);
- lsize = arc_buf_lsize(data);
- compress_type = arc_get_compression(data);
-
- if (arc_is_encrypted(data)) {
- boolean_t byteorder;
- uint8_t salt[ZIO_DATA_SALT_LEN];
- uint8_t iv[ZIO_DATA_IV_LEN];
- uint8_t mac[ZIO_DATA_MAC_LEN];
- dnode_t *dn = DB_DNODE(db);
-
- arc_get_raw_params(data, &byteorder, salt, iv, mac);
- data = arc_alloc_raw_buf(spa, db, dmu_objset_id(os),
- byteorder, salt, iv, mac, dn->dn_type, psize, lsize,
- compress_type);
- } else if (compress_type != ZIO_COMPRESS_OFF) {
- ASSERT3U(type, ==, ARC_BUFC_DATA);
- data = arc_alloc_compressed_buf(spa, db,
- psize, lsize, compress_type);
- } else {
- data = arc_alloc_buf(spa, db, type, psize);
- }
- return (data);
-}
-
static arc_buf_t *
dbuf_alloc_arcbuf(dmu_buf_impl_t *db)
{
mutex_exit(&db->db_mtx);
abuf = arc_loan_buf(spa, B_FALSE, blksz);
- bcopy(db->db.db_data, abuf->b_data, blksz);
+ memcpy(abuf->b_data, db->db.db_data, blksz);
} else {
abuf = db->db_buf;
arc_loan_inuse_buf(abuf, db);
* used when modifying or reading db_blkptr.
*/
db_lock_type_t
-dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, void *tag)
+dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, const void *tag)
{
enum db_lock_type ret = DLT_NONE;
if (db->db_parent != NULL) {
* panic if we didn't pass the lock type in.
*/
void
-dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, void *tag)
+dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, const void *tag)
{
if (type == DLT_PARENT)
rw_exit(&db->db_parent->db_rwlock);
dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
arc_buf_t *buf, void *vdb)
{
+ (void) zb, (void) bp;
dmu_buf_impl_t *db = vdb;
mutex_enter(&db->db_mtx);
/* freed in flight */
ASSERT(zio == NULL || zio->io_error == 0);
arc_release(buf, db);
- bzero(buf->b_data, db->db.db_size);
+ memset(buf->b_data, 0, db->db.db_size);
arc_buf_freeze(buf);
db->db_freed_in_flight = FALSE;
dbuf_set_data(db, buf);
db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP);
arc_space_consume(max_bonuslen, ARC_SPACE_BONUS);
if (bonuslen < max_bonuslen)
- bzero(db->db.db_data, max_bonuslen);
+ memset(db->db.db_data, 0, max_bonuslen);
if (bonuslen)
- bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
+ memcpy(db->db.db_data, DN_BONUS(dn->dn_phys), bonuslen);
db->db_state = DB_CACHED;
DTRACE_SET_STATE(db, "bonus buffer filled");
return (0);
}
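+/*
+ * Fill the in-memory copy of an indirect block that is a hole with child
+ * blkptrs that inherit type, level and birth txg from the parent blkptr
+ * (dbbp), so readers below it see holes with a consistent birth time.
+ */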
static void
-dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn)
+dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *dbbp)
{
blkptr_t *bps = db->db.db_data;
uint32_t indbs = 1ULL << dn->dn_indblkshift;
for (int i = 0; i < n_bps; i++) {
blkptr_t *bp = &bps[i];
- ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, indbs);
- BP_SET_LSIZE(bp, BP_GET_LEVEL(db->db_blkptr) == 1 ?
- dn->dn_datablksz : BP_GET_LSIZE(db->db_blkptr));
- BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr));
- BP_SET_LEVEL(bp, BP_GET_LEVEL(db->db_blkptr) - 1);
- BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0);
+ ASSERT3U(BP_GET_LSIZE(dbbp), ==, indbs);
+ BP_SET_LSIZE(bp, BP_GET_LEVEL(dbbp) == 1 ?
+ dn->dn_datablksz : BP_GET_LSIZE(dbbp));
+ BP_SET_TYPE(bp, BP_GET_TYPE(dbbp));
+ BP_SET_LEVEL(bp, BP_GET_LEVEL(dbbp) - 1);
+ BP_SET_BIRTH(bp, BP_GET_LOGICAL_BIRTH(dbbp), 0);
}
}
* was taken, ENOENT if no action was taken.
*/
static int
-dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
+dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
{
ASSERT(MUTEX_HELD(&db->db_mtx));
- int is_hole = db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr);
+ int is_hole = bp == NULL || BP_IS_HOLE(bp);
/*
* For level 0 blocks only, if the above check fails:
* Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
* processes the delete record and clears the bp while we are waiting
* for the dn_mtx (resulting in a "no" from block_freed).
*/
- if (!is_hole && db->db_level == 0) {
- is_hole = dnode_block_freed(dn, db->db_blkid) ||
- BP_IS_HOLE(db->db_blkptr);
- }
+ if (!is_hole && db->db_level == 0)
+ is_hole = dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(bp);
if (is_hole) {
dbuf_set_data(db, dbuf_alloc_arcbuf(db));
- bzero(db->db.db_data, db->db.db_size);
+ memset(db->db.db_data, 0, db->db.db_size);
- if (db->db_blkptr != NULL && db->db_level > 0 &&
- BP_IS_HOLE(db->db_blkptr) &&
- db->db_blkptr->blk_birth != 0) {
- dbuf_handle_indirect_hole(db, dn);
+ if (bp != NULL && db->db_level > 0 && BP_IS_HOLE(bp) &&
+ BP_GET_LOGICAL_BIRTH(bp) != 0) {
+ dbuf_handle_indirect_hole(db, dn, bp);
}
db->db_state = DB_CACHED;
DTRACE_SET_STATE(db, "hole read satisfied");
ASSERT(MUTEX_HELD(&db->db_mtx));
- if (!os->os_encrypted || os->os_raw_receive ||
- (flags & DB_RF_NO_DECRYPT) != 0)
+ if ((flags & DB_RF_NO_DECRYPT) != 0 ||
+ !os->os_encrypted || os->os_raw_receive)
return (0);
DB_DNODE_ENTER(db);
*/
static int
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags,
- db_lock_type_t dblt, void *tag)
+ db_lock_type_t dblt, const void *tag)
{
dnode_t *dn;
zbookmark_phys_t zb;
uint32_t aflags = ARC_FLAG_NOWAIT;
int err, zio_flags;
- boolean_t bonus_read;
+ blkptr_t bp, *bpp;
- err = zio_flags = 0;
- bonus_read = B_FALSE;
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(db->db_state == DB_UNCACHED);
+ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
ASSERT(db->db_buf == NULL);
ASSERT(db->db_parent == NULL ||
RW_LOCK_HELD(&db->db_parent->db_rwlock));
goto early_unlock;
}
- err = dbuf_read_hole(db, dn, flags);
+ if (db->db_state == DB_UNCACHED) {
+ if (db->db_blkptr == NULL) {
+ bpp = NULL;
+ } else {
+ bp = *db->db_blkptr;
+ bpp = &bp;
+ }
+ } else {
+ dbuf_dirty_record_t *dr;
+
+ ASSERT3S(db->db_state, ==, DB_NOFILL);
+
+ /*
+ * Block cloning: If we have a pending block clone,
+		 * we want to read the content of the block being cloned
+		 * rather than the underlying block, so we have the most
+		 * recent data.
+ */
+ dr = list_head(&db->db_dirty_records);
+ if (dr == NULL || !dr->dt.dl.dr_brtwrite) {
+			err = SET_ERROR(EIO);
+ goto early_unlock;
+ }
+ bp = dr->dt.dl.dr_overridden_by;
+ bpp = &bp;
+ }
+
+ err = dbuf_read_hole(db, dn, bpp);
if (err == 0)
goto early_unlock;
+ ASSERT(bpp != NULL);
+
/*
* Any attempt to read a redacted block should result in an error. This
* will never happen under normal conditions, but can be useful for
* debugging purposes.
*/
- if (BP_IS_REDACTED(db->db_blkptr)) {
+ if (BP_IS_REDACTED(bpp)) {
ASSERT(dsl_dataset_feature_is_active(
db->db_objset->os_dsl_dataset,
SPA_FEATURE_REDACTED_DATASETS));
* All bps of an encrypted os should have the encryption bit set.
* If this is not true it indicates tampering and we report an error.
*/
- if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) {
- spa_log_error(db->db_objset->os_spa, &zb);
- zfs_panic_recover("unencrypted block in encrypted "
- "object set %llu", dmu_objset_id(db->db_objset));
+ if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) {
+ spa_log_error(db->db_objset->os_spa, &zb,
+		    BP_GET_LOGICAL_BIRTH(bpp));
err = SET_ERROR(EIO);
goto early_unlock;
}
DTRACE_SET_STATE(db, "read issued");
mutex_exit(&db->db_mtx);
- if (DBUF_IS_L2CACHEABLE(db))
+ if (!DBUF_IS_CACHEABLE(db))
+ aflags |= ARC_FLAG_UNCACHED;
+ else if (dbuf_is_l2cacheable(db))
aflags |= ARC_FLAG_L2CACHE;
dbuf_add_ref(db, NULL);
if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr))
zio_flags |= ZIO_FLAG_RAW;
/*
- * The zio layer will copy the provided blkptr later, but we need to
- * do this now so that we can release the parent's rwlock. We have to
- * do that now so that if dbuf_read_done is called synchronously (on
+ * The zio layer will copy the provided blkptr later, but we have our
+ * own copy so that we can release the parent's rwlock. We have to
+ * do that so that if dbuf_read_done is called synchronously (on
* an l1 cache hit) we don't acquire the db_mtx while holding the
* parent's rwlock, which would be a lock ordering violation.
*/
- blkptr_t bp = *db->db_blkptr;
dmu_buf_unlock_parent(db, dblt, tag);
- (void) arc_read(zio, db->db_objset->os_spa, &bp,
+ (void) arc_read(zio, db->db_objset->os_spa, bpp,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
&aflags, &zb);
return (err);
int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP);
arc_space_consume(bonuslen, ARC_SPACE_BONUS);
- bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen);
+ memcpy(dr->dt.dl.dr_data, db->db.db_data, bonuslen);
} else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) {
- arc_buf_t *buf = dbuf_alloc_arcbuf_from_arcbuf(db, db->db_buf);
- dr->dt.dl.dr_data = buf;
- bcopy(db->db.db_data, buf->b_data, arc_buf_size(buf));
+ dnode_t *dn = DB_DNODE(db);
+ int size = arc_buf_size(db->db_buf);
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ spa_t *spa = db->db_objset->os_spa;
+ enum zio_compress compress_type =
+ arc_get_compression(db->db_buf);
+ uint8_t complevel = arc_get_complevel(db->db_buf);
+
+ if (arc_is_encrypted(db->db_buf)) {
+ boolean_t byteorder;
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ uint8_t iv[ZIO_DATA_IV_LEN];
+ uint8_t mac[ZIO_DATA_MAC_LEN];
+
+ arc_get_raw_params(db->db_buf, &byteorder, salt,
+ iv, mac);
+ dr->dt.dl.dr_data = arc_alloc_raw_buf(spa, db,
+ dmu_objset_id(dn->dn_objset), byteorder, salt, iv,
+ mac, dn->dn_type, size, arc_buf_lsize(db->db_buf),
+ compress_type, complevel);
+ } else if (compress_type != ZIO_COMPRESS_OFF) {
+ ASSERT3U(type, ==, ARC_BUFC_DATA);
+ dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
+ size, arc_buf_lsize(db->db_buf), compress_type,
+ complevel);
+ } else {
+ dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
+ }
+ memcpy(dr->dt.dl.dr_data->b_data, db->db.db_data, size);
} else {
db->db_buf = NULL;
dbuf_clear_data(db);
*/
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
- if (db->db_state == DB_NOFILL)
- return (SET_ERROR(EIO));
-
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
- (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
- DBUF_IS_CACHEABLE(db);
+ (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL;
mutex_enter(&db->db_mtx);
+ if (flags & DB_RF_PARTIAL_FIRST)
+ db->db_partial_read = B_TRUE;
+ else if (!(flags & DB_RF_PARTIAL_MORE))
+ db->db_partial_read = B_FALSE;
if (db->db_state == DB_CACHED) {
- spa_t *spa = dn->dn_objset->os_spa;
-
/*
* Ensure that this block's dnode has been decrypted if
* the caller has requested decrypted data.
(arc_is_encrypted(db->db_buf) ||
arc_is_unauthenticated(db->db_buf) ||
arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
+ spa_t *spa = dn->dn_objset->os_spa;
zbookmark_phys_t zb;
SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset),
mutex_exit(&db->db_mtx);
if (err == 0 && prefetch) {
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
- flags & DB_RF_HAVESTRUCT);
+ B_FALSE, flags & DB_RF_HAVESTRUCT);
}
DB_DNODE_EXIT(db);
DBUF_STAT_BUMP(hash_hits);
- } else if (db->db_state == DB_UNCACHED) {
- spa_t *spa = dn->dn_objset->os_spa;
+ } else if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL) {
boolean_t need_wait = B_FALSE;
db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
- if (zio == NULL &&
- db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
+ if (zio == NULL && (db->db_state == DB_NOFILL ||
+ (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
+ spa_t *spa = dn->dn_objset->os_spa;
zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
need_wait = B_TRUE;
}
*/
if (!err && prefetch) {
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
+ db->db_state != DB_CACHED,
flags & DB_RF_HAVESTRUCT);
}
mutex_exit(&db->db_mtx);
if (prefetch) {
dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
- flags & DB_RF_HAVESTRUCT);
+ B_TRUE, flags & DB_RF_HAVESTRUCT);
}
DB_DNODE_EXIT(db);
DBUF_STAT_BUMP(hash_misses);
if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
zio_free(db->db_objset->os_spa, txg, bp);
+ if (dr->dt.dl.dr_brtwrite) {
+ ASSERT0P(dr->dt.dl.dr_data);
+ dr->dt.dl.dr_data = db->db_buf;
+ }
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
dr->dt.dl.dr_nopwrite = B_FALSE;
+ dr->dt.dl.dr_brtwrite = B_FALSE;
dr->dt.dl.dr_has_raw_params = B_FALSE;
/*
* the buf thawed to save the effort of freezing &
* immediately re-thawing it.
*/
- arc_release(dr->dt.dl.dr_data, db);
+ if (dr->dt.dl.dr_data)
+ arc_release(dr->dt.dl.dr_data, db);
}
/*
if (end_blkid > dn->dn_maxblkid &&
!(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID))
end_blkid = dn->dn_maxblkid;
- dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
+ dprintf_dnode(dn, "start=%llu end=%llu\n", (u_longlong_t)start_blkid,
+ (u_longlong_t)end_blkid);
db_search = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP);
db_search->db_level = 0;
ASSERT(db->db.db_data != NULL);
arc_release(db->db_buf, db);
rw_enter(&db->db_rwlock, RW_WRITER);
- bzero(db->db.db_data, db->db.db_size);
+ memset(db->db.db_data, 0, db->db.db_size);
rw_exit(&db->db_rwlock);
arc_buf_freeze(db->db_buf);
}
mutex_exit(&db->db_mtx);
}
- kmem_free(db_search, sizeof (dmu_buf_impl_t));
mutex_exit(&dn->dn_dbufs_mtx);
+ kmem_free(db_search, sizeof (dmu_buf_impl_t));
}
void
/* copy old block data to the new block */
old_buf = db->db_buf;
- bcopy(old_buf->b_data, buf->b_data, MIN(osize, size));
+ memcpy(buf->b_data, old_buf->b_data, MIN(osize, size));
/* zero the remainder */
if (size > osize)
- bzero((uint8_t *)buf->b_data + osize, size - osize);
+ memset((uint8_t *)buf->b_data + osize, 0, size - osize);
mutex_enter(&db->db_mtx);
dbuf_set_data(db, buf);
}
}
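+/*
+ * Dirty a level-0 block without instantiating a dbuf for it. The data of
+ * a lightweight dirty record is attached to the record itself (dr->dt.dll)
+ * rather than to a dmu_buf_impl_t, which avoids dbuf overhead on bulk
+ * write paths.
+ */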
+dbuf_dirty_record_t *
+dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx)
+{
+ rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ IMPLY(dn->dn_objset->os_raw_receive, dn->dn_maxblkid >= blkid);
+ dnode_new_blkid(dn, blkid, tx, B_TRUE, B_FALSE);
+ ASSERT(dn->dn_maxblkid >= blkid);
+
+ dbuf_dirty_record_t *dr = kmem_zalloc(sizeof (*dr), KM_SLEEP);
+ list_link_init(&dr->dr_dirty_node);
+ list_link_init(&dr->dr_dbuf_node);
+ dr->dr_dnode = dn;
+ dr->dr_txg = tx->tx_txg;
+ dr->dt.dll.dr_blkid = blkid;
+ dr->dr_accounted = dn->dn_datablksz;
+
+ /*
+ * There should not be any dbuf for the block that we're dirtying.
+ * Otherwise the buffer contents could be inconsistent between the
+ * dbuf and the lightweight dirty record.
+ */
+ ASSERT3P(NULL, ==, dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid,
+ NULL));
+
+ mutex_enter(&dn->dn_mtx);
+ int txgoff = tx->tx_txg & TXG_MASK;
+ if (dn->dn_free_ranges[txgoff] != NULL) {
+ range_tree_clear(dn->dn_free_ranges[txgoff], blkid, 1);
+ }
+
+ if (dn->dn_nlevels == 1) {
+ ASSERT3U(blkid, <, dn->dn_nblkptr);
+ list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
+ mutex_exit(&dn->dn_mtx);
+ rw_exit(&dn->dn_struct_rwlock);
+ dnode_setdirty(dn, tx);
+ } else {
+ mutex_exit(&dn->dn_mtx);
+
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ dmu_buf_impl_t *parent_db = dbuf_hold_level(dn,
+ 1, blkid >> epbs, FTAG);
+ rw_exit(&dn->dn_struct_rwlock);
+ if (parent_db == NULL) {
+ kmem_free(dr, sizeof (*dr));
+ return (NULL);
+ }
+ int err = dbuf_read(parent_db, NULL,
+ (DB_RF_NOPREFETCH | DB_RF_CANFAIL));
+ if (err != 0) {
+ dbuf_rele(parent_db, FTAG);
+ kmem_free(dr, sizeof (*dr));
+ return (NULL);
+ }
+
+ dbuf_dirty_record_t *parent_dr = dbuf_dirty(parent_db, tx);
+ dbuf_rele(parent_db, FTAG);
+ mutex_enter(&parent_dr->dt.di.dr_mtx);
+ ASSERT3U(parent_dr->dr_txg, ==, tx->tx_txg);
+ list_insert_tail(&parent_dr->dt.di.dr_children, dr);
+ mutex_exit(&parent_dr->dt.di.dr_mtx);
+ dr->dr_parent = parent_dr;
+ }
+
+ dmu_objset_willuse_space(dn->dn_objset, dr->dr_accounted, tx);
+
+ return (dr);
+}
+
dbuf_dirty_record_t *
dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
- if (db->db_blkid != DMU_BONUS_BLKID) {
+ if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {
dmu_objset_willuse_space(os, db->db.db_size, tx);
}
dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
list_link_init(&dr->dr_dirty_node);
list_link_init(&dr->dr_dbuf_node);
+ dr->dr_dnode = dn;
if (db->db_level == 0) {
void *data_old = db->db_buf;
sizeof (dbuf_dirty_record_t),
offsetof(dbuf_dirty_record_t, dr_dirty_node));
}
- if (db->db_blkid != DMU_BONUS_BLKID)
+ if (db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_NOFILL) {
dr->dr_accounted = db->db.db_size;
+ }
dr->dr_dbuf = db;
dr->dr_txg = tx->tx_txg;
list_insert_before(&db->db_dirty_records, dr_next, dr);
dmu_buf_impl_t *db = dr->dr_dbuf;
if (dr->dt.dl.dr_data != db->db.db_data) {
- struct dnode *dn = DB_DNODE(db);
+ struct dnode *dn = dr->dr_dnode;
int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
kmem_free(dr->dt.dl.dr_data, max_bonuslen);
* Undirty a buffer in the transaction group referenced by the given
* transaction. Return whether this evicted the dbuf.
*/
-static boolean_t
+boolean_t
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
- dnode_t *dn;
uint64_t txg = tx->tx_txg;
- dbuf_dirty_record_t *dr;
+ boolean_t brtwrite;
ASSERT(txg != 0);
/*
* If this buffer is not dirty, we're done.
*/
- dr = dbuf_find_dirty_eq(db, txg);
+ dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, txg);
if (dr == NULL)
return (B_FALSE);
ASSERT(dr->dr_dbuf == db);
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
+ brtwrite = dr->dt.dl.dr_brtwrite;
+ if (brtwrite) {
+ /*
+ * We are freeing a block that we cloned in the same
+ * transaction group.
+ */
+ brt_pending_remove(dmu_objset_spa(db->db_objset),
+ &dr->dt.dl.dr_overridden_by, tx);
+ }
+
+ dnode_t *dn = dr->dr_dnode;
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
mutex_exit(&dn->dn_mtx);
}
- DB_DNODE_EXIT(db);
- if (db->db_state != DB_NOFILL) {
+ if (db->db_state != DB_NOFILL && !brtwrite) {
dbuf_unoverride(dr);
ASSERT(db->db_buf != NULL);
db->db_dirtycnt -= 1;
if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
- ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
+ ASSERT(db->db_state == DB_NOFILL || brtwrite ||
+ arc_released(db->db_buf));
dbuf_destroy(db);
return (B_TRUE);
}
dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ boolean_t undirty = B_FALSE;
ASSERT(tx->tx_txg != 0);
ASSERT(!zfs_refcount_is_zero(&db->db_holds));
*/
mutex_enter(&db->db_mtx);
- if (db->db_state == DB_CACHED) {
+ if (db->db_state == DB_CACHED || db->db_state == DB_NOFILL) {
dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg);
/*
* It's possible that it is already dirty but not cached,
* go through dmu_buf_will_dirty().
*/
if (dr != NULL) {
- /* This dbuf is already dirty and cached. */
- dbuf_redirty(dr);
- mutex_exit(&db->db_mtx);
- return;
+ if (dr->dt.dl.dr_brtwrite) {
+ /*
+ * Block cloning: If we are dirtying a cloned
+ * block, we cannot simply redirty it, because
+ * this dr has no data associated with it.
+ * We will go through a full undirtying below,
+ * before dirtying it again.
+ */
+ undirty = B_TRUE;
+ } else {
+ /* This dbuf is already dirty and cached. */
+ dbuf_redirty(dr);
+ mutex_exit(&db->db_mtx);
+ return;
+ }
}
}
mutex_exit(&db->db_mtx);
if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
flags |= DB_RF_HAVESTRUCT;
DB_DNODE_EXIT(db);
+
+ /*
+ * Block cloning: Do the dbuf_read() before undirtying the dbuf, as we
+ * want to make sure dbuf_read() will read the pending cloned block and
+	 * not the underlying block that is being replaced. dbuf_undirty() will
+	 * do dbuf_unoverride(), so we will end up with the cloned block
+	 * content without the overridden BP.
+ */
(void) dbuf_read(db, NULL, flags);
+ if (undirty) {
+ mutex_enter(&db->db_mtx);
+ VERIFY(!dbuf_undirty(db, tx));
+ mutex_exit(&db->db_mtx);
+ }
(void) dbuf_dirty(db, tx);
}
return (dr != NULL);
}
+void
+dmu_buf_will_clone(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+
+ /*
+ * Block cloning: We are going to clone into this block, so undirty
+ * modifications done to this block so far in this txg. This includes
+ * writes and clones into this block.
+ */
+ mutex_enter(&db->db_mtx);
+ DBUF_VERIFY(db);
+ VERIFY(!dbuf_undirty(db, tx));
+ ASSERT0P(dbuf_find_dirty_eq(db, tx->tx_txg));
+ if (db->db_buf != NULL) {
+ arc_buf_destroy(db->db_buf, db);
+ db->db_buf = NULL;
+ dbuf_clear_data(db);
+ }
+
+ db->db_state = DB_NOFILL;
+ DTRACE_SET_STATE(db, "allocating NOFILL buffer for clone");
+
+ DBUF_VERIFY(db);
+ mutex_exit(&db->db_mtx);
+
+ dbuf_noread(db);
+ (void) dbuf_dirty(db, tx);
+}
+
void
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ mutex_enter(&db->db_mtx);
db->db_state = DB_NOFILL;
DTRACE_SET_STATE(db, "allocating NOFILL buffer");
- dmu_buf_will_fill(db_fake, tx);
+ mutex_exit(&db->db_mtx);
+
+ dbuf_noread(db);
+ (void) dbuf_dirty(db, tx);
}
void
-dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
+dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
dmu_tx_private_ok(tx));
+ mutex_enter(&db->db_mtx);
+ if (db->db_state == DB_NOFILL) {
+ /*
+ * Block cloning: We will be completely overwriting a block
+ * cloned in this transaction group, so let's undirty the
+ * pending clone and mark the block as uncached. This will be
+		 * as if the clone was never done. But if the fill can fail,
+		 * we need a way to fall back to the cloned data.
+ */
+ if (canfail && dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) {
+ mutex_exit(&db->db_mtx);
+ dmu_buf_will_dirty(db_fake, tx);
+ return;
+ }
+ VERIFY(!dbuf_undirty(db, tx));
+ db->db_state = DB_UNCACHED;
+ }
+ mutex_exit(&db->db_mtx);
+
dbuf_noread(db);
(void) dbuf_dirty(db, tx);
}
dr->dt.dl.dr_has_raw_params = B_TRUE;
dr->dt.dl.dr_byteorder = byteorder;
- bcopy(salt, dr->dt.dl.dr_salt, ZIO_DATA_SALT_LEN);
- bcopy(iv, dr->dt.dl.dr_iv, ZIO_DATA_IV_LEN);
- bcopy(mac, dr->dt.dl.dr_mac, ZIO_DATA_MAC_LEN);
+ memcpy(dr->dt.dl.dr_salt, salt, ZIO_DATA_SALT_LEN);
+ memcpy(dr->dt.dl.dr_iv, iv, ZIO_DATA_IV_LEN);
+ memcpy(dr->dt.dl.dr_mac, mac, ZIO_DATA_MAC_LEN);
}
static void
dbuf_dirty_record_t *dr;
dr = list_head(&db->db_dirty_records);
+ ASSERT3P(dr, !=, NULL);
ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
dl = &dr->dt.dl;
dl->dr_overridden_by = *bp;
dl->dr_override_state = DR_OVERRIDDEN;
- dl->dr_overridden_by.blk_birth = dr->dr_txg;
+ BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);
}
-/* ARGSUSED */
-void
-dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx)
+boolean_t
+dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)
{
+ (void) tx;
dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
- dbuf_states_t old_state;
mutex_enter(&db->db_mtx);
DBUF_VERIFY(db);
- old_state = db->db_state;
- db->db_state = DB_CACHED;
- if (old_state == DB_FILL) {
+ if (db->db_state == DB_FILL) {
if (db->db_level == 0 && db->db_freed_in_flight) {
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
/* we were freed while filling */
/* XXX dbuf_undirty? */
- bzero(db->db.db_data, db->db.db_size);
+ memset(db->db.db_data, 0, db->db.db_size);
db->db_freed_in_flight = FALSE;
+ db->db_state = DB_CACHED;
DTRACE_SET_STATE(db,
"fill done handling freed in flight");
+ failed = B_FALSE;
+ } else if (failed) {
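+			/*
+			 * The fill was aborted: undirty the buffer and
+			 * detach its data so the partial fill is never
+			 * observed.
+			 */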
+ VERIFY(!dbuf_undirty(db, tx));
+ db->db_buf = NULL;
+ dbuf_clear_data(db);
+ DTRACE_SET_STATE(db, "fill failed");
} else {
+ db->db_state = DB_CACHED;
DTRACE_SET_STATE(db, "fill done");
}
cv_broadcast(&db->db_changed);
+ } else {
+ db->db_state = DB_CACHED;
+ failed = B_FALSE;
}
mutex_exit(&db->db_mtx);
+ return (failed);
}
void
dmu_buf_will_not_fill(dbuf, tx);
dr = list_head(&db->db_dirty_records);
+ ASSERT3P(dr, !=, NULL);
ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
dl = &dr->dt.dl;
encode_embedded_bp_compressed(&dl->dr_overridden_by,
BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
dl->dr_override_state = DR_OVERRIDDEN;
- dl->dr_overridden_by.blk_birth = dr->dr_txg;
+ BP_SET_LOGICAL_BIRTH(&dl->dr_overridden_by, dr->dr_txg);
}
void
while (db->db_state == DB_READ || db->db_state == DB_FILL)
cv_wait(&db->db_changed, &db->db_mtx);
- ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
+ ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED ||
+ db->db_state == DB_NOFILL);
if (db->db_state == DB_CACHED &&
zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
ASSERT(!arc_is_encrypted(buf));
mutex_exit(&db->db_mtx);
(void) dbuf_dirty(db, tx);
- bcopy(buf->b_data, db->db.db_data, db->db.db_size);
+ memcpy(db->db.db_data, buf->b_data, db->db.db_size);
arc_buf_destroy(buf, db);
- xuio_stat_wbuf_copied();
return;
}
- xuio_stat_wbuf_nocopy();
if (db->db_state == DB_CACHED) {
dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
arc_buf_destroy(db->db_buf, db);
}
db->db_buf = NULL;
+ } else if (db->db_state == DB_NOFILL) {
+ /*
+ * We will be completely replacing the cloned block. In case
+ * it was cloned in this transaction group, let's undirty the
+ * pending clone and mark the block as uncached. This will be
+ * as if the clone was never done.
+ */
+ VERIFY(!dbuf_undirty(db, tx));
+ db->db_state = DB_UNCACHED;
}
ASSERT(db->db_buf == NULL);
dbuf_set_data(db, buf);
DTRACE_SET_STATE(db, "filling assigned arcbuf");
mutex_exit(&db->db_mtx);
(void) dbuf_dirty(db, tx);
- dmu_buf_fill_done(&db->db, tx);
+ dmu_buf_fill_done(&db->db, tx, B_FALSE);
}
void
ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
db->db_caching_status == DB_DBUF_METADATA_CACHE);
- multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
+ multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
+
+ ASSERT0(dmu_buf_user_size(&db->db));
(void) zfs_refcount_remove_many(
&dbuf_caches[db->db_caching_status].size,
db->db.db_size, db);
ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
ASSERT(!multilist_link_active(&db->db_cache_link));
- kmem_cache_free(dbuf_kmem_cache, db);
- arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
-
/*
* If this dbuf is referenced from an indirect dbuf,
* decrement the ref count on the indirect dbuf.
mutex_enter(&parent->db_mtx);
dbuf_rele_and_unlock(parent, db, B_TRUE);
}
+
+ kmem_cache_free(dbuf_kmem_cache, db);
+ arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF);
}
/*
static dmu_buf_impl_t *
dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
- dmu_buf_impl_t *parent, blkptr_t *blkptr)
+ dmu_buf_impl_t *parent, blkptr_t *blkptr, uint64_t hash)
{
objset_t *os = dn->dn_objset;
dmu_buf_impl_t *db, *odb;
db->db_dnode_handle = dn->dn_handle;
db->db_parent = parent;
db->db_blkptr = blkptr;
+ db->db_hash = hash;
db->db_user = NULL;
db->db_user_immediate_evict = FALSE;
db->db_state = DB_EVICTING; /* not worth logging this state change */
if ((odb = dbuf_hash_insert(db)) != NULL) {
/* someone else inserted it first */
- kmem_cache_free(dbuf_kmem_cache, db);
mutex_exit(&dn->dn_dbufs_mtx);
+ kmem_cache_free(dbuf_kmem_cache, db);
DBUF_STAT_BUMP(hash_insert_race);
return (odb);
}
err = dbuf_findbp(dn, level, blkid, B_FALSE, &dbp, &bp2);
if (err == 0) {
+ ASSERT3P(bp2, !=, NULL);
*bp = *bp2;
if (dbp != NULL)
dbuf_rele(dbp, NULL);
zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
+ dbuf_prefetch_fn dpa_cb; /* prefetch completion callback */
+ void *dpa_arg; /* prefetch completion arg */
} dbuf_prefetch_arg_t;
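+/*
+ * Report completion (or abandonment) of a prefetch to the registered
+ * callback, if any, and free the prefetch state. io_done indicates
+ * whether a read was actually issued for the target block.
+ */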
+static void
+dbuf_prefetch_fini(dbuf_prefetch_arg_t *dpa, boolean_t io_done)
+{
+ if (dpa->dpa_cb != NULL) {
+ dpa->dpa_cb(dpa->dpa_arg, dpa->dpa_zb.zb_level,
+ dpa->dpa_zb.zb_blkid, io_done);
+ }
+ kmem_free(dpa, sizeof (*dpa));
+}
+
+static void
+dbuf_issue_final_prefetch_done(zio_t *zio, const zbookmark_phys_t *zb,
+ const blkptr_t *iobp, arc_buf_t *abuf, void *private)
+{
+ (void) zio, (void) zb, (void) iobp;
+ dbuf_prefetch_arg_t *dpa = private;
+
+ if (abuf != NULL)
+ arc_buf_destroy(abuf, private);
+
+ dbuf_prefetch_fini(dpa, B_TRUE);
+}
+
/*
* Actually issue the prefetch read for the block given.
*/
SPA_FEATURE_REDACTED_DATASETS));
if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
- return;
+ return (dbuf_prefetch_fini(dpa, B_FALSE));
int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
arc_flags_t aflags =
- dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+ dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
+ ARC_FLAG_NO_BUF;
/* dnodes are always read as raw and then converted later */
if (BP_GET_TYPE(bp) == DMU_OT_DNODE && BP_IS_PROTECTED(bp) &&
ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
ASSERT(dpa->dpa_zio != NULL);
- (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
+ (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp,
+ dbuf_issue_final_prefetch_done, dpa,
dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb);
}
dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
const blkptr_t *iobp, arc_buf_t *abuf, void *private)
{
+ (void) zb, (void) iobp;
dbuf_prefetch_arg_t *dpa = private;
ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
if (abuf == NULL) {
ASSERT(zio == NULL || zio->io_error != 0);
- kmem_free(dpa, sizeof (*dpa));
+ dbuf_prefetch_fini(dpa, B_TRUE);
return;
}
ASSERT(zio == NULL || zio->io_error == 0);
dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
dpa->dpa_curlevel, curblkid, FTAG);
if (db == NULL) {
- kmem_free(dpa, sizeof (*dpa));
arc_buf_destroy(abuf, private);
+ dbuf_prefetch_fini(dpa, B_TRUE);
return;
}
-
(void) dbuf_read(db, NULL,
DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
dbuf_rele(db, FTAG);
blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
- ASSERT(!BP_IS_REDACTED(bp) ||
+ ASSERT(!BP_IS_REDACTED(bp) || (dpa->dpa_dnode &&
dsl_dataset_feature_is_active(
dpa->dpa_dnode->dn_objset->os_dsl_dataset,
- SPA_FEATURE_REDACTED_DATASETS));
+ SPA_FEATURE_REDACTED_DATASETS)));
if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) {
- kmem_free(dpa, sizeof (*dpa));
+ arc_buf_destroy(abuf, private);
+ dbuf_prefetch_fini(dpa, B_TRUE);
+ return;
} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
dbuf_issue_final_prefetch(dpa, bp);
- kmem_free(dpa, sizeof (*dpa));
} else {
arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
zbookmark_phys_t zb;
dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
- bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
+ bp, dbuf_prefetch_indirect_done, dpa,
+ ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&iter_aflags, &zb);
}
* complete. Note that the prefetch might fail if the dataset is encrypted and
* the encryption key is unmapped before the IO completes.
*/
-void
-dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
- arc_flags_t aflags)
+int
+dbuf_prefetch_impl(dnode_t *dn, int64_t level, uint64_t blkid,
+ zio_priority_t prio, arc_flags_t aflags, dbuf_prefetch_fn cb,
+ void *arg)
{
blkptr_t bp;
int epbs, nlevels, curlevel;
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
if (blkid > dn->dn_maxblkid)
- return;
+ goto no_issue;
if (level == 0 && dnode_block_freed(dn, blkid))
- return;
+ goto no_issue;
/*
* This dnode hasn't been written to disk yet, so there's nothing to
*/
nlevels = dn->dn_phys->dn_nlevels;
if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
- return;
+ goto no_issue;
epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
- return;
+ goto no_issue;
dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
- level, blkid);
+ level, blkid, NULL);
if (db != NULL) {
mutex_exit(&db->db_mtx);
/*
* This dbuf already exists. It is either CACHED, or
* (we assume) about to be read or filled.
*/
- return;
+ goto no_issue;
}
/*
dsl_dataset_feature_is_active(dn->dn_objset->os_dsl_dataset,
SPA_FEATURE_REDACTED_DATASETS));
if (BP_IS_HOLE(&bp) || BP_IS_REDACTED(&bp))
- return;
+ goto no_issue;
ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
dpa->dpa_dnode = dn;
dpa->dpa_epbs = epbs;
dpa->dpa_zio = pio;
+ dpa->dpa_cb = cb;
+ dpa->dpa_arg = arg;
- /* flag if L2ARC eligible, l2arc_noprefetch then decides */
- if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
+ if (!DNODE_LEVEL_IS_CACHEABLE(dn, level))
+ dpa->dpa_aflags |= ARC_FLAG_UNCACHED;
+ else if (dnode_level_is_l2cacheable(&bp, dn, level))
dpa->dpa_aflags |= ARC_FLAG_L2CACHE;
/*
if (curlevel == level) {
ASSERT3U(curblkid, ==, blkid);
dbuf_issue_final_prefetch(dpa, &bp);
- kmem_free(dpa, sizeof (*dpa));
} else {
arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
zbookmark_phys_t zb;
/* flag if L2ARC eligible, l2arc_noprefetch then decides */
- if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
+ if (dnode_level_is_l2cacheable(&bp, dn, level))
iter_aflags |= ARC_FLAG_L2CACHE;
SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
dn->dn_object, curlevel, curblkid);
(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
- &bp, dbuf_prefetch_indirect_done, dpa, prio,
+ &bp, dbuf_prefetch_indirect_done, dpa,
+ ZIO_PRIORITY_SYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&iter_aflags, &zb);
}
* dpa may have already been freed.
*/
zio_nowait(pio);
+ return (1);
+no_issue:
+ if (cb != NULL)
+ cb(arg, level, blkid, B_FALSE);
+ return (0);
+}
+
+int
+dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
+ arc_flags_t aflags)
+{
+ return (dbuf_prefetch_impl(dn, level, blkid, prio, aflags, NULL, NULL));
}
/*
dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db)
{
dbuf_dirty_record_t *dr = db->db_data_pending;
- arc_buf_t *newdata, *data = dr->dt.dl.dr_data;
+ arc_buf_t *data = dr->dt.dl.dr_data;
+ enum zio_compress compress_type = arc_get_compression(data);
+ uint8_t complevel = arc_get_complevel(data);
+
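+	/*
+	 * Allocate the new buffer raw, compressed or plain to match the
+	 * ARC state of the pending data, so the copy keeps its properties.
+	 */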
+ if (arc_is_encrypted(data)) {
+ boolean_t byteorder;
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ uint8_t iv[ZIO_DATA_IV_LEN];
+ uint8_t mac[ZIO_DATA_MAC_LEN];
+
+ arc_get_raw_params(data, &byteorder, salt, iv, mac);
+ dbuf_set_data(db, arc_alloc_raw_buf(dn->dn_objset->os_spa, db,
+ dmu_objset_id(dn->dn_objset), byteorder, salt, iv, mac,
+ dn->dn_type, arc_buf_size(data), arc_buf_lsize(data),
+ compress_type, complevel));
+ } else if (compress_type != ZIO_COMPRESS_OFF) {
+ dbuf_set_data(db, arc_alloc_compressed_buf(
+ dn->dn_objset->os_spa, db, arc_buf_size(data),
+ arc_buf_lsize(data), compress_type, complevel));
+ } else {
+ dbuf_set_data(db, arc_alloc_buf(dn->dn_objset->os_spa, db,
+ DBUF_GET_BUFC_TYPE(db), db->db.db_size));
+ }
- newdata = dbuf_alloc_arcbuf_from_arcbuf(db, data);
- dbuf_set_data(db, newdata);
rw_enter(&db->db_rwlock, RW_WRITER);
- bcopy(data->b_data, db->db.db_data, arc_buf_size(data));
+ memcpy(db->db.db_data, data->b_data, arc_buf_size(data));
rw_exit(&db->db_rwlock);
}
int
dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
boolean_t fail_sparse, boolean_t fail_uncached,
- void *tag, dmu_buf_impl_t **dbp)
+ const void *tag, dmu_buf_impl_t **dbp)
{
dmu_buf_impl_t *db, *parent = NULL;
+ uint64_t hv;
/* If the pool has been created, verify the tx_sync_lock is not held */
spa_t *spa = dn->dn_objset->os_spa;
*dbp = NULL;
/* dbuf_find() returns with db_mtx held */
- db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);
+ db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid, &hv);
if (db == NULL) {
blkptr_t *bp = NULL;
}
if (err && err != ENOENT)
return (err);
- db = dbuf_create(dn, level, blkid, parent, bp);
+ db = dbuf_create(dn, level, blkid, parent, bp, hv);
}
if (fail_uncached && db->db_state != DB_CACHED) {
dn->dn_object != DMU_META_DNODE_OBJECT &&
db->db_state == DB_CACHED && db->db_data_pending) {
dbuf_dirty_record_t *dr = db->db_data_pending;
- if (dr->dt.dl.dr_data == db->db_buf)
+ if (dr->dt.dl.dr_data == db->db_buf) {
+ ASSERT3P(db->db_buf, !=, NULL);
dbuf_hold_copy(dn, db);
+ }
}
if (multilist_link_active(&db->db_cache_link)) {
ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
db->db_caching_status == DB_DBUF_METADATA_CACHE);
- multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
+ multilist_remove(&dbuf_caches[db->db_caching_status].cache, db);
+
+ uint64_t size = db->db.db_size + dmu_buf_user_size(&db->db);
(void) zfs_refcount_remove_many(
- &dbuf_caches[db->db_caching_status].size,
- db->db.db_size, db);
+ &dbuf_caches[db->db_caching_status].size, size, db);
if (db->db_caching_status == DB_DBUF_METADATA_CACHE) {
DBUF_STAT_BUMPDOWN(metadata_cache_count);
} else {
DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]);
DBUF_STAT_BUMPDOWN(cache_count);
- DBUF_STAT_DECR(cache_levels_bytes[db->db_level],
- db->db.db_size);
+ DBUF_STAT_DECR(cache_levels_bytes[db->db_level], size);
}
db->db_caching_status = DB_NO_CACHE;
}
}
dmu_buf_impl_t *
-dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
+dbuf_hold(dnode_t *dn, uint64_t blkid, const void *tag)
{
return (dbuf_hold_level(dn, 0, blkid, tag));
}
dmu_buf_impl_t *
-dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
+dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, const void *tag)
{
dmu_buf_impl_t *db;
int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
ASSERT(dn->dn_bonus == NULL);
- dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
+ dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL,
+ dbuf_hash(dn->dn_objset, dn->dn_object, 0, DMU_BONUS_BLKID));
}
int
#pragma weak dmu_buf_add_ref = dbuf_add_ref
void
-dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
+dbuf_add_ref(dmu_buf_impl_t *db, const void *tag)
{
int64_t holds = zfs_refcount_add(&db->db_holds, tag);
VERIFY3S(holds, >, 1);
#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
boolean_t
dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
- void *tag)
+ const void *tag)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
dmu_buf_impl_t *found_db;
if (blkid == DMU_BONUS_BLKID)
found_db = dbuf_find_bonus(os, obj);
else
- found_db = dbuf_find(os, obj, 0, blkid);
+ found_db = dbuf_find(os, obj, 0, blkid, NULL);
if (found_db != NULL) {
if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
* dnode's parent dbuf evicting its dnode handles.
*/
void
-dbuf_rele(dmu_buf_impl_t *db, void *tag)
+dbuf_rele(dmu_buf_impl_t *db, const void *tag)
{
mutex_enter(&db->db_mtx);
dbuf_rele_and_unlock(db, tag, B_FALSE);
}
void
-dmu_buf_rele(dmu_buf_t *db, void *tag)
+dmu_buf_rele(dmu_buf_t *db, const void *tag)
{
dbuf_rele((dmu_buf_impl_t *)db, tag);
}
*
*/
void
-dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting)
+dbuf_rele_and_unlock(dmu_buf_impl_t *db, const void *tag, boolean_t evicting)
{
int64_t holds;
uint64_t size;
* This dbuf has anonymous data associated with it.
*/
dbuf_destroy(db);
- } else {
- boolean_t do_arc_evict = B_FALSE;
- blkptr_t bp;
- spa_t *spa = dmu_objset_spa(db->db_objset);
-
- if (!DBUF_IS_CACHEABLE(db) &&
- db->db_blkptr != NULL &&
- !BP_IS_HOLE(db->db_blkptr) &&
- !BP_IS_EMBEDDED(db->db_blkptr)) {
- do_arc_evict = B_TRUE;
- bp = *db->db_blkptr;
- }
-
- if (!DBUF_IS_CACHEABLE(db) ||
- db->db_pending_evict) {
- dbuf_destroy(db);
- } else if (!multilist_link_active(&db->db_cache_link)) {
- ASSERT3U(db->db_caching_status, ==,
- DB_NO_CACHE);
-
- dbuf_cached_state_t dcs =
- dbuf_include_in_metadata_cache(db) ?
- DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
- db->db_caching_status = dcs;
-
- multilist_insert(dbuf_caches[dcs].cache, db);
- size = zfs_refcount_add_many(
- &dbuf_caches[dcs].size,
- db->db.db_size, db);
-
- if (dcs == DB_DBUF_METADATA_CACHE) {
- DBUF_STAT_BUMP(metadata_cache_count);
- DBUF_STAT_MAX(
- metadata_cache_size_bytes_max,
- size);
- } else {
- DBUF_STAT_BUMP(
- cache_levels[db->db_level]);
- DBUF_STAT_BUMP(cache_count);
- DBUF_STAT_INCR(
- cache_levels_bytes[db->db_level],
- db->db.db_size);
- DBUF_STAT_MAX(cache_size_bytes_max,
- size);
- }
- mutex_exit(&db->db_mtx);
+ } else if (!(DBUF_IS_CACHEABLE(db) || db->db_partial_read) ||
+ db->db_pending_evict) {
+ dbuf_destroy(db);
+ } else if (!multilist_link_active(&db->db_cache_link)) {
+ ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
+
+ dbuf_cached_state_t dcs =
+ dbuf_include_in_metadata_cache(db) ?
+ DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
+ db->db_caching_status = dcs;
+
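+ /* Charge both the data size and any user-attached size to the cache. */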
+ multilist_insert(&dbuf_caches[dcs].cache, db);
+ uint64_t db_size = db->db.db_size +
+ dmu_buf_user_size(&db->db);
+ size = zfs_refcount_add_many(
+ &dbuf_caches[dcs].size, db_size, db);
+ uint8_t db_level = db->db_level;
+ mutex_exit(&db->db_mtx);
- if (dcs == DB_DBUF_CACHE && !evicting)
- dbuf_evict_notify(size);
+ if (dcs == DB_DBUF_METADATA_CACHE) {
+ DBUF_STAT_BUMP(metadata_cache_count);
+ DBUF_STAT_MAX(metadata_cache_size_bytes_max,
+ size);
+ } else {
+ DBUF_STAT_BUMP(cache_count);
+ DBUF_STAT_MAX(cache_size_bytes_max, size);
+ DBUF_STAT_BUMP(cache_levels[db_level]);
+ DBUF_STAT_INCR(cache_levels_bytes[db_level],
+ db_size);
}
- if (do_arc_evict)
- arc_freed(spa, &bp);
+ if (dcs == DB_DBUF_CACHE && !evicting)
+ dbuf_evict_notify(size);
}
} else {
mutex_exit(&db->db_mtx);
return (db->db_user);
}
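+/*
+ * Return the size of the user-attached payload, if any. It is tracked
+ * atomically and charged to the dbuf caches alongside db_size (see
+ * dbuf_rele_and_unlock()).
+ */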
+uint64_t
+dmu_buf_user_size(dmu_buf_t *db_fake)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ if (db->db_user == NULL)
+ return (0);
+ return (atomic_load_64(&db->db_user->dbu_size));
+}
+
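+/* Grow the user payload size; only legal while the dbuf is uncached. */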
+void
+dmu_buf_add_user_size(dmu_buf_t *db_fake, uint64_t nadd)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
+ ASSERT3P(db->db_user, !=, NULL);
+ ASSERT3U(atomic_load_64(&db->db_user->dbu_size), <, UINT64_MAX - nadd);
+ atomic_add_64(&db->db_user->dbu_size, nadd);
+}
+
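+/* Shrink the user payload size; only legal while the dbuf is uncached. */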
+void
+dmu_buf_sub_user_size(dmu_buf_t *db_fake, uint64_t nsub)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
+ ASSERT3P(db->db_user, !=, NULL);
+ ASSERT3U(atomic_load_64(&db->db_user->dbu_size), >=, nsub);
+ atomic_sub_64(&db->db_user->dbu_size, nsub);
+}
+
void
-dmu_buf_user_evict_wait()
+dmu_buf_user_evict_wait(void)
{
taskq_wait(dbu_evict_taskq);
}
return (dbi->db_objset);
}
-dnode_t *
-dmu_buf_dnode_enter(dmu_buf_t *db)
-{
- dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
- DB_DNODE_ENTER(dbi);
- return (DB_DNODE(dbi));
-}
-
-void
-dmu_buf_dnode_exit(dmu_buf_t *db)
-{
- dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
- DB_DNODE_EXIT(dbi);
-}
-
static void
dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
{
ASSERT0(db->db_level);
ASSERT(MUTEX_HELD(&db->db_mtx));
- ASSERT(DB_DNODE_HELD(db));
ASSERT(db->db_blkid == DMU_BONUS_BLKID);
ASSERT(data != NULL);
- dnode_t *dn = DB_DNODE(db);
+ dnode_t *dn = dr->dr_dnode;
ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=,
DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
- bcopy(data, DN_BONUS(dn->dn_phys), DN_MAX_BONUS_LEN(dn->dn_phys));
- DB_DNODE_EXIT(db);
+ memcpy(DN_BONUS(dn->dn_phys), data, DN_MAX_BONUS_LEN(dn->dn_phys));
dbuf_sync_leaf_verify_bonus_dnode(dr);
dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn;
- zio_t *zio;
+ dnode_t *dn = dr->dr_dnode;
ASSERT(dmu_tx_is_syncing(tx));
ASSERT3U(db->db_state, ==, DB_CACHED);
ASSERT(db->db_buf != NULL);
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
/* Indirect block size must match what the dnode thinks it is. */
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
dbuf_check_blkptr(dn, db);
- DB_DNODE_EXIT(db);
/* Provide the pending dirty record to child dbufs */
db->db_data_pending = dr;
dbuf_write(dr, db->db_buf, tx);
- zio = dr->dr_zio;
+ zio_t *zio = dr->dr_zio;
mutex_enter(&dr->dt.di.dr_mtx);
dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr)
{
#ifdef ZFS_DEBUG
- dnode_t *dn = DB_DNODE(dr->dr_dbuf);
+ dnode_t *dn = dr->dr_dnode;
/*
* Encrypted bonus buffers can have data past their bonuslen.
#endif
}
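+/*
+ * Find the block pointer that a lightweight (dbuf-less) dirty record
+ * will overwrite: one of the dnode's embedded blkptrs for a
+ * single-level object, or a slot in the level-1 indirect parent.
+ */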
+static blkptr_t *
+dbuf_lightweight_bp(dbuf_dirty_record_t *dr)
+{
+ /* This must be a lightweight dirty record. */
+ ASSERT3P(dr->dr_dbuf, ==, NULL);
+ dnode_t *dn = dr->dr_dnode;
+
+ if (dn->dn_phys->dn_nlevels == 1) {
+ VERIFY3U(dr->dt.dll.dr_blkid, <, dn->dn_phys->dn_nblkptr);
+ return (&dn->dn_phys->dn_blkptr[dr->dt.dll.dr_blkid]);
+ } else {
+ dmu_buf_impl_t *parent_db = dr->dr_parent->dr_dbuf;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ VERIFY3U(parent_db->db_level, ==, 1);
+ VERIFY3P(parent_db->db_dnode_handle->dnh_dnode, ==, dn);
+ VERIFY3U(dr->dt.dll.dr_blkid >> epbs, ==, parent_db->db_blkid);
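+ /* The low epbs bits of dr_blkid index into the parent's BP array. */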
+ blkptr_t *bp = parent_db->db.db_data;
+ return (&bp[dr->dt.dll.dr_blkid & ((1 << epbs) - 1)]);
+ }
+}
+
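+/*
+ * Ready callback for a lightweight write: account the space delta
+ * against the dnode, raise dn_maxblkid if the write extends the
+ * object, set the leaf fill count, and copy the new BP into the
+ * parent under its rwlock.
+ */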
+static void
+dbuf_lightweight_ready(zio_t *zio)
+{
+ dbuf_dirty_record_t *dr = zio->io_private;
+ blkptr_t *bp = zio->io_bp;
+
+ if (zio->io_error != 0)
+ return;
+
+ dnode_t *dn = dr->dr_dnode;
+
+ blkptr_t *bp_orig = dbuf_lightweight_bp(dr);
+ spa_t *spa = dmu_objset_spa(dn->dn_objset);
+ int64_t delta = bp_get_dsize_sync(spa, bp) -
+ bp_get_dsize_sync(spa, bp_orig);
+ dnode_diduse_space(dn, delta);
+
+ uint64_t blkid = dr->dt.dll.dr_blkid;
+ mutex_enter(&dn->dn_mtx);
+ if (blkid > dn->dn_phys->dn_maxblkid) {
+ ASSERT0(dn->dn_objset->os_raw_receive);
+ dn->dn_phys->dn_maxblkid = blkid;
+ }
+ mutex_exit(&dn->dn_mtx);
+
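+ /*
+ * Leaf fill is 0 for a hole and 1 otherwise; embedded BPs carry
+ * data in place of a fill count.
+ */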
+ if (!BP_IS_EMBEDDED(bp)) {
+ uint64_t fill = BP_IS_HOLE(bp) ? 0 : 1;
+ BP_SET_FILL(bp, fill);
+ }
+
+ dmu_buf_impl_t *parent_db;
+ EQUIV(dr->dr_parent == NULL, dn->dn_phys->dn_nlevels == 1);
+ if (dr->dr_parent == NULL) {
+ parent_db = dn->dn_dbuf;
+ } else {
+ parent_db = dr->dr_parent->dr_dbuf;
+ }
+ rw_enter(&parent_db->db_rwlock, RW_WRITER);
+ *bp_orig = *bp;
+ rw_exit(&parent_db->db_rwlock);
+}
+
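+/*
+ * Done callback for a lightweight write: update dataset block
+ * accounting (unless this was a rewrite or nopwrite of the same BP),
+ * retire the dirty space, and free the record and its abd.
+ */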
+static void
+dbuf_lightweight_done(zio_t *zio)
+{
+ dbuf_dirty_record_t *dr = zio->io_private;
+
+ VERIFY0(zio->io_error);
+
+ objset_t *os = dr->dr_dnode->dn_objset;
+ dmu_tx_t *tx = os->os_synctx;
+
+ if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
+ ASSERT(BP_EQUAL(zio->io_bp, &zio->io_bp_orig));
+ } else {
+ dsl_dataset_t *ds = os->os_dsl_dataset;
+ (void) dsl_dataset_block_kill(ds, &zio->io_bp_orig, tx, B_TRUE);
+ dsl_dataset_block_born(ds, zio->io_bp, tx);
+ }
+
+ dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
+ zio->io_txg);
+
+ abd_free(dr->dt.dll.dr_abd);
+ kmem_free(dr, sizeof (*dr));
+}
+
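+/*
+ * Issue the write for a lightweight dirty record. The zio is chained
+ * under the dnode's zio for single-level objects, or under the parent
+ * indirect's zio otherwise.
+ */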
+noinline static void
+dbuf_sync_lightweight(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
+{
+ dnode_t *dn = dr->dr_dnode;
+ zio_t *pio;
+ if (dn->dn_phys->dn_nlevels == 1) {
+ pio = dn->dn_zio;
+ } else {
+ pio = dr->dr_parent->dr_zio;
+ }
+
+ zbookmark_phys_t zb = {
+ .zb_objset = dmu_objset_id(dn->dn_objset),
+ .zb_object = dn->dn_object,
+ .zb_level = 0,
+ .zb_blkid = dr->dt.dll.dr_blkid,
+ };
+
+ /*
+ * See comment in dbuf_write(). This is so that zio->io_bp_orig
+ * will have the old BP in dbuf_lightweight_done().
+ */
+ dr->dr_bp_copy = *dbuf_lightweight_bp(dr);
+
+ dr->dr_zio = zio_write(pio, dmu_objset_spa(dn->dn_objset),
+ dmu_tx_get_txg(tx), &dr->dr_bp_copy, dr->dt.dll.dr_abd,
+ dn->dn_datablksz, abd_get_size(dr->dt.dll.dr_abd),
+ &dr->dt.dll.dr_props, dbuf_lightweight_ready, NULL,
+ dbuf_lightweight_done, dr, ZIO_PRIORITY_ASYNC_WRITE,
+ ZIO_FLAG_MUSTSUCCEED | dr->dt.dll.dr_flags, &zb);
+
+ zio_nowait(dr->dr_zio);
+}
+
/*
* dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is
* critical that we not allow the compiler to inline this function into
{
arc_buf_t **datap = &dr->dt.dl.dr_data;
dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn;
+ dnode_t *dn = dr->dr_dnode;
objset_t *os;
uint64_t txg = tx->tx_txg;
} else if (db->db_state == DB_FILL) {
/* This buffer was freed and is now being re-filled */
ASSERT(db->db.db_data != dr->dt.dl.dr_data);
+ } else if (db->db_state == DB_READ) {
+ /*
+ * This buffer has a clone we need to write, and an in-flight
+ * read on the BP we're about to clone. It's safe to issue the
+ * write here because the read has already been issued and the
+ * contents won't change.
+ */
+ ASSERT(dr->dt.dl.dr_brtwrite &&
+ dr->dt.dl.dr_override_state == DR_OVERRIDDEN);
} else {
ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
}
DBUF_VERIFY(db);
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
-
if (db->db_blkid == DMU_SPILL_BLKID) {
mutex_enter(&dn->dn_mtx);
if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
cv_wait(&db->db_changed, &db->db_mtx);
- ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
}
/*
* objects only modified in the syncing context (e.g.
* DMU_META_DNODE blocks).
*/
- *datap = dbuf_alloc_arcbuf_from_arcbuf(db, db->db_buf);
- bcopy(db->db.db_data, (*datap)->b_data, arc_buf_size(*datap));
+ int psize = arc_buf_size(*datap);
+ int lsize = arc_buf_lsize(*datap);
+ arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
+ enum zio_compress compress_type = arc_get_compression(*datap);
+ uint8_t complevel = arc_get_complevel(*datap);
+
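+ /*
+ * Allocate a twin of the current buffer (raw, compressed, or
+ * plain) and copy into it, so the syncing write is isolated
+ * from further open-txg modifications.
+ */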
+ if (arc_is_encrypted(*datap)) {
+ boolean_t byteorder;
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ uint8_t iv[ZIO_DATA_IV_LEN];
+ uint8_t mac[ZIO_DATA_MAC_LEN];
+
+ arc_get_raw_params(*datap, &byteorder, salt, iv, mac);
+ *datap = arc_alloc_raw_buf(os->os_spa, db,
+ dmu_objset_id(os), byteorder, salt, iv, mac,
+ dn->dn_type, psize, lsize, compress_type,
+ complevel);
+ } else if (compress_type != ZIO_COMPRESS_OFF) {
+ ASSERT3U(type, ==, ARC_BUFC_DATA);
+ *datap = arc_alloc_compressed_buf(os->os_spa, db,
+ psize, lsize, compress_type, complevel);
+ } else {
+ *datap = arc_alloc_buf(os->os_spa, db, type, psize);
+ }
+ memcpy((*datap)->b_data, db->db.db_data, psize);
}
db->db_data_pending = dr;
ASSERT(!list_link_active(&dr->dr_dirty_node));
if (dn->dn_object == DMU_META_DNODE_OBJECT) {
list_insert_tail(&dn->dn_dirty_records[txg & TXG_MASK], dr);
- DB_DNODE_EXIT(db);
} else {
- /*
- * Although zio_nowait() does not "wait for an IO", it does
- * initiate the IO. If this is an empty write it seems plausible
- * that the IO could actually be completed before the nowait
- * returns. We need to DB_DNODE_EXIT() first in case
- * zio_nowait() invalidates the dbuf.
- */
- DB_DNODE_EXIT(db);
zio_nowait(dr->dr_zio);
}
}
+/*
+ * Syncs out a range of dirty records for indirect or leaf dbufs. May be
+ * called recursively from dbuf_sync_indirect().
+ */
void
dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
{
DMU_META_DNODE_OBJECT);
break;
}
- if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
- dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
- VERIFY3U(dr->dr_dbuf->db_level, ==, level);
- }
list_remove(list, dr);
- if (dr->dr_dbuf->db_level > 0)
- dbuf_sync_indirect(dr, tx);
- else
- dbuf_sync_leaf(dr, tx);
+ if (dr->dr_dbuf == NULL) {
+ dbuf_sync_lightweight(dr, tx);
+ } else {
+ if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+ dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
+ VERIFY3U(dr->dr_dbuf->db_level, ==, level);
+ }
+ if (dr->dr_dbuf->db_level > 0)
+ dbuf_sync_indirect(dr, tx);
+ else
+ dbuf_sync_leaf(dr, tx);
+ }
}
}
-/* ARGSUSED */
static void
dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
+ (void) buf;
dmu_buf_impl_t *db = vdb;
dnode_t *dn;
blkptr_t *bp = zio->io_bp;
dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
zio->io_prev_space_delta = delta;
- if (bp->blk_birth != 0) {
+ if (BP_GET_LOGICAL_BIRTH(bp) != 0) {
ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
BP_GET_TYPE(bp) == dn->dn_type) ||
(db->db_blkid == DMU_SPILL_BLKID &&
i += DNODE_MIN_SIZE;
if (dnp->dn_type != DMU_OT_NONE) {
fill++;
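+ /*
+ * Sanity-check each embedded BP, skipping checks
+ * that would need the SPA config lock; failed
+ * verification halts.
+ */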
+ for (int j = 0; j < dnp->dn_nblkptr;
+ j++) {
+ (void) zfs_blkptr_verify(spa,
+ &dnp->dn_blkptr[j],
+ BLK_CONFIG_SKIP,
+ BLK_VERIFY_HALT);
+ }
+ if (dnp->dn_flags &
+ DNODE_FLAG_SPILL_BLKPTR) {
+ (void) zfs_blkptr_verify(spa,
+ DN_SPILL_BLKPTR(dnp),
+ BLK_CONFIG_SKIP,
+ BLK_VERIFY_HALT);
+ }
i += dnp->dn_extra_slots *
DNODE_MIN_SIZE;
}
for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
if (BP_IS_HOLE(ibp))
continue;
+ (void) zfs_blkptr_verify(spa, ibp,
+ BLK_CONFIG_SKIP, BLK_VERIFY_HALT);
fill += BP_GET_FILL(ibp);
}
}
dmu_buf_unlock_parent(db, dblt, FTAG);
}
-/* ARGSUSED */
/*
* This function gets called just prior to running through the compression
* stage of the zio pipeline. If we're an indirect block comprised of only
static void
dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
{
+ (void) zio, (void) buf;
dmu_buf_impl_t *db = vdb;
dnode_t *dn;
blkptr_t *bp;
* zero out.
*/
rw_enter(&db->db_rwlock, RW_WRITER);
- bzero(db->db.db_data, db->db.db_size);
+ memset(db->db.db_data, 0, db->db.db_size);
rw_exit(&db->db_rwlock);
}
DB_DNODE_EXIT(db);
}
-/*
- * The SPA will call this callback several times for each zio - once
- * for every physical child i/o (zio->io_phys_children times). This
- * allows the DMU to monitor the progress of each logical i/o. For example,
- * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
- * block. There may be a long delay before all copies/fragments are completed,
- * so this callback allows us to retire dirty space gradually, as the physical
- * i/os complete.
- */
-/* ARGSUSED */
-static void
-dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
-{
- dmu_buf_impl_t *db = arg;
- objset_t *os = db->db_objset;
- dsl_pool_t *dp = dmu_objset_pool(os);
- dbuf_dirty_record_t *dr;
- int delta = 0;
-
- dr = db->db_data_pending;
- ASSERT3U(dr->dr_txg, ==, zio->io_txg);
-
- /*
- * The callback will be called io_phys_children times. Retire one
- * portion of our dirty space each time we are called. Any rounding
- * error will be cleaned up by dbuf_write_done().
- */
- delta = dr->dr_accounted / zio->io_phys_children;
- dsl_pool_undirty_space(dp, delta, zio->io_txg);
-}
-
-/* ARGSUSED */
static void
dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
{
+ (void) buf;
dmu_buf_impl_t *db = vdb;
blkptr_t *bp_orig = &zio->io_bp_orig;
blkptr_t *bp = db->db_blkptr;
objset_t *os = db->db_objset;
dmu_tx_t *tx = os->os_synctx;
- dbuf_dirty_record_t *dr;
ASSERT0(zio->io_error);
ASSERT(db->db_blkptr == bp);
DBUF_VERIFY(db);
- dr = db->db_data_pending;
+ dbuf_dirty_record_t *dr = db->db_data_pending;
+ dnode_t *dn = dr->dr_dnode;
ASSERT(!list_link_active(&dr->dr_dirty_node));
ASSERT(dr->dr_dbuf == db);
ASSERT(list_next(&db->db_dirty_records, dr) == NULL);
#ifdef ZFS_DEBUG
if (db->db_blkid == DMU_SPILL_BLKID) {
- dnode_t *dn;
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
- DB_DNODE_EXIT(db);
}
#endif
ASSERT(db->db_blkid != DMU_BONUS_BLKID);
ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
if (db->db_state != DB_NOFILL) {
- if (dr->dt.dl.dr_data != db->db_buf)
+ if (dr->dt.dl.dr_data != NULL &&
+ dr->dt.dl.dr_data != db->db_buf) {
arc_buf_destroy(dr->dt.dl.dr_data, db);
+ }
}
} else {
- dnode_t *dn;
-
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
if (!BP_IS_HOLE(db->db_blkptr)) {
ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
db->db.db_size);
}
- DB_DNODE_EXIT(db);
mutex_destroy(&dr->dt.di.dr_mtx);
list_destroy(&dr->dt.di.dr_children);
}
db->db_data_pending = NULL;
dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
- /*
- * If we didn't do a physical write in this ZIO and we
- * still ended up here, it means that the space of the
- * dbuf that we just released (and undirtied) above hasn't
- * been marked as undirtied in the pool's accounting.
- *
- * Thus, we undirty that space in the pool's view of the
- * world here. For physical writes this type of update
- * happens in dbuf_write_physdone().
- *
- * If we did a physical write, cleanup any rounding errors
- * that came up due to writing multiple copies of a block
- * on disk [see dbuf_write_physdone()].
- */
- if (zio->io_phys_children == 0) {
- dsl_pool_undirty_space(dmu_objset_pool(os),
- dr->dr_accounted, zio->io_txg);
- } else {
- dsl_pool_undirty_space(dmu_objset_pool(os),
- dr->dr_accounted % zio->io_phys_children, zio->io_txg);
- }
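+ /* Retire all of this record's accounted dirty space at completion. */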
+ dsl_pool_undirty_space(dmu_objset_pool(os), dr->dr_accounted,
+ zio->io_txg);
kmem_free(dr, sizeof (dbuf_dirty_record_t));
}
dbuf_write_done(zio, NULL, db);
if (zio->io_abd != NULL)
- abd_put(zio->io_abd);
+ abd_free(zio->io_abd);
}
typedef struct dbuf_remap_impl_callback_arg {
ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
drica.drica_os = dn->dn_objset;
- drica.drica_blk_birth = bp->blk_birth;
+ drica.drica_blk_birth = BP_GET_LOGICAL_BIRTH(bp);
drica.drica_tx = tx;
if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
&drica)) {
if (dn->dn_objset != spa_meta_objset(spa)) {
dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset);
if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
- bp->blk_birth > ds->ds_dir->dd_origin_txg) {
+ BP_GET_LOGICAL_BIRTH(bp) >
+ ds->ds_dir->dd_origin_txg) {
ASSERT(!BP_IS_EMBEDDED(bp));
ASSERT(dsl_dir_is_clone(ds->ds_dir));
ASSERT(spa_feature_is_enabled(spa,
}
-/* Issue I/O to commit a dirty buffer to disk. */
+/*
+ * Populate dr->dr_zio with a zio to commit a dirty buffer to disk.
+ * The caller is responsible for issuing zio_[no]wait() on dr->dr_zio.
+ */
static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = dr->dr_dbuf;
- dnode_t *dn;
+ dnode_t *dn = dr->dr_dnode;
objset_t *os;
dmu_buf_impl_t *parent = db->db_parent;
uint64_t txg = tx->tx_txg;
ASSERT(dmu_tx_is_syncing(tx));
- DB_DNODE_ENTER(db);
- dn = DB_DNODE(db);
os = dn->dn_objset;
if (db->db_state != DB_NOFILL) {
}
ASSERT(db->db_level == 0 || data == db->db_buf);
- ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
+ ASSERT3U(BP_GET_LOGICAL_BIRTH(db->db_blkptr), <=, txg);
ASSERT(pio);
SET_BOOKMARK(&zb, os->os_dsl_dataset ?
wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
- DB_DNODE_EXIT(db);
/*
* We copy the blkptr now (rather than when we instantiate the dirty
dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy,
contents, db->db.db_size, db->db.db_size, &zp,
- dbuf_write_override_ready, NULL, NULL,
+ dbuf_write_override_ready, NULL,
dbuf_write_override_done,
dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
mutex_enter(&db->db_mtx);
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
- dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
+ dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite,
+ dr->dt.dl.dr_brtwrite);
mutex_exit(&db->db_mtx);
} else if (db->db_state == DB_NOFILL) {
ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
dr->dr_zio = zio_write(pio, os->os_spa, txg,
&dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
- dbuf_write_nofill_ready, NULL, NULL,
+ dbuf_write_nofill_ready, NULL,
dbuf_write_nofill_done, db,
ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
children_ready_cb = dbuf_write_children_ready;
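+ /*
+ * The uncached flag lets ARC drop the buffer once written
+ * when the dbuf itself won't be cached; dbuf_is_l2cacheable()
+ * gates L2ARC eligibility.
+ */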
dr->dr_zio = arc_write(pio, os->os_spa, txg,
- &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db),
- &zp, dbuf_write_ready,
- children_ready_cb, dbuf_write_physdone,
- dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE,
- ZIO_FLAG_MUSTSUCCEED, &zb);
+ &dr->dr_bp_copy, data, !DBUF_IS_CACHEABLE(db),
+ dbuf_is_l2cacheable(db), &zp, dbuf_write_ready,
+ children_ready_cb, dbuf_write_done, db,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
}
}
EXPORT_SYMBOL(dmu_buf_set_crypt_params);
EXPORT_SYMBOL(dmu_buf_will_dirty);
EXPORT_SYMBOL(dmu_buf_is_dirty);
+EXPORT_SYMBOL(dmu_buf_will_clone);
EXPORT_SYMBOL(dmu_buf_will_not_fill);
EXPORT_SYMBOL(dmu_buf_will_fill);
EXPORT_SYMBOL(dmu_buf_fill_done);
EXPORT_SYMBOL(dmu_buf_get_user);
EXPORT_SYMBOL(dmu_buf_get_blkptr);
-/* BEGIN CSTYLED */
-ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, ULONG, ZMOD_RW,
+ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, U64, ZMOD_RW,
"Maximum size in bytes of the dbuf cache.");
ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW,
- "Percentage over dbuf_cache_max_bytes when dbufs must be evicted "
- "directly.");
+ "Percentage over dbuf_cache_max_bytes for direct dbuf eviction.");
ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, lowater_pct, UINT, ZMOD_RW,
- "Percentage below dbuf_cache_max_bytes when the evict thread stops "
- "evicting dbufs.");
+ "Percentage below dbuf_cache_max_bytes when dbuf eviction stops.");
+
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, U64, ZMOD_RW,
+ "Maximum size in bytes of dbuf metadata cache.");
-ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, ULONG, ZMOD_RW,
- "Maximum size in bytes of the dbuf metadata cache.");
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, UINT, ZMOD_RW,
+ "Set size of dbuf cache to log2 fraction of arc size.");
-ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, INT, ZMOD_RW,
- "Set the size of the dbuf cache to a log2 fraction of arc size.");
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, UINT, ZMOD_RW,
+ "Set size of dbuf metadata cache to log2 fraction of arc size.");
-ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, INT, ZMOD_RW,
- "Set the size of the dbuf metadata cache to a log2 fraction of arc "
- "size.");
-/* END CSTYLED */
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, mutex_cache_shift, UINT, ZMOD_RD,
+ "Set size of dbuf cache mutex array as log2 shift.");