/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
- * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
*/
* Dbufs that are aged out of the cache will be immediately destroyed and
* become eligible for arc eviction.
*/
-static multilist_t dbuf_cache;
+static multilist_t *dbuf_cache;
static refcount_t dbuf_cache_size;
unsigned long dbuf_cache_max_bytes = 100 * 1024 * 1024;
cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
multilist_link_init(&db->db_cache_link);
refcount_create(&db->db_holds);
- multilist_link_init(&db->db_cache_link);
return (0);
}
dbuf_evict_user(dmu_buf_impl_t *db)
{
dmu_buf_user_t *dbu = db->db_user;
- boolean_t has_async;
ASSERT(MUTEX_HELD(&db->db_mtx));
* containing the dbu. In that case we need to take care to not
* dereference dbu after calling the sync evict func.
*/
- has_async = (dbu->dbu_evict_func_async != NULL);
+ boolean_t has_async = (dbu->dbu_evict_func_async != NULL);
if (dbu->dbu_evict_func_sync != NULL)
dbu->dbu_evict_func_sync(dbu);
multilist_get_num_sublists(ml));
}
+static inline uint64_t
+dbuf_cache_target_bytes(void)
+{
+	return (MIN(dbuf_cache_max_bytes,
+	    arc_target_bytes() >> dbuf_cache_max_shift));
+}
+
static inline boolean_t
dbuf_cache_above_hiwater(void)
{
+ uint64_t dbuf_cache_target = dbuf_cache_target_bytes();
+
uint64_t dbuf_cache_hiwater_bytes =
- (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100;
+ (dbuf_cache_target * dbuf_cache_hiwater_pct) / 100;
return (refcount_count(&dbuf_cache_size) >
- dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes);
+ dbuf_cache_target + dbuf_cache_hiwater_bytes);
}
static inline boolean_t
dbuf_cache_above_lowater(void)
{
+ uint64_t dbuf_cache_target = dbuf_cache_target_bytes();
+
uint64_t dbuf_cache_lowater_bytes =
- (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100;
+ (dbuf_cache_target * dbuf_cache_lowater_pct) / 100;
return (refcount_count(&dbuf_cache_size) >
- dbuf_cache_max_bytes - dbuf_cache_lowater_bytes);
+ dbuf_cache_target - dbuf_cache_lowater_bytes);
}
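As a worked example of the watermark math above, a minimal standalone
sketch, assuming the default tunables (hiwater and lowater percentages
of 10, dbuf_cache_max_shift of 5) and a hypothetical 4 GiB ARC target;
this is illustration only, not part of the patch:

#include <stdio.h>
#include <stdint.h>

#define	MIN(a, b)	((a) < (b) ? (a) : (b))

int
main(void)
{
	uint64_t arc_target = 4ULL << 30;		/* assumed ARC target */
	uint64_t cache_max = 100ULL * 1024 * 1024;	/* dbuf_cache_max_bytes */
	/* dbuf_cache_target_bytes(): 1/32nd of the ARC, capped at the max */
	uint64_t target = MIN(cache_max, arc_target >> 5);

	uint64_t hiwater = target + (target * 10) / 100;	/* 110 MiB */
	uint64_t lowater = target - (target * 10) / 100;	/* 90 MiB */

	/*
	 * Above hiwater, callers evict directly; the evict thread then
	 * works the cache back down until it falls below lowater.
	 */
	printf("target %llu hiwater %llu lowater %llu\n",
	    (unsigned long long)target, (unsigned long long)hiwater,
	    (unsigned long long)lowater);
	return (0);
}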
/*
static void
dbuf_evict_one(void)
{
- int idx = multilist_get_random_index(&dbuf_cache);
- multilist_sublist_t *mls = multilist_sublist_lock(&dbuf_cache, idx);
- dmu_buf_impl_t *db;
+ int idx = multilist_get_random_index(dbuf_cache);
+ multilist_sublist_t *mls = multilist_sublist_lock(dbuf_cache, idx);
+
ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
/*
ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL);
(void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE);
- db = multilist_sublist_tail(mls);
+ dmu_buf_impl_t *db = multilist_sublist_tail(mls);
while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
db = multilist_sublist_prev(mls, db);
}
* of the dbuf cache is at or below the maximum size. Once the dbuf is aged
* out of the cache it is destroyed and becomes eligible for arc eviction.
*/
+/* ARGSUSED */
static void
-dbuf_evict_thread(void)
+dbuf_evict_thread(void *unused)
{
callb_cpr_t cpr;
if (tsd_get(zfs_dbuf_evict_key) != NULL)
return;
- if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) {
- boolean_t evict_now = B_FALSE;
-
- mutex_enter(&dbuf_evict_lock);
- if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) {
- evict_now = dbuf_cache_above_hiwater();
- cv_signal(&dbuf_evict_cv);
- }
- mutex_exit(&dbuf_evict_lock);
-
- if (evict_now) {
+ /*
+ * We check if we should evict without holding the dbuf_evict_lock,
+ * because it's OK to occasionally make the wrong decision here,
+ * and grabbing the lock results in massive lock contention.
+ */
+ if (refcount_count(&dbuf_cache_size) > dbuf_cache_target_bytes()) {
+ if (dbuf_cache_above_hiwater())
dbuf_evict_one();
- }
+ cv_signal(&dbuf_evict_cv);
}
}
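The unlocked fast path in isolation: a minimal pthreads sketch of the
same pattern, not taken from the patch, in which a stale size reading
is tolerated because a missed or extra wakeup only delays eviction:

#include <pthread.h>
#include <stdatomic.h>

static atomic_long cache_size;		/* stands in for dbuf_cache_size */
static long cache_target = 100L * 1024 * 1024;
static pthread_mutex_t evict_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t evict_cv = PTHREAD_COND_INITIALIZER;

static void
evict_notify(void)
{
	/*
	 * Check the size without taking evict_lock: a stale value can
	 * only cause a slightly early or late wakeup, whereas taking
	 * the lock here on every call would serialize all writers.
	 */
	if (atomic_load(&cache_size) > cache_target) {
		pthread_mutex_lock(&evict_lock);
		pthread_cond_signal(&evict_cv);	/* wake the evict thread */
		pthread_mutex_unlock(&evict_lock);
	}
}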
* dbuf cache to 1/32nd (default) of the size of the ARC.
*/
dbuf_cache_max_bytes = MIN(dbuf_cache_max_bytes,
- arc_max_bytes() >> dbuf_cache_max_shift);
+ arc_target_bytes() >> dbuf_cache_max_shift);
/*
* All entries are queued via taskq_dispatch_ent(), so min/maxalloc
*/
dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0);
- multilist_create(&dbuf_cache, sizeof (dmu_buf_impl_t),
+ dbuf_cache = multilist_create(sizeof (dmu_buf_impl_t),
offsetof(dmu_buf_impl_t, db_cache_link),
dbuf_cache_multilist_index_func);
refcount_create(&dbuf_cache_size);
cv_destroy(&dbuf_evict_cv);
refcount_destroy(&dbuf_cache_size);
- multilist_destroy(&dbuf_cache);
+ multilist_destroy(dbuf_cache);
}
/*
ASSERT(buf[i] == 0);
}
} else {
- int i;
blkptr_t *bps = db->db.db_data;
ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==,
db->db.db_size);
* We iterate through each blkptr and verify
* they only have those fields set.
*/
- for (i = 0;
+ for (int i = 0;
i < db->db.db_size / sizeof (blkptr_t);
i++) {
blkptr_t *bp = &bps[i];
}
static void
-dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
+dbuf_read_done(zio_t *zio, int err, arc_buf_t *buf, void *vdb)
{
dmu_buf_impl_t *db = vdb;
db->db_freed_in_flight = FALSE;
dbuf_set_data(db, buf);
db->db_state = DB_CACHED;
- } else if (zio == NULL || zio->io_error == 0) {
+ } else if (err == 0) {
dbuf_set_data(db, buf);
db->db_state = DB_CACHED;
} else {
dnode_t *dn;
zbookmark_phys_t zb;
uint32_t aflags = ARC_FLAG_NOWAIT;
- int err;
+ int err, zio_flags = 0;
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
*/
int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
+ arc_buf_t *dn_buf = (dn->dn_dbuf != NULL) ?
+ dn->dn_dbuf->db_buf : NULL;
+
+ /* if the underlying dnode block is encrypted, decrypt it */
+ if (dn_buf != NULL && dn->dn_objset->os_encrypted &&
+ DMU_OT_IS_ENCRYPTED(dn->dn_bonustype) &&
+ (flags & DB_RF_NO_DECRYPT) == 0 &&
+ arc_is_encrypted(dn_buf)) {
+ err = arc_untransform(dn_buf, dn->dn_objset->os_spa,
+ dmu_objset_id(dn->dn_objset), B_TRUE);
+ if (err != 0) {
+ DB_DNODE_EXIT(db);
+ mutex_exit(&db->db_mtx);
+ return (err);
+ }
+ }
ASSERT3U(bonuslen, <=, db->db.db_size);
db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP);
BP_IS_HOLE(db->db_blkptr) &&
db->db_blkptr->blk_birth != 0) {
blkptr_t *bps = db->db.db_data;
- int i;
- for (i = 0; i < ((1 <<
+ for (int i = 0; i < ((1 <<
DB_DNODE(db)->dn_indblkshift) / sizeof (blkptr_t));
i++) {
blkptr_t *bp = &bps[i];
db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
db->db.db_object, db->db_level, db->db_blkid);
+ /*
+ * All bps of an encrypted os should have the encryption bit set.
+ * If this is not true it indicates tampering and we report an error.
+ */
+ if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) {
+ spa_log_error(db->db_objset->os_spa, &zb);
+ zfs_panic_recover("unencrypted block in encrypted "
+ "object set %llu", dmu_objset_id(db->db_objset));
+ return (SET_ERROR(EIO));
+ }
+
dbuf_add_ref(db, NULL);
+ zio_flags = (flags & DB_RF_CANFAIL) ?
+ ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED;
+
+ if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr))
+ zio_flags |= ZIO_FLAG_RAW;
+
err = arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
- dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
- (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
+ dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags,
&aflags, &zb);
return (err);
 * or (if there are no active holders)
* just null out the current db_data pointer.
*/
- ASSERT(dr->dr_txg >= txg - 2);
+ ASSERT3U(dr->dr_txg, >=, txg - 2);
if (db->db_blkid == DMU_BONUS_BLKID) {
dnode_t *dn = DB_DNODE(db);
int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
arc_space_consume(bonuslen, ARC_SPACE_BONUS);
bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen);
} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
+ dnode_t *dn = DB_DNODE(db);
int size = arc_buf_size(db->db_buf);
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
spa_t *spa = db->db_objset->os_spa;
enum zio_compress compress_type =
arc_get_compression(db->db_buf);
- if (compress_type == ZIO_COMPRESS_OFF) {
- dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
- } else {
+ if (arc_is_encrypted(db->db_buf)) {
+ boolean_t byteorder;
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ uint8_t iv[ZIO_DATA_IV_LEN];
+ uint8_t mac[ZIO_DATA_MAC_LEN];
+
+ arc_get_raw_params(db->db_buf, &byteorder, salt,
+ iv, mac);
+ dr->dt.dl.dr_data = arc_alloc_raw_buf(spa, db,
+ dmu_objset_id(dn->dn_objset), byteorder, salt, iv,
+ mac, dn->dn_type, size, arc_buf_lsize(db->db_buf),
+ compress_type);
+ } else if (compress_type != ZIO_COMPRESS_OFF) {
ASSERT3U(type, ==, ARC_BUFC_DATA);
dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
size, arc_buf_lsize(db->db_buf), compress_type);
+ } else {
+ dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
}
bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
} else {
dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
{
int err = 0;
- boolean_t havepzio = (zio != NULL);
boolean_t prefetch;
dnode_t *dn;
mutex_enter(&db->db_mtx);
if (db->db_state == DB_CACHED) {
+ spa_t *spa = dn->dn_objset->os_spa;
+
/*
- * If the arc buf is compressed, we need to decompress it to
- * read the data. This could happen during the "zfs receive" of
- * a stream which is compressed and deduplicated.
+ * If the arc buf is compressed or encrypted, we need to
+ * untransform it to read the data. This could happen during
+ * the "zfs receive" of a stream which is deduplicated and
+ * either raw or compressed. We do not need to do this if the
+ * caller wants raw encrypted data.
*/
- if (db->db_buf != NULL &&
- arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF) {
- dbuf_fix_old_data(db,
- spa_syncing_txg(dmu_objset_spa(db->db_objset)));
- err = arc_decompress(db->db_buf);
+ if (db->db_buf != NULL && (flags & DB_RF_NO_DECRYPT) == 0 &&
+ (arc_is_encrypted(db->db_buf) ||
+ arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
+ dbuf_fix_old_data(db, spa_syncing_txg(spa));
+ err = arc_untransform(db->db_buf, spa,
+ dmu_objset_id(db->db_objset), B_FALSE);
dbuf_set_data(db, db->db_buf);
}
mutex_exit(&db->db_mtx);
DB_DNODE_EXIT(db);
} else if (db->db_state == DB_UNCACHED) {
spa_t *spa = dn->dn_objset->os_spa;
+ boolean_t need_wait = B_FALSE;
if (zio == NULL &&
- db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr))
+ db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
-
+ need_wait = B_TRUE;
+ }
err = dbuf_read_impl(db, zio, flags);
/* dbuf_read_impl has dropped db_mtx for us */
rw_exit(&dn->dn_struct_rwlock);
DB_DNODE_EXIT(db);
- if (!err && !havepzio && zio != NULL)
+ if (!err && need_wait)
err = zio_wait(zio);
} else {
/*
mutex_exit(&db->db_mtx);
}
- ASSERT(err || havepzio || db->db_state == DB_CACHED);
return (err);
}
uint64_t txg = dr->dr_txg;
ASSERT(MUTEX_HELD(&db->db_mtx));
+ /*
+ * This assert is valid because dmu_sync() expects to be called by
+ * a zilog's get_data while holding a range lock. This call only
+ * comes from dbuf_dirty() callers who must also hold a range lock.
+ */
ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
ASSERT(db->db_level == 0);
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
dr->dt.dl.dr_nopwrite = B_FALSE;
+ dr->dt.dl.dr_raw = B_FALSE;
/*
* Release the already-written buffer, so we leave it in
(dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
ASSERT3U(dn->dn_nlevels, >, db->db_level);
- ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
- dn->dn_phys->dn_nlevels > db->db_level ||
- dn->dn_next_nlevels[txgoff] > db->db_level ||
- dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
- dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
/*
* We should only be dirtying in syncing context if it's the
* this assertion only if we're not already dirty.
*/
os = dn->dn_objset;
+ VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa));
#ifdef DEBUG
if (dn->dn_objset->os_dsl_dataset != NULL)
rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG);
drop_struct_lock = TRUE;
}
+ /*
+ * We need to hold the dn_struct_rwlock to make this assertion,
+ * because it protects dn_phys / dn_next_nlevels from changing.
+ */
+ ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
+ dn->dn_phys->dn_nlevels > db->db_level ||
+ dn->dn_next_nlevels[txgoff] > db->db_level ||
+ dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
+ dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
+
/*
* If we are overwriting a dedup BP, then unless it is snapshotted,
* when we get to syncing context we will need to decrement its
return (B_FALSE);
}
-void
-dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
+static void
+dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
- int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
- dbuf_dirty_record_t *dr;
ASSERT(tx->tx_txg != 0);
ASSERT(!refcount_is_zero(&db->db_holds));
*/
mutex_enter(&db->db_mtx);
+ dbuf_dirty_record_t *dr;
for (dr = db->db_last_dirty;
dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
/*
DB_DNODE_ENTER(db);
if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
- rf |= DB_RF_HAVESTRUCT;
+ flags |= DB_RF_HAVESTRUCT;
DB_DNODE_EXIT(db);
- (void) dbuf_read(db, NULL, rf);
+ (void) dbuf_read(db, NULL, flags);
(void) dbuf_dirty(db, tx);
}
+void
+dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_will_dirty_impl(db_fake,
+ DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx);
+}
+
void
dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
{
(void) dbuf_dirty(db, tx);
}
+/*
+ * This function is effectively the same as dmu_buf_will_dirty(), but
+ * indicates the caller expects raw encrypted data in the db. It will
+ * also set the raw flag on the created dirty record.
+ */
+void
+dmu_buf_will_change_crypt_params(dmu_buf_t *db_fake, dmu_tx_t *tx)
+{
+ dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
+ dbuf_dirty_record_t *dr;
+
+ dmu_buf_will_dirty_impl(db_fake,
+ DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx);
+
+ dr = db->db_last_dirty;
+ while (dr != NULL && dr->dr_txg > tx->tx_txg)
+ dr = dr->dr_next;
+
+ ASSERT3P(dr, !=, NULL);
+ ASSERT3U(dr->dr_txg, ==, tx->tx_txg);
+ dr->dt.dl.dr_raw = B_TRUE;
+}
+
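A hypothetical caller sketch, assuming an assigned transaction and a
buffer that already holds ciphertext (for example during a raw
receive); the helper name is illustrative and not part of the patch:

static void
rewrite_raw_block(dmu_buf_t *db, const void *ciphertext, dmu_tx_t *tx)
{
	/*
	 * Dirty the buffer without decrypting its current contents;
	 * this also sets dr_raw on the new dirty record so syncing
	 * context knows the data is already encrypted.
	 */
	dmu_buf_will_change_crypt_params(db, tx);
	bcopy(ciphertext, db->db_data, db->db_size);
}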
#pragma weak dmu_buf_fill_done = dbuf_fill_done
/* ARGSUSED */
void
if (db->db_state == DB_CACHED &&
refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
+ /*
+ * In practice, we will never have a case where we have an
+ * encrypted arc buffer while additional holds exist on the
+ * dbuf. We don't handle this here so we simply assert that
+ * fact instead.
+ */
+ ASSERT(!arc_is_encrypted(buf));
mutex_exit(&db->db_mtx);
(void) dbuf_dirty(db, tx);
bcopy(buf->b_data, db->db.db_data, db->db.db_size);
ASSERT(db->db_buf != NULL);
if (dr != NULL && dr->dr_txg == tx->tx_txg) {
ASSERT(dr->dt.dl.dr_data == db->db_buf);
+ IMPLY(arc_is_encrypted(buf), dr->dt.dl.dr_raw);
+
if (!arc_released(db->db_buf)) {
ASSERT(dr->dt.dl.dr_override_state ==
DR_OVERRIDDEN);
if (db->db_blkid == DMU_BONUS_BLKID) {
int slots = DB_DNODE(db)->dn_num_slots;
int bonuslen = DN_SLOTS_TO_BONUSLEN(slots);
- ASSERT(db->db.db_data != NULL);
- kmem_free(db->db.db_data, bonuslen);
- arc_space_return(bonuslen, ARC_SPACE_BONUS);
- db->db_state = DB_UNCACHED;
+ if (db->db.db_data != NULL) {
+ kmem_free(db->db.db_data, bonuslen);
+ arc_space_return(bonuslen, ARC_SPACE_BONUS);
+ db->db_state = DB_UNCACHED;
+ }
}
dbuf_clear_data(db);
if (multilist_link_active(&db->db_cache_link)) {
- multilist_remove(&dbuf_cache, db);
+ multilist_remove(dbuf_cache, db);
(void) refcount_remove_many(&dbuf_cache_size,
db->db.db_size, db);
}
dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
dmu_buf_impl_t **parentp, blkptr_t **bpp, struct dbuf_hold_impl_data *dh)
{
- int nlevels, epbs;
-
*parentp = NULL;
*bpp = NULL;
return (0);
}
- nlevels =
+ int nlevels =
(dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels;
- epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
ASSERT3U(level * epbs, <, 64);
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
static void
dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
{
- arc_flags_t aflags;
if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
return;
- aflags = dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+ arc_flags_t aflags =
+ dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
* prefetch if the next block down is our target.
*/
static void
-dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
+dbuf_prefetch_indirect_done(zio_t *zio, int err, arc_buf_t *abuf,
+    void *private)
{
dbuf_prefetch_arg_t *dpa = private;
- uint64_t nextblkid;
- blkptr_t *bp;
ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
ASSERT3S(dpa->dpa_curlevel, >, 0);
*/
if (zio != NULL) {
ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
- if (zio->io_flags & ZIO_FLAG_RAW) {
+ if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) {
ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);
} else {
ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
dpa->dpa_curlevel--;
- nextblkid = dpa->dpa_zb.zb_blkid >>
+ uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
(dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
- bp = ((blkptr_t *)abuf->b_data) +
+ blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
- if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
+ if (BP_IS_HOLE(bp) || err != 0) {
kmem_free(dpa, sizeof (*dpa));
} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
* Issue prefetch reads for the given block on the given level. If the indirect
* blocks above that block are not in memory, we will read them in
* asynchronously. As a result, this call never blocks waiting for a read to
- * complete.
+ * complete. Note that the prefetch might fail if the dataset is encrypted and
+ * the encryption key is unmapped before the IO completes.
*/
void
dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
blkptr_t bp;
int epbs, nlevels, curlevel;
uint64_t curblkid;
- dmu_buf_impl_t *db;
- zio_t *pio;
- dbuf_prefetch_arg_t *dpa;
- dsl_dataset_t *ds;
ASSERT(blkid != DMU_BONUS_BLKID);
ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
return;
- db = dbuf_find(dn->dn_objset, dn->dn_object,
+ dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
level, blkid);
if (db != NULL) {
mutex_exit(&db->db_mtx);
ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
- pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
+ zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
ZIO_FLAG_CANFAIL);
- dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
- ds = dn->dn_objset->os_dsl_dataset;
+ dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
+ dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
dn->dn_object, level, blkid);
dpa->dpa_curlevel = curlevel;
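A hypothetical usage sketch of this interface, assuming the caller
holds dn_struct_rwlock as the ASSERT at the top of dbuf_prefetch()
requires; the helper is illustrative and the trailing aflags argument
follows the prototype in dbuf.h:

static void
prefetch_ahead(dnode_t *dn, uint64_t blkid, int nblks)
{
	for (int i = 0; i < nblks; i++) {
		/*
		 * Asynchronous: never blocks waiting for a read, and on
		 * an encrypted dataset it may quietly fail if the key
		 * is unmapped before the IO completes.
		 */
		dbuf_prefetch(dn, 0, blkid + 1 + i,
		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
	}
}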
if (multilist_link_active(&dh->dh_db->db_cache_link)) {
ASSERT(refcount_is_zero(&dh->dh_db->db_holds));
- multilist_remove(&dbuf_cache, dh->dh_db);
+ multilist_remove(dbuf_cache, dh->dh_db);
(void) refcount_remove_many(&dbuf_cache_size,
dh->dh_db->db.db_size, dh->dh_db);
}
db->db_pending_evict) {
dbuf_destroy(db);
} else if (!multilist_link_active(&db->db_cache_link)) {
- multilist_insert(&dbuf_cache, db);
+ multilist_insert(dbuf_cache, db);
(void) refcount_add_many(&dbuf_cache_size,
db->db.db_size, db);
mutex_exit(&db->db_mtx);
}
}
+/*
+ * Ensure the dbuf's data is untransformed if the associated dirty
+ * record requires it. This is used by dbuf_sync_leaf() to ensure
+ * that a dnode block is decrypted before we write new data to it.
+ * For raw writes we assert that the buffer is already encrypted.
+ */
+static void
+dbuf_check_crypt(dbuf_dirty_record_t *dr)
+{
+ int err;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+
+ ASSERT(MUTEX_HELD(&db->db_mtx));
+
+ if (!dr->dt.dl.dr_raw && arc_is_encrypted(db->db_buf)) {
+ /*
+ * Unfortunately, there is currently no mechanism for
+ * syncing context to handle decryption errors. An error
+ * here is only possible if an attacker maliciously
+ * changed a dnode block and updated the associated
+ * checksums going up the block tree.
+ */
+ err = arc_untransform(db->db_buf, db->db_objset->os_spa,
+ dmu_objset_id(db->db_objset), B_TRUE);
+ if (err)
+ panic("Invalid dnode block MAC");
+ } else if (dr->dt.dl.dr_raw) {
+ /*
+ * Writing raw encrypted data requires the db's arc buffer
+ * to be converted to raw by the caller.
+ */
+ ASSERT(arc_is_encrypted(db->db_buf));
+ }
+}
+
/*
* dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it
 * is critical that we not allow the compiler to inline this function into
ASSERT(*datap != NULL);
ASSERT0(db->db_level);
- ASSERT3U(dn->dn_phys->dn_bonuslen, <=,
+ ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=,
DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1));
- bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
+ bcopy(*datap, DN_BONUS(dn->dn_phys),
+ DN_MAX_BONUS_LEN(dn->dn_phys));
DB_DNODE_EXIT(db);
if (*datap != db->db.db_data) {
ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
}
+ /*
+ * If this is a dnode block, ensure it is appropriately encrypted
+ * or decrypted, depending on what we are writing to it this txg.
+ */
+ if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT)
+ dbuf_check_crypt(dr);
+
if (db->db_state != DB_NOFILL &&
dn->dn_object != DMU_META_DNODE_OBJECT &&
refcount_count(&db->db_holds) > 1 &&
 * DMU_OT_DNODE blocks).
*/
int psize = arc_buf_size(*datap);
+ int lsize = arc_buf_lsize(*datap);
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
enum zio_compress compress_type = arc_get_compression(*datap);
- if (compress_type == ZIO_COMPRESS_OFF) {
- *datap = arc_alloc_buf(os->os_spa, db, type, psize);
- } else {
- int lsize = arc_buf_lsize(*datap);
+ if (arc_is_encrypted(*datap)) {
+ boolean_t byteorder;
+ uint8_t salt[ZIO_DATA_SALT_LEN];
+ uint8_t iv[ZIO_DATA_IV_LEN];
+ uint8_t mac[ZIO_DATA_MAC_LEN];
+
+ arc_get_raw_params(*datap, &byteorder, salt, iv, mac);
+ *datap = arc_alloc_raw_buf(os->os_spa, db,
+ dmu_objset_id(os), byteorder, salt, iv, mac,
+ dn->dn_type, psize, lsize, compress_type);
+ } else if (compress_type != ZIO_COMPRESS_OFF) {
ASSERT3U(type, ==, ARC_BUFC_DATA);
*datap = arc_alloc_compressed_buf(os->os_spa, db,
psize, lsize, compress_type);
+ } else {
+ *datap = arc_alloc_buf(os->os_spa, db, type, psize);
}
bcopy(db->db.db_data, (*datap)->b_data, psize);
}
if (dn->dn_type == DMU_OT_DNODE) {
i = 0;
while (i < db->db.db_size) {
- dnode_phys_t *dnp = db->db.db_data + i;
+ dnode_phys_t *dnp =
+ (void *)(((char *)db->db.db_data) + i);
i += DNODE_MIN_SIZE;
if (dnp->dn_type != DMU_OT_NONE) {
DB_DNODE_EXIT(db);
if (!BP_IS_EMBEDDED(bp))
- bp->blk_fill = fill;
+ BP_SET_FILL(bp, fill);
mutex_exit(&db->db_mtx);
wp_flag = WP_SPILL;
wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
- dmu_write_policy(os, dn, db->db_level, wp_flag,
- (data != NULL && arc_get_compression(data) != ZIO_COMPRESS_OFF) ?
- arc_get_compression(data) : ZIO_COMPRESS_INHERIT, &zp);
+ dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
DB_DNODE_EXIT(db);
/*
ZIO_PRIORITY_ASYNC_WRITE,
ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
} else {
- arc_done_func_t *children_ready_cb = NULL;
ASSERT(arc_released(data));
/*
* ready callback so that we can properly handle an indirect
* block that only contains holes.
*/
+ arc_write_done_func_t *children_ready_cb = NULL;
if (db->db_level != 0)
children_ready_cb = dbuf_write_children_ready;
EXPORT_SYMBOL(dbuf_new_size);
EXPORT_SYMBOL(dbuf_release_bp);
EXPORT_SYMBOL(dbuf_dirty);
+EXPORT_SYMBOL(dmu_buf_will_change_crypt_params);
EXPORT_SYMBOL(dmu_buf_will_dirty);
EXPORT_SYMBOL(dmu_buf_will_not_fill);
EXPORT_SYMBOL(dmu_buf_will_fill);