From 095495e0081c16a18641fc60e7c9872123a4f83e Mon Sep 17 00:00:00 2001 From: Tom Caputi Date: Tue, 27 Feb 2018 12:04:05 -0500 Subject: [PATCH] Raw DRR_OBJECT records must write raw data b1d21733 made it possible for empty metadnode blocks to be compressed to a hole, fixing a bug that would cause invalid metadnode MACs when a send stream attempted to free objects and allowing the blocks to be reclaimed when they were no longer needed. However, this patch also introduced a race condition; if a txg sync occurred after a DRR_OBJECT_RANGE record was received but before any objects were added, the metadnode block would be compressed to a hole and lose all of its encryption parameters. This would cause subsequent DRR_OBJECT records to fail when they attempted to write their data into an unencrypted block. This patch defers the DRR_OBJECT_RANGE handling to receive_object() so that the encryption parameters are set with each object that is written into that block. Reviewed-by: Kash Pande Reviewed-by: Brian Behlendorf Signed-off-by: Tom Caputi Closes #7215 Closes #7236 --- include/sys/dmu.h | 5 ++- module/zfs/dbuf.c | 3 +- module/zfs/dmu.c | 34 +++++++++++------ module/zfs/dmu_send.c | 85 +++++++++++++++++++++++-------------------- 4 files changed, 72 insertions(+), 55 deletions(-) diff --git a/include/sys/dmu.h b/include/sys/dmu.h index 472a4f2b6..4d805b6d7 100644 --- a/include/sys/dmu.h +++ b/include/sys/dmu.h @@ -821,8 +821,9 @@ void dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, void dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); #define dmu_assign_arcbuf dmu_assign_arcbuf_by_dbuf -void dmu_convert_to_raw(dmu_buf_t *handle, boolean_t byteorder, - const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx); +int dmu_convert_mdn_block_to_raw(objset_t *os, uint64_t firstobj, + boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, + const uint8_t *mac, dmu_tx_t *tx); void dmu_copy_from_buf(objset_t *os, uint64_t object, uint64_t offset, dmu_buf_t *handle, dmu_tx_t *tx); #ifdef HAVE_UIO_ZEROCOPY diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index e885a2756..51cb0c982 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -3465,8 +3465,7 @@ dbuf_check_crypt(dbuf_dirty_record_t *dr) * Writing raw encrypted data requires the db's arc buffer * to be converted to raw by the caller. */ - ASSERT(arc_is_encrypted(db->db_buf) || - db->db.db_object == DMU_META_DNODE_OBJECT); + ASSERT(arc_is_encrypted(db->db_buf)); } } diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index e0114e659..2b727372c 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1542,29 +1542,39 @@ dmu_return_arcbuf(arc_buf_t *buf) arc_buf_destroy(buf, FTAG); } -void -dmu_convert_to_raw(dmu_buf_t *handle, boolean_t byteorder, const uint8_t *salt, - const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx) +int +dmu_convert_mdn_block_to_raw(objset_t *os, uint64_t firstobj, + boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, + const uint8_t *mac, dmu_tx_t *tx) { - dmu_object_type_t type; - dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; - uint64_t dsobj = dmu_objset_id(db->db_objset); + int ret; + dmu_buf_t *handle = NULL; + dmu_buf_impl_t *db = NULL; + uint64_t offset = firstobj * DNODE_MIN_SIZE; + uint64_t dsobj = dmu_objset_id(os); - ASSERT3P(db->db_buf, !=, NULL); - ASSERT3U(dsobj, !=, 0); + ret = dmu_buf_hold_by_dnode(DMU_META_DNODE(os), offset, FTAG, &handle, + DMU_READ_PREFETCH | DMU_READ_NO_DECRYPT); + if (ret != 0) + return (ret); dmu_buf_will_change_crypt_params(handle, tx); - DB_DNODE_ENTER(db); - type = DB_DNODE(db)->dn_type; - DB_DNODE_EXIT(db); + db = (dmu_buf_impl_t *)handle; + ASSERT3P(db->db_buf, !=, NULL); + ASSERT3U(dsobj, !=, 0); /* * This technically violates the assumption the dmu code makes * that dnode blocks are only released in syncing context. */ (void) arc_release(db->db_buf, db); - arc_convert_to_raw(db->db_buf, dsobj, byteorder, type, salt, iv, mac); + arc_convert_to_raw(db->db_buf, dsobj, byteorder, DMU_OT_DNODE, + salt, iv, mac); + + dmu_buf_rele(handle, FTAG); + + return (0); } void diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 16dc13939..f5894e6b8 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -2167,6 +2167,14 @@ struct receive_writer_arg { uint64_t last_offset; uint64_t max_object; /* highest object ID referenced in stream */ uint64_t bytes_read; /* bytes read when current record created */ + + /* Encryption parameters for the last received DRR_OBJECT_RANGE */ + uint64_t or_firstobj; + uint64_t or_numslots; + uint8_t or_salt[ZIO_DATA_SALT_LEN]; + uint8_t or_iv[ZIO_DATA_IV_LEN]; + uint8_t or_mac[ZIO_DATA_MAC_LEN]; + boolean_t or_byteorder; }; struct objlist { @@ -2448,7 +2456,13 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, } if (rwa->raw) { - if (drro->drr_raw_bonuslen < drro->drr_bonuslen || + /* + * We should have received a DRR_OBJECT_RANGE record + * containing this block and stored it in rwa. + */ + if (drro->drr_object < rwa->or_firstobj || + drro->drr_object >= rwa->or_firstobj + rwa->or_numslots || + drro->drr_raw_bonuslen < drro->drr_bonuslen || drro->drr_indblkshift > SPA_MAXBLOCKSHIFT || drro->drr_nlevels > DN_MAX_LEVELS || drro->drr_nblkptr > DN_MAX_NBLKPTR || @@ -2611,8 +2625,27 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, return (SET_ERROR(EINVAL)); } - if (rwa->raw) - VERIFY0(dmu_object_dirty_raw(rwa->os, drro->drr_object, tx)); + if (rwa->raw) { + /* + * Convert the buffer associated with this range of dnodes + * to a raw buffer. This ensures that it will be written out + * as a raw buffer when we fill in the dnode object. Since we + * are committing this tx now, it is possible for the dnode + * block to end up on-disk with the incorrect MAC. Despite + * this, the dataset is marked as inconsistent so no other + * code paths (apart from scrubs) will attempt to read this + * data. Scrubs will not be effected by this either since + * scrubs only read raw data and do not attempt to check + * the MAC. + */ + err = dmu_convert_mdn_block_to_raw(rwa->os, rwa->or_firstobj, + rwa->or_byteorder, rwa->or_salt, rwa->or_iv, rwa->or_mac, + tx); + if (err != 0) { + dmu_tx_commit(tx); + return (SET_ERROR(EINVAL)); + } + } dmu_object_set_checksum(rwa->os, drro->drr_object, drro->drr_checksumtype, tx); @@ -2984,12 +3017,6 @@ static int receive_object_range(struct receive_writer_arg *rwa, struct drr_object_range *drror) { - int ret; - dmu_tx_t *tx; - dnode_t *mdn = NULL; - dmu_buf_t *db = NULL; - uint64_t offset; - /* * By default, we assume this block is in our native format * (ZFS_HOST_BYTEORDER). We then take into account whether @@ -3019,38 +3046,18 @@ receive_object_range(struct receive_writer_arg *rwa, if (drror->drr_firstobj > rwa->max_object) rwa->max_object = drror->drr_firstobj; - offset = drror->drr_firstobj * sizeof (dnode_phys_t); - mdn = DMU_META_DNODE(rwa->os); - - tx = dmu_tx_create(rwa->os); - ret = dmu_tx_assign(tx, TXG_WAIT); - if (ret != 0) { - dmu_tx_abort(tx); - return (ret); - } - - ret = dmu_buf_hold_by_dnode(mdn, offset, FTAG, &db, - DMU_READ_PREFETCH | DMU_READ_NO_DECRYPT); - if (ret != 0) { - dmu_tx_commit(tx); - return (ret); - } - /* - * Convert the buffer associated with this range of dnodes to a - * raw buffer. This ensures that it will be written out as a raw - * buffer when we fill in the dnode objects in future records. - * Since we are commiting this tx now, it is technically possible - * for the dnode block to end up on-disk with the incorrect MAC. - * Despite this, the dataset is marked as inconsistent so no other - * code paths (apart from scrubs) will attempt to read this data. - * Scrubs will not be effected by this either since scrubs only - * read raw data and do not attempt to check the MAC. + * The DRR_OBJECT_RANGE handling must be deferred to receive_object() + * so that the encryption parameters are set with each object that is + * written into that block. */ - dmu_convert_to_raw(db, byteorder, drror->drr_salt, drror->drr_iv, - drror->drr_mac, tx); - dmu_buf_rele(db, FTAG); - dmu_tx_commit(tx); + rwa->or_firstobj = drror->drr_firstobj; + rwa->or_numslots = drror->drr_numslots; + bcopy(drror->drr_salt, rwa->or_salt, ZIO_DATA_SALT_LEN); + bcopy(drror->drr_iv, rwa->or_iv, ZIO_DATA_IV_LEN); + bcopy(drror->drr_mac, rwa->or_mac, ZIO_DATA_MAC_LEN); + rwa->or_byteorder = byteorder; + return (0); } -- 2.39.2