From 65ca2c1eb9af3c3fb2d221aabc2afa5db7b0f8cc Mon Sep 17 00:00:00 2001
From: Brian Behlendorf
Date: Wed, 2 Jan 2019 11:46:04 -0800
Subject: [PATCH] Fix 'zpool remap' freeing race

The dmu_objset_remap_indirects_impl() logic depends on dnode_hold()
returning ENOENT for dnodes which will be freed and should be skipped.

This behavior can only be relied upon when taking a new hold and while
the caller has an open transaction. This ensures that the open txg
cannot advance and that a concurrent free will end up in the same txg
(which is critical). Relying on an existing hold will not prevent
dnode_free() from succeeding.

The solution is to take an additional dnode_hold() after assigning the
transaction. This ensures the remap will never dirty the dnode if it
was freed while we were waiting in dmu_tx_assign(tx, TXG_WAIT).

Randomly set zfs_object_remap_one_indirect_delay_ms in ztest. This
increases the likelihood of an operation racing with the remap.
Converted from ticks to milliseconds.

Reviewed by: Matt Ahrens
Reviewed by: Tom Caputi
Reviewed by: Igor Kozhukhov
Signed-off-by: Brian Behlendorf
Closes #8215
---
 cmd/ztest/ztest.c |  8 ++++++++
 module/zfs/dmu.c  | 34 ++++++++++++++++++++++++----------
 2 files changed, 32 insertions(+), 10 deletions(-)

diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c
index 28ab0e846..f9ba9b6d0 100644
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -215,6 +215,8 @@ extern int dmu_object_alloc_chunk_shift;
 extern boolean_t zfs_force_some_double_word_sm_entries;
 extern unsigned long zio_decompress_fail_fraction;
 extern unsigned long zfs_reconstruct_indirect_damage_fraction;
+extern int zfs_object_remap_one_indirect_delay_ms;
+
 
 static ztest_shared_opts_t *ztest_shared_opts;
 static ztest_shared_opts_t ztest_opts;
@@ -6526,6 +6528,12 @@ ztest_resume_thread(void *arg)
 		 */
 		if (ztest_random(10) == 0)
 			zfs_abd_scatter_enabled = ztest_random(2);
+
+		/*
+		 * Periodically inject remapping delays (10% of the time).
+		 */
+		zfs_object_remap_one_indirect_delay_ms =
+		    ztest_random(10) == 0 ? ztest_random(1000) + 1 : 0;
 	}
 
 	thread_exit();
diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c
index 3eed512e5..e8d0ce3be 100644
--- a/module/zfs/dmu.c
+++ b/module/zfs/dmu.c
@@ -76,9 +76,9 @@ int zfs_dmu_offset_next_sync = 0;
 /*
  * This can be used for testing, to ensure that certain actions happen
  * while in the middle of a remap (which might otherwise complete too
- * quickly).
+ * quickly). Used by ztest(8).
  */
-int zfs_object_remap_one_indirect_delay_ticks = 0;
+int zfs_object_remap_one_indirect_delay_ms = 0;
 
 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
 	{DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" },
@@ -1075,6 +1075,7 @@ dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn,
     uint64_t last_removal_txg, uint64_t offset)
 {
 	uint64_t l1blkid = dbuf_whichblock(dn, 1, offset);
+	dnode_t *dn_tx;
 	int err = 0;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
@@ -1093,7 +1094,9 @@ dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn,
 
 	/*
 	 * If this L1 was already written after the last removal, then we've
-	 * already tried to remap it.
+	 * already tried to remap it. An additional hold is taken after the
+	 * dmu_tx_assign() to handle the case where the dnode is freed while
+	 * waiting for the next open txg.
	 */
 	if (birth <= last_removal_txg &&
 	    dbuf_read(dbuf, NULL, DB_RF_MUST_SUCCEED) == 0 &&
@@ -1102,7 +1105,11 @@
 		dmu_tx_hold_remap_l1indirect(tx, dn->dn_object);
 		err = dmu_tx_assign(tx, TXG_WAIT);
 		if (err == 0) {
-			(void) dbuf_dirty(dbuf, tx);
+			err = dnode_hold(os, dn->dn_object, FTAG, &dn_tx);
+			if (err == 0) {
+				(void) dbuf_dirty(dbuf, tx);
+				dnode_rele(dn_tx, FTAG);
+			}
 			dmu_tx_commit(tx);
 		} else {
 			dmu_tx_abort(tx);
@@ -1111,7 +1118,7 @@
 
 	dbuf_rele(dbuf, FTAG);
 
-	delay(zfs_object_remap_one_indirect_delay_ticks);
+	delay(MSEC_TO_TICK(zfs_object_remap_one_indirect_delay_ms));
 
 	return (err);
 }
@@ -1133,7 +1140,7 @@ dmu_object_remap_indirects(objset_t *os, uint64_t object,
 {
 	uint64_t offset, l1span;
 	int err;
-	dnode_t *dn;
+	dnode_t *dn, *dn_tx;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err != 0) {
@@ -1148,13 +1155,20 @@ dmu_object_remap_indirects(objset_t *os, uint64_t object,
 		/*
 		 * If the dnode has no indirect blocks, we cannot dirty them.
 		 * We still want to remap the blkptr(s) in the dnode if
-		 * appropriate, so mark it as dirty.
+		 * appropriate, so mark it as dirty. An additional hold is
+		 * taken after the dmu_tx_assign() to handle the case where
+		 * the dnode is freed while waiting for the next open txg.
 		 */
 		if (err == 0 && dnode_needs_remap(dn)) {
 			dmu_tx_t *tx = dmu_tx_create(os);
-			dmu_tx_hold_bonus(tx, dn->dn_object);
-			if ((err = dmu_tx_assign(tx, TXG_WAIT)) == 0) {
-				dnode_setdirty(dn, tx);
+			dmu_tx_hold_bonus(tx, object);
+			err = dmu_tx_assign(tx, TXG_WAIT);
+			if (err == 0) {
+				err = dnode_hold(os, object, FTAG, &dn_tx);
+				if (err == 0) {
+					dnode_setdirty(dn_tx, tx);
+					dnode_rele(dn_tx, FTAG);
+				}
 				dmu_tx_commit(tx);
 			} else {
 				dmu_tx_abort(tx);
-- 
2.39.5