]> git.proxmox.com Git - mirror_zfs.git/commitdiff
Fix problems receiving reallocated dnodes
authorTim Chase <tim@chase2k.com>
Mon, 27 Aug 2018 14:28:32 +0000 (10:28 -0400)
committerTony Hutter <hutter2@llnl.gov>
Fri, 6 Jul 2018 09:46:51 +0000 (02:46 -0700)
This is a port of 047116ac - Raw sends must be able to decrease nlevels,
to the zfs-0.7-stable branch.  It includes the various fixes to the
problem of receiving incremental streams which include reallocated dnodes
in which the number of dnode slots has changed but excludes the parts
which are related to raw streams.

From 047116ac:

    Currently, when a raw zfs send file includes a
    DRR_OBJECT record that would decrease the number of
    levels of an existing object, the object is reallocated
    with dmu_object_reclaim() which creates the new dnode
    using the old object's nlevels. For non-raw sends this
    doesn't really matter, but raw sends require that
    nlevels on the receive side match that of the send
    side so that the checksum-of-MAC tree can be properly
    maintained. This patch corrects the issue by freeing
    the object completely before allocating it again in
    this case.

    This patch also corrects several issues with
    dnode_hold_impl() and related functions that prevented
    dnodes (particularly multi-slot dnodes) from being
    reallocated properly due to the fact that existing
    dnodes were not being fully cleaned up when they
    were freed.

    This patch adds a test to make sure that zfs recv
    functions properly with incremental streams containing
    dnodes of different sizes.

This also includes a one-liner fix from loli10K to fix a test failure:
https://github.com/zfsonlinux/zfs/pull/7792#discussion_r212769264

Authored-by: Tom Caputi <tcaputi@datto.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed-by: Jorgen Lundman <lundman@lundman.net>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Tim Chase <tim@chase2k.com>
Ported-by: Tim Chase <tim@chase2k.com>
Closes #6821
Closes #6864

NOTE: This is the first of the port of 3 related patches patches to the
zfs-0.7-release branch of ZoL.  The other two patches should immediately
follow this one.

cmd/ztest/ztest.c
include/sys/dnode.h
lib/libzfs/libzfs_sendrecv.c
module/zfs/dmu_object.c
module/zfs/dmu_send.c
module/zfs/dnode.c
module/zfs/dnode_sync.c
tests/runfiles/linux.run
tests/zfs-tests/tests/functional/rsend/Makefile.am
tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh [new file with mode: 0644]

index 1a320b03a0b5afa4ccb64c27695339b9fe40f666..a410eeefbabc14f863467e6a45925dcb21cd1f3e 100644 (file)
@@ -197,7 +197,8 @@ extern uint64_t metaslab_gang_bang;
 extern uint64_t metaslab_df_alloc_threshold;
 extern int metaslab_preload_limit;
 extern boolean_t zfs_compressed_arc_enabled;
-extern int  zfs_abd_scatter_enabled;
+extern int zfs_abd_scatter_enabled;
+extern int dmu_object_alloc_chunk_shift;
 
 static ztest_shared_opts_t *ztest_shared_opts;
 static ztest_shared_opts_t ztest_opts;
@@ -310,6 +311,7 @@ static ztest_shared_callstate_t *ztest_shared_callstate;
 ztest_func_t ztest_dmu_read_write;
 ztest_func_t ztest_dmu_write_parallel;
 ztest_func_t ztest_dmu_object_alloc_free;
+ztest_func_t ztest_dmu_object_next_chunk;
 ztest_func_t ztest_dmu_commit_callbacks;
 ztest_func_t ztest_zap;
 ztest_func_t ztest_zap_parallel;
@@ -357,6 +359,7 @@ ztest_info_t ztest_info[] = {
        ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always),
        ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always),
        ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always),
+       ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes),
        ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always),
        ZTI_INIT(ztest_zap, 30, &zopt_always),
        ZTI_INIT(ztest_zap_parallel, 100, &zopt_always),
@@ -3927,6 +3930,26 @@ ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
        umem_free(od, size);
 }
 
+/*
+ * Rewind the global allocator to verify object allocation backfilling.
+ */
+void
+ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id)
+{
+       objset_t *os = zd->zd_os;
+       int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
+       uint64_t object;
+
+       /*
+        * Rewind the global allocator randomly back to a lower object number
+        * to force backfilling and reclamation of recently freed dnodes.
+        */
+       mutex_enter(&os->os_obj_lock);
+       object = ztest_random(os->os_obj_next_chunk);
+       os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk);
+       mutex_exit(&os->os_obj_lock);
+}
+
 #undef OD_ARRAY_SIZE
 #define        OD_ARRAY_SIZE   2
 
index c7efe55935660dff29503b2bde2e7345e0ec46f4..ea7defe19a84b72f7a9f5d2a6fbb04a5e5e1d588 100644 (file)
@@ -360,6 +360,7 @@ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off,
     int minlvl, uint64_t blkfill, uint64_t txg);
 void dnode_evict_dbufs(dnode_t *dn);
 void dnode_evict_bonus(dnode_t *dn);
+void dnode_free_interior_slots(dnode_t *dn);
 
 #define        DNODE_IS_CACHEABLE(_dn)                                         \
        ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL ||         \
@@ -453,6 +454,11 @@ typedef struct dnode_stats {
         * which had already been unlinked in an earlier txg.
         */
        kstat_named_t dnode_hold_free_txg;
+       /*
+        * Number of times dnode_free_interior_slots() needed to retry
+        * acquiring a slot zrl lock due to contention.
+        */
+       kstat_named_t dnode_free_interior_lock_retry;
        /*
         * Number of new dnodes allocated by dnode_allocate().
         */
index c5acd21a522266fdaa97e9c4b93e30678316da04..cadf16cce0cbe80bb803a8cc0742f84df121ac60 100644 (file)
@@ -3577,6 +3577,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
                }
 
                newfs = B_TRUE;
+               *cp = '/';
        }
 
        if (flags->verbose) {
index e7412b7509f446fb3ac6d5727fcd7230e9143c9c..f53da407fd40bc1ce6dd56233751f82cf9ae5fad 100644 (file)
@@ -275,7 +275,6 @@ dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
        return (err);
 }
 
-
 int
 dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
 {
index cdbc1cd1bc6d7c990e8477516046105b22bf92fd..148b5ff84252d3ebb3a436e02d7753e3cc9c8220 100644 (file)
@@ -2156,10 +2156,8 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
        }
 
        err = dmu_object_info(rwa->os, drro->drr_object, &doi);
-
-       if (err != 0 && err != ENOENT)
+       if (err != 0 && err != ENOENT && err != EEXIST)
                return (SET_ERROR(EINVAL));
-       object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT;
 
        if (drro->drr_object > rwa->max_object)
                rwa->max_object = drro->drr_object;
@@ -2175,13 +2173,56 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
                nblkptr = deduce_nblkptr(drro->drr_bonustype,
                    drro->drr_bonuslen);
 
+               object = drro->drr_object;
+
                if (drro->drr_blksz != doi.doi_data_block_size ||
-                   nblkptr < doi.doi_nblkptr) {
+                   nblkptr < doi.doi_nblkptr ||
+                   drro->drr_dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) {
                        err = dmu_free_long_range(rwa->os, drro->drr_object,
                            0, DMU_OBJECT_END);
                        if (err != 0)
                                return (SET_ERROR(EINVAL));
                }
+       } else if (err == EEXIST) {
+               /*
+                * The object requested is currently an interior slot of a
+                * multi-slot dnode. This will be resolved when the next txg
+                * is synced out, since the send stream will have told us
+                * to free this slot when we freed the associated dnode
+                * earlier in the stream.
+                */
+               txg_wait_synced(dmu_objset_pool(rwa->os), 0);
+               object = drro->drr_object;
+       } else {
+               /* object is free and we are about to allocate a new one */
+               object = DMU_NEW_OBJECT;
+       }
+
+       /*
+        * If this is a multi-slot dnode there is a chance that this
+        * object will expand into a slot that is already used by
+        * another object from the previous snapshot. We must free
+        * these objects before we attempt to allocate the new dnode.
+        */
+       if (drro->drr_dn_slots > 1) {
+               for (uint64_t slot = drro->drr_object + 1;
+                   slot < drro->drr_object + drro->drr_dn_slots;
+                   slot++) {
+                       dmu_object_info_t slot_doi;
+
+                       err = dmu_object_info(rwa->os, slot, &slot_doi);
+                       if (err == ENOENT || err == EEXIST)
+                               continue;
+                       else if (err != 0)
+                               return (err);
+
+                       err = dmu_free_long_object(rwa->os, slot);
+
+                       if (err != 0)
+                               return (err);
+               }
+
+               txg_wait_synced(dmu_objset_pool(rwa->os), 0);
        }
 
        tx = dmu_tx_create(rwa->os);
@@ -2732,7 +2773,7 @@ receive_read_record(struct receive_arg *ra)
                 * See receive_read_prefetch for an explanation why we're
                 * storing this object in the ignore_obj_list.
                 */
-               if (err == ENOENT ||
+               if (err == ENOENT || err == EEXIST ||
                    (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
                        objlist_insert(&ra->ignore_objlist, drro->drr_object);
                        err = 0;
index e05a4d0a5538722ac97bd2cfce0878631615775e..df6a4872c9cf10c0affeb8a3961c13b35f427ce7 100644 (file)
@@ -55,6 +55,7 @@ dnode_stats_t dnode_stats = {
        { "dnode_hold_free_overflow",           KSTAT_DATA_UINT64 },
        { "dnode_hold_free_refcount",           KSTAT_DATA_UINT64 },
        { "dnode_hold_free_txg",                KSTAT_DATA_UINT64 },
+       { "dnode_free_interior_lock_retry",     KSTAT_DATA_UINT64 },
        { "dnode_allocate",                     KSTAT_DATA_UINT64 },
        { "dnode_reallocate",                   KSTAT_DATA_UINT64 },
        { "dnode_buf_evict",                    KSTAT_DATA_UINT64 },
@@ -516,7 +517,8 @@ dnode_destroy(dnode_t *dn)
        mutex_exit(&os->os_lock);
 
        /* the dnode can no longer move, so we can release the handle */
-       zrl_remove(&dn->dn_handle->dnh_zrlock);
+       if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock))
+               zrl_remove(&dn->dn_handle->dnh_zrlock);
 
        dn->dn_allocated_txg = 0;
        dn->dn_free_txg = 0;
@@ -662,6 +664,8 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
            DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
 
        dn_slots = dn_slots > 0 ? dn_slots : DNODE_MIN_SLOTS;
+
+       dnode_free_interior_slots(dn);
        DNODE_STAT_BUMP(dnode_reallocate);
 
        /* clean up any unreferenced dbufs */
@@ -1062,19 +1066,73 @@ dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr)
 }
 
 static boolean_t
-dnode_check_slots(dnode_children_t *children, int idx, int slots, void *ptr)
+dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
 {
        ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
 
        for (int i = idx; i < idx + slots; i++) {
                dnode_handle_t *dnh = &children->dnc_children[i];
-               if (dnh->dnh_dnode != ptr)
+               dnode_t *dn = dnh->dnh_dnode;
+
+               if (dn == DN_SLOT_FREE) {
+                       continue;
+               } else if (DN_SLOT_IS_PTR(dn)) {
+                       mutex_enter(&dn->dn_mtx);
+                       dmu_object_type_t type = dn->dn_type;
+                       mutex_exit(&dn->dn_mtx);
+
+                       if (type != DMU_OT_NONE)
+                               return (B_FALSE);
+
+                       continue;
+               } else {
                        return (B_FALSE);
+               }
+
+               return (B_FALSE);
        }
 
        return (B_TRUE);
 }
 
+static void
+dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
+{
+       ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+       for (int i = idx; i < idx + slots; i++) {
+               dnode_handle_t *dnh = &children->dnc_children[i];
+
+               ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
+
+               if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+                       ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
+                       dnode_destroy(dnh->dnh_dnode);
+                       dnh->dnh_dnode = DN_SLOT_FREE;
+               }
+       }
+}
+
+void
+dnode_free_interior_slots(dnode_t *dn)
+{
+       dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db);
+       int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT;
+       int idx = (dn->dn_object & (epb - 1)) + 1;
+       int slots = dn->dn_num_slots - 1;
+
+       if (slots == 0)
+               return;
+
+       ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+       while (!dnode_slots_tryenter(children, idx, slots))
+               DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
+
+       dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
+       dnode_slots_rele(children, idx, slots);
+}
+
 void
 dnode_special_close(dnode_handle_t *dnh)
 {
@@ -1355,7 +1413,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
                while (dn == DN_SLOT_UNINIT) {
                        dnode_slots_hold(dnc, idx, slots);
 
-                       if (!dnode_check_slots(dnc, idx, slots, DN_SLOT_FREE)) {
+                       if (!dnode_check_slots_free(dnc, idx, slots)) {
                                DNODE_STAT_BUMP(dnode_hold_free_misses);
                                dnode_slots_rele(dnc, idx, slots);
                                dbuf_rele(db, FTAG);
@@ -1368,15 +1426,29 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
                                continue;
                        }
 
-                       if (!dnode_check_slots(dnc, idx, slots, DN_SLOT_FREE)) {
+                       if (!dnode_check_slots_free(dnc, idx, slots)) {
                                DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
                                dnode_slots_rele(dnc, idx, slots);
                                dbuf_rele(db, FTAG);
                                return (SET_ERROR(ENOSPC));
                        }
 
+                       /*
+                        * Allocated but otherwise free dnodes which would
+                        * be in the interior of a multi-slot dnodes need
+                        * to be freed.  Single slot dnodes can be safely
+                        * re-purposed as a performance optimization.
+                        */
+                       if (slots > 1)
+                               dnode_reclaim_slots(dnc, idx + 1, slots - 1);
+
                        dnh = &dnc->dnc_children[idx];
-                       dn = dnode_create(os, dn_block + idx, db, object, dnh);
+                       if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+                               dn = dnh->dnh_dnode;
+                       } else {
+                               dn = dnode_create(os, dn_block + idx, db,
+                                   object, dnh);
+                       }
                }
 
                mutex_enter(&dn->dn_mtx);
index 742d962bc2327a8a409e1677663c47ed52687332..8d65e38564921142499e0c8ceb4f9afc0889b674 100644 (file)
@@ -533,6 +533,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
        if (dn->dn_allocated_txg != dn->dn_free_txg)
                dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
        bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots);
+       dnode_free_interior_slots(dn);
 
        mutex_enter(&dn->dn_mtx);
        dn->dn_type = DMU_OT_NONE;
@@ -540,6 +541,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
        dn->dn_allocated_txg = 0;
        dn->dn_free_txg = 0;
        dn->dn_have_spill = B_FALSE;
+       dn->dn_num_slots = 1;
        mutex_exit(&dn->dn_mtx);
 
        ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
index 69e9eb267dfc5bef5826b733c4c1b4441911ae2e..d8fe6f3ab0d2b8f1ea52886cf8dc07e862b6910d 100644 (file)
@@ -605,7 +605,7 @@ tests = ['rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos',
     'send-c_lz4_disabled', 'send-c_recv_lz4_disabled',
     'send-c_mixed_compression', 'send-c_stream_size_estimate', 'send-cD',
     'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize',
-    'send-c_recv_dedup', 'send_freeobjects']
+    'send-c_recv_dedup', 'send_freeobjects', 'send_realloc_dnode_size']
 tags = ['functional', 'rsend']
 
 [tests/functional/scrub_mirror]
index 6b1aa8b3203d6454f7a4f651a917cd64020f0bf9..a2837d1a63fca2d2f54abc4d75fa04f559800865 100644 (file)
@@ -36,7 +36,8 @@ dist_pkgdata_SCRIPTS = \
        send-c_volume.ksh \
        send-c_zstreamdump.ksh \
        send-cpL_varied_recsize.ksh \
-       send_freeobjects.ksh
+       send_freeobjects.ksh \
+       send_realloc_dnode_size.ksh
 
 dist_pkgdata_DATA = \
        rsend.cfg \
diff --git a/tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh b/tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh
new file mode 100644 (file)
index 0000000..2067639
--- /dev/null
@@ -0,0 +1,98 @@
+#!/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# Description:
+# Verify incremental receive properly handles objects with changed
+# dnode slot count.
+#
+# Strategy:
+# 1. Populate a dataset with 1k byte dnodes and snapshot
+# 2. Remove objects, set dnodesize=legacy, and remount dataset so new objects
+#    get recycled numbers and formerly "interior" dnode slots get assigned
+#    to new objects
+# 3. Remove objects, set dnodesize=2k, and remount dataset so new objects
+#    overlap with recently recycled and formerly "normal" dnode slots get
+#    assigned to new objects
+# 4. Generate initial and incremental streams
+# 5. Verify initial and incremental streams can be received
+#
+
+verify_runnable "both"
+
+log_assert "Verify incremental receive handles objects with changed dnode size"
+
+function cleanup
+{
+       rm -f $BACKDIR/fs-dn-legacy
+       rm -f $BACKDIR/fs-dn-1k
+       rm -f $BACKDIR/fs-dn-2k
+
+       if datasetexists $POOL/fs ; then
+               log_must zfs destroy -rR $POOL/fs
+       fi
+
+       if datasetexists $POOL/newfs ; then
+               log_must zfs destroy -rR $POOL/newfs
+       fi
+}
+
+log_onexit cleanup
+
+# 1. Populate a dataset with 1k byte dnodes and snapshot
+log_must zfs create -o dnodesize=1k $POOL/fs
+log_must mk_files 200 262144 0 $POOL/fs
+log_must zfs snapshot $POOL/fs@a
+
+# 2. Remove objects, set dnodesize=legacy, and remount dataset so new objects
+#    get recycled numbers and formerly "interior" dnode slots get assigned
+#    to new objects
+rm /$POOL/fs/*
+
+log_must zfs unmount $POOL/fs
+log_must zfs set dnodesize=legacy $POOL/fs
+log_must zfs mount $POOL/fs
+
+log_must mk_files 200 262144 0 $POOL/fs
+log_must zfs snapshot $POOL/fs@b
+
+# 3. Remove objects, set dnodesize=2k, and remount dataset so new objects
+#    overlap with recently recycled and formerly "normal" dnode slots get
+#    assigned to new objects
+rm /$POOL/fs/*
+
+log_must zfs unmount $POOL/fs
+log_must zfs set dnodesize=2k $POOL/fs
+log_must zfs mount $POOL/fs
+
+mk_files 200 262144 0 $POOL/fs
+log_must zfs snapshot $POOL/fs@c
+
+# 4. Generate initial and incremental streams
+log_must eval "zfs send $POOL/fs@a > $BACKDIR/fs-dn-1k"
+log_must eval "zfs send -i $POOL/fs@a $POOL/fs@b > $BACKDIR/fs-dn-legacy"
+log_must eval "zfs send -i $POOL/fs@b $POOL/fs@c > $BACKDIR/fs-dn-2k"
+
+# 5. Verify initial and incremental streams can be received
+log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs-dn-1k"
+log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs-dn-legacy"
+log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs-dn-2k"
+
+log_pass "Verify incremental receive handles objects with changed dnode size"