Fix problems receiving reallocated dnodes

author Tim Chase <tim@chase2k.com>

Mon, 27 Aug 2018 14:28:32 +0000 (10:28 -0400)

committer Tony Hutter <hutter2@llnl.gov>

Fri, 6 Jul 2018 09:46:51 +0000 (02:46 -0700)
author Tim Chase <tim@chase2k.com>
Mon, 27 Aug 2018 14:28:32 +0000 (10:28 -0400)
committer Tony Hutter <hutter2@llnl.gov>
Fri, 6 Jul 2018 09:46:51 +0000 (02:46 -0700)
diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c

index 1a320b03a0b5afa4ccb64c27695339b9fe40f666..a410eeefbabc14f863467e6a45925dcb21cd1f3e 100644 (file)
--- a/cmd/ztest/ztest.c
+++ b/cmd/ztest/ztest.c
@@ -197,7 +197,8 @@ extern uint64_t metaslab_gang_bang;
  extern uint64_t metaslab_df_alloc_threshold;
  extern int metaslab_preload_limit;
  extern boolean_t zfs_compressed_arc_enabled;
-extern int  zfs_abd_scatter_enabled;
+extern int zfs_abd_scatter_enabled;
+extern int dmu_object_alloc_chunk_shift;
  
  static ztest_shared_opts_t *ztest_shared_opts;
  static ztest_shared_opts_t ztest_opts;
@@ -310,6 +311,7 @@ static ztest_shared_callstate_t *ztest_shared_callstate;
  ztest_func_t ztest_dmu_read_write;
  ztest_func_t ztest_dmu_write_parallel;
  ztest_func_t ztest_dmu_object_alloc_free;
+ztest_func_t ztest_dmu_object_next_chunk;
  ztest_func_t ztest_dmu_commit_callbacks;
  ztest_func_t ztest_zap;
  ztest_func_t ztest_zap_parallel;
@@ -357,6 +359,7 @@ ztest_info_t ztest_info[] = {
         ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always),
         ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always),
         ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always),
+       ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes),
         ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always),
         ZTI_INIT(ztest_zap, 30, &zopt_always),
         ZTI_INIT(ztest_zap_parallel, 100, &zopt_always),
@@ -3927,6 +3930,26 @@ ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id)
         umem_free(od, size);
  }
  
+/*
+ * Rewind the global allocator to verify object allocation backfilling.
+ */
+void
+ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id)
+{
+       objset_t *os = zd->zd_os;
+       int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift;
+       uint64_t object;
+
+       /*
+        * Rewind the global allocator randomly back to a lower object number
+        * to force backfilling and reclamation of recently freed dnodes.
+        */
+       mutex_enter(&os->os_obj_lock);
+       object = ztest_random(os->os_obj_next_chunk);
+       os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk);
+       mutex_exit(&os->os_obj_lock);
+}
+
  #undef OD_ARRAY_SIZE
  #define        OD_ARRAY_SIZE   2
  
diff --git a/include/sys/dnode.h b/include/sys/dnode.h

index c7efe55935660dff29503b2bde2e7345e0ec46f4..ea7defe19a84b72f7a9f5d2a6fbb04a5e5e1d588 100644 (file)
--- a/include/sys/dnode.h
+++ b/include/sys/dnode.h
@@ -360,6 +360,7 @@ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off,
      int minlvl, uint64_t blkfill, uint64_t txg);
  void dnode_evict_dbufs(dnode_t *dn);
  void dnode_evict_bonus(dnode_t *dn);
+void dnode_free_interior_slots(dnode_t *dn);
  
  #define        DNODE_IS_CACHEABLE(_dn)                                         \
         ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL ||         \
@@ -453,6 +454,11 @@ typedef struct dnode_stats {
          * which had already been unlinked in an earlier txg.
          */
         kstat_named_t dnode_hold_free_txg;
+       /*
+        * Number of times dnode_free_interior_slots() needed to retry
+        * acquiring a slot zrl lock due to contention.
+        */
+       kstat_named_t dnode_free_interior_lock_retry;
         /*
          * Number of new dnodes allocated by dnode_allocate().
          */
diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c

index c5acd21a522266fdaa97e9c4b93e30678316da04..cadf16cce0cbe80bb803a8cc0742f84df121ac60 100644 (file)
--- a/lib/libzfs/libzfs_sendrecv.c
+++ b/lib/libzfs/libzfs_sendrecv.c
@@ -3577,6 +3577,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
                 }
  
                 newfs = B_TRUE;
+               *cp = '/';
         }
  
         if (flags->verbose) {
diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c

index e7412b7509f446fb3ac6d5727fcd7230e9143c9c..f53da407fd40bc1ce6dd56233751f82cf9ae5fad 100644 (file)
--- a/module/zfs/dmu_object.c
+++ b/module/zfs/dmu_object.c
@@ -275,7 +275,6 @@ dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
         return (err);
  }
  
-
  int
  dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
  {
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c

index cdbc1cd1bc6d7c990e8477516046105b22bf92fd..148b5ff84252d3ebb3a436e02d7753e3cc9c8220 100644 (file)
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -2156,10 +2156,8 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
         }
  
         err = dmu_object_info(rwa->os, drro->drr_object, &doi);
-
-       if (err != 0 && err != ENOENT)
+       if (err != 0 && err != ENOENT && err != EEXIST)
                 return (SET_ERROR(EINVAL));
-       object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT;
  
         if (drro->drr_object > rwa->max_object)
                 rwa->max_object = drro->drr_object;
@@ -2175,13 +2173,56 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
                 nblkptr = deduce_nblkptr(drro->drr_bonustype,
                     drro->drr_bonuslen);
  
+               object = drro->drr_object;
+
                 if (drro->drr_blksz != doi.doi_data_block_size ||
-                   nblkptr < doi.doi_nblkptr) {
+                   nblkptr < doi.doi_nblkptr ||
+                   drro->drr_dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) {
                         err = dmu_free_long_range(rwa->os, drro->drr_object,
                             0, DMU_OBJECT_END);
                         if (err != 0)
                                 return (SET_ERROR(EINVAL));
                 }
+       } else if (err == EEXIST) {
+               /*
+                * The object requested is currently an interior slot of a
+                * multi-slot dnode. This will be resolved when the next txg
+                * is synced out, since the send stream will have told us
+                * to free this slot when we freed the associated dnode
+                * earlier in the stream.
+                */
+               txg_wait_synced(dmu_objset_pool(rwa->os), 0);
+               object = drro->drr_object;
+       } else {
+               /* object is free and we are about to allocate a new one */
+               object = DMU_NEW_OBJECT;
+       }
+
+       /*
+        * If this is a multi-slot dnode there is a chance that this
+        * object will expand into a slot that is already used by
+        * another object from the previous snapshot. We must free
+        * these objects before we attempt to allocate the new dnode.
+        */
+       if (drro->drr_dn_slots > 1) {
+               for (uint64_t slot = drro->drr_object + 1;
+                   slot < drro->drr_object + drro->drr_dn_slots;
+                   slot++) {
+                       dmu_object_info_t slot_doi;
+
+                       err = dmu_object_info(rwa->os, slot, &slot_doi);
+                       if (err == ENOENT || err == EEXIST)
+                               continue;
+                       else if (err != 0)
+                               return (err);
+
+                       err = dmu_free_long_object(rwa->os, slot);
+
+                       if (err != 0)
+                               return (err);
+               }
+
+               txg_wait_synced(dmu_objset_pool(rwa->os), 0);
         }
  
         tx = dmu_tx_create(rwa->os);
@@ -2732,7 +2773,7 @@ receive_read_record(struct receive_arg *ra)
                  * See receive_read_prefetch for an explanation why we're
                  * storing this object in the ignore_obj_list.
                  */
-               if (err == ENOENT ||
+               if (err == ENOENT || err == EEXIST ||
                     (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
                         objlist_insert(&ra->ignore_objlist, drro->drr_object);
                         err = 0;
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c

index e05a4d0a5538722ac97bd2cfce0878631615775e..df6a4872c9cf10c0affeb8a3961c13b35f427ce7 100644 (file)
--- a/module/zfs/dnode.c
+++ b/module/zfs/dnode.c
@@ -55,6 +55,7 @@ dnode_stats_t dnode_stats = {
         { "dnode_hold_free_overflow",           KSTAT_DATA_UINT64 },
         { "dnode_hold_free_refcount",           KSTAT_DATA_UINT64 },
         { "dnode_hold_free_txg",                KSTAT_DATA_UINT64 },
+       { "dnode_free_interior_lock_retry",     KSTAT_DATA_UINT64 },
         { "dnode_allocate",                     KSTAT_DATA_UINT64 },
         { "dnode_reallocate",                   KSTAT_DATA_UINT64 },
         { "dnode_buf_evict",                    KSTAT_DATA_UINT64 },
@@ -516,7 +517,8 @@ dnode_destroy(dnode_t *dn)
         mutex_exit(&os->os_lock);
  
         /* the dnode can no longer move, so we can release the handle */
-       zrl_remove(&dn->dn_handle->dnh_zrlock);
+       if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock))
+               zrl_remove(&dn->dn_handle->dnh_zrlock);
  
         dn->dn_allocated_txg = 0;
         dn->dn_free_txg = 0;
@@ -662,6 +664,8 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
             DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))));
  
         dn_slots = dn_slots > 0 ? dn_slots : DNODE_MIN_SLOTS;
+
+       dnode_free_interior_slots(dn);
         DNODE_STAT_BUMP(dnode_reallocate);
  
         /* clean up any unreferenced dbufs */
@@ -1062,19 +1066,73 @@ dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr)
  }
  
  static boolean_t
-dnode_check_slots(dnode_children_t *children, int idx, int slots, void *ptr)
+dnode_check_slots_free(dnode_children_t *children, int idx, int slots)
  {
         ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
  
         for (int i = idx; i < idx + slots; i++) {
                 dnode_handle_t *dnh = &children->dnc_children[i];
-               if (dnh->dnh_dnode != ptr)
+               dnode_t *dn = dnh->dnh_dnode;
+
+               if (dn == DN_SLOT_FREE) {
+                       continue;
+               } else if (DN_SLOT_IS_PTR(dn)) {
+                       mutex_enter(&dn->dn_mtx);
+                       dmu_object_type_t type = dn->dn_type;
+                       mutex_exit(&dn->dn_mtx);
+
+                       if (type != DMU_OT_NONE)
+                               return (B_FALSE);
+
+                       continue;
+               } else {
                         return (B_FALSE);
+               }
+
+               return (B_FALSE);
         }
  
         return (B_TRUE);
  }
  
+static void
+dnode_reclaim_slots(dnode_children_t *children, int idx, int slots)
+{
+       ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+       for (int i = idx; i < idx + slots; i++) {
+               dnode_handle_t *dnh = &children->dnc_children[i];
+
+               ASSERT(zrl_is_locked(&dnh->dnh_zrlock));
+
+               if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+                       ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE);
+                       dnode_destroy(dnh->dnh_dnode);
+                       dnh->dnh_dnode = DN_SLOT_FREE;
+               }
+       }
+}
+
+void
+dnode_free_interior_slots(dnode_t *dn)
+{
+       dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db);
+       int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT;
+       int idx = (dn->dn_object & (epb - 1)) + 1;
+       int slots = dn->dn_num_slots - 1;
+
+       if (slots == 0)
+               return;
+
+       ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK);
+
+       while (!dnode_slots_tryenter(children, idx, slots))
+               DNODE_STAT_BUMP(dnode_free_interior_lock_retry);
+
+       dnode_set_slots(children, idx, slots, DN_SLOT_FREE);
+       dnode_slots_rele(children, idx, slots);
+}
+
  void
  dnode_special_close(dnode_handle_t *dnh)
  {
@@ -1355,7 +1413,7 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
                 while (dn == DN_SLOT_UNINIT) {
                         dnode_slots_hold(dnc, idx, slots);
  
-                       if (!dnode_check_slots(dnc, idx, slots, DN_SLOT_FREE)) {
+                       if (!dnode_check_slots_free(dnc, idx, slots)) {
                                 DNODE_STAT_BUMP(dnode_hold_free_misses);
                                 dnode_slots_rele(dnc, idx, slots);
                                 dbuf_rele(db, FTAG);
@@ -1368,15 +1426,29 @@ dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots,
                                 continue;
                         }
  
-                       if (!dnode_check_slots(dnc, idx, slots, DN_SLOT_FREE)) {
+                       if (!dnode_check_slots_free(dnc, idx, slots)) {
                                 DNODE_STAT_BUMP(dnode_hold_free_lock_misses);
                                 dnode_slots_rele(dnc, idx, slots);
                                 dbuf_rele(db, FTAG);
                                 return (SET_ERROR(ENOSPC));
                         }
  
+                       /*
+                        * Allocated but otherwise free dnodes which would
+                        * be in the interior of a multi-slot dnodes need
+                        * to be freed.  Single slot dnodes can be safely
+                        * re-purposed as a performance optimization.
+                        */
+                       if (slots > 1)
+                               dnode_reclaim_slots(dnc, idx + 1, slots - 1);
+
                         dnh = &dnc->dnc_children[idx];
-                       dn = dnode_create(os, dn_block + idx, db, object, dnh);
+                       if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+                               dn = dnh->dnh_dnode;
+                       } else {
+                               dn = dnode_create(os, dn_block + idx, db,
+                                   object, dnh);
+                       }
                 }
  
                 mutex_enter(&dn->dn_mtx);
diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c

index 742d962bc2327a8a409e1677663c47ed52687332..8d65e38564921142499e0c8ceb4f9afc0889b674 100644 (file)
--- a/module/zfs/dnode_sync.c
+++ b/module/zfs/dnode_sync.c
@@ -533,6 +533,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
         if (dn->dn_allocated_txg != dn->dn_free_txg)
                 dmu_buf_will_dirty(&dn->dn_dbuf->db, tx);
         bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots);
+       dnode_free_interior_slots(dn);
  
         mutex_enter(&dn->dn_mtx);
         dn->dn_type = DMU_OT_NONE;
@@ -540,6 +541,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)
         dn->dn_allocated_txg = 0;
         dn->dn_free_txg = 0;
         dn->dn_have_spill = B_FALSE;
+       dn->dn_num_slots = 1;
         mutex_exit(&dn->dn_mtx);
  
         ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run

index 69e9eb267dfc5bef5826b733c4c1b4441911ae2e..d8fe6f3ab0d2b8f1ea52886cf8dc07e862b6910d 100644 (file)
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -605,7 +605,7 @@ tests = ['rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos',
      'send-c_lz4_disabled', 'send-c_recv_lz4_disabled',
      'send-c_mixed_compression', 'send-c_stream_size_estimate', 'send-cD',
      'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize',
-    'send-c_recv_dedup', 'send_freeobjects']
+    'send-c_recv_dedup', 'send_freeobjects', 'send_realloc_dnode_size']
  tags = ['functional', 'rsend']
  
  [tests/functional/scrub_mirror]
diff --git a/tests/zfs-tests/tests/functional/rsend/Makefile.am b/tests/zfs-tests/tests/functional/rsend/Makefile.am

index 6b1aa8b3203d6454f7a4f651a917cd64020f0bf9..a2837d1a63fca2d2f54abc4d75fa04f559800865 100644 (file)
--- a/tests/zfs-tests/tests/functional/rsend/Makefile.am
+++ b/tests/zfs-tests/tests/functional/rsend/Makefile.am
@@ -36,7 +36,8 @@ dist_pkgdata_SCRIPTS = \
         send-c_volume.ksh \
         send-c_zstreamdump.ksh \
         send-cpL_varied_recsize.ksh \
-       send_freeobjects.ksh
+       send_freeobjects.ksh \
+       send_realloc_dnode_size.ksh
  
  dist_pkgdata_DATA = \
         rsend.cfg \
diff --git a/tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh b/tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh

new file mode 100644 (file)

index 0000000..2067639
--- /dev/null
+++ b/tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh
@@ -0,0 +1,98 @@
+#!/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2017 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# Description:
+# Verify incremental receive properly handles objects with changed
+# dnode slot count.
+#
+# Strategy:
+# 1. Populate a dataset with 1k byte dnodes and snapshot
+# 2. Remove objects, set dnodesize=legacy, and remount dataset so new objects
+#    get recycled numbers and formerly "interior" dnode slots get assigned
+#    to new objects
+# 3. Remove objects, set dnodesize=2k, and remount dataset so new objects
+#    overlap with recently recycled and formerly "normal" dnode slots get
+#    assigned to new objects
+# 4. Generate initial and incremental streams
+# 5. Verify initial and incremental streams can be received
+#
+
+verify_runnable "both"
+
+log_assert "Verify incremental receive handles objects with changed dnode size"
+
+function cleanup
+{
+       rm -f $BACKDIR/fs-dn-legacy
+       rm -f $BACKDIR/fs-dn-1k
+       rm -f $BACKDIR/fs-dn-2k
+
+       if datasetexists $POOL/fs ; then
+               log_must zfs destroy -rR $POOL/fs
+       fi
+
+       if datasetexists $POOL/newfs ; then
+               log_must zfs destroy -rR $POOL/newfs
+       fi
+}
+
+log_onexit cleanup
+
+# 1. Populate a dataset with 1k byte dnodes and snapshot
+log_must zfs create -o dnodesize=1k $POOL/fs
+log_must mk_files 200 262144 0 $POOL/fs
+log_must zfs snapshot $POOL/fs@a
+
+# 2. Remove objects, set dnodesize=legacy, and remount dataset so new objects
+#    get recycled numbers and formerly "interior" dnode slots get assigned
+#    to new objects
+rm /$POOL/fs/*
+
+log_must zfs unmount $POOL/fs
+log_must zfs set dnodesize=legacy $POOL/fs
+log_must zfs mount $POOL/fs
+
+log_must mk_files 200 262144 0 $POOL/fs
+log_must zfs snapshot $POOL/fs@b
+
+# 3. Remove objects, set dnodesize=2k, and remount dataset so new objects
+#    overlap with recently recycled and formerly "normal" dnode slots get
+#    assigned to new objects
+rm /$POOL/fs/*
+
+log_must zfs unmount $POOL/fs
+log_must zfs set dnodesize=2k $POOL/fs
+log_must zfs mount $POOL/fs
+
+mk_files 200 262144 0 $POOL/fs
+log_must zfs snapshot $POOL/fs@c
+
+# 4. Generate initial and incremental streams
+log_must eval "zfs send $POOL/fs@a > $BACKDIR/fs-dn-1k"
+log_must eval "zfs send -i $POOL/fs@a $POOL/fs@b > $BACKDIR/fs-dn-legacy"
+log_must eval "zfs send -i $POOL/fs@b $POOL/fs@c > $BACKDIR/fs-dn-2k"
+
+# 5. Verify initial and incremental streams can be received
+log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs-dn-1k"
+log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs-dn-legacy"
+log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs-dn-2k"
+
+log_pass "Verify incremental receive handles objects with changed dnode size"
author	Tim Chase <tim@chase2k.com>
	Mon, 27 Aug 2018 14:28:32 +0000 (10:28 -0400)
committer	Tony Hutter <hutter2@llnl.gov>
	Fri, 6 Jul 2018 09:46:51 +0000 (02:46 -0700)
cmd/ztest/ztest.c		patch \| blob \| blame \| history
include/sys/dnode.h		patch \| blob \| blame \| history
lib/libzfs/libzfs_sendrecv.c		patch \| blob \| blame \| history
module/zfs/dmu_object.c		patch \| blob \| blame \| history
module/zfs/dmu_send.c		patch \| blob \| blame \| history
module/zfs/dnode.c		patch \| blob \| blame \| history
module/zfs/dnode_sync.c		patch \| blob \| blame \| history
tests/runfiles/linux.run		patch \| blob \| blame \| history
tests/zfs-tests/tests/functional/rsend/Makefile.am		patch \| blob \| blame \| history
tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh	[new file with mode: 0644]	patch \| blob