Fix 2 small bugs with cached dsl_scan_phys_t

[mirror_zfs.git] / module / zfs / dsl_scan.c
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c

index c19a1b75cd64e309d1855ee15c868c185a14519d..61d42deca6524eec7e8c1758bc2b917912eeb77a 100644 (file)
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -175,6 +175,8 @@ enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
  /* max number of blocks to free in a single TXG */
  unsigned long zfs_async_block_max_blocks = 100000;
  
+int zfs_resilver_disable_defer = 0; /* set to disable resilver deferring */
+
  /*
   * We wait a few txgs after importing a pool to begin scanning so that
   * the import / mounting code isn't held up by scrub / resilver IO.
@@ -273,7 +275,7 @@ struct dsl_scan_io_queue {
  
  /* private data for dsl_scan_prefetch_cb() */
  typedef struct scan_prefetch_ctx {
-       refcount_t spc_refcnt;          /* refcount for memory management */
+       zfs_refcount_t spc_refcnt;      /* refcount for memory management */
         dsl_scan_t *spc_scn;            /* dsl_scan_t for the pool */
         boolean_t spc_root;             /* is this prefetch for an objset? */
         uint8_t spc_indblkshift;        /* dn_indblkshift of current dnode */
@@ -388,7 +390,6 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
         scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit *
             dsl_scan_count_leaves(spa->spa_root_vdev), 1ULL << 20);
  
-       bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));
         avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
             offsetof(scan_ds_t, sds_node));
         avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare,
@@ -482,6 +483,8 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
                 }
         }
  
+       bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));
+
         /* reload the queue into the in-core state */
         if (scn->scn_phys.scn_queue_obj != 0) {
                 zap_cursor_t zc;
@@ -720,6 +723,11 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
         spa->spa_scrub_reopen = B_FALSE;
         (void) spa_vdev_state_exit(spa, NULL, 0);
  
+       if (func == POOL_SCAN_RESILVER) {
+               dsl_resilver_restart(spa->spa_dsl_pool, 0);
+               return (0);
+       }
+
         if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) {
                 /* got scrub start cmd, resume paused scrub */
                 int err = dsl_scrub_set_pause_resume(scn->scn_dp,
@@ -733,7 +741,42 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
         }
  
         return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
-           dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE));
+           dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED));
+}
+
+/*
+ * Sets the resilver defer flag to B_FALSE on all leaf devs under vd. Returns
+ * B_TRUE if we have devices that need to be resilvered and are available to
+ * accept resilver I/Os.
+ */
+static boolean_t
+dsl_scan_clear_deferred(vdev_t *vd, dmu_tx_t *tx)
+{
+       boolean_t resilver_needed = B_FALSE;
+       spa_t *spa = vd->vdev_spa;
+
+       for (int c = 0; c < vd->vdev_children; c++) {
+               resilver_needed |=
+                   dsl_scan_clear_deferred(vd->vdev_child[c], tx);
+       }
+
+       if (vd == spa->spa_root_vdev &&
+           spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) {
+               spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
+               vdev_config_dirty(vd);
+               spa->spa_resilver_deferred = B_FALSE;
+               return (resilver_needed);
+       }
+
+       if (!vdev_is_concrete(vd) || vd->vdev_aux ||
+           !vd->vdev_ops->vdev_op_leaf)
+               return (resilver_needed);
+
+       if (vd->vdev_resilver_deferred)
+               vd->vdev_resilver_deferred = B_FALSE;
+
+       return (!vdev_is_dead(vd) && !vd->vdev_offline &&
+           vdev_resilver_needed(vd, NULL, NULL));
  }
  
  /* ARGSUSED */
@@ -810,13 +853,23 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
                  * If the scrub/resilver completed, update all DTLs to
                  * reflect this.  Whether it succeeded or not, vacate
                  * all temporary scrub DTLs.
+                *
+                * As the scrub does not currently support traversing
+                * data that have been freed but are part of a checkpoint,
+                * we don't mark the scrub as done in the DTLs as faults
+                * may still exist in those vdevs.
                  */
-               vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
-                   complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE);
-               if (complete) {
+               if (complete &&
+                   !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+                       vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
+                           scn->scn_phys.scn_max_txg, B_TRUE);
+
                         spa_event_notify(spa, NULL, NULL,
                             scn->scn_phys.scn_min_txg ?
                             ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
+               } else {
+                       vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
+                           0, B_TRUE);
                 }
                 spa_errlog_rotate(spa);
  
@@ -825,6 +878,25 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
                  * Let the async thread assess this and handle the detach.
                  */
                 spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
+
+               /*
+                * Clear any deferred_resilver flags in the config.
+                * If there are drives that need resilvering, kick
+                * off an asynchronous request to start resilver.
+                * dsl_scan_clear_deferred() may update the config
+                * before the resilver can restart. In the event of
+                * a crash during this period, the spa loading code
+                * will find the drives that need to be resilvered
+                * when the machine reboots and start the resilver then.
+                */
+               boolean_t resilver_needed =
+                   dsl_scan_clear_deferred(spa->spa_root_vdev, tx);
+               if (resilver_needed) {
+                       spa_history_log_internal(spa,
+                           "starting deferred resilver", tx,
+                           "errors=%llu", spa_get_errlog_size(spa));
+                       spa_async_request(spa, SPA_ASYNC_RESILVER);
+               }
         }
  
         scn->scn_phys.scn_end_time = gethrestime_sec();
@@ -898,6 +970,7 @@ dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
                 /* can't pause a scrub when there is no in-progress scrub */
                 spa->spa_scan_pass_scrub_pause = gethrestime_sec();
                 scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED;
+               scn->scn_phys_cached.scn_flags |= DSF_SCRUB_PAUSED;
                 dsl_scan_sync_state(scn, tx, SYNC_CACHED);
                 spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED);
         } else {
@@ -912,6 +985,7 @@ dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
                             gethrestime_sec() - spa->spa_scan_pass_scrub_pause;
                         spa->spa_scan_pass_scrub_pause = 0;
                         scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
+                       scn->scn_phys_cached.scn_flags &= ~DSF_SCRUB_PAUSED;
                         dsl_scan_sync_state(scn, tx, SYNC_CACHED);
                 }
         }
@@ -1217,7 +1291,7 @@ dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
          * (on-disk) even if it hasn't been claimed (even though for
          * scrub there's nothing to do to it).
          */
-       if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa))
+       if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(dp->dp_spa))
                 return (0);
  
         SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET],
@@ -1268,11 +1342,13 @@ dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
         zil_scan_arg_t zsa = { dp, zh };
         zilog_t *zilog;
  
+       ASSERT(spa_writeable(dp->dp_spa));
+
         /*
          * We only want to visit blocks that have been claimed but not yet
          * replayed (or, in read-only mode, blocks that *would* be claimed).
          */
-       if (claim_txg == 0 && spa_writeable(dp->dp_spa))
+       if (claim_txg == 0)
                 return;
  
         zilog = zil_alloc(dp->dp_meta_objset, zh);
@@ -1302,8 +1378,8 @@ scan_prefetch_queue_compare(const void *a, const void *b)
  static void
  scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, void *tag)
  {
-       if (refcount_remove(&spc->spc_refcnt, tag) == 0) {
-               refcount_destroy(&spc->spc_refcnt);
+       if (zfs_refcount_remove(&spc->spc_refcnt, tag) == 0) {
+               zfs_refcount_destroy(&spc->spc_refcnt);
                 kmem_free(spc, sizeof (scan_prefetch_ctx_t));
         }
  }
@@ -1314,8 +1390,8 @@ scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, void *tag)
         scan_prefetch_ctx_t *spc;
  
         spc = kmem_alloc(sizeof (scan_prefetch_ctx_t), KM_SLEEP);
-       refcount_create(&spc->spc_refcnt);
-       refcount_add(&spc->spc_refcnt, tag);
+       zfs_refcount_create(&spc->spc_refcnt);
+       zfs_refcount_add(&spc->spc_refcnt, tag);
         spc->spc_scn = scn;
         if (dnp != NULL) {
                 spc->spc_datablkszsec = dnp->dn_datablkszsec;
@@ -1333,7 +1409,7 @@ scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, void *tag)
  static void
  scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, void *tag)
  {
-       refcount_add(&spc->spc_refcnt, tag);
+       zfs_refcount_add(&spc->spc_refcnt, tag);
  }
  
  static boolean_t
@@ -2313,6 +2389,20 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
         if (!dsl_scan_is_running(scn))
                 return;
  
+       /*
+        * This function is special because it is the only thing
+        * that can add scan_io_t's to the vdev scan queues from
+        * outside dsl_scan_sync(). For the most part this is ok
+        * as long as it is called from within syncing context.
+        * However, dsl_scan_sync() expects that no new sio's will
+        * be added between when all the work for a scan is done
+        * and the next txg when the scan is actually marked as
+        * completed. This check ensures we do not issue new sio's
+        * during this period.
+        */
+       if (scn->scn_done_txg != 0)
+               return;
+
         for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
                 if (ddp->ddp_phys_birth == 0 ||
                     ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
@@ -2954,6 +3044,26 @@ dsl_scan_active(dsl_scan_t *scn)
         return (used != 0);
  }
  
+static boolean_t
+dsl_scan_check_deferred(vdev_t *vd)
+{
+       boolean_t need_resilver = B_FALSE;
+
+       for (int c = 0; c < vd->vdev_children; c++) {
+               need_resilver |=
+                   dsl_scan_check_deferred(vd->vdev_child[c]);
+       }
+
+       if (!vdev_is_concrete(vd) || vd->vdev_aux ||
+           !vd->vdev_ops->vdev_op_leaf)
+               return (need_resilver);
+
+       if (!vd->vdev_resilver_deferred)
+               need_resilver = B_TRUE;
+
+       return (need_resilver);
+}
+
  static boolean_t
  dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
      uint64_t phys_birth)
@@ -3001,82 +3111,26 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
         if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize))
                 return (B_FALSE);
  
+       /*
+        * Check that this top-level vdev has a device under it which
+        * is resilvering and is not deferred.
+        */
+       if (!dsl_scan_check_deferred(vd))
+               return (B_FALSE);
+
         return (B_TRUE);
  }
  
-/*
- * This is the primary entry point for scans that is called from syncing
- * context. Scans must happen entirely during syncing context so that we
- * cna guarantee that blocks we are currently scanning will not change out
- * from under us. While a scan is active, this function controls how quickly
- * transaction groups proceed, instead of the normal handling provided by
- * txg_sync_thread().
- */
-void
-dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
+static int
+dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx)
  {
-       int err = 0;
         dsl_scan_t *scn = dp->dp_scan;
         spa_t *spa = dp->dp_spa;
-       state_sync_type_t sync_type = SYNC_OPTIONAL;
-
-       /*
-        * Check for scn_restart_txg before checking spa_load_state, so
-        * that we can restart an old-style scan while the pool is being
-        * imported (see dsl_scan_init).
-        */
-       if (dsl_scan_restarting(scn, tx)) {
-               pool_scan_func_t func = POOL_SCAN_SCRUB;
-               dsl_scan_done(scn, B_FALSE, tx);
-               if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
-                       func = POOL_SCAN_RESILVER;
-               zfs_dbgmsg("restarting scan func=%u txg=%llu",
-                   func, (longlong_t)tx->tx_txg);
-               dsl_scan_setup_sync(&func, tx);
-       }
-
-       /*
-        * Only process scans in sync pass 1.
-        */
-       if (spa_sync_pass(spa) > 1)
-               return;
-
-       /*
-        * If the spa is shutting down, then stop scanning. This will
-        * ensure that the scan does not dirty any new data during the
-        * shutdown phase.
-        */
-       if (spa_shutting_down(spa))
-               return;
-
-       /*
-        * If the scan is inactive due to a stalled async destroy, try again.
-        */
-       if (!scn->scn_async_stalled && !dsl_scan_active(scn))
-               return;
+       int err = 0;
  
-       /* reset scan statistics */
-       scn->scn_visited_this_txg = 0;
-       scn->scn_holes_this_txg = 0;
-       scn->scn_lt_min_this_txg = 0;
-       scn->scn_gt_max_this_txg = 0;
-       scn->scn_ddt_contained_this_txg = 0;
-       scn->scn_objsets_visited_this_txg = 0;
-       scn->scn_avg_seg_size_this_txg = 0;
-       scn->scn_segs_this_txg = 0;
-       scn->scn_avg_zio_size_this_txg = 0;
-       scn->scn_zios_this_txg = 0;
-       scn->scn_suspending = B_FALSE;
-       scn->scn_sync_start_time = gethrtime();
-       spa->spa_scrub_active = B_TRUE;
+       if (spa_suspend_async_destroy(spa))
+               return (0);
  
-       /*
-        * First process the async destroys.  If we suspend, don't do
-        * any scrubbing or resilvering.  This ensures that there are no
-        * async destroys while we are scanning, so the scan code doesn't
-        * have to worry about traversing it.  It is also faster to free the
-        * blocks than to scrub them.
-        */
         if (zfs_free_bpobj_enabled &&
             spa_version(spa) >= SPA_VERSION_DEADLISTS) {
                 scn->scn_is_bptree = B_FALSE;
@@ -3152,7 +3206,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
                 ddt_sync(spa, tx->tx_txg);
         }
         if (err != 0)
-               return;
+               return (err);
         if (dp->dp_free_dir != NULL && !scn->scn_async_destroying &&
             zfs_free_leak_on_eio &&
             (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 ||
@@ -3205,6 +3259,92 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
                 if (bpobj_is_empty(&dp->dp_obsolete_bpobj))
                         dsl_pool_destroy_obsolete_bpobj(dp, tx);
         }
+       return (0);
+}
+
+/*
+ * This is the primary entry point for scans that is called from syncing
+ * context. Scans must happen entirely during syncing context so that we
+ * can guarantee that blocks we are currently scanning will not change out
+ * from under us. While a scan is active, this function controls how quickly
+ * transaction groups proceed, instead of the normal handling provided by
+ * txg_sync_thread().
+ */
+void
+dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+       int err = 0;
+       dsl_scan_t *scn = dp->dp_scan;
+       spa_t *spa = dp->dp_spa;
+       state_sync_type_t sync_type = SYNC_OPTIONAL;
+
+       if (spa->spa_resilver_deferred &&
+           !spa_feature_is_active(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))
+               spa_feature_incr(spa, SPA_FEATURE_RESILVER_DEFER, tx);
+
+       /*
+        * Check for scn_restart_txg before checking spa_load_state, so
+        * that we can restart an old-style scan while the pool is being
+        * imported (see dsl_scan_init). We also restart scans if there
+        * is a deferred resilver and the user has manually disabled
+        * deferred resilvers via the tunable.
+        */
+       if (dsl_scan_restarting(scn, tx) ||
+           (spa->spa_resilver_deferred && zfs_resilver_disable_defer)) {
+               pool_scan_func_t func = POOL_SCAN_SCRUB;
+               dsl_scan_done(scn, B_FALSE, tx);
+               if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
+                       func = POOL_SCAN_RESILVER;
+               zfs_dbgmsg("restarting scan func=%u txg=%llu",
+                   func, (longlong_t)tx->tx_txg);
+               dsl_scan_setup_sync(&func, tx);
+       }
+
+       /*
+        * Only process scans in sync pass 1.
+        */
+       if (spa_sync_pass(spa) > 1)
+               return;
+
+       /*
+        * If the spa is shutting down, then stop scanning. This will
+        * ensure that the scan does not dirty any new data during the
+        * shutdown phase.
+        */
+       if (spa_shutting_down(spa))
+               return;
+
+       /*
+        * If the scan is inactive due to a stalled async destroy, try again.
+        */
+       if (!scn->scn_async_stalled && !dsl_scan_active(scn))
+               return;
+
+       /* reset scan statistics */
+       scn->scn_visited_this_txg = 0;
+       scn->scn_holes_this_txg = 0;
+       scn->scn_lt_min_this_txg = 0;
+       scn->scn_gt_max_this_txg = 0;
+       scn->scn_ddt_contained_this_txg = 0;
+       scn->scn_objsets_visited_this_txg = 0;
+       scn->scn_avg_seg_size_this_txg = 0;
+       scn->scn_segs_this_txg = 0;
+       scn->scn_avg_zio_size_this_txg = 0;
+       scn->scn_zios_this_txg = 0;
+       scn->scn_suspending = B_FALSE;
+       scn->scn_sync_start_time = gethrtime();
+       spa->spa_scrub_active = B_TRUE;
+
+       /*
+        * First process the async destroys.  If we suspend, don't do
+        * any scrubbing or resilvering.  This ensures that there are no
+        * async destroys while we are scanning, so the scan code doesn't
+        * have to worry about traversing it.  It is also faster to free the
+        * blocks than to scrub them.
+        */
+       err = dsl_process_async_destroys(dp, tx);
+       if (err != 0)
+               return;
  
         if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn))
                 return;
@@ -3345,6 +3485,8 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
                             (longlong_t)tx->tx_txg);
                 }
         } else if (scn->scn_is_sorted && scn->scn_bytes_pending != 0) {
+               ASSERT(scn->scn_clearing);
+
                 /* need to issue scrubbing IOs from per-vdev queues */
                 scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
                     NULL, ZIO_FLAG_CANFAIL);
@@ -3537,15 +3679,16 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
         boolean_t needs_io = B_FALSE;
         int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
  
-       if (phys_birth <= scn->scn_phys.scn_min_txg ||
-           phys_birth >= scn->scn_phys.scn_max_txg)
-               return (0);
  
-       if (BP_IS_EMBEDDED(bp)) {
+       if (phys_birth <= scn->scn_phys.scn_min_txg ||
+           phys_birth >= scn->scn_phys.scn_max_txg) {
                 count_block(scn, dp->dp_blkstats, bp);
                 return (0);
         }
  
+       /* Embedded BP's have phys_birth==0, so we reject them above. */
+       ASSERT(!BP_IS_EMBEDDED(bp));
+
         ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
         if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
                 zio_flags |= ZIO_FLAG_SCRUB;
@@ -3909,7 +4052,7 @@ dsl_scan_freed(spa_t *spa, const blkptr_t *bp)
                 dsl_scan_freed_dva(spa, bp, i);
  }
  
-#if defined(_KERNEL) && defined(HAVE_SPL)
+#if defined(_KERNEL)
  /* CSTYLED */
  module_param(zfs_scan_vdev_limit, ulong, 0644);
  MODULE_PARM_DESC(zfs_scan_vdev_limit,
@@ -3971,4 +4114,8 @@ MODULE_PARM_DESC(zfs_scan_strict_mem_lim,
  module_param(zfs_scan_fill_weight, int, 0644);
  MODULE_PARM_DESC(zfs_scan_fill_weight,
         "Tunable to adjust bias towards more filled segments during scans");
+
+module_param(zfs_resilver_disable_defer, int, 0644);
+MODULE_PARM_DESC(zfs_resilver_disable_defer,
+       "Process all resilvers immediately");
  #endif