OpenZFS 7614, 9064 - zfs device evacuation/removal

[mirror_zfs.git] / module / zfs / dsl_scan.c
diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c

index fc0c24e1c31505ed1405bee2dffa4ddd66203178..53953a6c5851a19dc88bcf9158ef2eda3864cdaa 100644 (file)
--- a/module/zfs/dsl_scan.c
+++ b/module/zfs/dsl_scan.c
@@ -20,7 +20,7 @@
   */
  /*
   * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
   * Copyright 2016 Gary Mills
   * Copyright (c) 2017 Datto Inc.
   * Copyright 2017 Joyent, Inc.
@@ -95,7 +95,7 @@
   * needs to be notified whenever a block is freed. This is needed to allow
   * the scanning code to remove these I/Os from the issuing queue. Additionally,
   * we do not attempt to queue gang blocks to be issued sequentially since this
- * is very hard to do and would have an extremely limitted performance benefit.
+ * is very hard to do and would have an extremely limited performance benefit.
   * Instead, we simply issue gang I/Os as soon as we find them using the legacy
   * algorithm.
   *
@@ -165,6 +165,7 @@ int zfs_scan_mem_lim_fact = 20;             /* fraction of physmem */
  int zfs_scan_mem_lim_soft_fact = 20;   /* fraction of mem lim above */
  
  int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */
+int zfs_obsolete_min_time_ms = 500; /* min millisecs to obsolete per txg */
  int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
  int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
  int zfs_scan_checkpoint_intval = 7200; /* in seconds */
@@ -172,7 +173,7 @@ int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
  int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
  enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
  /* max number of blocks to free in a single TXG */
-unsigned long zfs_free_max_blocks = 100000;
+unsigned long zfs_async_block_max_blocks = 100000;
  
  /*
   * We wait a few txgs after importing a pool to begin scanning so that
@@ -304,9 +305,9 @@ scan_init(void)
          * This is used in ext_size_compare() to weight segments
          * based on how sparse they are. This cannot be changed
          * mid-scan and the tree comparison functions don't currently
-        * have a mechansim for passing additional context to the
+        * have a mechanism for passing additional context to the
          * compare functions. Thus we store this value globally and
-        * we only allow it to be set at module intiailization time
+        * we only allow it to be set at module initialization time
          */
         fill_weight = zfs_scan_fill_weight;
  
@@ -1176,14 +1177,15 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
                             (longlong_t)zb->zb_blkid);
                         scn->scn_phys.scn_bookmark = *zb;
                 } else {
+#ifdef ZFS_DEBUG
                         dsl_scan_phys_t *scnp = &scn->scn_phys;
-
                         dprintf("suspending at at DDT bookmark "
                             "%llx/%llx/%llx/%llx\n",
                             (longlong_t)scnp->scn_ddt_bookmark.ddb_class,
                             (longlong_t)scnp->scn_ddt_bookmark.ddb_type,
                             (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum,
                             (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor);
+#endif
                 }
                 scn->scn_suspending = B_TRUE;
                 return (B_TRUE);
@@ -1438,7 +1440,7 @@ dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
         dsl_scan_t *scn = spc->spc_scn;
         spa_t *spa = scn->scn_dp->dp_spa;
  
-       /* broadcast that the IO has completed for rate limitting purposes */
+       /* broadcast that the IO has completed for rate limiting purposes */
         mutex_enter(&spa->spa_scrub_lock);
         ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp));
         spa->spa_scrub_inflight -= BP_GET_PSIZE(bp);
@@ -1446,7 +1448,7 @@ dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
         mutex_exit(&spa->spa_scrub_lock);
  
         /* if there was an error or we are done prefetching, just cleanup */
-       if (buf == NULL || scn->scn_suspending)
+       if (buf == NULL || scn->scn_prefetch_stop)
                 goto out;
  
         if (BP_GET_LEVEL(bp) > 0) {
@@ -1684,11 +1686,15 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
  
                 if (OBJSET_BUF_HAS_USERUSED(buf)) {
                         /*
-                        * We also always visit user/group accounting
+                        * We also always visit user/group/project accounting
                          * objects, and never skip them, even if we are
                          * suspending. This is necessary so that the
                          * space deltas from this txg get integrated.
                          */
+                       if (OBJSET_BUF_HAS_PROJECTUSED(buf))
+                               dsl_scan_visitdnode(scn, ds, osp->os_type,
+                                   &osp->os_projectused_dnode,
+                                   DMU_PROJECTUSED_OBJECT, tx);
                         dsl_scan_visitdnode(scn, ds, osp->os_type,
                             &osp->os_groupused_dnode,
                             DMU_GROUPUSED_OBJECT, tx);
@@ -2107,7 +2113,6 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
  {
         dsl_pool_t *dp = scn->scn_dp;
         dsl_dataset_t *ds;
-       objset_t *os;
  
         VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
  
@@ -2151,18 +2156,23 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
                 goto out;
         }
  
-       if (dmu_objset_from_ds(ds, &os))
-               goto out;
-
         /*
-        * Only the ZIL in the head (non-snapshot) is valid.  Even though
+        * Only the ZIL in the head (non-snapshot) is valid. Even though
          * snapshots can have ZIL block pointers (which may be the same
-        * BP as in the head), they must be ignored.  So we traverse the
-        * ZIL here, rather than in scan_recurse(), because the regular
-        * snapshot block-sharing rules don't apply to it.
+        * BP as in the head), they must be ignored. In addition, $ORIGIN
+        * doesn't have an objset (i.e. its ds_bp is a hole) so we don't
+        * need to look for a ZIL in it either. So we traverse the ZIL here,
+        * rather than in scan_recurse(), because the regular snapshot
+        * block-sharing rules don't apply to it.
          */
-       if (!ds->ds_is_snapshot)
+       if (!dsl_dataset_is_snapshot(ds) &&
+           ds->ds_dir != dp->dp_origin_snap->ds_dir) {
+               objset_t *os;
+               if (dmu_objset_from_ds(ds, &os) != 0) {
+                       goto out;
+               }
                 dsl_scan_zil(dp, &os->os_zil_header);
+       }
  
         /*
          * Iterate over the bps in this ds.
@@ -2201,7 +2211,7 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
         }
  
         /*
-        * Add descendent datasets to work queue.
+        * Add descendant datasets to work queue.
          */
         if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
                 scan_ds_queue_insert(scn,
@@ -2548,11 +2558,11 @@ scan_io_queue_check_suspend(dsl_scan_t *scn)
  }
  
  /*
- * Given a list of scan_io_t's in io_list, this issues the io's out to
+ * Given a list of scan_io_t's in io_list, this issues the I/Os out to
   * disk. This consumes the io_list and frees the scan_io_t's. This is
   * called when emptying queues, either when we're up against the memory
   * limit or when we have finished scanning. Returns B_TRUE if we stopped
- * processing the list before we finished. Any zios that were not issued
+ * processing the list before we finished. Any sios that were not issued
   * will remain in the io_list.
   */
  static boolean_t
@@ -2648,7 +2658,7 @@ scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
  
  /*
   * This is called from the queue emptying thread and selects the next
- * extent from which we are to issue io's. The behavior of this function
+ * extent from which we are to issue I/Os. The behavior of this function
   * depends on the state of the scan, the current memory consumption and
   * whether or not we are performing a scan shutdown.
   * 1) We select extents in an elevator algorithm (LBA-order) if the scan
@@ -2783,7 +2793,7 @@ scan_io_queues_run_one(void *arg)
   * Performs an emptying run on all scan queues in the pool. This just
   * punches out one thread per top-level vdev, each of which processes
   * only that vdev's scan queue. We can parallelize the I/O here because
- * we know that each queue's io's only affect its own top-level vdev.
+ * we know that each queue's I/Os only affect its own top-level vdev.
   *
   * This function waits for the queue runs to complete, and must be
   * called from dsl_scan_sync (or in general, syncing context).
@@ -2826,7 +2836,7 @@ scan_io_queues_run(dsl_scan_t *scn)
         }
  
         /*
-        * Wait for the queues to finish issuing thir IOs for this run
+        * Wait for the queues to finish issuing their IOs for this run
          * before we return. There may still be IOs in flight at this
          * point.
          */
@@ -2834,19 +2844,19 @@ scan_io_queues_run(dsl_scan_t *scn)
  }
  
  static boolean_t
-dsl_scan_free_should_suspend(dsl_scan_t *scn)
+dsl_scan_async_block_should_pause(dsl_scan_t *scn)
  {
         uint64_t elapsed_nanosecs;
  
         if (zfs_recover)
                 return (B_FALSE);
  
-       if (scn->scn_visited_this_txg >= zfs_free_max_blocks)
+       if (scn->scn_visited_this_txg >= zfs_async_block_max_blocks)
                 return (B_TRUE);
  
         elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
         return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout ||
-           (NSEC2MSEC(elapsed_nanosecs) > zfs_free_min_time_ms &&
+           (NSEC2MSEC(elapsed_nanosecs) > scn->scn_async_block_min_time_ms &&
             txg_sync_waiting(scn->scn_dp)) ||
             spa_shutting_down(scn->scn_dp->dp_spa));
  }
@@ -2858,7 +2868,7 @@ dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
  
         if (!scn->scn_is_bptree ||
             (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) {
-               if (dsl_scan_free_should_suspend(scn))
+               if (dsl_scan_async_block_should_pause(scn))
                         return (SET_ERROR(ERESTART));
         }
  
@@ -2906,6 +2916,22 @@ dsl_scan_update_stats(dsl_scan_t *scn)
         scn->scn_zios_this_txg = zio_count_total;
  }
  
+static int
+dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{ /* bpobj_iterate() callback: marks one block's first DVA obsolete */
+       dsl_scan_t *scn = arg; /* the dsl_scan_t handed to bpobj_iterate() */
+       const dva_t *dva = &bp->blk_dva[0]; /* only DVA 0 is examined */
+
+       if (dsl_scan_async_block_should_pause(scn)) /* per-txg budget check */
+               return (SET_ERROR(ERESTART)); /* dsl_scan_sync() tolerates it */
+
+       spa_vdev_indirect_mark_obsolete(scn->scn_dp->dp_spa,
+           DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva),
+           DVA_GET_ASIZE(dva), tx);
+       scn->scn_visited_this_txg++; /* compared to the max-blocks tunable */
+       return (0);
+}
+
  boolean_t
  dsl_scan_active(dsl_scan_t *scn)
  {
@@ -2970,7 +2996,7 @@ dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
   * This is the primary entry point for scans that is called from syncing
   * context. Scans must happen entirely during syncing context so that we
   * cna guarantee that blocks we are currently scanning will not change out
- * from under us. While a scan is active, this funciton controls how quickly
+ * from under us. While a scan is active, this function controls how quickly
   * transaction groups proceed, instead of the normal handling provided by
   * txg_sync_thread().
   */
@@ -3042,6 +3068,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
         if (zfs_free_bpobj_enabled &&
             spa_version(spa) >= SPA_VERSION_DEADLISTS) {
                 scn->scn_is_bptree = B_FALSE;
+               scn->scn_async_block_min_time_ms = zfs_free_min_time_ms;
                 scn->scn_zio_root = zio_root(spa, NULL,
                     NULL, ZIO_FLAG_MUSTSUCCEED);
                 err = bpobj_iterate(&dp->dp_free_bpobj,
@@ -3141,6 +3168,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
                     -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes,
                     -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx);
         }
+
         if (dp->dp_free_dir != NULL && !scn->scn_async_destroying) {
                 /* finished; verify that space accounting went to zero */
                 ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes);
@@ -3148,6 +3176,24 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
                 ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes);
         }
  
+       EQUIV(bpobj_is_open(&dp->dp_obsolete_bpobj),
+           0 == zap_contains(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+           DMU_POOL_OBSOLETE_BPOBJ));
+       if (err == 0 && bpobj_is_open(&dp->dp_obsolete_bpobj)) {
+               ASSERT(spa_feature_is_active(dp->dp_spa,
+                   SPA_FEATURE_OBSOLETE_COUNTS));
+
+               scn->scn_is_bptree = B_FALSE;
+               scn->scn_async_block_min_time_ms = zfs_obsolete_min_time_ms;
+               err = bpobj_iterate(&dp->dp_obsolete_bpobj,
+                   dsl_scan_obsolete_block_cb, scn, tx);
+               if (err != 0 && err != ERESTART)
+                       zfs_panic_recover("error %u from bpobj_iterate()", err);
+
+               if (bpobj_is_empty(&dp->dp_obsolete_bpobj))
+                       dsl_pool_destroy_obsolete_bpobj(dp, tx);
+       }
+
         if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn))
                 return;
  
@@ -3178,7 +3224,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
                 /*
                  * If we are over our checkpoint interval, set scn_clearing
                  * so that we can begin checkpointing immediately. The
-                * checkpoint allows us to save a consisent bookmark
+                * checkpoint allows us to save a consistent bookmark
                  * representing how much data we have scrubbed so far.
                  * Otherwise, use the memory limit to determine if we should
                  * scan for metadata or start issue scrub IOs. We accumulate
@@ -3680,8 +3726,7 @@ scan_io_queue_create(vdev_t *vd)
         q->q_vd = vd;
         cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL);
         q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops,
-           &q->q_exts_by_size, ext_size_compare,
-           &q->q_vd->vdev_scan_io_queue_lock, zfs_scan_max_ext_gap);
+           &q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap);
         avl_create(&q->q_sios_by_addr, sio_addr_compare,
             sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node));
  
@@ -3734,11 +3779,8 @@ dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd)
         VERIFY3P(tvd->vdev_scan_io_queue, ==, NULL);
         tvd->vdev_scan_io_queue = svd->vdev_scan_io_queue;
         svd->vdev_scan_io_queue = NULL;
-       if (tvd->vdev_scan_io_queue != NULL) {
+       if (tvd->vdev_scan_io_queue != NULL)
                 tvd->vdev_scan_io_queue->q_vd = tvd;
-               range_tree_set_lock(tvd->vdev_scan_io_queue->q_exts_by_addr,
-                   &tvd->vdev_scan_io_queue_lock);
-       }
  
         mutex_exit(&tvd->vdev_scan_io_queue_lock);
         mutex_exit(&svd->vdev_scan_io_queue_lock);
@@ -3864,6 +3906,9 @@ MODULE_PARM_DESC(zfs_scan_vdev_limit,
  module_param(zfs_scrub_min_time_ms, int, 0644);
  MODULE_PARM_DESC(zfs_scrub_min_time_ms, "Min millisecs to scrub per txg");
  
+module_param(zfs_obsolete_min_time_ms, int, 0644);
+MODULE_PARM_DESC(zfs_obsolete_min_time_ms, "Min millisecs to obsolete per txg");
+
  module_param(zfs_free_min_time_ms, int, 0644);
  MODULE_PARM_DESC(zfs_free_min_time_ms, "Min millisecs to free per txg");
  
@@ -3877,8 +3922,9 @@ module_param(zfs_no_scrub_prefetch, int, 0644);
  MODULE_PARM_DESC(zfs_no_scrub_prefetch, "Set to disable scrub prefetching");
  
  /* CSTYLED */
-module_param(zfs_free_max_blocks, ulong, 0644);
-MODULE_PARM_DESC(zfs_free_max_blocks, "Max number of blocks freed in one txg");
+module_param(zfs_async_block_max_blocks, ulong, 0644);
+MODULE_PARM_DESC(zfs_async_block_max_blocks,
+       "Max number of blocks freed in one txg");
  
  module_param(zfs_free_bpobj_enabled, int, 0644);
  MODULE_PARM_DESC(zfs_free_bpobj_enabled, "Enable processing of the free_bpobj");