]> git.proxmox.com Git - mirror_zfs.git/commitdiff
Sequential scrub and resilvers
author Tom Caputi <tcaputi@datto.com>
Thu, 16 Nov 2017 01:27:01 +0000 (20:27 -0500)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Thu, 16 Nov 2017 01:27:01 +0000 (17:27 -0800)
Currently, scrubs and resilvers can take an extremely
long time to complete. This is largely due to the fact
that zfs scans process pools in logical order, as
determined by each block's bookmark. This makes sense
from a simplicity perspective, but blocks in zfs are
often scattered randomly across disks, particularly
due to zfs's copy-on-write mechanisms.

This patch improves performance by splitting scrubs
and resilvers into a metadata scanning phase and an IO
issuing phase. The metadata scan reads through the
structure of the pool and gathers an in-memory queue
of I/Os, sorted by size and offset on disk. The issuing
phase will then issue the scrub I/Os as sequentially as
possible, greatly improving performance.

This patch also updates and cleans up some of the scan
code which has not been updated in several years.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Authored-by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Authored-by: Alek Pinchuk <apinchuk@datto.com>
Authored-by: Tom Caputi <tcaputi@datto.com>
Signed-off-by: Tom Caputi <tcaputi@datto.com>
Closes #3625
Closes #6256

37 files changed:
cmd/zdb/zdb.c
cmd/zpool/zpool_main.c
include/sys/arc.h
include/sys/dsl_pool.h
include/sys/dsl_scan.h
include/sys/fs/zfs.h
include/sys/range_tree.h
include/sys/spa_impl.h
include/sys/vdev_impl.h
lib/libzfs/libzfs_status.c
man/man5/zfs-module-parameters.5
module/zfs/arc.c
module/zfs/dbuf.c
module/zfs/ddt.c
module/zfs/dmu_traverse.c
module/zfs/dsl_pool.c
module/zfs/dsl_scan.c
module/zfs/metaslab.c
module/zfs/range_tree.c
module/zfs/spa.c
module/zfs/spa_misc.c
module/zfs/vdev.c
module/zfs/vdev_queue.c
module/zfs/zap.c
module/zfs/zio.c
tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_003_pos.ksh
tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_004_pos.ksh
tests/zfs-tests/tests/functional/cli_root/zpool_scrub/cleanup.ksh
tests/zfs-tests/tests/functional/cli_root/zpool_scrub/setup.ksh
tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg
tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_002_pos.ksh
tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_003_pos.ksh
tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh
tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_005_pos.ksh
tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh
tests/zfs-tests/tests/functional/events/events_002_pos.ksh
tests/zfs-tests/tests/functional/events/events_common.kshlib

index e45b7743a3017b8f0cc008567e95fc477502d86b..cc4f22a9ed44197294b23d7ead81f6c1a3145eb3 100644 (file)
@@ -2226,8 +2226,6 @@ dump_dir(objset_t *os)
                max_slot_used = object + dnode_slots - 1;
        }
 
-       ASSERT3U(object_count, ==, usedobjs);
-
        (void) printf("\n");
 
        (void) printf("    Dnode slots:\n");
@@ -2245,6 +2243,8 @@ dump_dir(objset_t *os)
                (void) fprintf(stderr, "dmu_object_next() = %d\n", error);
                abort();
        }
+
+       ASSERT3U(object_count, ==, usedobjs);
 }
 
 static void
@@ -3089,7 +3089,7 @@ zdb_blkptr_done(zio_t *zio)
        abd_free(zio->io_abd);
 
        mutex_enter(&spa->spa_scrub_lock);
-       spa->spa_scrub_inflight--;
+       spa->spa_load_verify_ios--;
        cv_broadcast(&spa->spa_scrub_io_cv);
 
        if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
@@ -3160,9 +3160,9 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
                        flags |= ZIO_FLAG_SPECULATIVE;
 
                mutex_enter(&spa->spa_scrub_lock);
-               while (spa->spa_scrub_inflight > max_inflight)
+               while (spa->spa_load_verify_ios > max_inflight)
                        cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
-               spa->spa_scrub_inflight++;
+               spa->spa_load_verify_ios++;
                mutex_exit(&spa->spa_scrub_lock);
 
                zio_nowait(zio_read(NULL, spa, bp, abd, size,
index d23903aecd7c55f0588bdadb38d646b962b05d5f..440b2979960e85666a005c0cb8546cd7b99603e7 100644 (file)
@@ -57,6 +57,8 @@
 #include <sys/fm/protocol.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/mount.h>
+#include <sys/sysmacros.h>
+
 #include <math.h>
 
 #include <libzfs.h>
@@ -1761,7 +1763,7 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
        (void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_SCAN_STATS,
            (uint64_t **)&ps, &c);
 
-       if (ps && ps->pss_state == DSS_SCANNING &&
+       if (ps != NULL && ps->pss_state == DSS_SCANNING &&
            vs->vs_scan_processed != 0 && children == 0) {
                (void) printf(gettext("  (%s)"),
                    (ps->pss_func == POOL_SCAN_RESILVER) ?
@@ -5967,11 +5969,13 @@ void
 print_scan_status(pool_scan_stat_t *ps)
 {
        time_t start, end, pause;
-       uint64_t elapsed, mins_left, hours_left;
-       uint64_t pass_exam, examined, total;
-       uint_t rate;
+       uint64_t total_secs_left;
+       uint64_t elapsed, secs_left, mins_left, hours_left, days_left;
+       uint64_t pass_scanned, scanned, pass_issued, issued, total;
+       uint_t scan_rate, issue_rate;
        double fraction_done;
-       char processed_buf[7], examined_buf[7], total_buf[7], rate_buf[7];
+       char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7];
+       char srate_buf[7], irate_buf[7];
 
        (void) printf(gettext("  scan: "));
 
@@ -5985,30 +5989,35 @@ print_scan_status(pool_scan_stat_t *ps)
        start = ps->pss_start_time;
        end = ps->pss_end_time;
        pause = ps->pss_pass_scrub_pause;
+
        zfs_nicebytes(ps->pss_processed, processed_buf, sizeof (processed_buf));
 
        assert(ps->pss_func == POOL_SCAN_SCRUB ||
            ps->pss_func == POOL_SCAN_RESILVER);
-       /*
-        * Scan is finished or canceled.
-        */
+
+       /* Scan is finished or canceled. */
        if (ps->pss_state == DSS_FINISHED) {
-               uint64_t minutes_taken = (end - start) / 60;
-               char *fmt = NULL;
+               total_secs_left = end - start;
+               days_left = total_secs_left / 60 / 60 / 24;
+               hours_left = (total_secs_left / 60 / 60) % 24;
+               mins_left = (total_secs_left / 60) % 60;
+               secs_left = (total_secs_left % 60);
 
                if (ps->pss_func == POOL_SCAN_SCRUB) {
-                       fmt = gettext("scrub repaired %s in %lluh%um with "
-                           "%llu errors on %s");
+                       (void) printf(gettext("scrub repaired %s "
+                           "in %llu days %02llu:%02llu:%02llu "
+                           "with %llu errors on %s"), processed_buf,
+                           (u_longlong_t)days_left, (u_longlong_t)hours_left,
+                           (u_longlong_t)mins_left, (u_longlong_t)secs_left,
+                           (u_longlong_t)ps->pss_errors, ctime(&end));
                } else if (ps->pss_func == POOL_SCAN_RESILVER) {
-                       fmt = gettext("resilvered %s in %lluh%um with "
-                           "%llu errors on %s");
+                       (void) printf(gettext("resilvered %s "
+                           "in %llu days %02llu:%02llu:%02llu "
+                           "with %llu errors on %s"), processed_buf,
+                           (u_longlong_t)days_left, (u_longlong_t)hours_left,
+                           (u_longlong_t)mins_left, (u_longlong_t)secs_left,
+                           (u_longlong_t)ps->pss_errors, ctime(&end));
                }
-               /* LINTED */
-               (void) printf(fmt, processed_buf,
-                   (u_longlong_t)(minutes_taken / 60),
-                   (uint_t)(minutes_taken % 60),
-                   (u_longlong_t)ps->pss_errors,
-                   ctime((time_t *)&end));
                return;
        } else if (ps->pss_state == DSS_CANCELED) {
                if (ps->pss_func == POOL_SCAN_SCRUB) {
@@ -6023,19 +6032,15 @@ print_scan_status(pool_scan_stat_t *ps)
 
        assert(ps->pss_state == DSS_SCANNING);
 
-       /*
-        * Scan is in progress.
-        */
+       /* Scan is in progress. Resilvers can't be paused. */
        if (ps->pss_func == POOL_SCAN_SCRUB) {
                if (pause == 0) {
                        (void) printf(gettext("scrub in progress since %s"),
                            ctime(&start));
                } else {
-                       char buf[32];
-                       struct tm *p = localtime(&pause);
-                       (void) strftime(buf, sizeof (buf), "%a %b %e %T %Y", p);
-                       (void) printf(gettext("scrub paused since %s\n"), buf);
-                       (void) printf(gettext("\tscrub started on   %s"),
+                       (void) printf(gettext("scrub paused since %s"),
+                           ctime(&pause));
+                       (void) printf(gettext("\tscrub started on %s"),
                            ctime(&start));
                }
        } else if (ps->pss_func == POOL_SCAN_RESILVER) {
@@ -6043,50 +6048,68 @@ print_scan_status(pool_scan_stat_t *ps)
                    ctime(&start));
        }
 
-       examined = ps->pss_examined ? ps->pss_examined : 1;
+       scanned = ps->pss_examined;
+       pass_scanned = ps->pss_pass_exam;
+       issued = ps->pss_issued;
+       pass_issued = ps->pss_pass_issued;
        total = ps->pss_to_examine;
-       fraction_done = (double)examined / total;
 
-       /* elapsed time for this pass */
+       /* we are only done with a block once we have issued the IO for it */
+       fraction_done = (double)issued / total;
+
+       /* elapsed time for this pass, rounding up to 1 if it's 0 */
        elapsed = time(NULL) - ps->pss_pass_start;
        elapsed -= ps->pss_pass_scrub_spent_paused;
-       elapsed = elapsed ? elapsed : 1;
-       pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1;
-       rate = pass_exam / elapsed;
-       rate = rate ? rate : 1;
-       mins_left = ((total - examined) / rate) / 60;
-       hours_left = mins_left / 60;
-
-       zfs_nicebytes(examined, examined_buf, sizeof (examined_buf));
+       elapsed = (elapsed != 0) ? elapsed : 1;
+
+       scan_rate = pass_scanned / elapsed;
+       issue_rate = pass_issued / elapsed;
+       total_secs_left = (issue_rate != 0) ?
+           ((total - issued) / issue_rate) : UINT64_MAX;
+
+       days_left = total_secs_left / 60 / 60 / 24;
+       hours_left = (total_secs_left / 60 / 60) % 24;
+       mins_left = (total_secs_left / 60) % 60;
+       secs_left = (total_secs_left % 60);
+
+       /* format all of the numbers we will be reporting */
+       zfs_nicebytes(scanned, scanned_buf, sizeof (scanned_buf));
+       zfs_nicebytes(issued, issued_buf, sizeof (issued_buf));
        zfs_nicebytes(total, total_buf, sizeof (total_buf));
+       zfs_nicebytes(scan_rate, srate_buf, sizeof (srate_buf));
+       zfs_nicebytes(issue_rate, irate_buf, sizeof (irate_buf));
 
-       /*
-        * do not print estimated time if hours_left is more than 30 days
-        * or we have a paused scrub
-        */
+       /* do not print estimated time if we have a paused scrub */
        if (pause == 0) {
-               zfs_nicebytes(rate, rate_buf, sizeof (rate_buf));
-               (void) printf(gettext("\t%s scanned out of %s at %s/s"),
-                   examined_buf, total_buf, rate_buf);
-               if (hours_left < (30 * 24)) {
-                       (void) printf(gettext(", %lluh%um to go\n"),
-                           (u_longlong_t)hours_left, (uint_t)(mins_left % 60));
-               } else {
-                       (void) printf(gettext(
-                           ", (scan is slow, no estimated time)\n"));
-               }
+               (void) printf(gettext("\t%s scanned at %s/s, "
+                   "%s issued at %s/s, %s total\n"),
+                   scanned_buf, srate_buf, issued_buf, irate_buf, total_buf);
        } else {
-               (void) printf(gettext("\t%s scanned out of %s\n"),
-                   examined_buf, total_buf);
+               (void) printf(gettext("\t%s scanned, %s issued, %s total\n"),
+                   scanned_buf, issued_buf, total_buf);
        }
 
        if (ps->pss_func == POOL_SCAN_RESILVER) {
-               (void) printf(gettext("\t%s resilvered, %.2f%% done\n"),
+               (void) printf(gettext("\t%s resilvered, %.2f%% done"),
                    processed_buf, 100 * fraction_done);
        } else if (ps->pss_func == POOL_SCAN_SCRUB) {
-               (void) printf(gettext("\t%s repaired, %.2f%% done\n"),
+               (void) printf(gettext("\t%s repaired, %.2f%% done"),
                    processed_buf, 100 * fraction_done);
        }
+
+       if (pause == 0) {
+               if (issue_rate >= 10 * 1024 * 1024) {
+                       (void) printf(gettext(", %llu days "
+                           "%02llu:%02llu:%02llu to go\n"),
+                           (u_longlong_t)days_left, (u_longlong_t)hours_left,
+                           (u_longlong_t)mins_left, (u_longlong_t)secs_left);
+               } else {
+                       (void) printf(gettext(", no estimated "
+                           "completion time\n"));
+               }
+       } else {
+               (void) printf(gettext("\n"));
+       }
 }
 
 static void
index 7428a16292ffac71308c63633010aea024e19e34..0e7a85188b73a25e0b129ae5eeb77744282cd5b7 100644 (file)
@@ -66,11 +66,11 @@ typedef struct arc_prune arc_prune_t;
  * while transforming data into its desired format - specifically, when
  * decrypting, the key may not be present, or the HMAC may not be correct
  * which signifies deliberate tampering with the on-disk state
- * (assuming that the checksum was correct). The "error" parameter will be
- * nonzero in this case, even if there is no associated zio.
+ * (assuming that the checksum was correct). If any error occurs, the "buf"
+ * parameter will be NULL.
  */
-typedef void arc_read_done_func_t(zio_t *zio, int error, arc_buf_t *buf,
-    void *private);
+typedef void arc_read_done_func_t(zio_t *zio, const zbookmark_phys_t *zb,
+    const blkptr_t *bp, arc_buf_t *buf, void *private);
 typedef void arc_write_done_func_t(zio_t *zio, arc_buf_t *buf, void *private);
 typedef void arc_prune_func_t(int64_t bytes, void *private);
 
@@ -106,44 +106,45 @@ typedef enum arc_flags
        ARC_FLAG_CACHED                 = 1 << 3,       /* I/O was in cache */
        ARC_FLAG_L2CACHE                = 1 << 4,       /* cache in L2ARC */
        ARC_FLAG_PREDICTIVE_PREFETCH    = 1 << 5,       /* I/O from zfetch */
+       ARC_FLAG_PRESCIENT_PREFETCH     = 1 << 6,       /* long min lifespan */
 
        /*
         * Private ARC flags.  These flags are private ARC only flags that
         * will show up in b_flags in the arc_hdr_buf_t. These flags should
         * only be set by ARC code.
         */
-       ARC_FLAG_IN_HASH_TABLE          = 1 << 6,       /* buffer is hashed */
-       ARC_FLAG_IO_IN_PROGRESS         = 1 << 7,       /* I/O in progress */
-       ARC_FLAG_IO_ERROR               = 1 << 8,       /* I/O failed for buf */
-       ARC_FLAG_INDIRECT               = 1 << 9,       /* indirect block */
+       ARC_FLAG_IN_HASH_TABLE          = 1 << 7,       /* buffer is hashed */
+       ARC_FLAG_IO_IN_PROGRESS         = 1 << 8,       /* I/O in progress */
+       ARC_FLAG_IO_ERROR               = 1 << 9,       /* I/O failed for buf */
+       ARC_FLAG_INDIRECT               = 1 << 10,      /* indirect block */
        /* Indicates that block was read with ASYNC priority. */
-       ARC_FLAG_PRIO_ASYNC_READ        = 1 << 10,
-       ARC_FLAG_L2_WRITING             = 1 << 11,      /* write in progress */
-       ARC_FLAG_L2_EVICTED             = 1 << 12,      /* evicted during I/O */
-       ARC_FLAG_L2_WRITE_HEAD          = 1 << 13,      /* head of write list */
+       ARC_FLAG_PRIO_ASYNC_READ        = 1 << 11,
+       ARC_FLAG_L2_WRITING             = 1 << 12,      /* write in progress */
+       ARC_FLAG_L2_EVICTED             = 1 << 13,      /* evicted during I/O */
+       ARC_FLAG_L2_WRITE_HEAD          = 1 << 14,      /* head of write list */
        /*
         * Encrypted or authenticated on disk (may be plaintext in memory).
         * This header has b_crypt_hdr allocated. Does not include indirect
         * blocks with checksums of MACs which will also have their X
         * (encrypted) bit set in the bp.
         */
-       ARC_FLAG_PROTECTED              = 1 << 14,
+       ARC_FLAG_PROTECTED              = 1 << 15,
        /* data has not been authenticated yet */
-       ARC_FLAG_NOAUTH                 = 1 << 15,
+       ARC_FLAG_NOAUTH                 = 1 << 16,
        /* indicates that the buffer contains metadata (otherwise, data) */
-       ARC_FLAG_BUFC_METADATA          = 1 << 16,
+       ARC_FLAG_BUFC_METADATA          = 1 << 17,
 
        /* Flags specifying whether optional hdr struct fields are defined */
-       ARC_FLAG_HAS_L1HDR              = 1 << 17,
-       ARC_FLAG_HAS_L2HDR              = 1 << 18,
+       ARC_FLAG_HAS_L1HDR              = 1 << 18,
+       ARC_FLAG_HAS_L2HDR              = 1 << 19,
 
        /*
         * Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data.
         * This allows the l2arc to use the blkptr's checksum to verify
         * the data without having to store the checksum in the hdr.
         */
-       ARC_FLAG_COMPRESSED_ARC         = 1 << 19,
-       ARC_FLAG_SHARED_DATA            = 1 << 20,
+       ARC_FLAG_COMPRESSED_ARC         = 1 << 20,
+       ARC_FLAG_SHARED_DATA            = 1 << 21,
 
        /*
         * The arc buffer's compression mode is stored in the top 7 bits of the
index 044ef95441c94b9f4a3c9f88630f4af90c8fdeb5..9ceb59d9b638cb5384a71bd68985545a9141e340 100644 (file)
@@ -80,6 +80,7 @@ typedef struct zfs_blkstat {
 
 typedef struct zfs_all_blkstats {
        zfs_blkstat_t   zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1];
+       kmutex_t        zab_lock;
 } zfs_all_blkstats_t;
 
 
index 5303d9a699bbcc3c898956daaff18cb6d91c409e..7a29d9788067b301523cd322c0b0b510fbc54b25 100644 (file)
@@ -108,22 +108,56 @@ typedef enum dsl_scan_flags {
  */
 typedef struct dsl_scan {
        struct dsl_pool *scn_dp;
-
-       boolean_t scn_suspending;
        uint64_t scn_restart_txg;
        uint64_t scn_done_txg;
        uint64_t scn_sync_start_time;
-       zio_t *scn_zio_root;
+       uint64_t scn_issued_before_pass;
 
        /* for freeing blocks */
        boolean_t scn_is_bptree;
        boolean_t scn_async_destroying;
        boolean_t scn_async_stalled;
-       uint64_t scn_visited_this_txg;
 
-       dsl_scan_phys_t scn_phys;
+       /* flags and stats for controlling scan state */
+       boolean_t scn_is_sorted;        /* doing sequential scan */
+       boolean_t scn_clearing;         /* scan is issuing sequential extents */
+       boolean_t scn_checkpointing;    /* scan is issuing all queued extents */
+       boolean_t scn_suspending;       /* scan is suspending until next txg */
+       uint64_t scn_last_checkpoint;   /* time of last checkpoint */
+
+       /* members for thread synchronization */
+       zio_t *scn_zio_root;            /* root zio for waiting on IO */
+       taskq_t *scn_taskq;             /* task queue for issuing extents */
+
+       /* for controlling scan prefetch, protected by spa_scrub_lock */
+       boolean_t scn_prefetch_stop;    /* prefetch should stop */
+       zbookmark_phys_t scn_prefetch_bookmark; /* prefetch start bookmark */
+       avl_tree_t scn_prefetch_queue;  /* priority queue of prefetch IOs */
+       uint64_t scn_maxinflight_bytes; /* max bytes in flight for pool */
+
+       /* per txg statistics */
+       uint64_t scn_visited_this_txg;  /* total bps visited this txg */
+       uint64_t scn_holes_this_txg;
+       uint64_t scn_lt_min_this_txg;
+       uint64_t scn_gt_max_this_txg;
+       uint64_t scn_ddt_contained_this_txg;
+       uint64_t scn_objsets_visited_this_txg;
+       uint64_t scn_avg_seg_size_this_txg;
+       uint64_t scn_segs_this_txg;
+       uint64_t scn_avg_zio_size_this_txg;
+       uint64_t scn_zios_this_txg;
+
+       /* members needed for syncing scan status to disk */
+       dsl_scan_phys_t scn_phys;       /* on disk representation of scan */
+       dsl_scan_phys_t scn_phys_cached;
+       avl_tree_t scn_queue;           /* queue of datasets to scan */
+       uint64_t scn_bytes_pending;     /* outstanding data to issue */
 } dsl_scan_t;
 
+typedef struct dsl_scan_io_queue dsl_scan_io_queue_t;
+
+void scan_init(void);
+void scan_fini(void);
 int dsl_scan_init(struct dsl_pool *dp, uint64_t txg);
 void dsl_scan_fini(struct dsl_pool *dp);
 void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *);
@@ -142,6 +176,9 @@ void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
     struct dmu_tx *tx);
 boolean_t dsl_scan_active(dsl_scan_t *scn);
 boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn);
+void dsl_scan_freed(spa_t *spa, const blkptr_t *bp);
+void dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue);
+void dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd);
 
 #ifdef __cplusplus
 }
index 1aa3b21b5c96aafed37e190713d9873969efae51..88e8671dbbf653780ada9e383577d1271e0e2730 100644 (file)
@@ -859,17 +859,19 @@ typedef struct pool_scan_stat {
        uint64_t        pss_start_time; /* scan start time */
        uint64_t        pss_end_time;   /* scan end time */
        uint64_t        pss_to_examine; /* total bytes to scan */
-       uint64_t        pss_examined;   /* total examined bytes */
+       uint64_t        pss_examined;   /* total bytes located by scanner */
        uint64_t        pss_to_process; /* total bytes to process */
        uint64_t        pss_processed;  /* total processed bytes */
        uint64_t        pss_errors;     /* scan errors  */
 
        /* values not stored on disk */
-       uint64_t        pss_pass_exam;  /* examined bytes per scan pass */
+       uint64_t        pss_pass_exam; /* examined bytes per scan pass */
+       uint64_t        pss_pass_issued; /* issued bytes per scan pass */
        uint64_t        pss_pass_start; /* start time of a scan pass */
        uint64_t        pss_pass_scrub_pause; /* pause time of a scurb pass */
        /* cumulative time scrub spent paused, needed for rate calculation */
        uint64_t        pss_pass_scrub_spent_paused;
+       uint64_t        pss_issued;     /* total bytes checked by scanner */
 } pool_scan_stat_t;
 
 typedef enum dsl_scan_state {
index 9f3ead537165f3a7b8c52fe58eedef66c1b1952e..1d3bdf9e5fe00d8ec65a77cbe8c8ddea4aef65c2 100644 (file)
@@ -44,8 +44,13 @@ typedef struct range_tree_ops range_tree_ops_t;
 typedef struct range_tree {
        avl_tree_t      rt_root;        /* offset-ordered segment AVL tree */
        uint64_t        rt_space;       /* sum of all segments in the map */
+       uint64_t        rt_gap;         /* allowable inter-segment gap */
        range_tree_ops_t *rt_ops;
+
+       /* rt_avl_compare should only be set if rt_arg is an AVL tree */
        void            *rt_arg;
+       int (*rt_avl_compare)(const void *, const void *);
+
 
        /*
         * The rt_histogram maintains a histogram of ranges. Each bucket,
@@ -61,6 +66,7 @@ typedef struct range_seg {
        avl_node_t      rs_pp_node;     /* AVL picker-private node */
        uint64_t        rs_start;       /* starting offset of this segment */
        uint64_t        rs_end;         /* ending offset (non-inclusive) */
+       uint64_t        rs_fill;        /* actual fill if gap mode is on */
 } range_seg_t;
 
 struct range_tree_ops {
@@ -75,20 +81,37 @@ typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size);
 
 void range_tree_init(void);
 void range_tree_fini(void);
+range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg,
+    int (*avl_compare) (const void *, const void *), kmutex_t *lp,
+    uint64_t gap);
 range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp);
 void range_tree_destroy(range_tree_t *rt);
 boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size);
+range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size);
+void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
+    uint64_t newstart, uint64_t newsize);
 uint64_t range_tree_space(range_tree_t *rt);
 void range_tree_verify(range_tree_t *rt, uint64_t start, uint64_t size);
 void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst);
 void range_tree_stat_verify(range_tree_t *rt);
+void range_tree_set_lock(range_tree_t *rt, kmutex_t *lp);
 
 void range_tree_add(void *arg, uint64_t start, uint64_t size);
 void range_tree_remove(void *arg, uint64_t start, uint64_t size);
+void range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size);
+void range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta);
 void range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size);
 
 void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg);
 void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg);
+range_seg_t *range_tree_first(range_tree_t *rt);
+
+void rt_avl_create(range_tree_t *rt, void *arg);
+void rt_avl_destroy(range_tree_t *rt, void *arg);
+void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg);
+void rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg);
+void rt_avl_vacate(range_tree_t *rt, void *arg);
+extern struct range_tree_ops rt_avl_ops;
 
 #ifdef __cplusplus
 }
index 926a0bc24b25ac48c4c45b40ecebb94ecbc508b0..2fc5980163060f081edc83a207741ac09cd98df6 100644 (file)
@@ -185,9 +185,9 @@ struct spa {
        uberblock_t     spa_ubsync;             /* last synced uberblock */
        uberblock_t     spa_uberblock;          /* current uberblock */
        boolean_t       spa_extreme_rewind;     /* rewind past deferred frees */
-       uint64_t        spa_last_io;            /* lbolt of last non-scan I/O */
        kmutex_t        spa_scrub_lock;         /* resilver/scrub lock */
-       uint64_t        spa_scrub_inflight;     /* in-flight scrub I/Os */
+       uint64_t        spa_scrub_inflight;     /* in-flight scrub bytes */
+       uint64_t        spa_load_verify_ios;    /* in-flight verification IOs */
        kcondvar_t      spa_scrub_io_cv;        /* scrub I/O completion */
        uint8_t         spa_scrub_active;       /* active or suspended? */
        uint8_t         spa_scrub_type;         /* type of scrub we're doing */
@@ -198,6 +198,7 @@ struct spa {
        uint64_t        spa_scan_pass_scrub_pause; /* scrub pause time */
        uint64_t        spa_scan_pass_scrub_spent_paused; /* total paused */
        uint64_t        spa_scan_pass_exam;     /* examined bytes per pass */
+       uint64_t        spa_scan_pass_issued;   /* issued bytes per pass */
        kmutex_t        spa_async_lock;         /* protect async state */
        kthread_t       *spa_async_thread;      /* thread doing async task */
        int             spa_async_suspended;    /* async tasks suspended */
index 4c2e3cd2e0af6f62053bbe275d11a4e6948f55d5..5f953a8dbcded96f4b65b6cef480866ffd2be294 100644 (file)
@@ -197,6 +197,13 @@ struct vdev {
        uint64_t        vdev_async_write_queue_depth;
        uint64_t        vdev_max_async_write_queue_depth;
 
+       /*
+        * Protects the vdev_scan_io_queue field itself as well as the
+        * structure's contents (when present).
+        */
+       kmutex_t                        vdev_scan_io_queue_lock;
+       struct dsl_scan_io_queue        *vdev_scan_io_queue;
+
        /*
         * Leaf vdev state.
         */
index ccc472153cb110bc627efc9597dd398f6132e98e..320783523b7dadbf3179cdb1bf32e3b47302ef80 100644 (file)
@@ -214,7 +214,7 @@ check_status(nvlist_t *config, boolean_t isimport, zpool_errata_t *erratap)
         */
        (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS,
            (uint64_t **)&ps, &psc);
-       if (ps && ps->pss_func == POOL_SCAN_RESILVER &&
+       if (ps != NULL && ps->pss_func == POOL_SCAN_RESILVER &&
            ps->pss_state == DSS_SCANNING)
                return (ZPOOL_STATUS_RESILVERING);
 
index 39918e9035f5bf362c102f1a1005665a76faad34..e6e61906128ee283445953c88a54ccb646941ef7 100644 (file)
@@ -1,5 +1,6 @@
 '\" te
 .\" Copyright (c) 2013 by Turbo Fredriksson <turbo@bayour.com>. All rights reserved.
+.\" Copyright (c) 2017 Datto Inc.
 .\" The contents of this file are subject to the terms of the Common Development
 .\" and Distribution License (the "License").  You may not use this file except
 .\" in compliance with the License. You can obtain a copy of the license at
@@ -12,7 +13,7 @@
 .\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your
 .\" own identifying information:
 .\" Portions Copyright [yyyy] [name of copyright owner]
-.TH ZFS-MODULE-PARAMETERS 5 "Oct 28, 2017" 
+.TH ZFS-MODULE-PARAMETERS 5 "Oct 28, 2017"
 .SH NAME
 zfs\-module\-parameters \- ZFS module parameters
 .SH DESCRIPTION
@@ -487,7 +488,7 @@ Default value: \fB10\fR.
 .ad
 .RS 12n
 If set to a non zero value, it will replace the arc_grow_retry value with this value.
-The arc_grow_retry value (default 5) is the number of seconds the ARC will wait before 
+The arc_grow_retry value (default 5) is the number of seconds the ARC will wait before
 trying to resume growth after a memory pressure event.
 .sp
 Default value: \fB0\fR.
@@ -605,7 +606,7 @@ Default value: \fB10,000\fR.
 .RS 12n
 Define the strategy for ARC meta data buffer eviction (meta reclaim strategy).
 A value of 0 (META_ONLY) will evict only the ARC meta data buffers.
-A value of 1 (BALANCED) indicates that additional data buffers may be evicted if 
+A value of 1 (BALANCED) indicates that additional data buffers may be evicted if
 that is required to in order to evict the required number of meta data buffers.
 .sp
 Default value: \fB1\fR.
@@ -626,11 +627,24 @@ Default value: \fB0\fR.
 .sp
 .ne 2
 .na
-\fBzfs_arc_min_prefetch_lifespan\fR (int)
+\fBzfs_arc_min_prefetch_ms\fR (int)
 .ad
 .RS 12n
-Minimum time prefetched blocks are locked in the ARC, specified in jiffies.
-A value of 0 will default to 1 second.
+Minimum time prefetched blocks are locked in the ARC, specified in ms.
+A value of \fB0\fR will default to 1 second.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_min_prescient_prefetch_ms\fR (int)
+.ad
+.RS 12n
+Minimum time "prescient prefetched" blocks are locked in the ARC, specified
+in ms. These blocks are meant to be prefetched fairly aggressively ahead of
+the code that may use them. A value of \fB0\fR will default to 6 seconds.
 .sp
 Default value: \fB0\fR.
 .RE
@@ -679,7 +693,7 @@ Default value: \fB8\fR.
 .RS 12n
 If set to a non zero value, this will update arc_p_min_shift (default 4)
 with the new value.
-arc_p_min_shift is used to shift of arc_c for calculating both min and max 
+arc_p_min_shift is used to shift of arc_c for calculating both min and max
 max arc_p
 .sp
 Default value: \fB0\fR.
@@ -1660,65 +1674,144 @@ Use \fB1\fR for yes and \fB0\fR for no (default).
 .sp
 .ne 2
 .na
-\fBzfs_resilver_delay\fR (int)
+\fBzfs_resilver_min_time_ms\fR (int)
 .ad
 .RS 12n
-Number of ticks to delay prior to issuing a resilver I/O operation when
-a non-resilver or non-scrub I/O operation has occurred within the past
-\fBzfs_scan_idle\fR ticks.
+Resilvers are processed by the sync thread. While resilvering it will spend
+at least this much time working on a resilver between txg flushes.
 .sp
-Default value: \fB2\fR.
+Default value: \fB3,000\fR.
 .RE
 
 .sp
 .ne 2
 .na
-\fBzfs_resilver_min_time_ms\fR (int)
+\fBzfs_scrub_min_time_ms\fR (int)
 .ad
 .RS 12n
-Resilvers are processed by the sync thread. While resilvering it will spend
-at least this much time working on a resilver between txg flushes.
+Scrubs are processed by the sync thread. While scrubbing it will spend
+at least this much time working on a scrub between txg flushes.
 .sp
-Default value: \fB3,000\fR.
+Default value: \fB1,000\fR.
 .RE
 
 .sp
 .ne 2
 .na
-\fBzfs_scan_idle\fR (int)
+\fBzfs_scan_checkpoint_intval\fR (int)
 .ad
 .RS 12n
-Idle window in clock ticks.  During a scrub or a resilver, if
-a non-scrub or non-resilver I/O operation has occurred during this
-window, the next scrub or resilver operation is delayed by, respectively
-\fBzfs_scrub_delay\fR or \fBzfs_resilver_delay\fR ticks.
+To preserve progress across reboots the sequential scan algorithm periodically
+needs to stop metadata scanning and issue all the verification I/Os to disk.
+The frequency of this flushing is determined by the
+\fBzfs_scan_checkpoint_intval\fR tunable.
 .sp
-Default value: \fB50\fR.
+Default value: \fB7200\fR seconds (every 2 hours).
 .RE
 
 .sp
 .ne 2
 .na
-\fBzfs_scan_min_time_ms\fR (int)
+\fBzfs_scan_fill_weight\fR (int)
 .ad
 .RS 12n
-Scrubs are processed by the sync thread. While scrubbing it will spend
-at least this much time working on a scrub between txg flushes.
+This tunable affects how scrub and resilver I/O segments are ordered. A higher
+number indicates that we care more about how filled in a segment is, while a
+lower number indicates we care more about the size of the extent without
+considering the gaps within a segment. This value is only tunable upon module
+insertion. Changing the value afterwards will have no effect on scrub or
+resilver performance.
 .sp
-Default value: \fB1,000\fR.
+Default value: \fB3\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_scan_issue_strategy\fR (int)
+.ad
+.RS 12n
+Determines the order that data will be verified while scrubbing or resilvering.
+If set to \fB1\fR, data will be verified as sequentially as possible, given the
+amount of memory reserved for scrubbing (see \fBzfs_scan_mem_lim_fact\fR). This
+may improve scrub performance if the pool's data is very fragmented. If set to
+\fB2\fR, the largest mostly-contiguous chunk of found data will be verified
+first. By deferring scrubbing of small segments, we may later find adjacent data
+to coalesce and increase the segment size. If set to \fB0\fR, zfs will use
+strategy \fB1\fR during normal verification and strategy \fB2\fR while taking a
+checkpoint.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_scan_legacy\fR (int)
+.ad
+.RS 12n
+A value of 0 indicates that scrubs and resilvers will gather metadata in
+memory before issuing sequential I/O. A value of 1 indicates that the legacy
+algorithm will be used where I/O is initiated as soon as it is discovered.
+Changing this value to 0 will not affect scrubs or resilvers that are already
+in progress.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_scan_max_ext_gap\fR (int)
+.ad
+.RS 12n
+Indicates the largest gap in bytes between scrub / resilver I/Os that will still
+be considered sequential for sorting purposes. Changing this value will not
+affect scrubs or resilvers that are already in progress.
+.sp
+Default value: \fB2097152 (2 MB)\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_scan_mem_lim_fact\fR (int)
+.ad
+.RS 12n
+Maximum fraction of RAM used for I/O sorting by the sequential scan algorithm.
+This tunable determines the hard limit for I/O sorting memory usage.
+When the hard limit is reached we stop scanning metadata and start issuing
+data verification I/O. This is done until we get below the soft limit.
+.sp
+Default value: \fB20\fR which is 5% of RAM (1/20).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_scan_mem_lim_soft_fact\fR (int)
+.ad
+.RS 12n
+The fraction of the hard limit used to determine the soft limit for I/O sorting
+by the sequential scan algorithm. When we cross this limit from below no action
+is taken. When we cross this limit from above it is because we are issuing
+verification I/O. In this case (unless the metadata scan is done) we stop
+issuing verification I/O and start scanning metadata again until we get to the
+hard limit.
+.sp
+Default value: \fB20\fR which is 5% of the hard limit (1/20).
 .RE
 
 .sp
 .ne 2
 .na
-\fBzfs_scrub_delay\fR (int)
+\fBzfs_scan_vdev_limit\fR (int)
 .ad
 .RS 12n
-Number of ticks to delay prior to issuing a scrub I/O operation when
-a non-scrub or non-resilver I/O operation has occurred within the past
-\fBzfs_scan_idle\fR ticks.
+Maximum amount of data that can be concurrently issued at once for scrubs and
+resilvers per leaf device, given in bytes.
 .sp
-Default value: \fB4\fR.
+Default value: \fB4194304\fR.
 .RE
 
 .sp
@@ -1777,18 +1870,6 @@ value of 75% will create a maximum of one thread per cpu.
 Default value: \fB75\fR.
 .RE
 
-.sp
-.ne 2
-.na
-\fBzfs_top_maxinflight\fR (int)
-.ad
-.RS 12n
-Max concurrent I/Os per top-level vdev (mirrors or raidz arrays) allowed during
-scrub or resilver operations.
-.sp
-Default value: \fB32\fR.
-.RE
-
 .sp
 .ne 2
 .na
index cd343b04e65dc61680e6bf9f303d19615087049f..6983576321d7d7e15fcd462862090a950b14218c 100644 (file)
@@ -357,7 +357,8 @@ int                 arc_no_grow_shift = 5;
  * minimum lifespan of a prefetch block in clock ticks
  * (initialized in arc_init())
  */
-static int             arc_min_prefetch_lifespan;
+static int             arc_min_prefetch_ms;
+static int             arc_min_prescient_prefetch_ms;
 
 /*
  * If this percent of memory is free, don't throttle.
@@ -407,7 +408,8 @@ unsigned long zfs_arc_dnode_limit_percent = 10;
  * These tunables are Linux specific
  */
 unsigned long zfs_arc_sys_free = 0;
-int zfs_arc_min_prefetch_lifespan = 0;
+int zfs_arc_min_prefetch_ms = 0;
+int zfs_arc_min_prescient_prefetch_ms = 0;
 int zfs_arc_p_aggressive_disable = 1;
 int zfs_arc_p_dampener_disable = 1;
 int zfs_arc_meta_prune = 10000;
@@ -663,6 +665,7 @@ typedef struct arc_stats {
        kstat_named_t arcstat_meta_min;
        kstat_named_t arcstat_sync_wait_for_async;
        kstat_named_t arcstat_demand_hit_predictive_prefetch;
+       kstat_named_t arcstat_demand_hit_prescient_prefetch;
        kstat_named_t arcstat_need_free;
        kstat_named_t arcstat_sys_free;
        kstat_named_t arcstat_raw_size;
@@ -762,6 +765,7 @@ static arc_stats_t arc_stats = {
        { "arc_meta_min",               KSTAT_DATA_UINT64 },
        { "sync_wait_for_async",        KSTAT_DATA_UINT64 },
        { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 },
+       { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 },
        { "arc_need_free",              KSTAT_DATA_UINT64 },
        { "arc_sys_free",               KSTAT_DATA_UINT64 },
        { "arc_raw_size",               KSTAT_DATA_UINT64 }
@@ -861,6 +865,8 @@ static taskq_t *arc_prune_taskq;
 #define        HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS)
 #define        HDR_IO_ERROR(hdr)       ((hdr)->b_flags & ARC_FLAG_IO_ERROR)
 #define        HDR_PREFETCH(hdr)       ((hdr)->b_flags & ARC_FLAG_PREFETCH)
+#define        HDR_PRESCIENT_PREFETCH(hdr)     \
+       ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH)
 #define        HDR_COMPRESSION_ENABLED(hdr)    \
        ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC)
 
@@ -3778,6 +3784,8 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 {
        arc_state_t *evicted_state, *state;
        int64_t bytes_evicted = 0;
+       int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
+           arc_min_prescient_prefetch_ms : arc_min_prefetch_ms;
 
        ASSERT(MUTEX_HELD(hash_lock));
        ASSERT(HDR_HAS_L1HDR(hdr));
@@ -3831,8 +3839,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
        /* prefetch buffers have a minimum lifespan */
        if (HDR_IO_IN_PROGRESS(hdr) ||
            ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
-           ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
-           arc_min_prefetch_lifespan)) {
+           ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) {
                ARCSTAT_BUMP(arcstat_evict_skip);
                return (bytes_evicted);
        }
@@ -5492,13 +5499,15 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
                 * - move the buffer to the head of the list if this is
                 *   another prefetch (to make it less likely to be evicted).
                 */
-               if (HDR_PREFETCH(hdr)) {
+               if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
                        if (refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
                                /* link protected by hash lock */
                                ASSERT(multilist_link_active(
                                    &hdr->b_l1hdr.b_arc_node));
                        } else {
-                               arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
+                               arc_hdr_clear_flags(hdr,
+                                   ARC_FLAG_PREFETCH |
+                                   ARC_FLAG_PRESCIENT_PREFETCH);
                                atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
                                ARCSTAT_BUMP(arcstat_mru_hits);
                        }
@@ -5532,10 +5541,13 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
                 * MFU state.
                 */
 
-               if (HDR_PREFETCH(hdr)) {
+               if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
                        new_state = arc_mru;
-                       if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0)
-                               arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH);
+                       if (refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) {
+                               arc_hdr_clear_flags(hdr,
+                                   ARC_FLAG_PREFETCH |
+                                   ARC_FLAG_PRESCIENT_PREFETCH);
+                       }
                        DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
                } else {
                        new_state = arc_mfu;
@@ -5557,11 +5569,7 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
                 * If it was a prefetch, we will explicitly move it to
                 * the head of the list now.
                 */
-               if ((HDR_PREFETCH(hdr)) != 0) {
-                       ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
-                       /* link protected by hash_lock */
-                       ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node));
-               }
+
                atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits);
                ARCSTAT_BUMP(arcstat_mfu_hits);
                hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
@@ -5573,12 +5581,11 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
                 * MFU state.
                 */
 
-               if (HDR_PREFETCH(hdr)) {
+               if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
                        /*
                         * This is a prefetch access...
                         * move this block back to the MRU state.
                         */
-                       ASSERT0(refcount_count(&hdr->b_l1hdr.b_refcnt));
                        new_state = arc_mru;
                }
 
@@ -5605,20 +5612,25 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
 /* a generic arc_read_done_func_t which you can use */
 /* ARGSUSED */
 void
-arc_bcopy_func(zio_t *zio, int error, arc_buf_t *buf, void *arg)
+arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+    arc_buf_t *buf, void *arg)
 {
-       if (error == 0)
-               bcopy(buf->b_data, arg, arc_buf_size(buf));
+       if (buf == NULL)
+               return;
+
+       bcopy(buf->b_data, arg, arc_buf_size(buf));
        arc_buf_destroy(buf, arg);
 }
 
 /* a generic arc_read_done_func_t */
+/* ARGSUSED */
 void
-arc_getbuf_func(zio_t *zio, int error, arc_buf_t *buf, void *arg)
+arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+    arc_buf_t *buf, void *arg)
 {
        arc_buf_t **bufp = arg;
-       if (error != 0) {
-               arc_buf_destroy(buf, arg);
+
+       if (buf == NULL) {
                *bufp = NULL;
        } else {
                *bufp = buf;
@@ -5652,7 +5664,6 @@ arc_read_done(zio_t *zio)
        arc_callback_t  *callback_list;
        arc_callback_t  *acb;
        boolean_t       freeable = B_FALSE;
-       boolean_t       no_zio_error = (zio->io_error == 0);
 
        /*
         * The hdr was inserted into hash-table and removed from lists
@@ -5699,7 +5710,7 @@ arc_read_done(zio_t *zio)
                }
        }
 
-       if (no_zio_error) {
+       if (zio->io_error == 0) {
                /* byteswap if necessary */
                if (BP_SHOULD_BYTESWAP(zio->io_bp)) {
                        if (BP_GET_LEVEL(zio->io_bp) > 0) {
@@ -5720,7 +5731,8 @@ arc_read_done(zio_t *zio)
        callback_list = hdr->b_l1hdr.b_acb;
        ASSERT3P(callback_list, !=, NULL);
 
-       if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) {
+       if (hash_lock && zio->io_error == 0 &&
+           hdr->b_l1hdr.b_state == arc_anon) {
                /*
                 * Only call arc_access on anonymous buffers.  This is because
                 * if we've issued an I/O for an evicted buffer, we've already
@@ -5741,13 +5753,19 @@ arc_read_done(zio_t *zio)
                if (!acb->acb_done)
                        continue;
 
-               /* This is a demand read since prefetches don't use callbacks */
                callback_cnt++;
 
+               if (zio->io_error != 0)
+                       continue;
+
                int error = arc_buf_alloc_impl(hdr, zio->io_spa,
                    acb->acb_dsobj, acb->acb_private, acb->acb_encrypted,
-                   acb->acb_compressed, acb->acb_noauth, no_zio_error,
+                   acb->acb_compressed, acb->acb_noauth, B_TRUE,
                    &acb->acb_buf);
+               if (error != 0) {
+                       arc_buf_destroy(acb->acb_buf, acb->acb_private);
+                       acb->acb_buf = NULL;
+               }
 
                /*
                 * Assert non-speculative zios didn't fail because an
@@ -5770,9 +5788,8 @@ arc_read_done(zio_t *zio)
                        }
                }
 
-               if (no_zio_error) {
+               if (zio->io_error == 0)
                        zio->io_error = error;
-               }
        }
        hdr->b_l1hdr.b_acb = NULL;
        arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
@@ -5782,7 +5799,7 @@ arc_read_done(zio_t *zio)
        ASSERT(refcount_is_zero(&hdr->b_l1hdr.b_refcnt) ||
            callback_list != NULL);
 
-       if (no_zio_error) {
+       if (zio->io_error == 0) {
                arc_hdr_verify(hdr, zio->io_bp);
        } else {
                arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR);
@@ -5816,8 +5833,8 @@ arc_read_done(zio_t *zio)
        /* execute each callback and free its structure */
        while ((acb = callback_list) != NULL) {
                if (acb->acb_done) {
-                       acb->acb_done(zio, zio->io_error, acb->acb_buf,
-                           acb->acb_private);
+                       acb->acb_done(zio, &zio->io_bookmark, zio->io_bp,
+                           acb->acb_buf, acb->acb_private);
                }
 
                if (acb->acb_zio_dummy != NULL) {
@@ -5974,12 +5991,25 @@ top:
                                arc_hdr_clear_flags(hdr,
                                    ARC_FLAG_PREDICTIVE_PREFETCH);
                        }
+
+                       if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) {
+                               ARCSTAT_BUMP(
+                                   arcstat_demand_hit_prescient_prefetch);
+                               arc_hdr_clear_flags(hdr,
+                                   ARC_FLAG_PRESCIENT_PREFETCH);
+                       }
+
                        ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp));
 
                        /* Get a buf with the desired data in it. */
                        rc = arc_buf_alloc_impl(hdr, spa, zb->zb_objset,
                            private, encrypted_read, compressed_read,
                            noauth_read, B_TRUE, &buf);
+                       if (rc != 0) {
+                               arc_buf_destroy(buf, private);
+                               buf = NULL;
+                       }
+
                        ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || rc == 0);
                } else if (*arc_flags & ARC_FLAG_PREFETCH &&
                    refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
@@ -5987,6 +6017,8 @@ top:
                }
                DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr);
                arc_access(hdr, hash_lock);
+               if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
+                       arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
                if (*arc_flags & ARC_FLAG_L2CACHE)
                        arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
                mutex_exit(hash_lock);
@@ -5996,7 +6028,7 @@ top:
                    data, metadata, hits);
 
                if (done)
-                       done(NULL, rc, buf, private);
+                       done(NULL, zb, bp, buf, private);
        } else {
                uint64_t lsize = BP_GET_LSIZE(bp);
                uint64_t psize = BP_GET_PSIZE(bp);
@@ -6112,6 +6144,8 @@ top:
                if (*arc_flags & ARC_FLAG_PREFETCH &&
                    refcount_is_zero(&hdr->b_l1hdr.b_refcnt))
                        arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+               if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
+                       arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
                if (*arc_flags & ARC_FLAG_L2CACHE)
                        arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
                if (BP_IS_AUTHENTICATED(bp))
@@ -7223,9 +7257,15 @@ arc_tuning_update(void)
        if (zfs_arc_p_min_shift)
                arc_p_min_shift = zfs_arc_p_min_shift;
 
-       /* Valid range: 1 - N ticks */
-       if (zfs_arc_min_prefetch_lifespan)
-               arc_min_prefetch_lifespan = zfs_arc_min_prefetch_lifespan;
+       /* Valid range: 1 - N ms */
+       if (zfs_arc_min_prefetch_ms)
+               arc_min_prefetch_ms = zfs_arc_min_prefetch_ms;
+
+       /* Valid range: 1 - N ms */
+       if (zfs_arc_min_prescient_prefetch_ms) {
+               arc_min_prescient_prefetch_ms =
+                   zfs_arc_min_prescient_prefetch_ms;
+       }
 
        /* Valid range: 0 - 100 */
        if ((zfs_arc_lotsfree_percent >= 0) &&
@@ -7368,7 +7408,8 @@ arc_init(void)
        cv_init(&arc_reclaim_waiters_cv, NULL, CV_DEFAULT, NULL);
 
        /* Convert seconds to clock ticks */
-       arc_min_prefetch_lifespan = 1 * hz;
+       arc_min_prefetch_ms = 1;
+       arc_min_prescient_prefetch_ms = 6;
 
 #ifdef _KERNEL
        /*
@@ -9006,8 +9047,12 @@ MODULE_PARM_DESC(zfs_arc_average_blocksize, "Target average block size");
 module_param(zfs_compressed_arc_enabled, int, 0644);
 MODULE_PARM_DESC(zfs_compressed_arc_enabled, "Disable compressed arc buffers");
 
-module_param(zfs_arc_min_prefetch_lifespan, int, 0644);
-MODULE_PARM_DESC(zfs_arc_min_prefetch_lifespan, "Min life of prefetch block");
+module_param(zfs_arc_min_prefetch_ms, int, 0644);
+MODULE_PARM_DESC(zfs_arc_min_prefetch_ms, "Min life of prefetch block in ms");
+
+module_param(zfs_arc_min_prescient_prefetch_ms, int, 0644);
+MODULE_PARM_DESC(zfs_arc_min_prescient_prefetch_ms,
+       "Min life of prescient prefetched block in ms");
 
 module_param(l2arc_write_max, ulong, 0644);
 MODULE_PARM_DESC(l2arc_write_max, "Max write bytes per interval");
index 64c1a68af28c1de10030a1c6b6ece696ec1c69ed..190d0656a42f8ae52d1ca3a0f251eedd4f3e206a 100644 (file)
@@ -973,7 +973,8 @@ dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset)
 }
 
 static void
-dbuf_read_done(zio_t *zio, int err, arc_buf_t *buf, void *vdb)
+dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+    arc_buf_t *buf, void *vdb)
 {
        dmu_buf_impl_t *db = vdb;
 
@@ -987,19 +988,22 @@ dbuf_read_done(zio_t *zio, int err, arc_buf_t *buf, void *vdb)
        ASSERT(db->db.db_data == NULL);
        if (db->db_level == 0 && db->db_freed_in_flight) {
                /* we were freed in flight; disregard any error */
+               if (buf == NULL) {
+                       buf = arc_alloc_buf(db->db_objset->os_spa,
+                           db, DBUF_GET_BUFC_TYPE(db), db->db.db_size);
+               }
                arc_release(buf, db);
                bzero(buf->b_data, db->db.db_size);
                arc_buf_freeze(buf);
                db->db_freed_in_flight = FALSE;
                dbuf_set_data(db, buf);
                db->db_state = DB_CACHED;
-       } else if (err == 0) {
+       } else if (buf != NULL) {
                dbuf_set_data(db, buf);
                db->db_state = DB_CACHED;
        } else {
                ASSERT(db->db_blkid != DMU_BONUS_BLKID);
                ASSERT3P(db->db_buf, ==, NULL);
-               arc_buf_destroy(buf, db);
                db->db_state = DB_UNCACHED;
        }
        cv_broadcast(&db->db_changed);
@@ -2512,7 +2516,8 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
  * prefetch if the next block down is our target.
  */
 static void
-dbuf_prefetch_indirect_done(zio_t *zio, int err, arc_buf_t *abuf, void *private)
+dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb,
+    const blkptr_t *iobp, arc_buf_t *abuf, void *private)
 {
        dbuf_prefetch_arg_t *dpa = private;
 
@@ -2551,13 +2556,18 @@ dbuf_prefetch_indirect_done(zio_t *zio, int err, arc_buf_t *abuf, void *private)
                dbuf_rele(db, FTAG);
        }
 
-       dpa->dpa_curlevel--;
+       if (abuf == NULL) {
+               kmem_free(dpa, sizeof (*dpa));
+               return;
+       }
 
+       dpa->dpa_curlevel--;
        uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
            (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
        blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
            P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
-       if (BP_IS_HOLE(bp) || err != 0) {
+
+       if (BP_IS_HOLE(bp)) {
                kmem_free(dpa, sizeof (*dpa));
        } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
                ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
index 00b0a0b9ee2187b1aaf4d3c0b4642725d11d45c7..24516834f0f346300850f8b4fd9422d883c9b2dc 100644 (file)
@@ -1172,14 +1172,26 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg)
 void
 ddt_sync(spa_t *spa, uint64_t txg)
 {
+       dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
        dmu_tx_t *tx;
-       zio_t *rio = zio_root(spa, NULL, NULL,
-           ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+       zio_t *rio;
 
        ASSERT(spa_syncing_txg(spa) == txg);
 
        tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 
+       rio = zio_root(spa, NULL, NULL,
+           ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
+
+       /*
+        * This function may cause an immediate scan of ddt blocks (see
+        * the comment above dsl_scan_ddt() for details). We set the
+        * scan's root zio here so that we can wait for any scan IOs in
+        * addition to the regular ddt IOs.
+        */
+       ASSERT3P(scn->scn_zio_root, ==, NULL);
+       scn->scn_zio_root = rio;
+
        for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
                ddt_t *ddt = spa->spa_ddt[c];
                if (ddt == NULL)
@@ -1189,6 +1201,7 @@ ddt_sync(spa_t *spa, uint64_t txg)
        }
 
        (void) zio_wait(rio);
+       scn->scn_zio_root = NULL;
 
        dmu_tx_commit(tx);
 }
index 64e7d2f7777caf3db974e8f7a2885017adb6dd1a..280e0ee347e9c71881bdf4a6e15f9ce1c0a5d7d9 100644 (file)
@@ -520,7 +520,8 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
 {
        prefetch_data_t *pfd = arg;
        int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE;
-       arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
+       arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH |
+           ARC_FLAG_PRESCIENT_PREFETCH;
 
        ASSERT(pfd->pd_bytes_fetched >= 0);
        if (bp == NULL)
index 43fd90861c600a9f7684545c75ec105a11d43c8a..86863fad87192acb4158e80949452a64549b322b 100644 (file)
@@ -390,8 +390,10 @@ dsl_pool_close(dsl_pool_t *dp)
        mutex_destroy(&dp->dp_lock);
        cv_destroy(&dp->dp_spaceavail_cv);
        taskq_destroy(dp->dp_iput_taskq);
-       if (dp->dp_blkstats)
+       if (dp->dp_blkstats) {
+               mutex_destroy(&dp->dp_blkstats->zab_lock);
                vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
+       }
        kmem_free(dp, sizeof (dsl_pool_t));
 }
 
index b0aec5332def73e90cf16be267d98cd4325c40e1..52c700f11891d1f00c6c7f4c45c34d20345c1000 100644 (file)
 #include <sys/sa_impl.h>
 #include <sys/zfeature.h>
 #include <sys/abd.h>
+#include <sys/range_tree.h>
 #ifdef _KERNEL
 #include <sys/zfs_vfsops.h>
 #endif
 
+/*
+ * Grand theory statement on scan queue sorting
+ *
+ * Scanning is implemented by recursively traversing all indirection levels
+ * in an object and reading all blocks referenced from said objects. This
+ * results in us approximately traversing the object from lowest logical
+ * offset to the highest. For best performance, we would want the logical
+ * blocks to be physically contiguous. However, this is frequently not the
+ * case with pools given the allocation patterns of copy-on-write filesystems.
+ * So instead, we put the I/Os into a reordering queue and issue them in a
+ * way that will most benefit physical disks (LBA-order).
+ *
+ * Queue management:
+ *
+ * Ideally, we would want to scan all metadata and queue up all block I/O
+ * prior to starting to issue it, because that allows us to do an optimal
+ * sorting job. This can however consume large amounts of memory. Therefore
+ * we continuously monitor the size of the queues and constrain them to 5%
+ * (zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this
+ * limit, we clear out a few of the largest extents at the head of the queues
+ * to make room for more scanning. Hopefully, these extents will be fairly
+ * large and contiguous, allowing us to approach sequential I/O throughput
+ * even without a fully sorted tree.
+ *
+ * Metadata scanning takes place in dsl_scan_visit(), which is called from
+ * dsl_scan_sync() every spa_sync(). If we have either fully scanned all
+ * metadata on the pool, or we need to make room in memory because our
+ * queues are too large, dsl_scan_visit() is postponed and
+ * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies
+ * that metadata scanning and queued I/O issuing are mutually exclusive. This
+ * allows us to provide maximum sequential I/O throughput for the majority of
+ * I/O's issued since sequential I/O performance is significantly negatively
+ * impacted if it is interleaved with random I/O.
+ *
+ * Implementation Notes
+ *
+ * One side effect of the queued scanning algorithm is that the scanning code
+ * needs to be notified whenever a block is freed. This is needed to allow
+ * the scanning code to remove these I/Os from the issuing queue. Additionally,
+ * we do not attempt to queue gang blocks to be issued sequentially since this
+ * is very hard to do and would have an extremely limited performance benefit.
+ * Instead, we simply issue gang I/Os as soon as we find them using the legacy
+ * algorithm.
+ *
+ * Backwards compatibility
+ *
+ * This new algorithm is backwards compatible with the legacy on-disk data
+ * structures (and therefore does not require a new feature flag).
+ * Periodically during scanning (see zfs_scan_checkpoint_intval), the scan
+ * will stop scanning metadata (in logical order) and wait for all outstanding
+ * sorted I/O to complete. Once this is done, we write out a checkpoint
+ * bookmark, indicating that we have scanned everything logically before it.
+ * If the pool is imported on a machine without the new sorting algorithm,
+ * the scan simply resumes from the last checkpoint using the legacy algorithm.
+ */
+
 typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *,
     const zbookmark_phys_t *);
 
 static scan_cb_t dsl_scan_scrub_cb;
-static void dsl_scan_cancel_sync(void *, dmu_tx_t *);
-static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *);
-static boolean_t dsl_scan_restarting(dsl_scan_t *, dmu_tx_t *);
 
-int zfs_top_maxinflight = 32;          /* maximum I/Os per top-level */
-int zfs_resilver_delay = 2;            /* number of ticks to delay resilver */
-int zfs_scrub_delay = 4;               /* number of ticks to delay scrub */
-int zfs_scan_idle = 50;                        /* idle window in clock ticks */
+static int scan_ds_queue_compare(const void *a, const void *b);
+static int scan_prefetch_queue_compare(const void *a, const void *b);
+static void scan_ds_queue_clear(dsl_scan_t *scn);
+static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj,
+    uint64_t *txg);
+static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg);
+static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj);
+static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx);
+
+extern int zfs_vdev_async_write_active_min_dirty_percent;
+
+/*
+ * By default zfs will check to ensure it is not over the hard memory
+ * limit before each txg. If finer-grained control of this is needed
+ * this value can be set to 1 to enable checking before scanning each
+ * block.
+ */
+int zfs_scan_strict_mem_lim = B_FALSE;
+
+/*
+ * Maximum number of parallelly executed bytes per leaf vdev. We attempt
+ * to strike a balance here between keeping the vdev queues full of I/Os
+ * at all times and not overflowing the queues to cause long latency,
+ * which would cause long txg sync times. No matter what, we will not
+ * overload the drives with I/O, since that is protected by
+ * zfs_vdev_scrub_max_active.
+ */
+unsigned long zfs_scan_vdev_limit = 4 << 20;
+
+int zfs_scan_issue_strategy = 0;
+int zfs_scan_legacy = B_FALSE; /* don't queue & sort zios, go direct */
+uint64_t zfs_scan_max_ext_gap = 2 << 20; /* in bytes */
+
+/*
+ * fill_weight is non-tunable at runtime, so we copy it at module init from
+ * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would
+ * break queue sorting.
+ */
+int zfs_scan_fill_weight = 3;
+static uint64_t fill_weight;
+
+/* See dsl_scan_should_clear() for details on the memory limit tunables */
+uint64_t zfs_scan_mem_lim_min = 16 << 20;      /* bytes */
+uint64_t zfs_scan_mem_lim_soft_max = 128 << 20;        /* bytes */
+int zfs_scan_mem_lim_fact = 20;                /* fraction of physmem */
+int zfs_scan_mem_lim_soft_fact = 20;   /* fraction of mem lim above */
 
-int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */
+int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */
 int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */
 int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */
+int zfs_scan_checkpoint_intval = 7200; /* in seconds */
 int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */
 int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */
 enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE;
-int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */
 /* max number of blocks to free in a single TXG */
 unsigned long zfs_free_max_blocks = 100000;
 
+/*
+ * We wait a few txgs after importing a pool to begin scanning so that
+ * the import / mounting code isn't held up by scrub / resilver IO.
+ * Unfortunately, it is a bit difficult to determine exactly how long
+ * this will take since userspace will trigger fs mounts asynchronously
+ * and the kernel will create zvol minors asynchronously. As a result,
+ * the value provided here is a bit arbitrary, but represents a
+ * reasonable estimate of how many txgs it will take to finish fully
+ * importing a pool
+ */
+#define        SCAN_IMPORT_WAIT_TXGS           5
+
 #define        DSL_SCAN_IS_SCRUB_RESILVER(scn) \
        ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \
        (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER)
@@ -93,6 +201,163 @@ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
        dsl_scan_scrub_cb,      /* POOL_SCAN_RESILVER */
 };
 
+/* In-core node for the scn->scn_queue. Represents a dataset to be scanned */
+typedef struct {
+       uint64_t        sds_dsobj;      /* dataset object; AVL sort key */
+       uint64_t        sds_txg;        /* txg stored with the dataset */
+       avl_node_t      sds_node;       /* link into scn->scn_queue */
+} scan_ds_t;
+
+/*
+ * This controls what conditions are placed on dsl_scan_sync_state():
+ * SYNC_OPTIONAL) write out scn_phys iff scn_bytes_pending == 0; if bytes
+ *     are still pending, nothing is written.
+ * SYNC_MANDATORY) write out scn_phys always. scn_bytes_pending must be 0
+ *     (this is ASSERTed).
+ * SYNC_CACHED) if scn_bytes_pending == 0, write out scn_phys. Otherwise
+ *     write out the scn_phys_cached version.
+ * See dsl_scan_sync_state() for details.
+ */
+typedef enum {
+       SYNC_OPTIONAL,
+       SYNC_MANDATORY,
+       SYNC_CACHED
+} state_sync_type_t;
+
+/*
+ * This struct represents the minimum information needed to reconstruct a
+ * zio for sequential scanning. This is useful because many of these will
+ * accumulate in the sequential IO queues before being issued, so saving
+ * memory matters here.
+ *
+ * Only one DVA's offset and asize are recorded (see bp2sio()); the vdev id
+ * is not stored and is supplied again when the blkptr is rebuilt by
+ * sio2bp().
+ */
+typedef struct scan_io {
+       /* fields from blkptr_t */
+       uint64_t                sio_offset;
+       uint64_t                sio_blk_prop;
+       uint64_t                sio_phys_birth;
+       uint64_t                sio_birth;
+       zio_cksum_t             sio_cksum;
+       uint32_t                sio_asize;
+
+       /* fields from zio_t */
+       int                     sio_flags;
+       zbookmark_phys_t        sio_zb;
+
+       /* members for queue sorting; the two links share storage */
+       union {
+               avl_node_t      sio_addr_node; /* link into issuing queue */
+               list_node_t     sio_list_node; /* link for issuing to disk */
+       } sio_nodes;
+} scan_io_t;
+
+/*
+ * Queue of scan I/Os gathered for one top-level vdev. It is reached via
+ * vd->vdev_scan_io_queue, and the code in this file inspects its trees while
+ * holding vd->vdev_scan_io_queue_lock.
+ */
+struct dsl_scan_io_queue {
+       dsl_scan_t      *q_scn; /* associated dsl_scan_t */
+       vdev_t          *q_vd; /* top-level vdev that this queue represents */
+
+       /* trees used for sorting I/Os and extents of I/Os */
+       range_tree_t    *q_exts_by_addr;
+       avl_tree_t      q_exts_by_size;
+       avl_tree_t      q_sios_by_addr;
+
+       /* members for zio rate limiting */
+       uint64_t        q_maxinflight_bytes;
+       uint64_t        q_inflight_bytes;
+       kcondvar_t      q_zio_cv; /* used under vd->vdev_scan_io_queue_lock */
+
+       /* per txg statistics */
+       uint64_t        q_total_seg_size_this_txg;
+       uint64_t        q_segs_this_txg;
+       uint64_t        q_total_zio_size_this_txg;
+       uint64_t        q_zios_this_txg;
+};
+
+/* private data for dsl_scan_prefetch_cb() */
+typedef struct scan_prefetch_ctx {
+       refcount_t spc_refcnt;          /* refcount for memory management */
+       dsl_scan_t *spc_scn;            /* dsl_scan_t for the pool */
+       boolean_t spc_root;             /* is this prefetch for an objset? */
+       uint8_t spc_indblkshift;        /* dn_indblkshift of current dnode */
+       uint16_t spc_datablkszsec;      /* dn_datablkszsec of current dnode */
+} scan_prefetch_ctx_t;
+
+/*
+ * Private data for dsl_scan_prefetch(). One of these is queued on
+ * scn->scn_prefetch_queue per block to prefetch; the queue is ordered by
+ * scan_prefetch_queue_compare().
+ */
+typedef struct scan_prefetch_issue_ctx {
+       avl_node_t spic_avl_node;       /* link into scn->scn_prefetch_queue */
+       scan_prefetch_ctx_t *spic_spc;  /* spc for the callback */
+       blkptr_t spic_bp;               /* bp to prefetch */
+       zbookmark_phys_t spic_zb;       /* bookmark to prefetch */
+} scan_prefetch_issue_ctx_t;
+
+static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
+    const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue);
+static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue,
+    scan_io_t *sio);
+
+static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd);
+static void scan_io_queues_destroy(dsl_scan_t *scn);
+
+static kmem_cache_t *sio_cache;
+
+/*
+ * One-time setup for the scan code: latches the zfs_scan_fill_weight
+ * tunable into fill_weight and creates the kmem cache sized for scan_io_t
+ * objects. Paired with scan_fini().
+ */
+void
+scan_init(void)
+{
+       /*
+        * This is used in ext_size_compare() to weight segments
+        * based on how sparse they are. This cannot be changed
+        * mid-scan and the tree comparison functions don't currently
+        * have a mechanism for passing additional context to the
+        * compare functions. Thus we store this value globally and
+        * we only allow it to be set at module initialization time.
+        */
+       fill_weight = zfs_scan_fill_weight;
+
+       sio_cache = kmem_cache_create("sio_cache",
+           sizeof (scan_io_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+}
+
+/* Tears down what scan_init() created: destroys the scan_io_t kmem cache. */
+void
+scan_fini(void)
+{
+       kmem_cache_destroy(sio_cache);
+}
+
+/*
+ * Returns nonzero when the persistent scan state is DSS_SCANNING. Only
+ * scn_state is examined, so a paused scrub (which stays in DSS_SCANNING
+ * with DSF_SCRUB_PAUSED set, see dsl_scan_is_paused_scrub()) also counts.
+ */
+static inline boolean_t
+dsl_scan_is_running(const dsl_scan_t *scn)
+{
+       return (scn->scn_phys.scn_state == DSS_SCANNING);
+}
+
+/*
+ * Reports whether the pool's scan is currently running (DSS_SCANNING) and
+ * the function being performed is POOL_SCAN_RESILVER.
+ */
+boolean_t
+dsl_scan_resilvering(dsl_pool_t *dp)
+{
+       dsl_scan_t *scn = dp->dp_scan;
+
+       if (!dsl_scan_is_running(scn))
+               return (B_FALSE);
+
+       return (scn->scn_phys.scn_func == POOL_SCAN_RESILVER);
+}
+
+/*
+ * Rebuilds a block pointer from a queued scan_io_t. The bp is zeroed first
+ * and only DVA 0 is populated, using the caller-supplied vdev_id together
+ * with the offset and asize saved in the sio.
+ */
+static inline void
+sio2bp(const scan_io_t *sio, blkptr_t *bp, uint64_t vdev_id)
+{
+       dva_t *dva;
+
+       bzero(bp, sizeof (*bp));
+
+       /* the single DVA this sio describes */
+       dva = &bp->blk_dva[0];
+       DVA_SET_ASIZE(dva, sio->sio_asize);
+       DVA_SET_VDEV(dva, vdev_id);
+       DVA_SET_OFFSET(dva, sio->sio_offset);
+
+       /* block-level fields are carried over as saved */
+       bp->blk_cksum = sio->sio_cksum;
+       bp->blk_birth = sio->sio_birth;
+       bp->blk_phys_birth = sio->sio_phys_birth;
+       bp->blk_prop = sio->sio_blk_prop;
+       bp->blk_fill = 1;       /* we always only work with data pointers */
+}
+
+/*
+ * Saves into sio the parts of bp needed to rebuild it later with sio2bp():
+ * the offset and asize of DVA number dva_i plus the block's prop, birth
+ * txgs and checksum. sio_flags and sio_zb are not touched here.
+ */
+static inline void
+bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
+{
+       const dva_t *dva = &bp->blk_dva[dva_i];
+
+       sio->sio_cksum = bp->blk_cksum;
+       sio->sio_birth = bp->blk_birth;
+       sio->sio_phys_birth = bp->blk_phys_birth;
+       sio->sio_blk_prop = bp->blk_prop;
+
+       /* we discard the vdev id, since we can deduce it from the queue */
+       sio->sio_asize = DVA_GET_ASIZE(dva);
+       sio->sio_offset = DVA_GET_OFFSET(dva);
+}
+
 int
 dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 {
@@ -113,6 +378,13 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
        scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
            SPA_FEATURE_ASYNC_DESTROY);
 
+       bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));
+       avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t),
+           offsetof(scan_ds_t, sds_node));
+       avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare,
+           sizeof (scan_prefetch_issue_ctx_t),
+           offsetof(scan_prefetch_issue_ctx_t, spic_avl_node));
+
        err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
            "scrub_func", sizeof (uint64_t), 1, &f);
        if (err == 0) {
@@ -123,7 +395,7 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
                scn->scn_restart_txg = txg;
                zfs_dbgmsg("old-style scrub was in progress; "
                    "restarting new-style scrub in txg %llu",
-                   scn->scn_restart_txg);
+                   (longlong_t)scn->scn_restart_txg);
 
                /*
                 * Load the queue obj from the old location so that it
@@ -157,7 +429,7 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
                                    scn->scn_async_destroying) {
                                        spa->spa_errata =
                                            ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY;
-                                       return (SET_ERROR(EOVERFLOW));
+                                       return (EOVERFLOW);
                                }
 
                                bcopy(zaptmp, &scn->scn_phys,
@@ -177,7 +449,14 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
                else if (err)
                        return (err);
 
-               if (scn->scn_phys.scn_state == DSS_SCANNING &&
+               /*
+                * We might be restarting after a reboot, so jump the issued
+                * counter to how far we've scanned. We know we're consistent
+                * up to here.
+                */
+               scn->scn_issued_before_pass = scn->scn_phys.scn_examined;
+
+               if (dsl_scan_is_running(scn) &&
                    spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) {
                        /*
                         * A new-type scrub was in progress on an old
@@ -189,8 +468,24 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
                        scn->scn_restart_txg = txg;
                        zfs_dbgmsg("new-style scrub was modified "
                            "by old software; restarting in txg %llu",
-                           scn->scn_restart_txg);
+                           (longlong_t)scn->scn_restart_txg);
+               }
+       }
+
+       /* reload the queue into the in-core state */
+       if (scn->scn_phys.scn_queue_obj != 0) {
+               zap_cursor_t zc;
+               zap_attribute_t za;
+
+               for (zap_cursor_init(&zc, dp->dp_meta_objset,
+                   scn->scn_phys.scn_queue_obj);
+                   zap_cursor_retrieve(&zc, &za) == 0;
+                   (void) zap_cursor_advance(&zc)) {
+                       scan_ds_queue_insert(scn,
+                           zfs_strtonum(za.za_name, NULL),
+                           za.za_first_integer);
                }
+               zap_cursor_fini(&zc);
        }
 
        spa_scan_stat_init(spa);
@@ -200,19 +495,116 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
 void
 dsl_scan_fini(dsl_pool_t *dp)
 {
-       if (dp->dp_scan) {
+       if (dp->dp_scan != NULL) {
+               dsl_scan_t *scn = dp->dp_scan;
+
+               if (scn->scn_taskq != NULL)
+                       taskq_destroy(scn->scn_taskq);
+               scan_ds_queue_clear(scn);
+               avl_destroy(&scn->scn_queue);
+               avl_destroy(&scn->scn_prefetch_queue);
+
                kmem_free(dp->dp_scan, sizeof (dsl_scan_t));
                dp->dp_scan = NULL;
        }
 }
 
+/*
+ * Reports whether a restart has been requested (scn_restart_txg is nonzero)
+ * and the txg of tx has reached the requested restart txg.
+ */
+static boolean_t
+dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+       if (scn->scn_restart_txg == 0)
+               return (B_FALSE);
+
+       return (tx->tx_txg >= scn->scn_restart_txg);
+}
+
+/*
+ * Reports whether the pool's persistent scan state is DSS_SCANNING with
+ * POOL_SCAN_SCRUB as the scan function.
+ */
+boolean_t
+dsl_scan_scrubbing(const dsl_pool_t *dp)
+{
+       const dsl_scan_t *scn = dp->dp_scan;
+
+       if (scn->scn_phys.scn_state != DSS_SCANNING)
+               return (B_FALSE);
+
+       return (scn->scn_phys.scn_func == POOL_SCAN_SCRUB);
+}
+
+/*
+ * Reports whether a scrub is in progress on scn's pool and has the
+ * DSF_SCRUB_PAUSED flag set.
+ */
+boolean_t
+dsl_scan_is_paused_scrub(const dsl_scan_t *scn)
+{
+       if (!dsl_scan_scrubbing(scn->scn_dp))
+               return (B_FALSE);
+
+       return ((scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED) != 0);
+}
+
+/*
+ * Writes out a persistent dsl_scan_phys_t record to the pool directory.
+ * Because we can be running in the block sorting algorithm, we do not always
+ * want to write out the record, only when it is "safe" to do so. This safety
+ * condition is achieved by making sure that the sorting queues are empty
+ * (scn_bytes_pending == 0). When this condition is not true, the sync'd state
+ * is inconsistent with how much actual scanning progress has been made. The
+ * kind of sync to be performed is specified by the sync_type argument. If the
+ * sync is optional, we only sync if the queues are empty (otherwise nothing
+ * is written). If the sync is mandatory, we do a hard ASSERT to make sure
+ * that the queues are empty. The third possible state is a "cached" sync.
+ * This is done in response to:
+ * 1) The dataset that was in the last sync'd dsl_scan_phys_t having been
+ *     destroyed, so we wouldn't be able to restart scanning from it.
+ * 2) The snapshot that was in the last sync'd dsl_scan_phys_t having been
+ *     superseded by a newer snapshot.
+ * 3) The dataset that was in the last sync'd dsl_scan_phys_t having been
+ *     swapped with its clone.
+ * In all cases, a cached sync simply rewrites the last record we've written,
+ * just slightly modified. For the modifications that are performed to the
+ * last written dsl_scan_phys_t, see dsl_scan_ds_destroyed,
+ * dsl_scan_ds_snapshotted and dsl_scan_ds_clone_swapped.
+ */
+static void
+dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type)
+{
+       int i;
+       spa_t *spa = scn->scn_dp->dp_spa;
+
+       ASSERT(sync_type != SYNC_MANDATORY || scn->scn_bytes_pending == 0);
+       if (scn->scn_bytes_pending == 0) {
+               /*
+                * Nothing is pending, so every per-vdev queue that exists
+                * is expected to be empty; check each one under its lock.
+                */
+               for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) {
+                       vdev_t *vd = spa->spa_root_vdev->vdev_child[i];
+                       dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue;
+
+                       if (q == NULL)
+                               continue;
+
+                       mutex_enter(&vd->vdev_scan_io_queue_lock);
+                       ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL);
+                       ASSERT3P(avl_first(&q->q_exts_by_size), ==, NULL);
+                       ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL);
+                       mutex_exit(&vd->vdev_scan_io_queue_lock);
+               }
+
+               /*
+                * Rewrite the on-disk dataset queue from the in-core one.
+                * This must precede the zap_update() below because
+                * scan_ds_queue_sync() assigns a new scn_queue_obj.
+                */
+               if (scn->scn_phys.scn_queue_obj != 0)
+                       scan_ds_queue_sync(scn, tx);
+               VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
+                   DMU_POOL_DIRECTORY_OBJECT,
+                   DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+                   &scn->scn_phys, tx));
+               /* remember what was written for later SYNC_CACHED rewrites */
+               bcopy(&scn->scn_phys, &scn->scn_phys_cached,
+                   sizeof (scn->scn_phys));
+
+               if (scn->scn_checkpointing)
+                       zfs_dbgmsg("finish scan checkpoint");
+
+               scn->scn_checkpointing = B_FALSE;
+               scn->scn_last_checkpoint = ddi_get_lbolt();
+       } else if (sync_type == SYNC_CACHED) {
+               /* queues are not empty: write the cached record instead */
+               VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
+                   DMU_POOL_DIRECTORY_OBJECT,
+                   DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
+                   &scn->scn_phys_cached, tx));
+       }
+}
+
 /* ARGSUSED */
 static int
 dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
 {
        dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 
-       if (scn->scn_phys.scn_state == DSS_SCANNING)
+       if (dsl_scan_is_running(scn))
                return (SET_ERROR(EBUSY));
 
        return (0);
@@ -227,7 +619,7 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
        dsl_pool_t *dp = scn->scn_dp;
        spa_t *spa = dp->dp_spa;
 
-       ASSERT(scn->scn_phys.scn_state != DSS_SCANNING);
+       ASSERT(!dsl_scan_is_running(scn));
        ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
        bzero(&scn->scn_phys, sizeof (scn->scn_phys));
        scn->scn_phys.scn_func = *funcp;
@@ -238,8 +630,11 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
        scn->scn_phys.scn_start_time = gethrestime_sec();
        scn->scn_phys.scn_errors = 0;
        scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc;
+       scn->scn_issued_before_pass = 0;
        scn->scn_restart_txg = 0;
        scn->scn_done_txg = 0;
+       scn->scn_last_checkpoint = 0;
+       scn->scn_checkpointing = B_FALSE;
        spa_scan_stat_init(spa);
 
        if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
@@ -272,8 +667,10 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
        if (dp->dp_blkstats == NULL) {
                dp->dp_blkstats =
                    vmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP);
+               mutex_init(&dp->dp_blkstats->zab_lock, NULL,
+                   MUTEX_DEFAULT, NULL);
        }
-       bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
+       bzero(&dp->dp_blkstats->zab_type, sizeof (dp->dp_blkstats->zab_type));
 
        if (spa_version(spa) < SPA_VERSION_DSL_SCRUB)
                ot = DMU_OT_ZAP_OTHER;
@@ -281,13 +678,52 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
        scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset,
            ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx);
 
-       dsl_scan_sync_state(scn, tx);
+       bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys));
+
+       dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
 
        spa_history_log_internal(spa, "scan setup", tx,
            "func=%u mintxg=%llu maxtxg=%llu",
            *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg);
 }
 
+/*
+ * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver.
+ * Can also be called to resume a paused scrub.
+ *
+ * Return values:
+ *  - ECANCELED if func is POOL_SCAN_SCRUB, a paused scrub existed and it
+ *    was resumed successfully (no new scan is started in that case);
+ *  - the error from dsl_scrub_set_pause_resume() if resuming failed;
+ *  - otherwise the result of the dsl_sync_task() that sets up the new scan
+ *    (dsl_scan_setup_check() fails it with EBUSY if a scan is running).
+ */
+int
+dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
+{
+       spa_t *spa = dp->dp_spa;
+       dsl_scan_t *scn = dp->dp_scan;
+
+       /*
+        * Purge all vdev caches and probe all devices.  We do this here
+        * rather than in sync context because this requires a writer lock
+        * on the spa_config lock, which we can't do from sync context.  The
+        * spa_scrub_reopen flag indicates that vdev_open() should not
+        * attempt to start another scrub.
+        */
+       spa_vdev_state_enter(spa, SCL_NONE);
+       spa->spa_scrub_reopen = B_TRUE;
+       vdev_reopen(spa->spa_root_vdev);
+       spa->spa_scrub_reopen = B_FALSE;
+       (void) spa_vdev_state_exit(spa, NULL, 0);
+
+       if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) {
+               /* got scrub start cmd, resume paused scrub */
+               int err = dsl_scrub_set_pause_resume(scn->scn_dp,
+                   POOL_SCRUB_NORMAL);
+               /*
+                * A successful resume is reported as ECANCELED. The error
+                * originates here, so route it through SET_ERROR() like
+                * every other error origination in this file.
+                */
+               if (err == 0)
+                       return (SET_ERROR(ECANCELED));
+
+               return (SET_ERROR(err));
+       }
+
+       return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
+           dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE));
+}
+
 /* ARGSUSED */
 static void
 dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
@@ -315,10 +751,11 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
        }
 
        if (scn->scn_phys.scn_queue_obj != 0) {
-               VERIFY(0 == dmu_object_free(dp->dp_meta_objset,
+               VERIFY0(dmu_object_free(dp->dp_meta_objset,
                    scn->scn_phys.scn_queue_obj, tx));
                scn->scn_phys.scn_queue_obj = 0;
        }
+       scan_ds_queue_clear(scn);
 
        scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
 
@@ -326,13 +763,22 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
         * If we were "restarted" from a stopped state, don't bother
         * with anything else.
         */
-       if (scn->scn_phys.scn_state != DSS_SCANNING)
+       if (!dsl_scan_is_running(scn)) {
+               ASSERT(!scn->scn_is_sorted);
                return;
+       }
 
-       if (complete)
-               scn->scn_phys.scn_state = DSS_FINISHED;
-       else
-               scn->scn_phys.scn_state = DSS_CANCELED;
+       if (scn->scn_is_sorted) {
+               scan_io_queues_destroy(scn);
+               scn->scn_is_sorted = B_FALSE;
+
+               if (scn->scn_taskq != NULL) {
+                       taskq_destroy(scn->scn_taskq);
+                       scn->scn_taskq = NULL;
+               }
+       }
+
+       scn->scn_phys.scn_state = complete ? DSS_FINISHED : DSS_CANCELED;
 
        if (dsl_scan_restarting(scn, tx))
                spa_history_log_internal(spa, "scan aborted, restarting", tx,
@@ -345,12 +791,6 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
                    "errors=%llu", spa_get_errlog_size(spa));
 
        if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
-               mutex_enter(&spa->spa_scrub_lock);
-               while (spa->spa_scrub_inflight > 0) {
-                       cv_wait(&spa->spa_scrub_io_cv,
-                           &spa->spa_scrub_lock);
-               }
-               mutex_exit(&spa->spa_scrub_lock);
                spa->spa_scrub_started = B_FALSE;
                spa->spa_scrub_active = B_FALSE;
 
@@ -379,6 +819,8 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
 
        if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB)
                spa->spa_errata = 0;
+
+       ASSERT(!dsl_scan_is_running(scn));
 }
 
 /* ARGSUSED */
@@ -387,7 +829,7 @@ dsl_scan_cancel_check(void *arg, dmu_tx_t *tx)
 {
        dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 
-       if (scn->scn_phys.scn_state != DSS_SCANNING)
+       if (!dsl_scan_is_running(scn))
                return (SET_ERROR(ENOENT));
        return (0);
 }
@@ -399,7 +841,7 @@ dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx)
        dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
 
        dsl_scan_done(scn, B_FALSE, tx);
-       dsl_scan_sync_state(scn, tx);
+       dsl_scan_sync_state(scn, tx, SYNC_MANDATORY);
 }
 
 int
@@ -409,16 +851,6 @@ dsl_scan_cancel(dsl_pool_t *dp)
            dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED));
 }
 
-boolean_t
-dsl_scan_is_paused_scrub(const dsl_scan_t *scn)
-{
-       if (dsl_scan_scrubbing(scn->scn_dp) &&
-           scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED)
-               return (B_TRUE);
-
-       return (B_FALSE);
-}
-
 static int
 dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx)
 {
@@ -453,7 +885,7 @@ dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
                /* can't pause a scrub when there is no in-progress scrub */
                spa->spa_scan_pass_scrub_pause = gethrestime_sec();
                scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED;
-               dsl_scan_sync_state(scn, tx);
+               dsl_scan_sync_state(scn, tx, SYNC_CACHED);
        } else {
                ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL);
                if (dsl_scan_is_paused_scrub(scn)) {
@@ -466,7 +898,7 @@ dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
                            gethrestime_sec() - spa->spa_scan_pass_scrub_pause;
                        spa->spa_scan_pass_scrub_pause = 0;
                        scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED;
-                       dsl_scan_sync_state(scn, tx);
+                       dsl_scan_sync_state(scn, tx, SYNC_CACHED);
                }
        }
 }
@@ -482,25 +914,25 @@ dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd)
            ZFS_SPACE_CHECK_RESERVED));
 }
 
-boolean_t
-dsl_scan_scrubbing(const dsl_pool_t *dp)
-{
-       dsl_scan_t *scn = dp->dp_scan;
 
-       if (scn->scn_phys.scn_state == DSS_SCANNING &&
-           scn->scn_phys.scn_func == POOL_SCAN_SCRUB)
-               return (B_TRUE);
+/* start a new scan, or restart an existing one. */
+void
+dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
+{
+       if (txg == 0) {
+               dmu_tx_t *tx;
+               tx = dmu_tx_create_dd(dp->dp_mos_dir);
+               VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
 
-       return (B_FALSE);
+               txg = dmu_tx_get_txg(tx);
+               dp->dp_scan->scn_restart_txg = txg;
+               dmu_tx_commit(tx);
+       } else {
+               dp->dp_scan->scn_restart_txg = txg;
+       }
+       zfs_dbgmsg("restarting resilver txg=%llu", (longlong_t)txg);
 }
 
-static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
-    dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
-    dmu_objset_type_t ostype, dmu_tx_t *tx);
-inline __attribute__((always_inline)) static void dsl_scan_visitdnode(
-    dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype,
-    dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx);
-
 void
 dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp)
 {
@@ -514,25 +946,169 @@ dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp)
        zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags));
 }
 
-static uint64_t
-dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
+static int
+scan_ds_queue_compare(const void *a, const void *b)
 {
-       uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
-       if (ds->ds_is_snapshot)
-               return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg));
-       return (smt);
+       const scan_ds_t *sds_a = a, *sds_b = b;
+
+       if (sds_a->sds_dsobj < sds_b->sds_dsobj)
+               return (-1);
+       if (sds_a->sds_dsobj == sds_b->sds_dsobj)
+               return (0);
+       return (1);
 }
 
 static void
-dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
+scan_ds_queue_clear(dsl_scan_t *scn)
+{
+       void *cookie = NULL;
+       scan_ds_t *sds;
+       while ((sds = avl_destroy_nodes(&scn->scn_queue, &cookie)) != NULL) {
+               kmem_free(sds, sizeof (*sds));
+       }
+}
+
+static boolean_t
+scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg)
 {
-       VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
-           DMU_POOL_DIRECTORY_OBJECT,
-           DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
-           &scn->scn_phys, tx));
+       scan_ds_t srch, *sds;
+
+       srch.sds_dsobj = dsobj;
+       sds = avl_find(&scn->scn_queue, &srch, NULL);
+       if (sds != NULL && txg != NULL)
+               *txg = sds->sds_txg;
+       return (sds != NULL);
 }
 
-extern int zfs_vdev_async_write_active_min_dirty_percent;
+/*
+ * Allocates a scan_ds_t for (dsobj, txg) and links it into scn->scn_queue.
+ * The dataset must not already be queued; a duplicate trips the VERIFY.
+ */
+static void
+scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg)
+{
+       avl_index_t where;
+       scan_ds_t *sds = kmem_zalloc(sizeof (scan_ds_t), KM_SLEEP);
+
+       sds->sds_txg = txg;
+       sds->sds_dsobj = dsobj;
+
+       /* the lookup doubles as the duplicate check and yields the slot */
+       VERIFY3P(avl_find(&scn->scn_queue, sds, &where), ==, NULL);
+       avl_insert(&scn->scn_queue, sds, where);
+}
+
+/*
+ * Unlinks the entry for dsobj from scn->scn_queue and frees it. The entry
+ * must exist; a missing entry trips the VERIFY.
+ */
+static void
+scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj)
+{
+       scan_ds_t key;
+       scan_ds_t *sds;
+
+       /* only the sort key needs to be filled in for the lookup */
+       key.sds_dsobj = dsobj;
+       sds = avl_find(&scn->scn_queue, &key, NULL);
+       VERIFY(sds != NULL);
+
+       avl_remove(&scn->scn_queue, sds);
+       kmem_free(sds, sizeof (scan_ds_t));
+}
+
+/*
+ * Replaces the on-disk scan queue with the contents of the in-core
+ * scn->scn_queue: the existing queue object is freed, a new ZAP object is
+ * created (its number is stored in scn_phys.scn_queue_obj) and one
+ * dsobj -> txg entry is added per queued dataset. Requires that no bytes
+ * are pending and that a queue object already exists.
+ */
+static void
+scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+       dsl_pool_t *dp = scn->scn_dp;
+       dmu_object_type_t ot;
+       scan_ds_t *sds;
+
+       /* pools older than SPA_VERSION_DSL_SCRUB use the generic ZAP type */
+       if (spa_version(dp->dp_spa) >= SPA_VERSION_DSL_SCRUB)
+               ot = DMU_OT_SCAN_QUEUE;
+       else
+               ot = DMU_OT_ZAP_OTHER;
+
+       ASSERT0(scn->scn_bytes_pending);
+       ASSERT(scn->scn_phys.scn_queue_obj != 0);
+
+       VERIFY0(dmu_object_free(dp->dp_meta_objset,
+           scn->scn_phys.scn_queue_obj, tx));
+       scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot,
+           DMU_OT_NONE, 0, tx);
+
+       sds = avl_first(&scn->scn_queue);
+       while (sds != NULL) {
+               VERIFY0(zap_add_int_key(dp->dp_meta_objset,
+                   scn->scn_phys.scn_queue_obj, sds->sds_dsobj,
+                   sds->sds_txg, tx));
+               sds = AVL_NEXT(&scn->scn_queue, sds);
+       }
+}
+
+/*
+ * Computes the memory limit state that we're currently in. A sorted scan
+ * needs quite a bit of memory to hold the sorting queue, so we need to
+ * reasonably constrain the size so it doesn't impact overall system
+ * performance. We compute two limits:
+ * 1) Hard memory limit: if the amount of memory used by the sorting
+ *     queues on a pool gets above this value, we stop the metadata
+ *     scanning portion and start issuing the queued up and sorted
+ *     I/Os to reduce memory usage.
+ *     This limit is calculated as a fraction of physmem
+ *     (1 / zfs_scan_mem_lim_fact, by default 5%).
+ *     We constrain the lower bound of the hard limit to an absolute
+ *     minimum of zfs_scan_mem_lim_min (default: 16 MiB). We also constrain
+ *     the upper bound to 1/20th (5%) of the value returned by
+ *     metaslab_class_get_alloc() for the pool's normal class - no chance
+ *     we'll ever need that much memory, but just to keep the value in
+ *     check. Note that this cap is applied last, so it can push the hard
+ *     limit below zfs_scan_mem_lim_min.
+ * 2) Soft memory limit: once we hit the hard memory limit, we start
+ *     issuing I/O to reduce queue memory usage, but we don't want to
+ *     completely empty out the queues, since we might be able to find I/Os
+ *     that will fill in the gaps of our non-sequential IOs at some point
+ *     in the future. So we stop the issuing of I/Os once the amount of
+ *     memory used drops below the soft limit (at which point we stop issuing
+ *     I/O and start scanning metadata again).
+ *
+ *     This limit is calculated by subtracting a fraction of the hard
+ *     limit from the hard limit. By default this fraction is 5%, so
+ *     the soft limit is 95% of the hard limit. We cap the size of the
+ *     difference between the hard and soft limits at an absolute
+ *     maximum of zfs_scan_mem_lim_soft_max (default: 128 MiB) - this is
+ *     sufficient to not cause too frequent switching between the
+ *     metadata scan and I/O issue (even at 2k recordsize, 128 MiB's
+ *     worth of queues is about 1.2 GiB of on-pool data, so scanning
+ *     that should take at least a decent fraction of a second).
+ *
+ * Returns B_TRUE when queued I/Os should be issued to free memory and
+ * B_FALSE when metadata scanning should continue. Both
+ * zfs_scan_mem_lim_fact and zfs_scan_mem_lim_soft_fact are used as divisors
+ * below, so they must be nonzero.
+ */
+static boolean_t
+dsl_scan_should_clear(dsl_scan_t *scn)
+{
+       vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
+       uint64_t mlim_hard, mlim_soft, mused;
+       uint64_t alloc = metaslab_class_get_alloc(spa_normal_class(
+           scn->scn_dp->dp_spa));
+
+       mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE,
+           zfs_scan_mem_lim_min);
+       mlim_hard = MIN(mlim_hard, alloc / 20);
+       mlim_soft = mlim_hard - MIN(mlim_hard / zfs_scan_mem_lim_soft_fact,
+           zfs_scan_mem_lim_soft_max);
+       mused = 0;
+       /* estimate queue memory from node counts of each child's queue */
+       for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+               vdev_t *tvd = rvd->vdev_child[i];
+               dsl_scan_io_queue_t *queue;
+
+               mutex_enter(&tvd->vdev_scan_io_queue_lock);
+               queue = tvd->vdev_scan_io_queue;
+               if (queue != NULL) {
+                       /* #extents in exts_by_size = # in exts_by_addr */
+                       mused += avl_numnodes(&queue->q_exts_by_size) *
+                           sizeof (range_seg_t) +
+                           avl_numnodes(&queue->q_sios_by_addr) *
+                           sizeof (scan_io_t);
+               }
+               mutex_exit(&tvd->vdev_scan_io_queue_lock);
+       }
+
+       dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused);
+
+       if (mused == 0)
+               ASSERT0(scn->scn_bytes_pending);
+
+       /*
+        * If we are above our hard limit, we need to clear out memory.
+        * If we are below our soft limit, we need to accumulate sequential IOs.
+        * Otherwise, we should keep doing whatever we are currently doing.
+        */
+       if (mused >= mlim_hard)
+               return (B_TRUE);
+       else if (mused < mlim_soft)
+               return (B_FALSE);
+       else
+               return (scn->scn_clearing);
+}
 
 static boolean_t
 dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
@@ -553,27 +1129,32 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
 
        /*
         * We suspend if:
-        *  - we have scanned for the maximum time: an entire txg
-        *    timeout (default 5 sec)
-        *  or
         *  - we have scanned for at least the minimum time (default 1 sec
         *    for scrub, 3 sec for resilver), and either we have sufficient
         *    dirty data that we are starting to write more quickly
-        *    (default 30%), or someone is explicitly waiting for this txg
-        *    to complete.
+        *    (default 30%), someone is explicitly waiting for this txg
+        *    to complete, or we have used up all of the time in the txg
+        *    timeout (default 5 sec).
         *  or
         *  - the spa is shutting down because this pool is being exported
         *    or the machine is rebooting.
+        *  or
+        *  - the scan queue has reached its memory use limit
         */
-       int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
-           zfs_resilver_min_time_ms : zfs_scan_min_time_ms;
-       uint64_t elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time;
+       uint64_t curr_time_ns = gethrtime();
+       uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time;
+       uint64_t sync_time_ns = curr_time_ns -
+           scn->scn_dp->dp_spa->spa_sync_starttime;
        int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max;
-       if (elapsed_nanosecs / NANOSEC >= zfs_txg_timeout ||
-           (NSEC2MSEC(elapsed_nanosecs) > mintime &&
-           (txg_sync_waiting(scn->scn_dp) ||
-           dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent)) ||
-           spa_shutting_down(scn->scn_dp->dp_spa)) {
+       int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ?
+           zfs_resilver_min_time_ms : zfs_scrub_min_time_ms;
+
+       if ((NSEC2MSEC(scan_time_ns) > mintime &&
+           (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent ||
+           txg_sync_waiting(scn->scn_dp) ||
+           NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) ||
+           spa_shutting_down(scn->scn_dp->dp_spa) ||
+           (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) {
                if (zb) {
                        dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n",
                            (longlong_t)zb->zb_objset,
@@ -581,12 +1162,16 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
                            (longlong_t)zb->zb_level,
                            (longlong_t)zb->zb_blkid);
                        scn->scn_phys.scn_bookmark = *zb;
+               } else {
+                       dsl_scan_phys_t *scnp = &scn->scn_phys;
+
+                       dprintf("suspending at at DDT bookmark "
+                           "%llx/%llx/%llx/%llx\n",
+                           (longlong_t)scnp->scn_ddt_bookmark.ddb_class,
+                           (longlong_t)scnp->scn_ddt_bookmark.ddb_type,
+                           (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum,
+                           (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor);
                }
-               dprintf("suspending at DDT bookmark %llx/%llx/%llx/%llx\n",
-                   (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
-                   (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
-                   (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
-                   (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
                scn->scn_suspending = B_TRUE;
                return (B_TRUE);
        }
@@ -683,32 +1268,283 @@ dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh)
        zil_free(zilog);
 }
 
-/* ARGSUSED */
-static void
-dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp,
-    uint64_t objset, uint64_t object, uint64_t blkid)
+/*
+ * We compare scan_prefetch_issue_ctx_t's based on their bookmarks. The idea
+ * here is to sort the AVL tree by the order each block will be needed.
+ */
+static int
+scan_prefetch_queue_compare(const void *a, const void *b)
 {
-       zbookmark_phys_t czb;
-       arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
-       int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
+       const scan_prefetch_issue_ctx_t *spic_a = a, *spic_b = b;
+       const scan_prefetch_ctx_t *spc_a = spic_a->spic_spc;
+       const scan_prefetch_ctx_t *spc_b = spic_b->spic_spc;
 
-       if (zfs_no_scrub_prefetch)
-               return;
+       return (zbookmark_compare(spc_a->spc_datablkszsec,
+           spc_a->spc_indblkshift, spc_b->spc_datablkszsec,
+           spc_b->spc_indblkshift, &spic_a->spic_zb, &spic_b->spic_zb));
+}
 
-       if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg ||
-           (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE))
-               return;
+/*
+ * Drop the reference that 'tag' holds on the prefetch context 'spc'. When
+ * refcount_remove() reports that no references remain, the refcount is
+ * destroyed and the context itself is freed.
+ */
+static void
+scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, void *tag)
+{
+       /* other holders remain; nothing to tear down yet */
+       if (refcount_remove(&spc->spc_refcnt, tag) != 0)
+               return;
+
+       refcount_destroy(&spc->spc_refcnt);
+       kmem_free(spc, sizeof (scan_prefetch_ctx_t));
+}
+
+/*
+ * Allocate a prefetch context for 'scn' that starts out holding a single
+ * reference on behalf of 'tag'. When 'dnp' is supplied, its data block size
+ * and indirect block shift are copied into the context; a NULL 'dnp' yields
+ * a context flagged as a root with both fields zeroed.
+ */
+static scan_prefetch_ctx_t *
+scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, void *tag)
+{
+       scan_prefetch_ctx_t *ctx;
+
+       ctx = kmem_alloc(sizeof (scan_prefetch_ctx_t), KM_SLEEP);
+       refcount_create(&ctx->spc_refcnt);
+       refcount_add(&ctx->spc_refcnt, tag);
+       ctx->spc_scn = scn;
+       ctx->spc_root = (dnp == NULL) ? B_TRUE : B_FALSE;
+       ctx->spc_datablkszsec = (dnp == NULL) ? 0 : dnp->dn_datablkszsec;
+       ctx->spc_indblkshift = (dnp == NULL) ? 0 : dnp->dn_indblkshift;
+
+       return (ctx);
+}
+
+/*
+ * Take an additional reference on the prefetch context 'spc' on behalf of
+ * 'tag'.
+ */
+static void
+scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, void *tag)
+{
+       refcount_add(&spc->spc_refcnt, tag);
+}
+
+/*
+ * Decide whether the block at 'zb' can be left out of prefetching, judged
+ * against the scan's prefetch bookmark. Returns B_TRUE (skip) when the block
+ * belongs to a different objset than the bookmark, or when
+ * zbookmark_subtree_completed() reports its subtree as already done.
+ * Objects whose number is negative when viewed as an int64_t are never
+ * skipped.
+ */
+static boolean_t
+dsl_scan_check_prefetch_resume(scan_prefetch_ctx_t *spc,
+    const zbookmark_phys_t *zb)
+{
+       zbookmark_phys_t *resume_zb = &spc->spc_scn->scn_prefetch_bookmark;
+       dnode_phys_t shim_dn;
+
+       if (zb->zb_objset != resume_zb->zb_objset)
+               return (B_TRUE);
+       if ((int64_t)zb->zb_object < 0)
+               return (B_FALSE);
+
+       /*
+        * Only the two block geometry fields of the stand-in dnode are
+        * filled in; root contexts pass no dnode at all.
+        */
+       shim_dn.dn_datablkszsec = spc->spc_datablkszsec;
+       shim_dn.dn_indblkshift = spc->spc_indblkshift;
+
+       return (zbookmark_subtree_completed(spc->spc_root ? NULL : &shim_dn,
+           zb, resume_zb) ? B_TRUE : B_FALSE);
+}
+
+/*
+ * Queue the block 'bp', located at bookmark 'zb', for prefetching on behalf
+ * of the context 'spc'. Nothing is queued when prefetching is disabled via
+ * zfs_no_scrub_prefetch, when the block is a hole, when it was born at or
+ * before scn_cur_min_txg, when it is a level-0 block that is neither a dnode
+ * nor an objset block, or when dsl_scan_check_prefetch_resume() says to skip
+ * it. Every queued entry holds a reference on 'spc' tagged with the scan.
+ */
+static void
+dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb)
+{
+       dsl_scan_t *scn = spc->spc_scn;
+       spa_t *spa = scn->scn_dp->dp_spa;
+       scan_prefetch_issue_ctx_t *entry;
+       avl_index_t where;
+
+       if (zfs_no_scrub_prefetch)
+               return;
+
+       if (BP_IS_HOLE(bp))
+               return;
+       if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
+               return;
+       if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE &&
+           BP_GET_TYPE(bp) != DMU_OT_OBJSET)
+               return;
+
+       if (dsl_scan_check_prefetch_resume(spc, zb))
+               return;
+
+       scan_prefetch_ctx_add_ref(spc, scn);
+       entry = kmem_alloc(sizeof (scan_prefetch_issue_ctx_t), KM_SLEEP);
+       entry->spic_spc = spc;
+       entry->spic_bp = *bp;
+       entry->spic_zb = *zb;
+
+       /*
+        * Entries go into the prefetch queue so that the blocks the main
+        * traversal thread will need first can be prioritized.
+        */
+       mutex_enter(&spa->spa_scrub_lock);
+       if (avl_find(&scn->scn_prefetch_queue, entry, &where) == NULL) {
+               avl_insert(&scn->scn_prefetch_queue, entry, where);
+               cv_broadcast(&spa->spa_scrub_io_cv);
+       } else {
+               /* already queued: undo the allocation and the reference */
+               kmem_free(entry, sizeof (scan_prefetch_issue_ctx_t));
+               scan_prefetch_ctx_rele(spc, scn);
+       }
+       mutex_exit(&spa->spa_scrub_lock);
+}
+
+/*
+ * Queue prefetches for every block pointer embedded in the dnode 'dnp'
+ * (object number 'object' in objset 'objset'), plus its spill block pointer
+ * when DNODE_FLAG_SPILL_BLKPTR is set. A dnode with neither is ignored.
+ */
+static void
+dsl_scan_prefetch_dnode(dsl_scan_t *scn, dnode_phys_t *dnp,
+    uint64_t objset, uint64_t object)
+{
+       boolean_t has_spill =
+           (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? B_TRUE : B_FALSE;
+       zbookmark_phys_t zb;
+       scan_prefetch_ctx_t *spc;
+       int slot;
+
+       /* nothing hangs off this dnode */
+       if (dnp->dn_nblkptr == 0 && !has_spill)
+               return;
+
+       SET_BOOKMARK(&zb, objset, object, 0, 0);
+       spc = scan_prefetch_ctx_create(scn, dnp, FTAG);
+
+       for (slot = 0; slot < dnp->dn_nblkptr; slot++) {
+               blkptr_t *child = &dnp->dn_blkptr[slot];
+
+               zb.zb_level = BP_GET_LEVEL(child);
+               zb.zb_blkid = slot;
+               dsl_scan_prefetch(spc, child, &zb);
+       }
+
+       if (has_spill) {
+               zb.zb_level = 0;
+               zb.zb_blkid = DMU_SPILL_BLKID;
+               dsl_scan_prefetch(spc, DN_SPILL_BLKPTR(dnp), &zb);
+       }
+
+       /* drop the creation reference; queued entries took their own */
+       scan_prefetch_ctx_rele(spc, FTAG);
+}
+
+/*
+ * Completion callback handed to arc_read() by the prefetch thread; 'private'
+ * is the scan_prefetch_ctx_t the read was issued for. The block's physical
+ * size is subtracted from spa_scrub_inflight and waiters on spa_scrub_io_cv
+ * are woken. Unless the read produced no buffer or the scan is suspending,
+ * the children of the block (the block pointers of an indirect block, the
+ * dnodes of a dnode block, or the dnodes embedded in an objset) are queued
+ * for prefetch in turn. The buffer, if any, is destroyed and the reference
+ * on the context tagged with the scan is dropped before returning.
+ */
+void
+dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
+    arc_buf_t *buf, void *private)
+{
+       scan_prefetch_ctx_t *spc = private;
+       dsl_scan_t *scn = spc->spc_scn;
+       spa_t *spa = scn->scn_dp->dp_spa;
+
+       /* broadcast that the IO has completed for rate limiting purposes */
+       mutex_enter(&spa->spa_scrub_lock);
+       ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp));
+       spa->spa_scrub_inflight -= BP_GET_PSIZE(bp);
+       cv_broadcast(&spa->spa_scrub_io_cv);
+       mutex_exit(&spa->spa_scrub_lock);
+
+       /* if there was an error or we are done prefetching, just cleanup */
+       if (buf == NULL || scn->scn_suspending)
+               goto out;
+
+       if (BP_GET_LEVEL(bp) > 0) {
+               int i;
+               blkptr_t *cbp;
+               /* number of block pointers stored in this indirect block */
+               int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;
+               zbookmark_phys_t czb;
+
+               /* each child sits one level down, at blkid parent * epb + i */
+               for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
+                       SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
+                           zb->zb_level - 1, zb->zb_blkid * epb + i);
+                       dsl_scan_prefetch(spc, cbp, &czb);
+               }
+       } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
+               dnode_phys_t *cdnp;
+               int i;
+               /* number of dnode slots stored in this block */
+               int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
+
+               /*
+                * Step over dn_extra_slots so that a dnode occupying
+                * several slots is visited only once.
+                */
+               for (i = 0, cdnp = buf->b_data; i < epb;
+                   i += cdnp->dn_extra_slots + 1,
+                   cdnp += cdnp->dn_extra_slots + 1) {
+                       dsl_scan_prefetch_dnode(scn, cdnp,
+                           zb->zb_objset, zb->zb_blkid * epb + i);
+               }
+       } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
+               objset_phys_t *osp = buf->b_data;
+
+               dsl_scan_prefetch_dnode(scn, &osp->os_meta_dnode,
+                   zb->zb_objset, DMU_META_DNODE_OBJECT);
+
+               /* accounting dnodes are only followed when the buf has them */
+               if (OBJSET_BUF_HAS_USERUSED(buf)) {
+                       dsl_scan_prefetch_dnode(scn,
+                           &osp->os_groupused_dnode, zb->zb_objset,
+                           DMU_GROUPUSED_OBJECT);
+                       dsl_scan_prefetch_dnode(scn,
+                           &osp->os_userused_dnode, zb->zb_objset,
+                           DMU_USERUSED_OBJECT);
+               }
+       }
+
+out:
+       if (buf != NULL)
+               arc_buf_destroy(buf, private);
+       /* drop the reference tagged with the scan */
+       scan_prefetch_ctx_rele(spc, scn);
+}
+
+/* ARGSUSED */
+static void
+dsl_scan_prefetch_thread(void *arg)
+{
+       dsl_scan_t *scn = arg;
+       spa_t *spa = scn->scn_dp->dp_spa;
+       scan_prefetch_issue_ctx_t *spic;
+
+       /* loop until we are told to stop */
+       while (!scn->scn_prefetch_stop) {
+               arc_flags_t flags = ARC_FLAG_NOWAIT |
+                   ARC_FLAG_PRESCIENT_PREFETCH | ARC_FLAG_PREFETCH;
+               int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD;
+
+               mutex_enter(&spa->spa_scrub_lock);
+
+               /*
+                * Wait until we have an IO to issue and are not above our
+                * maximum in flight limit.
+                */
+               while (!scn->scn_prefetch_stop &&
+                   (avl_numnodes(&scn->scn_prefetch_queue) == 0 ||
+                   spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)) {
+                       cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+               }
+
+               /* recheck if we should stop since we waited for the cv */
+               if (scn->scn_prefetch_stop) {
+                       mutex_exit(&spa->spa_scrub_lock);
+                       break;
+               }
+
+               /* remove the prefetch IO from the tree */
+               spic = avl_first(&scn->scn_prefetch_queue);
+               spa->spa_scrub_inflight += BP_GET_PSIZE(&spic->spic_bp);
+               avl_remove(&scn->scn_prefetch_queue, spic);
+
+               mutex_exit(&spa->spa_scrub_lock);
+
+               if (BP_IS_PROTECTED(&spic->spic_bp)) {
+                       ASSERT(BP_GET_TYPE(&spic->spic_bp) == DMU_OT_DNODE ||
+                           BP_GET_TYPE(&spic->spic_bp) == DMU_OT_OBJSET);
+                       ASSERT3U(BP_GET_LEVEL(&spic->spic_bp), ==, 0);
+                       zio_flags |= ZIO_FLAG_RAW;
+               }
+
+               /* issue the prefetch asynchronously */
+               (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa,
+                   &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc,
+                   ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, &spic->spic_zb);
 
-       if (BP_IS_PROTECTED(bp)) {
-               ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_DNODE);
-               ASSERT3U(BP_GET_LEVEL(bp), ==, 0);
-               zio_flags |= ZIO_FLAG_RAW;
+               kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
        }
 
-       SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid);
+       ASSERT(scn->scn_prefetch_stop);
 
-       (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp,
-           NULL, NULL, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, &czb);
+       /* free any prefetches we didn't get to complete */
+       mutex_enter(&spa->spa_scrub_lock);
+       while ((spic = avl_first(&scn->scn_prefetch_queue)) != NULL) {
+               avl_remove(&scn->scn_prefetch_queue, spic);
+               scan_prefetch_ctx_rele(spic->spic_spc, scn);
+               kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t));
+       }
+       ASSERT0(avl_numnodes(&scn->scn_prefetch_queue));
+       mutex_exit(&spa->spa_scrub_lock);
 }
 
 static boolean_t
@@ -747,6 +1583,13 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp,
        return (B_FALSE);
 }
 
+/*
+ * Forward declarations so that these routines can be referenced ahead of
+ * their definitions. The storage-class specifier is placed first in each
+ * declaration; putting it after other specifiers is an obsolescent form.
+ */
+static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
+    dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn,
+    dmu_objset_type_t ostype, dmu_tx_t *tx);
+static inline __attribute__((always_inline)) void dsl_scan_visitdnode(
+    dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
+    dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx);
+
 /*
  * Return nonzero on i/o error.
  * Return new buf to write out in *bufp.
@@ -773,10 +1616,6 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
                        scn->scn_phys.scn_errors++;
                        return (err);
                }
-               for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
-                       dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset,
-                           zb->zb_object, zb->zb_blkid * epb + i);
-               }
                for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) {
                        zbookmark_phys_t czb;
 
@@ -790,7 +1629,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
        } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
                arc_flags_t flags = ARC_FLAG_WAIT;
                dnode_phys_t *cdnp;
-               int i, j;
+               int i;
                int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
                arc_buf_t *buf;
 
@@ -805,15 +1644,6 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
                        scn->scn_phys.scn_errors++;
                        return (err);
                }
-               for (i = 0, cdnp = buf->b_data; i < epb;
-                   i += cdnp->dn_extra_slots + 1,
-                   cdnp += cdnp->dn_extra_slots + 1) {
-                       for (j = 0; j < cdnp->dn_nblkptr; j++) {
-                               blkptr_t *cbp = &cdnp->dn_blkptr[j];
-                               dsl_scan_prefetch(scn, buf, cbp,
-                                   zb->zb_objset, zb->zb_blkid * epb + i, j);
-                       }
-               }
                for (i = 0, cdnp = buf->b_data; i < epb;
                    i += cdnp->dn_extra_slots + 1,
                    cdnp += cdnp->dn_extra_slots + 1) {
@@ -843,8 +1673,8 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
                        /*
                         * We also always visit user/group accounting
                         * objects, and never skip them, even if we are
-                        * suspending.  This is necessary so that the space
-                        * deltas from this txg get integrated.
+                        * suspending. This is necessary so that the
+                        * space deltas from this txg get integrated.
                         */
                        dsl_scan_visitdnode(scn, ds, osp->os_type,
                            &osp->os_groupused_dnode,
@@ -894,21 +1724,13 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
     dmu_objset_type_t ostype, dmu_tx_t *tx)
 {
        dsl_pool_t *dp = scn->scn_dp;
-       blkptr_t *bp_toread;
-
-       bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
-       *bp_toread = *bp;
-
-       /* ASSERT(pbuf == NULL || arc_released(pbuf)); */
+       blkptr_t *bp_toread = NULL;
 
        if (dsl_scan_check_suspend(scn, zb))
-               goto out;
+               return;
 
        if (dsl_scan_check_resume(scn, dnp, zb))
-               goto out;
-
-       if (BP_IS_HOLE(bp))
-               goto out;
+               return;
 
        scn->scn_visited_this_txg++;
 
@@ -919,14 +1741,24 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
         * if required to debug an issue in dsl_scan_visitbp().
         *
         * dprintf_bp(bp,
-        *    "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p",
-        *    ds, ds ? ds->ds_object : 0,
-        *    zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
-        *    bp);
+        *     "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p",
+        *     ds, ds ? ds->ds_object : 0,
+        *     zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid,
+        *     bp);
         */
 
-       if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg)
-               goto out;
+       if (BP_IS_HOLE(bp)) {
+               scn->scn_holes_this_txg++;
+               return;
+       }
+
+       if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) {
+               scn->scn_lt_min_this_txg++;
+               return;
+       }
+
+       bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
+       *bp_toread = *bp;
 
        if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx) != 0)
                goto out;
@@ -938,6 +1770,7 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
         */
        if (ddt_class_contains(dp->dp_spa,
            scn->scn_phys.scn_ddt_class_max, bp)) {
+               scn->scn_ddt_contained_this_txg++;
                goto out;
        }
 
@@ -948,9 +1781,13 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb,
         * Don't scan it now unless we need to because something
         * under it was modified.
         */
-       if (BP_PHYSICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_max_txg) {
-               scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
+       if (BP_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) {
+               scn->scn_gt_max_this_txg++;
+               goto out;
        }
+
+       scan_funcs[scn->scn_phys.scn_func](dp, bp, zb);
+
 out:
        kmem_free(bp_toread, sizeof (blkptr_t));
 }
@@ -960,26 +1797,33 @@ dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp,
     dmu_tx_t *tx)
 {
        zbookmark_phys_t zb;
+       scan_prefetch_ctx_t *spc;
 
        SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
            ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
-       dsl_scan_visitbp(bp, &zb, NULL,
-           ds, scn, DMU_OST_NONE, tx);
+
+       if (ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) {
+               SET_BOOKMARK(&scn->scn_prefetch_bookmark,
+                   zb.zb_objset, 0, 0, 0);
+       } else {
+               scn->scn_prefetch_bookmark = scn->scn_phys.scn_bookmark;
+       }
+
+       scn->scn_objsets_visited_this_txg++;
+
+       spc = scan_prefetch_ctx_create(scn, NULL, FTAG);
+       dsl_scan_prefetch(spc, bp, &zb);
+       scan_prefetch_ctx_rele(spc, FTAG);
+
+       dsl_scan_visitbp(bp, &zb, NULL, ds, scn, DMU_OST_NONE, tx);
 
        dprintf_ds(ds, "finished scan%s", "");
 }
 
-void
-dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
+static void
+ds_destroyed_scn_phys(dsl_dataset_t *ds, dsl_scan_phys_t *scn_phys)
 {
-       dsl_pool_t *dp = ds->ds_dir->dd_pool;
-       dsl_scan_t *scn = dp->dp_scan;
-       uint64_t mintxg;
-
-       if (scn->scn_phys.scn_state != DSS_SCANNING)
-               return;
-
-       if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
+       if (scn_phys->scn_bookmark.zb_objset == ds->ds_object) {
                if (ds->ds_is_snapshot) {
                        /*
                         * Note:
@@ -991,23 +1835,57 @@ dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
                         *    ignore it when we retraverse it in
                         *    dsl_scan_visitds().
                         */
-                       scn->scn_phys.scn_bookmark.zb_objset =
+                       scn_phys->scn_bookmark.zb_objset =
                            dsl_dataset_phys(ds)->ds_next_snap_obj;
                        zfs_dbgmsg("destroying ds %llu; currently traversing; "
                            "reset zb_objset to %llu",
                            (u_longlong_t)ds->ds_object,
                            (u_longlong_t)dsl_dataset_phys(ds)->
                            ds_next_snap_obj);
-                       scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN;
+                       scn_phys->scn_flags |= DSF_VISIT_DS_AGAIN;
                } else {
-                       SET_BOOKMARK(&scn->scn_phys.scn_bookmark,
+                       SET_BOOKMARK(&scn_phys->scn_bookmark,
                            ZB_DESTROYED_OBJSET, 0, 0, 0);
                        zfs_dbgmsg("destroying ds %llu; currently traversing; "
                            "reset bookmark to -1,0,0,0",
                            (u_longlong_t)ds->ds_object);
                }
-       } else if (zap_lookup_int_key(dp->dp_meta_objset,
-           scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
+       }
+}
+
+/*
+ * Invoked when a dataset is destroyed. We need to make sure that:
+ *
+ * 1) If it is the dataset that was currently being scanned, we write
+ *     a new dsl_scan_phys_t, marking the objset reference in it
+ *     as destroyed.
+ * 2) It is removed from the work queue, if it was present.
+ *
+ * If the dataset was actually a snapshot, we substitute the next snapshot
+ * in line instead of marking the dataset as destroyed.
+ */
+void
+dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+       dsl_pool_t *dp = ds->ds_dir->dd_pool;
+       dsl_scan_t *scn = dp->dp_scan;
+       uint64_t mintxg;
+
+       if (!dsl_scan_is_running(scn))
+               return;
+
+       ds_destroyed_scn_phys(ds, &scn->scn_phys);
+       ds_destroyed_scn_phys(ds, &scn->scn_phys_cached);
+
+       if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) {
+               scan_ds_queue_remove(scn, ds->ds_object);
+               if (ds->ds_is_snapshot)
+                       scan_ds_queue_insert(scn,
+                           dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg);
+       }
+
+       if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+           ds->ds_object, &mintxg) == 0) {
                ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1);
                VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
                    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
@@ -1036,9 +1914,28 @@ dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx)
         * dsl_scan_sync() should be called after this, and should sync
         * out our changed state, but just to be safe, do it here.
         */
-       dsl_scan_sync_state(scn, tx);
+       dsl_scan_sync_state(scn, tx, SYNC_CACHED);
+}
+
+/*
+ * If 'scn_bookmark' currently refers to the dataset 'ds', redirect it to the
+ * dataset's previous snapshot object (ds_prev_snap_obj) and log the change.
+ * Bookmarks that refer to any other objset are left untouched.
+ */
+static void
+ds_snapshotted_bookmark(dsl_dataset_t *ds, zbookmark_phys_t *scn_bookmark)
+{
+       uint64_t prev_snap;
+
+       if (scn_bookmark->zb_objset != ds->ds_object)
+               return;
+
+       prev_snap = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+       scn_bookmark->zb_objset = prev_snap;
+       zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
+           "reset zb_objset to %llu",
+           (u_longlong_t)ds->ds_object,
+           (u_longlong_t)prev_snap);
 }
 
+/*
+ * Called when a dataset is snapshotted. If we were currently traversing
+ * this dataset, we reset our bookmark to point at the newly created
+ * snapshot. We also modify our work queue to remove the dataset and
+ * replace it with the new snapshot.
+ */
 void
 dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
 {
@@ -1046,20 +1943,22 @@ dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
        dsl_scan_t *scn = dp->dp_scan;
        uint64_t mintxg;
 
-       if (scn->scn_phys.scn_state != DSS_SCANNING)
+       if (!dsl_scan_is_running(scn))
                return;
 
        ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0);
 
-       if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) {
-               scn->scn_phys.scn_bookmark.zb_objset =
-                   dsl_dataset_phys(ds)->ds_prev_snap_obj;
-               zfs_dbgmsg("snapshotting ds %llu; currently traversing; "
-                   "reset zb_objset to %llu",
-                   (u_longlong_t)ds->ds_object,
-                   (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
-       } else if (zap_lookup_int_key(dp->dp_meta_objset,
-           scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) {
+       ds_snapshotted_bookmark(ds, &scn->scn_phys.scn_bookmark);
+       ds_snapshotted_bookmark(ds, &scn->scn_phys_cached.scn_bookmark);
+
+       if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) {
+               scan_ds_queue_remove(scn, ds->ds_object);
+               scan_ds_queue_insert(scn,
+                   dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg);
+       }
+
+       if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+           ds->ds_object, &mintxg) == 0) {
                VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
                    scn->scn_phys.scn_queue_obj, ds->ds_object, tx));
                VERIFY(zap_add_int_key(dp->dp_meta_objset,
@@ -1070,37 +1969,59 @@ dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx)
                    (u_longlong_t)ds->ds_object,
                    (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj);
        }
-       dsl_scan_sync_state(scn, tx);
+
+       dsl_scan_sync_state(scn, tx, SYNC_CACHED);
 }
 
-void
-dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
+static void
+ds_clone_swapped_bookmark(dsl_dataset_t *ds1, dsl_dataset_t *ds2,
+    zbookmark_phys_t *scn_bookmark)
 {
-       dsl_pool_t *dp = ds1->ds_dir->dd_pool;
-       dsl_scan_t *scn = dp->dp_scan;
-       uint64_t mintxg;
-
-       if (scn->scn_phys.scn_state != DSS_SCANNING)
-               return;
-
-       if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) {
-               scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object;
+       if (scn_bookmark->zb_objset == ds1->ds_object) {
+               scn_bookmark->zb_objset = ds2->ds_object;
                zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
                    "reset zb_objset to %llu",
                    (u_longlong_t)ds1->ds_object,
                    (u_longlong_t)ds2->ds_object);
-       } else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) {
-               scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object;
+       } else if (scn_bookmark->zb_objset == ds2->ds_object) {
+               scn_bookmark->zb_objset = ds1->ds_object;
                zfs_dbgmsg("clone_swap ds %llu; currently traversing; "
                    "reset zb_objset to %llu",
                    (u_longlong_t)ds2->ds_object,
                    (u_longlong_t)ds1->ds_object);
        }
+}
+
+/*
+ * Called when a parent dataset and its clone are swapped. If we were
+ * currently traversing the dataset, we need to switch to traversing the
+ * newly promoted parent.
+ */
+void
+dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
+{
+       dsl_pool_t *dp = ds1->ds_dir->dd_pool;
+       dsl_scan_t *scn = dp->dp_scan;
+       uint64_t mintxg;
+
+       if (!dsl_scan_is_running(scn))
+               return;
+
+       ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys.scn_bookmark);
+       ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys_cached.scn_bookmark);
+
+       if (scan_ds_queue_contains(scn, ds1->ds_object, &mintxg)) {
+               scan_ds_queue_remove(scn, ds1->ds_object);
+               scan_ds_queue_insert(scn, ds2->ds_object, mintxg);
+       }
+       if (scan_ds_queue_contains(scn, ds2->ds_object, &mintxg)) {
+               scan_ds_queue_remove(scn, ds2->ds_object);
+               scan_ds_queue_insert(scn, ds1->ds_object, mintxg);
+       }
 
        if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
            ds1->ds_object, &mintxg) == 0) {
                int err;
-
                ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
                ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
                VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
@@ -1118,8 +2039,9 @@ dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
                    "replacing with %llu",
                    (u_longlong_t)ds1->ds_object,
                    (u_longlong_t)ds2->ds_object);
-       } else if (zap_lookup_int_key(dp->dp_meta_objset,
-           scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) {
+       }
+       if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
+           ds2->ds_object, &mintxg) == 0) {
                ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg);
                ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg);
                VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
@@ -1132,31 +2054,26 @@ dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx)
                    (u_longlong_t)ds1->ds_object);
        }
 
-       dsl_scan_sync_state(scn, tx);
+       dsl_scan_sync_state(scn, tx, SYNC_CACHED);
 }
 
-struct enqueue_clones_arg {
-       dmu_tx_t *tx;
-       uint64_t originobj;
-};
-
 /* ARGSUSED */
 static int
 enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 {
-       struct enqueue_clones_arg *eca = arg;
+       uint64_t originobj = *(uint64_t *)arg;
        dsl_dataset_t *ds;
        int err;
        dsl_scan_t *scn = dp->dp_scan;
 
-       if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != eca->originobj)
+       if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != originobj)
                return (0);
 
        err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
        if (err)
                return (err);
 
-       while (dsl_dataset_phys(ds)->ds_prev_snap_obj != eca->originobj) {
+       while (dsl_dataset_phys(ds)->ds_prev_snap_obj != originobj) {
                dsl_dataset_t *prev;
                err = dsl_dataset_hold_obj(dp,
                    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
@@ -1166,9 +2083,8 @@ enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
                        return (err);
                ds = prev;
        }
-       VERIFY(zap_add_int_key(dp->dp_meta_objset,
-           scn->scn_phys.scn_queue_obj, ds->ds_object,
-           dsl_dataset_phys(ds)->ds_prev_snap_txg, eca->tx) == 0);
+       scan_ds_queue_insert(scn, ds->ds_object,
+           dsl_dataset_phys(ds)->ds_prev_snap_txg);
        dsl_dataset_rele(ds, FTAG);
        return (0);
 }
@@ -1214,9 +2130,9 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
                dsl_dataset_name(ds, dsname);
                zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because "
                    "cur_min_txg (%llu) >= max_txg (%llu)",
-                   dsobj, dsname,
-                   scn->scn_phys.scn_cur_min_txg,
-                   scn->scn_phys.scn_max_txg);
+                   (longlong_t)dsobj, dsname,
+                   (longlong_t)scn->scn_phys.scn_cur_min_txg,
+                   (longlong_t)scn->scn_phys.scn_max_txg);
                kmem_free(dsname, MAXNAMELEN);
 
                goto out;
@@ -1232,7 +2148,7 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
         * ZIL here, rather than in scan_recurse(), because the regular
         * snapshot block-sharing rules don't apply to it.
         */
-       if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !ds->ds_is_snapshot)
+       if (!ds->ds_is_snapshot)
                dsl_scan_zil(dp, &os->os_zil_header);
 
        /*
@@ -1266,9 +2182,8 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
        if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) {
                zfs_dbgmsg("incomplete pass; visiting again");
                scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN;
-               VERIFY(zap_add_int_key(dp->dp_meta_objset,
-                   scn->scn_phys.scn_queue_obj, ds->ds_object,
-                   scn->scn_phys.scn_cur_max_txg, tx) == 0);
+               scan_ds_queue_insert(scn, ds->ds_object,
+                   scn->scn_phys.scn_cur_max_txg);
                goto out;
        }
 
@@ -1276,10 +2191,9 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
         * Add descendent datasets to work queue.
         */
        if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) {
-               VERIFY(zap_add_int_key(dp->dp_meta_objset,
-                   scn->scn_phys.scn_queue_obj,
+               scan_ds_queue_insert(scn,
                    dsl_dataset_phys(ds)->ds_next_snap_obj,
-                   dsl_dataset_phys(ds)->ds_creation_txg, tx) == 0);
+                   dsl_dataset_phys(ds)->ds_creation_txg);
        }
        if (dsl_dataset_phys(ds)->ds_num_children > 1) {
                boolean_t usenext = B_FALSE;
@@ -1300,17 +2214,21 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx)
                }
 
                if (usenext) {
-                       VERIFY0(zap_join_key(dp->dp_meta_objset,
-                           dsl_dataset_phys(ds)->ds_next_clones_obj,
-                           scn->scn_phys.scn_queue_obj,
-                           dsl_dataset_phys(ds)->ds_creation_txg, tx));
+                       zap_cursor_t zc;
+                       zap_attribute_t za;
+                       for (zap_cursor_init(&zc, dp->dp_meta_objset,
+                           dsl_dataset_phys(ds)->ds_next_clones_obj);
+                           zap_cursor_retrieve(&zc, &za) == 0;
+                           (void) zap_cursor_advance(&zc)) {
+                               scan_ds_queue_insert(scn,
+                                   zfs_strtonum(za.za_name, NULL),
+                                   dsl_dataset_phys(ds)->ds_creation_txg);
+                       }
+                       zap_cursor_fini(&zc);
                } else {
-                       struct enqueue_clones_arg eca;
-                       eca.tx = tx;
-                       eca.originobj = ds->ds_object;
-
                        VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
-                           enqueue_clones_cb, &eca, DS_FIND_CHILDREN));
+                           enqueue_clones_cb, &ds->ds_object,
+                           DS_FIND_CHILDREN));
                }
        }
 
@@ -1322,7 +2240,6 @@ out:
 static int
 enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
 {
-       dmu_tx_t *tx = arg;
        dsl_dataset_t *ds;
        int err;
        dsl_scan_t *scn = dp->dp_scan;
@@ -1352,12 +2269,37 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
                ds = prev;
        }
 
-       VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj,
-           ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx) == 0);
+       scan_ds_queue_insert(scn, ds->ds_object,
+           dsl_dataset_phys(ds)->ds_prev_snap_txg);
        dsl_dataset_rele(ds, FTAG);
        return (0);
 }
 
+/*
+ * Run the active scan function over each physical variant of a dedup
+ * table entry. Nothing is done unless the scan state is DSS_SCANNING.
+ * Variants with a zero phys birth, or a phys birth beyond the scan's
+ * scn_max_txg, are skipped. The tx argument is not used.
+ */
+/* ARGSUSED */
+void
+dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
+    ddt_entry_t *dde, dmu_tx_t *tx)
+{
+       zbookmark_phys_t zb = { 0 };
+       blkptr_t bp;
+       int idx;
+
+       if (scn->scn_phys.scn_state != DSS_SCANNING)
+               return;
+
+       for (idx = 0; idx < DDT_PHYS_TYPES; idx++) {
+               ddt_phys_t *phys = &dde->dde_phys[idx];
+
+               if (phys->ddp_phys_birth != 0 &&
+                   phys->ddp_phys_birth <= scn->scn_phys.scn_max_txg) {
+                       /* rebuild a block pointer for this variant */
+                       ddt_bp_create(checksum, &dde->dde_key, phys, &bp);
+
+                       scn->scn_visited_this_txg += 1;
+                       scan_funcs[scn->scn_phys.scn_func](scn->scn_dp,
+                           &bp, &zb);
+               }
+       }
+}
+
 /*
  * Scrub/dedup interaction.
  *
@@ -1432,36 +2374,20 @@ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
            ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
 }
 
-/* ARGSUSED */
-void
-dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum,
-    ddt_entry_t *dde, dmu_tx_t *tx)
+static uint64_t
+dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
 {
-       const ddt_key_t *ddk = &dde->dde_key;
-       ddt_phys_t *ddp = dde->dde_phys;
-       blkptr_t bp;
-       zbookmark_phys_t zb = { 0 };
-
-       if (scn->scn_phys.scn_state != DSS_SCANNING)
-               return;
-
-       for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
-               if (ddp->ddp_phys_birth == 0 ||
-                   ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg)
-                       continue;
-               ddt_bp_create(checksum, ddk, ddp, &bp);
-
-               scn->scn_visited_this_txg++;
-               scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb);
-       }
+       uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
+       if (ds->ds_is_snapshot)
+               return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg));
+       return (smt);
 }
 
 static void
 dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
 {
+       scan_ds_t *sds;
        dsl_pool_t *dp = scn->scn_dp;
-       zap_cursor_t *zc;
-       zap_attribute_t *za;
 
        if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
            scn->scn_phys.scn_ddt_class_max) {
@@ -1485,7 +2411,7 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
 
                if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
                        VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
-                           enqueue_cb, tx, DS_FIND_CHILDREN));
+                           enqueue_cb, NULL, DS_FIND_CHILDREN));
                } else {
                        dsl_scan_visitds(scn,
                            dp->dp_origin_snap->ds_object, tx);
@@ -1493,42 +2419,42 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
                ASSERT(!scn->scn_suspending);
        } else if (scn->scn_phys.scn_bookmark.zb_objset !=
            ZB_DESTROYED_OBJSET) {
+               uint64_t dsobj = scn->scn_phys.scn_bookmark.zb_objset;
                /*
-                * If we were suspended, continue from here.  Note if the
+                * If we were suspended, continue from here. Note if the
                 * ds we were suspended on was deleted, the zb_objset may
                 * be -1, so we will skip this and find a new objset
                 * below.
                 */
-               dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx);
+               dsl_scan_visitds(scn, dsobj, tx);
                if (scn->scn_suspending)
                        return;
        }
 
        /*
-        * In case we were suspended right at the end of the ds, zero the
+        * In case we suspended right at the end of the ds, zero the
         * bookmark so we don't think that we're still trying to resume.
         */
        bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t));
-       zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
-       za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 
-       /* keep pulling things out of the zap-object-as-queue */
-       while (zap_cursor_init(zc, dp->dp_meta_objset,
-           scn->scn_phys.scn_queue_obj),
-           zap_cursor_retrieve(zc, za) == 0) {
+       /*
+        * Keep pulling things out of the dataset avl queue. Updates to the
+        * persistent zap-object-as-queue happen only at checkpoints.
+        */
+       while ((sds = avl_first(&scn->scn_queue)) != NULL) {
                dsl_dataset_t *ds;
-               uint64_t dsobj;
+               uint64_t dsobj = sds->sds_dsobj;
+               uint64_t txg = sds->sds_txg;
 
-               dsobj = zfs_strtonum(za->za_name, NULL);
-               VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset,
-                   scn->scn_phys.scn_queue_obj, dsobj, tx));
+               /* dequeue and free the ds from the queue */
+               scan_ds_queue_remove(scn, dsobj);
+               sds = NULL;
 
-               /* Set up min/max txg */
+               /* set up min / max txg */
                VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
-               if (za->za_first_integer != 0) {
+               if (txg != 0) {
                        scn->scn_phys.scn_cur_min_txg =
-                           MAX(scn->scn_phys.scn_min_txg,
-                           za->za_first_integer);
+                           MAX(scn->scn_phys.scn_min_txg, txg);
                } else {
                        scn->scn_phys.scn_cur_min_txg =
                            MAX(scn->scn_phys.scn_min_txg,
@@ -1538,14 +2464,360 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
                dsl_dataset_rele(ds, FTAG);
 
                dsl_scan_visitds(scn, dsobj, tx);
-               zap_cursor_fini(zc);
                if (scn->scn_suspending)
-                       goto out;
+                       return;
        }
-       zap_cursor_fini(zc);
-out:
-       kmem_free(za, sizeof (zap_attribute_t));
-       kmem_free(zc, sizeof (zap_cursor_t));
+
+       /* No more objsets to fetch, we're done */
+       scn->scn_phys.scn_bookmark.zb_objset = ZB_DESTROYED_OBJSET;
+       ASSERT0(scn->scn_suspending);
+}
+
+/*
+ * Recursively count the leaf vdevs at or below vd. A vdev that is a log,
+ * spare or l2cache device, or that is not readable, contributes zero
+ * (and nothing beneath it is visited).
+ */
+static uint64_t
+dsl_scan_count_leaves(vdev_t *vd)
+{
+       uint64_t total = 0;
+       uint64_t c;
+
+       /* only main-pool vdevs are of interest ... */
+       if (vd->vdev_islog || vd->vdev_isspare || vd->vdev_isl2cache)
+               return (0);
+
+       /* ... and only if they can currently be read */
+       if (!vdev_readable(vd))
+               return (0);
+
+       if (!vd->vdev_ops->vdev_op_leaf) {
+               for (c = 0; c < vd->vdev_children; c++)
+                       total += dsl_scan_count_leaves(vd->vdev_child[c]);
+               return (total);
+       }
+
+       return (1);
+}
+
+/*
+ * Record one zio in the queue's per-txg statistics: add the sum of
+ * DVA_GET_ASIZE() over all of the bp's DVAs to the zio size total and
+ * bump the zio count.
+ */
+static void
+scan_io_queues_update_zio_stats(dsl_scan_io_queue_t *q, const blkptr_t *bp)
+{
+       uint64_t asize_sum = 0;
+       int d = 0;
+
+       while (d < BP_GET_NDVAS(bp)) {
+               asize_sum += DVA_GET_ASIZE(&bp->blk_dva[d]);
+               d++;
+       }
+
+       q->q_total_zio_size_this_txg += asize_sum;
+       q->q_zios_this_txg += 1;
+}
+
+/*
+ * Record one segment in the queue's per-txg statistics: add (end - start)
+ * to the segment size total and bump the segment count.
+ */
+static void
+scan_io_queues_update_seg_stats(dsl_scan_io_queue_t *q, uint64_t start,
+    uint64_t end)
+{
+       uint64_t seg_len = end - start;
+
+       q->q_total_seg_size_this_txg += seg_len;
+       q->q_segs_this_txg += 1;
+}
+
+/*
+ * Decide whether issuing of scan I/O should stop for now. Returns nonzero
+ * when the pool is shutting down, or when the scan has been running this
+ * sync for longer than the applicable minimum time (the resilver or the
+ * scrub tunable, in ms) and at least one of the following holds:
+ *  - the dirty data percentage has reached
+ *    zfs_vdev_async_write_active_min_dirty_percent,
+ *  - txg_sync_waiting() reports a waiter,
+ *  - the time since spa_sync_starttime is at least zfs_txg_timeout seconds.
+ * See the comment in dsl_scan_check_suspend() for the rationale.
+ */
+static boolean_t
+scan_io_queue_check_suspend(dsl_scan_t *scn)
+{
+       dsl_pool_t *dp = scn->scn_dp;
+       uint64_t now_ns = gethrtime();
+       uint64_t scan_elapsed_ns = now_ns - scn->scn_sync_start_time;
+       uint64_t sync_elapsed_ns = now_ns - dp->dp_spa->spa_sync_starttime;
+       int dirty_pct = dp->dp_dirty_total * 100 / zfs_dirty_data_max;
+       int mintime;
+
+       if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER)
+               mintime = zfs_resilver_min_time_ms;
+       else
+               mintime = zfs_scrub_min_time_ms;
+
+       if (NSEC2MSEC(scan_elapsed_ns) > mintime) {
+               if (dirty_pct >=
+                   zfs_vdev_async_write_active_min_dirty_percent)
+                       return (B_TRUE);
+               if (txg_sync_waiting(dp))
+                       return (B_TRUE);
+               if (NSEC2SEC(sync_elapsed_ns) >= zfs_txg_timeout)
+                       return (B_TRUE);
+       }
+
+       return (spa_shutting_down(dp->dp_spa) ? B_TRUE : B_FALSE);
+}
+
+/*
+ * Issue the scan_io_t's on io_list out to disk, head first. Each sio is
+ * turned into a block pointer, passed to scan_exec_io(), unlinked from
+ * the list, counted in the queue's zio statistics and then freed back to
+ * sio_cache. Before every sio we ask scan_io_queue_check_suspend(); if it
+ * says to stop we quit early and return B_TRUE, leaving all unissued sios
+ * on io_list. Otherwise the list is consumed entirely and B_FALSE is
+ * returned. In both cases the bytes that were issued are subtracted from
+ * scn_bytes_pending. This is called when emptying queues, either when
+ * we're up against the memory limit or when we have finished scanning.
+ */
+static boolean_t
+scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list)
+{
+       dsl_scan_t *scn = queue->q_scn;
+       boolean_t stopped_early = B_FALSE;
+       int64_t issued = 0;
+       scan_io_t *cur;
+
+       for (cur = list_head(io_list); cur != NULL;
+           cur = list_head(io_list)) {
+               blkptr_t bp;
+
+               if (scan_io_queue_check_suspend(scn)) {
+                       stopped_early = B_TRUE;
+                       break;
+               }
+
+               sio2bp(cur, &bp, queue->q_vd->vdev_id);
+               issued += cur->sio_asize;
+               scan_exec_io(scn->scn_dp, &bp, cur->sio_flags,
+                   &cur->sio_zb, queue);
+               (void) list_remove_head(io_list);
+               scan_io_queues_update_zio_stats(queue, &bp);
+               kmem_cache_free(sio_cache, cur);
+       }
+
+       atomic_add_64(&scn->scn_bytes_pending, -issued);
+
+       return (stopped_early);
+}
+
+/*
+ * This function removes sios from an IO queue which reside within a given
+ * range_seg_t and inserts them (in offset order) into a list. Note that
+ * we only ever return a maximum of 32 sios at once. If there are more sios
+ * to process within this segment that did not make it onto the list we
+ * return B_TRUE and otherwise B_FALSE.
+ *
+ * When B_TRUE is returned, the segment's fill is reduced by the bytes that
+ * were gathered and the segment is resized to begin at the first sio that
+ * was left behind. When B_FALSE is returned, the segment has been removed
+ * from the queue's range tree. The caller must hold the vdev's
+ * vdev_scan_io_queue_lock.
+ */
+static boolean_t
+scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
+{
+       /* upper bound on the number of sios handed back per call */
+       const uint_t max_sios = 32;
+       scan_io_t srch_sio, *sio, *next_sio;
+       avl_index_t idx;
+       uint_t num_sios = 0;
+       int64_t bytes_issued = 0;
+
+       ASSERT(rs != NULL);
+       ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+
+       srch_sio.sio_offset = rs->rs_start;
+
+       /*
+        * The exact start of the extent might not contain any matching zios,
+        * so if that's the case, examine the next one in the tree.
+        */
+       sio = avl_find(&queue->q_sios_by_addr, &srch_sio, &idx);
+       if (sio == NULL)
+               sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER);
+
+       /*
+        * Stop once max_sios have been gathered. The comparison is strict
+        * so that the documented limit of 32 is honoured exactly (a
+        * non-strict comparison would hand back 33).
+        */
+       while (sio != NULL && sio->sio_offset < rs->rs_end &&
+           num_sios < max_sios) {
+               ASSERT3U(sio->sio_offset, >=, rs->rs_start);
+               ASSERT3U(sio->sio_offset + sio->sio_asize, <=, rs->rs_end);
+
+               /* fetch the successor before unlinking the current node */
+               next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio);
+               avl_remove(&queue->q_sios_by_addr, sio);
+
+               bytes_issued += sio->sio_asize;
+               num_sios++;
+               list_insert_tail(list, sio);
+               sio = next_sio;
+       }
+
+       /*
+        * We limit the number of sios we process at once to 32 to avoid
+        * biting off more than we can chew. If we didn't take everything
+        * in the segment we update it to reflect the work we were able to
+        * complete. Otherwise, we remove it from the range tree entirely.
+        */
+       if (sio != NULL && sio->sio_offset < rs->rs_end) {
+               range_tree_adjust_fill(queue->q_exts_by_addr, rs,
+                   -bytes_issued);
+               range_tree_resize_segment(queue->q_exts_by_addr, rs,
+                   sio->sio_offset, rs->rs_end - sio->sio_offset);
+
+               return (B_TRUE);
+       } else {
+               range_tree_remove(queue->q_exts_by_addr, rs->rs_start,
+                   rs->rs_end - rs->rs_start);
+               return (B_FALSE);
+       }
+}
+
+/*
+ * Called from the queue emptying thread to pick the next extent from
+ * which io's should be issued. The choice depends on the scan's state:
+ * 1) If the scan is neither checkpointing nor clearing, no extent is
+ *     selected (NULL is returned).
+ * 2) If zfs_scan_issue_strategy is 1 or 2, the tunable decides: 1 picks
+ *     the first extent in LBA order, 2 picks the first extent of the
+ *     by-size tree.
+ * 3) Otherwise a checkpointing scan issues in LBA order (elevator
+ *     algorithm) and a clearing scan (up against the memory limit)
+ *     issues from the by-size tree, largest extents first.
+ * The caller must hold the vdev's vdev_scan_io_queue_lock.
+ */
+static range_seg_t *
+scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue)
+{
+       dsl_scan_t *scn = queue->q_scn;
+
+       ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+       ASSERT(scn->scn_is_sorted);
+
+       /* not draining the queues at all: nothing to hand out */
+       if (!scn->scn_checkpointing && !scn->scn_clearing)
+               return (NULL);
+
+       /* handle tunable overrides */
+       switch (zfs_scan_issue_strategy) {
+       case 1:
+               return (range_tree_first(queue->q_exts_by_addr));
+       case 2:
+               return (avl_first(&queue->q_exts_by_size));
+       default:
+               break;
+       }
+
+       /*
+        * While a scan is checkpointing, no new extents will be added to
+        * the sorting queue, so the current ordering is as good as it will
+        * ever get; simply issue in LBA order.
+        */
+       if (scn->scn_checkpointing)
+               return (range_tree_first(queue->q_exts_by_addr));
+
+       /*
+        * During normal clearing we issue our largest segments first to
+        * keep IO as sequential as possible, leaving the smaller extents
+        * for later in the hope that they eventually grow into larger
+        * sequential segments.
+        */
+       return (avl_first(&queue->q_exts_by_size));
+}
+
+/*
+ * Empty a single scan I/O queue; arg is the dsl_scan_io_queue_t to
+ * process. Extents are fetched one at a time with
+ * scan_io_queue_fetch_ext(), their sios are gathered in batches and
+ * issued, until either no extent is selected any more or issuing reports
+ * that it was suspended. Any sios that were gathered but not issued are
+ * put back into the queue before returning. The queue lock is held
+ * throughout except while sios are actually being issued.
+ */
+static void
+scan_io_queues_run_one(void *arg)
+{
+       dsl_scan_io_queue_t *queue = arg;
+       kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
+       boolean_t suspended = B_FALSE;
+       range_seg_t *rs = NULL;
+       scan_io_t *sio = NULL;
+       list_t sio_list;
+       uint64_t bytes_per_leaf = zfs_scan_vdev_limit;
+       uint64_t nr_leaves = dsl_scan_count_leaves(queue->q_vd);
+
+       ASSERT(queue->q_scn->scn_is_sorted);
+
+       list_create(&sio_list, sizeof (scan_io_t),
+           offsetof(scan_io_t, sio_nodes.sio_list_node));
+       mutex_enter(q_lock);
+
+       /* calculate maximum in-flight bytes for this txg (min 1MB) */
+       queue->q_maxinflight_bytes =
+           MAX(nr_leaves * bytes_per_leaf, 1ULL << 20);
+
+       /* reset per-queue scan statistics for this txg */
+       queue->q_total_seg_size_this_txg = 0;
+       queue->q_segs_this_txg = 0;
+       queue->q_total_zio_size_this_txg = 0;
+       queue->q_zios_this_txg = 0;
+
+       /* loop until we run out of time or sios */
+       while ((rs = scan_io_queue_fetch_ext(queue)) != NULL) {
+               /* seg_start == 0 doubles as "not recorded yet" below */
+               uint64_t seg_start = 0, seg_end = 0;
+               boolean_t more_left = B_TRUE;
+
+               ASSERT(list_is_empty(&sio_list));
+
+               /* loop while we still have sios left to process in this rs */
+               while (more_left) {
+                       scan_io_t *first_sio, *last_sio;
+
+                       /*
+                        * We have selected which extent needs to be
+                        * processed next. Gather up the corresponding sios.
+                        */
+                       more_left = scan_io_queue_gather(queue, rs, &sio_list);
+                       ASSERT(!list_is_empty(&sio_list));
+                       first_sio = list_head(&sio_list);
+                       last_sio = list_tail(&sio_list);
+
+                       /*
+                        * Track the span covered so far: the end follows
+                        * the latest batch, the start is taken from the
+                        * first batch only.
+                        */
+                       seg_end = last_sio->sio_offset + last_sio->sio_asize;
+                       if (seg_start == 0)
+                               seg_start = first_sio->sio_offset;
+
+                       /*
+                        * Issuing sios can take a long time so drop the
+                        * queue lock. The sio queue won't be updated by
+                        * other threads since we're in syncing context so
+                        * we can be sure that our trees will remain exactly
+                        * as we left them.
+                        */
+                       mutex_exit(q_lock);
+                       suspended = scan_io_queue_issue(queue, &sio_list);
+                       mutex_enter(q_lock);
+
+                       /* leave unissued sios on sio_list for requeueing */
+                       if (suspended)
+                               break;
+               }
+
+               /* update statistics for debugging purposes */
+               scan_io_queues_update_seg_stats(queue, seg_start, seg_end);
+
+               if (suspended)
+                       break;
+       }
+
+       /*
+        * If we were suspended in the middle of processing,
+        * requeue any unfinished sios and exit.
+        */
+       while ((sio = list_head(&sio_list)) != NULL) {
+               list_remove(&sio_list, sio);
+               scan_io_queue_insert_impl(queue, sio);
+       }
+
+       mutex_exit(q_lock);
+       list_destroy(&sio_list);
+}
+
+/*
+ * Performs an emptying run on all scan queues in the pool. One task is
+ * dispatched for every top-level vdev that has a scan queue, and each
+ * task processes only that vdev's queue. We can parallelize the I/O here
+ * because we know that each queue's io's only affect its own top-level
+ * vdev.
+ *
+ * This function waits for the queue runs to complete, and must be
+ * called from dsl_scan_sync (or in general, syncing context).
+ */
+static void
+scan_io_queues_run(dsl_scan_t *scn)
+{
+       spa_t *spa = scn->scn_dp->dp_spa;
+       vdev_t *rvd = spa->spa_root_vdev;
+       uint64_t c;
+
+       ASSERT(scn->scn_is_sorted);
+       ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
+
+       /* nothing is queued, so there is nothing to issue */
+       if (scn->scn_bytes_pending == 0)
+               return;
+
+       if (scn->scn_taskq == NULL) {
+               int nthreads = rvd->vdev_children;
+
+               /*
+                * The taskq must *always* be able to run as many threads
+                * in parallel as we have top-level vdevs and no less,
+                * otherwise strange serialization of the calls to
+                * scan_io_queues_run_one can occur during spa_sync runs
+                * and that significantly impacts performance.
+                */
+               scn->scn_taskq = taskq_create("dsl_scan_iss", nthreads,
+                   minclsyspri, nthreads, nthreads, TASKQ_PREPOPULATE);
+       }
+
+       for (c = 0; c < rvd->vdev_children; c++) {
+               vdev_t *tvd = rvd->vdev_child[c];
+               kmutex_t *lock = &tvd->vdev_scan_io_queue_lock;
+
+               mutex_enter(lock);
+               if (tvd->vdev_scan_io_queue != NULL) {
+                       VERIFY(taskq_dispatch(scn->scn_taskq,
+                           scan_io_queues_run_one, tvd->vdev_scan_io_queue,
+                           TQ_SLEEP) != TASKQID_INVALID);
+               }
+               mutex_exit(lock);
+       }
+
+       /*
+        * Wait for the queues to finish issuing their IOs for this run
+        * before we return. There may still be IOs in flight at this
+        * point.
+        */
+       taskq_wait(scn->scn_taskq);
 }
 
 static boolean_t
@@ -1586,6 +2858,41 @@ dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
        return (0);
 }
 
+/*
+ * Fold the per-queue, per-txg statistics of every top-level vdev that has
+ * a scan queue into the scan-wide counters: the total segment and zio
+ * counts and the average segment and zio sizes. If either total count is
+ * zero, all four scan-wide values are set to zero instead.
+ */
+static void
+dsl_scan_update_stats(dsl_scan_t *scn)
+{
+       vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
+       uint64_t seg_bytes = 0, seg_cnt = 0;
+       uint64_t zio_bytes = 0, zio_cnt = 0;
+       uint64_t c;
+
+       for (c = 0; c < rvd->vdev_children; c++) {
+               dsl_scan_io_queue_t *q =
+                   rvd->vdev_child[c]->vdev_scan_io_queue;
+
+               if (q != NULL) {
+                       seg_bytes += q->q_total_seg_size_this_txg;
+                       zio_bytes += q->q_total_zio_size_this_txg;
+                       seg_cnt += q->q_segs_this_txg;
+                       zio_cnt += q->q_zios_this_txg;
+               }
+       }
+
+       if (seg_cnt != 0 && zio_cnt != 0) {
+               scn->scn_avg_seg_size_this_txg = seg_bytes / seg_cnt;
+               scn->scn_avg_zio_size_this_txg = zio_bytes / zio_cnt;
+               scn->scn_segs_this_txg = seg_cnt;
+               scn->scn_zios_this_txg = zio_cnt;
+       } else {
+               /* avoid dividing by zero; report an empty txg */
+               scn->scn_avg_seg_size_this_txg = 0;
+               scn->scn_avg_zio_size_this_txg = 0;
+               scn->scn_segs_this_txg = 0;
+               scn->scn_zios_this_txg = 0;
+       }
+}
+
 boolean_t
 dsl_scan_active(dsl_scan_t *scn)
 {
@@ -1596,8 +2903,7 @@ dsl_scan_active(dsl_scan_t *scn)
                return (B_FALSE);
        if (spa_shutting_down(spa))
                return (B_FALSE);
-       if ((scn->scn_phys.scn_state == DSS_SCANNING &&
-           !dsl_scan_is_paused_scrub(scn)) ||
+       if ((dsl_scan_is_running(scn) && !dsl_scan_is_paused_scrub(scn)) ||
            (scn->scn_async_destroying && !scn->scn_async_stalled))
                return (B_TRUE);
 
@@ -1608,13 +2914,60 @@ dsl_scan_active(dsl_scan_t *scn)
        return (used != 0);
 }
 
-/* Called whenever a txg syncs. */
+/*
+ * Decide whether the given DVA has to be resilvered. Gang DVAs always
+ * answer B_TRUE. Otherwise the answer is B_TRUE only if the top-level
+ * vdev's DTL_PARTIAL contains phys_birth and vdev_dtl_need_resilver()
+ * agrees for the DVA's offset and psize.
+ */
+static boolean_t
+dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
+    uint64_t phys_birth)
+{
+       vdev_t *tvd;
+
+       /*
+        * Gang members may be spread across multiple vdevs, so the best
+        * estimate we have is the scrub range, which has already been
+        * checked.
+        * XXX -- it would be better to change our allocation policy to
+        * ensure that all gang members reside on the same vdev.
+        */
+       if (DVA_GET_GANG(dva))
+               return (B_TRUE);
+
+       tvd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+
+       /*
+        * The txg must fall within the range which must be resilvered;
+        * DVAs outside this range can always be skipped.
+        */
+       if (vdev_dtl_contains(tvd, DTL_PARTIAL, phys_birth, 1)) {
+               /*
+                * Ask the top-level vdev whether this offset must be
+                * resilvered. When the offset does not intersect with a
+                * dirty leaf DTL it may be possible to skip the resilver
+                * IO. The psize is provided instead of asize to simplify
+                * the check for RAIDZ.
+                */
+               if (vdev_dtl_need_resilver(tvd, DVA_GET_OFFSET(dva), psize))
+                       return (B_TRUE);
+       }
+
+       return (B_FALSE);
+}
+
+/*
+ * This is the primary entry point for scans that is called from syncing
+ * context. Scans must happen entirely during syncing context so that we
+ * can guarantee that blocks we are currently scanning will not change out
+ * from under us. While a scan is active, this function controls how quickly
+ * transaction groups proceed, instead of the normal handling provided by
+ * txg_sync_thread().
+ */
 void
 dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
 {
+       int err = 0;
        dsl_scan_t *scn = dp->dp_scan;
        spa_t *spa = dp->dp_spa;
-       int err = 0;
+       state_sync_type_t sync_type = SYNC_OPTIONAL;
 
        /*
         * Check for scn_restart_txg before checking spa_load_state, so
@@ -1627,14 +2980,14 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
                if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
                        func = POOL_SCAN_RESILVER;
                zfs_dbgmsg("restarting scan func=%u txg=%llu",
-                   func, tx->tx_txg);
+                   func, (longlong_t)tx->tx_txg);
                dsl_scan_setup_sync(&func, tx);
        }
 
        /*
         * Only process scans in sync pass 1.
         */
-       if (spa_sync_pass(dp->dp_spa) > 1)
+       if (spa_sync_pass(spa) > 1)
                return;
 
        /*
@@ -1651,7 +3004,17 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
        if (!scn->scn_async_stalled && !dsl_scan_active(scn))
                return;
 
+       /* reset scan statistics */
        scn->scn_visited_this_txg = 0;
+       scn->scn_holes_this_txg = 0;
+       scn->scn_lt_min_this_txg = 0;
+       scn->scn_gt_max_this_txg = 0;
+       scn->scn_ddt_contained_this_txg = 0;
+       scn->scn_objsets_visited_this_txg = 0;
+       scn->scn_avg_seg_size_this_txg = 0;
+       scn->scn_segs_this_txg = 0;
+       scn->scn_avg_zio_size_this_txg = 0;
+       scn->scn_zios_this_txg = 0;
        scn->scn_suspending = B_FALSE;
        scn->scn_sync_start_time = gethrtime();
        spa->spa_scrub_active = B_TRUE;
@@ -1664,13 +3027,14 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
         * blocks than to scrub them.
         */
        if (zfs_free_bpobj_enabled &&
-           spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
+           spa_version(spa) >= SPA_VERSION_DEADLISTS) {
                scn->scn_is_bptree = B_FALSE;
-               scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+               scn->scn_zio_root = zio_root(spa, NULL,
                    NULL, ZIO_FLAG_MUSTSUCCEED);
                err = bpobj_iterate(&dp->dp_free_bpobj,
                    dsl_scan_free_block_cb, scn, tx);
-               VERIFY3U(0, ==, zio_wait(scn->scn_zio_root));
+               VERIFY0(zio_wait(scn->scn_zio_root));
+               scn->scn_zio_root = NULL;
 
                if (err != 0 && err != ERESTART)
                        zfs_panic_recover("error %u from bpobj_iterate()", err);
@@ -1679,11 +3043,12 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
        if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) {
                ASSERT(scn->scn_async_destroying);
                scn->scn_is_bptree = B_TRUE;
-               scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+               scn->scn_zio_root = zio_root(spa, NULL,
                    NULL, ZIO_FLAG_MUSTSUCCEED);
                err = bptree_iterate(dp->dp_meta_objset,
                    dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx);
                VERIFY0(zio_wait(scn->scn_zio_root));
+               scn->scn_zio_root = NULL;
 
                if (err == EIO || err == ECKSUM) {
                        err = 0;
@@ -1770,110 +3135,189 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
                ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes);
        }
 
-       if (scn->scn_phys.scn_state != DSS_SCANNING)
+       if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn))
                return;
 
-       if (scn->scn_done_txg == tx->tx_txg) {
-               ASSERT(!scn->scn_suspending);
-               /* finished with scan. */
-               zfs_dbgmsg("txg %llu scan complete", tx->tx_txg);
-               dsl_scan_done(scn, B_TRUE, tx);
-               ASSERT3U(spa->spa_scrub_inflight, ==, 0);
-               dsl_scan_sync_state(scn, tx);
+       /*
+        * Wait a few txgs after importing to begin scanning so that
+        * we can get the pool imported quickly.
+        */
+       if (spa->spa_syncing_txg < spa->spa_first_txg + SCAN_IMPORT_WAIT_TXGS)
                return;
-       }
 
-       if (dsl_scan_is_paused_scrub(scn))
-               return;
+       /*
+        * It is possible to switch from unsorted to sorted at any time,
+        * but afterwards the scan will remain sorted unless reloaded from
+        * a checkpoint after a reboot.
+        */
+       if (!zfs_scan_legacy) {
+               scn->scn_is_sorted = B_TRUE;
+               if (scn->scn_last_checkpoint == 0)
+                       scn->scn_last_checkpoint = ddi_get_lbolt();
+       }
 
-       if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
-           scn->scn_phys.scn_ddt_class_max) {
-               zfs_dbgmsg("doing scan sync txg %llu; "
-                   "ddt bm=%llu/%llu/%llu/%llx",
-                   (longlong_t)tx->tx_txg,
-                   (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class,
-                   (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type,
-                   (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum,
-                   (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor);
-               ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0);
-               ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0);
-               ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0);
-               ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0);
+       /*
+        * For sorted scans, determine what kind of work we will be doing
+        * this txg based on our memory limitations and whether or not we
+        * need to perform a checkpoint.
+        */
+       if (scn->scn_is_sorted) {
+               /*
+                * If we are over our checkpoint interval, set scn_clearing
+                * so that we can begin checkpointing immediately. The
+                * checkpoint allows us to save a consistent bookmark
+                * representing how much data we have scrubbed so far.
+                * Otherwise, use the memory limit to determine if we should
+                * scan for metadata or start issuing scrub IOs. We accumulate
+                * metadata until we hit our hard memory limit at which point
+                * we issue scrub IOs until we are at our soft memory limit.
+                */
+               if (scn->scn_checkpointing ||
+                   ddi_get_lbolt() - scn->scn_last_checkpoint >
+                   SEC_TO_TICK(zfs_scan_checkpoint_intval)) {
+                       if (!scn->scn_checkpointing)
+                               zfs_dbgmsg("begin scan checkpoint");
+
+                       scn->scn_checkpointing = B_TRUE;
+                       scn->scn_clearing = B_TRUE;
+               } else {
+                       boolean_t should_clear = dsl_scan_should_clear(scn);
+                       if (should_clear && !scn->scn_clearing) {
+                               zfs_dbgmsg("begin scan clearing");
+                               scn->scn_clearing = B_TRUE;
+                       } else if (!should_clear && scn->scn_clearing) {
+                               zfs_dbgmsg("finish scan clearing");
+                               scn->scn_clearing = B_FALSE;
+                       }
+               }
        } else {
-               zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu",
-                   (longlong_t)tx->tx_txg,
-                   (longlong_t)scn->scn_phys.scn_bookmark.zb_objset,
-                   (longlong_t)scn->scn_phys.scn_bookmark.zb_object,
-                   (longlong_t)scn->scn_phys.scn_bookmark.zb_level,
-                   (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid);
+               ASSERT0(scn->scn_checkpointing);
+               ASSERT0(scn->scn_clearing);
        }
 
-       scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
-           NULL, ZIO_FLAG_CANFAIL);
-       dsl_pool_config_enter(dp, FTAG);
-       dsl_scan_visit(scn, tx);
-       dsl_pool_config_exit(dp, FTAG);
-       (void) zio_wait(scn->scn_zio_root);
-       scn->scn_zio_root = NULL;
+       if (!scn->scn_clearing && scn->scn_done_txg == 0) {
+               /* Need to scan metadata for more blocks to scrub */
+               dsl_scan_phys_t *scnp = &scn->scn_phys;
+               taskqid_t prefetch_tqid;
+               uint64_t bytes_per_leaf = zfs_scan_vdev_limit;
+               uint64_t nr_leaves = dsl_scan_count_leaves(spa->spa_root_vdev);
 
-       zfs_dbgmsg("visited %llu blocks in %llums",
-           (longlong_t)scn->scn_visited_this_txg,
-           (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time));
+               /*
+                * Calculate the max number of in-flight bytes for pool-wide
+                * scanning operations (minimum 1MB). Limits for the issuing
+                * phase are done per top-level vdev and are handled separately.
+                */
+               scn->scn_maxinflight_bytes =
+                   MAX(nr_leaves * bytes_per_leaf, 1ULL << 20);
+
+               if (scnp->scn_ddt_bookmark.ddb_class <=
+                   scnp->scn_ddt_class_max) {
+                       ASSERT(ZB_IS_ZERO(&scnp->scn_bookmark));
+                       zfs_dbgmsg("doing scan sync txg %llu; "
+                           "ddt bm=%llu/%llu/%llu/%llx",
+                           (longlong_t)tx->tx_txg,
+                           (longlong_t)scnp->scn_ddt_bookmark.ddb_class,
+                           (longlong_t)scnp->scn_ddt_bookmark.ddb_type,
+                           (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum,
+                           (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor);
+               } else {
+                       zfs_dbgmsg("doing scan sync txg %llu; "
+                           "bm=%llu/%llu/%llu/%llu",
+                           (longlong_t)tx->tx_txg,
+                           (longlong_t)scnp->scn_bookmark.zb_objset,
+                           (longlong_t)scnp->scn_bookmark.zb_object,
+                           (longlong_t)scnp->scn_bookmark.zb_level,
+                           (longlong_t)scnp->scn_bookmark.zb_blkid);
+               }
 
-       if (!scn->scn_suspending) {
-               scn->scn_done_txg = tx->tx_txg + 1;
-               zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu",
-                   tx->tx_txg, scn->scn_done_txg);
-       }
+               scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+                   NULL, ZIO_FLAG_CANFAIL);
 
-       if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
-               mutex_enter(&spa->spa_scrub_lock);
-               while (spa->spa_scrub_inflight > 0) {
-                       cv_wait(&spa->spa_scrub_io_cv,
-                           &spa->spa_scrub_lock);
-               }
-               mutex_exit(&spa->spa_scrub_lock);
-       }
+               scn->scn_prefetch_stop = B_FALSE;
+               prefetch_tqid = taskq_dispatch(dp->dp_sync_taskq,
+                   dsl_scan_prefetch_thread, scn, TQ_SLEEP);
+               ASSERT(prefetch_tqid != TASKQID_INVALID);
 
-       dsl_scan_sync_state(scn, tx);
-}
+               dsl_pool_config_enter(dp, FTAG);
+               dsl_scan_visit(scn, tx);
+               dsl_pool_config_exit(dp, FTAG);
 
-/*
- * This will start a new scan, or restart an existing one.
- */
-void
-dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg)
-{
-       if (txg == 0) {
-               dmu_tx_t *tx;
-               tx = dmu_tx_create_dd(dp->dp_mos_dir);
-               VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT));
+               mutex_enter(&dp->dp_spa->spa_scrub_lock);
+               scn->scn_prefetch_stop = B_TRUE;
+               cv_broadcast(&spa->spa_scrub_io_cv);
+               mutex_exit(&dp->dp_spa->spa_scrub_lock);
 
-               txg = dmu_tx_get_txg(tx);
-               dp->dp_scan->scn_restart_txg = txg;
-               dmu_tx_commit(tx);
-       } else {
-               dp->dp_scan->scn_restart_txg = txg;
+               taskq_wait_id(dp->dp_sync_taskq, prefetch_tqid);
+               (void) zio_wait(scn->scn_zio_root);
+               scn->scn_zio_root = NULL;
+
+               zfs_dbgmsg("scan visited %llu blocks in %llums "
+                   "(%llu os's, %llu holes, %llu < mintxg, "
+                   "%llu in ddt, %llu > maxtxg)",
+                   (longlong_t)scn->scn_visited_this_txg,
+                   (longlong_t)NSEC2MSEC(gethrtime() -
+                   scn->scn_sync_start_time),
+                   (longlong_t)scn->scn_objsets_visited_this_txg,
+                   (longlong_t)scn->scn_holes_this_txg,
+                   (longlong_t)scn->scn_lt_min_this_txg,
+                   (longlong_t)scn->scn_ddt_contained_this_txg,
+                   (longlong_t)scn->scn_gt_max_this_txg);
+
+               if (!scn->scn_suspending) {
+                       ASSERT0(avl_numnodes(&scn->scn_queue));
+                       scn->scn_done_txg = tx->tx_txg + 1;
+                       if (scn->scn_is_sorted) {
+                               scn->scn_checkpointing = B_TRUE;
+                               scn->scn_clearing = B_TRUE;
+                       }
+                       zfs_dbgmsg("scan complete txg %llu",
+                           (longlong_t)tx->tx_txg);
+               }
+       } else if (scn->scn_is_sorted && scn->scn_bytes_pending != 0) {
+               /* need to issue scrubbing IOs from per-vdev queues */
+               scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+                   NULL, ZIO_FLAG_CANFAIL);
+               scan_io_queues_run(scn);
+               (void) zio_wait(scn->scn_zio_root);
+               scn->scn_zio_root = NULL;
+
+               /* calculate and dprintf the current memory usage */
+               (void) dsl_scan_should_clear(scn);
+               dsl_scan_update_stats(scn);
+
+               zfs_dbgmsg("scan issued %llu blocks (%llu segs) in %llums "
+                   "(avg_block_size = %llu, avg_seg_size = %llu)",
+                   (longlong_t)scn->scn_zios_this_txg,
+                   (longlong_t)scn->scn_segs_this_txg,
+                   (longlong_t)NSEC2MSEC(gethrtime() -
+                   scn->scn_sync_start_time),
+                   (longlong_t)scn->scn_avg_zio_size_this_txg,
+                   (longlong_t)scn->scn_avg_seg_size_this_txg);
+       } else if (scn->scn_done_txg != 0 && scn->scn_done_txg <= tx->tx_txg) {
+               /* Finished with everything. Mark the scrub as complete */
+               zfs_dbgmsg("scan issuing complete txg %llu",
+                   (longlong_t)tx->tx_txg);
+               ASSERT3U(scn->scn_done_txg, !=, 0);
+               ASSERT0(spa->spa_scrub_inflight);
+               ASSERT0(scn->scn_bytes_pending);
+               dsl_scan_done(scn, B_TRUE, tx);
+               sync_type = SYNC_MANDATORY;
        }
-       zfs_dbgmsg("restarting resilver txg=%llu", txg);
-}
 
-boolean_t
-dsl_scan_resilvering(dsl_pool_t *dp)
-{
-       return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING &&
-           dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER);
+       dsl_scan_sync_state(scn, tx, sync_type);
 }
 
-/*
- * scrub consumers
- */
-
 static void
-count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
+count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp)
 {
        int i;
 
+       /* update the spa's stats on how many bytes we have issued */
+       for (i = 0; i < BP_GET_NDVAS(bp); i++) {
+               atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued,
+                   DVA_GET_ASIZE(&bp->blk_dva[i]));
+       }
+
        /*
         * If we resume after a reboot, zab will be NULL; don't record
         * incomplete stats in that case.
@@ -1881,6 +3325,8 @@ count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
        if (zab == NULL)
                return;
 
+       mutex_enter(&zab->zab_lock);
+
        for (i = 0; i < 4; i++) {
                int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS;
                int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL;
@@ -1916,63 +3362,97 @@ count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp)
                        break;
                }
        }
+
+       mutex_exit(&zab->zab_lock);
 }
 
 static void
-dsl_scan_scrub_done(zio_t *zio)
+scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio)
 {
-       spa_t *spa = zio->io_spa;
-
-       abd_free(zio->io_abd);
+       avl_index_t idx;
+       int64_t asize = sio->sio_asize;
+       dsl_scan_t *scn = queue->q_scn;
 
-       mutex_enter(&spa->spa_scrub_lock);
-       spa->spa_scrub_inflight--;
-       cv_broadcast(&spa->spa_scrub_io_cv);
+       ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
 
-       if (zio->io_error && (zio->io_error != ECKSUM ||
-           !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
-               spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++;
+       if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) {
+               /* block is already scheduled for reading */
+               atomic_add_64(&scn->scn_bytes_pending, -asize);
+               kmem_cache_free(sio_cache, sio);
+               return;
        }
-       mutex_exit(&spa->spa_scrub_lock);
+       avl_insert(&queue->q_sios_by_addr, sio, idx);
+       range_tree_add(queue->q_exts_by_addr, sio->sio_offset, asize);
 }
 
-static boolean_t
-dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize,
-    uint64_t phys_birth)
+/*
+ * Given all the info we got from our metadata scanning process, we
+ * construct a scan_io_t and insert it into the scan sorting queue. The
+ * I/O must already be suitable for us to process. This is controlled
+ * by dsl_scan_enqueue().
+ */
+static void
+scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i,
+    int zio_flags, const zbookmark_phys_t *zb)
 {
-       vdev_t *vd;
+       dsl_scan_t *scn = queue->q_scn;
+       scan_io_t *sio = kmem_cache_alloc(sio_cache, KM_SLEEP);
 
-       if (DVA_GET_GANG(dva)) {
-               /*
-                * Gang members may be spread across multiple
-                * vdevs, so the best estimate we have is the
-                * scrub range, which has already been checked.
-                * XXX -- it would be better to change our
-                * allocation policy to ensure that all
-                * gang members reside on the same vdev.
-                */
-               return (B_TRUE);
-       }
+       ASSERT0(BP_IS_GANG(bp));
+       ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
 
-       vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva));
+       bp2sio(bp, sio, dva_i);
+       sio->sio_flags = zio_flags;
+       sio->sio_zb = *zb;
 
        /*
-        * Check if the txg falls within the range which must be
-        * resilvered.  DVAs outside this range can always be skipped.
+        * Increment the bytes pending counter now so that we can't
+        * get an integer underflow in case the worker processes the
+        * zio before we get to incrementing this counter.
         */
-       if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
-               return (B_FALSE);
+       atomic_add_64(&scn->scn_bytes_pending, sio->sio_asize);
+
+       scan_io_queue_insert_impl(queue, sio);
+}
+
+/*
+ * Given a set of I/O parameters as discovered by the metadata traversal
+ * process, attempts to place the I/O into the sorted queues (if allowed),
+ * or immediately executes the I/O.
+ */
+static void
+dsl_scan_enqueue(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
+    const zbookmark_phys_t *zb)
+{
+       spa_t *spa = dp->dp_spa;
+
+       ASSERT(!BP_IS_EMBEDDED(bp));
 
        /*
-        * Check if the top-level vdev must resilver this offset.
-        * When the offset does not intersect with a dirty leaf DTL
-        * then it may be possible to skip the resilver IO.  The psize
-        * is provided instead of asize to simplify the check for RAIDZ.
+        * Gang blocks are hard to issue sequentially, so we just issue them
+        * here immediately instead of queuing them.
         */
-       if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize))
-               return (B_FALSE);
+       if (!dp->dp_scan->scn_is_sorted || BP_IS_GANG(bp)) {
+               scan_exec_io(dp, bp, zio_flags, zb, NULL);
+               return;
+       }
 
-       return (B_TRUE);
+       for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
+               dva_t dva;
+               vdev_t *vdev;
+
+               dva = bp->blk_dva[i];
+               vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&dva));
+               ASSERT(vdev != NULL);
+
+               mutex_enter(&vdev->vdev_scan_io_queue_lock);
+               if (vdev->vdev_scan_io_queue == NULL)
+                       vdev->vdev_scan_io_queue = scan_io_queue_create(vdev);
+               ASSERT(dp->dp_scan != NULL);
+               scan_io_queue_insert(vdev->vdev_scan_io_queue, bp,
+                   i, zio_flags, zb);
+               mutex_exit(&vdev->vdev_scan_io_queue_lock);
+       }
 }
 
 static int
@@ -1980,32 +3460,29 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
     const blkptr_t *bp, const zbookmark_phys_t *zb)
 {
        dsl_scan_t *scn = dp->dp_scan;
-       size_t psize = BP_GET_PSIZE(bp);
        spa_t *spa = dp->dp_spa;
        uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp);
+       size_t psize = BP_GET_PSIZE(bp);
        boolean_t needs_io = B_FALSE;
        int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL;
-       int scan_delay = 0;
 
        if (phys_birth <= scn->scn_phys.scn_min_txg ||
            phys_birth >= scn->scn_phys.scn_max_txg)
                return (0);
 
-       count_block(dp->dp_blkstats, bp);
-
-       if (BP_IS_EMBEDDED(bp))
+       if (BP_IS_EMBEDDED(bp)) {
+               count_block(scn, dp->dp_blkstats, bp);
                return (0);
+       }
 
        ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn));
        if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) {
                zio_flags |= ZIO_FLAG_SCRUB;
                needs_io = B_TRUE;
-               scan_delay = zfs_scrub_delay;
        } else {
                ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER);
                zio_flags |= ZIO_FLAG_RESILVER;
                needs_io = B_FALSE;
-               scan_delay = zfs_resilver_delay;
        }
 
        /* If it's an intent log block, failure is expected. */
@@ -2029,91 +3506,348 @@ dsl_scan_scrub_cb(dsl_pool_t *dp,
        }
 
        if (needs_io && !zfs_no_scrub_io) {
-               vdev_t *rvd = spa->spa_root_vdev;
-               uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight;
+               dsl_scan_enqueue(dp, bp, zio_flags, zb);
+       } else {
+               count_block(scn, dp->dp_blkstats, bp);
+       }
+
+       /* do not relocate this block */
+       return (0);
+}
+
+static void
+dsl_scan_scrub_done(zio_t *zio)
+{
+       spa_t *spa = zio->io_spa;
+       blkptr_t *bp = zio->io_bp;
+       dsl_scan_io_queue_t *queue = zio->io_private;
+
+       abd_free(zio->io_abd);
+
+       if (queue == NULL) {
+               mutex_enter(&spa->spa_scrub_lock);
+               ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp));
+               spa->spa_scrub_inflight -= BP_GET_PSIZE(bp);
+               cv_broadcast(&spa->spa_scrub_io_cv);
+               mutex_exit(&spa->spa_scrub_lock);
+       } else {
+               mutex_enter(&queue->q_vd->vdev_scan_io_queue_lock);
+               ASSERT3U(queue->q_inflight_bytes, >=, BP_GET_PSIZE(bp));
+               queue->q_inflight_bytes -= BP_GET_PSIZE(bp);
+               cv_broadcast(&queue->q_zio_cv);
+               mutex_exit(&queue->q_vd->vdev_scan_io_queue_lock);
+       }
+
+       if (zio->io_error && (zio->io_error != ECKSUM ||
+           !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
+               atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors);
+       }
+}
 
+/*
+ * Given a scanning zio's information, executes the zio. The zio need
+ * not necessarily be only sortable, this function simply executes the
+ * zio, no matter what it is. The optional queue argument allows the
+ * caller to specify that they want per top level vdev IO rate limiting
+ * instead of the legacy global limiting.
+ */
+static void
+scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
+    const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue)
+{
+       spa_t *spa = dp->dp_spa;
+       dsl_scan_t *scn = dp->dp_scan;
+       size_t size = BP_GET_PSIZE(bp);
+       abd_t *data = abd_alloc_for_io(size, B_FALSE);
+
+       if (queue == NULL) {
                mutex_enter(&spa->spa_scrub_lock);
-               while (spa->spa_scrub_inflight >= maxinflight)
+               while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)
                        cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
-               spa->spa_scrub_inflight++;
+               spa->spa_scrub_inflight += BP_GET_PSIZE(bp);
                mutex_exit(&spa->spa_scrub_lock);
+       } else {
+               kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
 
-               /*
-                * If we're seeing recent (zfs_scan_idle) "important" I/Os
-                * then throttle our workload to limit the impact of a scan.
-                */
-               if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle)
-                       delay(scan_delay);
+               mutex_enter(q_lock);
+               while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes)
+                       cv_wait(&queue->q_zio_cv, q_lock);
+               queue->q_inflight_bytes += BP_GET_PSIZE(bp);
+               mutex_exit(q_lock);
+       }
+
+       count_block(scn, dp->dp_blkstats, bp);
+       zio_nowait(zio_read(scn->scn_zio_root, spa, bp, data, size,
+           dsl_scan_scrub_done, queue, ZIO_PRIORITY_SCRUB, zio_flags, zb));
+}
 
-               zio_nowait(zio_read(NULL, spa, bp,
-                   abd_alloc_for_io(psize, B_FALSE),
-                   psize, dsl_scan_scrub_done, NULL,
-                   ZIO_PRIORITY_SCRUB, zio_flags, zb));
+/*
+ * This is the primary extent sorting algorithm. We balance two parameters:
+ * 1) how many bytes of I/O are in an extent
+ * 2) how well the extent is filled with I/O (as a fraction of its total size)
+ * Since we allow extents to have gaps between their constituent I/Os, it's
+ * possible to have a fairly large extent that contains the same amount of
+ * I/O bytes as a much smaller extent, which just packs the I/O more tightly.
+ * The algorithm sorts based on a score calculated from the extent's size,
+ * the relative fill volume (in %) and a "fill weight" parameter that controls
+ * the split between whether we prefer larger extents or more well populated
+ * extents:
+ *
+ * SCORE = FILL_IN_BYTES + (FILL_IN_PERCENT * FILL_IN_BYTES * FILL_WEIGHT)
+ *
+ * Example:
+ * 1) assume extsz = 64 MiB
+ * 2) assume fill = 32 MiB (extent is half full)
+ * 3) assume fill_weight = 3
+ * 4)  SCORE = 32M + (((32M * 100) / 64M) * 3 * 32M) / 100
+ *     SCORE = 32M + (50 * 3 * 32M) / 100
+ *     SCORE = 32M + (4800M / 100)
+ *     SCORE = 32M + 48M
+ *              ^     ^
+ *              |     +--- final total relative fill-based score
+ *              +--------- final total fill-based score
+ *     SCORE = 80M
+ *
+ * As can be seen, at fill_weight=3, the algorithm is slightly biased towards
+ * extents that are more completely filled (in a 3:2 ratio) vs just larger.
+ * Note that as an optimization, we replace multiplication and division by
+ * 100 with bitshifting by 7 (which effectively multiplies and divides by 128).
+ */
+static int
+ext_size_compare(const void *x, const void *y)
+{
+       const range_seg_t *rsa = x, *rsb = y;
+       uint64_t sa = rsa->rs_end - rsa->rs_start,
+           sb = rsb->rs_end - rsb->rs_start;
+       uint64_t score_a, score_b;
+
+       score_a = rsa->rs_fill + ((((rsa->rs_fill << 7) / sa) *
+           fill_weight * rsa->rs_fill) >> 7);
+       score_b = rsb->rs_fill + ((((rsb->rs_fill << 7) / sb) *
+           fill_weight * rsb->rs_fill) >> 7);
+
+       if (score_a > score_b)
+               return (-1);
+       if (score_a == score_b) {
+               if (rsa->rs_start < rsb->rs_start)
+                       return (-1);
+               if (rsa->rs_start == rsb->rs_start)
+                       return (0);
+               return (1);
        }
+       return (1);
+}
 
-       /* do not relocate this block */
-       return (0);
+/*
+ * Comparator for the q_sios_by_addr tree. Sorting is simply performed
+ * based on LBA-order (from lowest to highest).
+ */
+static int
+sio_addr_compare(const void *x, const void *y)
+{
+       const scan_io_t *a = x, *b = y;
+
+       if (a->sio_offset < b->sio_offset)
+               return (-1);
+       if (a->sio_offset == b->sio_offset)
+               return (0);
+       return (1);
+}
+
+/* IO queues are created on demand when they are needed. */
+static dsl_scan_io_queue_t *
+scan_io_queue_create(vdev_t *vd)
+{
+       dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan;
+       dsl_scan_io_queue_t *q = kmem_zalloc(sizeof (*q), KM_SLEEP);
+
+       q->q_scn = scn;
+       q->q_vd = vd;
+       cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL);
+       q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops,
+           &q->q_exts_by_size, ext_size_compare,
+           &q->q_vd->vdev_scan_io_queue_lock, zfs_scan_max_ext_gap);
+       avl_create(&q->q_sios_by_addr, sio_addr_compare,
+           sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node));
+
+       return (q);
 }
 
 /*
- * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver.
- * Can also be called to resume a paused scrub.
+ * Destroys a scan queue and all segments and scan_io_t's contained in it.
+ * No further execution of I/O occurs, anything pending in the queue is
+ * simply freed without being executed.
  */
-int
-dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
+void
+dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue)
 {
-       spa_t *spa = dp->dp_spa;
-       dsl_scan_t *scn = dp->dp_scan;
+       dsl_scan_t *scn = queue->q_scn;
+       scan_io_t *sio;
+       void *cookie = NULL;
+       int64_t bytes_dequeued = 0;
+
+       ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
+
+       while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) !=
+           NULL) {
+               ASSERT(range_tree_contains(queue->q_exts_by_addr,
+                   sio->sio_offset, sio->sio_asize));
+               bytes_dequeued += sio->sio_asize;
+               kmem_cache_free(sio_cache, sio);
+       }
 
-       /*
-        * Purge all vdev caches and probe all devices.  We do this here
-        * rather than in sync context because this requires a writer lock
-        * on the spa_config lock, which we can't do from sync context.  The
-        * spa_scrub_reopen flag indicates that vdev_open() should not
-        * attempt to start another scrub.
-        */
-       spa_vdev_state_enter(spa, SCL_NONE);
-       spa->spa_scrub_reopen = B_TRUE;
-       vdev_reopen(spa->spa_root_vdev);
-       spa->spa_scrub_reopen = B_FALSE;
-       (void) spa_vdev_state_exit(spa, NULL, 0);
+       atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued);
+       range_tree_vacate(queue->q_exts_by_addr, NULL, queue);
+       range_tree_destroy(queue->q_exts_by_addr);
+       avl_destroy(&queue->q_sios_by_addr);
+       cv_destroy(&queue->q_zio_cv);
 
-       if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) {
-               /* got scrub start cmd, resume paused scrub */
-               int err = dsl_scrub_set_pause_resume(scn->scn_dp,
-                   POOL_SCRUB_NORMAL);
-               if (err == 0)
-                       return (SET_ERROR(ECANCELED));
+       kmem_free(queue, sizeof (*queue));
+}
 
-               return (SET_ERROR(err));
+/*
+ * Properly transfers a dsl_scan_io_queue_t from `svd' to `tvd'. This is
+ * called on behalf of vdev_top_transfer when creating or destroying
+ * a mirror vdev due to zpool attach/detach.
+ */
+void
+dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd)
+{
+       mutex_enter(&svd->vdev_scan_io_queue_lock);
+       mutex_enter(&tvd->vdev_scan_io_queue_lock);
+
+       VERIFY3P(tvd->vdev_scan_io_queue, ==, NULL);
+       tvd->vdev_scan_io_queue = svd->vdev_scan_io_queue;
+       svd->vdev_scan_io_queue = NULL;
+       if (tvd->vdev_scan_io_queue != NULL) {
+               tvd->vdev_scan_io_queue->q_vd = tvd;
+               range_tree_set_lock(tvd->vdev_scan_io_queue->q_exts_by_addr,
+                   &tvd->vdev_scan_io_queue_lock);
        }
 
-       return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check,
-           dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE));
+       mutex_exit(&tvd->vdev_scan_io_queue_lock);
+       mutex_exit(&svd->vdev_scan_io_queue_lock);
 }
 
-static boolean_t
-dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx)
+static void
+scan_io_queues_destroy(dsl_scan_t *scn)
 {
-       return (scn->scn_restart_txg != 0 &&
-           scn->scn_restart_txg <= tx->tx_txg);
+       vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
+
+       for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+               vdev_t *tvd = rvd->vdev_child[i];
+
+               mutex_enter(&tvd->vdev_scan_io_queue_lock);
+               if (tvd->vdev_scan_io_queue != NULL)
+                       dsl_scan_io_queue_destroy(tvd->vdev_scan_io_queue);
+               tvd->vdev_scan_io_queue = NULL;
+               mutex_exit(&tvd->vdev_scan_io_queue_lock);
+       }
 }
 
-#if defined(_KERNEL) && defined(HAVE_SPL)
-module_param(zfs_top_maxinflight, int, 0644);
-MODULE_PARM_DESC(zfs_top_maxinflight, "Max I/Os per top-level");
+static void
+dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i)
+{
+       dsl_pool_t *dp = spa->spa_dsl_pool;
+       dsl_scan_t *scn = dp->dp_scan;
+       vdev_t *vdev;
+       kmutex_t *q_lock;
+       dsl_scan_io_queue_t *queue;
+       scan_io_t srch, *sio;
+       avl_index_t idx;
+       uint64_t start, size;
+
+       vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[dva_i]));
+       ASSERT(vdev != NULL);
+       q_lock = &vdev->vdev_scan_io_queue_lock;
+       queue = vdev->vdev_scan_io_queue;
+
+       mutex_enter(q_lock);
+       if (queue == NULL) {
+               mutex_exit(q_lock);
+               return;
+       }
+
+       bp2sio(bp, &srch, dva_i);
+       start = srch.sio_offset;
+       size = srch.sio_asize;
+
+       /*
+        * We can find the zio in two states:
+        * 1) Cold, just sitting in the queue of zio's to be issued at
+        *      some point in the future. In this case, all we do is
+        *      remove the zio from the q_sios_by_addr tree, decrement
+        *      its data volume from the containing range_seg_t and
+        *      resort the q_exts_by_size tree to reflect that the
+        *      range_seg_t has lost some of its 'fill'. We don't shorten
+        *      the range_seg_t - this is usually rare enough not to be
+        *      worth the extra hassle of trying to keep track of precise
+        *      extent boundaries.
+        * 2) Hot, where the zio is currently in-flight in
+        *      dsl_scan_issue_ios. In this case, we can't simply
+        *      reach in and stop the in-flight zio's, so we instead
+        *      block the caller. Eventually, dsl_scan_issue_ios will
+        *      be done with issuing the zio's it gathered and will
+        *      signal us.
+        */
+       sio = avl_find(&queue->q_sios_by_addr, &srch, &idx);
+       if (sio != NULL) {
+               int64_t asize = sio->sio_asize;
+               blkptr_t tmpbp;
+
+               /* Got it while it was cold in the queue */
+               ASSERT3U(start, ==, sio->sio_offset);
+               ASSERT3U(size, ==, asize);
+               avl_remove(&queue->q_sios_by_addr, sio);
 
-module_param(zfs_resilver_delay, int, 0644);
-MODULE_PARM_DESC(zfs_resilver_delay, "Number of ticks to delay resilver");
+               ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size));
+               range_tree_remove_fill(queue->q_exts_by_addr, start, size);
+
+               /*
+                * We only update scn_bytes_pending in the cold path,
+                * otherwise it will already have been accounted for as
+                * part of the zio's execution.
+                */
+               atomic_add_64(&scn->scn_bytes_pending, -asize);
 
-module_param(zfs_scrub_delay, int, 0644);
-MODULE_PARM_DESC(zfs_scrub_delay, "Number of ticks to delay scrub");
+               /* count the block as though we issued it */
+               sio2bp(sio, &tmpbp, dva_i);
+               count_block(scn, dp->dp_blkstats, &tmpbp);
 
-module_param(zfs_scan_idle, int, 0644);
-MODULE_PARM_DESC(zfs_scan_idle, "Idle window in clock ticks");
+               kmem_cache_free(sio_cache, sio);
+       }
+       mutex_exit(q_lock);
+}
 
-module_param(zfs_scan_min_time_ms, int, 0644);
-MODULE_PARM_DESC(zfs_scan_min_time_ms, "Min millisecs to scrub per txg");
+/*
+ * Callback invoked when a zio_free() zio is executing. This needs to be
+ * intercepted to prevent the zio from deallocating a particular portion
+ * of disk space and it then getting reallocated and written to, while we
+ * still have it queued up for processing.
+ */
+void
+dsl_scan_freed(spa_t *spa, const blkptr_t *bp)
+{
+       dsl_pool_t *dp = spa->spa_dsl_pool;
+       dsl_scan_t *scn = dp->dp_scan;
+
+       ASSERT(!BP_IS_EMBEDDED(bp));
+       ASSERT(scn != NULL);
+       if (!dsl_scan_is_running(scn))
+               return;
+
+       for (int i = 0; i < BP_GET_NDVAS(bp); i++)
+               dsl_scan_freed_dva(spa, bp, i);
+}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+/* CSTYLED */
+module_param(zfs_scan_vdev_limit, ulong, 0644);
+MODULE_PARM_DESC(zfs_scan_vdev_limit,
+       "Max bytes in flight per leaf vdev for scrubs and resilvers");
+
+module_param(zfs_scrub_min_time_ms, int, 0644);
+MODULE_PARM_DESC(zfs_scrub_min_time_ms, "Min millisecs to scrub per txg");
 
 module_param(zfs_free_min_time_ms, int, 0644);
 MODULE_PARM_DESC(zfs_free_min_time_ms, "Min millisecs to free per txg");
@@ -2133,4 +3867,30 @@ MODULE_PARM_DESC(zfs_free_max_blocks, "Max number of blocks freed in one txg");
 
 module_param(zfs_free_bpobj_enabled, int, 0644);
 MODULE_PARM_DESC(zfs_free_bpobj_enabled, "Enable processing of the free_bpobj");
+
+module_param(zfs_scan_mem_lim_fact, int, 0644);
+MODULE_PARM_DESC(zfs_scan_mem_lim_fact, "Fraction of RAM for scan hard limit");
+
+module_param(zfs_scan_issue_strategy, int, 0644);
+MODULE_PARM_DESC(zfs_scan_issue_strategy,
+       "IO issuing strategy during scrubbing. 0 = default, 1 = LBA, 2 = size");
+
+module_param(zfs_scan_legacy, int, 0644);
+MODULE_PARM_DESC(zfs_scan_legacy, "Scrub using legacy non-sequential method");
+
+module_param(zfs_scan_checkpoint_intval, int, 0644);
+MODULE_PARM_DESC(zfs_scan_checkpoint_intval,
+       "Scan progress on-disk checkpointing interval");
+
+module_param(zfs_scan_mem_lim_soft_fact, int, 0644);
+MODULE_PARM_DESC(zfs_scan_mem_lim_soft_fact,
+       "Fraction of hard limit used as soft limit");
+
+module_param(zfs_scan_strict_mem_lim, int, 0644);
+MODULE_PARM_DESC(zfs_scan_strict_mem_lim,
+       "Tunable to attempt to reduce lock contention");
+
+module_param(zfs_scan_fill_weight, int, 0644);
+MODULE_PARM_DESC(zfs_scan_fill_weight,
+       "Tunable to adjust bias towards more filled segments during scans");
 #endif
index 5dc9ed60df6d68fd518c4ffc4a4e1b86a5cda4dc..6320fd388ff2f01019a2701835902d7846672d44 100644 (file)
@@ -971,85 +971,6 @@ metaslab_rangesize_compare(const void *x1, const void *x2)
        return (AVL_CMP(r1->rs_start, r2->rs_start));
 }
 
-/*
- * Create any block allocator specific components. The current allocators
- * rely on using both a size-ordered range_tree_t and an array of uint64_t's.
- */
-static void
-metaslab_rt_create(range_tree_t *rt, void *arg)
-{
-       metaslab_t *msp = arg;
-
-       ASSERT3P(rt->rt_arg, ==, msp);
-       ASSERT(msp->ms_tree == NULL);
-
-       avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
-           sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
-}
-
-/*
- * Destroy the block allocator specific components.
- */
-static void
-metaslab_rt_destroy(range_tree_t *rt, void *arg)
-{
-       metaslab_t *msp = arg;
-
-       ASSERT3P(rt->rt_arg, ==, msp);
-       ASSERT3P(msp->ms_tree, ==, rt);
-       ASSERT0(avl_numnodes(&msp->ms_size_tree));
-
-       avl_destroy(&msp->ms_size_tree);
-}
-
-static void
-metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg)
-{
-       metaslab_t *msp = arg;
-
-       ASSERT3P(rt->rt_arg, ==, msp);
-       ASSERT3P(msp->ms_tree, ==, rt);
-       VERIFY(!msp->ms_condensing);
-       avl_add(&msp->ms_size_tree, rs);
-}
-
-static void
-metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
-{
-       metaslab_t *msp = arg;
-
-       ASSERT3P(rt->rt_arg, ==, msp);
-       ASSERT3P(msp->ms_tree, ==, rt);
-       VERIFY(!msp->ms_condensing);
-       avl_remove(&msp->ms_size_tree, rs);
-}
-
-static void
-metaslab_rt_vacate(range_tree_t *rt, void *arg)
-{
-       metaslab_t *msp = arg;
-
-       ASSERT3P(rt->rt_arg, ==, msp);
-       ASSERT3P(msp->ms_tree, ==, rt);
-
-       /*
-        * Normally one would walk the tree freeing nodes along the way.
-        * Since the nodes are shared with the range trees we can avoid
-        * walking all nodes and just reinitialize the avl tree. The nodes
-        * will be freed by the range tree, so we don't want to free them here.
-        */
-       avl_create(&msp->ms_size_tree, metaslab_rangesize_compare,
-           sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node));
-}
-
-static range_tree_ops_t metaslab_rt_ops = {
-       metaslab_rt_create,
-       metaslab_rt_destroy,
-       metaslab_rt_add,
-       metaslab_rt_remove,
-       metaslab_rt_vacate
-};
-
 /*
  * ==========================================================================
  * Common allocator routines
@@ -1425,7 +1346,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg,
         * addition of new space; and for debugging, it ensures that we'd
         * data fault on any attempt to use this metaslab before it's ready.
         */
-       ms->ms_tree = range_tree_create(&metaslab_rt_ops, ms, &ms->ms_lock);
+       ms->ms_tree = range_tree_create_impl(&rt_avl_ops, &ms->ms_size_tree,
+           metaslab_rangesize_compare, &ms->ms_lock, 0);
        metaslab_group_add(mg, ms);
 
        metaslab_set_fragmentation(ms);
index ebef7f447862d9f966c49d752f3ff923548887fc..01ef463ecc25a8fea8deceb2d25079513b116fc7 100644 (file)
 #include <sys/zio.h>
 #include <sys/range_tree.h>
 
+/*
+ * Range trees are tree-based data structures that can be used to
+ * track free space or generally any space allocation information.
+ * A range tree keeps track of individual segments and automatically
+ * provides facilities such as adjacent extent merging and extent
+ * splitting in response to range add/remove requests.
+ *
+ * A range tree starts out completely empty, with no segments in it.
+ * Adding an allocation via range_tree_add to the range tree can either:
+ * 1) create a new extent
+ * 2) extend an adjacent extent
+ * 3) merge two adjacent extents
+ * Conversely, removing an allocation via range_tree_remove can:
+ * 1) completely remove an extent
+ * 2) shorten an extent (if the allocation was near one of its ends)
+ * 3) split an extent into two extents, in effect punching a hole
+ *
+ * A range tree is also capable of 'bridging' gaps when adding
+ * allocations. This is useful for cases when close proximity of
+ * allocations is an important detail that needs to be represented
+ * in the range tree. See the gap argument of range_tree_create_impl().
+ * The default is not to bridge gaps (i.e. the maximum gap size is 0).
+ *
+ * In order to traverse a range tree, use either the range_tree_walk()
+ * or range_tree_vacate() functions.
+ *
+ * To obtain more accurate information on individual segment
+ * operations that the range tree performs "under the hood", you can
+ * specify a set of callbacks by passing a range_tree_ops_t structure
+ * to the range_tree_create function. Any callbacks that are non-NULL
+ * are then called at the appropriate times.
+ *
+ * The range tree code also supports a special variant of range trees
+ * that can bridge small gaps between segments. This kind of tree is used
+ * by the dsl scanning code to group I/Os into mostly sequential chunks to
+ * optimize disk performance. The code here attempts to do this with as
+ * little memory and computational overhead as possible. One limitation of
+ * this implementation is that range trees with gaps only support removing
+ * complete segments; a partial removal can only adjust the fill count.
+ */
+
 kmem_cache_t *range_seg_cache;
 
+/*
+ * Generic ops for managing an AVL tree alongside a range tree. Every
+ * callback receives the secondary avl_tree_t as its arg and links segments
+ * into it through rs_pp_node (see the rt_avl_*() functions).
+ */
+struct range_tree_ops rt_avl_ops = {
+       .rtop_create = rt_avl_create,
+       .rtop_destroy = rt_avl_destroy,
+       .rtop_add = rt_avl_add,
+       .rtop_remove = rt_avl_remove,
+       .rtop_vacate = rt_avl_vacate,
+};
+
 void
 range_tree_init(void)
 {
@@ -75,6 +125,18 @@ range_tree_stat_verify(range_tree_t *rt)
        }
 }
 
+/*
+ * Changes out the lock used by the range tree. Useful when you are moving
+ * the range tree between containing structures without having to recreate
+ * it. Both the old and new locks must be held by the caller.
+ *
+ * rt is the range tree to update and lp is the lock that replaces rt_lock.
+ */
+void
+range_tree_set_lock(range_tree_t *rt, kmutex_t *lp)
+{
+       ASSERT(MUTEX_HELD(rt->rt_lock) && MUTEX_HELD(lp));
+       rt->rt_lock = lp;
+}
+
 static void
 range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs)
 {
@@ -121,31 +183,38 @@ range_tree_seg_compare(const void *x1, const void *x2)
 }
 
 range_tree_t *
-range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp)
+range_tree_create_impl(range_tree_ops_t *ops, void *arg,
+    int (*avl_compare) (const void *, const void *), kmutex_t *lp, uint64_t gap)
 {
-       range_tree_t *rt;
-
-       rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
+       range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP);
 
        avl_create(&rt->rt_root, range_tree_seg_compare,
            sizeof (range_seg_t), offsetof(range_seg_t, rs_node));
 
        rt->rt_lock = lp;
        rt->rt_ops = ops;
+       rt->rt_gap = gap;
        rt->rt_arg = arg;
+       rt->rt_avl_compare = avl_compare;
 
-       if (rt->rt_ops != NULL)
+       if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL)
                rt->rt_ops->rtop_create(rt, rt->rt_arg);
 
        return (rt);
 }
 
+/*
+ * Convenience wrapper around range_tree_create_impl() for the common case:
+ * the tree is created with the given ops, callback argument and lock, with
+ * no AVL comparison function (NULL) and a gap of 0.
+ */
+range_tree_t *
+range_tree_create(range_tree_ops_t *ops, void *arg, kmutex_t *lp)
+{
+       return (range_tree_create_impl(ops, arg, NULL, lp, 0));
+}
+
 void
 range_tree_destroy(range_tree_t *rt)
 {
        VERIFY0(rt->rt_space);
 
-       if (rt->rt_ops != NULL)
+       if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL)
                rt->rt_ops->rtop_destroy(rt, rt->rt_arg);
 
        avl_destroy(&rt->rt_root);
@@ -153,40 +222,102 @@ range_tree_destroy(range_tree_t *rt)
 }
 
 void
-range_tree_add(void *arg, uint64_t start, uint64_t size)
+range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta)
+{
+       ASSERT(MUTEX_HELD(rt->rt_lock));
+
+       ASSERT3U(rs->rs_fill + delta, !=, 0);
+       ASSERT3U(rs->rs_fill + delta, <=, rs->rs_end - rs->rs_start);
+
+       if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+               rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
+       rs->rs_fill += delta;
+       if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
+               rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
+}
+
+static void
+range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill)
 {
        range_tree_t *rt = arg;
        avl_index_t where;
        range_seg_t rsearch, *rs_before, *rs_after, *rs;
-       uint64_t end = start + size;
+       uint64_t end = start + size, gap = rt->rt_gap;
+       uint64_t bridge_size = 0;
        boolean_t merge_before, merge_after;
 
        ASSERT(MUTEX_HELD(rt->rt_lock));
-       VERIFY(size != 0);
+       ASSERT3U(size, !=, 0);
+       ASSERT3U(fill, <=, size);
 
        rsearch.rs_start = start;
        rsearch.rs_end = end;
        rs = avl_find(&rt->rt_root, &rsearch, &where);
 
-       if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) {
+       if (gap == 0 && rs != NULL &&
+           rs->rs_start <= start && rs->rs_end >= end) {
                zfs_panic_recover("zfs: allocating allocated segment"
-                   "(offset=%llu size=%llu)\n",
-                   (longlong_t)start, (longlong_t)size);
+                   "(offset=%llu size=%llu) of (offset=%llu size=%llu)\n",
+                   (longlong_t)start, (longlong_t)size,
+                   (longlong_t)rs->rs_start,
+                   (longlong_t)rs->rs_end - rs->rs_start);
+               return;
+       }
+
+       /*
+        * If this is a gap-supporting range tree, it is possible that we
+        * are inserting into an existing segment. In this case simply
+        * bump the fill count and call the remove / add callbacks. If the
+        * new range will extend an existing segment, we remove the
+        * existing one, apply the new extent to it and re-insert it using
+        * the normal code paths.
+        */
+       if (rs != NULL) {
+               ASSERT3U(gap, !=, 0);
+               if (rs->rs_start <= start && rs->rs_end >= end) {
+                       range_tree_adjust_fill(rt, rs, fill);
+                       return;
+               }
+
+               avl_remove(&rt->rt_root, rs);
+               if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+                       rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
+
+               range_tree_stat_decr(rt, rs);
+               rt->rt_space -= rs->rs_end - rs->rs_start;
+
+               fill += rs->rs_fill;
+               start = MIN(start, rs->rs_start);
+               end = MAX(end, rs->rs_end);
+               size = end - start;
+
+               range_tree_add_impl(rt, start, size, fill);
+
+               kmem_cache_free(range_seg_cache, rs);
                return;
        }
 
-       /* Make sure we don't overlap with either of our neighbors */
-       VERIFY(rs == NULL);
+       ASSERT3P(rs, ==, NULL);
 
+       /*
+        * Determine whether or not we will have to merge with our neighbors.
+        * If gap != 0, we might need to merge with our neighbors even if we
+        * aren't directly touching.
+        */
        rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE);
        rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER);
 
-       merge_before = (rs_before != NULL && rs_before->rs_end == start);
-       merge_after = (rs_after != NULL && rs_after->rs_start == end);
+       merge_before = (rs_before != NULL && rs_before->rs_end >= start - gap);
+       merge_after = (rs_after != NULL && rs_after->rs_start <= end + gap);
+
+       if (merge_before && gap != 0)
+               bridge_size += start - rs_before->rs_end;
+       if (merge_after && gap != 0)
+               bridge_size += rs_after->rs_start - end;
 
        if (merge_before && merge_after) {
                avl_remove(&rt->rt_root, rs_before);
-               if (rt->rt_ops != NULL) {
+               if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) {
                        rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
                        rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
                }
@@ -194,43 +325,59 @@ range_tree_add(void *arg, uint64_t start, uint64_t size)
                range_tree_stat_decr(rt, rs_before);
                range_tree_stat_decr(rt, rs_after);
 
+               rs_after->rs_fill += rs_before->rs_fill + fill;
                rs_after->rs_start = rs_before->rs_start;
                kmem_cache_free(range_seg_cache, rs_before);
                rs = rs_after;
        } else if (merge_before) {
-               if (rt->rt_ops != NULL)
+               if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
                        rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg);
 
                range_tree_stat_decr(rt, rs_before);
 
+               rs_before->rs_fill += fill;
                rs_before->rs_end = end;
                rs = rs_before;
        } else if (merge_after) {
-               if (rt->rt_ops != NULL)
+               if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
                        rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg);
 
                range_tree_stat_decr(rt, rs_after);
 
+               rs_after->rs_fill += fill;
                rs_after->rs_start = start;
                rs = rs_after;
        } else {
                rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
+
+               rs->rs_fill = fill;
                rs->rs_start = start;
                rs->rs_end = end;
                avl_insert(&rt->rt_root, rs, where);
        }
 
-       if (rt->rt_ops != NULL)
+       if (gap != 0)
+               ASSERT3U(rs->rs_fill, <=, rs->rs_end - rs->rs_start);
+       else
+               ASSERT3U(rs->rs_fill, ==, rs->rs_end - rs->rs_start);
+
+       if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
                rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
 
        range_tree_stat_incr(rt, rs);
-       rt->rt_space += size;
+       rt->rt_space += size + bridge_size;
 }
 
 void
-range_tree_remove(void *arg, uint64_t start, uint64_t size)
+range_tree_add(void *arg, uint64_t start, uint64_t size)
+{
+       range_tree_add_impl(arg, start, size, size);
+}
+
+static void
+range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size,
+    boolean_t do_fill)
 {
-       range_tree_t *rt = arg;
        avl_index_t where;
        range_seg_t rsearch, *rs, *newseg;
        uint64_t end = start + size;
@@ -251,6 +398,34 @@ range_tree_remove(void *arg, uint64_t start, uint64_t size)
                    (longlong_t)start, (longlong_t)size);
                return;
        }
+
+       /*
+        * Range trees with gap support must only remove complete segments
+        * from the tree. This allows us to maintain accurate fill accounting
+        * and to ensure that bridged sections are not leaked. If we need to
+        * remove less than the full segment, we can only adjust the fill count.
+        */
+       if (rt->rt_gap != 0) {
+               if (do_fill) {
+                       if (rs->rs_fill == size) {
+                               start = rs->rs_start;
+                               end = rs->rs_end;
+                               size = end - start;
+                       } else {
+                               range_tree_adjust_fill(rt, rs, -size);
+                               return;
+                       }
+               } else if (rs->rs_start != start || rs->rs_end != end) {
+                       zfs_panic_recover("zfs: freeing partial segment of "
+                           "gap tree (offset=%llu size=%llu) of "
+                           "(offset=%llu size=%llu)",
+                           (longlong_t)start, (longlong_t)size,
+                           (longlong_t)rs->rs_start,
+                           (longlong_t)rs->rs_end - rs->rs_start);
+                       return;
+               }
+       }
+
        VERIFY3U(rs->rs_start, <=, start);
        VERIFY3U(rs->rs_end, >=, end);
 
@@ -259,19 +434,20 @@ range_tree_remove(void *arg, uint64_t start, uint64_t size)
 
        range_tree_stat_decr(rt, rs);
 
-       if (rt->rt_ops != NULL)
+       if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
                rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
 
        if (left_over && right_over) {
                newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP);
                newseg->rs_start = end;
                newseg->rs_end = rs->rs_end;
+               newseg->rs_fill = newseg->rs_end - newseg->rs_start;
                range_tree_stat_incr(rt, newseg);
 
                rs->rs_end = start;
 
                avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER);
-               if (rt->rt_ops != NULL)
+               if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
                        rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg);
        } else if (left_over) {
                rs->rs_end = start;
@@ -284,15 +460,55 @@ range_tree_remove(void *arg, uint64_t start, uint64_t size)
        }
 
        if (rs != NULL) {
+               /*
+                * The fill of the leftover segment will always be equal to
+                * the size, since we do not support removing partial segments
+                * of range trees with gaps.
+                */
+               rs->rs_fill = rs->rs_end - rs->rs_start;
                range_tree_stat_incr(rt, rs);
 
-               if (rt->rt_ops != NULL)
+               if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
                        rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
        }
 
        rt->rt_space -= size;
 }
 
+/*
+ * Remove the range [start, start + size) from the range tree passed as arg
+ * (a range_tree_t handed in as void *). This calls range_tree_remove_impl()
+ * with do_fill set to B_FALSE, so on a tree with gap support the range must
+ * match a complete segment; a partial removal is reported through
+ * zfs_panic_recover() and ignored.
+ */
+void
+range_tree_remove(void *arg, uint64_t start, uint64_t size)
+{
+       range_tree_remove_impl(arg, start, size, B_FALSE);
+}
+
+/*
+ * Variant of range_tree_remove() that calls range_tree_remove_impl() with
+ * do_fill set to B_TRUE. On a tree with gap support, the containing segment
+ * is removed in its entirety when size equals the segment's fill count;
+ * otherwise only the fill count is reduced by size and the segment stays
+ * in the tree.
+ */
+void
+range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size)
+{
+       range_tree_remove_impl(rt, start, size, B_TRUE);
+}
+
+/*
+ * Change the extent of the existing segment rs of rt in place so that it
+ * starts at newstart and ends at newstart + newsize. The ops remove/add
+ * callbacks (when set) and range_tree_stat_decr()/range_tree_stat_incr()
+ * are invoked around the change, and rt_space is adjusted by the difference
+ * between the new and the old size. rs_fill is left untouched and rs is not
+ * removed from or re-inserted into rt_root here. The caller must hold
+ * rt_lock.
+ */
+void
+range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs,
+    uint64_t newstart, uint64_t newsize)
+{
+       /* Size difference; negative when the segment shrinks. */
+       int64_t delta = newsize - (rs->rs_end - rs->rs_start);
+
+       ASSERT(MUTEX_HELD(rt->rt_lock));
+
+       /* Drop the segment from the stats and the ops callbacks first. */
+       range_tree_stat_decr(rt, rs);
+       if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL)
+               rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg);
+
+       rs->rs_start = newstart;
+       rs->rs_end = newstart + newsize;
+
+       /* Account for the segment again with its new extent. */
+       range_tree_stat_incr(rt, rs);
+       if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL)
+               rt->rt_ops->rtop_add(rt, rs, rt->rt_arg);
+
+       rt->rt_space += delta;
+}
+
 static range_seg_t *
 range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
 {
@@ -308,7 +524,7 @@ range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size)
        return (avl_find(&rt->rt_root, &rsearch, &where));
 }
 
-static range_seg_t *
+range_seg_t *
 range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size)
 {
        range_seg_t *rs = range_tree_find_impl(rt, start, size);
@@ -373,7 +589,7 @@ range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg)
 
        ASSERT(MUTEX_HELD(rt->rt_lock));
 
-       if (rt->rt_ops != NULL)
+       if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL)
                rt->rt_ops->rtop_vacate(rt, rt->rt_arg);
 
        while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) {
@@ -397,8 +613,60 @@ range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg)
                func(arg, rs->rs_start, rs->rs_end - rs->rs_start);
 }
 
+/*
+ * Return the first segment of rt in the ordering of its rt_root AVL tree,
+ * as reported by avl_first(). The caller must hold rt_lock.
+ */
+range_seg_t *
+range_tree_first(range_tree_t *rt)
+{
+       ASSERT(MUTEX_HELD(rt->rt_lock));
+       return (avl_first(&rt->rt_root));
+}
+
 uint64_t
 range_tree_space(range_tree_t *rt)
 {
        return (rt->rt_space);
 }
+
+/* Generic range tree functions for maintaining segments in an AVL tree. */
+
+/*
+ * Initialize the AVL tree passed as arg. Segments are ordered with the
+ * comparison function stored in rt->rt_avl_compare and linked into the
+ * tree through their rs_pp_node member.
+ */
+void
+rt_avl_create(range_tree_t *rt, void *arg)
+{
+       avl_tree_t *tree = arg;
+
+       avl_create(tree, rt->rt_avl_compare, sizeof (range_seg_t),
+           offsetof(range_seg_t, rs_pp_node));
+}
+
+/*
+ * Destroy the AVL tree passed as arg. The tree is expected to contain no
+ * nodes at this point (asserted). rt is unused.
+ */
+void
+rt_avl_destroy(range_tree_t *rt, void *arg)
+{
+       avl_tree_t *tree = arg;
+
+       ASSERT0(avl_numnodes(tree));
+       avl_destroy(tree);
+}
+
+/*
+ * Add segment rs to the AVL tree passed as arg. rt is unused.
+ */
+void
+rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg)
+{
+       avl_tree_t *tree = arg;
+       avl_add(tree, rs);
+}
+
+/*
+ * Remove segment rs from the AVL tree passed as arg. rt is unused.
+ */
+void
+rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg)
+{
+       avl_tree_t *tree = arg;
+       avl_remove(tree, rs);
+}
+
+/*
+ * Empty the AVL tree passed as arg by re-running rt_avl_create() on it
+ * instead of removing its nodes one by one.
+ */
+void
+rt_avl_vacate(range_tree_t *rt, void *arg)
+{
+       /*
+        * Normally one would walk the tree freeing nodes along the way.
+        * Since the nodes are shared with the range trees we can avoid
+        * walking all nodes and just reinitialize the avl tree. The nodes
+        * will be freed by the range tree, so we don't want to free them here.
+        */
+       rt_avl_create(rt, arg);
+}
index 0604742ab17acd2b1a23d615be90f777af07e652..e06190f9db832209c68db061879f439d9d995d66 100644 (file)
@@ -1996,7 +1996,7 @@ spa_load_verify_done(zio_t *zio)
        }
 
        mutex_enter(&spa->spa_scrub_lock);
-       spa->spa_scrub_inflight--;
+       spa->spa_load_verify_ios--;
        cv_broadcast(&spa->spa_scrub_io_cv);
        mutex_exit(&spa->spa_scrub_lock);
 }
@@ -2030,9 +2030,9 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
        size_t size = BP_GET_PSIZE(bp);
 
        mutex_enter(&spa->spa_scrub_lock);
-       while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
+       while (spa->spa_load_verify_ios >= spa_load_verify_maxinflight)
                cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
-       spa->spa_scrub_inflight++;
+       spa->spa_load_verify_ios++;
        mutex_exit(&spa->spa_scrub_lock);
 
        zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
index 9a3290e95206515b1e5a16bdc018ef68f9456739..116b0ebd91ea8913b5b1adba39d397ec5d81f818 100644 (file)
@@ -1892,6 +1892,7 @@ spa_init(int mode)
        zpool_feature_init();
        spa_config_load();
        l2arc_start();
+       scan_init();
        qat_init();
 }
 
@@ -1915,6 +1916,7 @@ spa_fini(void)
        unique_fini();
        refcount_fini();
        fm_fini();
+       scan_fini();
        qat_fini();
 
        avl_destroy(&spa_namespace_avl);
@@ -2016,6 +2018,7 @@ spa_scan_stat_init(spa_t *spa)
                spa->spa_scan_pass_scrub_pause = 0;
        spa->spa_scan_pass_scrub_spent_paused = 0;
        spa->spa_scan_pass_exam = 0;
+       spa->spa_scan_pass_issued = 0;
        vdev_scan_stat_init(spa->spa_root_vdev);
 }
 
@@ -2033,18 +2036,21 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
 
        /* data stored on disk */
        ps->pss_func = scn->scn_phys.scn_func;
+       ps->pss_state = scn->scn_phys.scn_state;
        ps->pss_start_time = scn->scn_phys.scn_start_time;
        ps->pss_end_time = scn->scn_phys.scn_end_time;
        ps->pss_to_examine = scn->scn_phys.scn_to_examine;
-       ps->pss_examined = scn->scn_phys.scn_examined;
        ps->pss_to_process = scn->scn_phys.scn_to_process;
        ps->pss_processed = scn->scn_phys.scn_processed;
        ps->pss_errors = scn->scn_phys.scn_errors;
-       ps->pss_state = scn->scn_phys.scn_state;
+       ps->pss_examined = scn->scn_phys.scn_examined;
+       ps->pss_issued =
+           scn->scn_issued_before_pass + spa->spa_scan_pass_issued;
 
        /* data not stored on disk */
        ps->pss_pass_start = spa->spa_scan_pass_start;
        ps->pss_pass_exam = spa->spa_scan_pass_exam;
+       ps->pss_pass_issued = spa->spa_scan_pass_issued;
        ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause;
        ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused;
 
index 2df0040af9253e4e8c2007e76cb3b55a245b6b9c..9edeaf52592aaee77e8af8108345a702b31449ca 100644 (file)
@@ -360,6 +360,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
        mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&vd->vdev_queue_lock, NULL, MUTEX_DEFAULT, NULL);
+       mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
 
        for (int t = 0; t < DTL_TYPES; t++) {
                vd->vdev_dtl[t] = range_tree_create(NULL, NULL,
@@ -647,6 +648,18 @@ vdev_free(vdev_t *vd)
 {
        spa_t *spa = vd->vdev_spa;
 
+       /*
+        * Scan queues are normally destroyed at the end of a scan. If the
+        * queue exists here, that implies the vdev is being removed while
+        * the scan is still running.
+        */
+       if (vd->vdev_scan_io_queue != NULL) {
+               mutex_enter(&vd->vdev_scan_io_queue_lock);
+               dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue);
+               vd->vdev_scan_io_queue = NULL;
+               mutex_exit(&vd->vdev_scan_io_queue_lock);
+       }
+
        /*
         * vdev_free() implies closing the vdev first.  This is simpler than
         * trying to ensure complicated semantics for all callers.
@@ -723,6 +736,7 @@ vdev_free(vdev_t *vd)
        mutex_destroy(&vd->vdev_dtl_lock);
        mutex_destroy(&vd->vdev_stat_lock);
        mutex_destroy(&vd->vdev_probe_lock);
+       mutex_destroy(&vd->vdev_scan_io_queue_lock);
 
        zfs_ratelimit_fini(&vd->vdev_delay_rl);
        zfs_ratelimit_fini(&vd->vdev_checksum_rl);
@@ -800,6 +814,8 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
 
        tvd->vdev_islog = svd->vdev_islog;
        svd->vdev_islog = 0;
+
+       dsl_scan_io_queue_vdev_xfer(svd, tvd);
 }
 
 static void
index 36a4bf629d61cf1ea48cdde73066682c66fc1bc3..792642952e22a81f562558a524a5090c8a389afb 100644 (file)
@@ -169,7 +169,7 @@ int zfs_vdev_async_write_active_max_dirty_percent = 60;
  * we include spans of optional I/Os to aid aggregation at the disk even when
  * they aren't able to help us aggregate at this level.
  */
-int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE;
+int zfs_vdev_aggregation_limit = 1 << 20;
 int zfs_vdev_read_gap_limit = 32 << 10;
 int zfs_vdev_write_gap_limit = 4 << 10;
 
index 6d1b860cc0de7cdbadd754596aac9524bc1b043d..2f6aed66736332b76f91148220d50a6872e4b159 100644 (file)
@@ -1070,7 +1070,7 @@ zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj,
                }
                err = zap_add(os, intoobj, za.za_name,
                    8, 1, &value, tx);
-               if (err)
+               if (err != 0)
                        break;
        }
        zap_cursor_fini(&zc);
index 4cfda7a9e53db32b28cec1403c47fbfa3e46a109..311f79e23b93223dcd0bcec6025051b7199b2a80 100644 (file)
@@ -39,6 +39,7 @@
 #include <sys/ddt.h>
 #include <sys/blkptr.h>
 #include <sys/zfeature.h>
+#include <sys/dsl_scan.h>
 #include <sys/metaslab_impl.h>
 #include <sys/time.h>
 #include <sys/trace_zio.h>
@@ -1050,6 +1051,7 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 
        metaslab_check_free(spa, bp);
        arc_freed(spa, bp);
+       dsl_scan_freed(spa, bp);
 
        /*
         * GANG and DEDUP blocks can induce a read (for the gang block header,
@@ -3333,26 +3335,6 @@ zio_vdev_io_start(zio_t *zio)
 
        ASSERT3P(zio->io_logical, !=, zio);
 
-       /*
-        * We keep track of time-sensitive I/Os so that the scan thread
-        * can quickly react to certain workloads.  In particular, we care
-        * about non-scrubbing, top-level reads and writes with the following
-        * characteristics:
-        *      - synchronous writes of user data to non-slog devices
-        *      - any reads of user data
-        * When these conditions are met, adjust the timestamp of spa_last_io
-        * which allows the scan thread to adjust its workload accordingly.
-        */
-       if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
-           vd == vd->vdev_top && !vd->vdev_islog &&
-           zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
-           zio->io_txg != spa_syncing_txg(spa)) {
-               uint64_t old = spa->spa_last_io;
-               uint64_t new = ddi_get_lbolt64();
-               if (old != new)
-                       (void) atomic_cas_64(&spa->spa_last_io, old, new);
-       }
-
        align = 1ULL << vd->vdev_top->vdev_ashift;
 
        if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
index 42bc457ea4e0bd0a6c092ddb622ff11f51f9af39..6ac748818461720d62a6b39d67c79175ec09047e 100755 (executable)
@@ -33,7 +33,7 @@
 # 8. Put another device offline and check if the test file checksum is correct.
 #
 # NOTES:
-#      A 25ms delay is added to make sure that the scrub is running while
+#      A 250ms delay is added to make sure that the scrub is running while
 #      the reopen kicks the resilver.
 #
 
@@ -70,7 +70,7 @@ log_must md5sum $TESTFILE > $TESTFILE_MD5
 
 # 4. Execute scrub.
 # add delay to I/O requests for remaining disk in pool
-log_must zinject -d $DISK2 -D25:1 $TESTPOOL
+log_must zinject -d $DISK2 -D250:1 $TESTPOOL
 log_must zpool scrub $TESTPOOL
 
 # 5. "Plug back" disk.
@@ -81,12 +81,12 @@ log_must check_state $TESTPOOL "$REMOVED_DISK_ID" "online"
 # 7. Check if scrub scan is replaced by resilver.
 # the scrub operation has to be running while reopen is executed
 log_must is_pool_scrubbing $TESTPOOL true
+# remove delay from disk
+log_must zinject -c all
 # the scrub will be replaced by resilver, wait until it ends
 log_must wait_for_resilver_end $TESTPOOL $MAXTIMEOUT
 # check if the scrub scan has been interrupted by resilver
 log_must is_scan_restarted $TESTPOOL
-# remove delay from disk
-log_must zinject -c all
 
 # 8. Put another device offline and check if the test file checksum is correct.
 log_must zpool offline $TESTPOOL $DISK2
index d61283d143f5e00576dc6c3249e4b3cb8bcac99e..30c389ce8414e16d32da2394333691fae53704c7 100755 (executable)
@@ -34,7 +34,7 @@
 #    replicas.
 #
 # NOTES:
-#      A 25ms delay is added to make sure that the scrub is running while
+#      A 125ms delay is added to make sure that the scrub is running while
 #      the reopen is invoked.
 #
 
@@ -64,20 +64,19 @@ log_must check_state $TESTPOOL "$REMOVED_DISK_ID" "unavail"
 log_must generate_random_file /$TESTPOOL/data $LARGE_FILE_SIZE
 # 4. Execute scrub.
 # add delay to I/O requests for remaining disk in pool
-log_must zinject -d $DISK2 -D25:1 $TESTPOOL
+log_must zinject -d $DISK2 -D125:1 $TESTPOOL
 log_must zpool scrub $TESTPOOL
 # 5. "Plug back" disk.
 insert_disk $REMOVED_DISK $scsi_host
 # 6. Reopen a pool with an -n flag.
 log_must zpool reopen -n $TESTPOOL
 log_must check_state $TESTPOOL "$REMOVED_DISK_ID" "online"
+# remove delay from disk
+log_must zinject -c all
 # 7. Check if scrub scan is NOT replaced by resilver.
 log_must wait_for_scrub_end $TESTPOOL $MAXTIMEOUT
 log_mustnot is_scan_restarted $TESTPOOL
 
-# remove delay from disk
-log_must zinject -c all
-
 # 8. Check if trying to put device to offline fails because of no valid
 #    replicas.
 log_mustnot zpool offline $TESTPOOL $DISK2
index 74396de797c369adcf20d19eb0e472e97c15bb9c..e8bb8bceb6ab32a4a018a94c6daee460b9247527 100755 (executable)
@@ -26,7 +26,9 @@
 #
 
 . $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg
 
 verify_runnable "global"
 
+log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_DEFAULT
 destroy_mirrors
index 374bbe78bb212204e695b215372798fa9f6bf4ee..936fd798e9ddc80c1d83b068240f6dceef50fd8c 100755 (executable)
@@ -37,8 +37,8 @@ verify_disk_count "$DISKS" 2
 
 default_mirror_setup_noexit $DISK1 $DISK2
 
-mntpnt=$(get_prop mountpoint $TESTPOOL)
+mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
 
-# Create 100MB of data
-log_must file_write -b 1048576 -c 100 -o create -d 0 -f $mntpnt/bigfile
+# Create 256M of data
+log_must file_write -b 1048576 -c 256 -o create -d 0 -f $mntpnt/bigfile
 log_pass
index bd2b57e2c1b59ffd61fe84a1a1f6cb98fd649411..fdf2f428477f9d5d8f9a8d2bc94d9e0a0b84dca8 100644 (file)
@@ -30,3 +30,6 @@
 
 export DISK1=${DISKS%% *}
 export DISK2=$(echo $DISKS | awk '{print $2}')
+
+export ZFS_SCAN_VDEV_LIMIT_SLOW=$((128*1024))
+export ZFS_SCAN_VDEV_LIMIT_DEFAULT=$((4*1024*1024))
index 26c22fd98a83ae41e3964770b6d9c76676dca439..712097bb1ca8d7f37176fb3996afac862e964d0d 100755 (executable)
@@ -46,9 +46,9 @@
 #      6. Verify zpool scrub -s succeed when the system is scrubbing.
 #
 # NOTES:
-#      A 10ms delay is added to the ZIOs in order to ensure that the
-#      scrub does not complete before it has a chance to be cancelled.
-#      This can occur when testing with small pools or very fast hardware.
+#      Artificially limit the scrub speed by setting the zfs_scan_vdev_limit
+#      low and adding a 50ms zio delay in order to ensure that the scrub does
+#      not complete before it can be paused and stopped.
 #
 
 verify_runnable "global"
@@ -56,13 +56,21 @@ verify_runnable "global"
 function cleanup
 {
        log_must zinject -c all
+       log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_DEFAULT
+       log_must rm -f $mntpnt/biggerfile
 }
 
 log_onexit cleanup
 
 log_assert "Verify scrub, scrub -p, and scrub -s show the right status."
 
-log_must zinject -d $DISK1 -D20:1 $TESTPOOL
+# Create 1G of additional data
+mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS)
+log_must file_write -b 1048576 -c 1024 -o create -d 0 -f $mntpnt/biggerfile
+log_must sync
+
+log_must zinject -d $DISK1 -D50:1 $TESTPOOL
+log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_SLOW
 log_must zpool scrub $TESTPOOL
 log_must is_pool_scrubbing $TESTPOOL true
 log_must zpool scrub -p $TESTPOOL
index 2af3efcff72754f058b61c30865fa24c61934714..c52ad84bc513797e41dec97e566103978745a826 100755 (executable)
 #      2. Kick off a second scrub and verify it fails
 #
 # NOTES:
-#      A 10ms delay is added to the ZIOs in order to ensure that the
-#      scrub does not complete before it has a chance to be restarted.
-#      This can occur when testing with small pools or very fast hardware.
+#      Artificially limit the scrub speed by setting the zfs_scan_vdev_limit
+#      low so that the first scrub is still running when the second starts.
 #
 
 verify_runnable "global"
 
 function cleanup
 {
-               log_must zinject -c all
+       log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_DEFAULT
 }
 
 log_onexit cleanup
 
 log_assert "Scrub command fails when there is already a scrub in progress"
 
-log_must zinject -d $DISK1 -D10:1 $TESTPOOL
+log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_SLOW
 log_must zpool scrub $TESTPOOL
 log_must is_pool_scrubbing $TESTPOOL true
 log_mustnot zpool scrub $TESTPOOL
index fabe276266f878b27fba391417e7eb1708c3eb18..4d29e78bcbfbf688007714122d92f411d1cb9c6f 100755 (executable)
 #      3. Verify scrub failed until the resilver completed
 #
 # NOTES:
-#      A 10ms delay is added to 10% of zio's in order to ensure that the
-#      resilver does not complete before the scrub can be issued.  This
-#      can occur when testing with small pools or very fast hardware.
+#      Limit the resilver speed by setting zfs_scan_vdev_limit low so that
+#      the resilver does not complete before the scrub can be issued.
+#
 
 function cleanup
 {
-       log_must zinject -c all
+       log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_DEFAULT
 }
 
 verify_runnable "global"
@@ -62,13 +62,12 @@ log_onexit cleanup
 
 log_assert "Resilver prevent scrub from starting until the resilver completes"
 
+log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_SLOW
 log_must zpool detach $TESTPOOL $DISK2
-log_must zinject -d $DISK1 -D10:1 $TESTPOOL
 log_must zpool attach $TESTPOOL $DISK1 $DISK2
 log_must is_pool_resilvering $TESTPOOL
 log_mustnot zpool scrub $TESTPOOL
 
-# Allow the resilver to finish, or it will interfere with the next test.
 while ! is_pool_resilvered $TESTPOOL; do
        sleep 1
 done
index 5312f8e82d20cd21a8c5ece1790007ebbc5d0d0a..893cf74b5ba7c89f23d96f8bcf759f8b8d004cd1 100755 (executable)
@@ -63,4 +63,8 @@ log_must zpool scrub $TESTPOOL
 log_must zpool detach $TESTPOOL $DISK1
 log_must zpool attach $TESTPOOL $DISK2 $DISK1
 
+while ! is_pool_resilvered $TESTPOOL; do
+       sleep 1
+done
+
 log_pass "When scrubbing, detach device should not break system."
index 8404744841224654b66e166a77b56c3f1bf0a21c..3bc798d1a9f8e777aee2e6758200853a36b5e706 100755 (executable)
@@ -49,7 +49,7 @@ verify_runnable "global"
 function cleanup
 {
        poolexists $TESTPOOL && destroy_pool $TESTPOOL
-       log_must rm -f $DISK1 $DISK2 $DISK3
+       log_must rm -f $DISK1 $DISK2 $DISK3 $DISK4
 }
 
 #
@@ -94,14 +94,16 @@ TESTDIR="$TEST_BASE_DIR/zpool_scrub_offline_device"
 DISK1="$TEST_BASE_DIR/zpool_disk1.dat"
 DISK2="$TEST_BASE_DIR/zpool_disk2.dat"
 DISK3="$TEST_BASE_DIR/zpool_disk3.dat"
+DISK4="$TEST_BASE_DIR/zpool_disk4.dat"
 
 # 1. Create the pool
 log_must truncate -s $DEVSIZE $DISK1
 log_must truncate -s $DEVSIZE $DISK2
 log_must truncate -s $DEVSIZE $DISK3
+log_must truncate -s $DEVSIZE $DISK4
 poolexists $TESTPOOL && destroy_pool $TESTPOOL
 log_must zpool create -O mountpoint=$TESTDIR $TESTPOOL \
-    raidz1 $DISK1 $DISK2 $DISK3
+    raidz2 $DISK1 $DISK2 $DISK3 $DISK4
 
 # 2. Offline the first device
 zpool_do_sync 'offline' $TESTPOOL $DISK1
index b5cb3bb1d222681c0820b58fc1488f122fba2750..495b2bbadee4f5aa5c2bbc1f37aefb26178f0c81 100755 (executable)
@@ -81,6 +81,10 @@ log_must truncate -s 0 $ZED_DEBUG_LOG
 # 4. Generate additional events.
 log_must zpool offline $MPOOL $VDEV1
 log_must zpool online $MPOOL $VDEV1
+while ! is_pool_resilvered $MPOOL; do
+       sleep 1
+done
+
 log_must zpool scrub $MPOOL
 
 # Wait for the scrub to wrap, or is_healthy will be wrong.
index 5c411936d9570688266fd120c97288799a2c206d..26afc109174f3db65f618ec6e18d3a9c387130f2 100755 (executable)
@@ -78,7 +78,6 @@ function run_and_verify
        zedlog=${zedlog:-$ZED_DEBUG_LOG}
        fullcmd="$1"
        cmd=$(echo $fullcmd | awk '{print $1}')
-       subcmd=$(echo $fullcmd | awk '{print $2}')
 
        # If we aren't running zpool or zfs, something is wrong
        [[ $cmd == "zpool" || $cmd == "zfs" ]] || \