]> git.proxmox.com Git - mirror_zfs.git/commitdiff
Add device rebuild feature
authorBrian Behlendorf <behlendorf1@llnl.gov>
Fri, 3 Jul 2020 18:05:50 +0000 (11:05 -0700)
committerGitHub <noreply@github.com>
Fri, 3 Jul 2020 18:05:50 +0000 (11:05 -0700)
The device_rebuild feature enables sequential reconstruction when
resilvering.  Mirror vdevs can be rebuilt in LBA order, which may
restore redundancy more quickly depending on the pool's average block
size, overall fragmentation, and the performance characteristics
of the devices.  However, block checksums cannot be verified
as part of the rebuild, so a scrub is automatically started after
the sequential resilver completes.

The new '-s' option has been added to the `zpool attach` and
`zpool replace` commands to request sequential reconstruction
instead of healing reconstruction when resilvering.

    zpool attach -s <pool> <existing vdev> <new vdev>
    zpool replace -s <pool> <old vdev> <new vdev>

The `zpool status` output has been updated to report the progress
of sequential resilvering in the same way as healing resilvering.
The one notable difference is that multiple sequential resilvers
may be in progress as long as they're operating on different
top-level vdevs.

The `zpool wait -t resilver` command was extended to wait on
sequential resilvers.  From this perspective they are no different
than healing resilvers.

Sequential resilvers cannot be supported for RAIDZ, but are
compatible with the dRAID feature being developed.

As part of this change the resilver_restart_* tests were moved
into the functional/replacement directory.  Additionally, the
replacement tests were renamed and extended to verify both
resilvering and rebuilding.

Original-patch-by: Isaac Huang <he.huang@intel.com>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Reviewed-by: John Poduska <jpoduska@datto.com>
Co-authored-by: Mark Maybee <mmaybee@cray.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #10349

70 files changed:
cmd/zed/agents/zfs_mod.c
cmd/zed/agents/zfs_retire.c
cmd/zed/zed.d/resilver_finish-start-scrub.sh
cmd/zpool/zpool_main.c
cmd/ztest/ztest.c
configure.ac
contrib/pyzfs/libzfs_core/_constants.py
include/libzfs.h
include/sys/Makefile.am
include/sys/dsl_scan.h
include/sys/fs/zfs.h
include/sys/spa.h
include/sys/spa_impl.h
include/sys/vdev.h
include/sys/vdev_impl.h
include/sys/vdev_rebuild.h [new file with mode: 0644]
include/sys/zio_priority.h
include/zfeature_common.h
lib/libzfs/libzfs_pool.c
lib/libzfs/libzfs_status.c
lib/libzfs/libzfs_util.c
lib/libzpool/Makefile.am
man/man5/zfs-module-parameters.5
man/man5/zpool-features.5
man/man8/zpool-attach.8
man/man8/zpool-replace.8
man/man8/zpool-status.8
module/Makefile.bsd
module/zcommon/zfeature_common.c
module/zfs/Makefile.in
module/zfs/dsl_scan.c
module/zfs/spa.c
module/zfs/spa_misc.c
module/zfs/vdev.c
module/zfs/vdev_label.c
module/zfs/vdev_mirror.c
module/zfs/vdev_queue.c
module/zfs/vdev_rebuild.c [new file with mode: 0644]
module/zfs/zfs_ioctl.c
tests/runfiles/common.run
tests/zfs-tests/include/libtest.shlib
tests/zfs-tests/tests/functional/Makefile.am
tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg
tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/Makefile.am
tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_rebuild.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/replacement/Makefile.am
tests/zfs-tests/tests/functional/replacement/attach_import.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/replacement/attach_multiple.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/replacement/attach_rebuild.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/replacement/attach_resilver.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/replacement/detach.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/replacement/rebuild_disabled_feature.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/replacement/rebuild_multiple.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/replacement/rebuild_raidz.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/replacement/replace_import.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/replacement/replace_rebuild.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/replacement/replace_resilver.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/replacement/replacement.cfg
tests/zfs-tests/tests/functional/replacement/replacement_001_pos.ksh [deleted file]
tests/zfs-tests/tests/functional/replacement/replacement_002_pos.ksh [deleted file]
tests/zfs-tests/tests/functional/replacement/replacement_003_pos.ksh [deleted file]
tests/zfs-tests/tests/functional/replacement/resilver_restart_001.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/replacement/resilver_restart_002.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/replacement/scrub_cancel.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/resilver/Makefile.am [deleted file]
tests/zfs-tests/tests/functional/resilver/cleanup.ksh [deleted file]
tests/zfs-tests/tests/functional/resilver/resilver.cfg [deleted file]
tests/zfs-tests/tests/functional/resilver/resilver_restart_001.ksh [deleted file]
tests/zfs-tests/tests/functional/resilver/resilver_restart_002.ksh [deleted file]
tests/zfs-tests/tests/functional/resilver/setup.ksh [deleted file]

index 1094d25dd34eaf3445110546eccde7fd0d5710bd..8d0a3b4200860162663953da9959d93b732bacb9 100644 (file)
@@ -437,7 +437,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
                return;
        }
 
-       ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE);
+       ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_FALSE);
 
        zed_log_msg(LOG_INFO, "  zpool_vdev_replace: %s with %s (%s)",
            fullpath, path, (ret == 0) ? "no errors" :
index f3dbb24b84eb24233b909df919f80509ac3c414c..665fb216d5077d1b6abd7ff5787749bb8741fbe1 100644 (file)
@@ -237,7 +237,7 @@ replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev)
                    dev_name, basename(spare_name));
 
                if (zpool_vdev_attach(zhp, dev_name, spare_name,
-                   replacement, B_TRUE) == 0) {
+                   replacement, B_TRUE, B_FALSE) == 0) {
                        free(dev_name);
                        nvlist_free(replacement);
                        return (B_TRUE);
@@ -319,12 +319,16 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
 
        fmd_hdl_debug(hdl, "zfs_retire_recv: '%s'", class);
 
+       nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state);
+
        /*
         * If this is a resource notifying us of device removal then simply
         * check for an available spare and continue unless the device is a
         * l2arc vdev, in which case we just offline it.
         */
-       if (strcmp(class, "resource.fs.zfs.removed") == 0) {
+       if (strcmp(class, "resource.fs.zfs.removed") == 0 ||
+           (strcmp(class, "resource.fs.zfs.statechange") == 0 &&
+           state == VDEV_STATE_REMOVED)) {
                char *devtype;
                char *devname;
 
@@ -365,8 +369,7 @@ zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl,
         * healthy ones so we need to confirm the actual state value.
         */
        if (strcmp(class, "resource.fs.zfs.statechange") == 0 &&
-           nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE,
-           &state) == 0 && state == VDEV_STATE_HEALTHY) {
+           state == VDEV_STATE_HEALTHY) {
                zfs_vdev_repair(hdl, nvl);
                return;
        }
index 6f9c0b309467e8c92cdf2bd1ff9f1990aee4fb22..c7cfd1ddba80204fee7df98df3886330d6374c9f 100755 (executable)
@@ -5,10 +5,12 @@
 # Exit codes:
 # 1: Internal error
 # 2: Script wasn't enabled in zed.rc
+# 3: Scrubs are automatically started for sequential resilvers
 [ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc"
 . "${ZED_ZEDLET_DIR}/zed-functions.sh"
 
 [ "${ZED_SCRUB_AFTER_RESILVER}" = "1" ] || exit 2
+[ "${ZEVENT_RESILVER_TYPE}" != "sequential" ] || exit 3
 [ -n "${ZEVENT_POOL}" ] || exit 1
 [ -n "${ZEVENT_SUBCLASS}" ] || exit 1
 zed_check_cmd "${ZPOOL}" || exit 1
index ee6c479eb3bdb5df2e39f1e18d53f3eb1eeac987..cdf5511fe75b1507ef69242e52828fcd0eb1c44d 100644 (file)
@@ -337,7 +337,7 @@ get_usage(zpool_help_t idx)
                return (gettext("\tadd [-fgLnP] [-o property=value] "
                    "<pool> <vdev> ...\n"));
        case HELP_ATTACH:
-               return (gettext("\tattach [-fw] [-o property=value] "
+               return (gettext("\tattach [-fsw] [-o property=value] "
                    "<pool> <device> <new-device>\n"));
        case HELP_CLEAR:
                return (gettext("\tclear [-nF] <pool> [device]\n"));
@@ -380,7 +380,7 @@ get_usage(zpool_help_t idx)
        case HELP_ONLINE:
                return (gettext("\tonline [-e] <pool> <device> ...\n"));
        case HELP_REPLACE:
-               return (gettext("\treplace [-fw] [-o property=value] "
+               return (gettext("\treplace [-fsw] [-o property=value] "
                    "<pool> <device> [new-device]\n"));
        case HELP_REMOVE:
                return (gettext("\tremove [-npsw] <pool> <device> ...\n"));
@@ -2077,10 +2077,10 @@ health_str_to_color(const char *health)
  */
 static void
 print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
-    nvlist_t *nv, int depth, boolean_t isspare)
+    nvlist_t *nv, int depth, boolean_t isspare, vdev_rebuild_stat_t *vrs)
 {
        nvlist_t **child, *root;
-       uint_t c, children;
+       uint_t c, i, children;
        pool_scan_stat_t *ps = NULL;
        vdev_stat_t *vs;
        char rbuf[6], wbuf[6], cbuf[6];
@@ -2266,6 +2266,14 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
                }
        }
 
+       /* The top-level vdevs have the rebuild stats */
+       if (vrs != NULL && vrs->vrs_state == VDEV_REBUILD_ACTIVE &&
+           children == 0) {
+               if (vs->vs_rebuild_processed != 0) {
+                       (void) printf(gettext("  (resilvering)"));
+               }
+       }
+
        if (cb->vcdl != NULL) {
                if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) {
                        printf("  ");
@@ -2295,11 +2303,17 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name,
                if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS))
                        continue;
 
+               /* Provide vdev_rebuild_stats to children if available */
+               if (vrs == NULL) {
+                       (void) nvlist_lookup_uint64_array(nv,
+                           ZPOOL_CONFIG_REBUILD_STATS,
+                           (uint64_t **)&vrs, &i);
+               }
+
                vname = zpool_vdev_name(g_zfs, zhp, child[c],
                    cb->cb_name_flags | VDEV_NAME_TYPE_ID);
-
                print_status_config(zhp, cb, vname, child[c], depth + 2,
-                   isspare);
+                   isspare, vrs);
                free(vname);
        }
 }
@@ -2468,7 +2482,7 @@ print_class_vdevs(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv,
                    cb->cb_name_flags | VDEV_NAME_TYPE_ID);
                if (cb->cb_print_status)
                        print_status_config(zhp, cb, name, child[c], 2,
-                           B_FALSE);
+                           B_FALSE, NULL);
                else
                        print_import_config(cb, name, child[c], 2);
                free(name);
@@ -2622,6 +2636,7 @@ show_import(nvlist_t *config)
                break;
 
        case ZPOOL_STATUS_RESILVERING:
+       case ZPOOL_STATUS_REBUILDING:
                printf_color(ANSI_BOLD, gettext("status: "));
                printf_color(ANSI_YELLOW, gettext("One or more devices were "
                    "being resilvered.\n"));
@@ -6118,6 +6133,7 @@ static int
 zpool_do_attach_or_replace(int argc, char **argv, int replacing)
 {
        boolean_t force = B_FALSE;
+       boolean_t rebuild = B_FALSE;
        boolean_t wait = B_FALSE;
        int c;
        nvlist_t *nvroot;
@@ -6128,7 +6144,7 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing)
        int ret;
 
        /* check options */
-       while ((c = getopt(argc, argv, "fo:w")) != -1) {
+       while ((c = getopt(argc, argv, "fo:sw")) != -1) {
                switch (c) {
                case 'f':
                        force = B_TRUE;
@@ -6146,6 +6162,9 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing)
                            (add_prop_list(optarg, propval, &props, B_TRUE)))
                                usage(B_FALSE);
                        break;
+               case 's':
+                       rebuild = B_TRUE;
+                       break;
                case 'w':
                        wait = B_TRUE;
                        break;
@@ -6230,7 +6249,8 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing)
                return (1);
        }
 
-       ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing);
+       ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing,
+           rebuild);
 
        if (ret == 0 && wait)
                ret = zpool_wait(zhp,
@@ -6244,9 +6264,10 @@ zpool_do_attach_or_replace(int argc, char **argv, int replacing)
 }
 
 /*
- * zpool replace [-fw] [-o property=value] <pool> <device> <new_device>
+ * zpool replace [-fsw] [-o property=value] <pool> <device> <new_device>
  *
  *     -f      Force attach, even if <new_device> appears to be in use.
+ *     -s      Use sequential instead of healing reconstruction for resilver.
  *     -o      Set property=value.
  *     -w      Wait for replacing to complete before returning
  *
@@ -6260,9 +6281,10 @@ zpool_do_replace(int argc, char **argv)
 }
 
 /*
- * zpool attach [-fw] [-o property=value] <pool> <device> <new_device>
+ * zpool attach [-fsw] [-o property=value] <pool> <device> <new_device>
  *
  *     -f      Force attach, even if <new_device> appears to be in use.
+ *     -s      Use sequential instead of healing reconstruction for resilver.
  *     -o      Set property=value.
  *     -w      Wait for resilvering to complete before returning
  *
@@ -7131,20 +7153,41 @@ zpool_do_trim(int argc, char **argv)
        return (error);
 }
 
+/*
+ * Converts a total number of seconds to a human-readable string broken
+ * down into days/hours/minutes/seconds.
+ */
+static void
+secs_to_dhms(uint64_t total, char *buf)
+{
+       uint64_t days = total / 60 / 60 / 24;
+       uint64_t hours = (total / 60 / 60) % 24;
+       uint64_t mins = (total / 60) % 60;
+       uint64_t secs = (total % 60);
+
+       if (days > 0) {
+               (void) sprintf(buf, "%llu days %02llu:%02llu:%02llu",
+                   (u_longlong_t)days, (u_longlong_t)hours,
+                   (u_longlong_t)mins, (u_longlong_t)secs);
+       } else {
+               (void) sprintf(buf, "%02llu:%02llu:%02llu",
+                   (u_longlong_t)hours, (u_longlong_t)mins,
+                   (u_longlong_t)secs);
+       }
+}
+
 /*
  * Print out detailed scrub status.
  */
 static void
-print_scan_status(pool_scan_stat_t *ps)
+print_scan_scrub_resilver_status(pool_scan_stat_t *ps)
 {
        time_t start, end, pause;
-       uint64_t total_secs_left;
-       uint64_t elapsed, secs_left, mins_left, hours_left, days_left;
        uint64_t pass_scanned, scanned, pass_issued, issued, total;
-       uint64_t scan_rate, issue_rate;
+       uint64_t elapsed, scan_rate, issue_rate;
        double fraction_done;
        char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7];
-       char srate_buf[7], irate_buf[7];
+       char srate_buf[7], irate_buf[7], time_buf[32];
 
        printf("  ");
        printf_color(ANSI_BOLD, gettext("scan:"));
@@ -7168,26 +7211,18 @@ print_scan_status(pool_scan_stat_t *ps)
 
        /* Scan is finished or canceled. */
        if (ps->pss_state == DSS_FINISHED) {
-               total_secs_left = end - start;
-               days_left = total_secs_left / 60 / 60 / 24;
-               hours_left = (total_secs_left / 60 / 60) % 24;
-               mins_left = (total_secs_left / 60) % 60;
-               secs_left = (total_secs_left % 60);
+               secs_to_dhms(end - start, time_buf);
 
                if (ps->pss_func == POOL_SCAN_SCRUB) {
                        (void) printf(gettext("scrub repaired %s "
-                           "in %llu days %02llu:%02llu:%02llu "
-                           "with %llu errors on %s"), processed_buf,
-                           (u_longlong_t)days_left, (u_longlong_t)hours_left,
-                           (u_longlong_t)mins_left, (u_longlong_t)secs_left,
-                           (u_longlong_t)ps->pss_errors, ctime(&end));
+                           "in %s with %llu errors on %s"), processed_buf,
+                           time_buf, (u_longlong_t)ps->pss_errors,
+                           ctime(&end));
                } else if (ps->pss_func == POOL_SCAN_RESILVER) {
                        (void) printf(gettext("resilvered %s "
-                           "in %llu days %02llu:%02llu:%02llu "
-                           "with %llu errors on %s"), processed_buf,
-                           (u_longlong_t)days_left, (u_longlong_t)hours_left,
-                           (u_longlong_t)mins_left, (u_longlong_t)secs_left,
-                           (u_longlong_t)ps->pss_errors, ctime(&end));
+                           "in %s with %llu errors on %s"), processed_buf,
+                           time_buf, (u_longlong_t)ps->pss_errors,
+                           ctime(&end));
                }
                return;
        } else if (ps->pss_state == DSS_CANCELED) {
@@ -7235,13 +7270,9 @@ print_scan_status(pool_scan_stat_t *ps)
 
        scan_rate = pass_scanned / elapsed;
        issue_rate = pass_issued / elapsed;
-       total_secs_left = (issue_rate != 0 && total >= issued) ?
+       uint64_t total_secs_left = (issue_rate != 0 && total >= issued) ?
            ((total - issued) / issue_rate) : UINT64_MAX;
-
-       days_left = total_secs_left / 60 / 60 / 24;
-       hours_left = (total_secs_left / 60 / 60) % 24;
-       mins_left = (total_secs_left / 60) % 60;
-       secs_left = (total_secs_left % 60);
+       secs_to_dhms(total_secs_left, time_buf);
 
        /* format all of the numbers we will be reporting */
        zfs_nicebytes(scanned, scanned_buf, sizeof (scanned_buf));
@@ -7271,10 +7302,84 @@ print_scan_status(pool_scan_stat_t *ps)
        if (pause == 0) {
                if (total_secs_left != UINT64_MAX &&
                    issue_rate >= 10 * 1024 * 1024) {
-                       (void) printf(gettext(", %llu days "
-                           "%02llu:%02llu:%02llu to go\n"),
-                           (u_longlong_t)days_left, (u_longlong_t)hours_left,
-                           (u_longlong_t)mins_left, (u_longlong_t)secs_left);
+                       (void) printf(gettext(", %s to go\n"), time_buf);
+               } else {
+                       (void) printf(gettext(", no estimated "
+                           "completion time\n"));
+               }
+       } else {
+               (void) printf(gettext("\n"));
+       }
+}
+
+static void
+print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name)
+{
+       if (vrs == NULL || vrs->vrs_state == VDEV_REBUILD_NONE)
+               return;
+
+       printf("  ");
+       printf_color(ANSI_BOLD, gettext("scan:"));
+       printf(" ");
+
+       uint64_t bytes_scanned = vrs->vrs_bytes_scanned;
+       uint64_t bytes_issued = vrs->vrs_bytes_issued;
+       uint64_t bytes_rebuilt = vrs->vrs_bytes_rebuilt;
+       uint64_t bytes_est = vrs->vrs_bytes_est;
+       uint64_t scan_rate = (vrs->vrs_pass_bytes_scanned /
+           (vrs->vrs_pass_time_ms + 1)) * 1000;
+       uint64_t issue_rate = (vrs->vrs_pass_bytes_issued /
+           (vrs->vrs_pass_time_ms + 1)) * 1000;
+       double scan_pct = MIN((double)bytes_scanned * 100 /
+           (bytes_est + 1), 100);
+
+       /* Format all of the numbers we will be reporting */
+       char bytes_scanned_buf[7], bytes_issued_buf[7];
+       char bytes_rebuilt_buf[7], bytes_est_buf[7];
+       char scan_rate_buf[7], issue_rate_buf[7], time_buf[32];
+       zfs_nicebytes(bytes_scanned, bytes_scanned_buf,
+           sizeof (bytes_scanned_buf));
+       zfs_nicebytes(bytes_issued, bytes_issued_buf,
+           sizeof (bytes_issued_buf));
+       zfs_nicebytes(bytes_rebuilt, bytes_rebuilt_buf,
+           sizeof (bytes_rebuilt_buf));
+       zfs_nicebytes(bytes_est, bytes_est_buf, sizeof (bytes_est_buf));
+       zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf));
+       zfs_nicebytes(issue_rate, issue_rate_buf, sizeof (issue_rate_buf));
+
+       time_t start = vrs->vrs_start_time;
+       time_t end = vrs->vrs_end_time;
+
+       /* Rebuild is finished or canceled. */
+       if (vrs->vrs_state == VDEV_REBUILD_COMPLETE) {
+               secs_to_dhms(vrs->vrs_scan_time_ms / 1000, time_buf);
+               (void) printf(gettext("resilvered (%s) %s in %s "
+                   "with %llu errors on %s"), vdev_name, bytes_rebuilt_buf,
+                   time_buf, (u_longlong_t)vrs->vrs_errors, ctime(&end));
+               return;
+       } else if (vrs->vrs_state == VDEV_REBUILD_CANCELED) {
+               (void) printf(gettext("resilver (%s) canceled on %s"),
+                   vdev_name, ctime(&end));
+               return;
+       } else if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) {
+               (void) printf(gettext("resilver (%s) in progress since %s"),
+                   vdev_name, ctime(&start));
+       }
+
+       assert(vrs->vrs_state == VDEV_REBUILD_ACTIVE);
+
+       secs_to_dhms(MAX((int64_t)bytes_est - (int64_t)bytes_scanned, 0) /
+           MAX(scan_rate, 1), time_buf);
+
+       (void) printf(gettext("\t%s scanned at %s/s, %s issued %s/s, "
+           "%s total\n"), bytes_scanned_buf, scan_rate_buf,
+           bytes_issued_buf, issue_rate_buf, bytes_est_buf);
+       (void) printf(gettext("\t%s resilvered, %.2f%% done"),
+           bytes_rebuilt_buf, scan_pct);
+
+       if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) {
+               if (scan_rate >= 10 * 1024 * 1024) {
+                       (void) printf(gettext(", %s to go\n"), time_buf);
                } else {
                        (void) printf(gettext(", no estimated "
                            "completion time\n"));
@@ -7285,9 +7390,38 @@ print_scan_status(pool_scan_stat_t *ps)
 }
 
 /*
- * As we don't scrub checkpointed blocks, we want to warn the
- * user that we skipped scanning some blocks if a checkpoint exists
- * or existed at any time during the scan.
+ * Print rebuild status for top-level vdevs.
+ */
+static void
+print_rebuild_status(zpool_handle_t *zhp, nvlist_t *nvroot)
+{
+       nvlist_t **child;
+       uint_t children;
+
+       if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+           &child, &children) != 0)
+               children = 0;
+
+       for (uint_t c = 0; c < children; c++) {
+               vdev_rebuild_stat_t *vrs;
+               uint_t i;
+
+               if (nvlist_lookup_uint64_array(child[c],
+                   ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) {
+                       char *name = zpool_vdev_name(g_zfs, zhp,
+                           child[c], VDEV_NAME_TYPE_ID);
+                       print_rebuild_status_impl(vrs, name);
+                       free(name);
+               }
+       }
+}
+
+/*
+ * As we don't scrub checkpointed blocks, we want to warn the user that we
+ * skipped scanning some blocks if a checkpoint exists or existed at any
+ * time during the scan.  If sequential rather than healing reconstruction
+ * was performed, the blocks were reconstructed.  However, their checksums
+ * have not been verified, so we still print the warning.
  */
 static void
 print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs)
@@ -7318,6 +7452,95 @@ print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs)
        }
 }
 
+/*
+ * Returns B_TRUE if there is an active rebuild in progress.  Otherwise,
+ * B_FALSE is returned and 'rebuild_end_time' is set to the end time for
+ * the last completed (or canceled) rebuild.
+ */
+static boolean_t
+check_rebuilding(nvlist_t *nvroot, uint64_t *rebuild_end_time)
+{
+       nvlist_t **child;
+       uint_t children;
+       boolean_t rebuilding = B_FALSE;
+       uint64_t end_time = 0;
+
+       if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+           &child, &children) != 0)
+               children = 0;
+
+       for (uint_t c = 0; c < children; c++) {
+               vdev_rebuild_stat_t *vrs;
+               uint_t i;
+
+               if (nvlist_lookup_uint64_array(child[c],
+                   ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) {
+
+                       if (vrs->vrs_end_time > end_time)
+                               end_time = vrs->vrs_end_time;
+
+                       if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) {
+                               rebuilding = B_TRUE;
+                               end_time = 0;
+                               break;
+                       }
+               }
+       }
+
+       if (rebuild_end_time != NULL)
+               *rebuild_end_time = end_time;
+
+       return (rebuilding);
+}
+
+/*
+ * Print the scan status.
+ */
+static void
+print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot)
+{
+       uint64_t rebuild_end_time = 0, resilver_end_time = 0;
+       boolean_t have_resilver = B_FALSE, have_scrub = B_FALSE;
+       boolean_t active_resilver = B_FALSE;
+       pool_checkpoint_stat_t *pcs = NULL;
+       pool_scan_stat_t *ps = NULL;
+       uint_t c;
+
+       if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS,
+           (uint64_t **)&ps, &c) == 0) {
+               if (ps->pss_func == POOL_SCAN_RESILVER) {
+                       resilver_end_time = ps->pss_end_time;
+                       active_resilver = (ps->pss_state == DSS_SCANNING);
+               }
+
+               have_resilver = (ps->pss_func == POOL_SCAN_RESILVER);
+               have_scrub = (ps->pss_func == POOL_SCAN_SCRUB);
+       }
+
+       boolean_t active_rebuild = check_rebuilding(nvroot, &rebuild_end_time);
+       boolean_t have_rebuild = (active_rebuild || (rebuild_end_time > 0));
+
+       /* Always print the scrub status when available. */
+       if (have_scrub)
+               print_scan_scrub_resilver_status(ps);
+
+       /*
+        * When there is an active resilver or rebuild print its status.
+        * Otherwise print the status of the last resilver or rebuild.
+        */
+       if (active_resilver || (!active_rebuild && have_resilver &&
+           resilver_end_time && resilver_end_time > rebuild_end_time)) {
+               print_scan_scrub_resilver_status(ps);
+       } else if (active_rebuild || (!active_resilver && have_rebuild &&
+           rebuild_end_time && rebuild_end_time > resilver_end_time)) {
+               print_rebuild_status(zhp, nvroot);
+       }
+
+       (void) nvlist_lookup_uint64_array(nvroot,
+           ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c);
+       print_checkpoint_scan_warning(ps, pcs);
+}
+
 /*
  * Print out detailed removal status.
  */
@@ -7504,7 +7727,7 @@ print_spares(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **spares,
        for (i = 0; i < nspares; i++) {
                name = zpool_vdev_name(g_zfs, zhp, spares[i],
                    cb->cb_name_flags);
-               print_status_config(zhp, cb, name, spares[i], 2, B_TRUE);
+               print_status_config(zhp, cb, name, spares[i], 2, B_TRUE, NULL);
                free(name);
        }
 }
@@ -7524,7 +7747,8 @@ print_l2cache(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **l2cache,
        for (i = 0; i < nl2cache; i++) {
                name = zpool_vdev_name(g_zfs, zhp, l2cache[i],
                    cb->cb_name_flags);
-               print_status_config(zhp, cb, name, l2cache[i], 2, B_FALSE);
+               print_status_config(zhp, cb, name, l2cache[i], 2,
+                   B_FALSE, NULL);
                free(name);
        }
 }
@@ -7718,6 +7942,7 @@ status_callback(zpool_handle_t *zhp, void *data)
                break;
 
        case ZPOOL_STATUS_RESILVERING:
+       case ZPOOL_STATUS_REBUILDING:
                printf_color(ANSI_BOLD, gettext("status: "));
                printf_color(ANSI_YELLOW, gettext("One or more devices is "
                    "currently being resilvered.  The pool will\n\tcontinue "
@@ -7727,6 +7952,16 @@ status_callback(zpool_handle_t *zhp, void *data)
                    "complete.\n"));
                break;
 
+       case ZPOOL_STATUS_REBUILD_SCRUB:
+               printf_color(ANSI_BOLD, gettext("status: "));
+               printf_color(ANSI_YELLOW, gettext("One or more devices have "
+                   "been sequentially resilvered, scrubbing\n\tthe pool "
+                   "is recommended.\n"));
+               printf_color(ANSI_BOLD, gettext("action: "));
+               printf_color(ANSI_YELLOW, gettext("Use 'zpool scrub' to "
+                   "verify all data checksums.\n"));
+               break;
+
        case ZPOOL_STATUS_CORRUPT_DATA:
                printf_color(ANSI_BOLD, gettext("status: "));
                printf_color(ANSI_YELLOW, gettext("One or more devices has "
@@ -7951,18 +8186,16 @@ status_callback(zpool_handle_t *zhp, void *data)
                nvlist_t **spares, **l2cache;
                uint_t nspares, nl2cache;
                pool_checkpoint_stat_t *pcs = NULL;
-               pool_scan_stat_t *ps = NULL;
                pool_removal_stat_t *prs = NULL;
 
-               (void) nvlist_lookup_uint64_array(nvroot,
-                   ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c);
-               (void) nvlist_lookup_uint64_array(nvroot,
-                   ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c);
+               print_scan_status(zhp, nvroot);
+
                (void) nvlist_lookup_uint64_array(nvroot,
                    ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c);
-               print_scan_status(ps);
-               print_checkpoint_scan_warning(ps, pcs);
                print_removal_status(zhp, prs);
+
+               (void) nvlist_lookup_uint64_array(nvroot,
+                   ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c);
                print_checkpoint_status(pcs);
 
                cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0,
@@ -7987,7 +8220,7 @@ status_callback(zpool_handle_t *zhp, void *data)
                printf("\n");
 
                print_status_config(zhp, cbp, zpool_get_name(zhp), nvroot, 0,
-                   B_FALSE);
+                   B_FALSE, NULL);
 
                print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_DEDUP);
                print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_SPECIAL);
@@ -9543,6 +9776,36 @@ vdev_activity_remaining(nvlist_t *nv, zpool_wait_activity_t activity)
        return (bytes_remaining);
 }
 
+/* Add up the total number of bytes left to rebuild across top-level vdevs */
+static uint64_t
+vdev_activity_top_remaining(nvlist_t *nv)
+{
+       uint64_t bytes_remaining = 0;
+       nvlist_t **child;
+       uint_t children;
+       int error;
+
+       if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
+           &child, &children) != 0)
+               children = 0;
+
+       for (uint_t c = 0; c < children; c++) {
+               vdev_rebuild_stat_t *vrs;
+               uint_t i;
+
+               error = nvlist_lookup_uint64_array(child[c],
+                   ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i);
+               if (error == 0) {
+                       if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) {
+                               bytes_remaining += (vrs->vrs_bytes_est -
+                                   vrs->vrs_bytes_rebuilt);
+                       }
+               }
+       }
+
+       return (bytes_remaining);
+}
+
 /* Whether any vdevs are 'spare' or 'replacing' vdevs */
 static boolean_t
 vdev_any_spare_replacing(nvlist_t *nv)
@@ -9652,6 +9915,9 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row)
                        bytes_rem[ZPOOL_WAIT_SCRUB] = rem;
                else
                        bytes_rem[ZPOOL_WAIT_RESILVER] = rem;
+       } else if (check_rebuilding(nvroot, NULL)) {
+               bytes_rem[ZPOOL_WAIT_RESILVER] =
+                   vdev_activity_top_remaining(nvroot);
        }
 
        bytes_rem[ZPOOL_WAIT_INITIALIZE] =
index ce748da189a31111b87b3fee166d0ec9ca9e4eef..ca38271cc4acbb996e7d06575188d10a8d39b0a4 100644 (file)
@@ -3507,7 +3507,16 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
        root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0,
            ashift, NULL, 0, 0, 1);
 
-       error = spa_vdev_attach(spa, oldguid, root, replacing);
+       /*
+        * When supported select either a healing or sequential resilver.
+        */
+       boolean_t rebuilding = B_FALSE;
+       if (pvd->vdev_ops == &vdev_mirror_ops ||
+           pvd->vdev_ops ==  &vdev_root_ops) {
+               rebuilding = !!ztest_random(2);
+       }
+
+       error = spa_vdev_attach(spa, oldguid, root, replacing, rebuilding);
 
        nvlist_free(root);
 
@@ -3527,10 +3536,11 @@ ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id)
                expected_error = error;
 
        if (error == ZFS_ERR_CHECKPOINT_EXISTS ||
-           error == ZFS_ERR_DISCARDING_CHECKPOINT)
+           error == ZFS_ERR_DISCARDING_CHECKPOINT ||
+           error == ZFS_ERR_RESILVER_IN_PROGRESS ||
+           error == ZFS_ERR_REBUILD_IN_PROGRESS)
                expected_error = error;
 
-       /* XXX workaround 6690467 */
        if (error != expected_error && expected_error != EBUSY) {
                fatal(0, "attach (%s %llu, %s %llu, %d) "
                    "returned %d, expected %d",
index e405ddb57bb6fc4ddd48ca34747cbe427e7fc2a2..c7f813d1927c944ff063ffb77f2eb1cbfd25bbd7 100644 (file)
@@ -368,7 +368,6 @@ AC_CONFIG_FILES([
        tests/zfs-tests/tests/functional/rename_dirs/Makefile
        tests/zfs-tests/tests/functional/replacement/Makefile
        tests/zfs-tests/tests/functional/reservation/Makefile
-       tests/zfs-tests/tests/functional/resilver/Makefile
        tests/zfs-tests/tests/functional/rootpool/Makefile
        tests/zfs-tests/tests/functional/rsend/Makefile
        tests/zfs-tests/tests/functional/scrub_mirror/Makefile
index 5c285164b976857bbe2fe8489f29ed2f79d8f476..50dca67f3a6fe4f4a57d277b821acaa45cab51bf 100644 (file)
@@ -95,6 +95,8 @@ zfs_errno = enum_with_offset(1024, [
         'ZFS_ERR_EXPORT_IN_PROGRESS',
         'ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR',
         'ZFS_ERR_STREAM_TRUNCATED',
+        'ZFS_ERR_RESILVER_IN_PROGRESS',
+        'ZFS_ERR_REBUILD_IN_PROGRESS',
     ],
     {}
 )
index 64a0a203501ae8bda5dbe6ff0e127b01da816f8a..873e8f3046fb1384de823a883c0ad9b49aa7e47b 100644 (file)
@@ -79,7 +79,7 @@ typedef enum zfs_error {
        EZFS_NODEVICE,          /* no such device in pool */
        EZFS_BADDEV,            /* invalid device to add */
        EZFS_NOREPLICAS,        /* no valid replicas */
-       EZFS_RESILVERING,       /* currently resilvering */
+       EZFS_RESILVERING,       /* resilvering (healing reconstruction) */
        EZFS_BADVERSION,        /* unsupported version */
        EZFS_POOLUNAVAIL,       /* pool is currently unavailable */
        EZFS_DEVOVERFLOW,       /* too many devices in one vdev */
@@ -148,6 +148,7 @@ typedef enum zfs_error {
        EZFS_TRIM_NOTSUP,       /* device does not support trim */
        EZFS_NO_RESILVER_DEFER, /* pool doesn't support resilver_defer */
        EZFS_EXPORT_IN_PROGRESS,        /* currently exporting the pool */
+       EZFS_REBUILDING,        /* resilvering (sequential reconstruction) */
        EZFS_UNKNOWN
 } zfs_error_t;
 
@@ -297,7 +298,7 @@ extern int zpool_vdev_online(zpool_handle_t *, const char *, int,
     vdev_state_t *);
 extern int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t);
 extern int zpool_vdev_attach(zpool_handle_t *, const char *,
-    const char *, nvlist_t *, int);
+    const char *, nvlist_t *, int, boolean_t);
 extern int zpool_vdev_detach(zpool_handle_t *, const char *);
 extern int zpool_vdev_remove(zpool_handle_t *, const char *);
 extern int zpool_vdev_remove_cancel(zpool_handle_t *);
@@ -387,6 +388,8 @@ typedef enum {
        ZPOOL_STATUS_RESILVERING,       /* device being resilvered */
        ZPOOL_STATUS_OFFLINE_DEV,       /* device offline */
        ZPOOL_STATUS_REMOVED_DEV,       /* removed device */
+       ZPOOL_STATUS_REBUILDING,        /* device being rebuilt */
+       ZPOOL_STATUS_REBUILD_SCRUB,     /* recommend scrubbing the pool */
 
        /*
         * Finally, the following indicates a healthy pool.
index ce781aa4cffe7c566bec2f99a6f5572288996e2e..0659c6419dfce87db3e3f8f2cb3716eeddebb2c0 100644 (file)
@@ -89,6 +89,7 @@ COMMON_H = \
        vdev_initialize.h \
        vdev_raidz.h \
        vdev_raidz_impl.h \
+       vdev_rebuild.h \
        vdev_removal.h \
        vdev_trim.h \
        xvattr.h \
index bcb896da373bdef161ba1e8d29fef4db539d70bb..8f929207d2d7de3f66d3380b63d5eb5bd8e70ed1 100644 (file)
@@ -42,6 +42,8 @@ struct dsl_dataset;
 struct dsl_pool;
 struct dmu_tx;
 
+extern int zfs_scan_suspend_progress;
+
 /*
  * All members of this structure must be uint64_t, for byteswap
  * purposes.
index 575a4af51439c955701c4732d0f169a3c7b06303..1bfd7a485abe410e928069cb9b25954fd7f13039 100644 (file)
@@ -704,6 +704,7 @@ typedef struct zpool_load_policy {
 #define        ZPOOL_CONFIG_SPLIT_LIST         "guid_list"
 #define        ZPOOL_CONFIG_REMOVING           "removing"
 #define        ZPOOL_CONFIG_RESILVER_TXG       "resilver_txg"
+#define        ZPOOL_CONFIG_REBUILD_TXG        "rebuild_txg"
 #define        ZPOOL_CONFIG_COMMENT            "comment"
 #define        ZPOOL_CONFIG_SUSPENDED          "suspended"     /* not stored on disk */
 #define        ZPOOL_CONFIG_SUSPENDED_REASON   "suspended_reason"      /* not stored */
@@ -730,6 +731,7 @@ typedef struct zpool_load_policy {
 #define        ZPOOL_CONFIG_MMP_HOSTID         "mmp_hostid"    /* not stored on disk */
 #define        ZPOOL_CONFIG_ALLOCATION_BIAS    "alloc_bias"    /* not stored on disk */
 #define        ZPOOL_CONFIG_EXPANSION_TIME     "expansion_time"        /* not stored */
+#define        ZPOOL_CONFIG_REBUILD_STATS      "org.openzfs:rebuild_stats"
 
 /*
  * The persistent vdev state is stored as separate values rather than a single
@@ -778,6 +780,9 @@ typedef struct zpool_load_policy {
 #define        VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS \
        "com.delphix:ms_unflushed_phys_txgs"
 
+#define        VDEV_TOP_ZAP_VDEV_REBUILD_PHYS \
+       "org.openzfs:vdev_rebuild"
+
 #define        VDEV_TOP_ZAP_ALLOCATION_BIAS \
        "org.zfsonlinux:allocation_bias"
 
@@ -991,6 +996,21 @@ typedef enum dsl_scan_state {
        DSS_NUM_STATES
 } dsl_scan_state_t;
 
+typedef struct vdev_rebuild_stat {
+       uint64_t vrs_state;             /* vdev_rebuild_state_t */
+       uint64_t vrs_start_time;        /* time_t */
+       uint64_t vrs_end_time;          /* time_t */
+       uint64_t vrs_scan_time_ms;      /* total run time (millisecs) */
+       uint64_t vrs_bytes_scanned;     /* allocated bytes scanned */
+       uint64_t vrs_bytes_issued;      /* read bytes issued */
+       uint64_t vrs_bytes_rebuilt;     /* rebuilt bytes */
+       uint64_t vrs_bytes_est;         /* total bytes to scan */
+       uint64_t vrs_errors;            /* scanning errors */
+       uint64_t vrs_pass_time_ms;      /* pass run time (millisecs) */
+       uint64_t vrs_pass_bytes_scanned; /* bytes scanned since start/resume */
+       uint64_t vrs_pass_bytes_issued; /* bytes issued since start/resume */
+} vdev_rebuild_stat_t;
+
 /*
  * Errata described by https://zfsonlinux.org/msg/ZFS-8000-ER.  The ordering
  * of this enum must be maintained to ensure the errata identifiers map to
@@ -1047,6 +1067,7 @@ typedef struct vdev_stat {
        uint64_t        vs_trim_bytes_est;      /* total bytes to trim */
        uint64_t        vs_trim_state;          /* vdev_trim_state_t */
        uint64_t        vs_trim_action_time;    /* time_t */
+       uint64_t        vs_rebuild_processed;   /* bytes rebuilt */
 } vdev_stat_t;
 
 /*
@@ -1178,6 +1199,13 @@ typedef enum {
        VDEV_TRIM_COMPLETE,
 } vdev_trim_state_t;
 
+typedef enum {
+       VDEV_REBUILD_NONE,
+       VDEV_REBUILD_ACTIVE,
+       VDEV_REBUILD_CANCELED,
+       VDEV_REBUILD_COMPLETE,
+} vdev_rebuild_state_t;
+
 /*
  * nvlist name constants. Facilitate restricting snapshot iteration range for
  * the "list next snapshot" ioctl
@@ -1337,6 +1365,8 @@ typedef enum {
        ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR,
        ZFS_ERR_STREAM_TRUNCATED,
        ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH,
+       ZFS_ERR_RESILVER_IN_PROGRESS,
+       ZFS_ERR_REBUILD_IN_PROGRESS,
 } zfs_errno_t;
 
 /*
@@ -1478,7 +1508,12 @@ typedef enum {
  * given payloads:
  *
  *     ESC_ZFS_RESILVER_START
- *     ESC_ZFS_RESILVER_END
+ *     ESC_ZFS_RESILVER_FINISH
+ *
+ *             ZFS_EV_POOL_NAME        DATA_TYPE_STRING
+ *             ZFS_EV_POOL_GUID        DATA_TYPE_UINT64
+ *             ZFS_EV_RESILVER_TYPE    DATA_TYPE_STRING
+ *
  *     ESC_ZFS_POOL_DESTROY
  *     ESC_ZFS_POOL_REGUID
  *
@@ -1532,6 +1567,7 @@ typedef enum {
 #define        ZFS_EV_HIST_IOCTL       "history_ioctl"
 #define        ZFS_EV_HIST_DSNAME      "history_dsname"
 #define        ZFS_EV_HIST_DSID        "history_dsid"
+#define        ZFS_EV_RESILVER_TYPE    "resilver_type"
 
 #ifdef __cplusplus
 }
index 5806dda418c5974f7f01075e09d187cbab877a0a..9b96eb1f871fd200e3fa940008f00c263655d01a 100644 (file)
@@ -790,17 +790,12 @@ extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
 #define        SPA_ASYNC_AUTOTRIM_RESTART              0x400
 #define        SPA_ASYNC_L2CACHE_REBUILD               0x800
 #define        SPA_ASYNC_L2CACHE_TRIM                  0x1000
-
-/*
- * Controls the behavior of spa_vdev_remove().
- */
-#define        SPA_REMOVE_UNSPARE      0x01
-#define        SPA_REMOVE_DONE         0x02
+#define        SPA_ASYNC_REBUILD_DONE                  0x2000
 
 /* device manipulation */
 extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
 extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
-    int replacing);
+    int replacing, int rebuild);
 extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
     int replace_done);
 extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
@@ -988,6 +983,7 @@ extern int spa_config_held(spa_t *spa, int locks, krw_t rw);
 
 /* Pool vdev add/remove lock */
 extern uint64_t spa_vdev_enter(spa_t *spa);
+extern uint64_t spa_vdev_detach_enter(spa_t *spa, uint64_t guid);
 extern uint64_t spa_vdev_config_enter(spa_t *spa);
 extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg,
     int error, char *tag);
index 6481d5397792bab0ee4905fd1f5a22e3b8595d63..2c52cb666f73d7220ceb731e985287afed5cd0e8 100644 (file)
@@ -36,6 +36,7 @@
 #include <sys/spa_checkpoint.h>
 #include <sys/spa_log_spacemap.h>
 #include <sys/vdev.h>
+#include <sys/vdev_rebuild.h>
 #include <sys/vdev_removal.h>
 #include <sys/metaslab.h>
 #include <sys/dmu.h>
index d93ef78f164d27ef384518dcb1793baeb117f41b..a7e88063657e87d02c74d06aeb0333bf9cfd72cb 100644 (file)
@@ -73,7 +73,7 @@ extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d,
 extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d);
 extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t off, size_t size);
 extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
-    int scrub_done);
+    boolean_t scrub_done, boolean_t rebuild_done);
 extern boolean_t vdev_dtl_required(vdev_t *vd);
 extern boolean_t vdev_resilver_needed(vdev_t *vd,
     uint64_t *minp, uint64_t *maxp);
index 56407a1914bc6350d32e0053e287dfd25c7ef8ae..b9298c62d62962c15477bacf0b586d590f539a15 100644 (file)
@@ -38,6 +38,7 @@
 #include <sys/uberblock_impl.h>
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/vdev_indirect_births.h>
+#include <sys/vdev_rebuild.h>
 #include <sys/vdev_removal.h>
 #include <sys/zfs_ratelimit.h>
 
@@ -295,13 +296,26 @@ struct vdev {
        uint64_t        vdev_trim_secure;       /* requested secure TRIM */
        uint64_t        vdev_trim_action_time;  /* start and end time */
 
-       /* for limiting outstanding I/Os (initialize and TRIM) */
+       /* Rebuild related */
+       boolean_t       vdev_rebuilding;
+       boolean_t       vdev_rebuild_exit_wanted;
+       boolean_t       vdev_rebuild_cancel_wanted;
+       boolean_t       vdev_rebuild_reset_wanted;
+       kmutex_t        vdev_rebuild_lock;
+       kcondvar_t      vdev_rebuild_cv;
+       kthread_t       *vdev_rebuild_thread;
+       vdev_rebuild_t  vdev_rebuild_config;
+
+       /* For limiting outstanding I/Os (initialize, TRIM, rebuild) */
        kmutex_t        vdev_initialize_io_lock;
        kcondvar_t      vdev_initialize_io_cv;
        uint64_t        vdev_initialize_inflight;
        kmutex_t        vdev_trim_io_lock;
        kcondvar_t      vdev_trim_io_cv;
        uint64_t        vdev_trim_inflight[3];
+       kmutex_t        vdev_rebuild_io_lock;
+       kcondvar_t      vdev_rebuild_io_cv;
+       uint64_t        vdev_rebuild_inflight;
 
        /*
         * Values stored in the config for an indirect or removing vdev.
@@ -358,6 +372,7 @@ struct vdev {
        uint64_t        vdev_degraded;  /* persistent degraded state    */
        uint64_t        vdev_removed;   /* persistent removed state     */
        uint64_t        vdev_resilver_txg; /* persistent resilvering state */
+       uint64_t        vdev_rebuild_txg; /* persistent rebuilding state */
        uint64_t        vdev_nparity;   /* number of parity devices for raidz */
        char            *vdev_path;     /* vdev path (if any)           */
        char            *vdev_devid;    /* vdev devid (if any)          */
diff --git a/include/sys/vdev_rebuild.h b/include/sys/vdev_rebuild.h
new file mode 100644 (file)
index 0000000..3d4b8cc
--- /dev/null
@@ -0,0 +1,97 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2018, Intel Corporation.
+ * Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+ */
+
+#ifndef        _SYS_VDEV_REBUILD_H
+#define        _SYS_VDEV_REBUILD_H
+
+#include <sys/spa.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Number of entries in the physical vdev_rebuild_phys structure.  This
+ * state is stored per top-level vdev as VDEV_TOP_ZAP_VDEV_REBUILD_PHYS.
+ */
+#define        REBUILD_PHYS_ENTRIES    12
+
+/*
+ * On-disk rebuild configuration and state.  When adding new fields they
+ * must be added to the end of the structure.
+ */
+typedef struct vdev_rebuild_phys {
+       uint64_t        vrp_rebuild_state;      /* vdev_rebuild_state_t */
+       uint64_t        vrp_last_offset;        /* last rebuilt offset */
+       uint64_t        vrp_min_txg;            /* minimum missing txg */
+       uint64_t        vrp_max_txg;            /* maximum missing txg */
+       uint64_t        vrp_start_time;         /* start time */
+       uint64_t        vrp_end_time;           /* end time */
+       uint64_t        vrp_scan_time_ms;       /* total run time in ms */
+       uint64_t        vrp_bytes_scanned;      /* alloc bytes scanned */
+       uint64_t        vrp_bytes_issued;       /* read bytes issued */
+       uint64_t        vrp_bytes_rebuilt;      /* rebuilt bytes */
+       uint64_t        vrp_bytes_est;          /* total bytes to scan */
+       uint64_t        vrp_errors;             /* errors during rebuild */
+} vdev_rebuild_phys_t;
+
+/*
+ * The vdev_rebuild_t describes the current state and how a top-level vdev
+ * should be rebuilt.  The core elements are the top-vdev, the metaslab being
+ * rebuilt, range tree containing the allocated extents and the on-disk state.
+ */
+typedef struct vdev_rebuild {
+       vdev_t          *vr_top_vdev;           /* top-level vdev to rebuild */
+       metaslab_t      *vr_scan_msp;           /* scanning disabled metaslab */
+       range_tree_t    *vr_scan_tree;          /* scan ranges (in metaslab) */
+
+       /* In-core state and progress */
+       uint64_t        vr_scan_offset[TXG_SIZE];
+       uint64_t        vr_prev_scan_time_ms;   /* any previous scan time */
+
+       /* Per-rebuild pass statistics for calculating bandwidth */
+       uint64_t        vr_pass_start_time;
+       uint64_t        vr_pass_bytes_scanned;
+       uint64_t        vr_pass_bytes_issued;
+
+       /* On-disk state updated by vdev_rebuild_zap_update_sync() */
+       vdev_rebuild_phys_t vr_rebuild_phys;
+} vdev_rebuild_t;
+
+boolean_t vdev_rebuild_active(vdev_t *);
+
+int vdev_rebuild_load(vdev_t *);
+void vdev_rebuild(vdev_t *);
+void vdev_rebuild_stop_wait(vdev_t *);
+void vdev_rebuild_stop_all(spa_t *);
+void vdev_rebuild_restart(spa_t *);
+void vdev_rebuild_clear_sync(void *, dmu_tx_t *);
+int vdev_rebuild_get_stats(vdev_t *, vdev_rebuild_stat_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_REBUILD_H */
index 0b422904ec5a4120f624a084ce0e778e3c91a81b..2d8e7fc36bae6f833caa739515a28ac705a68a52 100644 (file)
@@ -31,6 +31,7 @@ typedef enum zio_priority {
        ZIO_PRIORITY_REMOVAL,           /* reads/writes for vdev removal */
        ZIO_PRIORITY_INITIALIZING,      /* initializing I/O */
        ZIO_PRIORITY_TRIM,              /* trim I/O (discard) */
+       ZIO_PRIORITY_REBUILD,           /* reads/writes for vdev rebuild */
        ZIO_PRIORITY_NUM_QUEUEABLE,
        ZIO_PRIORITY_NOW,               /* non-queued i/os (e.g. free) */
 } zio_priority_t;
index 2d8767d5b9ae59f39f76a3d498ce0951617a9dc1..7e19a62e24ff71ca63bcb7462e20df1ea5fd7a8e 100644 (file)
@@ -74,6 +74,7 @@ typedef enum spa_feature {
        SPA_FEATURE_BOOKMARK_WRITTEN,
        SPA_FEATURE_LOG_SPACEMAP,
        SPA_FEATURE_LIVELIST,
+       SPA_FEATURE_DEVICE_REBUILD,
        SPA_FEATURES
 } spa_feature_t;
 
index 11b3d4cd9d0d3fd319a5865b6f00642571a2700a..f848cb3cfc24cdf769d838ff380212b67584f709 100644 (file)
@@ -2446,7 +2446,8 @@ zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd)
                    ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
                (void) nvlist_lookup_uint64_array(nvroot,
                    ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc);
-               if (ps && ps->pss_func == POOL_SCAN_SCRUB) {
+               if (ps && ps->pss_func == POOL_SCAN_SCRUB &&
+                   ps->pss_state == DSS_SCANNING) {
                        if (cmd == POOL_SCRUB_PAUSE)
                                return (zfs_error(hdl, EZFS_SCRUB_PAUSED, msg));
                        else
@@ -3128,8 +3129,8 @@ is_replacing_spare(nvlist_t *search, nvlist_t *tgt, int which)
  * If 'replacing' is specified, the new disk will replace the old one.
  */
 int
-zpool_vdev_attach(zpool_handle_t *zhp,
-    const char *old_disk, const char *new_disk, nvlist_t *nvroot, int replacing)
+zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk,
+    const char *new_disk, nvlist_t *nvroot, int replacing, boolean_t rebuild)
 {
        zfs_cmd_t zc = {"\0"};
        char msg[1024];
@@ -3164,6 +3165,14 @@ zpool_vdev_attach(zpool_handle_t *zhp,
 
        verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0);
        zc.zc_cookie = replacing;
+       zc.zc_simple = rebuild;
+
+       if (rebuild &&
+           zfeature_lookup_guid("org.openzfs:device_rebuild", NULL) != 0) {
+               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                   "the loaded zfs module doesn't support device rebuilds"));
+               return (zfs_error(hdl, EZFS_POOL_NOTSUP, msg));
+       }
 
        if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
            &child, &children) != 0 || children != 1) {
@@ -3224,16 +3233,21 @@ zpool_vdev_attach(zpool_handle_t *zhp,
                        uint64_t version = zpool_get_prop_int(zhp,
                            ZPOOL_PROP_VERSION, NULL);
 
-                       if (islog)
+                       if (islog) {
                                zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
                                    "cannot replace a log with a spare"));
-                       else if (version >= SPA_VERSION_MULTI_REPLACE)
+                       } else if (rebuild) {
+                               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                                   "only mirror vdevs support sequential "
+                                   "reconstruction"));
+                       } else if (version >= SPA_VERSION_MULTI_REPLACE) {
                                zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
                                    "already in replacing/spare config; wait "
                                    "for completion or use 'zpool detach'"));
-                       else
+                       } else {
                                zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
                                    "cannot replace a replacing device"));
+                       }
                } else {
                        zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
                            "can only attach to mirrors and top-level "
index ebf497db6423cc2a965c5cdf753f4369dc6a7e11..67b8ea33e9d429df66a190017c01d9e0084d5d1a 100644 (file)
@@ -84,6 +84,8 @@ static char *zfs_msgid_table[] = {
         *      ZPOOL_STATUS_RESILVERING
         *      ZPOOL_STATUS_OFFLINE_DEV
         *      ZPOOL_STATUS_REMOVED_DEV
+        *      ZPOOL_STATUS_REBUILDING
+        *      ZPOOL_STATUS_REBUILD_SCRUB
         *      ZPOOL_STATUS_OK
         */
 };
@@ -195,7 +197,7 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t))
  *     - Check for any data errors
  *     - Check for any faulted or missing devices in a replicated config
  *     - Look for any devices showing errors
- *     - Check for any resilvering devices
+ *     - Check for any resilvering or rebuilding devices
  *
  * There can obviously be multiple errors within a single pool, so this routine
  * only picks the most damaging of all the current errors to report.
@@ -233,6 +235,49 @@ check_status(nvlist_t *config, boolean_t isimport, zpool_errata_t *erratap)
            ps->pss_state == DSS_SCANNING)
                return (ZPOOL_STATUS_RESILVERING);
 
+       /*
+        * Currently rebuilding a vdev, check top-level vdevs.
+        */
+       vdev_rebuild_stat_t *vrs = NULL;
+       nvlist_t **child;
+       uint_t c, i, children;
+       uint64_t rebuild_end_time = 0;
+       if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+           &child, &children) == 0) {
+               for (c = 0; c < children; c++) {
+                       if ((nvlist_lookup_uint64_array(child[c],
+                           ZPOOL_CONFIG_REBUILD_STATS,
+                           (uint64_t **)&vrs, &i) == 0) && (vrs != NULL)) {
+                               uint64_t state = vrs->vrs_state;
+
+                               if (state == VDEV_REBUILD_ACTIVE) {
+                                       return (ZPOOL_STATUS_REBUILDING);
+                               } else if (state == VDEV_REBUILD_COMPLETE &&
+                                   vrs->vrs_end_time > rebuild_end_time) {
+                                       rebuild_end_time = vrs->vrs_end_time;
+                               }
+                       }
+               }
+
+               /*
+                * If we can determine when the last scrub was run, and it
+                * was before the last rebuild completed, then recommend
+                * that the pool be scrubbed to verify all checksums.  When
+                * ps is NULL we can infer the pool has never been scrubbed.
+                */
+               if (rebuild_end_time > 0) {
+                       if (ps != NULL) {
+                               if ((ps->pss_state == DSS_FINISHED &&
+                                   ps->pss_func == POOL_SCAN_SCRUB &&
+                                   rebuild_end_time > ps->pss_end_time) ||
+                                   ps->pss_state == DSS_NONE)
+                                       return (ZPOOL_STATUS_REBUILD_SCRUB);
+                       } else {
+                               return (ZPOOL_STATUS_REBUILD_SCRUB);
+                       }
+               }
+       }
+
        /*
         * The multihost property is set and the pool may be active.
         */
index 21bd8289c02570c8f98aa5094892d16dbf5a3dfa..2f4aaed3298697368c7a180f59f13208750f6946 100644 (file)
@@ -286,6 +286,9 @@ libzfs_error_description(libzfs_handle_t *hdl)
                    "resilver_defer feature"));
        case EZFS_EXPORT_IN_PROGRESS:
                return (dgettext(TEXT_DOMAIN, "pool export in progress"));
+       case EZFS_REBUILDING:
+               return (dgettext(TEXT_DOMAIN, "currently sequentially "
+                   "resilvering"));
        case EZFS_UNKNOWN:
                return (dgettext(TEXT_DOMAIN, "unknown error"));
        default:
@@ -693,6 +696,12 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...)
        case ZFS_ERR_EXPORT_IN_PROGRESS:
                zfs_verror(hdl, EZFS_EXPORT_IN_PROGRESS, fmt, ap);
                break;
+       case ZFS_ERR_RESILVER_IN_PROGRESS:
+               zfs_verror(hdl, EZFS_RESILVERING, fmt, ap);
+               break;
+       case ZFS_ERR_REBUILD_IN_PROGRESS:
+               zfs_verror(hdl, EZFS_REBUILDING, fmt, ap);
+               break;
        case ZFS_ERR_IOC_CMD_UNAVAIL:
                zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs "
                    "module does not support this operation. A reboot may "
index 46befa7d43fec7e40eb1e34321e8730c5abde024..06b89fe0a64fb80c2c4aa42502bd002ad6905abd 100644 (file)
@@ -132,6 +132,7 @@ KERNEL_C = \
        vdev_raidz_math_sse2.c \
        vdev_raidz_math_ssse3.c \
        vdev_raidz_math_powerpc_altivec.c \
+       vdev_rebuild.c \
        vdev_removal.c \
        vdev_root.c \
        vdev_trim.c \
index 687b85d0bd22935318991164f68d207253d17b09..3fbd3c67f825761fa5486f7feaaf9a7f39f75168 100644 (file)
@@ -1862,6 +1862,30 @@ queue's min_active.  See the section "ZFS I/O SCHEDULER".
 Default value: \fB1,000\fR.
 .RE
 
+.sp
+.ne 2
+.na
+\fBzfs_vdev_rebuild_max_active\fR (int)
+.ad
+.RS 12n
+Maximum sequential resilver I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
+.sp
+Default value: \fB3\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_rebuild_min_active\fR (int)
+.ad
+.RS 12n
+Minimum sequential resilver I/Os active to each device.
+See the section "ZFS I/O SCHEDULER".
+.sp
+Default value: \fB1\fR.
+.RE
+
 .sp
 .ne 2
 .na
@@ -2707,6 +2731,18 @@ Include cache hits in read history
 Use \fB1\fR for yes and \fB0\fR for no (default).
 .RE
 
+.sp
+.ne 2
+.na
+\fBzfs_rebuild_max_segment\fR (ulong)
+.ad
+.RS 12n
+Maximum read segment size to issue when sequentially resilvering a
+top-level vdev.
+.sp
+Default value: \fB1,048,576\fR.
+.RE
+
 .sp
 .ne 2
 .na
index e7a61957ff4f3bafcdd4af5bcd65744da3789ee8..3f690c3340c4bb63bef1fb4391fa682f08283e7b 100644 (file)
@@ -255,6 +255,35 @@ This feature becomes \fBactive\fR when a bookmark is created and will be
 returned to the \fBenabled\fR state when all bookmarks with these fields are destroyed.
 .RE
 
+.sp
+.ne 2
+.na
+\fBdevice_rebuild\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID   org.openzfs:device_rebuild
+READ\-ONLY COMPATIBLE  yes
+DEPENDENCIES   none
+.TE
+
+This feature enables the ability for the \fBzpool attach\fR and \fBzpool
+replace\fR subcommands to perform sequential reconstruction (instead of
+healing reconstruction) when resilvering.
+
+Sequential reconstruction resilvers a device in LBA order without immediately
+verifying the checksums.  Once complete a scrub is started which then verifies
+the checksums.  This approach allows full redundancy to be restored to the pool
+in the minimum amount of time.  This two phase approach will take longer than a
+healing resilver when the time to verify the checksums is included.  However,
+unless there is additional pool damage no checksum errors should be reported
+by the scrub.  This feature is incompatible with raidz configurations.
+
+This feature becomes \fBactive\fR while a sequential resilver is in progress,
+and returns to \fBenabled\fR when the resilver completes.
+.RE
+
 .sp
 .ne 2
 .na
index be0be4e076fc86e51c5fa4d56b067eb767efc5d3..585357b96d68efac9ec4950f94f7cc88a82d1b4a 100644 (file)
@@ -27,7 +27,7 @@
 .\" Copyright 2017 Nexenta Systems, Inc.
 .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
 .\"
-.Dd August 9, 2019
+.Dd May 15, 2020
 .Dt ZPOOL-ATTACH 8
 .Os Linux
 .Sh NAME
@@ -36,7 +36,7 @@
 .Sh SYNOPSIS
 .Nm
 .Cm attach
-.Op Fl fw
+.Op Fl fsw
 .Oo Fl o Ar property Ns = Ns Ar value Oc
 .Ar pool device new_device
 .Sh DESCRIPTION
@@ -44,7 +44,7 @@
 .It Xo
 .Nm
 .Cm attach
-.Op Fl fw
+.Op Fl fsw
 .Oo Fl o Ar property Ns = Ns Ar value Oc
 .Ar pool device new_device
 .Xc
@@ -68,22 +68,29 @@ is part of a two-way mirror, attaching
 creates a three-way mirror, and so on.
 In either case,
 .Ar new_device
-begins to resilver immediately.
+begins to resilver immediately and any running scrub is cancelled.
 .Bl -tag -width Ds
 .It Fl f
 Forces use of
 .Ar new_device ,
 even if it appears to be in use.
 Not all devices can be overridden in this manner.
-.It Fl w
-Waits until
-.Ar new_device
-has finished resilvering before returning.
 .It Fl o Ar property Ns = Ns Ar value
 Sets the given pool properties. See the
 .Xr zpoolprops 8
 manual page for a list of valid properties that can be set. The only property
 supported at the moment is ashift.
+.It Fl s
+The
+.Ar new_device
+is reconstructed sequentially to restore redundancy as quickly as possible.
+Checksums are not verified during sequential reconstruction so a scrub is
+started when the resilver completes.
+Sequential reconstruction is not supported for raidz configurations.
+.It Fl w
+Waits until
+.Ar new_device
+has finished resilvering before returning.
 .El
 .El
 .Sh SEE ALSO
index 933fb4ae92ab0e0f3e29ae1f1b45090a2a467fd0..5e639feaf7672a192f1636ddd4647e625f191eb1 100644 (file)
@@ -27,7 +27,7 @@
 .\" Copyright 2017 Nexenta Systems, Inc.
 .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
 .\"
-.Dd August 9, 2019
+.Dd May 15, 2020
 .Dt ZPOOL-REPLACE 8
 .Os Linux
 .Sh NAME
@@ -36,7 +36,7 @@
 .Sh SYNOPSIS
 .Nm
 .Cm replace
-.Op Fl fw
+.Op Fl fsw
 .Oo Fl o Ar property Ns = Ns Ar value Oc
 .Ar pool Ar device Op Ar new_device
 .Sh DESCRIPTION
@@ -44,7 +44,7 @@
 .It Xo
 .Nm
 .Cm replace
-.Op Fl fw
+.Op Fl fsw
 .Op Fl o Ar property Ns = Ns Ar value
 .Ar pool Ar device Op Ar new_device
 .Xc
@@ -56,6 +56,7 @@ This is equivalent to attaching
 .Ar new_device ,
 waiting for it to resilver, and then detaching
 .Ar old_device .
+Any in-progress scrub will be cancelled.
 .Pp
 The size of
 .Ar new_device
@@ -86,6 +87,13 @@ Sets the given pool properties. See the
 manual page for a list of valid properties that can be set.
 The only property supported at the moment is
 .Sy ashift .
+.It Fl s
+The
+.Ar new_device
+is reconstructed sequentially to restore redundancy as quickly as possible.
+Checksums are not verified during sequential reconstruction so a scrub is
+started when the resilver completes.
+Sequential reconstruction is not supported for raidz configurations.
 .It Fl w
 Waits until the replacement has completed before returning.
 .El
index 7364bf6357064939c380730f58fef7b9bfecac1a..66e33599578377c7849bff09c5056e7def29adf9 100644 (file)
@@ -27,7 +27,7 @@
 .\" Copyright 2017 Nexenta Systems, Inc.
 .\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
 .\"
-.Dd August 9, 2019
+.Dd May 15, 2020
 .Dt ZPOOL-STATUS 8
 .Os Linux
 .Sh NAME
@@ -59,7 +59,7 @@ is specified, then the status of each pool in the system is displayed.
 For more information on pool and device health, see the
 .Em Device Failure and Recovery
 section of
-.Xr zpoolconcepts 8.
+.Xr zpoolconcepts 8 .
 .Pp
 If a scrub or resilver is in progress, this command reports the percentage done
 and the estimated time to completion.
index 7c83113ac86a24fac150b86b538ce145e8fa8340..1ac9d00e7bf592401cc0109e6b8fa78fc47e0ece 100644 (file)
@@ -251,6 +251,7 @@ SRCS+=      abd.c \
        vdev_raidz.c \
        vdev_raidz_math.c \
        vdev_raidz_math_scalar.c \
+       vdev_rebuild.c \
        vdev_raidz_math_avx2.c \
        vdev_raidz_math_avx512bw.c \
        vdev_raidz_math_avx512f.c \
index cf3006721df0da5431ef5e161ef6ffffd7491bde..302d485703f4f351ee1196acd0fe84415e11a997 100644 (file)
@@ -570,6 +570,11 @@ zpool_feature_init(void)
            "com.datto:resilver_defer", "resilver_defer",
            "Support for deferring new resilvers when one is already running.",
            ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);
+
+       zfeature_register(SPA_FEATURE_DEVICE_REBUILD,
+           "org.openzfs:device_rebuild", "device_rebuild",
+           "Support for sequential device rebuilds",
+           ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);
 }
 
 #if defined(_KERNEL)
index 7ea976d129dd50eab362f2a1c78ed839a77ad64c..9ddcd6c339d4cc9e91bac66a8fa11283dbe1f596 100644 (file)
@@ -94,6 +94,7 @@ $(MODULE)-objs += vdev_queue.o
 $(MODULE)-objs += vdev_raidz.o
 $(MODULE)-objs += vdev_raidz_math.o
 $(MODULE)-objs += vdev_raidz_math_scalar.o
+$(MODULE)-objs += vdev_rebuild.o
 $(MODULE)-objs += vdev_removal.o
 $(MODULE)-objs += vdev_root.o
 $(MODULE)-objs += vdev_trim.o
index 895ffbf0a94e633f57de94c5f29d42dd3eb88023..712af664e90f6dcee72afe063eb414b72d5ce32c 100644 (file)
@@ -704,8 +704,9 @@ static int
 dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
 {
        dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+       vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
 
-       if (dsl_scan_is_running(scn))
+       if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd))
                return (SET_ERROR(EBUSY));
 
        return (0);
@@ -746,8 +747,12 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
 
                if (vdev_resilver_needed(spa->spa_root_vdev,
                    &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) {
-                       spa_event_notify(spa, NULL, NULL,
+                       nvlist_t *aux = fnvlist_alloc();
+                       fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE,
+                           "healing");
+                       spa_event_notify(spa, NULL, aux,
                            ESC_ZFS_RESILVER_START);
+                       nvlist_free(aux);
                } else {
                        spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START);
                }
@@ -761,6 +766,21 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
                if (scn->scn_phys.scn_min_txg > TXG_INITIAL)
                        scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO;
 
+               /*
+                * When starting a resilver clear any existing rebuild state.
+                * This is required to prevent stale rebuild status from
+                * being reported when a rebuild is run, then a resilver and
+                * finally a scrub.  In which case only the scrub status
+                * should be reported by 'zpool status'.
+                */
+               if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) {
+                       vdev_t *rvd = spa->spa_root_vdev;
+                       for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+                               vdev_t *vd = rvd->vdev_child[i];
+                               vdev_rebuild_clear_sync(
+                                   (void *)(uintptr_t)vd->vdev_id, tx);
+                       }
+               }
        }
 
        /* back to the generic stuff */
@@ -918,14 +938,22 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
                if (complete &&
                    !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
                        vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
-                           scn->scn_phys.scn_max_txg, B_TRUE);
-
-                       spa_event_notify(spa, NULL, NULL,
-                           scn->scn_phys.scn_min_txg ?
-                           ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH);
+                           scn->scn_phys.scn_max_txg, B_TRUE, B_FALSE);
+
+                       if (scn->scn_phys.scn_min_txg) {
+                               nvlist_t *aux = fnvlist_alloc();
+                               fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE,
+                                   "healing");
+                               spa_event_notify(spa, NULL, aux,
+                                   ESC_ZFS_RESILVER_FINISH);
+                               nvlist_free(aux);
+                       } else {
+                               spa_event_notify(spa, NULL, NULL,
+                                   ESC_ZFS_SCRUB_FINISH);
+                       }
                } else {
                        vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg,
-                           0, B_TRUE);
+                           0, B_TRUE, B_FALSE);
                }
                spa_errlog_rotate(spa);
 
index 943330886eecf7b1b834cb54d61c704bfac93363..6b60227d244fdb0bff2634709567334a45d27bad 100644 (file)
@@ -57,6 +57,7 @@
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/vdev_indirect_births.h>
 #include <sys/vdev_initialize.h>
+#include <sys/vdev_rebuild.h>
 #include <sys/vdev_trim.h>
 #include <sys/vdev_disk.h>
 #include <sys/metaslab.h>
@@ -1562,6 +1563,7 @@ spa_unload(spa_t *spa)
                vdev_initialize_stop_all(root_vdev, VDEV_INITIALIZE_ACTIVE);
                vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE);
                vdev_autotrim_stop_all(spa);
+               vdev_rebuild_stop_all(spa);
        }
 
        /*
@@ -4240,7 +4242,7 @@ spa_ld_load_vdev_metadata(spa_t *spa)
         * Propagate the leaf DTLs we just loaded all the way up the vdev tree.
         */
        spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
-       vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
+       vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE);
        spa_config_exit(spa, SCL_ALL, FTAG);
 
        return (0);
@@ -4829,11 +4831,16 @@ spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
                    update_config_cache);
 
                /*
-                * Check all DTLs to see if anything needs resilvering.
+                * Check if a rebuild was in progress and if so resume it.
+                * Then check all DTLs to see if anything needs resilvering.
+                * The resilver will be deferred if a rebuild was started.
                 */
-               if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
-                   vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
+               if (vdev_rebuild_active(spa->spa_root_vdev)) {
+                       vdev_rebuild_restart(spa);
+               } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
+                   vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
                        spa_async_request(spa, SPA_ASYNC_RESILVER);
+               }
 
                /*
                 * Log the fact that we booted up (so that we can detect if
@@ -6313,6 +6320,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
                        vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE);
                        vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE);
                        vdev_autotrim_stop_all(spa);
+                       vdev_rebuild_stop_all(spa);
                }
 
                /*
@@ -6536,12 +6544,17 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
  * extra rules: you can't attach to it after it's been created, and upon
  * completion of resilvering, the first disk (the one being replaced)
  * is automatically detached.
+ *
+ * If 'rebuild' is specified, then sequential reconstruction (a.k.a. rebuild)
+ * should be performed instead of traditional healing reconstruction.  From
+ * an administrator's perspective these are both resilver operations.
  */
 int
-spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
+spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
+    int rebuild)
 {
        uint64_t txg, dtl_max_txg;
-       vdev_t *rvd __maybe_unused = spa->spa_root_vdev;
+       vdev_t *rvd = spa->spa_root_vdev;
        vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
        vdev_ops_t *pvops;
        char *oldvdpath, *newvdpath;
@@ -6561,6 +6574,19 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
                return (spa_vdev_exit(spa, NULL, txg, error));
        }
 
+       if (rebuild) {
+               if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
+                       return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+               if (dsl_scan_resilvering(spa_get_dsl(spa)))
+                       return (spa_vdev_exit(spa, NULL, txg,
+                           ZFS_ERR_RESILVER_IN_PROGRESS));
+       } else {
+               if (vdev_rebuild_active(rvd))
+                       return (spa_vdev_exit(spa, NULL, txg,
+                           ZFS_ERR_REBUILD_IN_PROGRESS));
+       }
+
        if (spa->spa_vdev_removal != NULL)
                return (spa_vdev_exit(spa, NULL, txg, EBUSY));
 
@@ -6593,6 +6619,18 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
        if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
                return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
+       if (rebuild) {
+               /*
+                * For rebuilds, the parent vdev must support reconstruction
+                * using only space maps.  This means the only allowable
+                * parents are the root vdev or a mirror vdev.
+                */
+               if (pvd->vdev_ops != &vdev_mirror_ops &&
+                   pvd->vdev_ops != &vdev_root_ops) {
+                       return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
+               }
+       }
+
        if (!replacing) {
                /*
                 * For attach, the only allowable parent is a mirror or the root
@@ -6646,7 +6684,7 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
         * than the top-level vdev.
         */
        if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
-               return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
+               return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
        /*
         * If this is an in-place replacement, update oldvd's path and devid
@@ -6664,9 +6702,6 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
                }
        }
 
-       /* mark the device being resilvered */
-       newvd->vdev_resilver_txg = txg;
-
        /*
         * If the parent is not a mirror, or if we're replacing, insert the new
         * mirror/replacing/spare vdev above oldvd.
@@ -6704,8 +6739,8 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
         */
        dtl_max_txg = txg + TXG_CONCURRENT_STATES;
 
-       vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
-           dtl_max_txg - TXG_INITIAL);
+       vdev_dtl_dirty(newvd, DTL_MISSING,
+           TXG_INITIAL, dtl_max_txg - TXG_INITIAL);
 
        if (newvd->vdev_isspare) {
                spa_spare_activate(newvd);
@@ -6722,16 +6757,25 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
        vdev_dirty(tvd, VDD_DTL, newvd, txg);
 
        /*
-        * Schedule the resilver to restart in the future. We do this to
-        * ensure that dmu_sync-ed blocks have been stitched into the
-        * respective datasets. We do not do this if resilvers have been
-        * deferred.
+        * Schedule the resilver or rebuild to restart in the future. We do
+        * this to ensure that dmu_sync-ed blocks have been stitched into the
+        * respective datasets.
         */
-       if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
-           spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
-               vdev_defer_resilver(newvd);
-       else
-               dsl_scan_restart_resilver(spa->spa_dsl_pool, dtl_max_txg);
+       if (rebuild) {
+               newvd->vdev_rebuild_txg = txg;
+
+               vdev_rebuild(tvd);
+       } else {
+               newvd->vdev_resilver_txg = txg;
+
+               if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
+                   spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) {
+                       vdev_defer_resilver(newvd);
+               } else {
+                       dsl_scan_restart_resilver(spa->spa_dsl_pool,
+                           dtl_max_txg);
+               }
+       }
 
        if (spa->spa_bootfs)
                spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
@@ -6774,7 +6818,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
 
        ASSERT(spa_writeable(spa));
 
-       txg = spa_vdev_enter(spa);
+       txg = spa_vdev_detach_enter(spa, guid);
 
        vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
@@ -7728,6 +7772,12 @@ spa_vdev_resilver_done(spa_t *spa)
        }
 
        spa_config_exit(spa, SCL_ALL, FTAG);
+
+       /*
+        * If a detach was not performed above, replace waiters will not have
+        * been notified.  In which case we must do so now.
+        */
+       spa_notify_waiters(spa);
 }
 
 /*
@@ -7970,10 +8020,22 @@ spa_async_thread(void *arg)
        if (tasks & SPA_ASYNC_RESILVER_DONE)
                spa_vdev_resilver_done(spa);
 
+       /*
+        * If any devices are done replacing, detach them.  Then if no
+        * top-level vdevs are rebuilding attempt to kick off a scrub.
+        */
+       if (tasks & SPA_ASYNC_REBUILD_DONE) {
+               spa_vdev_resilver_done(spa);
+
+               if (!vdev_rebuild_active(spa->spa_root_vdev))
+                       (void) dsl_scan(spa->spa_dsl_pool, POOL_SCAN_SCRUB);
+       }
+
        /*
         * Kick off a resilver.
         */
        if (tasks & SPA_ASYNC_RESILVER &&
+           !vdev_rebuild_active(spa->spa_root_vdev) &&
            (!dsl_scan_resilvering(dp) ||
            !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)))
                dsl_scan_restart_resilver(dp, 0);
@@ -9470,6 +9532,9 @@ spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity,
                    DSS_SCANNING);
                break;
        case ZPOOL_WAIT_RESILVER:
+               if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev)))
+                       break;
+               /* fall through */
        case ZPOOL_WAIT_SCRUB:
        {
                boolean_t scanning, paused, is_scrub;
index 61cefa3dda434813412c803096af642bb81d40a5..4c884409afbdda8fbb0c67d9956666d26983529f 100644 (file)
@@ -1165,6 +1165,30 @@ spa_vdev_enter(spa_t *spa)
        return (spa_vdev_config_enter(spa));
 }
 
+/*
+ * The same as spa_vdev_enter() above but additionally takes the guid of
+ * the vdev being detached.  When there is a rebuild in progress it will be
+ * suspended while the vdev tree is modified then resumed by spa_vdev_exit().
+ * The rebuild is canceled if only a single child remains after the detach.
+ */
+uint64_t
+spa_vdev_detach_enter(spa_t *spa, uint64_t guid)
+{
+       mutex_enter(&spa->spa_vdev_top_lock);
+       mutex_enter(&spa_namespace_lock);
+
+       vdev_autotrim_stop_all(spa);
+
+       if (guid != 0) {
+               vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+               if (vd) {
+                       vdev_rebuild_stop_wait(vd->vdev_top);
+               }
+       }
+
+       return (spa_vdev_config_enter(spa));
+}
+
 /*
  * Internal implementation for spa_vdev_enter().  Used when a vdev
  * operation requires multiple syncs (i.e. removing a device) while
@@ -1198,7 +1222,7 @@ spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag)
        /*
         * Reassess the DTLs.
         */
-       vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE);
+       vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE, B_FALSE);
 
        if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
                config_changed = B_TRUE;
@@ -1271,6 +1295,7 @@ int
 spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
 {
        vdev_autotrim_restart(spa);
+       vdev_rebuild_restart(spa);
 
        spa_vdev_config_exit(spa, vd, txg, error, FTAG);
        mutex_exit(&spa_namespace_lock);
@@ -1322,7 +1347,7 @@ spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error)
        }
 
        if (vd != NULL || error == 0)
-               vdev_dtl_reassess(vdev_top, 0, 0, B_FALSE);
+               vdev_dtl_reassess(vdev_top, 0, 0, B_FALSE, B_FALSE);
 
        if (vd != NULL) {
                if (vd != spa->spa_root_vdev)
index 03360120ad4c06a223f2f4e343918aa23703d275..27ac17fea5eb539e8f83ca85f75fae828fd44ce0 100644 (file)
@@ -39,6 +39,7 @@
 #include <sys/dmu_tx.h>
 #include <sys/dsl_dir.h>
 #include <sys/vdev_impl.h>
+#include <sys/vdev_rebuild.h>
 #include <sys/uberblock_impl.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
@@ -551,10 +552,12 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
        mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL);
+
        mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL);
        cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL);
+
        mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL);
        mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL);
@@ -562,10 +565,16 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops)
        cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL);
        cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL);
 
+       mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL);
+       mutex_init(&vd->vdev_rebuild_io_lock, NULL, MUTEX_DEFAULT, NULL);
+       cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL);
+       cv_init(&vd->vdev_rebuild_io_cv, NULL, CV_DEFAULT, NULL);
+
        for (int t = 0; t < DTL_TYPES; t++) {
                vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0,
                    0);
        }
+
        txg_list_create(&vd->vdev_ms_list, spa,
            offsetof(struct metaslab, ms_txg_node));
        txg_list_create(&vd->vdev_dtl_list, spa,
@@ -835,6 +844,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id,
                (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
                    &vd->vdev_resilver_txg);
 
+               (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG,
+                   &vd->vdev_rebuild_txg);
+
                if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER))
                        vdev_defer_resilver(vd);
 
@@ -890,6 +902,7 @@ vdev_free(vdev_t *vd)
        ASSERT3P(vd->vdev_initialize_thread, ==, NULL);
        ASSERT3P(vd->vdev_trim_thread, ==, NULL);
        ASSERT3P(vd->vdev_autotrim_thread, ==, NULL);
+       ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
 
        /*
         * Scan queues are normally destroyed at the end of a scan. If the
@@ -998,10 +1011,12 @@ vdev_free(vdev_t *vd)
        mutex_destroy(&vd->vdev_stat_lock);
        mutex_destroy(&vd->vdev_probe_lock);
        mutex_destroy(&vd->vdev_scan_io_queue_lock);
+
        mutex_destroy(&vd->vdev_initialize_lock);
        mutex_destroy(&vd->vdev_initialize_io_lock);
        cv_destroy(&vd->vdev_initialize_io_cv);
        cv_destroy(&vd->vdev_initialize_cv);
+
        mutex_destroy(&vd->vdev_trim_lock);
        mutex_destroy(&vd->vdev_autotrim_lock);
        mutex_destroy(&vd->vdev_trim_io_lock);
@@ -1009,6 +1024,11 @@ vdev_free(vdev_t *vd)
        cv_destroy(&vd->vdev_autotrim_cv);
        cv_destroy(&vd->vdev_trim_io_cv);
 
+       mutex_destroy(&vd->vdev_rebuild_lock);
+       mutex_destroy(&vd->vdev_rebuild_io_lock);
+       cv_destroy(&vd->vdev_rebuild_cv);
+       cv_destroy(&vd->vdev_rebuild_io_cv);
+
        zfs_ratelimit_fini(&vd->vdev_delay_rl);
        zfs_ratelimit_fini(&vd->vdev_checksum_rl);
 
@@ -1078,7 +1098,10 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
        ASSERT3P(tvd->vdev_indirect_births, ==, NULL);
        ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL);
        ASSERT0(tvd->vdev_removing);
+       ASSERT0(tvd->vdev_rebuilding);
        tvd->vdev_removing = svd->vdev_removing;
+       tvd->vdev_rebuilding = svd->vdev_rebuilding;
+       tvd->vdev_rebuild_config = svd->vdev_rebuild_config;
        tvd->vdev_indirect_config = svd->vdev_indirect_config;
        tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping;
        tvd->vdev_indirect_births = svd->vdev_indirect_births;
@@ -1092,6 +1115,7 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd)
        svd->vdev_indirect_births = NULL;
        svd->vdev_obsolete_sm = NULL;
        svd->vdev_removing = 0;
+       svd->vdev_rebuilding = 0;
 
        for (t = 0; t < TXG_SIZE; t++) {
                while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL)
@@ -2576,11 +2600,8 @@ vdev_dtl_max(vdev_t *vd)
  * excise the DTLs.
  */
 static boolean_t
-vdev_dtl_should_excise(vdev_t *vd)
+vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done)
 {
-       spa_t *spa = vd->vdev_spa;
-       dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
-
        ASSERT0(vd->vdev_children);
 
        if (vd->vdev_state < VDEV_STATE_DEGRADED)
@@ -2589,23 +2610,52 @@ vdev_dtl_should_excise(vdev_t *vd)
        if (vd->vdev_resilver_deferred)
                return (B_FALSE);
 
-       if (vd->vdev_resilver_txg == 0 ||
-           range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
+       if (range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
                return (B_TRUE);
 
-       /*
-        * When a resilver is initiated the scan will assign the scn_max_txg
-        * value to the highest txg value that exists in all DTLs. If this
-        * device's max DTL is not part of this scan (i.e. it is not in
-        * the range (scn_min_txg, scn_max_txg] then it is not eligible
-        * for excision.
-        */
-       if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
-               ASSERT3U(scn->scn_phys.scn_min_txg, <=, vdev_dtl_min(vd));
-               ASSERT3U(scn->scn_phys.scn_min_txg, <, vd->vdev_resilver_txg);
-               ASSERT3U(vd->vdev_resilver_txg, <=, scn->scn_phys.scn_max_txg);
-               return (B_TRUE);
+       if (rebuild_done) {
+               vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
+               vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+               /* Rebuild not initiated by attach */
+               if (vd->vdev_rebuild_txg == 0)
+                       return (B_TRUE);
+
+               /*
+                * When a rebuild completes without error then all missing data
+                * up to the rebuild max txg has been reconstructed and the DTL
+                * is eligible for excision.
+                */
+               if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE &&
+                   vdev_dtl_max(vd) <= vrp->vrp_max_txg) {
+                       ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd));
+                       ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg);
+                       ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg);
+                       return (B_TRUE);
+               }
+       } else {
+               dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan;
+               dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys;
+
+               /* Resilver not initiated by attach */
+               if (vd->vdev_resilver_txg == 0)
+                       return (B_TRUE);
+
+               /*
+                * When a resilver is initiated the scan will assign the
+                * scn_max_txg value to the highest txg value that exists
+                * in all DTLs. If this device's max DTL is not part of this
+                * scan (i.e. it is not in the range (scn_min_txg, scn_max_txg]
+                * then it is not eligible for excision.
+                */
+               if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
+                       ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd));
+                       ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg);
+                       ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg);
+                       return (B_TRUE);
+               }
        }
+
        return (B_FALSE);
 }
 
@@ -2614,7 +2664,8 @@ vdev_dtl_should_excise(vdev_t *vd)
  * write operations will be issued to the pool.
  */
 void
-vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
+vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
+    boolean_t scrub_done, boolean_t rebuild_done)
 {
        spa_t *spa = vd->vdev_spa;
        avl_tree_t reftree;
@@ -2624,22 +2675,28 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
 
        for (int c = 0; c < vd->vdev_children; c++)
                vdev_dtl_reassess(vd->vdev_child[c], txg,
-                   scrub_txg, scrub_done);
+                   scrub_txg, scrub_done, rebuild_done);
 
        if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux)
                return;
 
        if (vd->vdev_ops->vdev_op_leaf) {
                dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+               vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
+               boolean_t check_excise = B_FALSE;
                boolean_t wasempty = B_TRUE;
 
                mutex_enter(&vd->vdev_dtl_lock);
 
                /*
-                * If requested, pretend the scan completed cleanly.
+                * If requested, pretend the scan or rebuild completed cleanly.
                 */
-               if (zfs_scan_ignore_errors && scn)
-                       scn->scn_phys.scn_errors = 0;
+               if (zfs_scan_ignore_errors) {
+                       if (scn != NULL)
+                               scn->scn_phys.scn_errors = 0;
+                       if (vr != NULL)
+                               vr->vr_rebuild_phys.vrp_errors = 0;
+               }
 
                if (scrub_txg != 0 &&
                    !range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) {
@@ -2654,21 +2711,29 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
                }
 
                /*
-                * If we've completed a scan cleanly then determine
-                * if this vdev should remove any DTLs. We only want to
-                * excise regions on vdevs that were available during
-                * the entire duration of this scan.
+                * If we've completed a scrub/resilver or a rebuild cleanly
+                * then determine if this vdev should remove any DTLs. We
+                * only want to excise regions on vdevs that were available
+                * during the entire duration of this scan.
                 */
-               if (scrub_txg != 0 &&
-                   (spa->spa_scrub_started ||
-                   (scn != NULL && scn->scn_phys.scn_errors == 0)) &&
-                   vdev_dtl_should_excise(vd)) {
+               if (rebuild_done &&
+                   vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) {
+                       check_excise = B_TRUE;
+               } else {
+                       if (spa->spa_scrub_started ||
+                           (scn != NULL && scn->scn_phys.scn_errors == 0)) {
+                               check_excise = B_TRUE;
+                       }
+               }
+
+               if (scrub_txg && check_excise &&
+                   vdev_dtl_should_excise(vd, rebuild_done)) {
                        /*
-                        * We completed a scrub up to scrub_txg.  If we
-                        * did it without rebooting, then the scrub dtl
-                        * will be valid, so excise the old region and
-                        * fold in the scrub dtl.  Otherwise, leave the
-                        * dtl as-is if there was an error.
+                        * We completed a scrub, resilver or rebuild up to
+                        * scrub_txg.  If we did it without rebooting, then
+                        * the scrub dtl will be valid, so excise the old
+                        * region and fold in the scrub dtl.  Otherwise,
+                        * leave the dtl as-is if there was an error.
                         *
                         * There's little trick here: to excise the beginning
                         * of the DTL_MISSING map, we put it into a reference
@@ -2711,15 +2776,20 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done)
                            range_tree_add, vd->vdev_dtl[DTL_OUTAGE]);
 
                /*
-                * If the vdev was resilvering and no longer has any
-                * DTLs then reset its resilvering flag and dirty
+                * If the vdev was resilvering or rebuilding and no longer
+                * has any DTLs then reset the appropriate flag and dirty
                 * the top level so that we persist the change.
                 */
-               if (txg != 0 && vd->vdev_resilver_txg != 0 &&
+               if (txg != 0 &&
                    range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) &&
                    range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) {
-                       vd->vdev_resilver_txg = 0;
-                       vdev_config_dirty(vd->vdev_top);
+                       if (vd->vdev_rebuild_txg != 0) {
+                               vd->vdev_rebuild_txg = 0;
+                               vdev_config_dirty(vd->vdev_top);
+                       } else if (vd->vdev_resilver_txg != 0) {
+                               vd->vdev_resilver_txg = 0;
+                               vdev_config_dirty(vd->vdev_top);
+                       }
                }
 
                mutex_exit(&vd->vdev_dtl_lock);
@@ -2955,10 +3025,10 @@ vdev_dtl_required(vdev_t *vd)
         * If not, we can safely offline/detach/remove the device.
         */
        vd->vdev_cant_read = B_TRUE;
-       vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
+       vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE);
        required = !vdev_dtl_empty(tvd, DTL_OUTAGE);
        vd->vdev_cant_read = cant_read;
-       vdev_dtl_reassess(tvd, 0, 0, B_FALSE);
+       vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE);
 
        if (!required && zio_injection_enabled) {
                required = !!zio_handle_device_injection(vd, NULL,
@@ -3065,6 +3135,20 @@ vdev_load(vdev_t *vd)
                }
        }
 
+       /*
+        * Load any rebuild state from the top-level vdev zap.
+        */
+       if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
+               error = vdev_rebuild_load(vd);
+               if (error && error != ENOTSUP) {
+                       vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+                           VDEV_AUX_CORRUPT_DATA);
+                       vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load "
+                           "failed [error=%d]", error);
+                       return (error);
+               }
+       }
+
        /*
         * If this is a top-level vdev, initialize its metaslabs.
         */
@@ -3947,6 +4031,7 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
                vs->vs_timestamp = gethrtime() - vs->vs_timestamp;
                vs->vs_state = vd->vdev_state;
                vs->vs_rsize = vdev_get_min_asize(vd);
+
                if (vd->vdev_ops->vdev_op_leaf) {
                        vs->vs_rsize += VDEV_LABEL_START_SIZE +
                            VDEV_LABEL_END_SIZE;
@@ -3973,7 +4058,11 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
                        vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est;
                        vs->vs_trim_state = vd->vdev_trim_state;
                        vs->vs_trim_action_time = vd->vdev_trim_action_time;
+
+                       /* Set when there is a deferred resilver. */
+                       vs->vs_resilver_deferred = vd->vdev_resilver_deferred;
                }
+
                /*
                 * Report expandable space on top-level, non-auxiliary devices
                 * only. The expandable space is reported in terms of metaslab
@@ -3985,13 +4074,16 @@ vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx)
                            vd->vdev_max_asize - vd->vdev_asize,
                            1ULL << tvd->vdev_ms_shift);
                }
+
+               /*
+                * Report fragmentation for top-level, non-auxiliary,
+                * concrete devices.
+                */
                if (vd->vdev_aux == NULL && vd == vd->vdev_top &&
                    vdev_is_concrete(vd)) {
                        vs->vs_fragmentation = (vd->vdev_mg != NULL) ?
                            vd->vdev_mg->mg_fragmentation : 0;
                }
-               if (vd->vdev_ops->vdev_op_leaf)
-                       vs->vs_resilver_deferred = vd->vdev_resilver_deferred;
        }
 
        vdev_get_stats_ex_impl(vd, vs, vsx);
@@ -4072,17 +4164,35 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
                mutex_enter(&vd->vdev_stat_lock);
 
                if (flags & ZIO_FLAG_IO_REPAIR) {
+                       /*
+                        * Repair is the result of a resilver issued by the
+                        * scan thread (spa_sync).
+                        */
                        if (flags & ZIO_FLAG_SCAN_THREAD) {
-                               dsl_scan_phys_t *scn_phys =
-                                   &spa->spa_dsl_pool->dp_scan->scn_phys;
+                               dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+                               dsl_scan_phys_t *scn_phys = &scn->scn_phys;
                                uint64_t *processed = &scn_phys->scn_processed;
 
-                               /* XXX cleanup? */
                                if (vd->vdev_ops->vdev_op_leaf)
                                        atomic_add_64(processed, psize);
                                vs->vs_scan_processed += psize;
                        }
 
+                       /*
+                        * Repair is the result of a rebuild issued by the
+                        * rebuild thread (vdev_rebuild_thread).
+                        */
+                       if (zio->io_priority == ZIO_PRIORITY_REBUILD) {
+                               vdev_t *tvd = vd->vdev_top;
+                               vdev_rebuild_t *vr = &tvd->vdev_rebuild_config;
+                               vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+                               uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt;
+
+                               if (vd->vdev_ops->vdev_op_leaf)
+                                       atomic_add_64(rebuilt, psize);
+                               vs->vs_rebuild_processed += psize;
+                       }
+
                        if (flags & ZIO_FLAG_SELF_HEAL)
                                vs->vs_self_healed += psize;
                }
@@ -4094,6 +4204,7 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
                if (vd->vdev_ops->vdev_op_leaf &&
                    (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) {
                        zio_type_t vs_type = type;
+                       zio_priority_t priority = zio->io_priority;
 
                        /*
                         * TRIM ops and bytes are reported to user space as
@@ -4103,19 +4214,44 @@ vdev_stat_update(zio_t *zio, uint64_t psize)
                        if (type == ZIO_TYPE_TRIM)
                                vs_type = ZIO_TYPE_IOCTL;
 
+                       /*
+                        * Solely for the purposes of 'zpool iostat -lqrw'
+                        * reporting use the priority to categorize the IO.
+                        * Only the following are reported to user space:
+                        *
+                        *   ZIO_PRIORITY_SYNC_READ,
+                        *   ZIO_PRIORITY_SYNC_WRITE,
+                        *   ZIO_PRIORITY_ASYNC_READ,
+                        *   ZIO_PRIORITY_ASYNC_WRITE,
+                        *   ZIO_PRIORITY_SCRUB,
+                        *   ZIO_PRIORITY_TRIM.
+                        */
+                       if (priority == ZIO_PRIORITY_REBUILD) {
+                               priority = ((type == ZIO_TYPE_WRITE) ?
+                                   ZIO_PRIORITY_ASYNC_WRITE :
+                                   ZIO_PRIORITY_SCRUB);
+                       } else if (priority == ZIO_PRIORITY_INITIALIZING) {
+                               ASSERT3U(type, ==, ZIO_TYPE_WRITE);
+                               priority = ZIO_PRIORITY_ASYNC_WRITE;
+                       } else if (priority == ZIO_PRIORITY_REMOVAL) {
+                               priority = ((type == ZIO_TYPE_WRITE) ?
+                                   ZIO_PRIORITY_ASYNC_WRITE :
+                                   ZIO_PRIORITY_ASYNC_READ);
+                       }
+
                        vs->vs_ops[vs_type]++;
                        vs->vs_bytes[vs_type] += psize;
 
                        if (flags & ZIO_FLAG_DELEGATED) {
-                               vsx->vsx_agg_histo[zio->io_priority]
+                               vsx->vsx_agg_histo[priority]
                                    [RQ_HISTO(zio->io_size)]++;
                        } else {
-                               vsx->vsx_ind_histo[zio->io_priority]
+                               vsx->vsx_ind_histo[priority]
                                    [RQ_HISTO(zio->io_size)]++;
                        }
 
                        if (zio->io_delta && zio->io_delay) {
-                               vsx->vsx_queue_histo[zio->io_priority]
+                               vsx->vsx_queue_histo[priority]
                                    [L_HISTO(zio->io_delta - zio->io_delay)]++;
                                vsx->vsx_disk_histo[type]
                                    [L_HISTO(zio->io_delay)]++;
index 81cfd5ccef06c914949f41f46b43ed683b6e7a68..8c7468255565adc63905eca9895d7d95d2b3eb35 100644 (file)
@@ -404,6 +404,19 @@ root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl)
        }
 }
 
+static void
+top_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl)
+{
+       if (vd == vd->vdev_top) {
+               vdev_rebuild_stat_t vrs;
+               if (vdev_rebuild_get_stats(vd, &vrs) == 0) {
+                       fnvlist_add_uint64_array(nvl,
+                           ZPOOL_CONFIG_REBUILD_STATS, (uint64_t *)&vrs,
+                           sizeof (vrs) / sizeof (uint64_t));
+               }
+       }
+}
+
 /*
  * Generate the nvlist representing this vdev's config.
  */
@@ -559,6 +572,7 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
                vdev_config_generate_stats(vd, nv);
 
                root_vdev_actions_getprogress(vd, nv);
+               top_vdev_actions_getprogress(vd, nv);
 
                /*
                 * Note: this can be called from open context
@@ -663,6 +677,9 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
                if (vd->vdev_resilver_txg != 0)
                        fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
                            vd->vdev_resilver_txg);
+               if (vd->vdev_rebuild_txg != 0)
+                       fnvlist_add_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG,
+                           vd->vdev_rebuild_txg);
                if (vd->vdev_faulted)
                        fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE);
                if (vd->vdev_degraded)
index 3edd65c01965223807aba5a8094edd1f5982b00b..094530e9badd5864e6342f19cf0087dec8888b7d 100644 (file)
@@ -767,8 +767,9 @@ vdev_mirror_io_done(zio_t *zio)
 
                        zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
                            mc->mc_vd, mc->mc_offset,
-                           zio->io_abd, zio->io_size,
-                           ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
+                           zio->io_abd, zio->io_size, ZIO_TYPE_WRITE,
+                           zio->io_priority == ZIO_PRIORITY_REBUILD ?
+                           ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
                            ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
                            ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
                }
index e31271dcb30dd2f9c44468579e57c5077b2b40ef..a8ef3d7474c981526f97ecbb6700856b7aa9720b 100644 (file)
@@ -158,6 +158,8 @@ uint32_t zfs_vdev_initializing_min_active = 1;
 uint32_t zfs_vdev_initializing_max_active = 1;
 uint32_t zfs_vdev_trim_min_active = 1;
 uint32_t zfs_vdev_trim_max_active = 2;
+uint32_t zfs_vdev_rebuild_min_active = 1;
+uint32_t zfs_vdev_rebuild_max_active = 3;
 
 /*
  * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
@@ -278,6 +280,8 @@ vdev_queue_class_min_active(zio_priority_t p)
                return (zfs_vdev_initializing_min_active);
        case ZIO_PRIORITY_TRIM:
                return (zfs_vdev_trim_min_active);
+       case ZIO_PRIORITY_REBUILD:
+               return (zfs_vdev_rebuild_min_active);
        default:
                panic("invalid priority %u", p);
                return (0);
@@ -352,6 +356,8 @@ vdev_queue_class_max_active(spa_t *spa, zio_priority_t p)
                return (zfs_vdev_initializing_max_active);
        case ZIO_PRIORITY_TRIM:
                return (zfs_vdev_trim_max_active);
+       case ZIO_PRIORITY_REBUILD:
+               return (zfs_vdev_rebuild_max_active);
        default:
                panic("invalid priority %u", p);
                return (0);
@@ -845,7 +851,8 @@ vdev_queue_io(zio_t *zio)
                    zio->io_priority != ZIO_PRIORITY_ASYNC_READ &&
                    zio->io_priority != ZIO_PRIORITY_SCRUB &&
                    zio->io_priority != ZIO_PRIORITY_REMOVAL &&
-                   zio->io_priority != ZIO_PRIORITY_INITIALIZING) {
+                   zio->io_priority != ZIO_PRIORITY_INITIALIZING &&
+                   zio->io_priority != ZIO_PRIORITY_REBUILD) {
                        zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
                }
        } else if (zio->io_type == ZIO_TYPE_WRITE) {
@@ -854,7 +861,8 @@ vdev_queue_io(zio_t *zio)
                if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE &&
                    zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE &&
                    zio->io_priority != ZIO_PRIORITY_REMOVAL &&
-                   zio->io_priority != ZIO_PRIORITY_INITIALIZING) {
+                   zio->io_priority != ZIO_PRIORITY_INITIALIZING &&
+                   zio->io_priority != ZIO_PRIORITY_REBUILD) {
                        zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
                }
        } else {
@@ -1051,6 +1059,12 @@ ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_max_active, INT, ZMOD_RW,
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_min_active, INT, ZMOD_RW,
        "Min active trim/discard I/Os per vdev");
 
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, INT, ZMOD_RW,
+       "Max active rebuild I/Os per vdev");
+
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, INT, ZMOD_RW,
+       "Min active rebuild I/Os per vdev");
+
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, INT, ZMOD_RW,
        "Queue depth percentage for each top-level vdev");
 /* END CSTYLED */
diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c
new file mode 100644 (file)
index 0000000..bf1079f
--- /dev/null
@@ -0,0 +1,1106 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ *
+ * Copyright (c) 2018, Intel Corporation.
+ * Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+ */
+
+#include <sys/vdev_impl.h>
+#include <sys/dsl_scan.h>
+#include <sys/spa_impl.h>
+#include <sys/metaslab_impl.h>
+#include <sys/vdev_rebuild.h>
+#include <sys/zio.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/zap.h>
+
+/*
+ * This file contains the sequential reconstruction implementation for
+ * resilvering.  This form of resilvering is internally referred to as device
+ * rebuild to avoid conflating it with the traditional healing reconstruction
+ * performed by the dsl scan code.
+ *
+ * When replacing a device, or scrubbing the pool, ZFS has historically used
+ * a process called resilvering which is a form of healing reconstruction.
+ * This approach has the advantage that as blocks are read from disk their
+ * checksums can be immediately verified and the data repaired.  Unfortunately,
+ * it also results in a random IO pattern to the disk even when extra care
+ * is taken to sequentialize the IO as much as possible.  This substantially
+ * increases the time required to resilver the pool and restore redundancy.
+ *
+ * For mirrored devices it's possible to implement an alternate sequential
+ * reconstruction strategy when resilvering.  Sequential reconstruction
+ * behaves like a traditional RAID rebuild and reconstructs a device in LBA
+ * order without verifying the checksum.  After this phase completes a second
+ * scrub phase is started to verify all of the checksums.  This two phase
+ * process will take longer than the healing reconstruction described above.
+ * However, it has the advantage that after the first reconstruction phase
+ * completes redundancy has been restored.  At this point the pool can incur
+ * another device failure without risking data loss.
+ *
+ * There are a few noteworthy limitations and other advantages of resilvering
+ * using sequential reconstruction vs healing reconstruction.
+ *
+ * Limitations:
+ *
+ *   - Only supported for mirror vdev types.  Due to the variable stripe
+ *     width used by raidz, sequential reconstruction is not possible.
+ *
+ *   - Block checksums are not verified during sequential reconstruction.
+ *     Similar to traditional RAID the parity/mirror data is reconstructed
+ *     but cannot be immediately double checked.  For this reason when the
+ *     last active resilver completes the pool is automatically scrubbed.
+ *
+ *   - Deferred resilvers using sequential reconstruction are not currently
+ *     supported.  When adding another vdev to an active top-level resilver
+ *     it must be restarted.
+ *
+ * Advantages:
+ *
+ *   - Sequential reconstruction is performed in LBA order which may be faster
+ *     than healing reconstruction particularly when using HDDs (or
+ *     especially with SMR devices).  Only allocated capacity is resilvered.
+ *
+ *   - Sequential reconstruction is not constrained by ZFS block boundaries.
+ *     This allows it to issue larger IOs to disk which span multiple blocks
+ *     allowing all of these logical blocks to be repaired with a single IO.
+ *
+ *   - Unlike a healing resilver or scrub which are pool wide operations,
+ *     sequential reconstruction is handled by the top-level mirror vdevs.
+ *     This allows for it to be started or canceled on a top-level vdev
+ *     without impacting any other top-level vdevs in the pool.
+ *
+ *   - Data only referenced by a pool checkpoint will be repaired because
+ *     that space is reflected in the space maps.  This differs from a
+ *     healing resilver or scrub, which will not repair that data.
+ */
+
+
+/*
+ * Maximum number of queued rebuild I/Os per top-level vdev.  The number of
+ * concurrent rebuild I/Os issued to the device is controlled by the
+ * zfs_vdev_rebuild_min_active and zfs_vdev_rebuild_max_active module
+ * options.
+ */
+unsigned int zfs_rebuild_queue_limit = 20;
+
+/*
+ * Size of rebuild reads; defaults to 1MiB and is capped at SPA_MAXBLOCKSIZE.
+ */
+unsigned long zfs_rebuild_max_segment = 1024 * 1024;
+
+/*
+ * For vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync().
+ */
+static void vdev_rebuild_thread(void *arg);
+
+/*
+ * Clear the per-vdev rebuild bytes value for a vdev tree.
+ */
+static void
+clear_rebuild_bytes(vdev_t *vd)
+{
+       vdev_stat_t *vs = &vd->vdev_stat;
+
+       for (uint64_t i = 0; i < vd->vdev_children; i++)
+               clear_rebuild_bytes(vd->vdev_child[i]);
+
+       mutex_enter(&vd->vdev_stat_lock);
+       vs->vs_rebuild_processed = 0;
+       mutex_exit(&vd->vdev_stat_lock);
+}
+
+/*
+ * Determines whether a vdev_rebuild_thread() should be stopped.
+ */
+static boolean_t
+vdev_rebuild_should_stop(vdev_t *vd)
+{
+       return (!vdev_writeable(vd) || vd->vdev_removing ||
+           vd->vdev_rebuild_exit_wanted ||
+           vd->vdev_rebuild_cancel_wanted ||
+           vd->vdev_rebuild_reset_wanted);
+}
+
+/*
+ * Determine if the rebuild should be canceled.  This may happen when all
+ * vdevs with MISSING DTLs are detached.
+ */
+static boolean_t
+vdev_rebuild_should_cancel(vdev_t *vd)
+{
+       vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+       vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+       if (!vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg))
+               return (B_TRUE);
+
+       return (B_FALSE);
+}
+
+/*
+ * The sync task for updating the on-disk state of a rebuild.  This is
+ * scheduled by vdev_rebuild_range().
+ */
+static void
+vdev_rebuild_update_sync(void *arg, dmu_tx_t *tx)
+{
+       int vdev_id = (uintptr_t)arg;
+       spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+       vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+       vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+       vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+       uint64_t txg = dmu_tx_get_txg(tx);
+
+       mutex_enter(&vd->vdev_rebuild_lock);
+
+       if (vr->vr_scan_offset[txg & TXG_MASK] > 0) {
+               vrp->vrp_last_offset = vr->vr_scan_offset[txg & TXG_MASK];
+               vr->vr_scan_offset[txg & TXG_MASK] = 0;
+       }
+
+       vrp->vrp_scan_time_ms = vr->vr_prev_scan_time_ms +
+           NSEC2MSEC(gethrtime() - vr->vr_pass_start_time);
+
+       VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+           VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+           REBUILD_PHYS_ENTRIES, vrp, tx));
+
+       mutex_exit(&vd->vdev_rebuild_lock);
+}
+
+/*
+ * Initialize the on-disk state for a new rebuild, start the rebuild thread.
+ */
+static void
+vdev_rebuild_initiate_sync(void *arg, dmu_tx_t *tx)
+{
+       int vdev_id = (uintptr_t)arg;
+       spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+       vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+       vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+       vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+       ASSERT(vd->vdev_rebuilding);
+
+       spa_feature_incr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx);
+
+       mutex_enter(&vd->vdev_rebuild_lock);
+       bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
+       vrp->vrp_rebuild_state = VDEV_REBUILD_ACTIVE;
+       vrp->vrp_min_txg = 0;
+       vrp->vrp_max_txg = dmu_tx_get_txg(tx);
+       vrp->vrp_start_time = gethrestime_sec();
+       vrp->vrp_scan_time_ms = 0;
+       vr->vr_prev_scan_time_ms = 0;
+
+       /*
+        * Rebuilds are currently only used when replacing a device, in which
+        * case there must be DTL_MISSING entries.  In the future, we could
+        * allow rebuilds to be used in a way similar to a scrub.  This would
+        * be useful because it would allow us to rebuild the space used by
+        * pool checkpoints.
+        */
+       VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg));
+
+       VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+           VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+           REBUILD_PHYS_ENTRIES, vrp, tx));
+
+       spa_history_log_internal(spa, "rebuild", tx,
+           "vdev_id=%llu vdev_guid=%llu started",
+           (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid);
+
+       ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
+       vd->vdev_rebuild_thread = thread_create(NULL, 0,
+           vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
+
+       mutex_exit(&vd->vdev_rebuild_lock);
+}
+
+static void
+vdev_rebuild_log_notify(spa_t *spa, vdev_t *vd, char *name)
+{
+       nvlist_t *aux = fnvlist_alloc();
+
+       fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, "sequential");
+       spa_event_notify(spa, vd, aux, name);
+       nvlist_free(aux);
+}
+
+/*
+ * Called to request that a new rebuild be started.  The feature will remain
+ * active for the duration of the rebuild, then revert to the enabled state.
+ */
+static void
+vdev_rebuild_initiate(vdev_t *vd)
+{
+       spa_t *spa = vd->vdev_spa;
+
+       ASSERT(vd->vdev_top == vd);
+       ASSERT(MUTEX_HELD(&vd->vdev_rebuild_lock));
+       ASSERT(!vd->vdev_rebuilding);
+
+       dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+       VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+
+       vd->vdev_rebuilding = B_TRUE;
+
+       dsl_sync_task_nowait(spa_get_dsl(spa), vdev_rebuild_initiate_sync,
+           (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx);
+       dmu_tx_commit(tx);
+
+       vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_START);
+}
+
+/*
+ * Update the on-disk state to completed when a rebuild finishes.
+ */
+static void
+vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx)
+{
+       int vdev_id = (uintptr_t)arg;
+       spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+       vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+       vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+       vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+       mutex_enter(&vd->vdev_rebuild_lock);
+       vrp->vrp_rebuild_state = VDEV_REBUILD_COMPLETE;
+       vrp->vrp_end_time = gethrestime_sec();
+
+       VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+           VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+           REBUILD_PHYS_ENTRIES, vrp, tx));
+
+       vdev_dtl_reassess(vd,  tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE);
+       spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx);
+
+       spa_history_log_internal(spa, "rebuild",  tx,
+           "vdev_id=%llu vdev_guid=%llu complete",
+           (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid);
+       vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH);
+
+       /* Handles detaching of spares */
+       spa_async_request(spa, SPA_ASYNC_REBUILD_DONE);
+       vd->vdev_rebuilding = B_FALSE;
+       mutex_exit(&vd->vdev_rebuild_lock);
+
+       spa_notify_waiters(spa);
+       cv_broadcast(&vd->vdev_rebuild_cv);
+}
+
+/*
+ * Update the on-disk state to canceled when a rebuild is canceled.
+ */
+static void
+vdev_rebuild_cancel_sync(void *arg, dmu_tx_t *tx)
+{
+       int vdev_id = (uintptr_t)arg;
+       spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+       vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+       vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+       vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+       mutex_enter(&vd->vdev_rebuild_lock);
+       vrp->vrp_rebuild_state = VDEV_REBUILD_CANCELED;
+       vrp->vrp_end_time = gethrestime_sec();
+
+       VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+           VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+           REBUILD_PHYS_ENTRIES, vrp, tx));
+
+       spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx);
+
+       spa_history_log_internal(spa, "rebuild",  tx,
+           "vdev_id=%llu vdev_guid=%llu canceled",
+           (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid);
+       vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH);
+
+       vd->vdev_rebuild_cancel_wanted = B_FALSE;
+       vd->vdev_rebuilding = B_FALSE;
+       mutex_exit(&vd->vdev_rebuild_lock);
+
+       spa_notify_waiters(spa);
+       cv_broadcast(&vd->vdev_rebuild_cv);
+}
+
+/*
+ * Resets the progress of a running rebuild.  This will occur when a new
+ * vdev is added to rebuild.
+ */
+static void
+vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx)
+{
+       int vdev_id = (uintptr_t)arg;
+       spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+       vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+       vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+       vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+       mutex_enter(&vd->vdev_rebuild_lock);
+
+       ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE);
+       ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
+
+       vrp->vrp_last_offset = 0;
+       vrp->vrp_min_txg = 0;
+       vrp->vrp_max_txg = dmu_tx_get_txg(tx);
+       vrp->vrp_bytes_scanned = 0;
+       vrp->vrp_bytes_issued = 0;
+       vrp->vrp_bytes_rebuilt = 0;
+       vrp->vrp_bytes_est = 0;
+       vrp->vrp_scan_time_ms = 0;
+       vr->vr_prev_scan_time_ms = 0;
+
+       /* See vdev_rebuild_initiate_sync comment */
+       VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg));
+
+       VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+           VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+           REBUILD_PHYS_ENTRIES, vrp, tx));
+
+       spa_history_log_internal(spa, "rebuild",  tx,
+           "vdev_id=%llu vdev_guid=%llu reset",
+           (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid);
+
+       vd->vdev_rebuild_reset_wanted = B_FALSE;
+       ASSERT(vd->vdev_rebuilding);
+
+       vd->vdev_rebuild_thread = thread_create(NULL, 0,
+           vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri);
+
+       mutex_exit(&vd->vdev_rebuild_lock);
+}
+
+/*
+ * Clear the last rebuild status.
+ */
+void
+vdev_rebuild_clear_sync(void *arg, dmu_tx_t *tx)
+{
+       int vdev_id = (uintptr_t)arg;
+       spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+       vdev_t *vd = vdev_lookup_top(spa, vdev_id);
+       vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+       vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+       objset_t *mos = spa_meta_objset(spa);
+
+       mutex_enter(&vd->vdev_rebuild_lock);
+
+       if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD) ||
+           vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE) {
+               mutex_exit(&vd->vdev_rebuild_lock);
+               return;
+       }
+
+       clear_rebuild_bytes(vd);
+       bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
+
+       if (vd->vdev_top_zap != 0 && zap_contains(mos, vd->vdev_top_zap,
+           VDEV_TOP_ZAP_VDEV_REBUILD_PHYS) == 0) {
+               VERIFY0(zap_update(mos, vd->vdev_top_zap,
+                   VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+                   REBUILD_PHYS_ENTRIES, vrp, tx));
+       }
+
+       mutex_exit(&vd->vdev_rebuild_lock);
+}
+
+/*
+ * The zio_done_func_t callback for each rebuild I/O issued.  It's responsible
+ * for updating the rebuild stats and limiting the number of in flight I/Os.
+ */
+static void
+vdev_rebuild_cb(zio_t *zio)
+{
+       vdev_rebuild_t *vr = zio->io_private;
+       vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+       vdev_t *vd = vr->vr_top_vdev;
+
+       mutex_enter(&vd->vdev_rebuild_io_lock);
+       if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
+               /*
+                * The I/O failed because the top-level vdev was unavailable.
+                * Attempt to roll back to the last completed offset, in order
+                * to resume from the correct location if the pool is resumed.
+                * (This works because spa_sync waits on spa_txg_zio before
+                * it runs sync tasks.)
+                */
+               uint64_t *off = &vr->vr_scan_offset[zio->io_txg & TXG_MASK];
+               *off = MIN(*off, zio->io_offset);
+       } else if (zio->io_error) {
+               vrp->vrp_errors++;
+       }
+
+       abd_free(zio->io_abd);
+
+       ASSERT3U(vd->vdev_rebuild_inflight, >, 0);
+       vd->vdev_rebuild_inflight--;
+       cv_broadcast(&vd->vdev_rebuild_io_cv);
+       mutex_exit(&vd->vdev_rebuild_io_lock);
+
+       spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+}
+
+/*
+ * Rebuild the data in this range by constructing a special dummy block
+ * pointer for the given range.  It has no relation to any existing blocks
+ * in the pool.  But by disabling checksum verification and issuing a scrub
+ * I/O, mirrored vdevs will replicate the block using any available mirror
+ * leaf vdevs.
+ */
+static void
+vdev_rebuild_rebuild_block(vdev_rebuild_t *vr, uint64_t start, uint64_t asize,
+    uint64_t txg)
+{
+       vdev_t *vd = vr->vr_top_vdev;
+       spa_t *spa = vd->vdev_spa;
+       uint64_t psize = asize;
+
+       ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
+           vd->vdev_ops == &vdev_replacing_ops ||
+           vd->vdev_ops == &vdev_spare_ops);
+
+       blkptr_t blk, *bp = &blk;
+       BP_ZERO(bp);
+
+       DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
+       DVA_SET_OFFSET(&bp->blk_dva[0], start);
+       DVA_SET_GANG(&bp->blk_dva[0], 0);
+       DVA_SET_ASIZE(&bp->blk_dva[0], asize);
+
+       BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
+       BP_SET_LSIZE(bp, psize);
+       BP_SET_PSIZE(bp, psize);
+       BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
+       BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
+       BP_SET_TYPE(bp, DMU_OT_NONE);
+       BP_SET_LEVEL(bp, 0);
+       BP_SET_DEDUP(bp, 0);
+       BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
+
+       /*
+        * We increment the issued bytes by the asize rather than the psize
+        * so the scanned and issued bytes may be directly compared.  This
+        * is consistent with the scrub/resilver issued reporting.
+        */
+       vr->vr_pass_bytes_issued += asize;
+       vr->vr_rebuild_phys.vrp_bytes_issued += asize;
+
+       zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, bp,
+           abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr,
+           ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL |
+           ZIO_FLAG_RESILVER, NULL));
+}
+
+/*
+ * Issues a rebuild I/O and takes care of rate limiting the number of queued
+ * rebuild I/Os.  The provided start and size must be properly aligned for the
+ * top-level vdev type being rebuilt.
+ *
+ * The range [start, start + size) must lie entirely within the metaslab
+ * currently being scanned (vr->vr_scan_msp); this is asserted below.
+ *
+ * The caller blocks while vdev_rebuild_inflight is at or above
+ * zfs_rebuild_queue_limit, and again in dmu_tx_assign(TXG_WAIT).  The
+ * scanned-bytes counters are advanced before either wait.
+ *
+ * Returns 0 once the I/O has been issued, or EINTR without issuing it
+ * when vdev_rebuild_should_stop() reports that the rebuild should stop.
+ */
+static int
+vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size)
+{
+       uint64_t ms_id __maybe_unused = vr->vr_scan_msp->ms_id;
+       vdev_t *vd = vr->vr_top_vdev;
+       spa_t *spa = vd->vdev_spa;
+
+       ASSERT3U(ms_id, ==, start >> vd->vdev_ms_shift);
+       ASSERT3U(ms_id, ==, (start + size - 1) >> vd->vdev_ms_shift);
+
+       vr->vr_pass_bytes_scanned += size;
+       vr->vr_rebuild_phys.vrp_bytes_scanned += size;
+
+       mutex_enter(&vd->vdev_rebuild_io_lock);
+
+       /*
+        * Limit in flight rebuild I/Os.  A slot is reserved for this I/O
+        * before the lock is dropped.
+        */
+       while (vd->vdev_rebuild_inflight >= zfs_rebuild_queue_limit)
+               cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock);
+
+       vd->vdev_rebuild_inflight++;
+       mutex_exit(&vd->vdev_rebuild_io_lock);
+
+       dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+       VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+       uint64_t txg = dmu_tx_get_txg(tx);
+
+       spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
+       mutex_enter(&vd->vdev_rebuild_lock);
+
+       /*
+        * This is the first I/O for this txg, detected by the txg's
+        * vr_scan_offset[] slot still being zero.  Record where it starts
+        * and schedule the sync task, which is handed this vdev's id.
+        */
+       if (vr->vr_scan_offset[txg & TXG_MASK] == 0) {
+               vr->vr_scan_offset[txg & TXG_MASK] = start;
+               dsl_sync_task_nowait(spa_get_dsl(spa),
+                   vdev_rebuild_update_sync,
+                   (void *)(uintptr_t)vd->vdev_id, 2,
+                   ZFS_SPACE_CHECK_RESERVED, tx);
+       }
+
+       /*
+        * When exiting write out our progress.  The inflight slot reserved
+        * above is given back because no I/O is issued for this range, and
+        * the config lock, rebuild lock and tx are all released here.
+        */
+       if (vdev_rebuild_should_stop(vd)) {
+               mutex_enter(&vd->vdev_rebuild_io_lock);
+               vd->vdev_rebuild_inflight--;
+               mutex_exit(&vd->vdev_rebuild_io_lock);
+               spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+               mutex_exit(&vd->vdev_rebuild_lock);
+               dmu_tx_commit(tx);
+               return (SET_ERROR(EINTR));
+       }
+       mutex_exit(&vd->vdev_rebuild_lock);
+
+       vr->vr_scan_offset[txg & TXG_MASK] = start + size;
+       vdev_rebuild_rebuild_block(vr, start, size, txg);
+
+       dmu_tx_commit(tx);
+
+       return (0);
+}
+
+/*
+ * Split range into legally-sized logical chunks given the constraints of the
+ * top-level mirror vdev type.
+ *
+ * Returns the size of the next chunk to issue: the smaller of the
+ * remaining size and the asize corresponding to zfs_rebuild_max_segment.
+ * The tunable is first rounded up to a multiple of (1 << vdev_ashift) and
+ * capped at SPA_MAXBLOCKSIZE before being converted with
+ * vdev_psize_to_asize().  The start argument is not used here.
+ */
+static uint64_t
+vdev_rebuild_chunk_size(vdev_t *vd, uint64_t start, uint64_t size)
+{
+       uint64_t chunk_size, max_asize, max_segment;
+
+       ASSERT(vd->vdev_ops == &vdev_mirror_ops ||
+           vd->vdev_ops == &vdev_replacing_ops ||
+           vd->vdev_ops == &vdev_spare_ops);
+
+       max_segment = MIN(P2ROUNDUP(zfs_rebuild_max_segment,
+           1 << vd->vdev_ashift), SPA_MAXBLOCKSIZE);
+       max_asize = vdev_psize_to_asize(vd, max_segment);
+       chunk_size = MIN(size, max_asize);
+
+       return (chunk_size);
+}
+
+/*
+ * Issues rebuild I/Os for all ranges in the provided vr->vr_scan_tree
+ * range tree.
+ *
+ * Segments are visited in the order returned by the tree's btree and each
+ * one is split into chunks sized by vdev_rebuild_chunk_size().  Returns 0
+ * when every chunk was issued, otherwise the first non-zero error returned
+ * by vdev_rebuild_range(); no further chunks are issued after an error.
+ */
+static int
+vdev_rebuild_ranges(vdev_rebuild_t *vr)
+{
+       vdev_t *vd = vr->vr_top_vdev;
+       zfs_btree_t *t = &vr->vr_scan_tree->rt_root;
+       zfs_btree_index_t idx;
+       int error;
+
+       for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL;
+           rs = zfs_btree_next(t, &idx, &idx)) {
+               uint64_t start = rs_get_start(rs, vr->vr_scan_tree);
+               uint64_t size = rs_get_end(rs, vr->vr_scan_tree) - start;
+
+               /*
+                * zfs_scan_suspend_progress can be set to disable rebuild
+                * progress for testing.  See comment in dsl_scan_sync().
+                * The check is made once per segment and polls, sleeping
+                * delay(hz) between checks, until the tunable is cleared
+                * or the rebuild should stop.
+                */
+               while (zfs_scan_suspend_progress &&
+                   !vdev_rebuild_should_stop(vd)) {
+                       delay(hz);
+               }
+
+               while (size > 0) {
+                       uint64_t chunk_size;
+
+                       chunk_size = vdev_rebuild_chunk_size(vd, start, size);
+
+                       error = vdev_rebuild_range(vr, start, chunk_size);
+                       if (error != 0)
+                               return (error);
+
+                       size -= chunk_size;
+                       start += chunk_size;
+               }
+       }
+
+       return (0);
+}
+
+/*
+ * Calculates the estimated capacity which remains to be scanned.  Since
+ * we traverse the pool in metaslab order only allocated capacity beyond
+ * the vrp_last_offset need be considered.  All lower offsets must have
+ * already been rebuilt and are thus already included in vrp_bytes_scanned.
+ *
+ * ms_id is the index of the first metaslab to include.  The new estimate
+ * is vrp_bytes_scanned plus the allocated space of every metaslab from
+ * ms_id through the last one, each sampled under its ms_lock.  The
+ * estimate is left unchanged when vrp_last_offset is below the start of
+ * metaslab ms_id.
+ */
+static void
+vdev_rebuild_update_bytes_est(vdev_t *vd, uint64_t ms_id)
+{
+       vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+       vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+       uint64_t bytes_est = vrp->vrp_bytes_scanned;
+
+       if (vrp->vrp_last_offset < vd->vdev_ms[ms_id]->ms_start)
+               return;
+
+       for (uint64_t i = ms_id; i < vd->vdev_ms_count; i++) {
+               metaslab_t *msp = vd->vdev_ms[i];
+
+               mutex_enter(&msp->ms_lock);
+               bytes_est += metaslab_allocated_space(msp);
+               mutex_exit(&msp->ms_lock);
+       }
+
+       vrp->vrp_bytes_est = bytes_est;
+}
+
+/*
+ * Load from disk the top-level vdev's rebuild information.
+ *
+ * vd must be a top-level vdev.  The whole operation runs under
+ * vdev_rebuild_lock and always clears vd->vdev_rebuilding first.
+ *
+ * Returns ENOTSUP, with the in-memory phys state zeroed, when the
+ * device_rebuild feature is not enabled.  Returns 0 when the state was
+ * loaded, or when the lookup failed with ENOENT, EOVERFLOW or ECKSUM, in
+ * which case the state is zeroed.  Any other zap_lookup() error is
+ * returned as is.  On a 0 return vr_prev_scan_time_ms is seeded from the
+ * loaded vrp_scan_time_ms and vr_top_vdev is set to vd.
+ */
+int
+vdev_rebuild_load(vdev_t *vd)
+{
+       vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+       vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+       spa_t *spa = vd->vdev_spa;
+       int err = 0;
+
+       mutex_enter(&vd->vdev_rebuild_lock);
+       vd->vdev_rebuilding = B_FALSE;
+
+       if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) {
+               bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
+               mutex_exit(&vd->vdev_rebuild_lock);
+               return (SET_ERROR(ENOTSUP));
+       }
+
+       ASSERT(vd->vdev_top == vd);
+
+       err = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
+           VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+           REBUILD_PHYS_ENTRIES, vrp);
+
+       /*
+        * A missing or damaged VDEV_TOP_ZAP_VDEV_REBUILD_PHYS should
+        * not prevent a pool from being imported.  Clear the rebuild
+        * status allowing a new resilver/rebuild to be started.
+        */
+       if (err == ENOENT || err == EOVERFLOW || err == ECKSUM) {
+               bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
+       } else if (err) {
+               mutex_exit(&vd->vdev_rebuild_lock);
+               return (err);
+       }
+
+       vr->vr_prev_scan_time_ms = vrp->vrp_scan_time_ms;
+       vr->vr_top_vdev = vd;
+
+       mutex_exit(&vd->vdev_rebuild_lock);
+
+       return (0);
+}
+
+/*
+ * Each scan thread is responsible for rebuilding a top-level vdev.  The
+ * rebuild progress is tracked on-disk in VDEV_TOP_ZAP_VDEV_REBUILD_PHYS.
+ *
+ * arg is the top-level vdev_t to rebuild.  The thread walks the vdev's
+ * metaslabs in index order and, for each one, disables it, waits for a
+ * txg to sync, collects its allocated ranges in vr_scan_tree, issues the
+ * rebuild I/O via vdev_rebuild_ranges() and re-enables the metaslab.
+ * Once the walk ends and all inflight rebuild I/O has drained it either
+ * schedules the complete, cancel or reset sync task or leaves the rebuild
+ * suspended, clears vdev_rebuild_thread and wakes waiters on
+ * vdev_rebuild_cv.
+ */
+static void
+vdev_rebuild_thread(void *arg)
+{
+       vdev_t *vd = arg;
+       spa_t *spa = vd->vdev_spa;
+       int error = 0;
+
+       /*
+        * If there's a scrub in process request that it be stopped.  This
+        * is not required for a correct rebuild, but we do want rebuilds to
+        * emulate the resilver behavior as much as possible.
+        */
+       dsl_pool_t *dsl = spa_get_dsl(spa);
+       if (dsl_scan_scrubbing(dsl))
+               dsl_scan_cancel(dsl);
+
+       spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+       mutex_enter(&vd->vdev_rebuild_lock);
+
+       ASSERT3P(vd->vdev_top, ==, vd);
+       ASSERT3P(vd->vdev_rebuild_thread, !=, NULL);
+       ASSERT(vd->vdev_rebuilding);
+       ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REBUILD));
+       ASSERT3B(vd->vdev_rebuild_cancel_wanted, ==, B_FALSE);
+       ASSERT3B(vd->vdev_rebuild_reset_wanted, ==, B_FALSE);
+
+       vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+       vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+       vr->vr_top_vdev = vd;
+       vr->vr_scan_msp = NULL;
+       vr->vr_scan_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+       vr->vr_pass_start_time = gethrtime();
+       vr->vr_pass_bytes_scanned = 0;
+       vr->vr_pass_bytes_issued = 0;
+
+       uint64_t update_est_time = gethrtime();
+       vdev_rebuild_update_bytes_est(vd, 0);
+
+       clear_rebuild_bytes(vr->vr_top_vdev);
+
+       mutex_exit(&vd->vdev_rebuild_lock);
+
+       /*
+        * Systematically walk the metaslabs and issue rebuild I/Os for
+        * all ranges in the allocated space map.  SCL_CONFIG is held as
+        * reader on entry to each iteration and re-taken before the next.
+        */
+       for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
+               metaslab_t *msp = vd->vdev_ms[i];
+               vr->vr_scan_msp = msp;
+
+               /*
+                * Removal of vdevs from the vdev tree may eliminate the need
+                * for the rebuild, in which case it should be canceled.  The
+                * vdev_rebuild_cancel_wanted flag is set until the sync task
+                * completes.  This may be after the rebuild thread exits.
+                */
+               if (vdev_rebuild_should_cancel(vd)) {
+                       vd->vdev_rebuild_cancel_wanted = B_TRUE;
+                       error = EINTR;
+                       break;
+               }
+
+               ASSERT0(range_tree_space(vr->vr_scan_tree));
+
+               /*
+                * Disable any new allocations to this metaslab and wait
+                * for any writes inflight to complete.  This is needed to
+                * ensure all allocated ranges are rebuilt.
+                */
+               metaslab_disable(msp);
+               spa_config_exit(spa, SCL_CONFIG, FTAG);
+               txg_wait_synced(dsl, 0);
+
+               mutex_enter(&msp->ms_sync_lock);
+               mutex_enter(&msp->ms_lock);
+
+               /*
+                * When a metaslab has been allocated from read its allocated
+                * ranges from the space map object in to the vr_scan_tree.
+                * Then add inflight / unflushed ranges and remove inflight /
+                * unflushed frees.  This is the minimum range to be rebuilt.
+                *
+                * NOTE(review): the inner loop index below shadows the
+                * metaslab index 'i' of the enclosing loop.  The outer index
+                * is not referenced inside that inner loop, so behavior is
+                * unaffected, but a distinct name would avoid confusion.
+                */
+               if (msp->ms_sm != NULL) {
+                       VERIFY0(space_map_load(msp->ms_sm,
+                           vr->vr_scan_tree, SM_ALLOC));
+
+                       for (int i = 0; i < TXG_SIZE; i++) {
+                               ASSERT0(range_tree_space(
+                                   msp->ms_allocating[i]));
+                       }
+
+                       range_tree_walk(msp->ms_unflushed_allocs,
+                           range_tree_add, vr->vr_scan_tree);
+                       range_tree_walk(msp->ms_unflushed_frees,
+                           range_tree_remove, vr->vr_scan_tree);
+
+                       /*
+                        * Remove ranges which have already been rebuilt based
+                        * on the last offset.  This can happen when restarting
+                        * a scan after exporting and re-importing the pool.
+                        */
+                       range_tree_clear(vr->vr_scan_tree, 0,
+                           vrp->vrp_last_offset);
+               }
+
+               mutex_exit(&msp->ms_lock);
+               mutex_exit(&msp->ms_sync_lock);
+
+               /*
+                * To provide an accurate estimate re-calculate the estimated
+                * size every 5 minutes to account for recent allocations and
+                * frees made to space maps which have not yet been rebuilt.
+                */
+               if (gethrtime() > update_est_time + SEC2NSEC(300)) {
+                       update_est_time = gethrtime();
+                       vdev_rebuild_update_bytes_est(vd, i);
+               }
+
+               /*
+                * Walk the allocated space map and issue the rebuild I/O.
+                * The scan tree is emptied afterwards whether or not an
+                * error was returned, and the metaslab is re-enabled before
+                * the error is acted on.
+                */
+               error = vdev_rebuild_ranges(vr);
+               range_tree_vacate(vr->vr_scan_tree, NULL, NULL);
+
+               spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+               metaslab_enable(msp, B_FALSE, B_FALSE);
+
+               if (error != 0)
+                       break;
+       }
+
+       range_tree_destroy(vr->vr_scan_tree);
+       spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+       /* Wait for any remaining rebuild I/O to complete */
+       mutex_enter(&vd->vdev_rebuild_io_lock);
+       while (vd->vdev_rebuild_inflight > 0)
+               cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock);
+
+       mutex_exit(&vd->vdev_rebuild_io_lock);
+
+       spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+       dsl_pool_t *dp = spa_get_dsl(spa);
+       dmu_tx_t *tx = dmu_tx_create_dd(dp->dp_mos_dir);
+       VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+
+       mutex_enter(&vd->vdev_rebuild_lock);
+       if (error == 0) {
+               /*
+                * After a successful rebuild clear the DTLs of all ranges
+                * which were missing when the rebuild was started.  These
+                * ranges must have been rebuilt as a consequence of rebuilding
+                * all allocated space.  Note that unlike a scrub or resilver
+                * the rebuild operation will reconstruct data only referenced
+                * by a pool checkpoint.  See the dsl_scan_done() comments.
+                */
+               dsl_sync_task_nowait(dp, vdev_rebuild_complete_sync,
+                   (void *)(uintptr_t)vd->vdev_id, 0,
+                   ZFS_SPACE_CHECK_NONE, tx);
+       } else if (vd->vdev_rebuild_cancel_wanted) {
+               /*
+                * The rebuild operation was canceled.  This will occur when
+                * a device participating in the rebuild is detached.
+                */
+               dsl_sync_task_nowait(dp, vdev_rebuild_cancel_sync,
+                   (void *)(uintptr_t)vd->vdev_id, 0,
+                   ZFS_SPACE_CHECK_NONE, tx);
+       } else if (vd->vdev_rebuild_reset_wanted) {
+               /*
+                * Reset the running rebuild without canceling and restarting
+                * it.  This will occur when a new device is attached and must
+                * participate in the rebuild.
+                */
+               dsl_sync_task_nowait(dp, vdev_rebuild_reset_sync,
+                   (void *)(uintptr_t)vd->vdev_id, 0,
+                   ZFS_SPACE_CHECK_NONE, tx);
+       } else {
+               /*
+                * The rebuild operation should be suspended.  This may occur
+                * when detaching a child vdev or when exporting the pool.  The
+                * rebuild is left in the active state so it will be resumed.
+                * This is the only branch that clears vdev_rebuilding here;
+                * the other branches leave that to their sync tasks.
+                */
+               ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE);
+               vd->vdev_rebuilding = B_FALSE;
+       }
+
+       dmu_tx_commit(tx);
+
+       vd->vdev_rebuild_thread = NULL;
+       mutex_exit(&vd->vdev_rebuild_lock);
+       spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+       cv_broadcast(&vd->vdev_rebuild_cv);
+}
+
+/*
+ * Returns B_TRUE if any top-level vdev is rebuilding.
+ *
+ * When vd is the pool's root vdev each child is checked in turn and the
+ * first B_TRUE result is returned.  For any other vdev that has a
+ * top-level ZAP the result is whether its recorded rebuild state is
+ * VDEV_REBUILD_ACTIVE, read under vdev_rebuild_lock.  A vdev without a
+ * top-level ZAP reports B_FALSE.  Note the check is on the recorded
+ * state, not on the vdev_rebuilding flag.
+ */
+boolean_t
+vdev_rebuild_active(vdev_t *vd)
+{
+       spa_t *spa = vd->vdev_spa;
+       boolean_t ret = B_FALSE;
+
+       if (vd == spa->spa_root_vdev) {
+               for (uint64_t i = 0; i < vd->vdev_children; i++) {
+                       ret = vdev_rebuild_active(vd->vdev_child[i]);
+                       if (ret)
+                               return (ret);
+               }
+       } else if (vd->vdev_top_zap != 0) {
+               vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+               vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+               mutex_enter(&vd->vdev_rebuild_lock);
+               ret = (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE);
+               mutex_exit(&vd->vdev_rebuild_lock);
+       }
+
+       return (ret);
+}
+
+/*
+ * Start a rebuild operation.  The rebuild may be restarted when the
+ * top-level vdev is currently actively rebuilding.
+ *
+ * vd must be a concrete top-level vdev which is not being removed, and
+ * the device_rebuild feature must be enabled; all of this is asserted.
+ * Under vdev_rebuild_lock, a vdev already marked as rebuilding only has
+ * vdev_rebuild_reset_wanted set; otherwise vdev_rebuild_initiate() is
+ * called.
+ */
+void
+vdev_rebuild(vdev_t *vd)
+{
+       vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+       vdev_rebuild_phys_t *vrp __maybe_unused = &vr->vr_rebuild_phys;
+
+       ASSERT(vd->vdev_top == vd);
+       ASSERT(vdev_is_concrete(vd));
+       ASSERT(!vd->vdev_removing);
+       ASSERT(spa_feature_is_enabled(vd->vdev_spa,
+           SPA_FEATURE_DEVICE_REBUILD));
+
+       mutex_enter(&vd->vdev_rebuild_lock);
+       if (vd->vdev_rebuilding) {
+               ASSERT3U(vrp->vrp_rebuild_state, ==, VDEV_REBUILD_ACTIVE);
+
+               /*
+                * Signal a running rebuild operation that it should restart
+                * from the beginning because a new device was attached.  The
+                * vdev_rebuild_reset_wanted flag is set until the sync task
+                * completes.  This may be after the rebuild thread exits.
+                */
+               if (!vd->vdev_rebuild_reset_wanted)
+                       vd->vdev_rebuild_reset_wanted = B_TRUE;
+       } else {
+               vdev_rebuild_initiate(vd);
+       }
+       mutex_exit(&vd->vdev_rebuild_lock);
+}
+
+static void
+vdev_rebuild_restart_impl(vdev_t *vd)
+{
+       spa_t *spa = vd->vdev_spa;
+
+       if (vd == spa->spa_root_vdev) {
+               for (uint64_t i = 0; i < vd->vdev_children; i++)
+                       vdev_rebuild_restart_impl(vd->vdev_child[i]);
+
+       } else if (vd->vdev_top_zap != 0) {
+               vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
+               vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+               mutex_enter(&vd->vdev_rebuild_lock);
+               if (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE &&
+                   vdev_writeable(vd) && !vd->vdev_rebuilding) {
+                       ASSERT(spa_feature_is_active(spa,
+                           SPA_FEATURE_DEVICE_REBUILD));
+                       vd->vdev_rebuilding = B_TRUE;
+                       vd->vdev_rebuild_thread = thread_create(NULL, 0,
+                           vdev_rebuild_thread, vd, 0, &p0, TS_RUN,
+                           maxclsyspri);
+               }
+               mutex_exit(&vd->vdev_rebuild_lock);
+       }
+}
+
+/*
+ * Conditionally restart all of the vdev_rebuild_thread's for a pool.  The
+ * feature flag must be active and the rebuild in the active state.  This
+ * cannot be used to start a new rebuild.
+ *
+ * The work is done by vdev_rebuild_restart_impl() above, which recurses
+ * from the root vdev into its children.  For each vdev with a top-level
+ * ZAP it takes vdev_rebuild_lock and creates a vdev_rebuild_thread only
+ * when the recorded state is VDEV_REBUILD_ACTIVE, the vdev is writeable
+ * and it is not already marked as rebuilding.  The caller must hold
+ * spa_namespace_lock.
+ */
+void
+vdev_rebuild_restart(spa_t *spa)
+{
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+       vdev_rebuild_restart_impl(spa->spa_root_vdev);
+}
+
+/*
+ * Stop and wait for all of the vdev_rebuild_thread's associated with the
+ * vdev tree provided to be terminated (canceled or stopped).
+ *
+ * When vd is the root vdev every child is processed in turn.  For any
+ * other vdev with a top-level ZAP that currently has a rebuild thread,
+ * vdev_rebuild_exit_wanted is set, the caller waits on vdev_rebuild_cv
+ * until vdev_rebuilding is cleared, and the flag is then reset.  The
+ * caller must hold spa_namespace_lock.
+ */
+void
+vdev_rebuild_stop_wait(vdev_t *vd)
+{
+       spa_t *spa = vd->vdev_spa;
+
+       ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+       if (vd == spa->spa_root_vdev) {
+               for (uint64_t i = 0; i < vd->vdev_children; i++)
+                       vdev_rebuild_stop_wait(vd->vdev_child[i]);
+
+       } else if (vd->vdev_top_zap != 0) {
+               ASSERT(vd == vd->vdev_top);
+
+               mutex_enter(&vd->vdev_rebuild_lock);
+               if (vd->vdev_rebuild_thread != NULL) {
+                       vd->vdev_rebuild_exit_wanted = B_TRUE;
+                       while (vd->vdev_rebuilding) {
+                               cv_wait(&vd->vdev_rebuild_cv,
+                                   &vd->vdev_rebuild_lock);
+                       }
+                       vd->vdev_rebuild_exit_wanted = B_FALSE;
+               }
+               mutex_exit(&vd->vdev_rebuild_lock);
+       }
+}
+
+/*
+ * Stop all rebuild operations but leave them in the active state so they
+ * will be resumed when importing the pool.
+ *
+ * This simply calls vdev_rebuild_stop_wait() on the pool's root vdev, so
+ * that function's requirement that spa_namespace_lock be held applies to
+ * callers of this one as well.
+ */
+void
+vdev_rebuild_stop_all(spa_t *spa)
+{
+       vdev_rebuild_stop_wait(spa->spa_root_vdev);
+}
+
+/*
+ * Rebuild statistics reported per top-level vdev.
+ *
+ * Fills in vrs for the top-level vdev tvd.  Returns ENOTSUP when the
+ * device_rebuild feature is not enabled and EINVAL when tvd is not a
+ * top-level vdev or has no top-level ZAP.  When the ZAP holds no
+ * VDEV_TOP_ZAP_VDEV_REBUILD_PHYS entry, vrs is zeroed, its state is set
+ * to VDEV_REBUILD_NONE and 0 is returned.  When the entry exists the
+ * values are copied from the in-memory rebuild state under
+ * vdev_rebuild_lock and 0 is returned.  Any other zap_contains() error
+ * is returned with vrs left untouched.
+ *
+ * vrs_pass_time_ms is computed at call time as the time elapsed since
+ * vr_pass_start_time.
+ */
+int
+vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs)
+{
+       spa_t *spa = tvd->vdev_spa;
+
+       if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
+               return (SET_ERROR(ENOTSUP));
+
+       if (tvd != tvd->vdev_top || tvd->vdev_top_zap == 0)
+               return (SET_ERROR(EINVAL));
+
+       int error = zap_contains(spa_meta_objset(spa),
+           tvd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS);
+
+       if (error == ENOENT) {
+               bzero(vrs, sizeof (vdev_rebuild_stat_t));
+               vrs->vrs_state = VDEV_REBUILD_NONE;
+               error = 0;
+       } else if (error == 0) {
+               vdev_rebuild_t *vr = &tvd->vdev_rebuild_config;
+               vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+               mutex_enter(&tvd->vdev_rebuild_lock);
+               vrs->vrs_state = vrp->vrp_rebuild_state;
+               vrs->vrs_start_time = vrp->vrp_start_time;
+               vrs->vrs_end_time = vrp->vrp_end_time;
+               vrs->vrs_scan_time_ms = vrp->vrp_scan_time_ms;
+               vrs->vrs_bytes_scanned = vrp->vrp_bytes_scanned;
+               vrs->vrs_bytes_issued = vrp->vrp_bytes_issued;
+               vrs->vrs_bytes_rebuilt = vrp->vrp_bytes_rebuilt;
+               vrs->vrs_bytes_est = vrp->vrp_bytes_est;
+               vrs->vrs_errors = vrp->vrp_errors;
+               vrs->vrs_pass_time_ms = NSEC2MSEC(gethrtime() -
+                   vr->vr_pass_start_time);
+               vrs->vrs_pass_bytes_scanned = vr->vr_pass_bytes_scanned;
+               vrs->vrs_pass_bytes_issued = vr->vr_pass_bytes_issued;
+               mutex_exit(&tvd->vdev_rebuild_lock);
+       }
+
+       return (error);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, ULONG, ZMOD_RW,
+        "Max segment size in bytes of rebuild reads");
+/* END CSTYLED */
index 4122114b5619fbd1eabd0ce4f4901d58ca41557f..1d2ae6270546b0c40838b126920bfcdd7fe8ab87 100644 (file)
@@ -1938,8 +1938,9 @@ static int
 zfs_ioc_vdev_attach(zfs_cmd_t *zc)
 {
        spa_t *spa;
-       int replacing = zc->zc_cookie;
        nvlist_t *config;
+       int replacing = zc->zc_cookie;
+       int rebuild = zc->zc_simple;
        int error;
 
        if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
@@ -1947,7 +1948,8 @@ zfs_ioc_vdev_attach(zfs_cmd_t *zc)
 
        if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size,
            zc->zc_iflags, &config)) == 0) {
-               error = spa_vdev_attach(spa, zc->zc_guid, config, replacing);
+               error = spa_vdev_attach(spa, zc->zc_guid, config, replacing,
+                   rebuild);
                nvlist_free(config);
        }
 
index 765ffea8a302928f6e36e633c3c475491e2ee7f9..f6478dd0d2924ba4df9964d0dbb3149e0c6b5410 100644 (file)
@@ -487,7 +487,8 @@ tests = ['zpool_wait_discard', 'zpool_wait_freeing',
 tags = ['functional', 'cli_root', 'zpool_wait']
 
 [tests/functional/cli_root/zpool_wait/scan]
-tests = ['zpool_wait_replace_cancel', 'zpool_wait_resilver', 'zpool_wait_scrub_cancel',
+tests = ['zpool_wait_replace_cancel', 'zpool_wait_rebuild',
+    'zpool_wait_resilver', 'zpool_wait_scrub_cancel',
     'zpool_wait_replace', 'zpool_wait_scrub_basic', 'zpool_wait_scrub_flag']
 tags = ['functional', 'cli_root', 'zpool_wait']
 
@@ -748,7 +749,11 @@ tests = ['rename_dirs_001_pos']
 tags = ['functional', 'rename_dirs']
 
 [tests/functional/replacement]
-tests = ['replacement_001_pos', 'replacement_002_pos', 'replacement_003_pos']
+tests = ['attach_import', 'attach_multiple', 'attach_rebuild',
+    'attach_resilver', 'detach', 'rebuild_disabled_feature',
+    'rebuild_multiple', 'rebuild_raidz', 'replace_import', 'replace_rebuild',
+    'replace_resilver', 'resilver_restart_001', 'resilver_restart_002',
+    'scrub_cancel']
 tags = ['functional', 'replacement']
 
 [tests/functional/reservation]
@@ -762,10 +767,6 @@ tests = ['reservation_001_pos', 'reservation_002_pos', 'reservation_003_pos',
     'reservation_022_pos']
 tags = ['functional', 'reservation']
 
-[tests/functional/resilver]
-tests = ['resilver_restart_001', 'resilver_restart_002']
-tags = ['functional', 'resilver']
-
 [tests/functional/rootpool]
 tests = ['rootpool_002_neg', 'rootpool_003_neg', 'rootpool_007_pos']
 tags = ['functional', 'rootpool']
index 9fbcc37c610c1ab31f5b886f125f87fe13834476..5e07cda4d7912cfbfd9d42dc12a13439a1db6b6d 100644 (file)
@@ -2222,26 +2222,27 @@ function check_pool_status # pool token keyword <verbose>
        if [[ $verbose == true ]]; then
                log_note $scan
        fi
-       echo $scan | grep -i "$keyword" > /dev/null 2>&1
+       echo $scan | egrep -i "$keyword" > /dev/null 2>&1
 
        return $?
 }
 
 #
 # The following functions are instance of check_pool_status()
-#      is_pool_resilvering - to check if the pool is resilver in progress
-#      is_pool_resilvered - to check if the pool is resilver completed
-#      is_pool_scrubbing - to check if the pool is scrub in progress
-#      is_pool_scrubbed - to check if the pool is scrub completed
-#      is_pool_scrub_stopped - to check if the pool is scrub stopped
-#      is_pool_scrub_paused - to check if the pool has scrub paused
-#      is_pool_removing - to check if the pool is removing a vdev
-#      is_pool_removed - to check if the pool is remove completed
-#      is_pool_discarding - to check if the pool has checkpoint being discarded
+#      is_pool_resilvering - to check if the pool resilver is in progress
+#      is_pool_resilvered - to check if the pool resilver is completed
+#      is_pool_scrubbing - to check if the pool scrub is in progress
+#      is_pool_scrubbed - to check if the pool scrub is completed
+#      is_pool_scrub_stopped - to check if the pool scrub is stopped
+#      is_pool_scrub_paused - to check if the pool scrub has paused
+#      is_pool_removing - to check if the pool is removing a vdev
+#      is_pool_removed - to check if the pool remove is completed
+#      is_pool_discarding - to check if the pool checkpoint is being discarded
 #
 function is_pool_resilvering #pool <verbose>
 {
-       check_pool_status "$1" "scan" "resilver in progress since " $2
+       check_pool_status "$1" "scan" \
+           "resilver[ ()0-9A-Za-z_-]* in progress since" $2
        return $?
 }
 
@@ -3487,7 +3488,7 @@ function wait_scrubbed
        typeset pool=${1:-$TESTPOOL}
        while true ; do
                is_pool_scrubbed $pool && break
-               log_must sleep 1
+               sleep 1
        done
 }
 
index 24f3e50bb5c6801e05c06051e5cb4c1f7c339a4a..c56518c55a03df2ed3788e85e7a2b2f39ceaaf46 100644 (file)
@@ -65,7 +65,6 @@ SUBDIRS = \
        rename_dirs \
        replacement \
        reservation \
-       resilver \
        rootpool \
        rsend \
        scrub_mirror \
index ee5b2b4e17403e4045233b9f665c435cc224fb39..4991b76bfa575e788e2548f2e4a0bf942a494edd 100644 (file)
@@ -79,6 +79,7 @@ typeset -a properties=(
     "feature@redacted_datasets"
     "feature@bookmark_written"
     "feature@log_spacemap"
+    "feature@device_rebuild"
 )
 
 if is_linux || is_freebsd; then
index 6a21cac4fdae3b340bec56a9707789cf6ecd7090..451d83a79aa68ca6edac8f7941ce7f094ec9bc65 100644 (file)
@@ -4,6 +4,7 @@ dist_pkgdata_SCRIPTS = \
        cleanup.ksh \
        zpool_wait_replace.ksh  \
        zpool_wait_replace_cancel.ksh \
+       zpool_wait_rebuild.ksh \
        zpool_wait_resilver.ksh  \
        zpool_wait_scrub_basic.ksh \
        zpool_wait_scrub_cancel.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_rebuild.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_rebuild.ksh
new file mode 100755 (executable)
index 0000000..8cd5864
--- /dev/null
@@ -0,0 +1,64 @@
+#!/bin/ksh -p
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib
+
+#
+# DESCRIPTION:
+# 'zpool wait' works when waiting for sequential resilvering to complete.
+#
+# STRATEGY:
+# 1. Attach a device to the pool so that sequential resilvering starts.
+# 2. Start 'zpool wait'.
+# 3. Monitor the waiting process to make sure it returns neither too soon nor
+#    too late.
+# 4. Repeat 1-3, except using the '-w' flag with 'zpool attach' instead of using
+#    'zpool wait'.
+#
+
+function cleanup # registered with log_onexit; detaches $DISK2 if still attached
+{
+       remove_io_delay
+       kill_if_running $pid
+       get_disklist $TESTPOOL | grep $DISK2 >/dev/null && \
+           log_must zpool detach $TESTPOOL $DISK2
+}
+
+typeset -r IN_PROGRESS_CHECK="is_pool_resilvering $TESTPOOL"
+typeset pid
+
+log_onexit cleanup
+
+add_io_delay $TESTPOOL
+
+# Test 'zpool wait -t resilver'
+log_must zpool attach -s $TESTPOOL $DISK1 $DISK2
+log_bkgrnd zpool wait -t resilver $TESTPOOL
+pid=$!
+check_while_waiting $pid "$IN_PROGRESS_CHECK"
+
+log_must zpool detach $TESTPOOL $DISK2
+
+# Test 'zpool attach -w'
+log_bkgrnd zpool attach -sw $TESTPOOL $DISK1 $DISK2
+pid=$!
+while ! is_pool_resilvering $TESTPOOL && proc_exists $pid; do
+       log_must sleep .5
+done
+check_while_waiting $pid "$IN_PROGRESS_CHECK"
+
+log_pass "'zpool wait -t resilver' and 'zpool attach -w' work."
index d47fcd5e1b24daed827b3b2561d44f3ddc506cb4..fe6e4912198df71316a8de71258361f40a9ba4ae 100644 (file)
@@ -2,9 +2,20 @@ pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/replacement
 dist_pkgdata_SCRIPTS = \
        setup.ksh \
        cleanup.ksh \
-       replacement_001_pos.ksh \
-       replacement_002_pos.ksh \
-       replacement_003_pos.ksh
+       attach_import.ksh \
+       attach_multiple.ksh \
+       attach_rebuild.ksh \
+       attach_resilver.ksh \
+       detach.ksh \
+       rebuild_disabled_feature.ksh \
+       rebuild_multiple.ksh \
+       rebuild_raidz.ksh \
+       replace_import.ksh \
+       replace_rebuild.ksh \
+       replace_resilver.ksh \
+       resilver_restart_001.ksh \
+       resilver_restart_002.ksh \
+       scrub_cancel.ksh
 
 dist_pkgdata_DATA = \
        replacement.cfg
diff --git a/tests/zfs-tests/tests/functional/replacement/attach_import.ksh b/tests/zfs-tests/tests/functional/replacement/attach_import.ksh
new file mode 100755 (executable)
index 0000000..e2749b1
--- /dev/null
@@ -0,0 +1,67 @@
+#!/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2019, Datto Inc. All rights reserved.
+# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/replacement/replacement.cfg
+
+#
+# Description:
+# Verify that on import an in progress attach operation is resumed.
+#
+# Strategy:
+# 1. For both healing and sequential resilvering.
+#    a. Create a pool
+#    b. Add a vdev with 'zpool attach' and resilver (-s) it.
+#    c. Export the pool
+#    d. Import the pool
+#    e. Verify the 'zpool attach' resumed resilvering
+#    f. Destroy the pool
+#
+
+function cleanup
+{
+       log_must set_tunable32 SCAN_SUSPEND_PROGRESS \
+           $ORIG_SCAN_SUSPEND_PROGRESS
+       destroy_pool $TESTPOOL1
+       rm -f ${VDEV_FILES[@]}
+}
+
+log_assert "Verify attach is resumed on import"
+
+ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS)
+
+log_onexit cleanup
+
+log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]}
+
+# Verify healing and sequential resilver resume on import.
+for arg in "" "-s"; do
+       log_must zpool create -f $TESTPOOL1 ${VDEV_FILES[0]}
+       log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
+       log_must zpool attach $arg $TESTPOOL1 ${VDEV_FILES[0]} ${VDEV_FILES[1]}
+       log_must is_pool_resilvering $TESTPOOL1
+       log_must zpool export $TESTPOOL1
+       log_must zpool import -d $TEST_BASE_DIR $TESTPOOL1
+       log_must is_pool_resilvering $TESTPOOL1
+       log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS
+       log_must zpool wait -t resilver $TESTPOOL1
+       log_must is_pool_resilvered $TESTPOOL1
+       destroy_pool $TESTPOOL1
+done
+
+log_pass "Verify attach is resumed on import"
diff --git a/tests/zfs-tests/tests/functional/replacement/attach_multiple.ksh b/tests/zfs-tests/tests/functional/replacement/attach_multiple.ksh
new file mode 100755 (executable)
index 0000000..b3192b2
--- /dev/null
@@ -0,0 +1,111 @@
+#!/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2019, Datto Inc. All rights reserved.
+# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/replacement/replacement.cfg
+
+#
+# Description:
+# Verify that attach/detach work while resilvering and attaching
+# multiple vdevs.
+#
+# Strategy:
+# 1. Create a single vdev pool
+# 2. While healing or sequential resilvering:
+#    a. Attach a vdev to convert the pool to a mirror.
+#    b. Attach a vdev to convert the pool to a 3-way mirror.
+#    c. Verify the original vdev cannot be removed (no redundant copies)
+#    d. Detach a vdev.  Healing and sequential resilver remain running.
+#    e. Detach a vdev.  Healing resilver remains running, sequential
+#       resilver is canceled.
+#    f. Wait for resilver to complete.
+#
+
+function cleanup
+{
+       log_must set_tunable32 SCAN_SUSPEND_PROGRESS \
+           $ORIG_SCAN_SUSPEND_PROGRESS
+       destroy_pool $TESTPOOL1
+       rm -f ${VDEV_FILES[@]}
+}
+
+log_assert "Verify attach/detach with multiple vdevs"
+
+ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS)
+
+log_onexit cleanup
+
+log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]}
+
+# Start from a single vdev pool; each pass below attaches and detaches vdevs.
+log_must zpool create -f $TESTPOOL1 ${VDEV_FILES[0]}
+
+for replace_mode in "healing" "sequential"; do
+        #
+        # Resilvers abort the dsl_scan and reconfigure it for resilvering.
+        # Rebuilds cancel the dsl_scan and start the vdev_rebuild thread.
+        #
+        if [[ "$replace_mode" = "healing" ]]; then
+                flags=""
+        else
+                flags="-s"
+        fi
+
+       log_mustnot is_pool_resilvering $TESTPOOL1
+       log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
+
+       # Attach first vdev (stripe -> mirror)
+       log_must zpool attach $flags $TESTPOOL1 \
+           ${VDEV_FILES[0]} ${VDEV_FILES[1]}
+       log_must is_pool_resilvering $TESTPOOL1
+
+       # Attach second vdev (2-way -> 3-way mirror)
+       log_must zpool attach $flags $TESTPOOL1 \
+           ${VDEV_FILES[1]} ${VDEV_FILES[2]}
+       log_must is_pool_resilvering $TESTPOOL1
+
+       # Original vdev cannot be detached until there is sufficient redundancy.
+       log_mustnot zpool detach $TESTPOOL1 ${VDEV_FILES[0]}
+
+       # Detach first vdev (resilver keeps running)
+       log_must zpool detach $TESTPOOL1 ${VDEV_FILES[1]}
+       log_must is_pool_resilvering $TESTPOOL1
+
+       #
+       # Detach second vdev.  There's a difference in behavior between
+       # healing and sequential resilvers.  A healing resilver will not be
+       # cancelled even though there's nothing on the original vdev which
+       # needs to be rebuilt.  A sequential resilver on the other hand is
+       # canceled when returning to a non-redundant striped layout.  At
+       # some point the healing resilver behavior should be updated to match
+       # the sequential resilver behavior.
+       #
+       log_must zpool detach $TESTPOOL1 ${VDEV_FILES[2]}
+
+        if [[ "$replace_mode" = "healing" ]]; then
+               log_must is_pool_resilvering $TESTPOOL1
+        else
+               log_mustnot is_pool_resilvering $TESTPOOL1
+        fi
+
+       log_must set_tunable32 SCAN_SUSPEND_PROGRESS \
+           $ORIG_SCAN_SUSPEND_PROGRESS
+       log_must zpool wait $TESTPOOL1
+done
+
+log_pass "Verify attach/detach with multiple vdevs"
diff --git a/tests/zfs-tests/tests/functional/replacement/attach_rebuild.ksh b/tests/zfs-tests/tests/functional/replacement/attach_rebuild.ksh
new file mode 100755 (executable)
index 0000000..e9427c7
--- /dev/null
@@ -0,0 +1,173 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/replacement/replacement.cfg
+
+#
+# DESCRIPTION:
+#      Attaching disks during I/O should pass for supported pools.
+#
+# STRATEGY:
+#      1. Create multidisk pools (stripe/mirror/raidz) and
+#         start some random I/O
+#      2. Attach a disk to the pool.
+#      3. Verify the integrity of the file system and the resilvering.
+#
+# NOTE: Raidz does not support the sequential resilver (-s) option.
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+       if [[ -n "$child_pids" ]]; then
+               for wait_pid in $child_pids; do
+                       kill $wait_pid
+               done
+       fi
+
+       if poolexists $TESTPOOL1; then
+               destroy_pool $TESTPOOL1
+       fi
+
+       [[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/*
+}
+
+log_assert "Replacing a disk during I/O completes."
+
+options=""
+options_display="default options"
+
+log_onexit cleanup
+
+[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE "
+
+[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE "
+
+[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT "
+
+[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED "
+
+[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET "
+
+options="$options -r "
+
+[[ -n "$options" ]] && options_display=$options
+
+child_pids=""
+
+function attach_test
+{
+       typeset -i iters=2
+       typeset -i index=0
+       typeset opt=$1
+       typeset disk1=$2
+       typeset disk2=$3
+
+       typeset i=0
+       while [[ $i -lt $iters ]]; do
+               log_note "Invoking file_trunc with: $options_display"
+               file_trunc $options $TESTDIR/$TESTFILE.$i &
+               typeset pid=$!
+
+               sleep 1
+
+               child_pids="$child_pids $pid"
+               ((i = i + 1))
+       done
+
+       log_must zpool attach -sw $opt $TESTPOOL1 $disk1 $disk2
+
+       for wait_pid in $child_pids; do
+               kill $wait_pid
+       done
+       child_pids=""
+
+       log_must zpool export $TESTPOOL1
+       log_must zpool import -d $TESTDIR $TESTPOOL1
+       log_must zfs umount $TESTPOOL1/$TESTFS1
+       log_must zdb -cdui $TESTPOOL1/$TESTFS1
+       log_must zfs mount $TESTPOOL1/$TESTFS1
+       verify_pool $TESTPOOL1
+}
+
+specials_list=""
+i=0
+while [[ $i != 3 ]]; do
+       truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i
+       specials_list="$specials_list $TESTDIR/$TESTFILE1.$i"
+
+       ((i = i + 1))
+done
+
+#
+# Create a replacement disk special file.
+#
+truncate -s $MINVDEVSIZE $TESTDIR/$REPLACEFILE
+
+for opt in "" "-f"; do
+       create_pool $TESTPOOL1 mirror $specials_list
+       log_must zfs create $TESTPOOL1/$TESTFS1
+       log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1
+       # $opt is "" on the first pass and "-f" on the second.
+       attach_test "$opt" $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE
+
+       zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE"
+       if [[ $? -ne 0 ]]; then
+               log_fail "$REPLACEFILE is not present."
+       fi
+
+       destroy_pool $TESTPOOL1
+done
+
+log_note "Verify 'zpool attach' fails with non-mirrors."
+
+for type in "raidz" "raidz1"; do
+       for opt in "" "-f"; do
+               create_pool $TESTPOOL1 $type $specials_list
+               log_must zfs create $TESTPOOL1/$TESTFS1
+               log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1
+               # Name the pool so the failure comes from the raidz layout.
+               log_mustnot zpool attach -s $opt $TESTPOOL1 \
+                   $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE
+
+               zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE"
+               if [[ $? -eq 0 ]]; then
+                       log_fail "$REPLACEFILE should not be present."
+               fi
+
+               destroy_pool $TESTPOOL1
+       done
+done
+
+log_pass
diff --git a/tests/zfs-tests/tests/functional/replacement/attach_resilver.ksh b/tests/zfs-tests/tests/functional/replacement/attach_resilver.ksh
new file mode 100755 (executable)
index 0000000..4261d4d
--- /dev/null
@@ -0,0 +1,172 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/replacement/replacement.cfg
+
+#
+# DESCRIPTION:
+#      Attaching disks during I/O should pass for supported pools.
+#
+# STRATEGY:
+#      1. Create multidisk pools (stripe/mirror/raidz) and
+#         start some random I/O
+#      2. Attach a disk to the pool.
+#      3. Verify the integrity of the file system and the resilvering.
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+       if [[ -n "$child_pids" ]]; then
+               for wait_pid in $child_pids
+               do
+                       kill $wait_pid
+               done
+       fi
+
+       if poolexists $TESTPOOL1; then
+               destroy_pool $TESTPOOL1
+       fi
+
+       [[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/*
+}
+
+log_assert "Replacing a disk during I/O completes."
+
+options=""
+options_display="default options"
+
+log_onexit cleanup
+
+[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE "
+
+[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE "
+
+[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT "
+
+[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED "
+
+[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET "
+
+options="$options -r "
+
+[[ -n "$options" ]] && options_display=$options
+
+child_pids=""
+
+function attach_test
+{
+       typeset -i iters=2
+       typeset -i index=0
+       typeset opt=$1
+       typeset disk1=$2
+       typeset disk2=$3
+
+       typeset i=0
+       while [[ $i -lt $iters ]]; do
+               log_note "Invoking file_trunc with: $options_display"
+               file_trunc $options $TESTDIR/$TESTFILE.$i &
+               typeset pid=$!
+
+               sleep 1
+
+               child_pids="$child_pids $pid"
+               ((i = i + 1))
+       done
+
+       log_must zpool attach -w $opt $TESTPOOL1 $disk1 $disk2
+
+       for wait_pid in $child_pids
+       do
+               kill $wait_pid
+       done
+       child_pids=""
+
+        log_must zpool export $TESTPOOL1
+        log_must zpool import -d $TESTDIR $TESTPOOL1
+        log_must zfs umount $TESTPOOL1/$TESTFS1
+        log_must zdb -cdui $TESTPOOL1/$TESTFS1
+        log_must zfs mount $TESTPOOL1/$TESTFS1
+       verify_pool $TESTPOOL1
+}
+
+specials_list=""
+i=0
+while [[ $i != 3 ]]; do
+       truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i
+       specials_list="$specials_list $TESTDIR/$TESTFILE1.$i"
+
+       ((i = i + 1))
+done
+
+#
+# Create a replacement disk special file.
+#
+truncate -s $MINVDEVSIZE $TESTDIR/$REPLACEFILE
+
+for opt in "" "-f"; do
+       create_pool $TESTPOOL1 mirror $specials_list
+       log_must zfs create $TESTPOOL1/$TESTFS1
+       log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1
+       # $opt is "" on the first pass and "-f" on the second.
+       attach_test "$opt" $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE
+
+       zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE"
+       if [[ $? -ne 0 ]]; then
+               log_fail "$REPLACEFILE is not present."
+       fi
+
+       destroy_pool $TESTPOOL1
+done
+
+log_note "Verify 'zpool attach' fails with non-mirrors."
+
+for type in "raidz" "raidz1"; do
+       for opt in "" "-f"; do
+               create_pool $TESTPOOL1 $type $specials_list
+               log_must zfs create $TESTPOOL1/$TESTFS1
+               log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1
+               # Name the pool so the failure comes from the raidz layout.
+               log_mustnot zpool attach $opt $TESTPOOL1 \
+                   $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE
+
+               zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE"
+               if [[ $? -eq 0 ]]; then
+                       log_fail "$REPLACEFILE should not be present."
+               fi
+
+               destroy_pool $TESTPOOL1
+       done
+done
+
+log_pass
diff --git a/tests/zfs-tests/tests/functional/replacement/detach.ksh b/tests/zfs-tests/tests/functional/replacement/detach.ksh
new file mode 100755 (executable)
index 0000000..aa3ec4f
--- /dev/null
@@ -0,0 +1,161 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/replacement/replacement.cfg
+
+#
+# DESCRIPTION:
+#      Detaching disks during I/O should pass for supported pools.
+#
+# STRATEGY:
+#      1. Create multidisk pools (stripe/mirror/raidz) and
+#         start some random I/O
+#      2. Detach a disk from the pool.
+#      3. Verify the integrity of the file system and the resilvering.
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+       if [[ -n "$child_pids" ]]; then
+               for wait_pid in $child_pids
+               do
+                       kill $wait_pid
+               done
+       fi
+
+       if poolexists $TESTPOOL1; then
+               destroy_pool $TESTPOOL1
+       fi
+
+       [[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/*
+}
+
+log_assert "Replacing a disk during I/O completes."
+
+options=""
+options_display="default options"
+
+log_onexit cleanup
+
+[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE "
+
+[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE "
+
+[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT "
+
+[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED "
+
+[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET "
+# -r is appended unconditionally, so $options is never empty below.
+options="$options -r "
+
+[[ -n "$options" ]] && options_display=$options
+
+child_pids=""
+
+function detach_test
+{
+       typeset -i iters=2
+       typeset -i index=0
+       typeset disk1=$1
+
+       typeset i=0
+       while [[ $i -lt $iters ]]; do
+               log_note "Invoking file_trunc with: $options_display"
+               file_trunc $options $TESTDIR/$TESTFILE.$i &
+               typeset pid=$!
+
+               sleep 1
+
+               child_pids="$child_pids $pid"
+               ((i = i + 1))
+       done
+
+       log_must zpool detach $TESTPOOL1 $disk1
+
+       sleep 10
+
+       for wait_pid in $child_pids
+       do
+               kill $wait_pid
+       done
+       child_pids=""
+
+        log_must zpool export $TESTPOOL1
+        log_must zpool import -d $TESTDIR $TESTPOOL1
+        log_must zfs umount $TESTPOOL1/$TESTFS1
+        log_must zdb -cdui $TESTPOOL1/$TESTFS1
+        log_must zfs mount $TESTPOOL1/$TESTFS1
+}
+
+specials_list=""
+i=0
+while [[ $i != 3 ]]; do
+       truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i
+       specials_list="$specials_list $TESTDIR/$TESTFILE1.$i"
+
+       ((i = i + 1))
+done
+
+create_pool $TESTPOOL1 mirror $specials_list
+log_must zfs create $TESTPOOL1/$TESTFS1
+log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1
+
+detach_test $TESTDIR/$TESTFILE1.1
+
+zpool iostat -v $TESTPOOL1 | grep "$TESTFILE1.1"
+if [[ $? -eq 0 ]]; then
+       log_fail "$TESTFILE1.1 should no longer be present."
+fi
+
+destroy_pool $TESTPOOL1
+
+log_note "Verify 'zpool detach' fails with non-mirrors."
+
+for type in "" "raidz" "raidz1"; do
+       create_pool $TESTPOOL1 $type $specials_list
+       log_must zfs create $TESTPOOL1/$TESTFS1
+       log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1
+       # Name the pool so the failure is due to the non-mirror layout.
+       log_mustnot zpool detach $TESTPOOL1 $TESTDIR/$TESTFILE1.1
+
+       zpool iostat -v $TESTPOOL1 | grep "$TESTFILE1.1"
+       if [[ $? -ne 0 ]]; then
+               log_fail "$TESTFILE1.1 is not present."
+       fi
+
+       destroy_pool $TESTPOOL1
+done
+
+log_pass
diff --git a/tests/zfs-tests/tests/functional/replacement/rebuild_disabled_feature.ksh b/tests/zfs-tests/tests/functional/replacement/rebuild_disabled_feature.ksh
new file mode 100755 (executable)
index 0000000..d17d83b
--- /dev/null
@@ -0,0 +1,78 @@
+#!/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2019, Datto Inc. All rights reserved.
+# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/replacement/replacement.cfg
+
+#
+# Description:
+# Verify device_rebuild feature flags.
+#
+# Strategy:
+# 1. Create a pool with all features disabled.
+# 2. Verify 'zpool replace -s' fails and the feature is disabled.
+# 3. Enable the device_rebuild feature.
+# 4. Verify 'zpool replace -s' works and the feature is active.
+# 5. Wait for the feature to return to enabled.
+#
+
+function cleanup
+{
+       log_must set_tunable32 SCAN_SUSPEND_PROGRESS \
+           $ORIG_SCAN_SUSPEND_PROGRESS
+       destroy_pool $TESTPOOL1
+       rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE
+}
+
+function check_feature_flag
+{
+       # Fail unless property $feature on $pool equals $expected_value.
+       typeset feature=$1 pool=$2 expected_value=$3
+       typeset value
+
+       # Ask for the single property instead of grepping 'zpool get all'.
+       value="$(zpool get -H -o value "$feature" $pool)"
+       if [ "$value" = "$expected_value" ]; then
+               log_note "$feature verified to be $value"
+       else
+               log_fail "$feature should be $expected_value but is $value"
+       fi
+}
+
+log_assert "Verify device_rebuild feature flags."
+
+ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS)
+
+log_onexit cleanup
+
+log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE
+log_must zpool create -d $TESTPOOL1 ${VDEV_FILES[@]}
+
+log_mustnot zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE
+check_feature_flag "feature@device_rebuild" "$TESTPOOL1" "disabled"
+
+log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
+log_must zpool set feature@device_rebuild=enabled $TESTPOOL1
+log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE
+check_feature_flag "feature@device_rebuild" "$TESTPOOL1" "active"
+
+log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS
+log_must zpool wait -t resilver $TESTPOOL1
+check_feature_flag "feature@device_rebuild" "$TESTPOOL1" "enabled"
+
+log_pass "Verify device_rebuild feature flags."
diff --git a/tests/zfs-tests/tests/functional/replacement/rebuild_multiple.ksh b/tests/zfs-tests/tests/functional/replacement/rebuild_multiple.ksh
new file mode 100755 (executable)
index 0000000..7775cbf
--- /dev/null
@@ -0,0 +1,126 @@
+#!/bin/ksh -p
+
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019, Datto Inc. All rights reserved.
+# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/replacement/replacement.cfg
+
+#
+# DESCRIPTION:
+# Sequential reconstruction (unlike healing reconstruction) operates on the
+# top-level vdev.  This means that a sequential resilver operation can be
+# started/stopped on a different top-level vdev without impacting other
+# sequential resilvers.
+#
+# STRATEGY:
+# 1. Create a mirrored pool.
+#
+
+function cleanup
+{
+       log_must set_tunable32 SCAN_SUSPEND_PROGRESS \
+           $ORIG_SCAN_SUSPEND_PROGRESS
+       destroy_pool $TESTPOOL1
+       rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE $SPARE_VDEV_FILE2
+}
+
+function check_history
+{
+       # Fail unless exactly $exp internal history records of $pool
+       # contain both "rebuild" and $msg.
+       typeset pool=$1 msg=$2 exp=$3
+       typeset count
+       count=$(zpool history -i $pool | grep "rebuild" | grep -c "$msg")
+       if [[ "$count" -ne "$exp" ]]; then
+               log_fail "Expected $exp rebuild '$msg' messages, found $count"
+       else
+               log_note "Found $count/$exp rebuild '$msg' messages"
+       fi
+}
+
+log_assert "Rebuilds operate on the top-level vdevs"
+
+ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS)
+
+log_onexit cleanup
+
+log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} \
+    $SPARE_VDEV_FILE $SPARE_VDEV_FILE2
+
+# Verify two sequential resilvers can run concurrently.
+log_must zpool create -f $TESTPOOL1 \
+    mirror ${VDEV_FILES[0]} ${VDEV_FILES[1]} \
+    mirror ${VDEV_FILES[2]} ${VDEV_FILES[3]}
+log_must zfs create $TESTPOOL1/$TESTFS
+
+mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS)
+log_must dd if=/dev/urandom of=$mntpnt/file bs=1M count=32
+log_must zpool sync $TESTPOOL1
+
+log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
+
+log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE
+log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[3]} $SPARE_VDEV_FILE2
+
+check_history $TESTPOOL1 "started" 2
+check_history $TESTPOOL1 "reset" 0
+check_history $TESTPOOL1 "complete" 0
+check_history $TESTPOOL1 "canceled" 0
+
+log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS
+log_must zpool wait -t resilver $TESTPOOL1
+
+check_history $TESTPOOL1 "complete" 2
+destroy_pool $TESTPOOL1
+
+# Verify canceling one resilver (zpool detach) does not impact others.
+log_must zpool create -f $TESTPOOL1 \
+    mirror ${VDEV_FILES[0]} ${VDEV_FILES[1]} \
+    mirror ${VDEV_FILES[2]} ${VDEV_FILES[3]}
+log_must zfs create $TESTPOOL1/$TESTFS
+
+mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS)
+log_must dd if=/dev/urandom of=$mntpnt/file bs=1M count=32
+log_must zpool sync $TESTPOOL1
+
+log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
+
+log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE
+log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[3]} $SPARE_VDEV_FILE2
+
+check_history $TESTPOOL1 "started" 2
+check_history $TESTPOOL1 "reset" 0
+check_history $TESTPOOL1 "complete" 0
+check_history $TESTPOOL1 "canceled" 0
+
+log_must zpool detach $TESTPOOL1 $SPARE_VDEV_FILE2
+
+check_history $TESTPOOL1 "complete" 0
+check_history $TESTPOOL1 "canceled" 1
+
+log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS
+log_must zpool wait -t resilver $TESTPOOL1
+
+check_history $TESTPOOL1 "complete" 1
+check_history $TESTPOOL1 "canceled" 1
+destroy_pool $TESTPOOL1
+
+log_pass "Rebuilds operate on the top-level vdevs"
diff --git a/tests/zfs-tests/tests/functional/replacement/rebuild_raidz.ksh b/tests/zfs-tests/tests/functional/replacement/rebuild_raidz.ksh
new file mode 100755 (executable)
index 0000000..c919b44
--- /dev/null
@@ -0,0 +1,70 @@
+#!/bin/ksh -p
+
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019, Datto Inc. All rights reserved.
+# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/replacement/replacement.cfg
+
+#
+# DESCRIPTION:
+# Executing 'zpool replace -s' for raidz vdevs fails.  Sequential
+# resilvers are only allowed for stripe/mirror pools.
+#
+# STRATEGY:
+# 1. Create a raidz pool, verify 'zpool replace -s' fails
+# 2. Create a stripe/mirror pool, verify 'zpool replace -s' passes
+#
+
+function cleanup
+{
+       log_must set_tunable32 SCAN_SUSPEND_PROGRESS \
+           $ORIG_SCAN_SUSPEND_PROGRESS
+       destroy_pool $TESTPOOL1
+       rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE
+}
+
+log_assert "Sequential resilver is not allowed for raidz vdevs"
+
+ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS)
+
+log_onexit cleanup
+
+log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE
+
+# raidz[1-3]
+for vdev_type in "raidz" "raidz2" "raidz3"; do
+       log_must zpool create -f $TESTPOOL1 $vdev_type ${VDEV_FILES[@]}
+       log_mustnot zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} \
+           $SPARE_VDEV_FILE
+       destroy_pool $TESTPOOL1
+done
+
+# stripe
+log_must zpool create $TESTPOOL1 ${VDEV_FILES[@]}
+log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]} $SPARE_VDEV_FILE
+destroy_pool $TESTPOOL1
+
+# mirror
+log_must zpool create $TESTPOOL1 mirror ${VDEV_FILES[0]} ${VDEV_FILES[1]}
+log_must zpool replace -s $TESTPOOL1 ${VDEV_FILES[1]}  $SPARE_VDEV_FILE
+destroy_pool $TESTPOOL1
+
+log_pass "Sequential resilver is not allowed for raidz vdevs"
diff --git a/tests/zfs-tests/tests/functional/replacement/replace_import.ksh b/tests/zfs-tests/tests/functional/replacement/replace_import.ksh
new file mode 100755 (executable)
index 0000000..35d51d9
--- /dev/null
@@ -0,0 +1,67 @@
+#!/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2019, Datto Inc. All rights reserved.
+# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/replacement/replacement.cfg
+
+#
+# Description:
+# Verify that on import an in progress replace operation is resumed.
+#
+# Strategy:
+# 1. For both healing and sequential resilvering replace:
+#    a. Create a pool
+#    b. Replace a vdev with 'zpool replace' to resilver (-s) it.
+#    c. Export the pool
+#    d. Import the pool
+#    e. Verify the 'zpool replace' resumed resilvering.
+#    f. Destroy the pool
+#
+
+function cleanup
+{
+       log_must set_tunable32 SCAN_SUSPEND_PROGRESS \
+           $ORIG_SCAN_SUSPEND_PROGRESS
+       destroy_pool $TESTPOOL1
+       rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE
+}
+
+log_assert "Verify replace is resumed on import"
+
+ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS)
+
+log_onexit cleanup
+
+log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE
+
+# Verify healing and sequential resilver resume on import.
+for arg in "" "-s"; do
+       log_must zpool create -f $TESTPOOL1 ${VDEV_FILES[@]}
+       log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
+       log_must zpool replace $arg $TESTPOOL1 ${VDEV_FILES[0]} $SPARE_VDEV_FILE
+       log_must is_pool_resilvering $TESTPOOL1
+       log_must zpool export $TESTPOOL1
+       log_must zpool import -d $TEST_BASE_DIR $TESTPOOL1
+       log_must is_pool_resilvering $TESTPOOL1
+       log_must set_tunable32 SCAN_SUSPEND_PROGRESS $ORIG_SCAN_SUSPEND_PROGRESS
+       log_must zpool wait -t resilver $TESTPOOL1
+       log_must is_pool_resilvered $TESTPOOL1
+       destroy_pool $TESTPOOL1
+done
+
+log_pass "Verify replace is resumed on import"
diff --git a/tests/zfs-tests/tests/functional/replacement/replace_rebuild.ksh b/tests/zfs-tests/tests/functional/replacement/replace_rebuild.ksh
new file mode 100755 (executable)
index 0000000..5997352
--- /dev/null
@@ -0,0 +1,158 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/replacement/replacement.cfg
+
+#
+# DESCRIPTION:
+#      Replacing disks during I/O should pass for supported pools.
+#
+# STRATEGY:
+#      1. Create multidisk pools (stripe/mirror) and
+#         start some random I/O
+#      2. Replace a disk in the pool with another disk.
+#      3. Verify the integrity of the file system and the rebuilding.
+#
+# NOTE: Raidz does not support the sequential resilver (-s) option.
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+       if [[ -n "$child_pids" ]]; then
+               for wait_pid in $child_pids
+               do
+                       kill $wait_pid
+               done
+       fi
+
+       if poolexists $TESTPOOL1; then
+               destroy_pool $TESTPOOL1
+       fi
+
+       [[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/*
+}
+
+log_assert "Replacing a disk with -s during I/O completes."
+
+options=""
+options_display="default options"
+
+log_onexit cleanup
+
+[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE "
+
+[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE "
+
+[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT "
+
+[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED "
+
+[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET "
+
+options="$options -r "
+
+[[ -n "$options" ]] && options_display=$options
+
+child_pids=""
+
+function replace_test
+{
+       typeset -i iters=2
+       typeset -i index=0
+       typeset opt=$1
+       typeset disk1=$2
+       typeset disk2=$3
+
+       typeset i=0
+       while [[ $i -lt $iters ]]; do
+               log_note "Invoking file_trunc with: $options_display"
+               file_trunc $options $TESTDIR/$TESTFILE.$i &
+               typeset pid=$!
+
+               sleep 1
+
+               child_pids="$child_pids $pid"
+               ((i = i + 1))
+       done
+
+       log_must zpool replace -sw $opt $TESTPOOL1 $disk1 $disk2
+
+       for wait_pid in $child_pids
+       do
+               kill $wait_pid
+       done
+       child_pids=""
+
+       log_must zpool export $TESTPOOL1
+       log_must zpool import -d $TESTDIR $TESTPOOL1
+       log_must zfs umount $TESTPOOL1/$TESTFS1
+       log_must zdb -cdui $TESTPOOL1/$TESTFS1
+       log_must zfs mount $TESTPOOL1/$TESTFS1
+       verify_pool $TESTPOOL1
+}
+
+specials_list=""
+i=0
+while [[ $i != 3 ]]; do
+       log_must truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i
+       specials_list="$specials_list $TESTDIR/$TESTFILE1.$i"
+
+       ((i = i + 1))
+done
+
+#
+# Create a replacement disk special file.
+#
+log_must truncate -s $MINVDEVSIZE $TESTDIR/$REPLACEFILE
+
+for type in "" "mirror"; do
+       for op in "" "-f"; do # replace without, then with, the force flag
+               create_pool $TESTPOOL1 $type $specials_list
+               log_must zfs create $TESTPOOL1/$TESTFS1
+               log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1
+
+               replace_test "$op" $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE
+
+               zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE"
+               if [[ $? -ne 0 ]]; then
+                       log_fail "$REPLACEFILE is not present."
+               fi
+
+               destroy_pool $TESTPOOL1
+               log_must rm -rf /$TESTPOOL1
+       done
+done
+
+log_pass
diff --git a/tests/zfs-tests/tests/functional/replacement/replace_resilver.ksh b/tests/zfs-tests/tests/functional/replacement/replace_resilver.ksh
new file mode 100755 (executable)
index 0000000..253cf65
--- /dev/null
@@ -0,0 +1,155 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+# Use is subject to license terms.
+#
+
+#
+# Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/replacement/replacement.cfg
+
+#
+# DESCRIPTION:
+#      Replacing disks during I/O should pass for supported pools.
+#
+# STRATEGY:
+#      1. Create multidisk pools (stripe/mirror/raidz) and
+#         start some random I/O
+#      2. Replace a disk in the pool with another disk.
+#      3. Verify the integrity of the file system and the resilvering.
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+       if [[ -n "$child_pids" ]]; then
+               for wait_pid in $child_pids
+               do
+                       kill $wait_pid
+               done
+       fi
+
+       if poolexists $TESTPOOL1; then
+               destroy_pool $TESTPOOL1
+       fi
+
+       [[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/*
+}
+
+log_assert "Replacing a disk during I/O completes."
+
+options=""
+options_display="default options"
+
+log_onexit cleanup
+
+[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE "
+
+[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE "
+
+[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT "
+
+[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED "
+
+[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET "
+
+options="$options -r "
+
+[[ -n "$options" ]] && options_display=$options
+
+child_pids=""
+
+function replace_test
+{
+       typeset -i iters=2
+       typeset -i index=0
+       typeset opt=$1
+       typeset disk1=$2
+       typeset disk2=$3
+
+       typeset i=0
+       while [[ $i -lt $iters ]]; do
+               log_note "Invoking file_trunc with: $options_display"
+               file_trunc $options $TESTDIR/$TESTFILE.$i &
+               typeset pid=$!
+
+               sleep 1
+
+               child_pids="$child_pids $pid"
+               ((i = i + 1))
+       done
+
+       log_must zpool replace -w $opt $TESTPOOL1 $disk1 $disk2
+
+       for wait_pid in $child_pids
+       do
+               kill $wait_pid
+       done
+       child_pids=""
+
+       log_must zpool export $TESTPOOL1
+       log_must zpool import -d $TESTDIR $TESTPOOL1
+       log_must zfs umount $TESTPOOL1/$TESTFS1
+       log_must zdb -cdui $TESTPOOL1/$TESTFS1
+       log_must zfs mount $TESTPOOL1/$TESTFS1
+       verify_pool $TESTPOOL1
+}
+
+specials_list=""
+i=0
+while [[ $i != 3 ]]; do
+       log_must truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i
+       specials_list="$specials_list $TESTDIR/$TESTFILE1.$i"
+
+       ((i = i + 1))
+done
+
+#
+# Create a replacement disk special file.
+#
+log_must truncate -s $MINVDEVSIZE $TESTDIR/$REPLACEFILE
+
+for type in "" "raidz" "mirror"; do
+       for op in "" "-f"; do # replace without, then with, the force flag
+               create_pool $TESTPOOL1 $type $specials_list
+               log_must zfs create $TESTPOOL1/$TESTFS1
+               log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1
+
+               replace_test "$op" $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE
+
+               zpool iostat -v $TESTPOOL1 | grep "$REPLACEFILE"
+               if [[ $? -ne 0 ]]; then
+                       log_fail "$REPLACEFILE is not present."
+               fi
+
+               destroy_pool $TESTPOOL1
+               log_must rm -rf /$TESTPOOL1
+       done
+done
+
+log_pass
index b2ba1b885117a815a5dfeee89b918fb595966991..271317b1c97021cc24c50652e740de6fa8f28fd8 100644 (file)
@@ -36,3 +36,8 @@ export HOLES_SEED=${HOLES_SEED-""}
 export HOLES_FILEOFFSET=${HOLES_FILEOFFSET-""}
 export HOLES_COUNT=${HOLES_COUNT-"16384"}         # FILESIZE/BLKSIZE/8
 export REPLACEFILE="sparedisk"
+
+set -A VDEV_FILES $TEST_BASE_DIR/file-{1..4}
+export VDEV_FILE_SIZE=$(( $SPA_MINDEVSIZE * 2 ))
+export SPARE_VDEV_FILE=$TEST_BASE_DIR/spare-1
+export SPARE_VDEV_FILE2=$TEST_BASE_DIR/spare-2
diff --git a/tests/zfs-tests/tests/functional/replacement/replacement_001_pos.ksh b/tests/zfs-tests/tests/functional/replacement/replacement_001_pos.ksh
deleted file mode 100755 (executable)
index 8f40436..0000000
+++ /dev/null
@@ -1,156 +0,0 @@
-#!/bin/ksh -p
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-#
-
-#
-# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
-# Use is subject to license terms.
-#
-
-#
-# Copyright (c) 2013, 2016 by Delphix. All rights reserved.
-#
-
-. $STF_SUITE/include/libtest.shlib
-. $STF_SUITE/tests/functional/replacement/replacement.cfg
-
-#
-# DESCRIPTION:
-#      Replacing disks during I/O should pass for supported pools.
-#
-# STRATEGY:
-#      1. Create multidisk pools (stripe/mirror/raidz) and
-#         start some random I/O
-#      2. Replace a disk in the pool with another disk.
-#      3. Verify the integrity of the file system and the resilvering.
-#
-
-verify_runnable "global"
-
-function cleanup
-{
-       if [[ -n "$child_pids" ]]; then
-               for wait_pid in $child_pids
-               do
-                       kill $wait_pid
-               done
-       fi
-
-       if poolexists $TESTPOOL1; then
-               destroy_pool $TESTPOOL1
-       fi
-
-       [[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/*
-}
-
-log_assert "Replacing a disk during I/O completes."
-
-options=""
-options_display="default options"
-
-log_onexit cleanup
-
-[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE "
-
-[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE "
-
-[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT "
-
-[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED "
-
-[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET "
-
-options="$options -r "
-
-[[ -n "$options" ]] && options_display=$options
-
-child_pids=""
-
-function replace_test
-{
-       typeset -i iters=2
-       typeset -i index=0
-       typeset opt=$1
-       typeset disk1=$2
-       typeset disk2=$3
-
-       typeset i=0
-       while [[ $i -lt $iters ]]; do
-               log_note "Invoking file_trunc with: $options_display"
-               file_trunc $options $TESTDIR/$TESTFILE.$i &
-               typeset pid=$!
-
-               sleep 1
-
-               child_pids="$child_pids $pid"
-               ((i = i + 1))
-       done
-
-       log_must zpool replace $opt $TESTPOOL1 $disk1 $disk2
-
-       sleep 10
-
-       for wait_pid in $child_pids
-       do
-               kill $wait_pid
-       done
-       child_pids=""
-
-       log_must zpool export $TESTPOOL1
-       log_must zpool import -d $TESTDIR $TESTPOOL1
-       log_must zfs umount $TESTPOOL1/$TESTFS1
-       log_must zdb -cdui $TESTPOOL1/$TESTFS1
-       log_must zfs mount $TESTPOOL1/$TESTFS1
-}
-
-specials_list=""
-i=0
-while [[ $i != 2 ]]; do
-       log_must truncate -s $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i
-       specials_list="$specials_list $TESTDIR/$TESTFILE1.$i"
-
-       ((i = i + 1))
-done
-
-#
-# Create a replacement disk special file.
-#
-log_must truncate -s $MINVDEVSIZE $TESTDIR/$REPLACEFILE
-
-for type in "" "raidz" "mirror"; do
-       for op in "" "-f"; do
-               create_pool $TESTPOOL1 $type $specials_list
-               log_must zfs create $TESTPOOL1/$TESTFS1
-               log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1
-
-               replace_test "$opt" $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE
-
-               zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$REPLACEFILE"
-               if [[ $? -ne 0 ]]; then
-                       log_fail "$REPLACEFILE is not present."
-               fi
-
-               destroy_pool $TESTPOOL1
-               log_must rm -rf /$TESTPOOL1
-       done
-done
-
-log_pass
diff --git a/tests/zfs-tests/tests/functional/replacement/replacement_002_pos.ksh b/tests/zfs-tests/tests/functional/replacement/replacement_002_pos.ksh
deleted file mode 100755 (executable)
index 391aa5c..0000000
+++ /dev/null
@@ -1,174 +0,0 @@
-#!/bin/ksh -p
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-#
-
-#
-# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
-# Use is subject to license terms.
-#
-
-#
-# Copyright (c) 2013, 2016 by Delphix. All rights reserved.
-#
-
-. $STF_SUITE/include/libtest.shlib
-. $STF_SUITE/tests/functional/replacement/replacement.cfg
-
-#
-# DESCRIPTION:
-#      Attaching disks during I/O should pass for supported pools.
-#
-# STRATEGY:
-#      1. Create multidisk pools (stripe/mirror/raidz) and
-#         start some random I/O
-#      2. Attach a disk to the pool.
-#      3. Verify the integrity of the file system and the resilvering.
-#
-
-verify_runnable "global"
-
-function cleanup
-{
-       if [[ -n "$child_pids" ]]; then
-               for wait_pid in $child_pids
-               do
-                       kill $wait_pid
-               done
-       fi
-
-       if poolexists $TESTPOOL1; then
-               destroy_pool $TESTPOOL1
-       fi
-
-       [[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/*
-}
-
-log_assert "Replacing a disk during I/O completes."
-
-options=""
-options_display="default options"
-
-log_onexit cleanup
-
-[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE "
-
-[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE "
-
-[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT "
-
-[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED "
-
-[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET "
-
-options="$options -r "
-
-[[ -n "$options" ]] && options_display=$options
-
-child_pids=""
-
-function attach_test
-{
-       typeset -i iters=2
-       typeset -i index=0
-       typeset opt=$1
-       typeset disk1=$2
-       typeset disk2=$3
-
-       typeset i=0
-       while [[ $i -lt $iters ]]; do
-               log_note "Invoking file_trunc with: $options_display"
-               file_trunc $options $TESTDIR/$TESTFILE.$i &
-               typeset pid=$!
-
-               sleep 1
-
-               child_pids="$child_pids $pid"
-               ((i = i + 1))
-       done
-
-       log_must zpool attach $opt $TESTPOOL1 $disk1 $disk2
-
-       sleep 10
-
-       for wait_pid in $child_pids
-       do
-               kill $wait_pid
-       done
-       child_pids=""
-
-        log_must zpool export $TESTPOOL1
-        log_must zpool import -d $TESTDIR $TESTPOOL1
-        log_must zfs umount $TESTPOOL1/$TESTFS1
-        log_must zdb -cdui $TESTPOOL1/$TESTFS1
-        log_must zfs mount $TESTPOOL1/$TESTFS1
-
-}
-
-specials_list=""
-i=0
-while [[ $i != 2 ]]; do
-       mkfile $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i
-       specials_list="$specials_list $TESTDIR/$TESTFILE1.$i"
-
-       ((i = i + 1))
-done
-
-#
-# Create a replacement disk special file.
-#
-mkfile $MINVDEVSIZE $TESTDIR/$REPLACEFILE
-
-for op in "" "-f"; do
-       create_pool $TESTPOOL1 mirror $specials_list
-       log_must zfs create $TESTPOOL1/$TESTFS1
-       log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1
-
-       attach_test "$opt" $TESTDIR/$TESTFILE1.1 $TESTDIR/$REPLACEFILE
-
-       zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$REPLACEFILE"
-       if [[ $? -ne 0 ]]; then
-               log_fail "$REPLACEFILE is not present."
-       fi
-
-       destroy_pool $TESTPOOL1
-done
-
-log_note "Verify 'zpool attach' fails with non-mirrors."
-
-for type in "" "raidz" "raidz1"; do
-       for op in "" "-f"; do
-               create_pool $TESTPOOL1 $type $specials_list
-               log_must zfs create $TESTPOOL1/$TESTFS1
-               log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1
-
-               log_mustnot zpool attach "$opt" $TESTDIR/$TESTFILE1.1 \
-                   $TESTDIR/$REPLACEFILE
-
-               zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$REPLACEFILE"
-               if [[ $? -eq 0 ]]; then
-                       log_fail "$REPLACEFILE should not be present."
-               fi
-
-               destroy_pool $TESTPOOL1
-       done
-done
-
-log_pass
diff --git a/tests/zfs-tests/tests/functional/replacement/replacement_003_pos.ksh b/tests/zfs-tests/tests/functional/replacement/replacement_003_pos.ksh
deleted file mode 100755 (executable)
index 71b9602..0000000
+++ /dev/null
@@ -1,161 +0,0 @@
-#!/bin/ksh -p
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-#
-
-#
-# Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
-# Use is subject to license terms.
-#
-
-#
-# Copyright (c) 2013, 2016 by Delphix. All rights reserved.
-#
-
-. $STF_SUITE/include/libtest.shlib
-. $STF_SUITE/tests/functional/replacement/replacement.cfg
-
-#
-# DESCRIPTION:
-#      Detaching disks during I/O should pass for supported pools.
-#
-# STRATEGY:
-#      1. Create multidisk pools (stripe/mirror/raidz) and
-#         start some random I/O
-#      2. Detach a disk from the pool.
-#      3. Verify the integrity of the file system and the resilvering.
-#
-
-verify_runnable "global"
-
-function cleanup
-{
-       if [[ -n "$child_pids" ]]; then
-               for wait_pid in $child_pids
-               do
-                       kill $wait_pid
-               done
-       fi
-
-       if poolexists $TESTPOOL1; then
-               destroy_pool $TESTPOOL1
-       fi
-
-       [[ -e $TESTDIR ]] && log_must rm -rf $TESTDIR/*
-}
-
-log_assert "Replacing a disk during I/O completes."
-
-options=""
-options_display="default options"
-
-log_onexit cleanup
-
-[[ -n "$HOLES_FILESIZE" ]] && options=" $options -f $HOLES_FILESIZE "
-
-[[ -n "$HOLES_BLKSIZE" ]] && options="$options -b $HOLES_BLKSIZE "
-
-[[ -n "$HOLES_COUNT" ]] && options="$options -c $HOLES_COUNT "
-
-[[ -n "$HOLES_SEED" ]] && options="$options -s $HOLES_SEED "
-
-[[ -n "$HOLES_FILEOFFSET" ]] && options="$options -o $HOLES_FILEOFFSET "
-
-ptions="$options -r "
-
-[[ -n "$options" ]] && options_display=$options
-
-child_pids=""
-
-function detach_test
-{
-       typeset -i iters=2
-       typeset -i index=0
-       typeset disk1=$1
-
-       typeset i=0
-       while [[ $i -lt $iters ]]; do
-               log_note "Invoking file_trunc with: $options_display"
-               file_trunc $options $TESTDIR/$TESTFILE.$i &
-               typeset pid=$!
-
-               sleep 1
-
-               child_pids="$child_pids $pid"
-               ((i = i + 1))
-       done
-
-       log_must zpool detach $TESTPOOL1 $disk1
-
-       sleep 10
-
-       for wait_pid in $child_pids
-       do
-               kill $wait_pid
-       done
-       child_pids=""
-
-        log_must zpool export $TESTPOOL1
-        log_must zpool import -d $TESTDIR $TESTPOOL1
-        log_must zfs umount $TESTPOOL1/$TESTFS1
-        log_must zdb -cdui $TESTPOOL1/$TESTFS1
-        log_must zfs mount $TESTPOOL1/$TESTFS1
-}
-
-specials_list=""
-i=0
-while [[ $i != 2 ]]; do
-       mkfile $MINVDEVSIZE $TESTDIR/$TESTFILE1.$i
-       specials_list="$specials_list $TESTDIR/$TESTFILE1.$i"
-
-       ((i = i + 1))
-done
-
-create_pool $TESTPOOL1 mirror $specials_list
-log_must zfs create $TESTPOOL1/$TESTFS1
-log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1
-
-detach_test $TESTDIR/$TESTFILE1.1
-
-zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$TESTFILE1.1"
-if [[ $? -eq 0 ]]; then
-       log_fail "$TESTFILE1.1 should no longer be present."
-fi
-
-destroy_pool $TESTPOOL1
-
-log_note "Verify 'zpool detach' fails with non-mirrors."
-
-for type in "" "raidz" "raidz1" ; do
-       create_pool $TESTPOOL1 $type $specials_list
-       log_must zfs create $TESTPOOL1/$TESTFS1
-       log_must zfs set mountpoint=$TESTDIR1 $TESTPOOL1/$TESTFS1
-
-       log_mustnot zpool detach $TESTDIR/$TESTFILE1.1
-
-       zpool iostat -v $TESTPOOL1 | grep "$TESTDIR/$TESTFILE1.1"
-       if [[ $? -ne 0 ]]; then
-               log_fail "$TESTFILE1.1 is not present."
-       fi
-
-       destroy_pool $TESTPOOL1
-done
-
-log_pass
diff --git a/tests/zfs-tests/tests/functional/replacement/resilver_restart_001.ksh b/tests/zfs-tests/tests/functional/replacement/resilver_restart_001.ksh
new file mode 100755 (executable)
index 0000000..7896b2d
--- /dev/null
@@ -0,0 +1,187 @@
+#!/bin/ksh -p
+
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019, Datto Inc. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/replacement/replacement.cfg
+
+#
+# DESCRIPTION:
+# Testing resilver restart logic both with and without the deferred resilver
+# feature enabled, verifying that resilver is not restarted when it is
+# unnecessary.
+#
+# STRATEGY:
+# 1. Create a pool
+# 2. Create four filesystems with the primary cache disabled to force reads
+# 3. Write four files simultaneously, one to each filesystem
+# 4. Do with and without deferred resilvers enabled
+#    a. Replace a vdev with a spare & suspend resilver immediately
+#    b. Verify resilver starts properly
+#    c. Offline / online another vdev to introduce a new DTL range
+#    d. Verify resilver restarts or is deferred
+#    e. Inject read errors on vdev that was offlined / onlined
+#    f. Verify that resilver did not restart
+#    g. Unsuspend resilver and wait for it to finish
+#    h. Verify that there are two resilvers and nothing is deferred
+#
+
+function cleanup
+{
+       log_must set_tunable32 RESILVER_MIN_TIME_MS $ORIG_RESILVER_MIN_TIME
+       log_must set_tunable32 SCAN_SUSPEND_PROGRESS \
+           $ORIG_SCAN_SUSPEND_PROGRESS
+       log_must set_tunable32 ZEVENT_LEN_MAX $ORIG_ZFS_ZEVENT_LEN_MAX
+       log_must zinject -c all
+       destroy_pool $TESTPOOL1
+       rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE
+}
+
+# count resilver events in zpool and number of deferred resilvers on vdevs
+function verify_restarts # <msg> <cnt> <defer>
+{
+       typeset msg=$1
+       typeset cnt=$2
+       typeset defer=$3
+
+       # check the number of resilver starts in the events log
+       RESILVERS=$(zpool events | grep -c sysevent.fs.zfs.resilver_start)
+       log_note "expected $cnt resilver start(s)$msg, found $RESILVERS"
+       [[ "$RESILVERS" -ne "$cnt" ]] &&
+           log_fail "expected $cnt resilver start(s)$msg, found $RESILVERS"
+
+       [[ -z "$defer" ]] && return
+
+       # use zdb to find which vdevs have the resilver defer flag
+       VDEV_DEFERS=$(zdb -C $TESTPOOL1 | awk '
+           /children/ { gsub(/[^0-9]/, ""); child = $0 }
+           /com\.datto:resilver_defer$/ { print child }
+       ')
+
+       if [[ "$defer" == "-" ]]
+       then
+               [[ -n $VDEV_DEFERS ]] &&
+                   log_fail "didn't expect any vdevs to have resilver deferred"
+               return
+       fi
+
+       [[ $VDEV_DEFERS -eq $defer ]] ||
+           log_fail "resilver deferred set on unexpected vdev: $VDEV_DEFERS"
+}
+
+log_assert "Check for unnecessary resilver restarts"
+
+ORIG_RESILVER_MIN_TIME=$(get_tunable RESILVER_MIN_TIME_MS)
+ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS)
+ORIG_ZFS_ZEVENT_LEN_MAX=$(get_tunable ZEVENT_LEN_MAX)
+
+set -A RESTARTS -- '1' '2' '2' '2'
+set -A VDEVS -- '' '' '' ''
+set -A DEFER_RESTARTS -- '1' '1' '1' '2'
+set -A DEFER_VDEVS -- '-' '2' '2' '-'
+
+VDEV_REPLACE="${VDEV_FILES[1]} $SPARE_VDEV_FILE"
+
+log_onexit cleanup
+
+# ensure that enough events will be saved
+log_must set_tunable32 ZEVENT_LEN_MAX 512
+
+log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE
+
+log_must zpool create -f -o feature@resilver_defer=disabled $TESTPOOL1 \
+    raidz ${VDEV_FILES[@]}
+
+# create 4 filesystems
+for fs in fs{0..3}
+do
+       log_must zfs create -o primarycache=none -o recordsize=1k $TESTPOOL1/$fs
+done
+
+# simultaneously write 16M to each of them
+set -A DATAPATHS /$TESTPOOL1/fs{0..3}/dat.0
+log_note "Writing data files"
+for path in ${DATAPATHS[@]}
+do
+       dd if=/dev/urandom of=$path bs=1M count=16 > /dev/null 2>&1 &
+done
+wait
+
+# test without and with deferred resilver feature enabled
+for test in "without" "with"
+do
+       log_note "Testing $test deferred resilvers"
+
+       if [[ $test == "with" ]]
+       then
+               log_must zpool set feature@resilver_defer=enabled $TESTPOOL1
+               RESTARTS=( "${DEFER_RESTARTS[@]}" )
+               VDEVS=( "${DEFER_VDEVS[@]}" )
+               VDEV_REPLACE="$SPARE_VDEV_FILE ${VDEV_FILES[1]}"
+       fi
+
+       # clear the events
+       log_must zpool events -c
+
+       # limit scanning time
+       log_must set_tunable32 RESILVER_MIN_TIME_MS 50
+
+       # initiate a resilver and suspend the scan as soon as possible
+       log_must zpool replace $TESTPOOL1 $VDEV_REPLACE
+       log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
+
+       # there should only be 1 resilver start
+       verify_restarts '' "${RESTARTS[0]}" "${VDEVS[0]}"
+
+       # offline then online a vdev to introduce a new DTL range after current
+       # scan, which should restart (or defer) the resilver
+       log_must zpool offline $TESTPOOL1 ${VDEV_FILES[2]}
+       log_must zpool sync $TESTPOOL1
+       log_must zpool online $TESTPOOL1 ${VDEV_FILES[2]}
+       log_must zpool sync $TESTPOOL1
+
+       # there should now be 2 resilver starts w/o defer, 1 with defer
+       verify_restarts ' after offline/online' "${RESTARTS[1]}" "${VDEVS[1]}"
+
+       # inject read io errors on vdev and verify resilver does not restart
+       log_must zinject -a -d ${VDEV_FILES[2]} -e io -T read -f 0.25 $TESTPOOL1
+       log_must cat ${DATAPATHS[1]} > /dev/null
+       log_must zinject -c all
+
+       # there should still be 2 resilver starts w/o defer, 1 with defer
+       verify_restarts ' after zinject' "${RESTARTS[2]}" "${VDEVS[2]}"
+
+       # unsuspend resilver
+       log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0
+       log_must set_tunable32 RESILVER_MIN_TIME_MS 3000
+
+       # wait for resilver to finish
+       log_must zpool wait -t resilver $TESTPOOL1
+       log_must is_pool_resilvered $TESTPOOL1
+
+       # wait for a few txg's to see if a resilver happens
+       log_must zpool sync $TESTPOOL1
+       log_must zpool sync $TESTPOOL1
+
+       # there should now be 2 resilver starts
+       verify_restarts ' after resilver' "${RESTARTS[3]}" "${VDEVS[3]}"
+done
+
+log_pass "Resilver did not restart unnecessarily"
diff --git a/tests/zfs-tests/tests/functional/replacement/resilver_restart_002.ksh b/tests/zfs-tests/tests/functional/replacement/resilver_restart_002.ksh
new file mode 100755 (executable)
index 0000000..48763f9
--- /dev/null
@@ -0,0 +1,102 @@
+#!/bin/ksh -p
+
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2020, Datto Inc. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/replacement/replacement.cfg
+
+#
+# DESCRIPTION:
+# Testing resilver completes when scan errors are encountered, but relevant
+# DTL's have not been lost.
+#
+# STRATEGY:
+# 1. Create a pool (1k recordsize)
+# 2. Create a 32m file (32k records)
+# 3. Inject an error halfway through the file
+# 4. Start a resilver, ensure the error is triggered and that the resilver
+#    does not restart after finishing
+#
+# NB: use legacy scanning to ensure scan of specific block causes error
+#
+
+function cleanup
+{
+       log_must zinject -c all
+       destroy_pool $TESTPOOL1
+       rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE
+       log_must set_tunable32 SCAN_LEGACY $ORIG_SCAN_LEGACY
+}
+
+log_assert "Check for resilver restarts caused by scan errors"
+
+ORIG_SCAN_LEGACY=$(get_tunable SCAN_LEGACY)
+
+log_onexit cleanup
+
+# use legacy scan to ensure injected error will be triggered
+log_must set_tunable32 SCAN_LEGACY 1
+
+# create the pool and a 32M file (32k blocks)
+log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[0]} $SPARE_VDEV_FILE
+log_must zpool create -f -O recordsize=1k $TESTPOOL1 ${VDEV_FILES[0]}
+log_must dd if=/dev/urandom of=/$TESTPOOL1/file bs=1M count=32 > /dev/null 2>&1
+
+# determine objset/object
+objset=$(zdb -d $TESTPOOL1/ | sed -ne 's/.*ID \([0-9]*\).*/\1/p')
+object=$(ls -i /$TESTPOOL1/file | awk '{print $1}')
+
+# inject event to cause error during resilver
+log_must zinject -b `printf "%x:%x:0:3fff" $objset $object` $TESTPOOL1
+
+# clear events and start resilver
+log_must zpool events -c
+log_must zpool attach $TESTPOOL1 ${VDEV_FILES[0]} $SPARE_VDEV_FILE
+
+log_note "waiting for read errors to start showing up"
+for iter in {0..59}
+do
+       zpool sync $TESTPOOL1
+       err=$(zpool status $TESTPOOL1 | grep ${VDEV_FILES[0]} | awk '{print $3}')
+       (( $err > 0 )) && break
+       sleep 1
+done
+
+(( $err == 0 )) && log_fail "Unable to induce errors in resilver"
+
+log_note "waiting for resilver to finish"
+for iter in {0..59}
+do
+       finish=$(zpool events | grep "sysevent.fs.zfs.resilver_finish" | wc -l)
+       (( $finish > 0 )) && break
+       sleep 1
+done
+
+(( $finish == 0 )) && log_fail "resilver took too long to finish"
+
+# wait a few syncs to ensure that zfs does not restart the resilver
+log_must zpool sync $TESTPOOL1
+log_must zpool sync $TESTPOOL1
+
+# check if resilver was restarted
+start=$(zpool events | grep "sysevent.fs.zfs.resilver_start" | wc -l)
+(( $start != 1 )) && log_fail "resilver restarted unnecessarily"
+
+log_pass "Resilver did not restart unnecessarily from scan errors"
diff --git a/tests/zfs-tests/tests/functional/replacement/scrub_cancel.ksh b/tests/zfs-tests/tests/functional/replacement/scrub_cancel.ksh
new file mode 100755 (executable)
index 0000000..da8a0a2
--- /dev/null
@@ -0,0 +1,112 @@
+#!/bin/ksh -p
+
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019, Datto Inc. All rights reserved.
+# Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/replacement/replacement.cfg
+
+#
+# DESCRIPTION:
+# Verify scrub behaves as intended when contending with a healing or
+# sequential resilver.
+#
+# STRATEGY:
+# 1. Create a pool
+# 2. Add a modest amount of data to the pool.
+# 3. For healing and sequential resilver:
+#    a. Start scrubbing.
+#    b. Verify a resilver can be started and it cancels the scrub.
+#    c. Verify a scrub cannot be started while resilvering.
+#
+
+function cleanup
+{
+       log_must set_tunable32 RESILVER_MIN_TIME_MS $ORIG_RESILVER_MIN_TIME
+       log_must set_tunable32 SCAN_SUSPEND_PROGRESS \
+           $ORIG_SCAN_SUSPEND_PROGRESS
+       destroy_pool $TESTPOOL1
+       rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE
+}
+
+log_assert "Scrub was cancelled by resilver"
+
+ORIG_RESILVER_MIN_TIME=$(get_tunable RESILVER_MIN_TIME_MS)
+ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS)
+
+log_onexit cleanup
+
+log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE
+
+log_must zpool create -f $TESTPOOL1 ${VDEV_FILES[@]}
+log_must zfs create $TESTPOOL1/$TESTFS
+
+mntpnt=$(get_prop mountpoint $TESTPOOL1/$TESTFS)
+log_must dd if=/dev/urandom of=$mntpnt/file bs=1M count=64
+log_must zpool sync $TESTPOOL1
+
+# Run the checks once for a healing resilver and once for a sequential one
+for replace_mode in "healing" "sequential"; do
+
+       #
+       # Healing resilvers abort the dsl_scan and reconfigure it to resilver.
+       # Sequential resilvers cancel the dsl_scan and start the vdev_rebuild
+       # thread.  NOTE(review): history_msg is set below but never read here.
+       #
+       if [[ "$replace_mode" = "healing" ]]; then
+               history_msg="scan aborted, restarting"
+               flags=""
+       else
+               history_msg="scan cancelled"
+               flags="-s"
+       fi
+
+       # Limit scanning time and suspend the scan as soon as possible.
+       log_must set_tunable32 RESILVER_MIN_TIME_MS 50
+       log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
+
+       # Initiate a scrub.
+       log_must zpool scrub $TESTPOOL1
+
+       # Initiate a resilver to cancel the scrub.
+       log_must zpool replace $flags $TESTPOOL1 ${VDEV_FILES[1]} \
+           $SPARE_VDEV_FILE
+
+       # Verify the scrub was cancelled; it may take a few seconds to exit.
+       while is_pool_scrubbing $TESTPOOL1; do
+               sleep 1
+       done
+       log_mustnot is_pool_scrubbing $TESTPOOL1
+
+       # Verify a scrub cannot be started while resilvering.
+       log_must is_pool_resilvering $TESTPOOL1
+       log_mustnot zpool scrub $TESTPOOL1
+
+       # Unsuspend resilver.
+       log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0
+       log_must set_tunable32 RESILVER_MIN_TIME_MS 3000
+
+       # Wait for the resilver to finish, then put the original vdev back.
+       log_must zpool wait $TESTPOOL1
+       log_must zpool replace $flags -w $TESTPOOL1 $SPARE_VDEV_FILE \
+           ${VDEV_FILES[1]}
+done
+log_pass "Scrub was cancelled by resilver"
+
diff --git a/tests/zfs-tests/tests/functional/resilver/Makefile.am b/tests/zfs-tests/tests/functional/resilver/Makefile.am
deleted file mode 100644 (file)
index 38136a8..0000000
+++ /dev/null
@@ -1,9 +0,0 @@
-pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/resilver
-dist_pkgdata_SCRIPTS = \
-       setup.ksh \
-       cleanup.ksh \
-       resilver_restart_001.ksh \
-       resilver_restart_002.ksh
-
-dist_pkgdata_DATA = \
-       resilver.cfg
diff --git a/tests/zfs-tests/tests/functional/resilver/cleanup.ksh b/tests/zfs-tests/tests/functional/resilver/cleanup.ksh
deleted file mode 100755 (executable)
index 4dfa814..0000000
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/ksh -p
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-
-#
-# Copyright (c) 2019, Datto Inc. All rights reserved.
-#
-
-. $STF_SUITE/include/libtest.shlib
-. $STF_SUITE/tests/functional/resilver/resilver.cfg
-
-verify_runnable "global"
-
-log_pass
diff --git a/tests/zfs-tests/tests/functional/resilver/resilver.cfg b/tests/zfs-tests/tests/functional/resilver/resilver.cfg
deleted file mode 100644 (file)
index 88dfd24..0000000
+++ /dev/null
@@ -1,32 +0,0 @@
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-
-#
-# Copyright (c) 2019, Datto Inc. All rights reserved.
-#
-
-. $STF_SUITE/include/libtest.shlib
-
-verify_runnable "global"
-
-set -A VDEV_FILES $TEST_BASE_DIR/file-{1..4}
-SPARE_VDEV_FILE=$TEST_BASE_DIR/spare-1
-
-VDEV_FILE_SIZE=$(( $SPA_MINDEVSIZE * 2 ))
diff --git a/tests/zfs-tests/tests/functional/resilver/resilver_restart_001.ksh b/tests/zfs-tests/tests/functional/resilver/resilver_restart_001.ksh
deleted file mode 100755 (executable)
index 9af1c97..0000000
+++ /dev/null
@@ -1,192 +0,0 @@
-#!/bin/ksh -p
-
-#
-# CDDL HEADER START
-#
-# This file and its contents are supplied under the terms of the
-# Common Development and Distribution License ("CDDL"), version 1.0.
-# You may only use this file in accordance with the terms of version
-# 1.0 of the CDDL.
-#
-# A full copy of the text of the CDDL should have accompanied this
-# source.  A copy of the CDDL is also available via the Internet at
-# http://www.illumos.org/license/CDDL.
-#
-# CDDL HEADER END
-#
-
-#
-# Copyright (c) 2019, Datto Inc. All rights reserved.
-#
-
-. $STF_SUITE/include/libtest.shlib
-. $STF_SUITE/tests/functional/resilver/resilver.cfg
-
-#
-# DESCRIPTION:
-# Testing resilver restart logic both with and without the deferred resilver
-# feature enabled, verifying that resilver is not restarted when it is
-# unnecessary.
-#
-# STRATEGY:
-# 1. Create a pool
-# 2. Create four filesystems with the primary cache disable to force reads
-# 3. Write four files simultaneously, one to each filesystem
-# 4. Do with and without deferred resilvers enabled
-#    a. Replace a vdev with a spare & suspend resilver immediately
-#    b. Verify resilver starts properly
-#    c. Offline / online another vdev to introduce a new DTL range
-#    d. Verify resilver restart restart or defer
-#    e. Inject read errors on vdev that was offlined / onlned
-#    f. Verify that resilver did not restart
-#    g. Unsuspend resilver and wait for it to finish
-#    h. Verify that there are two resilvers and nothing is deferred
-#
-
-function cleanup
-{
-       log_must set_tunable32 RESILVER_MIN_TIME_MS $ORIG_RESILVER_MIN_TIME
-       log_must set_tunable32 SCAN_SUSPEND_PROGRESS \
-           $ORIG_SCAN_SUSPEND_PROGRESS
-       log_must set_tunable32 ZEVENT_LEN_MAX $ORIG_ZFS_ZEVENT_LEN_MAX
-       log_must zinject -c all
-       destroy_pool $TESTPOOL
-       rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE
-}
-
-# count resilver events in zpool and number of deferred rsilvers on vdevs
-function verify_restarts # <msg> <cnt> <defer>
-{
-       msg=$1
-       cnt=$2
-       defer=$3
-
-       # check the number of resilver start in events log
-       RESILVERS=$(zpool events | grep -c sysevent.fs.zfs.resilver_start)
-       log_note "expected $cnt resilver start(s)$msg, found $RESILVERS"
-       [[ "$RESILVERS" -ne "$cnt" ]] &&
-           log_fail "expected $cnt resilver start(s)$msg, found $RESILVERS"
-
-       [[ -z "$defer" ]] && return
-
-       # use zdb to find which vdevs have the resilver defer flag
-       VDEV_DEFERS=$(zdb -C $TESTPOOL | awk '
-           /children/ { gsub(/[^0-9]/, ""); child = $0 }
-           /com\.datto:resilver_defer$/ { print child }
-       ')
-
-       if [[ "$defer" == "-" ]]
-       then
-               [[ -n $VDEV_DEFERS ]] &&
-                   log_fail "didn't expect any vdevs to have resilver deferred"
-               return
-       fi
-
-       [[ $VDEV_DEFERS -eq $defer ]] ||
-           log_fail "resilver deferred set on unexpected vdev: $VDEV_DEFERS"
-}
-
-log_assert "Check for unnecessary resilver restarts"
-
-ORIG_RESILVER_MIN_TIME=$(get_tunable RESILVER_MIN_TIME_MS)
-ORIG_SCAN_SUSPEND_PROGRESS=$(get_tunable SCAN_SUSPEND_PROGRESS)
-ORIG_ZFS_ZEVENT_LEN_MAX=$(get_tunable ZEVENT_LEN_MAX)
-
-set -A RESTARTS -- '1' '2' '2' '2'
-set -A VDEVS -- '' '' '' ''
-set -A DEFER_RESTARTS -- '1' '1' '1' '2'
-set -A DEFER_VDEVS -- '-' '2' '2' '-'
-
-VDEV_REPLACE="${VDEV_FILES[1]} $SPARE_VDEV_FILE"
-
-log_onexit cleanup
-
-# ensure that enough events will be saved
-log_must set_tunable32 ZEVENT_LEN_MAX 512
-
-log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[@]} $SPARE_VDEV_FILE
-
-log_must zpool create -f -o feature@resilver_defer=disabled $TESTPOOL \
-    raidz ${VDEV_FILES[@]}
-
-# create 4 filesystems
-for fs in fs{0..3}
-do
-       log_must zfs create -o primarycache=none -o recordsize=1k $TESTPOOL/$fs
-done
-
-# simultaneously write 16M to each of them
-set -A DATAPATHS /$TESTPOOL/fs{0..3}/dat.0
-log_note "Writing data files"
-for path in ${DATAPATHS[@]}
-do
-       dd if=/dev/urandom of=$path bs=1M count=16 > /dev/null 2>&1 &
-done
-wait
-
-# test without and with deferred resilve feature enabled
-for test in "without" "with"
-do
-       log_note "Testing $test deferred resilvers"
-
-       if [[ $test == "with" ]]
-       then
-               log_must zpool set feature@resilver_defer=enabled $TESTPOOL
-               RESTARTS=( "${DEFER_RESTARTS[@]}" )
-               VDEVS=( "${DEFER_VDEVS[@]}" )
-               VDEV_REPLACE="$SPARE_VDEV_FILE ${VDEV_FILES[1]}"
-       fi
-
-       # clear the events
-       log_must zpool events -c
-
-       # limit scanning time
-       log_must set_tunable32 RESILVER_MIN_TIME_MS 50
-
-       # initiate a resilver and suspend the scan as soon as possible
-       log_must zpool replace $TESTPOOL $VDEV_REPLACE
-       log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
-
-       # there should only be 1 resilver start
-       verify_restarts '' "${RESTARTS[0]}" "${VDEVS[0]}"
-
-       # offline then online a vdev to introduce a new DTL range after current
-       # scan, which should restart (or defer) the resilver
-       log_must zpool offline $TESTPOOL ${VDEV_FILES[2]}
-       log_must zpool sync $TESTPOOL
-       log_must zpool online $TESTPOOL ${VDEV_FILES[2]}
-       log_must zpool sync $TESTPOOL
-
-       # there should now be 2 resilver starts w/o defer, 1 with defer
-       verify_restarts ' after offline/online' "${RESTARTS[1]}" "${VDEVS[1]}"
-
-       # inject read io errors on vdev and verify resilver does not restart
-       log_must zinject -a -d ${VDEV_FILES[2]} -e io -T read -f 0.25 $TESTPOOL
-       log_must cat ${DATAPATHS[1]} > /dev/null
-       log_must zinject -c all
-
-       # there should still be 2 resilver starts w/o defer, 1 with defer
-       verify_restarts ' after zinject' "${RESTARTS[2]}" "${VDEVS[2]}"
-
-       # unsuspend resilver
-       log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0
-       log_must set_tunable32 RESILVER_MIN_TIME_MS 3000
-
-       # wait for resilver to finish
-       for iter in {0..59}
-       do
-               is_pool_resilvered $TESTPOOL && break
-               sleep 1
-       done
-       is_pool_resilvered $TESTPOOL ||
-           log_fail "resilver timed out"
-
-       # wait for a few txg's to see if a resilver happens
-       log_must zpool sync $TESTPOOL
-       log_must zpool sync $TESTPOOL
-
-       # there should now be 2 resilver starts
-       verify_restarts ' after resilver' "${RESTARTS[3]}" "${VDEVS[3]}"
-done
-
-log_pass "Resilver did not restart unnecessarily"
diff --git a/tests/zfs-tests/tests/functional/resilver/resilver_restart_002.ksh b/tests/zfs-tests/tests/functional/resilver/resilver_restart_002.ksh
deleted file mode 100755 (executable)
index ebe5e69..0000000
+++ /dev/null
@@ -1,102 +0,0 @@
-#!/bin/ksh -p
-
-#
-# CDDL HEADER START
-#
-# This file and its contents are supplied under the terms of the
-# Common Development and Distribution License ("CDDL"), version 1.0.
-# You may only use this file in accordance with the terms of version
-# 1.0 of the CDDL.
-#
-# A full copy of the text of the CDDL should have accompanied this
-# source.  A copy of the CDDL is also available via the Internet at
-# http://www.illumos.org/license/CDDL.
-#
-# CDDL HEADER END
-#
-
-#
-# Copyright (c) 2020, Datto Inc. All rights reserved.
-#
-
-. $STF_SUITE/include/libtest.shlib
-. $STF_SUITE/tests/functional/resilver/resilver.cfg
-
-#
-# DESCRIPTION:
-# Testing resilver completes when scan errors are encountered, but relevant
-# DTL's have not been lost.
-#
-# STRATEGY:
-# 1. Create a pool (1k recordsize)
-# 2. Create a 32m file (32k records)
-# 3. Inject an error halfway through the file
-# 4. Start a resilver, ensure the error is triggered and that the resilver
-#    does not restart after finishing
-#
-# NB: use legacy scanning to ensure scan of specific block causes error
-#
-
-function cleanup
-{
-       log_must zinject -c all
-       destroy_pool $TESTPOOL
-       rm -f ${VDEV_FILES[@]} $SPARE_VDEV_FILE
-       log_must set_tunable32 SCAN_LEGACY $ORIG_SCAN_LEGACY
-}
-
-log_assert "Check for resilver restarts caused by scan errors"
-
-ORIG_SCAN_LEGACY=$(get_tunable SCAN_LEGACY)
-
-log_onexit cleanup
-
-# use legacy scan to ensure injected error will be triggered
-log_must set_tunable32 SCAN_LEGACY 1
-
- # create the pool and a 32M file (32k blocks)
-log_must truncate -s $VDEV_FILE_SIZE ${VDEV_FILES[0]} $SPARE_VDEV_FILE
-log_must zpool create -f -O recordsize=1k $TESTPOOL ${VDEV_FILES[0]}
-log_must dd if=/dev/urandom of=/$TESTPOOL/file bs=1M count=32 > /dev/null 2>&1
-
-# determine objset/object
-objset=$(zdb -d $TESTPOOL/ | sed -ne 's/.*ID \([0-9]*\).*/\1/p')
-object=$(ls -i /$TESTPOOL/file | awk '{print $1}')
-
-# inject event to cause error during resilver
-log_must zinject -b `printf "%x:%x:0:3fff" $objset $object` $TESTPOOL
-
-# clear events and start resilver
-log_must zpool events -c
-log_must zpool attach $TESTPOOL ${VDEV_FILES[0]} $SPARE_VDEV_FILE
-
-log_note "waiting for read errors to start showing up"
-for iter in {0..59}
-do
-       zpool sync $TESTPOOL
-       err=$(zpool status $TESTPOOL | grep ${VDEV_FILES[0]} | awk '{print $3}')
-       (( $err > 0 )) && break
-       sleep 1
-done
-
-(( $err == 0 )) && log_fail "Unable to induce errors in resilver"
-
-log_note "waiting for resilver to finish"
-for iter in {0..59}
-do
-       finish=$(zpool events | grep "sysevent.fs.zfs.resilver_finish" | wc -l)
-       (( $finish > 0 )) && break
-       sleep 1
-done
-
-(( $finish == 0 )) && log_fail "resilver took too long to finish"
-
-# wait a few syncs to ensure that zfs does not restart the resilver
-log_must zpool sync $TESTPOOL
-log_must zpool sync $TESTPOOL
-
-# check if resilver was restarted
-start=$(zpool events | grep "sysevent.fs.zfs.resilver_start" | wc -l)
-(( $start != 1 )) && log_fail "resilver restarted unnecessarily"
-
-log_pass "Resilver did not restart unnecessarily from scan errors"
diff --git a/tests/zfs-tests/tests/functional/resilver/setup.ksh b/tests/zfs-tests/tests/functional/resilver/setup.ksh
deleted file mode 100755 (executable)
index 4dfa814..0000000
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/ksh -p
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-
-#
-# Copyright (c) 2019, Datto Inc. All rights reserved.
-#
-
-. $STF_SUITE/include/libtest.shlib
-. $STF_SUITE/tests/functional/resilver/resilver.cfg
-
-verify_runnable "global"
-
-log_pass