git.proxmox.com Git - mirror_zfs.git/commitdiff
Teach zpool scrub to scrub only blocks in error log
author George Amanakis <gamanakis@gmail.com>
Fri, 17 Dec 2021 20:35:28 +0000 (21:35 +0100)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Thu, 18 May 2023 18:59:42 +0000 (11:59 -0700)
Added a flag '-e' in zpool scrub to scrub only blocks in error log. A
user can pause, resume and cancel the error scrub by passing additional
command line arguments -p -s just like a regular scrub. This involves
adding a new flag, creating new libzfs interfaces, a new ioctl, and the
actual iteration and read-issuing logic. Error scrubbing is executed across
multiple txgs to make sure pool performance is not affected.

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Tony Hutter <hutter2@llnl.gov>
Co-authored-by: TulsiJain <tulsi.jain@delphix.com>
Signed-off-by: George Amanakis <gamanakis@gmail.com>
Closes #8995
Closes #12355

29 files changed:
cmd/zpool/zpool_main.c
include/libzfs.h
include/libzfs_core.h
include/sys/dmu.h
include/sys/dsl_scan.h
include/sys/fs/zfs.h
include/sys/spa.h
include/sys/spa_impl.h
include/sys/sysevent/eventdefs.h
lib/libzfs/libzfs.abi
lib/libzfs/libzfs_pool.c
lib/libzfs/libzfs_util.c
lib/libzfs_core/libzfs_core.abi
lib/libzfs_core/libzfs_core.c
man/man4/zfs.4
man/man8/zpool-scrub.8
module/zfs/dsl_scan.c
module/zfs/spa.c
module/zfs/spa_errlog.c
module/zfs/spa_misc.c
module/zfs/zfs_ioctl.c
tests/runfiles/common.run
tests/zfs-tests/cmd/libzfs_input_check.c
tests/zfs-tests/include/libtest.shlib
tests/zfs-tests/tests/Makefile.am
tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_004_pos.ksh [new file with mode: 0755]

index 3e08e031414d49b777bb918dc677de3204c043df..013dd4a23380201ff35b4431c2866cc829912fe6 100644 (file)
@@ -401,7 +401,7 @@ get_usage(zpool_help_t idx)
                return (gettext("\tinitialize [-c | -s | -u] [-w] <pool> "
                    "[<device> ...]\n"));
        case HELP_SCRUB:
-               return (gettext("\tscrub [-s | -p] [-w] <pool> ...\n"));
+               return (gettext("\tscrub [-s | -p] [-w] [-e] <pool> ...\n"));
        case HELP_RESILVER:
                return (gettext("\tresilver <pool> ...\n"));
        case HELP_TRIM:
@@ -7309,8 +7309,9 @@ wait_callback(zpool_handle_t *zhp, void *data)
 }
 
 /*
- * zpool scrub [-s | -p] [-w] <pool> ...
+ * zpool scrub [-s | -p] [-w] [-e] <pool> ...
  *
+ *     -e      Only scrub blocks in the error log.
  *     -s      Stop.  Stops any in-progress scrub.
  *     -p      Pause. Pause in-progress scrub.
  *     -w      Wait.  Blocks until scrub has completed.
@@ -7326,14 +7327,21 @@ zpool_do_scrub(int argc, char **argv)
        cb.cb_type = POOL_SCAN_SCRUB;
        cb.cb_scrub_cmd = POOL_SCRUB_NORMAL;
 
+       boolean_t is_error_scrub = B_FALSE;
+       boolean_t is_pause = B_FALSE;
+       boolean_t is_stop = B_FALSE;
+
        /* check options */
-       while ((c = getopt(argc, argv, "spw")) != -1) {
+       while ((c = getopt(argc, argv, "spwe")) != -1) {
                switch (c) {
+               case 'e':
+                       is_error_scrub = B_TRUE;
+                       break;
                case 's':
-                       cb.cb_type = POOL_SCAN_NONE;
+                       is_stop = B_TRUE;
                        break;
                case 'p':
-                       cb.cb_scrub_cmd = POOL_SCRUB_PAUSE;
+                       is_pause = B_TRUE;
                        break;
                case 'w':
                        wait = B_TRUE;
@@ -7345,11 +7353,21 @@ zpool_do_scrub(int argc, char **argv)
                }
        }
 
-       if (cb.cb_type == POOL_SCAN_NONE &&
-           cb.cb_scrub_cmd == POOL_SCRUB_PAUSE) {
-               (void) fprintf(stderr, gettext("invalid option combination: "
-                   "-s and -p are mutually exclusive\n"));
+       if (is_pause && is_stop) {
+               (void) fprintf(stderr, gettext("invalid option "
+                   "combination: -s and -p are mutually exclusive\n"));
                usage(B_FALSE);
+       } else {
+               if (is_error_scrub)
+                       cb.cb_type = POOL_SCAN_ERRORSCRUB;
+
+               if (is_pause) {
+                       cb.cb_scrub_cmd = POOL_SCRUB_PAUSE;
+               } else if (is_stop) {
+                       cb.cb_type = POOL_SCAN_NONE;
+               } else {
+                       cb.cb_scrub_cmd = POOL_SCRUB_NORMAL;
+               }
        }
 
        if (wait && (cb.cb_type == POOL_SCAN_NONE ||
@@ -7573,6 +7591,70 @@ secs_to_dhms(uint64_t total, char *buf)
        }
 }
 
+/*
+ * Print out detailed error scrub status.
+ */
+static void
+print_err_scrub_status(pool_scan_stat_t *ps)
+{
+       time_t start, end, pause;
+       uint64_t total_secs_left;
+       uint64_t secs_left, mins_left, hours_left, days_left;
+       uint64_t examined, to_be_examined;
+
+       if (ps == NULL || ps->pss_error_scrub_func != POOL_SCAN_ERRORSCRUB) {
+               return;
+       }
+
+       (void) printf(gettext(" scrub: "));
+
+       start = ps->pss_error_scrub_start;
+       end = ps->pss_error_scrub_end;
+       pause = ps->pss_pass_error_scrub_pause;
+       examined = ps->pss_error_scrub_examined;
+       to_be_examined = ps->pss_error_scrub_to_be_examined;
+
+       assert(ps->pss_error_scrub_func == POOL_SCAN_ERRORSCRUB);
+
+       if (ps->pss_error_scrub_state == DSS_FINISHED) {
+               total_secs_left = end - start;
+               days_left = total_secs_left / 60 / 60 / 24;
+               hours_left = (total_secs_left / 60 / 60) % 24;
+               mins_left = (total_secs_left / 60) % 60;
+               secs_left = (total_secs_left % 60);
+
+               (void) printf(gettext("scrubbed %llu error blocks in %llu days "
+                   "%02llu:%02llu:%02llu on %s"), (u_longlong_t)examined,
+                   (u_longlong_t)days_left, (u_longlong_t)hours_left,
+                   (u_longlong_t)mins_left, (u_longlong_t)secs_left,
+                   ctime(&end));
+
+               return;
+       } else if (ps->pss_error_scrub_state == DSS_CANCELED) {
+               (void) printf(gettext("error scrub canceled on %s"),
+                   ctime(&end));
+               return;
+       }
+       assert(ps->pss_error_scrub_state == DSS_ERRORSCRUBBING);
+
+       /* Error scrub is in progress. */
+       if (pause == 0) {
+               (void) printf(gettext("error scrub in progress since %s"),
+                   ctime(&start));
+       } else {
+               (void) printf(gettext("error scrub paused since %s"),
+                   ctime(&pause));
+               (void) printf(gettext("\terror scrub started on %s"),
+                   ctime(&start));
+       }
+
+       double fraction_done = (double)examined / (to_be_examined + examined);
+       (void) printf(gettext("\t%.2f%% done, issued I/O for %llu error"
+           " blocks"), 100 * fraction_done, (u_longlong_t)examined);
+
+       (void) printf("\n");
+}
+
 /*
  * Print out detailed scrub status.
  */
@@ -7909,10 +7991,12 @@ print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot)
 {
        uint64_t rebuild_end_time = 0, resilver_end_time = 0;
        boolean_t have_resilver = B_FALSE, have_scrub = B_FALSE;
+       boolean_t have_errorscrub = B_FALSE;
        boolean_t active_resilver = B_FALSE;
        pool_checkpoint_stat_t *pcs = NULL;
        pool_scan_stat_t *ps = NULL;
        uint_t c;
+       time_t scrub_start = 0, errorscrub_start = 0;
 
        if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS,
            (uint64_t **)&ps, &c) == 0) {
@@ -7921,16 +8005,23 @@ print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot)
                        active_resilver = (ps->pss_state == DSS_SCANNING);
                }
 
+
                have_resilver = (ps->pss_func == POOL_SCAN_RESILVER);
                have_scrub = (ps->pss_func == POOL_SCAN_SCRUB);
+               scrub_start = ps->pss_start_time;
+               have_errorscrub = (ps->pss_error_scrub_func ==
+                   POOL_SCAN_ERRORSCRUB);
+               errorscrub_start = ps->pss_error_scrub_start;
        }
 
        boolean_t active_rebuild = check_rebuilding(nvroot, &rebuild_end_time);
        boolean_t have_rebuild = (active_rebuild || (rebuild_end_time > 0));
 
        /* Always print the scrub status when available. */
-       if (have_scrub)
+       if (have_scrub && scrub_start > errorscrub_start)
                print_scan_scrub_resilver_status(ps);
+       else if (have_errorscrub && errorscrub_start >= scrub_start)
+               print_err_scrub_status(ps);
 
        /*
         * When there is an active resilver or rebuild print its status.
index 87d1ed738f2b724001e81d1124822ace4ff960a4..a7037e3e62664d0f960ba5707344cb62ee3d7103 100644 (file)
@@ -125,11 +125,14 @@ typedef enum zfs_error {
        EZFS_THREADCREATEFAILED, /* thread create failed */
        EZFS_POSTSPLIT_ONLINE,  /* onlining a disk after splitting it */
        EZFS_SCRUBBING,         /* currently scrubbing */
+       EZFS_ERRORSCRUBBING,    /* currently error scrubbing */
+       EZFS_ERRORSCRUB_PAUSED, /* error scrub currently paused */
        EZFS_NO_SCRUB,          /* no active scrub */
        EZFS_DIFF,              /* general failure of zfs diff */
        EZFS_DIFFDATA,          /* bad zfs diff data */
        EZFS_POOLREADONLY,      /* pool is in read-only mode */
        EZFS_SCRUB_PAUSED,      /* scrub currently paused */
+       EZFS_SCRUB_PAUSED_TO_CANCEL,    /* scrub paused; resume or cancel it */
        EZFS_ACTIVE_POOL,       /* pool is imported on a different system */
        EZFS_CRYPTOFAILED,      /* failed to setup encryption */
        EZFS_NO_PENDING,        /* cannot cancel, no operation is pending */
index 14a4857c35daf2c7f87459d9859b6a8cda716759..867c18b9c226f245118761aa205b1e766a384780 100644 (file)
@@ -155,6 +155,8 @@ _LIBZFS_CORE_H int lzc_get_bootenv(const char *, nvlist_t **);
 _LIBZFS_CORE_H int lzc_get_vdev_prop(const char *, nvlist_t *, nvlist_t **);
 _LIBZFS_CORE_H int lzc_set_vdev_prop(const char *, nvlist_t *, nvlist_t **);
 
+_LIBZFS_CORE_H int lzc_scrub(zfs_ioc_t, const char *, nvlist_t *, nvlist_t **);
+
 #ifdef __cplusplus
 }
 #endif
index 5ee6704668a49584e5bfde20596d3547beebbab8..7e57d133c2ec3b2b6c3a57111b2758711ce8a619 100644 (file)
@@ -378,6 +378,7 @@ typedef struct dmu_buf {
 #define        DMU_POOL_DDT_STATS              "DDT-statistics"
 #define        DMU_POOL_CREATION_VERSION       "creation_version"
 #define        DMU_POOL_SCAN                   "scan"
+#define        DMU_POOL_ERRORSCRUB             "error_scrub"
 #define        DMU_POOL_FREE_BPOBJ             "free_bpobj"
 #define        DMU_POOL_BPTREE_OBJ             "bptree_obj"
 #define        DMU_POOL_EMPTY_BPOBJ            "empty_bpobj"
index 8925b5815a3740b1c8c34789cf0395fe37239a20..6753b4a8f359645a5aa2f832149096466d3de63f 100644 (file)
@@ -29,6 +29,7 @@
 
 #include <sys/zfs_context.h>
 #include <sys/zio.h>
+#include <sys/zap.h>
 #include <sys/ddt.h>
 #include <sys/bplist.h>
 
@@ -78,6 +79,21 @@ typedef enum dsl_scan_flags {
 
 #define        DSL_SCAN_FLAGS_MASK (DSF_VISIT_DS_AGAIN)
 
+typedef struct dsl_errorscrub_phys {
+       uint64_t dep_func; /* pool_scan_func_t */
+       uint64_t dep_state; /* dsl_scan_state_t */
+       uint64_t dep_cursor; /* serialized zap cursor for tracing progress */
+       uint64_t dep_start_time; /* error scrub start time, unix timestamp */
+       uint64_t dep_end_time; /* error scrub end time, unix timestamp */
+       uint64_t dep_to_examine; /* total error blocks to be scrubbed */
+       uint64_t dep_examined; /* blocks scrubbed so far */
+       uint64_t dep_errors;    /* error scrub I/O error count */
+       uint64_t dep_paused_flags; /* flag for paused */
+} dsl_errorscrub_phys_t;
+
+#define        ERRORSCRUB_PHYS_NUMINTS (sizeof (dsl_errorscrub_phys_t) \
+       / sizeof (uint64_t))
+
 /*
  * Every pool will have one dsl_scan_t and this structure will contain
  * in-memory information about the scan and a pointer to the on-disk
@@ -151,11 +167,15 @@ typedef struct dsl_scan {
        uint64_t scn_avg_zio_size_this_txg;
        uint64_t scn_zios_this_txg;
 
+       /* zap cursor for tracing error scrub progress */
+       zap_cursor_t errorscrub_cursor;
        /* members needed for syncing scan status to disk */
        dsl_scan_phys_t scn_phys;       /* on disk representation of scan */
        dsl_scan_phys_t scn_phys_cached;
        avl_tree_t scn_queue;           /* queue of datasets to scan */
        uint64_t scn_queues_pending;    /* outstanding data to issue */
+       /* members needed for syncing error scrub status to disk */
+       dsl_errorscrub_phys_t errorscrub_phys;
 } dsl_scan_t;
 
 typedef struct dsl_scan_io_queue dsl_scan_io_queue_t;
@@ -171,8 +191,12 @@ int dsl_scan_cancel(struct dsl_pool *);
 int dsl_scan(struct dsl_pool *, pool_scan_func_t);
 void dsl_scan_assess_vdev(struct dsl_pool *dp, vdev_t *vd);
 boolean_t dsl_scan_scrubbing(const struct dsl_pool *dp);
-int dsl_scrub_set_pause_resume(const struct dsl_pool *dp, pool_scrub_cmd_t cmd);
+boolean_t dsl_errorscrubbing(const struct dsl_pool *dp);
+boolean_t dsl_errorscrub_active(dsl_scan_t *scn);
 void dsl_scan_restart_resilver(struct dsl_pool *, uint64_t txg);
+int dsl_scrub_set_pause_resume(const struct dsl_pool *dp,
+    pool_scrub_cmd_t cmd);
+void dsl_errorscrub_sync(struct dsl_pool *, dmu_tx_t *);
 boolean_t dsl_scan_resilvering(struct dsl_pool *dp);
 boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp);
 boolean_t dsl_dataset_unstable(struct dsl_dataset *ds);
@@ -184,6 +208,7 @@ void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2,
     struct dmu_tx *tx);
 boolean_t dsl_scan_active(dsl_scan_t *scn);
 boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn);
+boolean_t dsl_errorscrub_is_paused(const dsl_scan_t *scn);
 void dsl_scan_freed(spa_t *spa, const blkptr_t *bp);
 void dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue);
 void dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd);
index 4c2097fb830e3dedb624ce9e6893f967a9e4e104..93193fa142da2a1ce8355b21d4a9e618e430e3ab 100644 (file)
@@ -1036,6 +1036,7 @@ typedef enum pool_scan_func {
        POOL_SCAN_NONE,
        POOL_SCAN_SCRUB,
        POOL_SCAN_RESILVER,
+       POOL_SCAN_ERRORSCRUB,
        POOL_SCAN_FUNCS
 } pool_scan_func_t;
 
@@ -1099,6 +1100,20 @@ typedef struct pool_scan_stat {
        uint64_t        pss_pass_scrub_spent_paused;
        uint64_t        pss_pass_issued; /* issued bytes per scan pass */
        uint64_t        pss_issued;     /* total bytes checked by scanner */
+
+       /* error scrub values stored on disk */
+       uint64_t        pss_error_scrub_func;   /* pool_scan_func_t */
+       uint64_t        pss_error_scrub_state;  /* dsl_scan_state_t */
+       uint64_t        pss_error_scrub_start;  /* error scrub start time */
+       uint64_t        pss_error_scrub_end;    /* error scrub end time */
+       uint64_t        pss_error_scrub_examined; /* error blocks issued I/O */
+       /* error blocks to be issued I/O */
+       uint64_t        pss_error_scrub_to_be_examined;
+
+       /* error scrub values not stored on disk */
+       /* time at which error scrub was paused, 0 if not paused */
+       uint64_t        pss_pass_error_scrub_pause;
+
 } pool_scan_stat_t;
 
 typedef struct pool_removal_stat {
@@ -1120,6 +1135,7 @@ typedef enum dsl_scan_state {
        DSS_SCANNING,
        DSS_FINISHED,
        DSS_CANCELED,
+       DSS_ERRORSCRUBBING,
        DSS_NUM_STATES
 } dsl_scan_state_t;
 
@@ -1360,7 +1376,7 @@ typedef enum {
  */
 typedef enum zfs_ioc {
        /*
-        * Core features - 81/128 numbers reserved.
+        * Core features - 88/128 numbers reserved.
         */
 #ifdef __FreeBSD__
        ZFS_IOC_FIRST = 0,
@@ -1455,6 +1471,7 @@ typedef enum zfs_ioc {
        ZFS_IOC_WAIT_FS,                        /* 0x5a54 */
        ZFS_IOC_VDEV_GET_PROPS,                 /* 0x5a55 */
        ZFS_IOC_VDEV_SET_PROPS,                 /* 0x5a56 */
+       ZFS_IOC_POOL_SCRUB,                     /* 0x5a57 */
 
        /*
         * Per-platform (Optional) - 8/128 numbers reserved.
index 460ea2bfee4ef0061e3fb91e71d70b0f393529e3..ed752967cca65b763a2ba6202991920ba806cc3c 100644 (file)
@@ -1155,6 +1155,7 @@ extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate);
 extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd);
 extern uint64_t spa_approx_errlog_size(spa_t *spa);
 extern int spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count);
+extern uint64_t spa_get_last_errlog_size(spa_t *spa);
 extern void spa_errlog_rotate(spa_t *spa);
 extern void spa_errlog_drain(spa_t *spa);
 extern void spa_errlog_sync(spa_t *spa, uint64_t txg);
@@ -1165,6 +1166,13 @@ extern void spa_swap_errlog(spa_t *spa, uint64_t new_head_ds,
 extern void sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj,
     dmu_tx_t *tx);
 extern void spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx);
+extern int find_top_affected_fs(spa_t *spa, uint64_t head_ds,
+    zbookmark_err_phys_t *zep, uint64_t *top_affected_fs);
+extern int find_birth_txg(struct dsl_dataset *ds, zbookmark_err_phys_t *zep,
+    uint64_t *birth_txg);
+extern void zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep,
+    zbookmark_phys_t *zb);
+extern void name_to_errphys(char *buf, zbookmark_err_phys_t *zep);
 
 /* vdev cache */
 extern void vdev_cache_stat_init(void);
index 5782c54bd78f0f6bb0e898a0078b53ee17c22206..44afa763283a2f11999d8194a4e98e7090bf087c 100644 (file)
@@ -295,6 +295,10 @@ struct spa {
        uint64_t        spa_scan_pass_exam;     /* examined bytes per pass */
        uint64_t        spa_scan_pass_issued;   /* issued bytes per pass */
 
+       /* error scrub pause time in milliseconds */
+       uint64_t        spa_scan_pass_errorscrub_pause;
+       /* total error scrub paused time in milliseconds */
+       uint64_t        spa_scan_pass_errorscrub_spent_paused;
        /*
         * We are in the middle of a resilver, and another resilver
         * is needed once this one completes. This is set iff any
index eb1dfd16c0fdcf93f9cfd7f6b3bfd6f7bfe69d4c..a21085257967c9e8d3537c6f0c8842094484d10c 100644 (file)
@@ -123,6 +123,11 @@ extern "C" {
 #define        ESC_ZFS_TRIM_CANCEL             "trim_cancel"
 #define        ESC_ZFS_TRIM_RESUME             "trim_resume"
 #define        ESC_ZFS_TRIM_SUSPEND            "trim_suspend"
+#define        ESC_ZFS_ERRORSCRUB_START        "errorscrub_start"
+#define        ESC_ZFS_ERRORSCRUB_FINISH       "errorscrub_finish"
+#define        ESC_ZFS_ERRORSCRUB_ABORT        "errorscrub_abort"
+#define        ESC_ZFS_ERRORSCRUB_RESUME       "errorscrub_resume"
+#define        ESC_ZFS_ERRORSCRUB_PAUSED       "errorscrub_paused"
 
 /*
  * datalink subclass definitions.
index 57b096ca6e965c5f297ac8e482f5e0cb0ff6cca0..6e53bcb41a87481ffed1ba961cd09f523e86ca3c 100644 (file)
       <enumerator name='POOL_SCAN_NONE' value='0'/>
       <enumerator name='POOL_SCAN_SCRUB' value='1'/>
       <enumerator name='POOL_SCAN_RESILVER' value='2'/>
-      <enumerator name='POOL_SCAN_FUNCS' value='3'/>
+      <enumerator name='POOL_SCAN_ERRORSCRUB' value='3'/>
+      <enumerator name='POOL_SCAN_FUNCS' value='4'/>
     </enum-decl>
     <typedef-decl name='pool_scan_func_t' type-id='1b092565' id='7313fbe2'/>
     <enum-decl name='pool_scrub_cmd' id='a1474cbd'>
index a71cb24736a9499b1ff4e5a9c016f42e805e946c..d4af31c50cf8c0d986eb8d21cee14b5a7d845b50 100644 (file)
@@ -2648,50 +2648,84 @@ out:
 int
 zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd)
 {
-       zfs_cmd_t zc = {"\0"};
        char errbuf[ERRBUFLEN];
        int err;
        libzfs_handle_t *hdl = zhp->zpool_hdl;
 
-       (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name));
-       zc.zc_cookie = func;
-       zc.zc_flags = cmd;
+       nvlist_t *args = fnvlist_alloc();
+       fnvlist_add_uint64(args, "scan_type", (uint64_t)func);
+       fnvlist_add_uint64(args, "scan_command", (uint64_t)cmd);
+
+       err = lzc_scrub(ZFS_IOC_POOL_SCRUB, zhp->zpool_name, args, NULL);
+       fnvlist_free(args);
 
-       if (zfs_ioctl(hdl, ZFS_IOC_POOL_SCAN, &zc) == 0)
+       if (err == 0) {
                return (0);
+       } else if (err == ZFS_ERR_IOC_CMD_UNAVAIL) {
+               zfs_cmd_t zc = {"\0"};
+               (void) strlcpy(zc.zc_name, zhp->zpool_name,
+                   sizeof (zc.zc_name));
+               zc.zc_cookie = func;
+               zc.zc_flags = cmd;
 
-       err = errno;
+               if (zfs_ioctl(hdl, ZFS_IOC_POOL_SCAN, &zc) == 0)
+                       return (0);
+       }
 
-       /* ECANCELED on a scrub means we resumed a paused scrub */
-       if (err == ECANCELED && func == POOL_SCAN_SCRUB &&
-           cmd == POOL_SCRUB_NORMAL)
+       /*
+        * An ECANCELED on a scrub means one of the following:
+        * 1. we resumed a paused scrub.
+        * 2. we resumed a paused error scrub.
+        * 3. Error scrub is not run because of no error log.
+        */
+       if (err == ECANCELED && (func == POOL_SCAN_SCRUB ||
+           func == POOL_SCAN_ERRORSCRUB) && cmd == POOL_SCRUB_NORMAL)
                return (0);
-
-       if (err == ENOENT && func != POOL_SCAN_NONE && cmd == POOL_SCRUB_NORMAL)
+       /*
+        * The following cases have been handled here:
+        * 1. Paused a scrub/error scrub if there is none in progress.
+        */
+       if (err == ENOENT && func != POOL_SCAN_NONE && cmd ==
+           POOL_SCRUB_PAUSE) {
                return (0);
+       }
+
+       ASSERT3U(func, >=, POOL_SCAN_NONE);
+       ASSERT3U(func, <, POOL_SCAN_FUNCS);
 
-       if (func == POOL_SCAN_SCRUB) {
+       if (func == POOL_SCAN_SCRUB || func == POOL_SCAN_ERRORSCRUB) {
                if (cmd == POOL_SCRUB_PAUSE) {
                        (void) snprintf(errbuf, sizeof (errbuf),
                            dgettext(TEXT_DOMAIN, "cannot pause scrubbing %s"),
-                           zc.zc_name);
+                           zhp->zpool_name);
                } else {
                        assert(cmd == POOL_SCRUB_NORMAL);
                        (void) snprintf(errbuf, sizeof (errbuf),
                            dgettext(TEXT_DOMAIN, "cannot scrub %s"),
-                           zc.zc_name);
+                           zhp->zpool_name);
                }
        } else if (func == POOL_SCAN_RESILVER) {
                assert(cmd == POOL_SCRUB_NORMAL);
                (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
-                   "cannot restart resilver on %s"), zc.zc_name);
+                   "cannot restart resilver on %s"), zhp->zpool_name);
        } else if (func == POOL_SCAN_NONE) {
                (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
-                   "cannot cancel scrubbing %s"), zc.zc_name);
+                   "cannot cancel scrubbing %s"), zhp->zpool_name);
        } else {
                assert(!"unexpected result");
        }
 
+       /*
+        * With EBUSY, six cases are possible:
+        *
+        * Current state                Requested
+        * 1. Normal Scrub Running      Normal Scrub or Error Scrub
+        * 2. Normal Scrub Paused       Error Scrub
+        * 3. Normal Scrub Paused       Pause Normal Scrub
+        * 4. Error Scrub Running       Normal Scrub or Error Scrub
+        * 5. Error Scrub Paused        Pause Error Scrub
+        * 6. Resilvering               Anything else
+        */
        if (err == EBUSY) {
                nvlist_t *nvroot;
                pool_scan_stat_t *ps = NULL;
@@ -2703,12 +2737,43 @@ zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd)
                    ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc);
                if (ps && ps->pss_func == POOL_SCAN_SCRUB &&
                    ps->pss_state == DSS_SCANNING) {
-                       if (cmd == POOL_SCRUB_PAUSE)
-                               return (zfs_error(hdl, EZFS_SCRUB_PAUSED,
+                       if (ps->pss_pass_scrub_pause == 0) {
+                               /* handles case 1 */
+                               assert(cmd == POOL_SCRUB_NORMAL);
+                               return (zfs_error(hdl, EZFS_SCRUBBING,
                                    errbuf));
-                       else
-                               return (zfs_error(hdl, EZFS_SCRUBBING, errbuf));
+                       } else {
+                               if (func == POOL_SCAN_ERRORSCRUB) {
+                                       /* handles case 2 */
+                                       ASSERT3U(cmd, ==, POOL_SCRUB_NORMAL);
+                                       return (zfs_error(hdl,
+                                           EZFS_SCRUB_PAUSED_TO_CANCEL,
+                                           errbuf));
+                               } else {
+                                       /* handles case 3 */
+                                       ASSERT3U(func, ==, POOL_SCAN_SCRUB);
+                                       ASSERT3U(cmd, ==, POOL_SCRUB_PAUSE);
+                                       return (zfs_error(hdl,
+                                           EZFS_SCRUB_PAUSED, errbuf));
+                               }
+                       }
+               } else if (ps &&
+                   ps->pss_error_scrub_func == POOL_SCAN_ERRORSCRUB &&
+                   ps->pss_error_scrub_state == DSS_ERRORSCRUBBING) {
+                       if (ps->pss_pass_error_scrub_pause == 0) {
+                               /* handles case 4 */
+                               ASSERT3U(cmd, ==, POOL_SCRUB_NORMAL);
+                               return (zfs_error(hdl, EZFS_ERRORSCRUBBING,
+                                   errbuf));
+                       } else {
+                               /* handles case 5 */
+                               ASSERT3U(func, ==, POOL_SCAN_ERRORSCRUB);
+                               ASSERT3U(cmd, ==, POOL_SCRUB_PAUSE);
+                               return (zfs_error(hdl, EZFS_ERRORSCRUB_PAUSED,
+                                   errbuf));
+                       }
                } else {
+                       /* handles case 6 */
                        return (zfs_error(hdl, EZFS_RESILVERING, errbuf));
                }
        } else if (err == ENOENT) {
index 4b8a20160e02cfb011a0cf59ac0837e2d683be4e..b94abea3d58157bfd3e4220cee4688695688aa66 100644 (file)
@@ -243,10 +243,20 @@ libzfs_error_description(libzfs_handle_t *hdl)
                    "into a new one"));
        case EZFS_SCRUB_PAUSED:
                return (dgettext(TEXT_DOMAIN, "scrub is paused; "
-                   "use 'zpool scrub' to resume"));
+                   "use 'zpool scrub' to resume scrub"));
+       case EZFS_SCRUB_PAUSED_TO_CANCEL:
+               return (dgettext(TEXT_DOMAIN, "scrub is paused; "
+                   "use 'zpool scrub' to resume or 'zpool scrub -s' to "
+                   "cancel scrub"));
        case EZFS_SCRUBBING:
                return (dgettext(TEXT_DOMAIN, "currently scrubbing; "
-                   "use 'zpool scrub -s' to cancel current scrub"));
+                   "use 'zpool scrub -s' to cancel scrub"));
+       case EZFS_ERRORSCRUBBING:
+               return (dgettext(TEXT_DOMAIN, "currently error scrubbing; "
+                   "use 'zpool scrub -s' to cancel error scrub"));
+       case EZFS_ERRORSCRUB_PAUSED:
+               return (dgettext(TEXT_DOMAIN, "error scrub is paused; "
+                   "use 'zpool scrub -e' to resume error scrub"));
        case EZFS_NO_SCRUB:
                return (dgettext(TEXT_DOMAIN, "there is no active scrub"));
        case EZFS_DIFF:
index 33d794e3f8096d16f976800bd7a76b7b1b4d385a..f2087186aa44712b86c3b493bfad2f8d19f46d93 100644 (file)
     <elf-symbol name='lzc_reopen' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='lzc_rollback' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='lzc_rollback_to' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
+    <elf-symbol name='lzc_scrub' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='lzc_send' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='lzc_send_redacted' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
     <elf-symbol name='lzc_send_resume' type='func-type' binding='global-binding' visibility='default-visibility' is-defined='yes'/>
       <enumerator name='POOL_TRIM_FUNCS' value='3'/>
     </enum-decl>
     <typedef-decl name='pool_trim_func_t' type-id='54ed608a' id='b1146b8d'/>
+    <enum-decl name='zfs_ioc' id='12033f13'>
+      <underlying-type type-id='9cac1fee'/>
+      <enumerator name='ZFS_IOC_FIRST' value='23040'/>
+      <enumerator name='ZFS_IOC' value='23040'/>
+      <enumerator name='ZFS_IOC_POOL_CREATE' value='23040'/>
+      <enumerator name='ZFS_IOC_POOL_DESTROY' value='23041'/>
+      <enumerator name='ZFS_IOC_POOL_IMPORT' value='23042'/>
+      <enumerator name='ZFS_IOC_POOL_EXPORT' value='23043'/>
+      <enumerator name='ZFS_IOC_POOL_CONFIGS' value='23044'/>
+      <enumerator name='ZFS_IOC_POOL_STATS' value='23045'/>
+      <enumerator name='ZFS_IOC_POOL_TRYIMPORT' value='23046'/>
+      <enumerator name='ZFS_IOC_POOL_SCAN' value='23047'/>
+      <enumerator name='ZFS_IOC_POOL_FREEZE' value='23048'/>
+      <enumerator name='ZFS_IOC_POOL_UPGRADE' value='23049'/>
+      <enumerator name='ZFS_IOC_POOL_GET_HISTORY' value='23050'/>
+      <enumerator name='ZFS_IOC_VDEV_ADD' value='23051'/>
+      <enumerator name='ZFS_IOC_VDEV_REMOVE' value='23052'/>
+      <enumerator name='ZFS_IOC_VDEV_SET_STATE' value='23053'/>
+      <enumerator name='ZFS_IOC_VDEV_ATTACH' value='23054'/>
+      <enumerator name='ZFS_IOC_VDEV_DETACH' value='23055'/>
+      <enumerator name='ZFS_IOC_VDEV_SETPATH' value='23056'/>
+      <enumerator name='ZFS_IOC_VDEV_SETFRU' value='23057'/>
+      <enumerator name='ZFS_IOC_OBJSET_STATS' value='23058'/>
+      <enumerator name='ZFS_IOC_OBJSET_ZPLPROPS' value='23059'/>
+      <enumerator name='ZFS_IOC_DATASET_LIST_NEXT' value='23060'/>
+      <enumerator name='ZFS_IOC_SNAPSHOT_LIST_NEXT' value='23061'/>
+      <enumerator name='ZFS_IOC_SET_PROP' value='23062'/>
+      <enumerator name='ZFS_IOC_CREATE' value='23063'/>
+      <enumerator name='ZFS_IOC_DESTROY' value='23064'/>
+      <enumerator name='ZFS_IOC_ROLLBACK' value='23065'/>
+      <enumerator name='ZFS_IOC_RENAME' value='23066'/>
+      <enumerator name='ZFS_IOC_RECV' value='23067'/>
+      <enumerator name='ZFS_IOC_SEND' value='23068'/>
+      <enumerator name='ZFS_IOC_INJECT_FAULT' value='23069'/>
+      <enumerator name='ZFS_IOC_CLEAR_FAULT' value='23070'/>
+      <enumerator name='ZFS_IOC_INJECT_LIST_NEXT' value='23071'/>
+      <enumerator name='ZFS_IOC_ERROR_LOG' value='23072'/>
+      <enumerator name='ZFS_IOC_CLEAR' value='23073'/>
+      <enumerator name='ZFS_IOC_PROMOTE' value='23074'/>
+      <enumerator name='ZFS_IOC_SNAPSHOT' value='23075'/>
+      <enumerator name='ZFS_IOC_DSOBJ_TO_DSNAME' value='23076'/>
+      <enumerator name='ZFS_IOC_OBJ_TO_PATH' value='23077'/>
+      <enumerator name='ZFS_IOC_POOL_SET_PROPS' value='23078'/>
+      <enumerator name='ZFS_IOC_POOL_GET_PROPS' value='23079'/>
+      <enumerator name='ZFS_IOC_SET_FSACL' value='23080'/>
+      <enumerator name='ZFS_IOC_GET_FSACL' value='23081'/>
+      <enumerator name='ZFS_IOC_SHARE' value='23082'/>
+      <enumerator name='ZFS_IOC_INHERIT_PROP' value='23083'/>
+      <enumerator name='ZFS_IOC_SMB_ACL' value='23084'/>
+      <enumerator name='ZFS_IOC_USERSPACE_ONE' value='23085'/>
+      <enumerator name='ZFS_IOC_USERSPACE_MANY' value='23086'/>
+      <enumerator name='ZFS_IOC_USERSPACE_UPGRADE' value='23087'/>
+      <enumerator name='ZFS_IOC_HOLD' value='23088'/>
+      <enumerator name='ZFS_IOC_RELEASE' value='23089'/>
+      <enumerator name='ZFS_IOC_GET_HOLDS' value='23090'/>
+      <enumerator name='ZFS_IOC_OBJSET_RECVD_PROPS' value='23091'/>
+      <enumerator name='ZFS_IOC_VDEV_SPLIT' value='23092'/>
+      <enumerator name='ZFS_IOC_NEXT_OBJ' value='23093'/>
+      <enumerator name='ZFS_IOC_DIFF' value='23094'/>
+      <enumerator name='ZFS_IOC_TMP_SNAPSHOT' value='23095'/>
+      <enumerator name='ZFS_IOC_OBJ_TO_STATS' value='23096'/>
+      <enumerator name='ZFS_IOC_SPACE_WRITTEN' value='23097'/>
+      <enumerator name='ZFS_IOC_SPACE_SNAPS' value='23098'/>
+      <enumerator name='ZFS_IOC_DESTROY_SNAPS' value='23099'/>
+      <enumerator name='ZFS_IOC_POOL_REGUID' value='23100'/>
+      <enumerator name='ZFS_IOC_POOL_REOPEN' value='23101'/>
+      <enumerator name='ZFS_IOC_SEND_PROGRESS' value='23102'/>
+      <enumerator name='ZFS_IOC_LOG_HISTORY' value='23103'/>
+      <enumerator name='ZFS_IOC_SEND_NEW' value='23104'/>
+      <enumerator name='ZFS_IOC_SEND_SPACE' value='23105'/>
+      <enumerator name='ZFS_IOC_CLONE' value='23106'/>
+      <enumerator name='ZFS_IOC_BOOKMARK' value='23107'/>
+      <enumerator name='ZFS_IOC_GET_BOOKMARKS' value='23108'/>
+      <enumerator name='ZFS_IOC_DESTROY_BOOKMARKS' value='23109'/>
+      <enumerator name='ZFS_IOC_RECV_NEW' value='23110'/>
+      <enumerator name='ZFS_IOC_POOL_SYNC' value='23111'/>
+      <enumerator name='ZFS_IOC_CHANNEL_PROGRAM' value='23112'/>
+      <enumerator name='ZFS_IOC_LOAD_KEY' value='23113'/>
+      <enumerator name='ZFS_IOC_UNLOAD_KEY' value='23114'/>
+      <enumerator name='ZFS_IOC_CHANGE_KEY' value='23115'/>
+      <enumerator name='ZFS_IOC_REMAP' value='23116'/>
+      <enumerator name='ZFS_IOC_POOL_CHECKPOINT' value='23117'/>
+      <enumerator name='ZFS_IOC_POOL_DISCARD_CHECKPOINT' value='23118'/>
+      <enumerator name='ZFS_IOC_POOL_INITIALIZE' value='23119'/>
+      <enumerator name='ZFS_IOC_POOL_TRIM' value='23120'/>
+      <enumerator name='ZFS_IOC_REDACT' value='23121'/>
+      <enumerator name='ZFS_IOC_GET_BOOKMARK_PROPS' value='23122'/>
+      <enumerator name='ZFS_IOC_WAIT' value='23123'/>
+      <enumerator name='ZFS_IOC_WAIT_FS' value='23124'/>
+      <enumerator name='ZFS_IOC_VDEV_GET_PROPS' value='23125'/>
+      <enumerator name='ZFS_IOC_VDEV_SET_PROPS' value='23126'/>
+      <enumerator name='ZFS_IOC_POOL_SCRUB' value='23127'/>
+      <enumerator name='ZFS_IOC_PLATFORM' value='23168'/>
+      <enumerator name='ZFS_IOC_EVENTS_NEXT' value='23169'/>
+      <enumerator name='ZFS_IOC_EVENTS_CLEAR' value='23170'/>
+      <enumerator name='ZFS_IOC_EVENTS_SEEK' value='23171'/>
+      <enumerator name='ZFS_IOC_NEXTBOOT' value='23172'/>
+      <enumerator name='ZFS_IOC_JAIL' value='23173'/>
+      <enumerator name='ZFS_IOC_UNJAIL' value='23174'/>
+      <enumerator name='ZFS_IOC_SET_BOOTENV' value='23175'/>
+      <enumerator name='ZFS_IOC_GET_BOOTENV' value='23176'/>
+      <enumerator name='ZFS_IOC_LAST' value='23177'/>
+    </enum-decl>
+    <typedef-decl name='zfs_ioc_t' type-id='12033f13' id='5b35941c'/>
     <enum-decl name='zpool_wait_activity_t' naming-typedef-id='73446457' id='849338e3'>
       <underlying-type type-id='9cac1fee'/>
       <enumerator name='ZPOOL_WAIT_CKPT_DISCARD' value='0'/>
index 254f14e043216bdca1df086a9e1bded7583f9562..c63a16de5ab6864940b3a9e97cf61478b87c28c6 100644 (file)
@@ -247,6 +247,13 @@ out:
        return (error);
 }
 
+/*
+ * Thin wrapper that forwards a scrub-related ioctl to lzc_ioctl().
+ *
+ * ioc, name, source and resultp are passed through unchanged; the value
+ * returned by lzc_ioctl() is returned to the caller as-is.
+ */
+int
+lzc_scrub(zfs_ioc_t ioc, const char *name,
+    nvlist_t *source, nvlist_t **resultp)
+{
+       int error = lzc_ioctl(ioc, name, source, resultp);
+
+       return (error);
+}
+
 int
 lzc_create(const char *fsname, enum lzc_dataset_type type, nvlist_t *props,
     uint8_t *wkeydata, uint_t wkeylen)
index d529147464fe29442041bdad7bd9f4d2bdae9c19..9ec940a9448804b644160239c6cf3db5bb93aba7 100644 (file)
@@ -1764,6 +1764,9 @@ Scrubs are processed by the sync thread.
 While scrubbing, it will spend at least this much time
 working on a scrub between TXG flushes.
 .
+.It Sy zfs_scrub_error_blocks_per_txg Ns = Ns Sy 4096 Pq uint
+Maximum number of error blocks to be scrubbed in one txg.
+.
 .It Sy zfs_scan_checkpoint_intval Ns = Ns Sy 7200 Ns s Po 2 hour Pc Pq uint
 To preserve progress across reboots, the sequential scan algorithm periodically
 needs to stop metadata scanning and issue all the verification I/O to disk.
index 1fdbb8a5d56d431210a455dc281ce59f44c4444e..138226e4562c002fa0f60bf6b9c3d12b572b3c01 100644 (file)
@@ -38,6 +38,7 @@
 .Cm scrub
 .Op Fl s Ns | Ns Fl p
 .Op Fl w
+.Op Fl e
 .Ar pool Ns …
 .
 .Sh DESCRIPTION
@@ -62,6 +63,13 @@ device
 whereas scrubbing examines all data to discover silent errors due to hardware
 faults or disk failure.
 .Pp
+When scrubbing a pool with encrypted filesystems the keys do not need to be
+loaded.
+However, if the keys are not loaded and an unrepairable checksum error is
+detected the file name cannot be included in the
+.Nm zpool Cm status Fl v
+verbose error report.
+.Pp
 Because scrubbing and resilvering are I/O-intensive operations, ZFS only allows
 one at a time.
 .Pp
@@ -92,9 +100,20 @@ Once resumed the scrub will pick up from the place where it was last
 checkpointed to disk.
 To resume a paused scrub issue
 .Nm zpool Cm scrub
+or
+.Nm zpool Cm scrub
+.Fl e
 again.
 .It Fl w
 Wait until scrub has completed before returning.
+.It Fl e
+Only scrub files with known data errors as reported by
+.Nm zpool Cm status Fl v .
+The pool must have been scrubbed at least once with the
+.Sy head_errlog
+feature enabled to use this option.
+Error scrubbing cannot be run simultaneously with regular scrubbing or
+resilvering, nor can it be run when a regular scrub is paused.
 .El
 .Sh EXAMPLES
 .Ss Example 1
index d398b6705551575918be7f6f9759047fe66422ad..5e3559b251e341bc0738b92e12de21dab81e1e39 100644 (file)
@@ -54,6 +54,7 @@
 #include <sys/zfeature.h>
 #include <sys/abd.h>
 #include <sys/range_tree.h>
+#include <sys/dbuf.h>
 #ifdef _KERNEL
 #include <sys/zfs_vfsops.h>
 #endif
@@ -129,6 +130,7 @@ static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg);
 static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj);
 static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx);
 static uint64_t dsl_scan_count_data_disks(spa_t *spa);
+static void read_by_block_level(dsl_scan_t *scn, zbookmark_phys_t zb);
 
 extern uint_t zfs_vdev_async_write_active_min_dirty_percent;
 static int zfs_scan_blkstats = 0;
@@ -231,6 +233,9 @@ static int zfs_resilver_disable_defer = B_FALSE;
  */
 static int zfs_free_bpobj_enabled = 1;
 
+/*
+ * Error blocks to be scrubbed in one txg.
+ * Declared with a fixed 64-bit width so that the variable matches the U64
+ * module parameter it is exported through on every platform.
+ */
+uint64_t zfs_scrub_error_blocks_per_txg = 1 << 12;
+
 /* the order has to match pool_scan_type */
 static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = {
        NULL,
@@ -511,9 +516,17 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
                    "scrub_queue", sizeof (uint64_t), 1,
                    &scn->scn_phys.scn_queue_obj);
        } else {
+               err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+                   DMU_POOL_ERRORSCRUB, sizeof (uint64_t),
+                   ERRORSCRUB_PHYS_NUMINTS, &scn->errorscrub_phys);
+
+               if (err != 0 && err != ENOENT)
+                       return (err);
+
                err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
                    DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS,
                    &scn->scn_phys);
+
                /*
                 * Detect if the pool contains the signature of #2094.  If it
                 * does properly update the scn->scn_phys structure and notify
@@ -663,6 +676,22 @@ dsl_scan_scrubbing(const dsl_pool_t *dp)
            scn_phys->scn_func == POOL_SCAN_SCRUB);
 }
 
+/*
+ * Report whether the pool's error scrub record describes an error scrub in
+ * progress: the state must be DSS_ERRORSCRUBBING and the function must be
+ * POOL_SCAN_ERRORSCRUB.
+ */
+boolean_t
+dsl_errorscrubbing(const dsl_pool_t *dp)
+{
+       const dsl_scan_t *scn = dp->dp_scan;
+
+       if (scn->errorscrub_phys.dep_state != DSS_ERRORSCRUBBING)
+               return (B_FALSE);
+
+       return (scn->errorscrub_phys.dep_func == POOL_SCAN_ERRORSCRUB);
+}
+
+/*
+ * An error scrub counts as paused only while it is in progress and its
+ * dep_paused_flags field is non-zero.
+ */
+boolean_t
+dsl_errorscrub_is_paused(const dsl_scan_t *scn)
+{
+       if (!dsl_errorscrubbing(scn->scn_dp))
+               return (B_FALSE);
+
+       return (scn->errorscrub_phys.dep_paused_flags != 0);
+}
+
 boolean_t
 dsl_scan_is_paused_scrub(const dsl_scan_t *scn)
 {
@@ -670,6 +699,68 @@ dsl_scan_is_paused_scrub(const dsl_scan_t *scn)
            scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED);
 }
 
+/*
+ * Record the current position of the error scrub cursor in errorscrub_phys
+ * and write the whole record to the DMU_POOL_ERRORSCRUB entry of the pool
+ * directory object. A failing zap_update() trips VERIFY0.
+ */
+static void
+dsl_errorscrub_sync_state(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+       dsl_errorscrub_phys_t *phys = &scn->errorscrub_phys;
+
+       phys->dep_cursor = zap_cursor_serialize(&scn->errorscrub_cursor);
+
+       VERIFY0(zap_update(scn->scn_dp->dp_meta_objset,
+           DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRORSCRUB,
+           sizeof (uint64_t), ERRORSCRUB_PHYS_NUMINTS, phys, tx));
+}
+
+/*
+ * Sync-task callback that starts a new error scrub.
+ *
+ * arg points at the pool_scan_func_t to record. The error scrub record is
+ * reset, stamped with the start time and the size of the last error log,
+ * the cursor is positioned at the start of spa_errlog_last, and the record
+ * is written out before the event is logged in the pool history.
+ */
+static void
+dsl_errorscrub_setup_sync(void *arg, dmu_tx_t *tx)
+{
+       const pool_scan_func_t func = *(pool_scan_func_t *)arg;
+       dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+       spa_t *spa = scn->scn_dp->dp_spa;
+       dsl_errorscrub_phys_t *phys = &scn->errorscrub_phys;
+
+       ASSERT(!dsl_scan_is_running(scn));
+       ASSERT(!dsl_errorscrubbing(scn->scn_dp));
+       ASSERT(func > POOL_SCAN_NONE && func < POOL_SCAN_FUNCS);
+
+       /* Start from a clean record; counters and cursor begin at zero. */
+       memset(phys, 0, sizeof (*phys));
+       phys->dep_func = func;
+       phys->dep_state = DSS_ERRORSCRUBBING;
+       phys->dep_start_time = gethrestime_sec();
+       phys->dep_to_examine = spa_get_last_errlog_size(spa);
+       phys->dep_examined = 0;
+       phys->dep_errors = 0;
+       phys->dep_cursor = 0;
+       zap_cursor_init_serialized(&scn->errorscrub_cursor,
+           spa->spa_meta_objset, spa->spa_errlog_last, phys->dep_cursor);
+
+       vdev_config_dirty(spa->spa_root_vdev);
+       spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_START);
+
+       dsl_errorscrub_sync_state(scn, tx);
+
+       spa_history_log_internal(spa, "error scrub setup", tx,
+           "func=%u mintxg=%u maxtxg=%llu",
+           func, 0, (u_longlong_t)tx->tx_txg);
+}
+
+/*
+ * Sync-task check callback run before an error scrub is started.
+ *
+ * Returns EBUSY when a scan or another error scrub is already running,
+ * ECANCELED when the last error log is empty (nothing to scrub), and 0
+ * otherwise. arg is unused.
+ */
+static int
+dsl_errorscrub_setup_check(void *arg, dmu_tx_t *tx)
+{
+       (void) arg;
+       dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+
+       if (dsl_scan_is_running(scn) || dsl_errorscrubbing(scn->scn_dp))
+               return (SET_ERROR(EBUSY));
+
+       /* Use SET_ERROR() here too, like every other error return. */
+       if (spa_get_last_errlog_size(scn->scn_dp->dp_spa) == 0)
+               return (SET_ERROR(ECANCELED));
+
+       return (0);
+}
+
 /*
  * Writes out a persistent dsl_scan_phys_t record to the pool directory.
  * Because we can be running in the block sorting algorithm, we do not always
@@ -745,7 +836,8 @@ dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
        dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
        vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev;
 
-       if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd))
+       if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd) ||
+           dsl_errorscrubbing(scn->scn_dp))
                return (SET_ERROR(EBUSY));
 
        return (0);
@@ -754,6 +846,7 @@ dsl_scan_setup_check(void *arg, dmu_tx_t *tx)
 void
 dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
 {
+       (void) arg;
        dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
        pool_scan_func_t *funcp = arg;
        dmu_object_type_t ot = 0;
@@ -763,6 +856,14 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
        ASSERT(!dsl_scan_is_running(scn));
        ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS);
        memset(&scn->scn_phys, 0, sizeof (scn->scn_phys));
+
+       /*
+        * If we are starting a fresh scrub, we erase the error scrub
+        * information from disk.
+        */
+       memset(&scn->errorscrub_phys, 0, sizeof (scn->errorscrub_phys));
+       dsl_errorscrub_sync_state(scn, tx);
+
        scn->scn_phys.scn_func = *funcp;
        scn->scn_phys.scn_state = DSS_SCANNING;
        scn->scn_phys.scn_min_txg = 0;
@@ -856,8 +957,9 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx)
 }
 
 /*
- * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver.
- * Can also be called to resume a paused scrub.
+ * Called by ZFS_IOC_POOL_SCRUB and ZFS_IOC_POOL_SCAN ioctl to start a scrub,
+ * error scrub or resilver. Can also be called to resume a paused scrub or
+ * error scrub.
  */
 int
 dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
@@ -883,6 +985,26 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
                return (0);
        }
 
+       if (func == POOL_SCAN_ERRORSCRUB) {
+               if (dsl_errorscrub_is_paused(dp->dp_scan)) {
+                       /*
+                        * got error scrub start cmd, resume paused error scrub.
+                        */
+                       int err = dsl_scrub_set_pause_resume(scn->scn_dp,
+                           POOL_SCRUB_NORMAL);
+                       if (err == 0) {
+                               spa_event_notify(spa, NULL, NULL,
+                                   ESC_ZFS_ERRORSCRUB_RESUME);
+                               return (ECANCELED);
+                       }
+                       return (SET_ERROR(err));
+               }
+
+               return (dsl_sync_task(spa_name(dp->dp_spa),
+                   dsl_errorscrub_setup_check, dsl_errorscrub_setup_sync,
+                   &func, 0, ZFS_SPACE_CHECK_RESERVED));
+       }
+
        if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) {
                /* got scrub start cmd, resume paused scrub */
                int err = dsl_scrub_set_pause_resume(scn->scn_dp,
@@ -891,7 +1013,6 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
                        spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME);
                        return (SET_ERROR(ECANCELED));
                }
-
                return (SET_ERROR(err));
        }
 
@@ -899,6 +1020,33 @@ dsl_scan(dsl_pool_t *dp, pool_scan_func_t func)
            dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED));
 }
 
+/*
+ * Finish an error scrub, either because it completed or was canceled.
+ *
+ * A completed scrub posts ESC_ZFS_ERRORSCRUB_FINISH; both outcomes are
+ * written to the pool history together with the approximate error log
+ * size. The record is then marked DSS_FINISHED or DSS_CANCELED, the error
+ * log is rotated, the end time is stamped and the cursor is released. The
+ * caller is responsible for writing the updated record to disk.
+ */
+static void
+dsl_errorscrub_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
+{
+       spa_t *spa = scn->scn_dp->dp_spa;
+       dsl_errorscrub_phys_t *phys = &scn->errorscrub_phys;
+
+       if (complete)
+               spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_FINISH);
+
+       spa_history_log_internal(spa,
+           complete ? "error scrub done" : "error scrub canceled", tx,
+           "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
+
+       if (complete)
+               phys->dep_state = DSS_FINISHED;
+       else
+               phys->dep_state = DSS_CANCELED;
+
+       spa->spa_scrub_active = B_FALSE;
+       spa_errlog_rotate(spa);
+       phys->dep_end_time = gethrestime_sec();
+       zap_cursor_fini(&scn->errorscrub_cursor);
+
+       if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB)
+               spa->spa_errata = 0;
+
+       ASSERT(!dsl_errorscrubbing(scn->scn_dp));
+}
+
 static void
 dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
 {
@@ -1045,6 +1193,92 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
        ASSERT(!dsl_scan_is_running(scn));
 }
 
+/*
+ * Sync-task check callback for pausing or resuming an error scrub.
+ *
+ * arg points at the requested pool_scrub_cmd_t. POOL_SCRUB_PAUSE fails with
+ * ENOENT when no error scrub is in progress and with EBUSY when it is
+ * already paused. POOL_SCRUB_NORMAL always succeeds; any other command
+ * yields ENOTSUP.
+ */
+static int
+dsl_errorscrub_pause_resume_check(void *arg, dmu_tx_t *tx)
+{
+       pool_scrub_cmd_t *cmd = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       dsl_scan_t *scn = dp->dp_scan;
+
+       switch (*cmd) {
+       case POOL_SCRUB_PAUSE:
+               /* Pausing requires an error scrub that is in progress... */
+               if (!dsl_errorscrubbing(dp))
+                       return (SET_ERROR(ENOENT));
+
+               /* ...and that is not paused already. */
+               if (dsl_errorscrub_is_paused(scn))
+                       return (SET_ERROR(EBUSY));
+
+               return (0);
+       case POOL_SCRUB_NORMAL:
+               return (0);
+       default:
+               return (SET_ERROR(ENOTSUP));
+       }
+}
+
+/*
+ * Sync-task callback that pauses or resumes an error scrub.
+ *
+ * arg points at the requested pool_scrub_cmd_t. Pausing records the pause
+ * time, sets the paused flag, persists the record and posts
+ * ESC_ZFS_ERRORSCRUB_PAUSED. Resuming (POOL_SCRUB_NORMAL) only acts when
+ * the error scrub is currently paused.
+ */
+static void
+dsl_errorscrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
+{
+       pool_scrub_cmd_t *cmd = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       spa_t *spa = dp->dp_spa;
+       dsl_scan_t *scn = dp->dp_scan;
+
+       if (*cmd == POOL_SCRUB_PAUSE) {
+               spa->spa_scan_pass_errorscrub_pause = gethrestime_sec();
+               scn->errorscrub_phys.dep_paused_flags = B_TRUE;
+               dsl_errorscrub_sync_state(scn, tx);
+               spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_PAUSED);
+               return;
+       }
+
+       ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL);
+       if (!dsl_errorscrub_is_paused(scn))
+               return;
+
+       /*
+        * Accumulate the time spent paused in this pass so that the error
+        * scrub rate shown by 'zpool status' can be adjusted for it.
+        */
+       spa->spa_scan_pass_errorscrub_spent_paused +=
+           gethrestime_sec() - spa->spa_scan_pass_errorscrub_pause;
+       spa->spa_scan_pass_errorscrub_pause = 0;
+       scn->errorscrub_phys.dep_paused_flags = B_FALSE;
+
+       /* Re-create the cursor from the position saved in the record. */
+       zap_cursor_init_serialized(&scn->errorscrub_cursor,
+           spa->spa_meta_objset, spa->spa_errlog_last,
+           scn->errorscrub_phys.dep_cursor);
+
+       dsl_errorscrub_sync_state(scn, tx);
+}
+
+/*
+ * Sync-task check callback for canceling an error scrub: returns ENOENT
+ * unless an error scrub is in progress. arg is unused.
+ */
+static int
+dsl_errorscrub_cancel_check(void *arg, dmu_tx_t *tx)
+{
+       (void) arg;
+       dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+
+       /* There has to be an in-progress error scrub to cancel. */
+       return (dsl_errorscrubbing(scn->scn_dp) ? 0 : SET_ERROR(ENOENT));
+}
+
+/*
+ * Sync-task callback that cancels an error scrub: marks it as not complete,
+ * persists the updated record and posts ESC_ZFS_ERRORSCRUB_ABORT. arg is
+ * unused.
+ */
+static void
+dsl_errorscrub_cancel_sync(void *arg, dmu_tx_t *tx)
+{
+       (void) arg;
+       dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan;
+       spa_t *spa = scn->scn_dp->dp_spa;
+
+       dsl_errorscrub_done(scn, B_FALSE, tx);
+       dsl_errorscrub_sync_state(scn, tx);
+       spa_event_notify(spa, NULL, NULL, ESC_ZFS_ERRORSCRUB_ABORT);
+}
+
 static int
 dsl_scan_cancel_check(void *arg, dmu_tx_t *tx)
 {
@@ -1070,6 +1304,11 @@ dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx)
 int
 dsl_scan_cancel(dsl_pool_t *dp)
 {
+       /*
+        * When an error scrub is in progress, cancel it through the error
+        * scrub check/sync callbacks; otherwise fall through to the regular
+        * scan cancel callbacks below.
+        */
+       if (dsl_errorscrubbing(dp)) {
+               return (dsl_sync_task(spa_name(dp->dp_spa),
+                   dsl_errorscrub_cancel_check, dsl_errorscrub_cancel_sync,
+                   NULL, 3, ZFS_SPACE_CHECK_RESERVED));
+       }
        return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check,
            dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED));
 }
@@ -1136,6 +1375,12 @@ dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx)
 int
 dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd)
 {
+       if (dsl_errorscrubbing(dp)) {
+               return (dsl_sync_task(spa_name(dp->dp_spa),
+                   dsl_errorscrub_pause_resume_check,
+                   dsl_errorscrub_pause_resume_sync, &cmd, 3,
+                   ZFS_SPACE_CHECK_RESERVED));
+       }
        return (dsl_sync_task(spa_name(dp->dp_spa),
            dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3,
            ZFS_SPACE_CHECK_RESERVED));
@@ -1422,6 +1667,42 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
        return (B_FALSE);
 }
 
+/*
+ * Decide whether the error scrub should stop working for this txg.
+ *
+ * Returns B_TRUE when either
+ *  - the scrub has run longer than zfs_scrub_min_time_ms in this sync and,
+ *    in addition, someone is waiting for the txg to sync or the sync has
+ *    been running for at least zfs_txg_timeout seconds, or
+ *  - the spa is shutting down.
+ * If zb is not NULL the bookmark is printed via dprintf() on suspend.
+ */
+static boolean_t
+dsl_error_scrub_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb)
+{
+       dsl_pool_t *dp = scn->scn_dp;
+       spa_t *spa = dp->dp_spa;
+       uint64_t now_ns = gethrtime();
+       uint64_t scrub_ns = now_ns - scn->scn_sync_start_time;
+       uint64_t sync_ns = now_ns - spa->spa_sync_starttime;
+       int mintime = zfs_scrub_min_time_ms;
+       boolean_t suspend = B_FALSE;
+
+       if (NSEC2MSEC(scrub_ns) > mintime &&
+           (txg_sync_waiting(dp) || NSEC2SEC(sync_ns) >= zfs_txg_timeout)) {
+               suspend = B_TRUE;
+       } else if (spa_shutting_down(spa)) {
+               suspend = B_TRUE;
+       }
+
+       if (!suspend)
+               return (B_FALSE);
+
+       if (zb != NULL) {
+               dprintf("error scrub suspending at bookmark "
+                   "%llx/%llx/%llx/%llx\n",
+                   (longlong_t)zb->zb_objset,
+                   (longlong_t)zb->zb_object,
+                   (longlong_t)zb->zb_level,
+                   (longlong_t)zb->zb_blkid);
+       }
+       return (B_TRUE);
+}
+
 typedef struct zil_scan_arg {
        dsl_pool_t      *zsa_dp;
        zil_header_t    *zsa_zh;
@@ -3352,6 +3633,19 @@ dsl_scan_active(dsl_scan_t *scn)
        return ((used != 0) || (clones_left));
 }
 
+/*
+ * An error scrub is active when the spa is not being loaded
+ * (spa_load_state is SPA_LOAD_NONE), is not shutting down, and an error
+ * scrub is in progress.
+ */
+boolean_t
+dsl_errorscrub_active(dsl_scan_t *scn)
+{
+       spa_t *spa = scn->scn_dp->dp_spa;
+
+       return (spa->spa_load_state == SPA_LOAD_NONE &&
+           !spa_shutting_down(spa) &&
+           dsl_errorscrubbing(scn->scn_dp));
+}
+
 static boolean_t
 dsl_scan_check_deferred(vdev_t *vd)
 {
@@ -3568,6 +3862,387 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx)
        return (0);
 }
 
+/*
+ * Parse a name of the form "objset:object:level:blkid" into zb. Each field
+ * is converted with zfs_strtonum(); the separators and the terminating NUL
+ * are only verified through ASSERT.
+ */
+static void
+name_to_bookmark(char *buf, zbookmark_phys_t *zb)
+{
+       char *cur = buf;
+
+       zb->zb_objset = zfs_strtonum(cur, &cur);
+       ASSERT(*cur == ':');
+       cur++;
+
+       zb->zb_object = zfs_strtonum(cur, &cur);
+       ASSERT(*cur == ':');
+       cur++;
+
+       zb->zb_level = (int)zfs_strtonum(cur, &cur);
+       ASSERT(*cur == ':');
+       cur++;
+
+       zb->zb_blkid = zfs_strtonum(cur, &cur);
+       ASSERT(*cur == '\0');
+}
+
+/*
+ * Convert a name consisting of a single number into an object number using
+ * zfs_strtonum(). The whole string is expected to be consumed (ASSERT).
+ */
+static void
+name_to_object(char *buf, uint64_t *obj)
+{
+       char *end = buf;
+
+       *obj = zfs_strtonum(buf, &end);
+       ASSERT(*end == '\0');
+}
+
+/*
+ * Issue a scrub read for the block identified by bookmark zb.
+ *
+ * The dataset, objset and dnode named by the bookmark are looked up, the
+ * block pointer is resolved with dbuf_dnode_findbp() and, unless it is a
+ * hole, handed to scan_exec_io(). Any lookup failure makes the function
+ * return silently without issuing I/O. All holds taken here are released
+ * before returning.
+ */
+static void
+read_by_block_level(dsl_scan_t *scn, zbookmark_phys_t zb)
+{
+       dsl_pool_t *dp = scn->scn_dp;
+       dsl_dataset_t *ds;
+       objset_t *os;
+       dnode_t *dn;
+       blkptr_t bp;
+       int zio_flags;
+
+       if (dsl_dataset_hold_obj(dp, zb.zb_objset, FTAG, &ds) != 0)
+               return;
+
+       if (dmu_objset_from_ds(ds, &os) != 0)
+               goto out_ds;
+
+       /*
+        * With unloaded keys dbuf_dnode_findbp() would fail with EACCES,
+        * but before that dnode_hold() ends up in dbuf_read()->zio_wait(),
+        * which may call spa_log_error() and deadlock on spa_errlist_lock,
+        * which we hold. Bail out early when the keys are unavailable; the
+        * head_errlog feature is meaningless in that case anyway because
+        * the birth txg of the block pointer cannot be determined.
+        */
+       if (dsl_dataset_get_keystatus(ds->ds_dir) ==
+           ZFS_KEYSTATUS_UNAVAILABLE)
+               goto out_ds;
+
+       if (dnode_hold(os, zb.zb_object, FTAG, &dn) != 0)
+               goto out_ds;
+
+       rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+       /* Nothing to read if the lookup fails or the block is a hole. */
+       if (dbuf_dnode_findbp(dn, zb.zb_level, zb.zb_blkid, &bp, NULL,
+           NULL) != 0 || BP_IS_HOLE(&bp))
+               goto out_dn;
+
+       zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW |
+           ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB;
+
+       /* If it's an intent log block, failure is expected. */
+       if (zb.zb_level == ZB_ZIL_LEVEL)
+               zio_flags |= ZIO_FLAG_SPECULATIVE;
+
+       ASSERT(!BP_IS_EMBEDDED(&bp));
+       scan_exec_io(dp, &bp, zio_flags, &zb, NULL);
+
+out_dn:
+       rw_exit(&dn->dn_struct_rwlock);
+       dnode_rele(dn, FTAG);
+out_ds:
+       dsl_dataset_rele(ds, FTAG);
+}
+
+/*
+ * Scrub the error block described by zep in filesystem fs and in the
+ * snapshots of fs that may still reference it.
+ *
+ * We keep track of the scrubbed error blocks in "count". This will be used
+ * when deciding whether we exceeded zfs_scrub_error_blocks_per_txg. This
+ * function is modelled after check_filesystem().
+ *
+ * Returns 0 when all candidates were processed, the error from
+ * dsl_dataset_hold_obj() or zap_count() when a lookup fails, and EFAULT
+ * when the per-txg block limit was reached or
+ * dsl_error_scrub_check_suspend() asked to stop; the caller treats EFAULT
+ * as "limit exceeded".
+ */
+static int
+scrub_filesystem(spa_t *spa, uint64_t fs, zbookmark_err_phys_t *zep,
+    int *count)
+{
+       dsl_dataset_t *ds;
+       dsl_pool_t *dp = spa->spa_dsl_pool;
+       dsl_scan_t *scn = dp->dp_scan;
+
+       int error = dsl_dataset_hold_obj(dp, fs, FTAG, &ds);
+       if (error != 0)
+               return (error);
+
+       uint64_t latest_txg;
+       uint64_t txg_to_consider = spa->spa_syncing_txg;
+       boolean_t check_snapshot = B_TRUE;
+
+       error = find_birth_txg(ds, zep, &latest_txg);
+
+       /*
+        * If find_birth_txg() errors out, then err on the side of caution and
+        * proceed. In worst case scenario scrub all objects. If zep->zb_birth
+        * is 0 (e.g. in case of encryption with unloaded keys) also proceed to
+        * scrub all objects.
+        */
+       if (error == 0 && zep->zb_birth == latest_txg) {
+               /* Block neither free nor re written. */
+               zbookmark_phys_t zb;
+               zep_to_zb(fs, zep, &zb);
+               scn->scn_zio_root = zio_root(spa, NULL, NULL,
+                   ZIO_FLAG_CANFAIL);
+               /* We have already acquired the config lock for spa */
+               read_by_block_level(scn, zb);
+
+               (void) zio_wait(scn->scn_zio_root);
+               scn->scn_zio_root = NULL;
+
+               scn->errorscrub_phys.dep_examined++;
+               scn->errorscrub_phys.dep_to_examine--;
+               (*count)++;
+               if ((*count) == zfs_scrub_error_blocks_per_txg ||
+                   dsl_error_scrub_check_suspend(scn, &zb)) {
+                       dsl_dataset_rele(ds, FTAG);
+                       return (SET_ERROR(EFAULT));
+               }
+
+               check_snapshot = B_FALSE;
+       } else if (error == 0) {
+               txg_to_consider = latest_txg;
+       }
+
+       /*
+        * Retrieve the number of snapshots if the dataset is not a snapshot.
+        */
+       uint64_t snap_count = 0;
+       if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) {
+
+               error = zap_count(spa->spa_meta_objset,
+                   dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count);
+
+               if (error != 0) {
+                       dsl_dataset_rele(ds, FTAG);
+                       return (error);
+               }
+       }
+
+       if (snap_count == 0) {
+               /* Filesystem without snapshots. */
+               dsl_dataset_rele(ds, FTAG);
+               return (0);
+       }
+
+       uint64_t snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+       uint64_t snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+
+       dsl_dataset_rele(ds, FTAG);
+
+       /* Check only snapshots created from this file system. */
+       while (snap_obj != 0 && zep->zb_birth < snap_obj_txg &&
+           snap_obj_txg <= txg_to_consider) {
+
+               error = dsl_dataset_hold_obj(dp, snap_obj, FTAG, &ds);
+               if (error != 0)
+                       return (error);
+
+               /* Skip snapshots whose head dataset is not fs. */
+               if (dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj != fs) {
+                       snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+                       snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+                       dsl_dataset_rele(ds, FTAG);
+                       continue;
+               }
+
+               boolean_t affected = B_TRUE;
+               if (check_snapshot) {
+                       uint64_t blk_txg;
+                       error = find_birth_txg(ds, zep, &blk_txg);
+
+                       /*
+                        * Scrub the snapshot also when zb_birth == 0 or when
+                        * find_birth_txg() returns an error.
+                        */
+                       affected = (error == 0 && zep->zb_birth == blk_txg) ||
+                           (error != 0) || (zep->zb_birth == 0);
+               }
+
+               /* Scrub snapshots. */
+               if (affected) {
+                       zbookmark_phys_t zb;
+                       zep_to_zb(snap_obj, zep, &zb);
+                       scn->scn_zio_root = zio_root(spa, NULL, NULL,
+                           ZIO_FLAG_CANFAIL);
+                       /* We have already acquired the config lock for spa */
+                       read_by_block_level(scn, zb);
+
+                       (void) zio_wait(scn->scn_zio_root);
+                       scn->scn_zio_root = NULL;
+
+                       scn->errorscrub_phys.dep_examined++;
+                       scn->errorscrub_phys.dep_to_examine--;
+                       (*count)++;
+                       if ((*count) == zfs_scrub_error_blocks_per_txg ||
+                           dsl_error_scrub_check_suspend(scn, &zb)) {
+                               dsl_dataset_rele(ds, FTAG);
+                               /* Same convention as the head path above. */
+                               return (SET_ERROR(EFAULT));
+                       }
+               }
+               snap_obj_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg;
+               snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+               dsl_dataset_rele(ds, FTAG);
+       }
+       return (0);
+}
+
+/*
+ * Per-txg entry point of the error scrub.
+ *
+ * Does nothing outside sync pass 1, while the spa is shutting down, or
+ * when no error scrub is active or it is paused. If a resilver is running
+ * the error scrub is canceled instead. Otherwise the error log is walked
+ * from the saved cursor and the referenced blocks are scrubbed until the
+ * log is exhausted, zfs_scrub_error_blocks_per_txg blocks were issued, or
+ * dsl_error_scrub_check_suspend() asks to stop. The updated record is
+ * written out at the end; when no limit was hit the scrub is marked done.
+ *
+ * Pools without the head_errlog feature store one bookmark per log entry;
+ * with the feature each entry names a head dataset whose own ZAP holds the
+ * error blocks.
+ */
+void
+dsl_errorscrub_sync(dsl_pool_t *dp, dmu_tx_t *tx)
+{
+       spa_t *spa = dp->dp_spa;
+       dsl_scan_t *scn = dp->dp_scan;
+
+       /*
+        * Only process scans in sync pass 1.
+        */
+
+       if (spa_sync_pass(spa) > 1)
+               return;
+
+       /*
+        * If the spa is shutting down, then stop scanning. This will
+        * ensure that the scan does not dirty any new data during the
+        * shutdown phase.
+        */
+       if (spa_shutting_down(spa))
+               return;
+
+       if (!dsl_errorscrub_active(scn) || dsl_errorscrub_is_paused(scn)) {
+               return;
+       }
+
+       if (dsl_scan_resilvering(scn->scn_dp)) {
+               /* cancel the error scrub if resilver started */
+               dsl_scan_cancel(scn->scn_dp);
+               return;
+       }
+
+       spa->spa_scrub_active = B_TRUE;
+       scn->scn_sync_start_time = gethrtime();
+
+       /*
+        * zfs_scan_suspend_progress can be set to disable scrub progress.
+        * See more detailed comment in dsl_scan_sync().
+        */
+       if (zfs_scan_suspend_progress) {
+               uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time;
+               int mintime = zfs_scrub_min_time_ms;
+
+               while (zfs_scan_suspend_progress &&
+                   !txg_sync_waiting(scn->scn_dp) &&
+                   !spa_shutting_down(scn->scn_dp->dp_spa) &&
+                   NSEC2MSEC(scan_time_ns) < mintime) {
+                       delay(hz);
+                       scan_time_ns = gethrtime() - scn->scn_sync_start_time;
+               }
+               return;
+       }
+
+       /* Number of blocks scrubbed in this txg. */
+       int i = 0;
+       zap_attribute_t *za;
+       boolean_t limit_exceeded = B_FALSE;
+
+       za = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+       if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
+               /* The bookmark buffer is only needed on this path. */
+               zbookmark_phys_t *zb;
+
+               zb = kmem_zalloc(sizeof (zbookmark_phys_t), KM_SLEEP);
+
+               for (; zap_cursor_retrieve(&scn->errorscrub_cursor, za) == 0;
+                   zap_cursor_advance(&scn->errorscrub_cursor)) {
+                       name_to_bookmark(za->za_name, zb);
+
+                       scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
+                           NULL, ZIO_FLAG_CANFAIL);
+                       dsl_pool_config_enter(dp, FTAG);
+                       read_by_block_level(scn, *zb);
+                       dsl_pool_config_exit(dp, FTAG);
+
+                       (void) zio_wait(scn->scn_zio_root);
+                       scn->scn_zio_root = NULL;
+
+                       scn->errorscrub_phys.dep_examined += 1;
+                       scn->errorscrub_phys.dep_to_examine -= 1;
+                       i++;
+                       if (i == zfs_scrub_error_blocks_per_txg ||
+                           dsl_error_scrub_check_suspend(scn, zb)) {
+                               limit_exceeded = B_TRUE;
+                               break;
+                       }
+               }
+
+               if (!limit_exceeded)
+                       dsl_errorscrub_done(scn, B_TRUE, tx);
+
+               dsl_errorscrub_sync_state(scn, tx);
+               kmem_free(za, sizeof (*za));
+               kmem_free(zb, sizeof (*zb));
+               return;
+       }
+
+       int error = 0;
+       for (; zap_cursor_retrieve(&scn->errorscrub_cursor, za) == 0;
+           zap_cursor_advance(&scn->errorscrub_cursor)) {
+
+               zap_cursor_t *head_ds_cursor;
+               zap_attribute_t *head_ds_attr;
+               zbookmark_err_phys_t head_ds_block;
+
+               head_ds_cursor = kmem_zalloc(sizeof (zap_cursor_t), KM_SLEEP);
+               head_ds_attr = kmem_zalloc(sizeof (zap_attribute_t), KM_SLEEP);
+
+               uint64_t head_ds_err_obj = za->za_first_integer;
+               uint64_t head_ds;
+               name_to_object(za->za_name, &head_ds);
+               boolean_t config_held = B_FALSE;
+               uint64_t top_affected_fs;
+
+               for (zap_cursor_init(head_ds_cursor, spa->spa_meta_objset,
+                   head_ds_err_obj); zap_cursor_retrieve(head_ds_cursor,
+                   head_ds_attr) == 0; zap_cursor_advance(head_ds_cursor)) {
+
+                       name_to_errphys(head_ds_attr->za_name, &head_ds_block);
+
+                       /*
+                        * In case we are called from spa_sync the pool
+                        * config is already held.
+                        */
+                       if (!dsl_pool_config_held(dp)) {
+                               dsl_pool_config_enter(dp, FTAG);
+                               config_held = B_TRUE;
+                       }
+
+                       error = find_top_affected_fs(spa,
+                           head_ds, &head_ds_block, &top_affected_fs);
+                       if (error)
+                               break;
+
+                       error = scrub_filesystem(spa, top_affected_fs,
+                           &head_ds_block, &i);
+
+                       /*
+                        * EFAULT is how scrub_filesystem() reports that the
+                        * per-txg limit was hit. Compare against the plain
+                        * errno: SET_ERROR() is meant for raising errors,
+                        * not for testing them.
+                        */
+                       if (error == EFAULT) {
+                               limit_exceeded = B_TRUE;
+                               break;
+                       }
+               }
+
+               zap_cursor_fini(head_ds_cursor);
+               kmem_free(head_ds_cursor, sizeof (*head_ds_cursor));
+               kmem_free(head_ds_attr, sizeof (*head_ds_attr));
+
+               if (config_held)
+                       dsl_pool_config_exit(dp, FTAG);
+
+               /*
+                * NOTE(review): hitting the limit only leaves the inner
+                * loop; this outer loop still advances to the next head
+                * dataset. Confirm that this is intended.
+                */
+       }
+
+       kmem_free(za, sizeof (*za));
+       if (!limit_exceeded)
+               dsl_errorscrub_done(scn, B_TRUE, tx);
+
+       dsl_errorscrub_sync_state(scn, tx);
+}
+
 /*
  * This is the primary entry point for scans that is called from syncing
  * context. Scans must happen entirely during syncing context so that we
@@ -4109,7 +4784,14 @@ dsl_scan_scrub_done(zio_t *zio)
 
        if (zio->io_error && (zio->io_error != ECKSUM ||
            !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) {
-               atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors);
+               if (dsl_errorscrubbing(spa->spa_dsl_pool) &&
+                   !dsl_errorscrub_is_paused(spa->spa_dsl_pool->dp_scan)) {
+                       atomic_inc_64(&spa->spa_dsl_pool->dp_scan
+                           ->errorscrub_phys.dep_errors);
+               } else {
+                       atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys
+                           .scn_errors);
+               }
        }
 }
 
@@ -4559,3 +5241,7 @@ ZFS_MODULE_PARAM(zfs, zfs_, scan_report_txgs, UINT, ZMOD_RW,
 
 ZFS_MODULE_PARAM(zfs, zfs_, resilver_disable_defer, INT, ZMOD_RW,
        "Process all resilvers immediately");
+
+ZFS_MODULE_PARAM(zfs, zfs_, scrub_error_blocks_per_txg, U64, ZMOD_RW,
+       "Error blocks to be scrubbed in one txg");
+/* END CSTYLED */
index 51d6de9105fb7120fd155bc3e836e6f433c7aefb..1fc2c5e8c55d7b61c2cf3e4124616c5544ac5985 100644 (file)
@@ -8173,6 +8173,7 @@ spa_scan_stop(spa_t *spa)
        ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
        if (dsl_scan_resilvering(spa->spa_dsl_pool))
                return (SET_ERROR(EBUSY));
+
        return (dsl_scan_cancel(spa->spa_dsl_pool));
 }
 
@@ -8198,6 +8199,10 @@ spa_scan(spa_t *spa, pool_scan_func_t func)
                return (0);
        }
 
+       if (func == POOL_SCAN_ERRORSCRUB &&
+           !spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG))
+               return (SET_ERROR(ENOTSUP));
+
        return (dsl_scan(spa->spa_dsl_pool, func));
 }
 
@@ -9249,6 +9254,7 @@ spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
                brt_sync(spa, txg);
                ddt_sync(spa, txg);
                dsl_scan_sync(dp, tx);
+               dsl_errorscrub_sync(dp, tx);
                svr_sync(spa, tx);
                spa_sync_upgrades(spa, tx);
 
index 5fe35278683a7cfe96d661e1290e06fd6050afd7..2e5c22c11490148430308bfc1a2ae0f1ef5e9380 100644 (file)
@@ -110,7 +110,7 @@ errphys_to_name(zbookmark_err_phys_t *zep, char *buf, size_t len)
 /*
  * Convert a string to a err_phys.
  */
-static void
+void
 name_to_errphys(char *buf, zbookmark_err_phys_t *zep)
 {
        zep->zb_object = zfs_strtonum(buf, &buf);
@@ -139,8 +139,7 @@ name_to_bookmark(char *buf, zbookmark_phys_t *zb)
        ASSERT(*buf == '\0');
 }
 
-#ifdef _KERNEL
-static void
+void
 zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb)
 {
        zb->zb_objset = dataset;
@@ -148,7 +147,6 @@ zep_to_zb(uint64_t dataset, zbookmark_err_phys_t *zep, zbookmark_phys_t *zb)
        zb->zb_level = zep->zb_level;
        zb->zb_blkid = zep->zb_blkid;
 }
-#endif
 
 static void
 name_to_object(char *buf, uint64_t *obj)
@@ -238,8 +236,7 @@ spa_log_error(spa_t *spa, const zbookmark_phys_t *zb, const uint64_t *birth)
        mutex_exit(&spa->spa_errlist_lock);
 }
 
-#ifdef _KERNEL
-static int
+int
 find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep,
     uint64_t *birth_txg)
 {
@@ -267,6 +264,34 @@ find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep,
        return (error);
 }
 
+/*
+ * This function finds the oldest affected filesystem containing an error
+ * block.
+ */
+int
+find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
+    uint64_t *top_affected_fs)
+{
+       uint64_t oldest_dsobj;
+       int error = dsl_dataset_oldest_snapshot(spa, head_ds, zep->zb_birth,
+           &oldest_dsobj);
+       if (error != 0)
+               return (error);
+
+       dsl_dataset_t *ds;
+       error = dsl_dataset_hold_obj_flags(spa->spa_dsl_pool, oldest_dsobj,
+           DS_HOLD_FLAG_DECRYPT, FTAG, &ds);
+       if (error != 0)
+               return (error);
+
+       *top_affected_fs =
+           dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;
+       dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
+       return (0);
+}
+
+
+#ifdef _KERNEL
 /*
  * Copy the bookmark to the end of the user-space buffer which starts at
  * uaddr and has *count unused entries, and decrement *count by 1.
@@ -288,7 +313,8 @@ copyout_entry(const zbookmark_phys_t *zb, void *uaddr, uint64_t *count)
  * Each time the error block is referenced by a snapshot or clone, add a
  * zbookmark_phys_t entry to the userspace array at uaddr. The array is
  * filled from the back and the in-out parameter *count is modified to be the
- * number of unused entries at the beginning of the array.
+ * number of unused entries at the beginning of the array. The function
+ * scrub_filesystem() is modelled after this one.
  */
 static int
 check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
@@ -449,28 +475,6 @@ out:
        return (error);
 }
 
-static int
-find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
-    uint64_t *top_affected_fs)
-{
-       uint64_t oldest_dsobj;
-       int error = dsl_dataset_oldest_snapshot(spa, head_ds, zep->zb_birth,
-           &oldest_dsobj);
-       if (error != 0)
-               return (error);
-
-       dsl_dataset_t *ds;
-       error = dsl_dataset_hold_obj_flags(spa->spa_dsl_pool, oldest_dsobj,
-           DS_HOLD_FLAG_DECRYPT, FTAG, &ds);
-       if (error != 0)
-               return (error);
-
-       *top_affected_fs =
-           dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj;
-       dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
-       return (0);
-}
-
 static int
 process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
     void *uaddr, uint64_t *count)
@@ -536,6 +540,21 @@ process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
 }
 #endif
 
+/* Return the number of entries in the last error log (spa_errlog_last) */
+uint64_t
+spa_get_last_errlog_size(spa_t *spa)
+{
+       uint64_t total = 0, count;
+       mutex_enter(&spa->spa_errlog_lock);
+
+       if (spa->spa_errlog_last != 0 &&
+           zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
+           &count) == 0)
+               total += count;
+       mutex_exit(&spa->spa_errlog_lock);
+       return (total);
+}
+
 /*
  * If a healed bookmark matches an entry in the error log we stash it in a tree
  * so that we can later remove the related log entries in sync context.
@@ -1447,6 +1466,7 @@ spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, uint64_t old_head_ds,
 /* error handling */
 EXPORT_SYMBOL(spa_log_error);
 EXPORT_SYMBOL(spa_approx_errlog_size);
+EXPORT_SYMBOL(spa_get_last_errlog_size);
 EXPORT_SYMBOL(spa_get_errlog);
 EXPORT_SYMBOL(spa_errlog_rotate);
 EXPORT_SYMBOL(spa_errlog_drain);
@@ -1456,6 +1476,10 @@ EXPORT_SYMBOL(spa_delete_dataset_errlog);
 EXPORT_SYMBOL(spa_swap_errlog);
 EXPORT_SYMBOL(sync_error_list);
 EXPORT_SYMBOL(spa_upgrade_errlog);
+EXPORT_SYMBOL(find_top_affected_fs);
+EXPORT_SYMBOL(find_birth_txg);
+EXPORT_SYMBOL(zep_to_zb);
+EXPORT_SYMBOL(name_to_errphys);
 #endif
 
 /* BEGIN CSTYLED */
index 54a0eeccf27b04c30ea65566a9187feb34f340ce..89e1ce7165db72c1a5c1c32a32c35755a707cedf 100644 (file)
@@ -2579,9 +2579,18 @@ spa_scan_stat_init(spa_t *spa)
                spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start;
        else
                spa->spa_scan_pass_scrub_pause = 0;
+
+       if (dsl_errorscrub_is_paused(spa->spa_dsl_pool->dp_scan))
+               spa->spa_scan_pass_errorscrub_pause = spa->spa_scan_pass_start;
+       else
+               spa->spa_scan_pass_errorscrub_pause = 0;
+
        spa->spa_scan_pass_scrub_spent_paused = 0;
        spa->spa_scan_pass_exam = 0;
        spa->spa_scan_pass_issued = 0;
+
+       // error scrub stats
+       spa->spa_scan_pass_errorscrub_spent_paused = 0;
 }
 
 /*
@@ -2592,8 +2601,10 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
 {
        dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL;
 
-       if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE)
+       if (scn == NULL || (scn->scn_phys.scn_func == POOL_SCAN_NONE &&
+           scn->errorscrub_phys.dep_func == POOL_SCAN_NONE))
                return (SET_ERROR(ENOENT));
+
        memset(ps, 0, sizeof (pool_scan_stat_t));
 
        /* data stored on disk */
@@ -2616,6 +2627,18 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps)
        ps->pss_issued =
            scn->scn_issued_before_pass + spa->spa_scan_pass_issued;
 
+       /* error scrub data stored on disk */
+       ps->pss_error_scrub_func = scn->errorscrub_phys.dep_func;
+       ps->pss_error_scrub_state = scn->errorscrub_phys.dep_state;
+       ps->pss_error_scrub_start = scn->errorscrub_phys.dep_start_time;
+       ps->pss_error_scrub_end = scn->errorscrub_phys.dep_end_time;
+       ps->pss_error_scrub_examined = scn->errorscrub_phys.dep_examined;
+       ps->pss_error_scrub_to_be_examined =
+           scn->errorscrub_phys.dep_to_examine;
+
+       /* error scrub data not stored on disk */
+       ps->pss_pass_error_scrub_pause = spa->spa_scan_pass_errorscrub_pause;
+
        return (0);
 }
 
index efaf6f9b390a21947c258482b48e2bf8a39781d7..f91a2f3bbca55a6b7597a5b3333e8a5d4e43db03 100644 (file)
@@ -1685,6 +1685,47 @@ zfs_ioc_pool_scan(zfs_cmd_t *zc)
        return (error);
 }
 
+/*
+ * inputs:
+ * poolname             name of the pool
+ * scan_type            scan func (pool_scan_func_t)
+ * scan_command         scrub pause/resume flag (pool_scrub_cmd_t)
+ */
+static const zfs_ioc_key_t zfs_keys_pool_scrub[] = {
+       {"scan_type",           DATA_TYPE_UINT64,       0},
+       {"scan_command",        DATA_TYPE_UINT64,       0},
+};
+
+static int
+zfs_ioc_pool_scrub(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+       spa_t *spa;
+       int error;
+       uint64_t scan_type, scan_cmd;
+
+       if (nvlist_lookup_uint64(innvl, "scan_type", &scan_type) != 0)
+               return (SET_ERROR(EINVAL));
+       if (nvlist_lookup_uint64(innvl, "scan_command", &scan_cmd) != 0)
+               return (SET_ERROR(EINVAL));
+
+       if (scan_cmd >= POOL_SCRUB_FLAGS_END)
+               return (SET_ERROR(EINVAL));
+
+       if ((error = spa_open(poolname, &spa, FTAG)) != 0)
+               return (error);
+
+       if (scan_cmd == POOL_SCRUB_PAUSE) {
+               error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE);
+       } else if (scan_type == POOL_SCAN_NONE) {
+               error = spa_scan_stop(spa);
+       } else {
+               error = spa_scan(spa, scan_type);
+       }
+
+       spa_close(spa, FTAG);
+       return (error);
+}
+
 static int
 zfs_ioc_pool_freeze(zfs_cmd_t *zc)
 {
@@ -7218,6 +7259,11 @@ zfs_ioctl_init(void)
            POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE,
            zfs_keys_vdev_set_props, ARRAY_SIZE(zfs_keys_vdev_set_props));
 
+       zfs_ioctl_register("scrub", ZFS_IOC_POOL_SCRUB,
+           zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME,
+           POOL_CHECK_NONE, B_TRUE, B_TRUE,
+           zfs_keys_pool_scrub, ARRAY_SIZE(zfs_keys_pool_scrub));
+
        /* IOCTLS that use the legacy function signature */
 
        zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze,
index 62d9cbeb6d9003da1acd897fd196db7e10273bb1..9ed1a6d37a97b3f7b8e01fbae85d3671e94967e7 100644 (file)
@@ -479,7 +479,9 @@ tags = ['functional', 'cli_root', 'zpool_resilver']
 tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos',
     'zpool_scrub_004_pos', 'zpool_scrub_005_pos',
     'zpool_scrub_encrypted_unloaded', 'zpool_scrub_print_repairing',
-    'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies']
+    'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies',
+    'zpool_error_scrub_001_pos', 'zpool_error_scrub_002_pos',
+    'zpool_error_scrub_003_pos', 'zpool_error_scrub_004_pos']
 tags = ['functional', 'cli_root', 'zpool_scrub']
 
 [tests/functional/cli_root/zpool_set]
index a1dfaefd71050998483ef22664e1148a64133e04..c661718a296cdfef3e7fa6fcb80a632d6fbda3ec 100644 (file)
@@ -27,6 +27,7 @@
 #include <sys/vdev_impl.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_bootenv.h>
+#include <sys/fs/zfs.h>
 
 /*
  * Test the nvpair inputs for the non-legacy zfs ioctl commands.
@@ -688,6 +689,17 @@ test_vdev_trim(const char *pool)
        nvlist_free(required);
 }
 
+/* Test with invalid values */
+static void
+test_scrub(const char *pool)
+{
+       nvlist_t *required = fnvlist_alloc();
+       fnvlist_add_uint64(required, "scan_type", POOL_SCAN_FUNCS + 1);
+       fnvlist_add_uint64(required, "scan_command", POOL_SCRUB_FLAGS_END + 1);
+       IOC_INPUT_TEST(ZFS_IOC_POOL_SCRUB, pool, required, NULL, EINVAL);
+       nvlist_free(required);
+}
+
 static int
 zfs_destroy(const char *dataset)
 {
@@ -868,6 +880,8 @@ zfs_ioc_input_tests(const char *pool)
        test_set_bootenv(pool);
        test_get_bootenv(pool);
 
+       test_scrub(pool);
+
        /*
         * cleanup
         */
@@ -1022,6 +1036,7 @@ validate_ioc_values(void)
        CHECK(ZFS_IOC_BASE + 82 == ZFS_IOC_GET_BOOKMARK_PROPS);
        CHECK(ZFS_IOC_BASE + 83 == ZFS_IOC_WAIT);
        CHECK(ZFS_IOC_BASE + 84 == ZFS_IOC_WAIT_FS);
+       CHECK(ZFS_IOC_BASE + 87 == ZFS_IOC_POOL_SCRUB);
        CHECK(ZFS_IOC_PLATFORM_BASE + 1 == ZFS_IOC_EVENTS_NEXT);
        CHECK(ZFS_IOC_PLATFORM_BASE + 2 == ZFS_IOC_EVENTS_CLEAR);
        CHECK(ZFS_IOC_PLATFORM_BASE + 3 == ZFS_IOC_EVENTS_SEEK);
index 8521f271be54a7567963b739062f5fb3d84b21ad..133f8387ddafd2c8dfcd804e04c561bdb1156965 100644 (file)
@@ -1969,6 +1969,12 @@ function is_pool_scrubbing #pool <verbose>
        check_pool_status "$1" "scan" "scrub in progress since " $2
 }
 
+function is_pool_error_scrubbing #pool <verbose>
+{
+       check_pool_status "$1" "scrub" "error scrub in progress since " $2
+       return $?
+}
+
 function is_pool_scrubbed #pool <verbose>
 {
        check_pool_status "$1" "scan" "scrub repaired" $2
@@ -1979,11 +1985,23 @@ function is_pool_scrub_stopped #pool <verbose>
        check_pool_status "$1" "scan" "scrub canceled" $2
 }
 
+function is_pool_error_scrub_stopped #pool <verbose>
+{
+       check_pool_status "$1" "scrub" "error scrub canceled on " $2
+       return $?
+}
+
 function is_pool_scrub_paused #pool <verbose>
 {
        check_pool_status "$1" "scan" "scrub paused since " $2
 }
 
+function is_pool_error_scrub_paused #pool <verbose>
+{
+       check_pool_status "$1" "scrub" "error scrub paused since " $2
+       return $?
+}
+
 function is_pool_removing #pool
 {
        check_pool_status "$1" "remove" "in progress since "
index 3e4120f52ca5790d4822701b6b6ba275a9a2eb8f..ad4aec5432992b66b9076391c149eab738b29e44 100644 (file)
@@ -1153,6 +1153,10 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
        functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh \
        functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh \
        functional/cli_root/zpool_scrub/zpool_scrub_print_repairing.ksh \
+       functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh \
+       functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh \
+       functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh \
+       functional/cli_root/zpool_scrub/zpool_error_scrub_004_pos.ksh \
        functional/cli_root/zpool_set/cleanup.ksh \
        functional/cli_root/zpool_set/setup.ksh \
        functional/cli_root/zpool/setup.ksh \
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_001_pos.ksh
new file mode 100755 (executable)
index 0000000..e414cd1
--- /dev/null
@@ -0,0 +1,79 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright (c) 2019, Delphix. All rights reserved.
+# Copyright (c) 2023, George Amanakis. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg
+
+#
+# DESCRIPTION:
+#      Verify scrub -e, -p, and -s show the right status.
+#
+# STRATEGY:
+#      1. Create a pool and create a 10MB file in it.
+#      2. Start an error scrub (-e) and verify it's doing a scrub.
+#      3. Pause error scrub (-p) and verify it's paused.
+#      4. Try to pause a paused error scrub (-p) and make sure that fails.
+#      5. Resume the paused error scrub and verify again it's doing a scrub.
+#      6. Verify zpool scrub -s succeeds when the system is error scrubbing.
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+       log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0
+       log_must zinject -c all
+       rm -f /$TESTPOOL/10m_file
+}
+
+log_onexit cleanup
+
+log_assert "Verify scrub -e, -p, and -s show the right status."
+
+log_must fio --rw=write --name=job --size=10M --filename=/$TESTPOOL/10m_file
+
+log_must zpool export $TESTPOOL
+log_must zpool import $TESTPOOL
+log_must zinject -t data -e checksum -f 100 -am /$TESTPOOL/10m_file
+
+# create some error blocks
+dd if=/$TESTPOOL/10m_file bs=1M count=1 || true
+
+# sync error blocks to disk
+log_must sync_pool $TESTPOOL
+
+log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
+log_must zpool scrub -e $TESTPOOL
+log_must is_pool_error_scrubbing $TESTPOOL true
+log_must zpool scrub -p $TESTPOOL
+log_must is_pool_error_scrub_paused $TESTPOOL true
+log_mustnot zpool scrub -p $TESTPOOL
+log_must is_pool_error_scrub_paused $TESTPOOL true
+log_must zpool scrub -e $TESTPOOL
+log_must is_pool_error_scrubbing $TESTPOOL true
+log_must zpool scrub -s $TESTPOOL
+log_must is_pool_error_scrub_stopped $TESTPOOL true
+
+log_pass "Verified scrub -e, -p, and -s show expected status."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_002_pos.ksh
new file mode 100755 (executable)
index 0000000..daa11c3
--- /dev/null
@@ -0,0 +1,99 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright (c) 2019, Delphix. All rights reserved.
+# Copyright (c) 2023, George Amanakis. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg
+
+#
+# DESCRIPTION:
+#      Verify regular scrub and error scrub can't run at the same time.
+#
+# STRATEGY:
+#      1. Create a pool and create a 10MB file in it.
+#      2. Start a scrub and verify it's doing a scrub.
+#      3. Start an error scrub (-e) and verify it fails.
+#      4. Pause scrub (-p) and verify it's paused.
+#      5. Start an error scrub (-e) and verify it fails again.
+#      6. Resume the paused scrub, verify it and cancel it.
+#      7. Start an error scrub (-e) and verify it's doing an error scrub.
+#      8. Start a scrub and verify it fails.
+#      9. Cancel error scrub (-s) and verify it is canceled.
+#      10. Start scrub, verify it, cancel it and verify it.
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+       log_must set_tunable32 SCAN_SUSPEND_PROGRESS 0
+       log_must zinject -c all
+       rm -f /$TESTPOOL/10m_file
+}
+
+log_onexit cleanup
+
+log_assert "Verify regular scrub and error scrub can't run at the same time."
+
+log_must fio --rw=write --name=job --size=10M --filename=/$TESTPOOL/10m_file
+
+log_must zpool export $TESTPOOL
+log_must zpool import $TESTPOOL
+log_must zinject -t data -e checksum -f 100 -am /$TESTPOOL/10m_file
+
+# create some error blocks before error scrub is requested.
+dd if=/$TESTPOOL/10m_file bs=1M count=1 || true
+# sync error blocks to disk
+log_must sync_pool $TESTPOOL
+
+log_must set_tunable32 SCAN_SUSPEND_PROGRESS 1
+
+log_must zpool scrub $TESTPOOL
+log_must is_pool_scrubbing $TESTPOOL true
+log_mustnot zpool scrub -e $TESTPOOL
+log_must zpool scrub -p $TESTPOOL
+log_must is_pool_scrub_paused $TESTPOOL true
+log_mustnot zpool scrub -e $TESTPOOL
+log_must zpool scrub $TESTPOOL
+log_must is_pool_scrubbing $TESTPOOL true
+log_must zpool scrub -s $TESTPOOL
+log_must is_pool_scrub_stopped $TESTPOOL true
+
+# create some error blocks before error scrub is requested.
+dd if=/$TESTPOOL/10m_file bs=1M count=1 || true
+# sync error blocks to disk
+log_must sync_pool $TESTPOOL
+
+log_must zpool scrub -e $TESTPOOL
+log_must is_pool_error_scrubbing $TESTPOOL true
+log_mustnot zpool scrub $TESTPOOL
+log_must zpool scrub -s $TESTPOOL
+log_must is_pool_error_scrub_stopped $TESTPOOL true
+
+log_must zpool scrub $TESTPOOL
+log_must is_pool_scrubbing $TESTPOOL true
+log_must zpool scrub -s $TESTPOOL
+log_must is_pool_scrub_stopped $TESTPOOL true
+
+log_pass "Verified regular scrub and error scrub can't run at the same time."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_003_pos.ksh
new file mode 100755 (executable)
index 0000000..d0066fd
--- /dev/null
@@ -0,0 +1,109 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright (c) 2019, Delphix. All rights reserved.
+# Copyright (c) 2023, George Amanakis. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg
+
+#
+# DESCRIPTION:
+#      Verify error scrub clears the errorlog, if errors no longer exist.
+#
+# STRATEGY:
+#      1. Create a pool and create a file in it.
+#      2. Zinject errors and read using dd to log errors to disk.
+#      3. Make sure file name is mentioned in the list of error files.
+#      4. Start error scrub and wait for it to finish.
+#      5. Check scrub ran and errors are still reported.
+#      6. Clear corruption and error scrub again.
+#      7. Check scrub ran and errors are cleared.
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+       zinject -c all
+       rm -f /$TESTPOOL2/$TESTFILE0
+       destroy_pool $TESTPOOL2
+}
+
+log_onexit cleanup
+
+log_assert "Verify error scrub clears the errorlog, if errors no longer exist."
+
+truncate -s $MINVDEVSIZE $TESTDIR/vdev_a
+log_must zpool create -f -O primarycache=none $TESTPOOL2 $TESTDIR/vdev_a
+log_must zfs create $TESTPOOL2/$TESTFS1
+typeset file=/$TESTPOOL2/$TESTFS1/$TESTFILE0
+log_must dd if=/dev/urandom of=$file bs=2M count=10
+
+lastfs="$(zfs list -r $TESTPOOL2 | tail -1 | awk '{print $1}')"
+for i in {1..3}; do
+       log_must zfs snap $lastfs@snap$i
+       log_must zfs clone $lastfs@snap$i $TESTPOOL2/clone$i
+       lastfs="$(zfs list -r $TESTPOOL2/clone$i | tail -1 | awk '{print $1}')"
+done
+
+log_must zinject -t data -e checksum -f 100 -a $file
+dd if=$file of=/dev/null bs=2M count=10
+
+# Important: sync error log to disk
+log_must sync_pool $TESTPOOL2
+
+# Check reported errors
+log_must zpool status -v $TESTPOOL2
+log_must eval "zpool status -v $TESTPOOL2 | \
+    grep \"Permanent errors have been detected\""
+log_must eval "zpool status -v | grep '$TESTPOOL2/$TESTFS1/$TESTFILE0'"
+log_must eval "zpool status -v | grep '$TESTPOOL2/$TESTFS1@snap1:/$TESTFILE0'"
+log_must eval "zpool status -v | grep '$TESTPOOL2/clone1/$TESTFILE0'"
+log_must eval "zpool status -v | grep '$TESTPOOL2/clone1@snap2:/$TESTFILE0'"
+log_must eval "zpool status -v | grep '$TESTPOOL2/clone2/$TESTFILE0'"
+log_must eval "zpool status -v | grep '$TESTPOOL2/clone2@snap3:/$TESTFILE0'"
+log_must eval "zpool status -v | grep '$TESTPOOL2/clone3/$TESTFILE0'"
+
+# Check errors are reported if corruption persists
+log_must zpool scrub -e -w $TESTPOOL2
+log_must eval "zpool status -v | grep 'error blocks'"
+log_must zpool status -v $TESTPOOL2
+log_must eval "zpool status -v $TESTPOOL2 | \
+    grep \"Permanent errors have been detected\""
+log_must eval "zpool status -v | grep '$TESTPOOL2/$TESTFS1/$TESTFILE0'"
+log_must eval "zpool status -v | grep '$TESTPOOL2/$TESTFS1@snap1:/$TESTFILE0'"
+log_must eval "zpool status -v | grep '$TESTPOOL2/clone1/$TESTFILE0'"
+log_must eval "zpool status -v | grep '$TESTPOOL2/clone1@snap2:/$TESTFILE0'"
+log_must eval "zpool status -v | grep '$TESTPOOL2/clone2/$TESTFILE0'"
+log_must eval "zpool status -v | grep '$TESTPOOL2/clone2@snap3:/$TESTFILE0'"
+log_must eval "zpool status -v | grep '$TESTPOOL2/clone3/$TESTFILE0'"
+
+# Check errors are cleared
+log_must zinject -c all
+log_must zpool scrub -e -w $TESTPOOL2
+log_must zpool status -v $TESTPOOL2
+log_must eval "zpool status -v | grep 'error blocks'"
+log_mustnot eval "zpool status -v | grep '$TESTFILE0'"
+
+
+log_pass "Verify error scrub clears the errorlog, if errors no longer exist."
diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_004_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_error_scrub_004_pos.ksh
new file mode 100755 (executable)
index 0000000..c88b9b0
--- /dev/null
@@ -0,0 +1,54 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+# Copyright (c) 2023, George Amanakis. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg
+
+#
+# DESCRIPTION:
+#      Verify error scrub cannot run without the head_errlog feature.
+#
+# STRATEGY:
+#      1. Create a pool with head_errlog disabled.
+#      2. Run an error scrub and verify it is not supported.
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+       rm -f /$TESTPOOL2/$TESTFILE0
+       destroy_pool $TESTPOOL2
+}
+
+log_onexit cleanup
+
+log_assert "Verify error scrub cannot run without the head_errlog feature."
+
+truncate -s $MINVDEVSIZE $TESTDIR/vdev_a
+log_must zpool create -f -o feature@head_errlog=disabled $TESTPOOL2 $TESTDIR/vdev_a
+log_mustnot zpool scrub -ew $TESTPOOL2
+
+log_pass "Verify error scrub cannot run without the head_errlog feature."
+