"\t\tspecified by the remaining tuple. Each number is in\n"
"\t\thexadecimal, and only one block can be specified.\n"
"\n"
- "\tzinject [-q] <-t type> [-e errno] [-l level] [-r range]\n"
- "\t [-a] [-m] [-u] [-f freq] <object>\n"
+ "\tzinject [-q] <-t type> [-C dvas] [-e errno] [-l level]\n"
+ "\t\t[-r range] [-a] [-m] [-u] [-f freq] <object>\n"
"\n"
"\t\tInject an error into the object specified by the '-t' option\n"
"\t\tand the object descriptor. The 'object' parameter is\n"
"\n"
"\t\t-q\tQuiet mode. Only print out the handler number added.\n"
"\t\t-e\tInject a specific error. Must be one of 'io',\n"
- "\t\t\t'checksum', 'decompress', or decrypt. Default is 'io'.\n"
+ "\t\t\t'checksum', 'decompress', or 'decrypt'. Default is 'io'.\n"
+ "\t\t-C\tInject the given error only into specific DVAs. The\n"
+ "\t\t\tDVAs should be specified as a list of 0-indexed DVAs\n"
+ "\t\t\tseparated by commas (ex. '0,2').\n"
"\t\t-l\tInject error at a particular block level. Default is "
"0.\n"
"\t\t-m\tAutomatically remount underlying filesystem.\n"
return (0);
if (*count == 0) {
- (void) printf("%3s %-15s %-6s %-6s %-8s %3s %-15s\n",
- "ID", "POOL", "OBJSET", "OBJECT", "TYPE", "LVL", "RANGE");
+ (void) printf("%3s %-15s %-6s %-6s %-8s %3s %-4s "
+ "%-15s\n", "ID", "POOL", "OBJSET", "OBJECT", "TYPE",
+ "LVL", "DVAs", "RANGE");
(void) printf("--- --------------- ------ "
- "------ -------- --- ---------------\n");
+ "------ -------- --- ---- ---------------\n");
}
*count += 1;
- (void) printf("%3d %-15s %-6llu %-6llu %-8s %3d ", id, pool,
- (u_longlong_t)record->zi_objset, (u_longlong_t)record->zi_object,
- type_to_name(record->zi_type), record->zi_level);
+ (void) printf("%3d %-15s %-6llu %-6llu %-8s %-3d 0x%02x ",
+ id, pool, (u_longlong_t)record->zi_objset,
+ (u_longlong_t)record->zi_object, type_to_name(record->zi_type),
+ record->zi_level, record->zi_dvas);
+
if (record->zi_start == 0 &&
record->zi_end == -1ULL)
(void) printf(" range: [%llu, %llu)\n",
(u_longlong_t)record->zi_start,
(u_longlong_t)record->zi_end);
+ (void) printf(" dvas: 0x%x\n", record->zi_dvas);
}
}
return (0);
}
+/*
+ * This function converts a string specifier for DVAs into a bit mask.
+ * The dva's provided by the user should be 0 indexed and separated by
+ * a comma. For example:
+ * "1" -> 0b0010 (0x2)
+ * "0,1" -> 0b0011 (0x3)
+ * "0,1,2" -> 0b0111 (0x7)
+ */
+static int
+parse_dvas(const char *str, uint32_t *dvas_out)
+{
+ const char *c = str;
+ uint32_t mask = 0;
+ boolean_t need_delim = B_FALSE;
+
+ /* max string length is 5 ("0,1,2") */
+ if (strlen(str) > 5 || strlen(str) == 0)
+ return (EINVAL);
+
+ while (*c != '\0') {
+ switch (*c) {
+ case '0':
+ case '1':
+ case '2':
+ /* check for pipe between DVAs */
+ if (need_delim)
+ return (EINVAL);
+
+ /* check if this DVA has been set already */
+ if (mask & (1 << ((*c) - '0')))
+ return (EINVAL);
+
+ mask |= (1 << ((*c) - '0'));
+ need_delim = B_TRUE;
+ break;
+ case ',':
+ need_delim = B_FALSE;
+ break;
+ default:
+ /* check for invalid character */
+ return (EINVAL);
+ }
+ c++;
+ }
+
+ /* check for dangling delimiter */
+ if (!need_delim)
+ return (EINVAL);
+
+ *dvas_out = mask;
+ return (0);
+}
+
int
main(int argc, char **argv)
{
int dur_secs = 0;
int ret;
int flags = 0;
+ uint32_t dvas = 0;
if ((g_zfs = libzfs_init()) == NULL) {
(void) fprintf(stderr, "%s", libzfs_error_init(errno));
}
while ((c = getopt(argc, argv,
- ":aA:b:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
+ ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) {
switch (c) {
case 'a':
flags |= ZINJECT_FLUSH_ARC;
case 'c':
cancel = optarg;
break;
+ case 'C':
+ ret = parse_dvas(optarg, &dvas);
+ if (ret != 0) {
+ (void) fprintf(stderr, "invalid DVA list '%s': "
+ "DVAs should be 0 indexed and separated by "
+ "commas.\n", optarg);
+ usage();
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+ break;
case 'd':
device = optarg;
break;
*/
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
- record.zi_freq > 0) {
+ record.zi_freq > 0 || dvas != 0) {
(void) fprintf(stderr, "cancel (-c) incompatible with "
"any other options\n");
usage();
* for doing injection, so handle it separately here.
*/
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
- level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED) {
+ level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED ||
+ dvas != 0) {
(void) fprintf(stderr, "device (-d) incompatible with "
"data error injection\n");
usage();
} else if (raw != NULL) {
if (range != NULL || type != TYPE_INVAL || level != 0 ||
record.zi_cmd != ZINJECT_UNINITIALIZED ||
- record.zi_freq > 0) {
+ record.zi_freq > 0 || dvas != 0) {
(void) fprintf(stderr, "raw (-b) format with "
"any other options\n");
usage();
error = EIO;
} else if (record.zi_cmd == ZINJECT_PANIC) {
if (raw != NULL || range != NULL || type != TYPE_INVAL ||
- level != 0 || device != NULL || record.zi_freq > 0) {
+ level != 0 || device != NULL || record.zi_freq > 0 ||
+ dvas != 0) {
(void) fprintf(stderr, "panic (-p) incompatible with "
"other options\n");
usage();
record.zi_type = atoi(argv[1]);
dataset[0] = '\0';
} else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) {
+ if (raw != NULL || range != NULL || type != TYPE_INVAL ||
+ level != 0 || record.zi_freq > 0 || dvas != 0) {
+ (void) fprintf(stderr, "hardware failure (-I) "
+ "incompatible with other options\n");
+ usage();
+ libzfs_fini(g_zfs);
+ return (2);
+ }
+
if (nowrites == 0) {
(void) fprintf(stderr, "-s or -g meaningless "
"without -I (ignore writes)\n");
return (1);
}
+ if (dvas != 0) {
+ if (error == EACCES || error == EINVAL) {
+ (void) fprintf(stderr, "the '-c' option may "
+ "not be used with logical data errors "
+ "'decrypt' and 'decompress'\n");
+ libzfs_fini(g_zfs);
+ return (1);
+ }
+
+ record.zi_dvas = dvas;
+ }
+
if (error == EACCES) {
if (type != TYPE_DATA) {
(void) fprintf(stderr, "decryption errors "
uint64_t zi_timer;
uint64_t zi_nlanes;
uint32_t zi_cmd;
- uint32_t zi_pad;
+ uint32_t zi_dvas;
} zinject_record_t;
#define ZINJECT_NULL 0x1
#define ZI_PERCENTAGE_MIN 4294UL
#define ZI_PERCENTAGE_MAX UINT32_MAX
+#define ZI_NO_DVA (-1)
+
typedef enum zinject_type {
ZINJECT_UNINITIALIZED,
ZINJECT_DATA_FAULT,
.B "zinject \-p \fIfunction\fB \fIpool\fB
Panic inside the specified function.
.TP
-.B "zinject \-t data [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-r \fIrange\fB] [\-amq] \fIpath\fB"
+.B "zinject \-t data [\-C \fIdvas\fB] [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-r \fIrange\fB] [\-amq] \fIpath\fB"
Force an error into the contents of a file.
.TP
-.B "zinject \-t dnode [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-amq] \fIpath\fB"
+.B "zinject \-t dnode [\-C \fIdvas\fB] [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-amq] \fIpath\fB"
Force an error into the metadnode for a file or directory.
.TP
-.B "zinject \-t \fImos_type\fB [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-r \fIrange\fB] [\-amqu] \fIpool\fB"
+.B "zinject \-t \fImos_type\fB [\-C \fIdvas\fB] [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-r \fIrange\fB] [\-amqu] \fIpool\fB"
Force an error into the MOS of a pool.
.SH OPTIONS
.TP
Force an error into the pool at this bookmark tuple. Each number is
in hexadecimal, and only one block can be specified.
.TP
+.BI "\-C" " dvas"
+Inject the given error only into specific DVAs. The mask should be
+specified as a list of 0-indexed DVAs separated by commas (ex. '0,2'). This
+option is not applicable to logical data errors such as
+.BR "decompress"
+and
+.BR "decrypt" .
+.TP
.BI "\-d" " vdev"
A vdev specified by path or GUID.
.TP
*/
typedef struct scan_io {
/* fields from blkptr_t */
- uint64_t sio_offset;
uint64_t sio_blk_prop;
uint64_t sio_phys_birth;
uint64_t sio_birth;
zio_cksum_t sio_cksum;
- uint32_t sio_asize;
+ uint32_t sio_nr_dvas;
/* fields from zio_t */
- int sio_flags;
+ uint32_t sio_flags;
zbookmark_phys_t sio_zb;
/* members for queue sorting */
union {
- avl_node_t sio_addr_node; /* link into issueing queue */
+ avl_node_t sio_addr_node; /* link into issuing queue */
list_node_t sio_list_node; /* link for issuing to disk */
} sio_nodes;
+
+ /*
+ * There may be up to SPA_DVAS_PER_BP DVAs here from the bp,
+ * depending on how many were in the original bp. Only the
+ * first DVA is really used for sorting and issuing purposes.
+ * The other DVAs (if provided) simply exist so that the zio
+ * layer can find additional copies to repair from in the
+ * event of an error. This array must go at the end of the
+ * struct to allow this for the variable number of elements.
+ */
+ dva_t sio_dva[0];
} scan_io_t;
+#define SIO_SET_OFFSET(sio, x) DVA_SET_OFFSET(&(sio)->sio_dva[0], x)
+#define SIO_SET_ASIZE(sio, x) DVA_SET_ASIZE(&(sio)->sio_dva[0], x)
+#define SIO_GET_OFFSET(sio) DVA_GET_OFFSET(&(sio)->sio_dva[0])
+#define SIO_GET_ASIZE(sio) DVA_GET_ASIZE(&(sio)->sio_dva[0])
+#define SIO_GET_END_OFFSET(sio) \
+ (SIO_GET_OFFSET(sio) + SIO_GET_ASIZE(sio))
+#define SIO_GET_MUSED(sio) \
+ (sizeof (scan_io_t) + ((sio)->sio_nr_dvas * sizeof (dva_t)))
+
struct dsl_scan_io_queue {
dsl_scan_t *q_scn; /* associated dsl_scan_t */
vdev_t *q_vd; /* top-level vdev that this queue represents */
range_tree_t *q_exts_by_addr;
avl_tree_t q_exts_by_size;
avl_tree_t q_sios_by_addr;
+ uint64_t q_sio_memused;
/* members for zio rate limiting */
uint64_t q_maxinflight_bytes;
static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd);
static void scan_io_queues_destroy(dsl_scan_t *scn);
-static kmem_cache_t *sio_cache;
+static kmem_cache_t *sio_cache[SPA_DVAS_PER_BP];
+
+/* sio->sio_nr_dvas must be set so we know which cache to free from */
+static void
+sio_free(scan_io_t *sio)
+{
+ ASSERT3U(sio->sio_nr_dvas, >, 0);
+ ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);
+
+ kmem_cache_free(sio_cache[sio->sio_nr_dvas - 1], sio);
+}
+
+/* It is up to the caller to set sio->sio_nr_dvas for freeing */
+static scan_io_t *
+sio_alloc(unsigned short nr_dvas)
+{
+ ASSERT3U(nr_dvas, >, 0);
+ ASSERT3U(nr_dvas, <=, SPA_DVAS_PER_BP);
+
+ return (kmem_cache_alloc(sio_cache[nr_dvas - 1], KM_SLEEP));
+}
void
scan_init(void)
*/
fill_weight = zfs_scan_fill_weight;
- sio_cache = kmem_cache_create("sio_cache",
- sizeof (scan_io_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
+ for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
+ char name[36];
+
+ (void) sprintf(name, "sio_cache_%d", i);
+ sio_cache[i] = kmem_cache_create(name,
+ (sizeof (scan_io_t) + ((i + 1) * sizeof (dva_t))),
+ 0, NULL, NULL, NULL, NULL, NULL, 0);
+ }
}
void
scan_fini(void)
{
- kmem_cache_destroy(sio_cache);
+ for (int i = 0; i < SPA_DVAS_PER_BP; i++) {
+ kmem_cache_destroy(sio_cache[i]);
+ }
}
static inline boolean_t
}
static inline void
-sio2bp(const scan_io_t *sio, blkptr_t *bp, uint64_t vdev_id)
+sio2bp(const scan_io_t *sio, blkptr_t *bp)
{
bzero(bp, sizeof (*bp));
- DVA_SET_ASIZE(&bp->blk_dva[0], sio->sio_asize);
- DVA_SET_VDEV(&bp->blk_dva[0], vdev_id);
- DVA_SET_OFFSET(&bp->blk_dva[0], sio->sio_offset);
bp->blk_prop = sio->sio_blk_prop;
bp->blk_phys_birth = sio->sio_phys_birth;
bp->blk_birth = sio->sio_birth;
bp->blk_fill = 1; /* we always only work with data pointers */
bp->blk_cksum = sio->sio_cksum;
+
+ ASSERT3U(sio->sio_nr_dvas, >, 0);
+ ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP);
+
+ bcopy(sio->sio_dva, bp->blk_dva, sio->sio_nr_dvas * sizeof (dva_t));
}
static inline void
bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i)
{
- /* we discard the vdev id, since we can deduce it from the queue */
- sio->sio_offset = DVA_GET_OFFSET(&bp->blk_dva[dva_i]);
- sio->sio_asize = DVA_GET_ASIZE(&bp->blk_dva[dva_i]);
sio->sio_blk_prop = bp->blk_prop;
sio->sio_phys_birth = bp->blk_phys_birth;
sio->sio_birth = bp->blk_birth;
sio->sio_cksum = bp->blk_cksum;
+ sio->sio_nr_dvas = BP_GET_NDVAS(bp);
+
+ /*
+ * Copy the DVAs to the sio. We need all copies of the block so
+ * that the self healing code can use the alternate copies if the
+ * first is corrupted. We want the DVA at index dva_i to be first
+ * in the sio since this is the primary one that we want to issue.
+ */
+ for (int i = 0, j = dva_i; i < sio->sio_nr_dvas; i++, j++) {
+ sio->sio_dva[i] = bp->blk_dva[j % sio->sio_nr_dvas];
+ }
}
int
mutex_enter(&tvd->vdev_scan_io_queue_lock);
queue = tvd->vdev_scan_io_queue;
if (queue != NULL) {
- /* #extents in exts_by_size = # in exts_by_addr */
+ /* # extents in exts_by_size = # in exts_by_addr */
mused += avl_numnodes(&queue->q_exts_by_size) *
- sizeof (range_seg_t) +
- avl_numnodes(&queue->q_sios_by_addr) *
- sizeof (scan_io_t);
+ sizeof (range_seg_t) + queue->q_sio_memused;
}
mutex_exit(&tvd->vdev_scan_io_queue_lock);
}
break;
}
- sio2bp(sio, &bp, queue->q_vd->vdev_id);
- bytes_issued += sio->sio_asize;
+ sio2bp(sio, &bp);
+ bytes_issued += SIO_GET_ASIZE(sio);
scan_exec_io(scn->scn_dp, &bp, sio->sio_flags,
&sio->sio_zb, queue);
(void) list_remove_head(io_list);
scan_io_queues_update_zio_stats(queue, &bp);
- kmem_cache_free(sio_cache, sio);
+ sio_free(sio);
}
atomic_add_64(&scn->scn_bytes_pending, -bytes_issued);
static boolean_t
scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list)
{
- scan_io_t srch_sio, *sio, *next_sio;
+ scan_io_t *srch_sio, *sio, *next_sio;
avl_index_t idx;
uint_t num_sios = 0;
int64_t bytes_issued = 0;
ASSERT(rs != NULL);
ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
- srch_sio.sio_offset = rs->rs_start;
+ srch_sio = sio_alloc(1);
+ srch_sio->sio_nr_dvas = 1;
+ SIO_SET_OFFSET(srch_sio, rs->rs_start);
/*
* The exact start of the extent might not contain any matching zios,
* so if that's the case, examine the next one in the tree.
*/
- sio = avl_find(&queue->q_sios_by_addr, &srch_sio, &idx);
+ sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx);
+ sio_free(srch_sio);
+
if (sio == NULL)
sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER);
- while (sio != NULL && sio->sio_offset < rs->rs_end && num_sios <= 32) {
- ASSERT3U(sio->sio_offset, >=, rs->rs_start);
- ASSERT3U(sio->sio_offset + sio->sio_asize, <=, rs->rs_end);
+ while (sio != NULL &&
+ SIO_GET_OFFSET(sio) < rs->rs_end && num_sios <= 32) {
+ ASSERT3U(SIO_GET_OFFSET(sio), >=, rs->rs_start);
+ ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs->rs_end);
next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio);
avl_remove(&queue->q_sios_by_addr, sio);
+ queue->q_sio_memused -= SIO_GET_MUSED(sio);
- bytes_issued += sio->sio_asize;
+ bytes_issued += SIO_GET_ASIZE(sio);
num_sios++;
list_insert_tail(list, sio);
sio = next_sio;
* in the segment we update it to reflect the work we were able to
* complete. Otherwise, we remove it from the range tree entirely.
*/
- if (sio != NULL && sio->sio_offset < rs->rs_end) {
+ if (sio != NULL && SIO_GET_OFFSET(sio) < rs->rs_end) {
range_tree_adjust_fill(queue->q_exts_by_addr, rs,
-bytes_issued);
range_tree_resize_segment(queue->q_exts_by_addr, rs,
- sio->sio_offset, rs->rs_end - sio->sio_offset);
+ SIO_GET_OFFSET(sio), rs->rs_end - SIO_GET_OFFSET(sio));
return (B_TRUE);
} else {
first_sio = list_head(&sio_list);
last_sio = list_tail(&sio_list);
- seg_end = last_sio->sio_offset + last_sio->sio_asize;
+ seg_end = SIO_GET_END_OFFSET(last_sio);
if (seg_start == 0)
- seg_start = first_sio->sio_offset;
+ seg_start = SIO_GET_OFFSET(first_sio);
/*
* Issuing sios can take a long time so drop the
{
int i;
- /* update the spa's stats on how many bytes we have issued */
- for (i = 0; i < BP_GET_NDVAS(bp); i++) {
+ /*
+ * Update the spa's stats on how many bytes we have issued.
+ * Sequential scrubs create a zio for each DVA of the bp. Each
+ * of these will include all DVAs for repair purposes, but the
+ * zio code will only try the first one unless there is an issue.
+ * Therefore, we should only count the first DVA for these IOs.
+ */
+ if (scn->scn_is_sorted) {
atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued,
- DVA_GET_ASIZE(&bp->blk_dva[i]));
+ DVA_GET_ASIZE(&bp->blk_dva[0]));
+ } else {
+ spa_t *spa = scn->scn_dp->dp_spa;
+
+ for (i = 0; i < BP_GET_NDVAS(bp); i++) {
+ atomic_add_64(&spa->spa_scan_pass_issued,
+ DVA_GET_ASIZE(&bp->blk_dva[i]));
+ }
}
/*
scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio)
{
avl_index_t idx;
- int64_t asize = sio->sio_asize;
+ int64_t asize = SIO_GET_ASIZE(sio);
dsl_scan_t *scn = queue->q_scn;
ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) {
/* block is already scheduled for reading */
atomic_add_64(&scn->scn_bytes_pending, -asize);
- kmem_cache_free(sio_cache, sio);
+ sio_free(sio);
return;
}
avl_insert(&queue->q_sios_by_addr, sio, idx);
- range_tree_add(queue->q_exts_by_addr, sio->sio_offset, asize);
+ queue->q_sio_memused += SIO_GET_MUSED(sio);
+ range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), asize);
}
/*
int zio_flags, const zbookmark_phys_t *zb)
{
dsl_scan_t *scn = queue->q_scn;
- scan_io_t *sio = kmem_cache_alloc(sio_cache, KM_SLEEP);
+ scan_io_t *sio = sio_alloc(BP_GET_NDVAS(bp));
ASSERT0(BP_IS_GANG(bp));
ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock));
* get an integer underflow in case the worker processes the
* zio before we get to incrementing this counter.
*/
- atomic_add_64(&scn->scn_bytes_pending, sio->sio_asize);
+ atomic_add_64(&scn->scn_bytes_pending, SIO_GET_ASIZE(sio));
scan_io_queue_insert_impl(queue, sio);
}
{
const scan_io_t *a = x, *b = y;
- if (a->sio_offset < b->sio_offset)
- return (-1);
- if (a->sio_offset == b->sio_offset)
- return (0);
- return (1);
+ return (AVL_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b)));
}
/* IO queues are created on demand when they are needed. */
q->q_scn = scn;
q->q_vd = vd;
+ q->q_sio_memused = 0;
cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL);
q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops,
&q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap);
while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) !=
NULL) {
ASSERT(range_tree_contains(queue->q_exts_by_addr,
- sio->sio_offset, sio->sio_asize));
- bytes_dequeued += sio->sio_asize;
- kmem_cache_free(sio_cache, sio);
+ SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio)));
+ bytes_dequeued += SIO_GET_ASIZE(sio);
+ queue->q_sio_memused -= SIO_GET_MUSED(sio);
+ sio_free(sio);
}
+ ASSERT0(queue->q_sio_memused);
atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued);
range_tree_vacate(queue->q_exts_by_addr, NULL, queue);
range_tree_destroy(queue->q_exts_by_addr);
vdev_t *vdev;
kmutex_t *q_lock;
dsl_scan_io_queue_t *queue;
- scan_io_t srch, *sio;
+ scan_io_t *srch_sio, *sio;
avl_index_t idx;
uint64_t start, size;
return;
}
- bp2sio(bp, &srch, dva_i);
- start = srch.sio_offset;
- size = srch.sio_asize;
+ srch_sio = sio_alloc(BP_GET_NDVAS(bp));
+ bp2sio(bp, srch_sio, dva_i);
+ start = SIO_GET_OFFSET(srch_sio);
+ size = SIO_GET_ASIZE(srch_sio);
/*
* We can find the zio in two states:
* be done with issuing the zio's it gathered and will
* signal us.
*/
- sio = avl_find(&queue->q_sios_by_addr, &srch, &idx);
+ sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx);
+ sio_free(srch_sio);
+
if (sio != NULL) {
- int64_t asize = sio->sio_asize;
+ int64_t asize = SIO_GET_ASIZE(sio);
blkptr_t tmpbp;
/* Got it while it was cold in the queue */
- ASSERT3U(start, ==, sio->sio_offset);
+ ASSERT3U(start, ==, SIO_GET_OFFSET(sio));
ASSERT3U(size, ==, asize);
avl_remove(&queue->q_sios_by_addr, sio);
+ queue->q_sio_memused -= SIO_GET_MUSED(sio);
ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size));
range_tree_remove_fill(queue->q_exts_by_addr, start, size);
atomic_add_64(&scn->scn_bytes_pending, -asize);
/* count the block as though we issued it */
- sio2bp(sio, &tmpbp, dva_i);
+ sio2bp(sio, &tmpbp);
count_block(scn, dp->dp_blkstats, &tmpbp);
- kmem_cache_free(sio_cache, sio);
+ sio_free(sio);
}
mutex_exit(q_lock);
}
if (vd == NULL) {
dva_t *dva = zio->io_bp->blk_dva;
spa_t *spa = zio->io_spa;
+ dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
dva_t dva_copy[SPA_DVAS_PER_BP];
c = BP_GET_NDVAS(zio->io_bp);
+ /*
+ * The sequential scrub code sorts and issues all DVAs
+ * of a bp separately. Each of these IOs includes all
+ * original DVA copies so that repairs can be performed
+ * in the event of an error, but we only actually want
+ * to check the first DVA since the others will be
+ * checked by their respective sorted IOs. Only if we
+ * hit an error will we try all DVAs upon retrying.
+ *
+ * Note: This check is safe even if the user switches
+ * from a legacy scrub to a sequential one in the middle
+ * of processing, since scn_is_sorted isn't updated until
+ * all outstanding IOs from the previous scrub pass
+ * complete.
+ */
+ if ((zio->io_flags & ZIO_FLAG_SCRUB) &&
+ !(zio->io_flags & ZIO_FLAG_IO_RETRY) &&
+ dsl_scan_scrubbing(spa->spa_dsl_pool) &&
+ scn->scn_is_sorted) {
+ c = 1;
+ } else {
+ c = BP_GET_NDVAS(zio->io_bp);
+ }
+
/*
* If we do not trust the pool config, some DVAs might be
* invalid or point to vdevs that do not exist. We skip them.
* Returns true if the given record matches the I/O in progress.
*/
static boolean_t
-zio_match_handler(const zbookmark_phys_t *zb, uint64_t type,
+zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva,
zinject_record_t *record, int error)
{
/*
zb->zb_level == record->zi_level &&
zb->zb_blkid >= record->zi_start &&
zb->zb_blkid <= record->zi_end &&
- error == record->zi_error)
+ (record->zi_dvas == 0 || (record->zi_dvas & (1ULL << dva))) &&
+ error == record->zi_error) {
return (freq_triggered(record->zi_freq));
+ }
return (B_FALSE);
}
handler->zi_record.zi_cmd != ZINJECT_DECRYPT_FAULT)
continue;
- if (zio_match_handler(zb, type, &handler->zi_record, error)) {
+ if (zio_match_handler(zb, type, ZI_NO_DVA,
+ &handler->zi_record, error)) {
ret = error;
break;
}
return (ret);
}
+/*
+ * If this is a physical I/O for a vdev child determine which DVA it is
+ * for. We iterate backwards through the DVAs matching on the offset so
+ * that we end up with ZI_NO_DVA (-1) if we don't find a match.
+ */
+static int
+zio_match_dva(zio_t *zio)
+{
+ int i = ZI_NO_DVA;
+
+ if (zio->io_bp != NULL && zio->io_vd != NULL &&
+ zio->io_child_type == ZIO_CHILD_VDEV) {
+ for (i = BP_GET_NDVAS(zio->io_bp) - 1; i >= 0; i--) {
+ dva_t *dva = &zio->io_bp->blk_dva[i];
+ uint64_t off = DVA_GET_OFFSET(dva);
+ vdev_t *vd = vdev_lookup_top(zio->io_spa,
+ DVA_GET_VDEV(dva));
+
+ /* Compensate for vdev label added to leaves */
+ if (zio->io_vd->vdev_ops->vdev_op_leaf)
+ off += VDEV_LABEL_START_SIZE;
+
+ if (zio->io_vd == vd && zio->io_offset == off)
+ break;
+ }
+ }
+
+ return (i);
+}
+
+
/*
* Determine if the I/O in question should return failure. Returns the errno
* to be returned to the caller.
for (handler = list_head(&inject_handlers); handler != NULL;
handler = list_next(&inject_handlers, handler)) {
-
if (zio->io_spa != handler->zi_spa ||
handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT)
continue;
- /* If this handler matches, return EIO */
+ /* If this handler matches, return the specified error */
if (zio_match_handler(&zio->io_logical->io_bookmark,
zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
- &handler->zi_record, error)) {
+ zio_match_dva(zio), &handler->zi_record, error)) {
ret = error;
break;
}
tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos',
'zpool_scrub_004_pos', 'zpool_scrub_005_pos',
'zpool_scrub_encrypted_unloaded', 'zpool_scrub_print_repairing',
- 'zpool_scrub_offline_device']
+ 'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies']
tags = ['functional', 'cli_root', 'zpool_scrub']
[tests/functional/cli_root/zpool_set]
zpool_scrub_005_pos.ksh \
zpool_scrub_encrypted_unloaded.ksh \
zpool_scrub_offline_device.ksh \
- zpool_scrub_print_repairing.ksh
+ zpool_scrub_print_repairing.ksh \
+ zpool_scrub_multiple_copies.ksh
dist_pkgdata_DATA = \
zpool_scrub.cfg
--- /dev/null
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source. A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2019 Datto, Inc. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Scrubs and self-healing should be able to repair data from additional
+# copies that may be stored.
+#
+#
+# STRATEGY:
+# 1. Create a dataset with copies=3
+# 2. Write a file to the dataset
+# 3. zinject errors into the first and second DVAs of that file
+# 4. Scrub and verify the scrub repaired all errors
+# 7. Read the file normally to check that self healing also works
+# 8. Remove the zinject handler
+# 9. Scrub again and confirm 0 bytes were scrubbed
+#
+
+verify_runnable "global"
+
+function cleanup
+{
+ destroy_dataset $TESTPOOL/$TESTFS2
+ log_must zinject -c all
+}
+log_onexit cleanup
+
+log_assert "Scrubs and self healing must work with additional copies"
+
+log_must zfs create -o copies=3 $TESTPOOL/$TESTFS2
+typeset mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS2)
+log_must mkfile 10m $mntpnt/file
+log_must zpool sync $TESTPOOL
+
+log_must zinject -a -t data -C 0,1 -e io $mntpnt/file
+
+log_must zpool scrub $TESTPOOL
+log_must wait_scrubbed $TESTPOOL
+
+log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
+log_must check_pool_status $TESTPOOL "errors" "No known data errors"
+
+log_must dd if=$mntpnt/file of=/dev/null bs=1M iflag=fullblock
+log_must check_pool_status $TESTPOOL "errors" "No known data errors"
+
+log_must zinject -c all
+
+log_must zpool scrub $TESTPOOL
+log_must wait_scrubbed $TESTPOOL
+
+zpool status
+
+log_must check_pool_status $TESTPOOL "errors" "No known data errors"
+log_must check_pool_status $TESTPOOL "scan" "with 0 errors"
+log_must check_pool_status $TESTPOOL "scan" "repaired 0B"
+
+log_pass "Scrubs and self healing work with additional copies"
function cleanup
{
- poolexists $TESTPOOL && destroy_pool $TESTPOOL
+ poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2
log_must rm -f $DISK1 $DISK2 $DISK3 $DISK4
}
log_must truncate -s $DEVSIZE $DISK2
log_must truncate -s $DEVSIZE $DISK3
log_must truncate -s $DEVSIZE $DISK4
-poolexists $TESTPOOL && destroy_pool $TESTPOOL
-log_must zpool create -O mountpoint=$TESTDIR $TESTPOOL \
+poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2
+log_must zpool create -O mountpoint=$TESTDIR $TESTPOOL2 \
raidz2 $DISK1 $DISK2 $DISK3 $DISK4
# 2. Offline the first device
-zpool_do_sync 'offline' $TESTPOOL $DISK1
+zpool_do_sync 'offline' $TESTPOOL2 $DISK1
# 3. Write to the pool
log_must mkfile $FILESIZE "$TESTDIR/data.bin"
# 4. Scrub the pool
-zpool_scrub_sync $TESTPOOL
+zpool_scrub_sync $TESTPOOL2
# 5. Online the first device and offline the second device
-zpool_do_sync 'online' $TESTPOOL $DISK1
-zpool_do_sync 'offline' $TESTPOOL $DISK2
-log_must wait_for_resilver_end $TESTPOOL $RESILVER_TIMEOUT
+zpool_do_sync 'online' $TESTPOOL2 $DISK1
+zpool_do_sync 'offline' $TESTPOOL2 $DISK2
+log_must wait_for_resilver_end $TESTPOOL2 $RESILVER_TIMEOUT
# 6. Scrub the pool again
-zpool_scrub_sync $TESTPOOL
+zpool_scrub_sync $TESTPOOL2
# 7. Verify data integrity
-cksum=$(zpool status $TESTPOOL | awk 'L{print $NF;L=0} /CKSUM$/{L=1}')
+cksum=$(zpool status $TESTPOOL2 | awk 'L{print $NF;L=0} /CKSUM$/{L=1}')
if [[ $cksum != 0 ]]; then
- log_fail "Unexpected CKSUM errors found on $TESTPOOL ($cksum)"
+ log_fail "Unexpected CKSUM errors found on $TESTPOOL2 ($cksum)"
fi
log_pass "Scrubbing a pool with offline devices correctly preserves DTLs"