 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/abd.h>
#include <sys/fs/zfs.h>

static kstat_t *mirror_ksp = NULL;

typedef struct mirror_stats {
	kstat_named_t vdev_mirror_stat_rotating_linear;
	kstat_named_t vdev_mirror_stat_rotating_offset;
	kstat_named_t vdev_mirror_stat_rotating_seek;
	kstat_named_t vdev_mirror_stat_non_rotating_linear;
	kstat_named_t vdev_mirror_stat_non_rotating_seek;

	kstat_named_t vdev_mirror_stat_preferred_found;
	kstat_named_t vdev_mirror_stat_preferred_not_found;
} mirror_stats_t;

static mirror_stats_t mirror_stats = {
	/* New I/O follows directly the last I/O */
	{ "rotating_linear", KSTAT_DATA_UINT64 },
	/* New I/O is within zfs_vdev_mirror_rotating_seek_offset of the last */
	{ "rotating_offset", KSTAT_DATA_UINT64 },
	/* New I/O requires random seek */
	{ "rotating_seek", KSTAT_DATA_UINT64 },
	/* New I/O follows directly the last I/O (nonrot) */
	{ "non_rotating_linear", KSTAT_DATA_UINT64 },
	/* New I/O requires random seek (nonrot) */
	{ "non_rotating_seek", KSTAT_DATA_UINT64 },
	/* Preferred child vdev found */
	{ "preferred_found", KSTAT_DATA_UINT64 },
	/* Preferred child vdev not found or equal load */
	{ "preferred_not_found", KSTAT_DATA_UINT64 },
};

#define	MIRROR_STAT(stat)	(mirror_stats.stat.value.ui64)
#define	MIRROR_INCR(stat, val)	atomic_add_64(&MIRROR_STAT(stat), val)
#define	MIRROR_BUMP(stat)	MIRROR_INCR(stat, 1)
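
/*
 * For example, MIRROR_BUMP(vdev_mirror_stat_rotating_seek) expands to
 * atomic_add_64(&mirror_stats.vdev_mirror_stat_rotating_seek.value.ui64, 1),
 * i.e. a lock-free increment of the matching counter above.
 */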

void
vdev_mirror_stat_init(void)
{
	mirror_ksp = kstat_create("zfs", 0, "vdev_mirror_stats",
	    "misc", KSTAT_TYPE_NAMED,
	    sizeof (mirror_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
	if (mirror_ksp != NULL) {
		mirror_ksp->ks_data = &mirror_stats;
		kstat_install(mirror_ksp);
	}
}
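
/*
 * Once installed, these counters are typically visible on Linux under
 * /proc/spl/kstat/zfs/vdev_mirror_stats, which is handy for validating
 * the effect of the zfs_vdev_mirror_* tunables below.
 */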

void
vdev_mirror_stat_fini(void)
{
	if (mirror_ksp != NULL) {
		kstat_delete(mirror_ksp);
		mirror_ksp = NULL;
	}
}

/*
 * Virtual device vector for mirroring.
 */
typedef struct mirror_child {
	vdev_t		*mc_vd;
	uint64_t	mc_offset;
	int		mc_error;
	int		mc_load;
	uint8_t		mc_tried;
	uint8_t		mc_skipped;
	uint8_t		mc_speculative;
} mirror_child_t;

typedef struct mirror_map {
	int		*mm_preferred;
	int		mm_preferred_cnt;
	int		mm_children;
	boolean_t	mm_replacing;
	boolean_t	mm_root;
	mirror_child_t	mm_child[];
} mirror_map_t;

static int vdev_mirror_shift = 21;

/*
 * The load configuration settings below are tuned by default for
 * the case where all devices are of the same rotational type.
 *
 * If there is a mixture of rotating and non-rotating media, setting
 * zfs_vdev_mirror_non_rotating_seek_inc to 0 may well provide better results
 * as it will direct more reads to the non-rotating vdevs, which typically
 * offer higher performance.
 */

/* Rotating media load calculation configuration. */
static int zfs_vdev_mirror_rotating_inc = 0;
static int zfs_vdev_mirror_rotating_seek_inc = 5;
static int zfs_vdev_mirror_rotating_seek_offset = 1 * 1024 * 1024;

/* Non-rotating media load calculation configuration. */
static int zfs_vdev_mirror_non_rotating_inc = 0;
static int zfs_vdev_mirror_non_rotating_seek_inc = 1;
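
/*
 * Example: on Linux these are exposed as module parameters (see the end of
 * this file), so a mixed rotating/non-rotating mirror can be biased towards
 * the non-rotating children at runtime with something like:
 *
 *	echo 0 > /sys/module/zfs/parameters/zfs_vdev_mirror_non_rotating_seek_inc
 */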

static int
vdev_mirror_map_size(int children)
{
	return (offsetof(mirror_map_t, mm_child[children]) +
	    sizeof (int) * children);
}
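
/*
 * Example: for a 2-way mirror the size is offsetof(mirror_map_t,
 * mm_child[2]) plus 2 * sizeof (int), i.e. the flexible mm_child[] array
 * is immediately followed by the mm_preferred index array in a single
 * allocation (see vdev_mirror_map_alloc() below).
 */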

static inline mirror_map_t *
vdev_mirror_map_alloc(int children, boolean_t replacing, boolean_t root)
{
	mirror_map_t *mm;

	mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP);
	mm->mm_children = children;
	mm->mm_replacing = replacing;
	mm->mm_root = root;
	mm->mm_preferred = (int *)((uintptr_t)mm +
	    offsetof(mirror_map_t, mm_child[children]));

	return (mm);
}

static void
vdev_mirror_map_free(zio_t *zio)
{
	mirror_map_t *mm = zio->io_vsd;

	kmem_free(mm, vdev_mirror_map_size(mm->mm_children));
}

static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
	.vsd_free = vdev_mirror_map_free,
	.vsd_cksum_report = zio_vsd_default_cksum_report,
};

static int
vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset)
{
	uint64_t last_offset;
	int64_t offset_diff;
	int load;

	/* All DVAs have equal weight at the root. */
	if (mm->mm_root)
		return (INT_MAX);

	/*
	 * We don't return INT_MAX if the device is resilvering (i.e.
	 * vdev_resilver_txg != 0): in testing, penalizing resilvering
	 * vdevs gave slightly worse overall performance than not doing so.
	 */

	/* Fix zio_offset for leaf vdevs */
	if (vd->vdev_ops->vdev_op_leaf)
		zio_offset += VDEV_LABEL_START_SIZE;

	/* Standard load based on pending queue length. */
	load = vdev_queue_length(vd);
	last_offset = vdev_queue_last_offset(vd);

	if (vd->vdev_nonrot) {
		/* Non-rotating media. */
		if (last_offset == zio_offset) {
			MIRROR_BUMP(vdev_mirror_stat_non_rotating_linear);
			return (load + zfs_vdev_mirror_non_rotating_inc);
		}

		/*
		 * Apply a seek penalty even for non-rotating devices as
		 * sequential I/O's can be aggregated into fewer operations on
		 * the device, thus avoiding unnecessary per-command overhead
		 * and boosting performance.
		 */
		MIRROR_BUMP(vdev_mirror_stat_non_rotating_seek);
		return (load + zfs_vdev_mirror_non_rotating_seek_inc);
	}

	/* Rotating media I/O's which directly follow the last I/O. */
	if (last_offset == zio_offset) {
		MIRROR_BUMP(vdev_mirror_stat_rotating_linear);
		return (load + zfs_vdev_mirror_rotating_inc);
	}

	/*
	 * Apply half the seek increment to I/O's within seek offset
	 * of the last I/O issued to this vdev as they should incur less
	 * of a seek increment.
	 */
	offset_diff = (int64_t)(last_offset - zio_offset);
	if (ABS(offset_diff) < zfs_vdev_mirror_rotating_seek_offset) {
		MIRROR_BUMP(vdev_mirror_stat_rotating_offset);
		return (load + (zfs_vdev_mirror_rotating_seek_inc / 2));
	}

	/* Apply the full seek increment to all other I/O's. */
	MIRROR_BUMP(vdev_mirror_stat_rotating_seek);
	return (load + zfs_vdev_mirror_rotating_seek_inc);
}
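
/*
 * Worked example with the defaults above: for a rotating child whose last
 * queued offset is 512KB away from the new I/O, the load returned is the
 * pending queue length plus zfs_vdev_mirror_rotating_seek_inc / 2 = 2;
 * for one 4MB away it is the queue length plus the full increment of 5.
 */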

/*
 * Avoid inlining the function to keep vdev_mirror_io_start(), which is
 * this function's only caller, as small as possible on the stack.
 */
noinline static mirror_map_t *
vdev_mirror_map_init(zio_t *zio)
{
	mirror_map_t *mm = NULL;
	mirror_child_t *mc;
	vdev_t *vd = zio->io_vd;
	int c;

	if (vd == NULL) {
		dva_t *dva = zio->io_bp->blk_dva;
		spa_t *spa = zio->io_spa;

		mm = vdev_mirror_map_alloc(BP_GET_NDVAS(zio->io_bp), B_FALSE,
		    B_TRUE);
		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];

			mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
			mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
		}
	} else {
		mm = vdev_mirror_map_alloc(vd->vdev_children,
		    (vd->vdev_ops == &vdev_replacing_ops ||
		    vd->vdev_ops == &vdev_spare_ops), B_FALSE);
		for (c = 0; c < mm->mm_children; c++) {
			mc = &mm->mm_child[c];
			mc->mc_vd = vd->vdev_child[c];
			mc->mc_offset = zio->io_offset;
		}
	}

	zio->io_vsd = mm;
	zio->io_vsd_ops = &vdev_mirror_vsd_ops;
	return (mm);
}

static int
vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *ashift)
{
	int numerrors = 0;
	int lasterror = 0;

	if (vd->vdev_children == 0) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (int c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}
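
		/*
		 * MIN(x - 1, y - 1) + 1 below relies on unsigned wraparound:
		 * an initial *asize of 0 becomes UINT64_MAX after the
		 * subtraction, so the first open child seeds the size and
		 * later children can only shrink it.
		 */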
		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*ashift = MAX(*ashift, cvd->vdev_ashift);
	}

	if (numerrors == vd->vdev_children) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

static void
vdev_mirror_close(vdev_t *vd)
{
	for (int c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}

static void
vdev_mirror_child_done(zio_t *zio)
{
	mirror_child_t *mc = zio->io_private;

	mc->mc_error = zio->io_error;
	mc->mc_tried = 1;
	mc->mc_skipped = 0;
}

static void
vdev_mirror_scrub_done(zio_t *zio)
{
	mirror_child_t *mc = zio->io_private;

	if (zio->io_error == 0) {
		zio_t *pio;
		zio_link_t *zl = NULL;

		mutex_enter(&zio->io_lock);
		while ((pio = zio_walk_parents(zio, &zl)) != NULL) {
			mutex_enter(&pio->io_lock);
			ASSERT3U(zio->io_size, >=, pio->io_size);
			abd_copy(pio->io_abd, zio->io_abd, pio->io_size);
			mutex_exit(&pio->io_lock);
		}
		mutex_exit(&zio->io_lock);
	}

	abd_free(zio->io_abd);

	mc->mc_error = zio->io_error;
	mc->mc_tried = 1;
	mc->mc_skipped = 0;
}

/*
 * Check the other, lower-index DVAs to see if they're on the same
 * vdev as the child we picked.  If they are, use them since they
 * are likely to have been allocated from the primary metaslab in
 * use at the time, and hence are more likely to have locality with
 * single-copy data.
 */
static int
vdev_mirror_dva_select(zio_t *zio, int p)
{
	dva_t *dva = zio->io_bp->blk_dva;
	mirror_map_t *mm = zio->io_vsd;
	int preferred;
	int c;

	preferred = mm->mm_preferred[p];
	for (p--; p >= 0; p--) {
		c = mm->mm_preferred[p];
		if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred]))
			preferred = c;
	}

	return (preferred);
}

static int
vdev_mirror_preferred_child_randomize(zio_t *zio)
{
	mirror_map_t *mm = zio->io_vsd;
	int p;

	if (mm->mm_root) {
		p = spa_get_random(mm->mm_preferred_cnt);
		return (vdev_mirror_dva_select(zio, p));
	}

	/*
	 * To ensure we don't always favour the first matching vdev,
	 * which could lead to wear leveling issues on SSD's, we
	 * use the I/O offset as a pseudo random seed into the vdevs
	 * which have the lowest load.
	 */
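	/*
	 * Example: with vdev_mirror_shift = 21 the offset selects a 2MB
	 * region, so with two equally loaded children, I/Os in even-numbered
	 * 2MB regions go to mm_preferred[0] and odd-numbered regions to
	 * mm_preferred[1].
	 */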
	p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt;
	return (mm->mm_preferred[p]);
}

/*
 * Try to find a vdev whose DTL doesn't contain the block we want to read,
 * preferring vdevs based on determined load.  If we can't, try the read on
 * any vdev we haven't already tried.
 */
static int
vdev_mirror_child_select(zio_t *zio)
{
	mirror_map_t *mm = zio->io_vsd;
	uint64_t txg = zio->io_txg;
	int c, lowest_load;

	ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);

	lowest_load = INT_MAX;
	mm->mm_preferred_cnt = 0;
	for (c = 0; c < mm->mm_children; c++) {
		mirror_child_t *mc;

		mc = &mm->mm_child[c];
		if (mc->mc_tried || mc->mc_skipped)
			continue;

		if (mc->mc_vd == NULL || !vdev_readable(mc->mc_vd)) {
			mc->mc_error = SET_ERROR(ENXIO);
			mc->mc_tried = 1;	/* don't even try */
			mc->mc_skipped = 1;
			continue;
		}

		if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) {
			mc->mc_error = SET_ERROR(ESTALE);
			mc->mc_skipped = 1;
			mc->mc_speculative = 1;
			continue;
		}

		mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset);
		if (mc->mc_load > lowest_load)
			continue;

		if (mc->mc_load < lowest_load) {
			lowest_load = mc->mc_load;
			mm->mm_preferred_cnt = 0;
		}
		mm->mm_preferred[mm->mm_preferred_cnt] = c;
		mm->mm_preferred_cnt++;
	}

	if (mm->mm_preferred_cnt == 1) {
		MIRROR_BUMP(vdev_mirror_stat_preferred_found);
		return (mm->mm_preferred[0]);
	}

	if (mm->mm_preferred_cnt > 1) {
		MIRROR_BUMP(vdev_mirror_stat_preferred_not_found);
		return (vdev_mirror_preferred_child_randomize(zio));
	}

	/*
	 * Every device is either missing or has this txg in its DTL.
	 * Look for any child we haven't already tried before giving up.
	 */
	for (c = 0; c < mm->mm_children; c++) {
		if (!mm->mm_child[c].mc_tried)
			return (c);
	}

	/*
	 * Every child failed.  There's no place left to look.
	 */
	return (-1);
}

static void
vdev_mirror_io_start(zio_t *zio)
{
	mirror_map_t *mm;
	mirror_child_t *mc;
	int c, children;

	mm = vdev_mirror_map_init(zio);

	if (zio->io_type == ZIO_TYPE_READ) {
		if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) {
			/*
			 * For scrubbing reads we need to allocate a read
			 * buffer for each child and issue reads to all
			 * children.  If any child succeeds, it will copy its
			 * data into zio->io_abd in vdev_mirror_scrub_done().
			 */
			for (c = 0; c < mm->mm_children; c++) {
				mc = &mm->mm_child[c];
				zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
				    mc->mc_vd, mc->mc_offset,
				    abd_alloc_sametype(zio->io_abd,
				    zio->io_size), zio->io_size,
				    zio->io_type, zio->io_priority, 0,
				    vdev_mirror_scrub_done, mc));
			}
			zio_execute(zio);
			return;
		}
		/*
		 * For normal reads just pick one child.
		 */
		c = vdev_mirror_child_select(zio);
		children = (c >= 0);
	} else {
		ASSERT(zio->io_type == ZIO_TYPE_WRITE);

		/*
		 * Writes go to all children.
		 */
		c = 0;
		children = mm->mm_children;
	}

	while (children--) {
		mc = &mm->mm_child[c];
		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
		    mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
		    zio->io_type, zio->io_priority, 0,
		    vdev_mirror_child_done, mc));
		c++;
	}

	zio_execute(zio);
}
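
/*
 * Return the worst error seen by any child, keeping errors on speculative
 * children (those skipped because of a DTL hit) separate, so that a real
 * error on a child we actually tried outranks an anticipated ESTALE.
 */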
static int
vdev_mirror_worst_error(mirror_map_t *mm)
{
	int error[2] = { 0, 0 };

	for (int c = 0; c < mm->mm_children; c++) {
		mirror_child_t *mc = &mm->mm_child[c];
		int s = mc->mc_speculative;
		error[s] = zio_worst_error(error[s], mc->mc_error);
	}

	return (error[0] ? error[0] : error[1]);
}

static void
vdev_mirror_io_done(zio_t *zio)
{
	mirror_map_t *mm = zio->io_vsd;
	mirror_child_t *mc;
	int c;
	int good_copies = 0;
	int unexpected_errors = 0;

	for (c = 0; c < mm->mm_children; c++) {
		mc = &mm->mm_child[c];

		if (mc->mc_error) {
			if (!mc->mc_skipped)
				unexpected_errors++;
		} else if (mc->mc_tried) {
			good_copies++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as success.
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		if (good_copies != mm->mm_children) {
			/*
			 * Always require at least one good copy.
			 *
			 * For ditto blocks (io_vd == NULL), require
			 * all copies to be good.
			 *
			 * XXX -- for replacing vdevs, there's no great answer.
			 * If the old device is really dead, we may not even
			 * be able to access it -- so we only want to
			 * require good writes to the new device.  But if
			 * the new device turns out to be flaky, we want
			 * to be able to detach it -- which requires all
			 * writes to the old device to have succeeded.
			 */
			if (good_copies == 0 || zio->io_vd == NULL)
				zio->io_error = vdev_mirror_worst_error(mm);
		}
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * If we don't have a good copy yet, keep trying other children.
	 */
	if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
		ASSERT(c >= 0 && c < mm->mm_children);
		mc = &mm->mm_child[c];
		zio_vdev_io_redone(zio);
		zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
		    mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size,
		    ZIO_TYPE_READ, zio->io_priority, 0,
		    vdev_mirror_child_done, mc));
		return;
	}

	if (good_copies == 0) {
		zio->io_error = vdev_mirror_worst_error(mm);
		ASSERT(zio->io_error != 0);
	}

	if (good_copies && spa_writeable(zio->io_spa) &&
	    (unexpected_errors ||
	    (zio->io_flags & ZIO_FLAG_RESILVER) ||
	    ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < mm->mm_children; c++) {
			/*
			 * Don't rewrite known good children.
			 * Not only is it unnecessary, it could
			 * actually be harmful: if the system lost
			 * power while rewriting the only good copy,
			 * there would be no good copies left!
			 */
			mc = &mm->mm_child[c];

			if (mc->mc_error == 0) {
				if (mc->mc_tried)
					continue;
				if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
				    !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
				    zio->io_txg, 1))
					continue;
				mc->mc_error = SET_ERROR(ESTALE);
			}

			zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
			    mc->mc_vd, mc->mc_offset,
			    zio->io_abd, zio->io_size,
			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}

static void
vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted == vd->vdev_children)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

vdev_ops_t vdev_mirror_ops = {
	vdev_mirror_open,
	vdev_mirror_close,
	vdev_default_asize,
	vdev_mirror_io_start,
	vdev_mirror_io_done,
	vdev_mirror_state_change,
	NULL,
	NULL,
	NULL,
	NULL,
	VDEV_TYPE_MIRROR,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};

vdev_ops_t vdev_replacing_ops = {
	vdev_mirror_open,
	vdev_mirror_close,
	vdev_default_asize,
	vdev_mirror_io_start,
	vdev_mirror_io_done,
	vdev_mirror_state_change,
	NULL,
	NULL,
	NULL,
	NULL,
	VDEV_TYPE_REPLACING,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};

vdev_ops_t vdev_spare_ops = {
	vdev_mirror_open,
	vdev_mirror_close,
	vdev_default_asize,
	vdev_mirror_io_start,
	vdev_mirror_io_done,
	vdev_mirror_state_change,
	NULL,
	NULL,
	NULL,
	NULL,
	VDEV_TYPE_SPARE,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};

#if defined(_KERNEL) && defined(HAVE_SPL)
module_param(zfs_vdev_mirror_rotating_inc, int, 0644);
MODULE_PARM_DESC(zfs_vdev_mirror_rotating_inc,
	"Rotating media load increment for non-seeking I/O's");

module_param(zfs_vdev_mirror_rotating_seek_inc, int, 0644);
MODULE_PARM_DESC(zfs_vdev_mirror_rotating_seek_inc,
	"Rotating media load increment for seeking I/O's");

module_param(zfs_vdev_mirror_rotating_seek_offset, int, 0644);
MODULE_PARM_DESC(zfs_vdev_mirror_rotating_seek_offset,
	"Offset in bytes from the last I/O which "
	"triggers a reduced rotating media seek increment");

module_param(zfs_vdev_mirror_non_rotating_inc, int, 0644);
MODULE_PARM_DESC(zfs_vdev_mirror_non_rotating_inc,
	"Non-rotating media load increment for non-seeking I/O's");

module_param(zfs_vdev_mirror_non_rotating_seek_inc, int, 0644);
MODULE_PARM_DESC(zfs_vdev_mirror_non_rotating_seek_inc,
	"Non-rotating media load increment for seeking I/O's");