/*
 * module/zfs/vdev_mirror.c
 * (recovered from a git-blame export; blame annotations stripped)
 */
34dc7c2f
BB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
428870ff 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
34dc7c2f
BB
23 * Use is subject to license terms.
24 */
25
1bd201e7
CS
26/*
27 * Copyright (c) 2012 by Delphix. All rights reserved.
28 */
29
34dc7c2f
BB
30#include <sys/zfs_context.h>
31#include <sys/spa.h>
32#include <sys/vdev_impl.h>
33#include <sys/zio.h>
34#include <sys/fs/zfs.h>
35
36/*
37 * Virtual device vector for mirroring.
38 */
39
40typedef struct mirror_child {
41 vdev_t *mc_vd;
42 uint64_t mc_offset;
43 int mc_error;
b128c09f
BB
44 uint8_t mc_tried;
45 uint8_t mc_skipped;
46 uint8_t mc_speculative;
34dc7c2f
BB
47} mirror_child_t;
48
49typedef struct mirror_map {
50 int mm_children;
51 int mm_replacing;
52 int mm_preferred;
53 int mm_root;
54 mirror_child_t mm_child[1];
55} mirror_map_t;
56
57int vdev_mirror_shift = 21;
58
b128c09f
BB
59static void
60vdev_mirror_map_free(zio_t *zio)
61{
62 mirror_map_t *mm = zio->io_vsd;
63
64 kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children]));
65}
66
428870ff
BB
67static const zio_vsd_ops_t vdev_mirror_vsd_ops = {
68 vdev_mirror_map_free,
69 zio_vsd_default_cksum_report
70};
71
34dc7c2f
BB
72static mirror_map_t *
73vdev_mirror_map_alloc(zio_t *zio)
74{
75 mirror_map_t *mm = NULL;
76 mirror_child_t *mc;
77 vdev_t *vd = zio->io_vd;
78 int c, d;
79
80 if (vd == NULL) {
81 dva_t *dva = zio->io_bp->blk_dva;
82 spa_t *spa = zio->io_spa;
83
84 c = BP_GET_NDVAS(zio->io_bp);
85
b8d06fca 86 mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_PUSHPAGE);
34dc7c2f
BB
87 mm->mm_children = c;
88 mm->mm_replacing = B_FALSE;
89 mm->mm_preferred = spa_get_random(c);
90 mm->mm_root = B_TRUE;
91
92 /*
93 * Check the other, lower-index DVAs to see if they're on
94 * the same vdev as the child we picked. If they are, use
95 * them since they are likely to have been allocated from
96 * the primary metaslab in use at the time, and hence are
97 * more likely to have locality with single-copy data.
98 */
99 for (c = mm->mm_preferred, d = c - 1; d >= 0; d--) {
100 if (DVA_GET_VDEV(&dva[d]) == DVA_GET_VDEV(&dva[c]))
101 mm->mm_preferred = d;
102 }
103
104 for (c = 0; c < mm->mm_children; c++) {
105 mc = &mm->mm_child[c];
106
107 mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c]));
108 mc->mc_offset = DVA_GET_OFFSET(&dva[c]);
109 }
110 } else {
111 c = vd->vdev_children;
112
b8d06fca 113 mm = kmem_zalloc(offsetof(mirror_map_t, mm_child[c]), KM_PUSHPAGE);
34dc7c2f
BB
114 mm->mm_children = c;
115 mm->mm_replacing = (vd->vdev_ops == &vdev_replacing_ops ||
116 vd->vdev_ops == &vdev_spare_ops);
117 mm->mm_preferred = mm->mm_replacing ? 0 :
118 (zio->io_offset >> vdev_mirror_shift) % c;
119 mm->mm_root = B_FALSE;
120
121 for (c = 0; c < mm->mm_children; c++) {
122 mc = &mm->mm_child[c];
123 mc->mc_vd = vd->vdev_child[c];
124 mc->mc_offset = zio->io_offset;
125 }
126 }
127
128 zio->io_vsd = mm;
428870ff 129 zio->io_vsd_ops = &vdev_mirror_vsd_ops;
34dc7c2f
BB
130 return (mm);
131}
132
34dc7c2f 133static int
1bd201e7
CS
134vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
135 uint64_t *ashift)
34dc7c2f 136{
34dc7c2f 137 int numerrors = 0;
45d1cae3 138 int lasterror = 0;
d6320ddb 139 int c;
34dc7c2f
BB
140
141 if (vd->vdev_children == 0) {
142 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
143 return (EINVAL);
144 }
145
45d1cae3 146 vdev_open_children(vd);
34dc7c2f 147
d6320ddb 148 for (c = 0; c < vd->vdev_children; c++) {
45d1cae3
BB
149 vdev_t *cvd = vd->vdev_child[c];
150
151 if (cvd->vdev_open_error) {
152 lasterror = cvd->vdev_open_error;
34dc7c2f
BB
153 numerrors++;
154 continue;
155 }
156
157 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1bd201e7 158 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
34dc7c2f
BB
159 *ashift = MAX(*ashift, cvd->vdev_ashift);
160 }
161
162 if (numerrors == vd->vdev_children) {
163 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
164 return (lasterror);
165 }
166
167 return (0);
168}
169
170static void
171vdev_mirror_close(vdev_t *vd)
172{
d6320ddb
BB
173 int c;
174
175 for (c = 0; c < vd->vdev_children; c++)
34dc7c2f
BB
176 vdev_close(vd->vdev_child[c]);
177}
178
179static void
180vdev_mirror_child_done(zio_t *zio)
181{
182 mirror_child_t *mc = zio->io_private;
183
184 mc->mc_error = zio->io_error;
185 mc->mc_tried = 1;
186 mc->mc_skipped = 0;
187}
188
189static void
190vdev_mirror_scrub_done(zio_t *zio)
191{
192 mirror_child_t *mc = zio->io_private;
193
194 if (zio->io_error == 0) {
d164b209
BB
195 zio_t *pio;
196
197 mutex_enter(&zio->io_lock);
198 while ((pio = zio_walk_parents(zio)) != NULL) {
199 mutex_enter(&pio->io_lock);
200 ASSERT3U(zio->io_size, >=, pio->io_size);
201 bcopy(zio->io_data, pio->io_data, pio->io_size);
202 mutex_exit(&pio->io_lock);
203 }
204 mutex_exit(&zio->io_lock);
34dc7c2f
BB
205 }
206
207 zio_buf_free(zio->io_data, zio->io_size);
208
209 mc->mc_error = zio->io_error;
210 mc->mc_tried = 1;
211 mc->mc_skipped = 0;
212}
213
34dc7c2f
BB
214/*
215 * Try to find a child whose DTL doesn't contain the block we want to read.
216 * If we can't, try the read on any vdev we haven't already tried.
217 */
218static int
219vdev_mirror_child_select(zio_t *zio)
220{
221 mirror_map_t *mm = zio->io_vsd;
222 mirror_child_t *mc;
223 uint64_t txg = zio->io_txg;
224 int i, c;
225
428870ff 226 ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg);
34dc7c2f
BB
227
228 /*
229 * Try to find a child whose DTL doesn't contain the block to read.
230 * If a child is known to be completely inaccessible (indicated by
231 * vdev_readable() returning B_FALSE), don't even try.
232 */
233 for (i = 0, c = mm->mm_preferred; i < mm->mm_children; i++, c++) {
234 if (c >= mm->mm_children)
235 c = 0;
236 mc = &mm->mm_child[c];
237 if (mc->mc_tried || mc->mc_skipped)
238 continue;
b128c09f 239 if (!vdev_readable(mc->mc_vd)) {
34dc7c2f
BB
240 mc->mc_error = ENXIO;
241 mc->mc_tried = 1; /* don't even try */
242 mc->mc_skipped = 1;
243 continue;
244 }
fb5f0bc8 245 if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1))
34dc7c2f
BB
246 return (c);
247 mc->mc_error = ESTALE;
248 mc->mc_skipped = 1;
b128c09f 249 mc->mc_speculative = 1;
34dc7c2f
BB
250 }
251
252 /*
253 * Every device is either missing or has this txg in its DTL.
254 * Look for any child we haven't already tried before giving up.
255 */
256 for (c = 0; c < mm->mm_children; c++)
257 if (!mm->mm_child[c].mc_tried)
258 return (c);
259
260 /*
261 * Every child failed. There's no place left to look.
262 */
263 return (-1);
264}
265
266static int
267vdev_mirror_io_start(zio_t *zio)
268{
269 mirror_map_t *mm;
270 mirror_child_t *mc;
271 int c, children;
272
273 mm = vdev_mirror_map_alloc(zio);
274
275 if (zio->io_type == ZIO_TYPE_READ) {
276 if ((zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_replacing) {
277 /*
278 * For scrubbing reads we need to allocate a read
279 * buffer for each child and issue reads to all
280 * children. If any child succeeds, it will copy its
281 * data into zio->io_data in vdev_mirror_scrub_done.
282 */
283 for (c = 0; c < mm->mm_children; c++) {
284 mc = &mm->mm_child[c];
285 zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
286 mc->mc_vd, mc->mc_offset,
287 zio_buf_alloc(zio->io_size), zio->io_size,
b128c09f 288 zio->io_type, zio->io_priority, 0,
34dc7c2f
BB
289 vdev_mirror_scrub_done, mc));
290 }
b128c09f 291 return (ZIO_PIPELINE_CONTINUE);
34dc7c2f
BB
292 }
293 /*
294 * For normal reads just pick one child.
295 */
296 c = vdev_mirror_child_select(zio);
297 children = (c >= 0);
298 } else {
299 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
300
301 /*
fb5f0bc8 302 * Writes go to all children.
34dc7c2f 303 */
fb5f0bc8
BB
304 c = 0;
305 children = mm->mm_children;
34dc7c2f
BB
306 }
307
308 while (children--) {
309 mc = &mm->mm_child[c];
310 zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
b128c09f
BB
311 mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size,
312 zio->io_type, zio->io_priority, 0,
313 vdev_mirror_child_done, mc));
34dc7c2f
BB
314 c++;
315 }
316
b128c09f 317 return (ZIO_PIPELINE_CONTINUE);
34dc7c2f
BB
318}
319
320static int
b128c09f
BB
321vdev_mirror_worst_error(mirror_map_t *mm)
322{
d6320ddb 323 int c, error[2] = { 0, 0 };
b128c09f 324
d6320ddb 325 for (c = 0; c < mm->mm_children; c++) {
b128c09f
BB
326 mirror_child_t *mc = &mm->mm_child[c];
327 int s = mc->mc_speculative;
328 error[s] = zio_worst_error(error[s], mc->mc_error);
329 }
330
331 return (error[0] ? error[0] : error[1]);
332}
333
334static void
34dc7c2f
BB
335vdev_mirror_io_done(zio_t *zio)
336{
337 mirror_map_t *mm = zio->io_vsd;
338 mirror_child_t *mc;
339 int c;
340 int good_copies = 0;
341 int unexpected_errors = 0;
342
34dc7c2f
BB
343 for (c = 0; c < mm->mm_children; c++) {
344 mc = &mm->mm_child[c];
345
34dc7c2f 346 if (mc->mc_error) {
34dc7c2f
BB
347 if (!mc->mc_skipped)
348 unexpected_errors++;
b128c09f
BB
349 } else if (mc->mc_tried) {
350 good_copies++;
34dc7c2f
BB
351 }
352 }
353
354 if (zio->io_type == ZIO_TYPE_WRITE) {
355 /*
356 * XXX -- for now, treat partial writes as success.
b128c09f
BB
357 *
358 * Now that we support write reallocation, it would be better
359 * to treat partial failure as real failure unless there are
360 * no non-degraded top-level vdevs left, and not update DTLs
361 * if we intend to reallocate.
34dc7c2f
BB
362 */
363 /* XXPOLICY */
b128c09f
BB
364 if (good_copies != mm->mm_children) {
365 /*
366 * Always require at least one good copy.
367 *
368 * For ditto blocks (io_vd == NULL), require
369 * all copies to be good.
370 *
371 * XXX -- for replacing vdevs, there's no great answer.
372 * If the old device is really dead, we may not even
373 * be able to access it -- so we only want to
374 * require good writes to the new device. But if
375 * the new device turns out to be flaky, we want
376 * to be able to detach it -- which requires all
377 * writes to the old device to have succeeded.
378 */
379 if (good_copies == 0 || zio->io_vd == NULL)
380 zio->io_error = vdev_mirror_worst_error(mm);
381 }
382 return;
34dc7c2f
BB
383 }
384
385 ASSERT(zio->io_type == ZIO_TYPE_READ);
386
387 /*
388 * If we don't have a good copy yet, keep trying other children.
389 */
390 /* XXPOLICY */
391 if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) {
392 ASSERT(c >= 0 && c < mm->mm_children);
393 mc = &mm->mm_child[c];
34dc7c2f
BB
394 zio_vdev_io_redone(zio);
395 zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
396 mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size,
b128c09f 397 ZIO_TYPE_READ, zio->io_priority, 0,
34dc7c2f 398 vdev_mirror_child_done, mc));
b128c09f 399 return;
34dc7c2f
BB
400 }
401
402 /* XXPOLICY */
b128c09f
BB
403 if (good_copies == 0) {
404 zio->io_error = vdev_mirror_worst_error(mm);
34dc7c2f 405 ASSERT(zio->io_error != 0);
b128c09f 406 }
34dc7c2f 407
fb5f0bc8 408 if (good_copies && spa_writeable(zio->io_spa) &&
34dc7c2f
BB
409 (unexpected_errors ||
410 (zio->io_flags & ZIO_FLAG_RESILVER) ||
411 ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) {
34dc7c2f
BB
412 /*
413 * Use the good data we have in hand to repair damaged children.
34dc7c2f 414 */
34dc7c2f
BB
415 for (c = 0; c < mm->mm_children; c++) {
416 /*
417 * Don't rewrite known good children.
418 * Not only is it unnecessary, it could
419 * actually be harmful: if the system lost
420 * power while rewriting the only good copy,
421 * there would be no good copies left!
422 */
423 mc = &mm->mm_child[c];
424
425 if (mc->mc_error == 0) {
426 if (mc->mc_tried)
427 continue;
428 if (!(zio->io_flags & ZIO_FLAG_SCRUB) &&
fb5f0bc8 429 !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL,
34dc7c2f
BB
430 zio->io_txg, 1))
431 continue;
432 mc->mc_error = ESTALE;
433 }
434
b128c09f
BB
435 zio_nowait(zio_vdev_child_io(zio, zio->io_bp,
436 mc->mc_vd, mc->mc_offset,
437 zio->io_data, zio->io_size,
34dc7c2f 438 ZIO_TYPE_WRITE, zio->io_priority,
fb5f0bc8
BB
439 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
440 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
34dc7c2f 441 }
34dc7c2f 442 }
34dc7c2f
BB
443}
444
445static void
446vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded)
447{
448 if (faulted == vd->vdev_children)
449 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
450 VDEV_AUX_NO_REPLICAS);
451 else if (degraded + faulted != 0)
452 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
453 else
454 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
455}
456
457vdev_ops_t vdev_mirror_ops = {
458 vdev_mirror_open,
459 vdev_mirror_close,
34dc7c2f
BB
460 vdev_default_asize,
461 vdev_mirror_io_start,
462 vdev_mirror_io_done,
463 vdev_mirror_state_change,
428870ff
BB
464 NULL,
465 NULL,
34dc7c2f
BB
466 VDEV_TYPE_MIRROR, /* name of this vdev type */
467 B_FALSE /* not a leaf vdev */
468};
469
470vdev_ops_t vdev_replacing_ops = {
471 vdev_mirror_open,
472 vdev_mirror_close,
34dc7c2f
BB
473 vdev_default_asize,
474 vdev_mirror_io_start,
475 vdev_mirror_io_done,
476 vdev_mirror_state_change,
428870ff
BB
477 NULL,
478 NULL,
34dc7c2f
BB
479 VDEV_TYPE_REPLACING, /* name of this vdev type */
480 B_FALSE /* not a leaf vdev */
481};
482
483vdev_ops_t vdev_spare_ops = {
484 vdev_mirror_open,
485 vdev_mirror_close,
34dc7c2f
BB
486 vdev_default_asize,
487 vdev_mirror_io_start,
488 vdev_mirror_io_done,
489 vdev_mirror_state_change,
428870ff
BB
490 NULL,
491 NULL,
34dc7c2f
BB
492 VDEV_TYPE_SPARE, /* name of this vdev type */
493 B_FALSE /* not a leaf vdev */
494};