]> git.proxmox.com Git - mirror_zfs.git/blame - zfs/lib/libzpool/vdev_raidz.c
Remove stray stub kernel files which should be brought in my linux-kernel-module...
[mirror_zfs.git] / zfs / lib / libzpool / vdev_raidz.c
CommitLineData
34dc7c2f
BB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27#pragma ident "@(#)vdev_raidz.c 1.10 07/11/27 SMI"
28
29#include <sys/zfs_context.h>
30#include <sys/spa.h>
31#include <sys/vdev_impl.h>
32#include <sys/zio.h>
33#include <sys/zio_checksum.h>
34#include <sys/fs/zfs.h>
35#include <sys/fm/fs/zfs.h>
36
37/*
38 * Virtual device vector for RAID-Z.
39 *
40 * This vdev supports both single and double parity. For single parity, we
41 * use a simple XOR of all the data columns. For double parity, we use both
42 * the simple XOR as well as a technique described in "The mathematics of
43 * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
44 * over the integers expressible in a single byte. Briefly, the operations on
45 * the field are defined as follows:
46 *
47 * o addition (+) is represented by a bitwise XOR
48 * o subtraction (-) is therefore identical to addition: A + B = A - B
49 * o multiplication of A by 2 is defined by the following bitwise expression:
50 * (A * 2)_7 = A_6
51 * (A * 2)_6 = A_5
52 * (A * 2)_5 = A_4
53 * (A * 2)_4 = A_3 + A_7
54 * (A * 2)_3 = A_2 + A_7
55 * (A * 2)_2 = A_1 + A_7
56 * (A * 2)_1 = A_0
57 * (A * 2)_0 = A_7
58 *
59 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
60 *
61 * Observe that any number in the field (except for 0) can be expressed as a
62 * power of 2 -- a generator for the field. We store a table of the powers of
63 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
64 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
65 * than field addition). The inverse of a field element A (A^-1) is A^254.
66 *
67 * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
68 * can be expressed by field operations:
69 *
70 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
71 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
72 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
73 *
74 * See the reconstruction code below for how P and Q can be used individually or
75 * in concert to recover missing data columns.
76 */
77
/*
 * Per-column state for a single RAID-Z I/O: which child device the column
 * maps to, where on that child it lives, the buffer backing it, and the
 * outcome of the child I/O issued for it.
 */
78typedef struct raidz_col {
79 uint64_t rc_devidx; /* child device index for I/O */
80 uint64_t rc_offset; /* device offset */
81 uint64_t rc_size; /* I/O size, in bytes (also the size of rc_data) */
82 void *rc_data; /* I/O data */
83 int rc_error; /* I/O error for this device (0 if none) */
84 uint8_t rc_tried; /* Did we attempt this I/O column? */
85 uint8_t rc_skipped; /* Did we skip this I/O column? */
86} raidz_col_t;
87
/*
 * Map of one logical I/O onto RAID-Z columns. The parity columns come first
 * (indices 0 .. rm_firstdatacol - 1), followed by the data columns. The
 * structure is allocated with room for rm_cols entries in rm_col[], sized as
 * offsetof(raidz_map_t, rm_col[rm_cols]).
 */
88typedef struct raidz_map {
89 uint64_t rm_cols; /* Column count */
90 uint64_t rm_bigcols; /* Number of oversized columns */
91 uint64_t rm_asize; /* Actual total I/O size */
92 uint64_t rm_missingdata; /* Count of missing data devices */
93 uint64_t rm_missingparity; /* Count of missing parity devices */
94 uint64_t rm_firstdatacol; /* First data column/parity count */
95 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
96} raidz_map_t;
97
/* Indices of the P and Q parity columns within raidz_map_t.rm_col[]. */
98#define VDEV_RAIDZ_P 0
99#define VDEV_RAIDZ_Q 1
100
/* Largest parity count accepted by vdev_raidz_open(). */
101#define VDEV_RAIDZ_MAXPARITY 2
102
/*
 * Multiply a field element by 2: shift left and, if the high bit was set,
 * XOR in 0x1d. The argument is evaluated twice and the result is not masked
 * to 8 bits, so the caller is responsible for truncating it to a byte.
 */
103#define VDEV_RAIDZ_MUL_2(a) (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0))
104
105/*
106 * These two tables represent powers and logs of 2 in the Galois field defined
107 * above. These values were computed by repeatedly multiplying by 2 as above.
108 */
/*
 * vdev_raidz_pow2[i] is 2 raised to the power i in the field. The final entry
 * (index 255) repeats the value at index 0 (0x01), so an exponent of exactly
 * 255 can be looked up without first being reduced.
 */
109static const uint8_t vdev_raidz_pow2[256] = {
110 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
111 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
112 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
113 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
114 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
115 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
116 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
117 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
118 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
119 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
120 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
121 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
122 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
123 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
124 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
125 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
126 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
127 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
128 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
129 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
130 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
131 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
132 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
133 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
134 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
135 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
136 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
137 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
138 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
139 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
140 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
141 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
142};
/*
 * vdev_raidz_log2[a] is the exponent e such that 2^e == a in the field, i.e.
 * the inverse of vdev_raidz_pow2[]. Zero has no logarithm; its slot holds
 * 0x00 as a placeholder, and vdev_raidz_exp2() returns before consulting the
 * table when its operand is zero.
 */
143static const uint8_t vdev_raidz_log2[256] = {
144 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
145 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
146 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
147 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
148 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
149 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
150 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
151 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
152 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
153 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
154 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
155 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
156 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
157 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
158 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
159 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
160 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
161 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
162 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
163 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
164 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
165 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
166 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
167 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
168 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
169 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
170 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
171 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
172 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
173 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
174 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
175 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
176};
177
178/*
179 * Multiply a given number by 2 raised to the given power.
180 */
181static uint8_t
182vdev_raidz_exp2(uint_t a, int exp)
183{
184 if (a == 0)
185 return (0);
186
187 ASSERT(exp >= 0);
188 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
189
190 exp += vdev_raidz_log2[a];
191 if (exp > 255)
192 exp -= 255;
193
194 return (vdev_raidz_pow2[exp]);
195}
196
/*
 * Build the column map for a RAID-Z I/O.
 *
 * zio        - the I/O being mapped; io_offset, io_size and io_data are read,
 *              and io_vsd is set to the new map.
 * unit_shift - log2 of the allocation unit; offsets and sizes are converted
 *              to units by shifting right by this amount.
 * dcols      - total number of columns (children) in the vdev.
 * nparity    - number of parity columns.
 *
 * Returns the newly allocated map. Parity columns get their own buffers from
 * zio_buf_alloc(); data columns point into consecutive regions of
 * zio->io_data. The map is released by vdev_raidz_map_free().
 */
197static raidz_map_t *
198vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
199 uint64_t nparity)
200{
201 raidz_map_t *rm;
 /* b: starting offset of the I/O, in units. */
202 uint64_t b = zio->io_offset >> unit_shift;
 /* s: size of the I/O, in units. */
203 uint64_t s = zio->io_size >> unit_shift;
 /* f: index of the child that holds the first column. */
204 uint64_t f = b % dcols;
 /* o: byte offset on each child of the row where the I/O starts. */
205 uint64_t o = (b / dcols) << unit_shift;
206 uint64_t q, r, c, bc, col, acols, coff, devidx;
207
 /*
  * q: number of units every column receives (full rows of data).
  * r: data units left over after the full rows.
  * bc: number of leading columns that carry one extra unit -- the r
  *     leftover data units plus their nparity parity units.
  */
208 q = s / (dcols - nparity);
209 r = s - q * (dcols - nparity);
210 bc = (r == 0 ? 0 : r + nparity);
211
 /* An I/O smaller than one full row touches only bc columns. */
212 acols = (q == 0 ? bc : dcols);
213
214 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);
215
216 rm->rm_cols = acols;
217 rm->rm_bigcols = bc;
218 rm->rm_asize = 0;
219 rm->rm_missingdata = 0;
220 rm->rm_missingparity = 0;
221 rm->rm_firstdatacol = nparity;
222
223 for (c = 0; c < acols; c++) {
224 col = f + c;
225 coff = o;
 /* Past the last child: wrap to child 0 on the next row. */
226 if (col >= dcols) {
227 col -= dcols;
228 coff += 1ULL << unit_shift;
229 }
230 rm->rm_col[c].rc_devidx = col;
231 rm->rm_col[c].rc_offset = coff;
232 rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
233 rm->rm_col[c].rc_data = NULL;
234 rm->rm_col[c].rc_error = 0;
235 rm->rm_col[c].rc_tried = 0;
236 rm->rm_col[c].rc_skipped = 0;
237 rm->rm_asize += rm->rm_col[c].rc_size;
238 }
239
 /* Round the total up to a multiple of (nparity + 1) units. */
240 rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
241
 /* Parity columns get private buffers. */
242 for (c = 0; c < rm->rm_firstdatacol; c++)
243 rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
244
 /*
  * c now equals rm_firstdatacol: the first data column starts at the
  * beginning of the caller's buffer and each subsequent data column
  * follows immediately after the previous one.
  */
245 rm->rm_col[c].rc_data = zio->io_data;
246
247 for (c = c + 1; c < acols; c++)
248 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
249 rm->rm_col[c - 1].rc_size;
250
251 /*
252 * If all data stored spans all columns, there's a danger that parity
253 * will always be on the same device and, since parity isn't read
254 * during normal operation, that that device's I/O bandwidth won't be
255 * used effectively. We therefore switch the parity every 1MB.
256 *
257 * ... at least that was, ostensibly, the theory. As a practical
258 * matter unless we juggle the parity between all devices evenly, we
259 * won't see any benefit. Further, occasional writes that aren't a
260 * multiple of the LCM of the number of children and the minimum
261 * stripe width are sufficient to avoid pessimal behavior.
262 * Unfortunately, this decision created an implicit on-disk format
263 * requirement that we need to support for all eternity, but only
264 * for single-parity RAID-Z.
265 */
266 ASSERT(rm->rm_cols >= 2);
267 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
268
 /*
  * Single parity only: when bit 20 of the I/O offset is set, swap the
  * device index and offset of the parity column with those of the
  * first data column. Buffers and sizes stay where they are.
  */
269 if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
270 devidx = rm->rm_col[0].rc_devidx;
271 o = rm->rm_col[0].rc_offset;
272 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
273 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
274 rm->rm_col[1].rc_devidx = devidx;
275 rm->rm_col[1].rc_offset = o;
276 }
277
278 zio->io_vsd = rm;
279 return (rm);
280}
281
282static void
283vdev_raidz_map_free(zio_t *zio)
284{
285 raidz_map_t *rm = zio->io_vsd;
286 int c;
287
288 for (c = 0; c < rm->rm_firstdatacol; c++)
289 zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
290
291 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
292 zio->io_vsd = NULL;
293}
294
295static void
296vdev_raidz_generate_parity_p(raidz_map_t *rm)
297{
298 uint64_t *p, *src, pcount, ccount, i;
299 int c;
300
301 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
302
303 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
304 src = rm->rm_col[c].rc_data;
305 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
306 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
307
308 if (c == rm->rm_firstdatacol) {
309 ASSERT(ccount == pcount);
310 for (i = 0; i < ccount; i++, p++, src++) {
311 *p = *src;
312 }
313 } else {
314 ASSERT(ccount <= pcount);
315 for (i = 0; i < ccount; i++, p++, src++) {
316 *p ^= *src;
317 }
318 }
319 }
320}
321
/*
 * Compute both P and Q parity for the map, 64 bits at a time.
 *
 * P is the XOR of the data columns. Q is built with the nested form given in
 * the comment at the top of the file: for each data column after the first,
 * the running Q is multiplied by 2 in the field and the column is then XORed
 * in. A first data column of size zero is accepted (P and Q are then seeded
 * with zeros); vdev_raidz_reconstruct_pq() relies on this by temporarily
 * setting column sizes to zero.
 */
322static void
323vdev_raidz_generate_parity_pq(raidz_map_t *rm)
324{
325 uint64_t *q, *p, *src, pcount, ccount, mask, i;
326 int c;
327
328 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
329 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
330 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
331
332 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
333 src = rm->rm_col[c].rc_data;
334 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
335 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
336 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
337
338 if (c == rm->rm_firstdatacol) {
 /* Seed P and Q from the first data column. */
339 ASSERT(ccount == pcount || ccount == 0);
340 for (i = 0; i < ccount; i++, p++, q++, src++) {
341 *q = *src;
342 *p = *src;
343 }
 /* Zero whatever the first column did not cover. */
344 for (; i < pcount; i++, p++, q++, src++) {
345 *q = 0;
346 *p = 0;
347 }
348 } else {
349 ASSERT(ccount <= pcount);
350
351 /*
352 * Rather than multiplying each byte individually (as
353 * described above), we are able to handle 8 at once
354 * by generating a mask based on the high bit in each
355 * byte and using that to conditionally XOR in 0x1d.
356 */
357 for (i = 0; i < ccount; i++, p++, q++, src++) {
 /*
  * Isolate each byte's high bit, then expand it to 0xff
  * for bytes that had it set and 0x00 for those that
  * did not.
  */
358 mask = *q & 0x8080808080808080ULL;
359 mask = (mask << 1) - (mask >> 7);
 /*
  * Shift every byte left by one (dropping bits carried
  * in from the neighbouring byte) and apply 0x1d where
  * the mask is set.
  */
360 *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
361 (mask & 0x1d1d1d1d1d1d1d1dULL);
362 *q ^= *src;
363 *p ^= *src;
364 }
365
366 /*
367 * Treat short columns as though they are full of 0s.
368 */
369 for (; i < pcount; i++, q++) {
370 mask = *q & 0x8080808080808080ULL;
371 mask = (mask << 1) - (mask >> 7);
372 *q = ((*q << 1) & 0xfefefefefefefefeULL) ^
373 (mask & 0x1d1d1d1d1d1d1d1dULL);
374 }
375 }
376 }
377}
378
379static void
380vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
381{
382 uint64_t *dst, *src, xcount, ccount, count, i;
383 int c;
384
385 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
386 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
387 ASSERT(xcount > 0);
388
389 src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
390 dst = rm->rm_col[x].rc_data;
391 for (i = 0; i < xcount; i++, dst++, src++) {
392 *dst = *src;
393 }
394
395 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
396 src = rm->rm_col[c].rc_data;
397 dst = rm->rm_col[x].rc_data;
398
399 if (c == x)
400 continue;
401
402 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
403 count = MIN(ccount, xcount);
404
405 for (i = 0; i < count; i++, dst++, src++) {
406 *dst ^= *src;
407 }
408 }
409}
410
/*
 * Rebuild data column x from Q parity alone.
 *
 * The first loop recomputes Q over the data columns into column x's buffer
 * while treating column x itself as all zeros (its word count is forced to
 * zero). XORing that with the real Q leaves column x's data multiplied by
 * 2^(rm_cols - 1 - x); the final loop undoes that factor byte by byte by
 * multiplying with 2^(255 - (rm_cols - 1 - x)).
 */
411static void
412vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
413{
414 uint64_t *dst, *src, xcount, ccount, count, mask, i;
415 uint8_t *b;
416 int c, j, exp;
417
418 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
419 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
420
421 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
422 src = rm->rm_col[c].rc_data;
423 dst = rm->rm_col[x].rc_data;
424
 /* The column being rebuilt contributes nothing (all zeros). */
425 if (c == x)
426 ccount = 0;
427 else
428 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
429
430 count = MIN(ccount, xcount);
431
432 if (c == rm->rm_firstdatacol) {
 /* Seed the accumulator from the first data column. */
433 for (i = 0; i < count; i++, dst++, src++) {
434 *dst = *src;
435 }
436 for (; i < xcount; i++, dst++) {
437 *dst = 0;
438 }
439
440 } else {
441 /*
442 * For an explanation of this, see the comment in
443 * vdev_raidz_generate_parity_pq() above.
444 */
445 for (i = 0; i < count; i++, dst++, src++) {
446 mask = *dst & 0x8080808080808080ULL;
447 mask = (mask << 1) - (mask >> 7);
448 *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
449 (mask & 0x1d1d1d1d1d1d1d1dULL);
450 *dst ^= *src;
451 }
452
 /* Beyond a short column: multiply by 2 only. */
453 for (; i < xcount; i++, dst++) {
454 mask = *dst & 0x8080808080808080ULL;
455 mask = (mask << 1) - (mask >> 7);
456 *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
457 (mask & 0x1d1d1d1d1d1d1d1dULL);
458 }
459 }
460 }
461
462 src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
463 dst = rm->rm_col[x].rc_data;
 /* Exponent that cancels the 2^(rm_cols - 1 - x) factor. */
464 exp = 255 - (rm->rm_cols - 1 - x);
465
466 for (i = 0; i < xcount; i++, dst++, src++) {
467 *dst ^= *src;
 /* Scale each of the 8 bytes in this word individually. */
468 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
469 *b = vdev_raidz_exp2(*b, exp);
470 }
471 }
472}
473
/*
 * Rebuild two data columns, x and y (x < y), using both P and Q parity.
 *
 * The real parity buffers are set aside and temporary ones are allocated in
 * their place so that vdev_raidz_generate_parity_pq() can compute Pxy and Qxy
 * -- the parity of the map with columns x and y treated as empty. The two
 * columns are then solved for one byte at a time. Before returning, the
 * temporary buffers are freed and the original parity pointers and column
 * sizes are restored.
 */
474static void
475vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
476{
477 uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
478 void *pdata, *qdata;
479 uint64_t xsize, ysize, i;
480
481 ASSERT(x < y);
482 ASSERT(x >= rm->rm_firstdatacol);
483 ASSERT(y < rm->rm_cols);
484
485 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
486
487 /*
488 * Move the parity data aside -- we're going to compute parity as
489 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
490 * reuse the parity generation mechanism without trashing the actual
491 * parity so we make those columns appear to be full of zeros by
492 * setting their lengths to zero.
493 */
494 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
495 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
496 xsize = rm->rm_col[x].rc_size;
497 ysize = rm->rm_col[y].rc_size;
498
499 rm->rm_col[VDEV_RAIDZ_P].rc_data =
500 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
501 rm->rm_col[VDEV_RAIDZ_Q].rc_data =
502 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
503 rm->rm_col[x].rc_size = 0;
504 rm->rm_col[y].rc_size = 0;
505
506 vdev_raidz_generate_parity_pq(rm);
507
 /* Put the real column sizes back now that Pxy/Qxy are computed. */
508 rm->rm_col[x].rc_size = xsize;
509 rm->rm_col[y].rc_size = ysize;
510
 /* p/q walk the real parity; pxy/qxy walk the temporary parity. */
511 p = pdata;
512 q = qdata;
513 pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
514 qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
515 xd = rm->rm_col[x].rc_data;
516 yd = rm->rm_col[y].rc_data;
517
518 /*
519 * We now have:
520 * Pxy = P + D_x + D_y
521 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
522 *
523 * We can then solve for D_x:
524 * D_x = A * (P + Pxy) + B * (Q + Qxy)
525 * where
526 * A = 2^(x - y) * (2^(x - y) + 1)^-1
527 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
528 *
529 * With D_x in hand, we can easily solve for D_y:
530 * D_y = P + Pxy + D_x
531 */
532
 /*
  * a and b are looked up with 255 added to a non-positive exponent so
  * that the table index is non-negative. tmp is 255 minus the log of
  * (a + 1); multiplying by 2^tmp applies the (2^(x - y) + 1)^-1 factor.
  * aexp and bexp are the logs of the resulting coefficients, in the
  * form vdev_raidz_exp2() takes.
  */
533 a = vdev_raidz_pow2[255 + x - y];
534 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
535 tmp = 255 - vdev_raidz_log2[a ^ 1];
536
537 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
538 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
539
 /* Column y may be shorter than x; only write it within ysize. */
540 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
541 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
542 vdev_raidz_exp2(*q ^ *qxy, bexp);
543
544 if (i < ysize)
545 *yd = *p ^ *pxy ^ *xd;
546 }
547
548 zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
549 rm->rm_col[VDEV_RAIDZ_P].rc_size);
550 zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
551 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
552
553 /*
554 * Restore the saved parity data.
555 */
556 rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
557 rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
558}
559
560
561static int
562vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
563{
564 vdev_t *cvd;
565 uint64_t nparity = vd->vdev_nparity;
566 int c, error;
567 int lasterror = 0;
568 int numerrors = 0;
569
570 ASSERT(nparity > 0);
571
572 if (nparity > VDEV_RAIDZ_MAXPARITY ||
573 vd->vdev_children < nparity + 1) {
574 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
575 return (EINVAL);
576 }
577
578 for (c = 0; c < vd->vdev_children; c++) {
579 cvd = vd->vdev_child[c];
580
581 if ((error = vdev_open(cvd)) != 0) {
582 lasterror = error;
583 numerrors++;
584 continue;
585 }
586
587 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
588 *ashift = MAX(*ashift, cvd->vdev_ashift);
589 }
590
591 *asize *= vd->vdev_children;
592
593 if (numerrors > nparity) {
594 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
595 return (lasterror);
596 }
597
598 return (0);
599}
600
601static void
602vdev_raidz_close(vdev_t *vd)
603{
604 int c;
605
606 for (c = 0; c < vd->vdev_children; c++)
607 vdev_close(vd->vdev_child[c]);
608}
609
610static uint64_t
611vdev_raidz_asize(vdev_t *vd, uint64_t psize)
612{
613 uint64_t asize;
614 uint64_t ashift = vd->vdev_top->vdev_ashift;
615 uint64_t cols = vd->vdev_children;
616 uint64_t nparity = vd->vdev_nparity;
617
618 asize = ((psize - 1) >> ashift) + 1;
619 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
620 asize = roundup(asize, nparity + 1) << ashift;
621
622 return (asize);
623}
624
625static void
626vdev_raidz_child_done(zio_t *zio)
627{
628 raidz_col_t *rc = zio->io_private;
629
630 rc->rc_error = zio->io_error;
631 rc->rc_tried = 1;
632 rc->rc_skipped = 0;
633}
634
635static void
636vdev_raidz_repair_done(zio_t *zio)
637{
638 ASSERT(zio->io_private == zio->io_parent);
639 vdev_raidz_map_free(zio->io_private);
640}
641
/*
 * Start a RAID-Z read or write.
 *
 * A column map is built for the zio. For a write, parity is generated and a
 * child I/O is issued for every column. For a read, child I/Os are issued
 * for the data columns; parity columns are read only if a data column is
 * already known to be missing or the zio is a scrub. Children that cannot be
 * read, or whose DTL covers the block's birth, are recorded in the map
 * instead of being read.
 *
 * Returns the result of zio_wait_for_children_done().
 */
642static int
643vdev_raidz_io_start(zio_t *zio)
644{
645 vdev_t *vd = zio->io_vd;
646 vdev_t *tvd = vd->vdev_top;
647 vdev_t *cvd;
648 blkptr_t *bp = zio->io_bp;
649 raidz_map_t *rm;
650 raidz_col_t *rc;
651 int c;
652
653 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
654 vd->vdev_nparity);
655
656 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
657
658 if (zio->io_type == ZIO_TYPE_WRITE) {
659 /*
660 * Generate RAID parity in the first virtual columns.
661 */
662 if (rm->rm_firstdatacol == 1)
663 vdev_raidz_generate_parity_p(rm);
664 else
665 vdev_raidz_generate_parity_pq(rm);
666
 /* Write every column, parity and data alike. */
667 for (c = 0; c < rm->rm_cols; c++) {
668 rc = &rm->rm_col[c];
669 cvd = vd->vdev_child[rc->rc_devidx];
670 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
671 rc->rc_offset, rc->rc_data, rc->rc_size,
672 zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
673 vdev_raidz_child_done, rc));
674 }
675
676 return (zio_wait_for_children_done(zio));
677 }
678
679 ASSERT(zio->io_type == ZIO_TYPE_READ);
680
681 /*
682 * Iterate over the columns in reverse order so that we hit the parity
683 * last -- any errors along the way will force us to read the parity
684 * data.
685 */
686 for (c = rm->rm_cols - 1; c >= 0; c--) {
687 rc = &rm->rm_col[c];
688 cvd = vd->vdev_child[rc->rc_devidx];
 /* Child cannot be read at all: count it missing, mark it tried. */
689 if (!vdev_readable(cvd)) {
690 if (c >= rm->rm_firstdatacol)
691 rm->rm_missingdata++;
692 else
693 rm->rm_missingparity++;
694 rc->rc_error = ENXIO;
695 rc->rc_tried = 1; /* don't even try */
696 rc->rc_skipped = 1;
697 continue;
698 }
 /*
  * The child's DTL covers this block's birth: count it missing
  * and skip it. rc_tried is left clear here.
  */
699 if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) {
700 if (c >= rm->rm_firstdatacol)
701 rm->rm_missingdata++;
702 else
703 rm->rm_missingparity++;
704 rc->rc_error = ESTALE;
705 rc->rc_skipped = 1;
706 continue;
707 }
 /*
  * Always read data columns; read parity columns only when some
  * data is missing or this is a scrub.
  */
708 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
709 (zio->io_flags & ZIO_FLAG_SCRUB)) {
710 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
711 rc->rc_offset, rc->rc_data, rc->rc_size,
712 zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
713 vdev_raidz_child_done, rc));
714 }
715 }
716
717 return (zio_wait_for_children_done(zio));
718}
719
720/*
721 * Report a checksum error for a child of a RAID-Z device.
722 */
723static void
724raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
725{
726 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
727 dprintf_bp(zio->io_bp, "imputed checksum error on %s: ",
728 vdev_description(vd));
729
730 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
731 mutex_enter(&vd->vdev_stat_lock);
732 vd->vdev_stat.vs_checksum_errors++;
733 mutex_exit(&vd->vdev_stat_lock);
734 }
735
736 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
737 zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
738 zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
739}
740
741/*
742 * Generate the parity from the data columns. If we tried and were able to
743 * read the parity without error, verify that the generated parity matches the
744 * data we read. If it doesn't, we fire off a checksum error. Return the
745 * number such failures.
746 */
747static int
748raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
749{
750 void *orig[VDEV_RAIDZ_MAXPARITY];
751 int c, ret = 0;
752 raidz_col_t *rc;
753
754 for (c = 0; c < rm->rm_firstdatacol; c++) {
755 rc = &rm->rm_col[c];
756 if (!rc->rc_tried || rc->rc_error != 0)
757 continue;
758 orig[c] = zio_buf_alloc(rc->rc_size);
759 bcopy(rc->rc_data, orig[c], rc->rc_size);
760 }
761
762 if (rm->rm_firstdatacol == 1)
763 vdev_raidz_generate_parity_p(rm);
764 else
765 vdev_raidz_generate_parity_pq(rm);
766
767 for (c = 0; c < rm->rm_firstdatacol; c++) {
768 rc = &rm->rm_col[c];
769 if (!rc->rc_tried || rc->rc_error != 0)
770 continue;
771 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
772 raidz_checksum_error(zio, rc);
773 rc->rc_error = ECKSUM;
774 ret++;
775 }
776 zio_buf_free(orig[c], rc->rc_size);
777 }
778
779 return (ret);
780}
781
/*
 * Counters of successful read repairs, incremented with atomic_inc_64() in
 * vdev_raidz_io_done() when data reconstructed from P, from Q, or from both
 * P and Q passes its checksum.
 */
782static uint64_t raidz_corrected_p;
783static uint64_t raidz_corrected_q;
784static uint64_t raidz_corrected_pq;
785
786static int
787vdev_raidz_io_done(zio_t *zio)
788{
789 vdev_t *vd = zio->io_vd;
790 vdev_t *cvd;
791 raidz_map_t *rm = zio->io_vsd;
792 raidz_col_t *rc, *rc1;
793 int unexpected_errors = 0;
794 int parity_errors = 0;
795 int parity_untried = 0;
796 int data_errors = 0;
797 int n, c, c1;
798
799 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
800
801 zio->io_error = 0;
802 zio->io_numerrors = 0;
803
804 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
805 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
806
807 for (c = 0; c < rm->rm_cols; c++) {
808 rc = &rm->rm_col[c];
809
810 /*
811 * We preserve any EIOs because those may be worth retrying;
812 * whereas ECKSUM and ENXIO are more likely to be persistent.
813 */
814 if (rc->rc_error) {
815 if (zio->io_error != EIO)
816 zio->io_error = rc->rc_error;
817
818 if (c < rm->rm_firstdatacol)
819 parity_errors++;
820 else
821 data_errors++;
822
823 if (!rc->rc_skipped)
824 unexpected_errors++;
825
826 zio->io_numerrors++;
827 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
828 parity_untried++;
829 }
830 }
831
832 if (zio->io_type == ZIO_TYPE_WRITE) {
833 /*
834 * If this is not a failfast write, and we were able to
835 * write enough columns to reconstruct the data, good enough.
836 */
837 /* XXPOLICY */
838 if (zio->io_numerrors <= rm->rm_firstdatacol &&
839 !(zio->io_flags & ZIO_FLAG_FAILFAST))
840 zio->io_error = 0;
841
842 vdev_raidz_map_free(zio);
843
844 return (ZIO_PIPELINE_CONTINUE);
845 }
846
847 ASSERT(zio->io_type == ZIO_TYPE_READ);
848 /*
849 * There are three potential phases for a read:
850 * 1. produce valid data from the columns read
851 * 2. read all disks and try again
852 * 3. perform combinatorial reconstruction
853 *
854 * Each phase is progressively both more expensive and less likely to
855 * occur. If we encounter more errors than we can repair or all phases
856 * fail, we have no choice but to return an error.
857 */
858
859 /*
860 * If the number of errors we saw was correctable -- less than or equal
861 * to the number of parity disks read -- attempt to produce data that
862 * has a valid checksum. Naturally, this case applies in the absence of
863 * any errors.
864 */
865 if (zio->io_numerrors <= rm->rm_firstdatacol - parity_untried) {
866 switch (data_errors) {
867 case 0:
868 if (zio_checksum_error(zio) == 0) {
869 zio->io_error = 0;
870
871 /*
872 * If we read parity information (unnecessarily
873 * as it happens since no reconstruction was
874 * needed) regenerate and verify the parity.
875 * We also regenerate parity when resilvering
876 * so we can write it out to the failed device
877 * later.
878 */
879 if (parity_errors + parity_untried <
880 rm->rm_firstdatacol ||
881 (zio->io_flags & ZIO_FLAG_RESILVER)) {
882 n = raidz_parity_verify(zio, rm);
883 unexpected_errors += n;
884 ASSERT(parity_errors + n <=
885 rm->rm_firstdatacol);
886 }
887 goto done;
888 }
889 break;
890
891 case 1:
892 /*
893 * We either attempt to read all the parity columns or
894 * none of them. If we didn't try to read parity, we
895 * wouldn't be here in the correctable case. There must
896 * also have been fewer parity errors than parity
897 * columns or, again, we wouldn't be in this code path.
898 */
899 ASSERT(parity_untried == 0);
900 ASSERT(parity_errors < rm->rm_firstdatacol);
901
902 /*
903 * Find the column that reported the error.
904 */
905 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
906 rc = &rm->rm_col[c];
907 if (rc->rc_error != 0)
908 break;
909 }
910 ASSERT(c != rm->rm_cols);
911 ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
912 rc->rc_error == ESTALE);
913
914 if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
915 vdev_raidz_reconstruct_p(rm, c);
916 } else {
917 ASSERT(rm->rm_firstdatacol > 1);
918 vdev_raidz_reconstruct_q(rm, c);
919 }
920
921 if (zio_checksum_error(zio) == 0) {
922 zio->io_error = 0;
923 if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
924 atomic_inc_64(&raidz_corrected_p);
925 else
926 atomic_inc_64(&raidz_corrected_q);
927
928 /*
929 * If there's more than one parity disk that
930 * was successfully read, confirm that the
931 * other parity disk produced the correct data.
932 * This routine is suboptimal in that it
933 * regenerates both the parity we wish to test
934 * as well as the parity we just used to
935 * perform the reconstruction, but this should
936 * be a relatively uncommon case, and can be
937 * optimized if it becomes a problem.
938 * We also regenerate parity when resilvering
939 * so we can write it out to the failed device
940 * later.
941 */
942 if (parity_errors < rm->rm_firstdatacol - 1 ||
943 (zio->io_flags & ZIO_FLAG_RESILVER)) {
944 n = raidz_parity_verify(zio, rm);
945 unexpected_errors += n;
946 ASSERT(parity_errors + n <=
947 rm->rm_firstdatacol);
948 }
949
950 goto done;
951 }
952 break;
953
954 case 2:
955 /*
956 * Two data column errors require double parity.
957 */
958 ASSERT(rm->rm_firstdatacol == 2);
959
960 /*
961 * Find the two columns that reported errors.
962 */
963 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
964 rc = &rm->rm_col[c];
965 if (rc->rc_error != 0)
966 break;
967 }
968 ASSERT(c != rm->rm_cols);
969 ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
970 rc->rc_error == ESTALE);
971
972 for (c1 = c++; c < rm->rm_cols; c++) {
973 rc = &rm->rm_col[c];
974 if (rc->rc_error != 0)
975 break;
976 }
977 ASSERT(c != rm->rm_cols);
978 ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
979 rc->rc_error == ESTALE);
980
981 vdev_raidz_reconstruct_pq(rm, c1, c);
982
983 if (zio_checksum_error(zio) == 0) {
984 zio->io_error = 0;
985 atomic_inc_64(&raidz_corrected_pq);
986
987 goto done;
988 }
989 break;
990
991 default:
992 ASSERT(rm->rm_firstdatacol <= 2);
993 ASSERT(0);
994 }
995 }
996
997 /*
998 * This isn't a typical situation -- either we got a read error or
999 * a child silently returned bad data. Read every block so we can
1000 * try again with as much data and parity as we can track down. If
1001 * we've already been through once before, all children will be marked
1002 * as tried so we'll proceed to combinatorial reconstruction.
1003 */
1004 unexpected_errors = 1;
1005 rm->rm_missingdata = 0;
1006 rm->rm_missingparity = 0;
1007
1008 for (c = 0; c < rm->rm_cols; c++) {
1009 if (rm->rm_col[c].rc_tried)
1010 continue;
1011
1012 zio->io_error = 0;
1013 zio_vdev_io_redone(zio);
1014 do {
1015 rc = &rm->rm_col[c];
1016 if (rc->rc_tried)
1017 continue;
1018 zio_nowait(zio_vdev_child_io(zio, NULL,
1019 vd->vdev_child[rc->rc_devidx],
1020 rc->rc_offset, rc->rc_data, rc->rc_size,
1021 zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL,
1022 vdev_raidz_child_done, rc));
1023 } while (++c < rm->rm_cols);
1024 dprintf("rereading\n");
1025
1026 return (zio_wait_for_children_done(zio));
1027 }
1028
1029 /*
1030 * At this point we've attempted to reconstruct the data given the
1031 * errors we detected, and we've attempted to read all columns. There
1032 * must, therefore, be one or more additional problems -- silent errors
1033 * resulting in invalid data rather than explicit I/O errors resulting
1034 * in absent data. Before we attempt combinatorial reconstruction make
1035 * sure we have a chance of coming up with the right answer.
1036 */
1037 if (zio->io_numerrors >= rm->rm_firstdatacol) {
1038 ASSERT(zio->io_error != 0);
1039 goto done;
1040 }
1041
1042 if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
1043 /*
1044 * Attempt to reconstruct the data from parity P.
1045 */
1046 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1047 void *orig;
1048 rc = &rm->rm_col[c];
1049
1050 orig = zio_buf_alloc(rc->rc_size);
1051 bcopy(rc->rc_data, orig, rc->rc_size);
1052 vdev_raidz_reconstruct_p(rm, c);
1053
1054 if (zio_checksum_error(zio) == 0) {
1055 zio_buf_free(orig, rc->rc_size);
1056 zio->io_error = 0;
1057 atomic_inc_64(&raidz_corrected_p);
1058
1059 /*
1060 * If this child didn't know that it returned
1061 * bad data, inform it.
1062 */
1063 if (rc->rc_tried && rc->rc_error == 0)
1064 raidz_checksum_error(zio, rc);
1065 rc->rc_error = ECKSUM;
1066 goto done;
1067 }
1068
1069 bcopy(orig, rc->rc_data, rc->rc_size);
1070 zio_buf_free(orig, rc->rc_size);
1071 }
1072 }
1073
1074 if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
1075 /*
1076 * Attempt to reconstruct the data from parity Q.
1077 */
1078 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1079 void *orig;
1080 rc = &rm->rm_col[c];
1081
1082 orig = zio_buf_alloc(rc->rc_size);
1083 bcopy(rc->rc_data, orig, rc->rc_size);
1084 vdev_raidz_reconstruct_q(rm, c);
1085
1086 if (zio_checksum_error(zio) == 0) {
1087 zio_buf_free(orig, rc->rc_size);
1088 zio->io_error = 0;
1089 atomic_inc_64(&raidz_corrected_q);
1090
1091 /*
1092 * If this child didn't know that it returned
1093 * bad data, inform it.
1094 */
1095 if (rc->rc_tried && rc->rc_error == 0)
1096 raidz_checksum_error(zio, rc);
1097 rc->rc_error = ECKSUM;
1098 goto done;
1099 }
1100
1101 bcopy(orig, rc->rc_data, rc->rc_size);
1102 zio_buf_free(orig, rc->rc_size);
1103 }
1104 }
1105
1106 if (rm->rm_firstdatacol > 1 &&
1107 rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
1108 rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
1109 /*
1110 * Attempt to reconstruct the data from both P and Q.
1111 */
1112 for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
1113 void *orig, *orig1;
1114 rc = &rm->rm_col[c];
1115
1116 orig = zio_buf_alloc(rc->rc_size);
1117 bcopy(rc->rc_data, orig, rc->rc_size);
1118
1119 for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
1120 rc1 = &rm->rm_col[c1];
1121
1122 orig1 = zio_buf_alloc(rc1->rc_size);
1123 bcopy(rc1->rc_data, orig1, rc1->rc_size);
1124
1125 vdev_raidz_reconstruct_pq(rm, c, c1);
1126
1127 if (zio_checksum_error(zio) == 0) {
1128 zio_buf_free(orig, rc->rc_size);
1129 zio_buf_free(orig1, rc1->rc_size);
1130 zio->io_error = 0;
1131 atomic_inc_64(&raidz_corrected_pq);
1132
1133 /*
1134 * If these children didn't know they
1135 * returned bad data, inform them.
1136 */
1137 if (rc->rc_tried && rc->rc_error == 0)
1138 raidz_checksum_error(zio, rc);
1139 if (rc1->rc_tried && rc1->rc_error == 0)
1140 raidz_checksum_error(zio, rc1);
1141
1142 rc->rc_error = ECKSUM;
1143 rc1->rc_error = ECKSUM;
1144
1145 goto done;
1146 }
1147
1148 bcopy(orig1, rc1->rc_data, rc1->rc_size);
1149 zio_buf_free(orig1, rc1->rc_size);
1150 }
1151
1152 bcopy(orig, rc->rc_data, rc->rc_size);
1153 zio_buf_free(orig, rc->rc_size);
1154 }
1155 }
1156
1157 /*
1158 * All combinations failed to checksum. Generate checksum ereports for
1159 * all children.
1160 */
1161 zio->io_error = ECKSUM;
1162 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1163 for (c = 0; c < rm->rm_cols; c++) {
1164 rc = &rm->rm_col[c];
1165 zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
1166 zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
1167 rc->rc_offset, rc->rc_size);
1168 }
1169 }
1170
1171done:
1172 zio_checksum_verified(zio);
1173
1174 if (zio->io_error == 0 && (spa_mode & FWRITE) &&
1175 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
1176 zio_t *rio;
1177
1178 /*
1179 * Use the good data we have in hand to repair damaged children.
1180 *
1181 * We issue all repair I/Os as children of 'rio' to arrange
1182 * that vdev_raidz_map_free(zio) will be invoked after all
1183 * repairs complete, but before we advance to the next stage.
1184 */
1185 rio = zio_null(zio, zio->io_spa,
1186 vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL);
1187
1188 for (c = 0; c < rm->rm_cols; c++) {
1189 rc = &rm->rm_col[c];
1190 cvd = vd->vdev_child[rc->rc_devidx];
1191
1192 if (rc->rc_error == 0)
1193 continue;
1194
1195 dprintf("%s resilvered %s @ 0x%llx error %d\n",
1196 vdev_description(vd),
1197 vdev_description(cvd),
1198 zio->io_offset, rc->rc_error);
1199
1200 zio_nowait(zio_vdev_child_io(rio, NULL, cvd,
1201 rc->rc_offset, rc->rc_data, rc->rc_size,
1202 ZIO_TYPE_WRITE, zio->io_priority,
1203 ZIO_FLAG_IO_REPAIR | ZIO_FLAG_DONT_PROPAGATE |
1204 ZIO_FLAG_CANFAIL, NULL, NULL));
1205 }
1206
1207 zio_nowait(rio);
1208
1209 return (zio_wait_for_children_done(zio));
1210 }
1211
1212 vdev_raidz_map_free(zio);
1213
1214 return (ZIO_PIPELINE_CONTINUE);
1215}
1216
1217static void
1218vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
1219{
1220 if (faulted > vd->vdev_nparity)
1221 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1222 VDEV_AUX_NO_REPLICAS);
1223 else if (degraded + faulted != 0)
1224 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
1225 else
1226 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
1227}
1228
1229vdev_ops_t vdev_raidz_ops = {
1230 vdev_raidz_open,
1231 vdev_raidz_close,
1232 NULL,
1233 vdev_raidz_asize,
1234 vdev_raidz_io_start,
1235 vdev_raidz_io_done,
1236 vdev_raidz_state_change,
1237 VDEV_TYPE_RAIDZ, /* name of this vdev type */
1238 B_FALSE /* not a leaf vdev */
1239};