/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports both single and double parity. For single parity, we
 * use a simple XOR of all the data columns. For double parity, we use both
 * the simple XOR as well as a technique described in "The mathematics of
 * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8),
 * over the integers expressible in a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is A^254.
 *
 * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1,
 * can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *
 * See the reconstruction code below for how P and Q can be used individually
 * or in concert to recover missing data columns.
 */
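
/*
 * For example, 0x80 * 2 = 0x1d (the high bit shifts out and 0x1d is XORed
 * in), and 0x04 * 0x08 = 2^(2 + 3) = 0x20 using the log and power tables
 * below. With two data columns D_0 = 0x02 and D_1 = 0x01, the parity bytes
 * are P = 0x02 + 0x01 = 0x03 and Q = 0x02 * 2 + 0x01 = 0x05.
 */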

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Column count */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1

#define	VDEV_RAIDZ_MAXPARITY	2

#define	VDEV_RAIDZ_MUL_2(a)	(((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0))

/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
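/*
 * Multiplication of two field elements A and B can then be computed as
 * vdev_raidz_pow2[(vdev_raidz_log2[A] + vdev_raidz_log2[B]) % 255]. Zero
 * has no logarithm (its 0x00 entry below is a placeholder), which is why
 * vdev_raidz_exp2() special-cases a == 0.
 */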
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

/*
 * Multiply a given number by 2 raised to the given power.
 */
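/* For example, vdev_raidz_exp2(0x04, 3) == 0x04 * 2^3 == 0x20. */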
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}

static void
vdev_raidz_map_free(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;
	int c;

	for (c = 0; c < rm->rm_firstdatacol; c++)
		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols]));
}

static raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = zio->io_offset >> unit_shift;
	uint64_t s = zio->io_size >> unit_shift;
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, coff, devidx;

	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);

	acols = (q == 0 ? bc : dcols);
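	/*
	 * For example, a 16K write to a 5-wide raidz1 with 512-byte sectors
	 * has s = 32, giving q = 8, r = 0, bc = 0 and acols = 5: one
	 * 8-sector parity column plus four 8-sector data columns.
	 */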

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_bigcols = bc;
	rm->rm_asize = 0;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;

	for (c = 0; c < acols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;
		rm->rm_asize += rm->rm_col[c].rc_size;
	}

	rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift);
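	/*
	 * The asize roundup to a multiple of (nparity + 1) sectors presumably
	 * keeps RAID-Z allocations from leaving free segments too small to
	 * hold a data sector plus its parity.
	 */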

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);

	rm->rm_col[c].rc_data = zio->io_data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;
	}

	zio->io_vsd = rm;
	zio->io_vsd_free = vdev_raidz_map_free;
	return (rm);
}

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount, ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, p++, src++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, p++, src++) {
				*p ^= *src;
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *q, *p, *src, pcount, ccount, mask, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount || ccount == 0);
			for (i = 0; i < ccount; i++, p++, q++, src++) {
				*q = *src;
				*p = *src;
			}
			for (; i < pcount; i++, p++, q++, src++) {
				*q = 0;
				*p = 0;
			}
		} else {
			ASSERT(ccount <= pcount);

			/*
			 * Rather than multiplying each byte individually (as
			 * described above), we are able to handle 8 at once
			 * by generating a mask based on the high bit in each
			 * byte and using that to conditionally XOR in 0x1d.
			 */
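			/*
			 * For a byte of *q with its high bit set (say 0x80),
			 * the corresponding mask byte becomes 0xff, so that
			 * byte is shifted left and XORed with 0x1d -- exactly
			 * VDEV_RAIDZ_MUL_2(0x80) == 0x1d. Bytes with the high
			 * bit clear get a 0x00 mask byte and are simply
			 * shifted left.
			 */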
			for (i = 0; i < ccount; i++, p++, q++, src++) {
				mask = *q & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
				*q ^= *src;
				*p ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 */
			for (; i < pcount; i++, q++) {
				mask = *q & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*q = ((*q << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
			}
		}
	}
}

static void
vdev_raidz_reconstruct_p(raidz_map_t *rm, int x)
{
	uint64_t *dst, *src, xcount, ccount, count, i;
	int c;

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
	ASSERT(xcount > 0);

	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	dst = rm->rm_col[x].rc_data;
	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst = *src;
	}

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			continue;

		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
		count = MIN(ccount, xcount);

		for (i = 0; i < count; i++, dst++, src++) {
			*dst ^= *src;
		}
	}
}

static void
vdev_raidz_reconstruct_q(raidz_map_t *rm, int x)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int c, j, exp;

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			ccount = 0;
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			/*
			 * For an explanation of this, see the comment in
			 * vdev_raidz_generate_parity_pq() above.
			 */
			for (i = 0; i < count; i++, dst++, src++) {
				mask = *dst & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				mask = *dst & 0x8080808080808080ULL;
				mask = (mask << 1) - (mask >> 7);
				*dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^
				    (mask & 0x1d1d1d1d1d1d1d1dULL);
			}
		}
	}

	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	exp = 255 - (rm->rm_cols - 1 - x);
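	/*
	 * The loop above computed Q as it would be if column x were zero, so
	 * XORing in the real Q below leaves 2^(rm_cols - 1 - x) * D_x.
	 * Multiplying each byte by 2^exp then cancels that coefficient
	 * (2^255 == 1 in this field), leaving D_x.
	 */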

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}
}

static void
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;

	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^-(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
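	/*
	 * Here a = 2^(x - y), 2^tmp = (2^(x - y) + 1)^-1 and b is the inverse
	 * of column x's coefficient in Q, so aexp and bexp are the logarithms
	 * of the constants A and B above; the loop below recovers each byte
	 * of D_x with two vdev_raidz_exp2() multiplications.
	 */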

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
}


static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
{
	vdev_t *cvd;
	uint64_t nparity = vd->vdev_nparity;
	int c, error;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if ((error = vdev_open(cvd)) != 0) {
			lasterror = error;
			numerrors++;
			continue;
		}

		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*ashift = MAX(*ashift, cvd->vdev_ashift);
	}

	*asize *= vd->vdev_children;

	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

static void
vdev_raidz_close(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}

static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	uint64_t cols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;

	asize = ((psize - 1) >> ashift) + 1;
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = roundup(asize, nparity + 1) << ashift;
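	/*
	 * For example, a 3-sector block on a 5-wide raidz1 needs three data
	 * sectors plus one parity sector; rounding up to a multiple of
	 * nparity + 1 leaves that at 4 sectors, which is then converted back
	 * to bytes.
	 */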

	return (asize);
}

static void
vdev_raidz_child_done(zio_t *zio)
{
	raidz_col_t *rc = zio->io_private;

	rc->rc_error = zio->io_error;
	rc->rc_tried = 1;
	rc->rc_skipped = 0;
}

static int
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	blkptr_t *bp = zio->io_bp;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c;

	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * Generate RAID parity in the first virtual columns.
		 */
		if (rm->rm_firstdatacol == 1)
			vdev_raidz_generate_parity_p(rm);
		else
			vdev_raidz_generate_parity_pq(rm);

		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		return (ZIO_PIPELINE_CONTINUE);
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity
	 * data.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & ZIO_FLAG_SCRUB)) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}

/*
 * Report a checksum error for a child of a RAID-Z device.
 */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
{
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);
	}

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
		zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
		    zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zio_buf_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	if (rm->rm_firstdatacol == 1)
		vdev_raidz_generate_parity_p(rm);
	else
		vdev_raidz_generate_parity_pq(rm);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			raidz_checksum_error(zio, rc);
			rc->rc_error = ECKSUM;
			ret++;
		}
		zio_buf_free(orig[c], rc->rc_size);
	}

	return (ret);
}

static uint64_t raidz_corrected_p;
static uint64_t raidz_corrected_q;
static uint64_t raidz_corrected_pq;

static int
vdev_raidz_worst_error(raidz_map_t *rm)
{
	int error = 0;

	for (int c = 0; c < rm->rm_cols; c++)
		error = zio_worst_error(error, rm->rm_col[c].rc_error);

	return (error);
}

static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc, *rc1;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c, c1;

	ASSERT(zio->io_bp != NULL);	/* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed. Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		switch (data_errors) {
		case 0:
			if (zio_checksum_error(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
			break;

		case 1:
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Find the column that reported the error.
			 */
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0)
					break;
			}
			ASSERT(c != rm->rm_cols);
			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
			    rc->rc_error == ESTALE);

			if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
				vdev_raidz_reconstruct_p(rm, c);
			} else {
				ASSERT(rm->rm_firstdatacol > 1);
				vdev_raidz_reconstruct_q(rm, c);
			}

			if (zio_checksum_error(zio) == 0) {
				if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0)
					atomic_inc_64(&raidz_corrected_p);
				else
					atomic_inc_64(&raidz_corrected_q);

				/*
				 * If there's more than one parity disk that
				 * was successfully read, confirm that the
				 * other parity disk produced the correct data.
				 * This routine is suboptimal in that it
				 * regenerates both the parity we wish to test
				 * as well as the parity we just used to
				 * perform the reconstruction, but this should
				 * be a relatively uncommon case, and can be
				 * optimized if it becomes a problem.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors < rm->rm_firstdatacol - 1 ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
			break;

		case 2:
			/*
			 * Two data column errors require double parity.
			 */
			ASSERT(rm->rm_firstdatacol == 2);

			/*
			 * Find the two columns that reported errors.
			 */
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0)
					break;
			}
			ASSERT(c != rm->rm_cols);
			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
			    rc->rc_error == ESTALE);

			for (c1 = c++; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0)
					break;
			}
			ASSERT(c != rm->rm_cols);
			ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO ||
			    rc->rc_error == ESTALE);

			vdev_raidz_reconstruct_pq(rm, c1, c);

			if (zio_checksum_error(zio) == 0) {
				atomic_inc_64(&raidz_corrected_pq);
				goto done;
			}
			break;

		default:
			ASSERT(rm->rm_firstdatacol <= 2);
			ASSERT(0);
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. Before we attempt combinatorial reconstruction make
	 * sure we have a chance of coming up with the right answer.
	 */
	if (total_errors >= rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);
		/*
		 * If there were exactly as many device errors as parity
		 * columns, yet we couldn't reconstruct the data, then at
		 * least one device must have returned bad data silently.
		 */
		if (total_errors == rm->rm_firstdatacol)
			zio->io_error = zio_worst_error(zio->io_error, ECKSUM);
		goto done;
	}

	if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) {
		/*
		 * Attempt to reconstruct the data from parity P.
		 */
		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			void *orig;
			rc = &rm->rm_col[c];

			orig = zio_buf_alloc(rc->rc_size);
			bcopy(rc->rc_data, orig, rc->rc_size);
			vdev_raidz_reconstruct_p(rm, c);

			if (zio_checksum_error(zio) == 0) {
				zio_buf_free(orig, rc->rc_size);
				atomic_inc_64(&raidz_corrected_p);

				/*
				 * If this child didn't know that it returned
				 * bad data, inform it.
				 */
				if (rc->rc_tried && rc->rc_error == 0)
					raidz_checksum_error(zio, rc);
				rc->rc_error = ECKSUM;
				goto done;
			}

			bcopy(orig, rc->rc_data, rc->rc_size);
			zio_buf_free(orig, rc->rc_size);
		}
	}

	if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
		/*
		 * Attempt to reconstruct the data from parity Q.
		 */
		for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
			void *orig;
			rc = &rm->rm_col[c];

			orig = zio_buf_alloc(rc->rc_size);
			bcopy(rc->rc_data, orig, rc->rc_size);
			vdev_raidz_reconstruct_q(rm, c);

			if (zio_checksum_error(zio) == 0) {
				zio_buf_free(orig, rc->rc_size);
				atomic_inc_64(&raidz_corrected_q);

				/*
				 * If this child didn't know that it returned
				 * bad data, inform it.
				 */
				if (rc->rc_tried && rc->rc_error == 0)
					raidz_checksum_error(zio, rc);
				rc->rc_error = ECKSUM;
				goto done;
			}

			bcopy(orig, rc->rc_data, rc->rc_size);
			zio_buf_free(orig, rc->rc_size);
		}
	}

	if (rm->rm_firstdatacol > 1 &&
	    rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 &&
	    rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) {
		/*
		 * Attempt to reconstruct the data from both P and Q.
		 */
		for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) {
			void *orig, *orig1;
			rc = &rm->rm_col[c];

			orig = zio_buf_alloc(rc->rc_size);
			bcopy(rc->rc_data, orig, rc->rc_size);

			for (c1 = c + 1; c1 < rm->rm_cols; c1++) {
				rc1 = &rm->rm_col[c1];

				orig1 = zio_buf_alloc(rc1->rc_size);
				bcopy(rc1->rc_data, orig1, rc1->rc_size);

				vdev_raidz_reconstruct_pq(rm, c, c1);

				if (zio_checksum_error(zio) == 0) {
					zio_buf_free(orig, rc->rc_size);
					zio_buf_free(orig1, rc1->rc_size);
					atomic_inc_64(&raidz_corrected_pq);

					/*
					 * If these children didn't know they
					 * returned bad data, inform them.
					 */
					if (rc->rc_tried && rc->rc_error == 0)
						raidz_checksum_error(zio, rc);
					if (rc1->rc_tried && rc1->rc_error == 0)
						raidz_checksum_error(zio, rc1);

					rc->rc_error = ECKSUM;
					rc1->rc_error = ECKSUM;

					goto done;
				}

				bcopy(orig1, rc1->rc_data, rc1->rc_size);
				zio_buf_free(orig1, rc1->rc_size);
			}

			bcopy(orig, rc->rc_data, rc->rc_size);
			zio_buf_free(orig, rc->rc_size);
		}
	}

	/*
	 * All combinations failed to checksum. Generate checksum ereports for
	 * all children.
	 */
	zio->io_error = ECKSUM;

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
			    zio->io_spa, vd->vdev_child[rc->rc_devidx], zio,
			    rc->rc_offset, rc->rc_size);
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    ZIO_TYPE_WRITE, zio->io_priority,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}

static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted > vd->vdev_nparity)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}

vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};