/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
 */
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_raidz_impl.h>
/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */
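/*
 * A minimal, self-contained sketch of the arithmetic above (illustrative
 * only -- the gf_* names are hypothetical and nothing below is part of this
 * file's build). It constructs the power/log tables from the generator 2
 * and checks that table-based multiplication agrees with repeated
 * multiplication by 2:
 *
 *	#include <stdint.h>
 *	#include <assert.h>
 *
 *	static uint8_t
 *	gf_mul2(uint8_t a)
 *	{
 *		return ((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
 *	}
 *
 *	int
 *	main(void)
 *	{
 *		uint8_t gf_exp2[255], gf_log2[256];
 *		uint8_t v = 1, a = 0x53, b = 0xca, ab, chk;
 *		int i;
 *
 *		for (i = 0; i < 255; i++) {
 *			gf_exp2[i] = v;		// 2^i
 *			gf_log2[v] = i;		// log_2(v)
 *			v = gf_mul2(v);
 *		}
 *
 *		// A * B = 2^(log_2(A) + log_2(B)), '+' taken mod 255
 *		ab = gf_exp2[(gf_log2[a] + gf_log2[b]) % 255];
 *
 *		// Cross-check: multiply a by 2, log_2(b) times.
 *		for (chk = a, i = 0; i < gf_log2[b]; i++)
 *			chk = gf_mul2(chk);
 *		assert(ab == chk);
 *		return (0);
 *	}
 */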
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
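/*
 * A minimal sketch showing that VDEV_RAIDZ_64MUL_2() agrees with the
 * byte-at-a-time VDEV_RAIDZ_MUL_2() on eight packed bytes (illustrative
 * only; this check is not part of this file):
 *
 *	uint64_t x = 0x0123456789abcdefULL, y = x, mask;
 *	int i;
 *
 *	VDEV_RAIDZ_64MUL_2(x, mask);
 *	for (i = 0; i < 8; i++) {
 *		uint8_t b = (y >> (i * 8)) & 0xff;
 *		ASSERT3U((x >> (i * 8)) & 0xff, ==,
 *		    (uint8_t)VDEV_RAIDZ_MUL_2(b));
 *	}
 */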
void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;
	size_t size;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

		if (rm->rm_col[c].rc_gdata != NULL)
			zio_buf_free(rm->rm_col[c].rc_gdata,
			    rm->rm_col[c].rc_size);
	}

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	if (rm->rm_datacopy != NULL)
		zio_buf_free(rm->rm_datacopy, size);

	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}
static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	ASSERT0(rm->rm_freed);
	rm->rm_freed = 1;

	if (rm->rm_reports == 0)
		vdev_raidz_map_free(rm);
}
/*ARGSUSED*/
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
{
	raidz_map_t *rm = arg;

	ASSERT3U(rm->rm_reports, >, 0);

	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
		vdev_raidz_map_free(rm);
}
static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;
	size_t x;

	const char *good = NULL;
	const char *bad = rm->rm_col[c].rc_data;

	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_data;
				rm->rm_col[x].rc_data =
				    rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++)
				rm->rm_col[x].rc_data = bad_parity[x];

			buf = rm->rm_datacopy;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
}
/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely.  The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;
	caddr_t buf;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount  */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	if (rm->rm_datacopy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */
	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	buf = rm->rm_datacopy = zio_buf_alloc(size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];

		bcopy(col->rc_data, buf, col->rc_size);
		col->rc_data = buf;

		buf += col->rc_size;
	}
	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
}
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};
/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * Avoid inlining the function to keep vdev_raidz_io_start(), which
 * is this function's only caller, as small as possible on the stack.
 */
noinline raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = zio->io_offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = zio->io_size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_datacopy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data =
		    zio_buf_alloc(rm->rm_col[c].rc_size);

	rm->rm_col[c].rc_data = zio->io_data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;

	/* init RAIDZ parity ops */
	rm->rm_ops = vdev_raidz_math_get_ops();

	return (rm);
}
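/*
 * A concrete example of the geometry computed above (numbers are
 * illustrative): a 3.5K write (s = 7 sectors with a 512-byte sector size)
 * to a 5-wide raidz1 (dcols = 5, nparity = 1) gives
 *
 *	q   = 7 / (5 - 1)        = 1
 *	r   = 7 - 1 * (5 - 1)    = 3
 *	bc  = 3 + 1              = 4	(columns 0-3 get q + 1 sectors)
 *	tot = 7 + 1 * (1 + 1)    = 9	(data plus one parity sector per row)
 *
 * so rm_nskip = roundup(9, 2) - 9 = 1 padding sector, and rm_asize is
 * 10 sectors after rounding asize up to a multiple of (nparity + 1).
 */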
static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount, ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}
static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}
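/*
 * To see that the loop above really computes the Q formula from the block
 * comment at the top of the file, unroll it for three data columns
 * D_0, D_1, D_2 (a worked expansion, not additional code):
 *
 *	after D_0:	q = D_0
 *	after D_1:	q = 2 * D_0 + D_1
 *	after D_2:	q = 4 * D_0 + 2 * D_1 + D_2
 *
 * which is Q = 2^2 * D_0 + 2^1 * D_1 + 2^0 * D_2, i.e. Horner's rule
 * applied to the polynomial with coefficients D_0 .. D_n-1 evaluated at 2.
 */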
static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}
/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	/* Generate using the new math implementation */
	if (vdev_raidz_math_generate(rm) != RAIDZ_ORIGINAL_IMPL)
		return;

	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}
static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, i;
	int x = tgts[0];
	int c;

	ASSERT(ntgts == 1);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(x < rm->rm_cols);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
	ASSERT(xcount > 0);

	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	dst = rm->rm_col[x].rc_data;
	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst = *src;
	}

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			continue;

		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
		count = MIN(ccount, xcount);

		for (i = 0; i < count; i++, dst++, src++) {
			*dst ^= *src;
		}
	}

	return (1 << VDEV_RAIDZ_P);
}
static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int x = tgts[0];
	int c, j, exp;

	ASSERT(ntgts == 1);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			ccount = 0;
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}
		} else {
			for (i = 0; i < count; i++, dst++, src++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
			}
		}
	}

	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	exp = 255 - (rm->rm_cols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}

	return (1 << VDEV_RAIDZ_Q);
}
static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;
	int x = tgts[0];
	int y = tgts[1];

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |         __     __   |  p_0  |
 *            |  V  |         |  D_0  |   | p_m-1 |
 *            |     |    x    |   :   | = |  d_0  |
 *            |  I  |         | D_n-1 |   |   :   |
 *            |     |         ~~     ~~   | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen both for simplicity and
 * speedy computation, as well as for linear separability.
 *
 *      __               __               __     __
 *      |   1   ..  1 1 1 |               |  p_0  |
 *      | 2^n-1 ..  4 2 1 |   __     __   |   :   |
 *      | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 |
 *      |   1   ..  0 0 0 |   |  D_1  |   |  d_0  |
 *      |   0   ..  0 0 0 | x |  D_2  | = |  d_1  |
 *      |   :       : : : |   |   :   |   |  d_2  |
 *      |   0   ..  1 0 0 |   | D_n-1 |   |   :   |
 *      |   0   ..  0 1 0 |   ~~     ~~   |   :   |
 *      |   0   ..  0 0 1 |               | d_n-1 |
 *      ~~               ~~               ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           | 128  64  32  16  8   4   2   1  | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16  4   1  |      / /
 *           |  1   0   0   0   0   0   0   0  |     / /
 *           |  0   1   0   0   0   0   0   0  | <--' /
 *  (V|I)  = |  0   0   1   0   0   0   0   0  | <---'
 *           |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *           __                               __
 *           |  1   1   1   1   1   1   1   1  |
 *           |  19 205 116  29  64  16  4   1  |
 *           |  1   0   0   0   0   0   0   0  |
 *  (V|I)' = |  0   0   0   1   0   0   0   0  |
 *           |  0   0   0   0   1   0   0   0  |
 *           |  0   0   0   0   0   1   0   0  |
 *           |  0   0   0   0   0   0   1   0  |
 *           |  0   0   0   0   0   0   0   1  |
 *           ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 * __                                                                 __
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0  |
 * |  19 205 116  29  64  16  4   1     0   1   0   0   0   0   0   0  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                 __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0  |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1  |
 * |  0  205 116  0   0   0   0   0     0   1   19  29  64  16  4   1  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0  |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0  |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0  |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0  |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1  |
 * ~~                                                                 ~~
 * __                                                                  __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0   |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1   |
 * |  0   0  185  0   0   0   0   0    205  1  222 208 141 221 201 204  |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0   |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0   |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0   |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0   |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1   |
 * ~~                                                                  ~~
 * __                                                                  __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0   |
 * |  0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1   |
 * |  0   0   1   0   0   0   0   0    166 100  4  40 158 168 216 209   |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0   |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0   |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0   |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0   |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1   |
 * ~~                                                                  ~~
 * __                                                                  __
 * |  1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0   |
 * |  0   1   0   0   0   0   0   0    167 100  5  41 159 169 217 208   |
 * |  0   0   1   0   0   0   0   0    166 100  4  40 158 168 216 209   |
 * |  0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0   |
 * |  0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0   |
 * |  0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0   |
 * |  0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0   |
 * |  0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1   |
 * ~~                                                                  ~~
 *
 *                   __                               __
 *                   |  0   0   1   0   0   0   0   0  |
 *                   | 167 100  5  41 159 169 217 208  |
 *                   | 166 100  4  40 158 168 216 209  |
 *       (V|I)'^-1 = |  0   0   0   1   0   0   0   0  |
 *                   |  0   0   0   0   1   0   0   0  |
 *                   |  0   0   0   0   0   1   0   0  |
 *                   |  0   0   0   0   0   0   1   0  |
 *                   |  0   0   0   0   0   0   0   1  |
 *                   ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
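/*
 * For a sense of what vdev_raidz_matrix_init() produces (worked numbers,
 * not additional code): with n = 4 data columns and map[i] = 1 (the Q
 * row), pow starts at 1 * 4 = 4 and drops by map[i] per column, yielding
 *
 *	rows[i] = { 2^3, 2^2, 2^1, 2^0 } = { 8, 4, 2, 1 }
 *
 * which matches the Q coefficients 2^(n-1-j) from the dispersal matrix in
 * the comment above. The P row (map[i] = 0) is all 1s, and the R row
 * (map[i] = 2) is 4^(n-1-j).
 */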
static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}
static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}
static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;

	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}
int
vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c, ret;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	dt = &tgts[nbadparity];

	/* Reconstruct using the new math implementation */
	ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata);
	if (ret != RAIDZ_ORIGINAL_IMPL)
		return (ret);

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	switch (nbaddata) {
	case 1:
		if (parity_valid[VDEV_RAIDZ_P])
			return (vdev_raidz_reconstruct_p(rm, dt, 1));

		ASSERT(rm->rm_firstdatacol > 1);

		if (parity_valid[VDEV_RAIDZ_Q])
			return (vdev_raidz_reconstruct_q(rm, dt, 1));

		ASSERT(rm->rm_firstdatacol > 2);
		break;
	case 2:
		ASSERT(rm->rm_firstdatacol > 1);

		if (parity_valid[VDEV_RAIDZ_P] &&
		    parity_valid[VDEV_RAIDZ_Q])
			return (vdev_raidz_reconstruct_pq(rm, dt, 2));

		ASSERT(rm->rm_firstdatacol > 2);
		break;
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *ashift)
{
	vdev_t *cvd;
	uint64_t nparity = vd->vdev_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*ashift = MAX(*ashift, cvd->vdev_ashift);
	}

	*asize *= vd->vdev_children;
	*max_asize *= vd->vdev_children;

	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}
static void
vdev_raidz_close(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}
static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	uint64_t cols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;

	asize = ((psize - 1) >> ashift) + 1;
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = roundup(asize, nparity + 1) << ashift;

	return (asize);
}
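/*
 * A worked example of the asize computation above (illustrative numbers):
 * for a 5-wide raidz1 (cols = 5, nparity = 1) with a 512-byte ashift and
 * psize = 3584 (7 sectors):
 *
 *	asize = ((3584 - 1) >> 9) + 1		= 7
 *	asize += 1 * ((7 + 5 - 1 - 1) / 4)	= 7 + 2 = 9
 *	asize = roundup(9, 1 + 1) << 9		= 10 sectors = 5120 bytes
 *
 * i.e. seven data sectors, two parity sectors, and one padding sector --
 * matching rm_asize in vdev_raidz_map_alloc() for the same request.
 */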
static void
vdev_raidz_child_done(zio_t *zio)
{
	raidz_col_t *rc = zio->io_private;

	rc->rc_error = zio->io_error;
	rc->rc_tried = 1;
	rc->rc_skipped = 0;
}
/*
 * Start an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Generate the parity data
 *   2. Create child zio write operations to each column's vdev, for both
 *      data and parity.
 *   3. If the column skips any sectors for padding, create optional dummy
 *      write zio children for those areas to improve aggregation continuity.
 * - For read operations:
 *   1. Create child zio read operations to each data column's vdev to read
 *      the range of data required for zio.
 *   2. If this is a scrub or resilver operation, or if any of the data
 *      vdevs have had errors, then create zio read operations to the parity
 *      columns' VDevs as well.
 */
static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		zio_execute(zio);
		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	zio_execute(zio);
}
/*
 * Report a checksum error for a child of a RAID-Z device.
 */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
{
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		zio_bad_cksum_t zbc;
		raidz_map_t *rm = zio->io_vsd;

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		zbc.zbc_has_cksum = 0;
		zbc.zbc_injected = rm->rm_ecksuminjected;

		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
		    &zbc);
	}
}
/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t zbc;
	raidz_map_t *rm = zio->io_vsd;
	int ret;

	bzero(&zbc, sizeof (zio_bad_cksum_t));

	ret = zio_checksum_error(zio, &zbc);
	if (ret != 0 && zbc.zbc_injected != 0)
		rm->rm_ecksuminjected = 1;

	return (ret);
}
/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zio_buf_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			raidz_checksum_error(zio, rc, orig[c]);
			rc->rc_error = SET_ERROR(ECKSUM);
			ret++;
		}
		zio_buf_free(orig[c], rc->rc_size);
	}

	return (ret);
}
static int
vdev_raidz_worst_error(raidz_map_t *rm)
{
	int c, error = 0;

	for (c = 0; c < rm->rm_cols; c++)
		error = zio_worst_error(error, rm->rm_col[c].rc_error);

	return (error);
}
/*
 * Iterate over all combinations of bad data and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed. For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 */
static int
vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
{
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int tstore[VDEV_RAIDZ_MAXPARITY + 2];
	int *tgts = &tstore[1];
	int curr, next, i, c, n;
	int code, ret = 0;

	ASSERT(total_errors < rm->rm_firstdatacol);

	/*
	 * This simplifies one edge condition.
	 */
	tgts[-1] = -1;

	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		curr = 0;
		next = tgts[curr];

		while (curr != n) {
			tgts[curr] = next;
			curr = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);

				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);

				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = SET_ERROR(ECKSUM);
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the curr
				 * position.
				 */
				for (next = tgts[curr] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[curr + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[curr + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[curr - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[curr] = c;
				curr++;

			} while (curr != n);
		}
	}
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}
/*
 * Complete an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Check for errors on the child IOs.
 *   2. Return, setting an error code if too few child VDevs were written
 *      to reconstruct the data later.  Note that partial writes are
 *      considered successful if they can be reconstructed at all.
 * - For read operations:
 *   1. Check for errors on the child IOs.
 *   2. If data errors occurred:
 *      a. Try to reassemble the data from the parity available.
 *      b. If we haven't yet read the parity drives, read them now.
 *      c. If all parity drives have been read but the data still doesn't
 *         reassemble with a correct checksum, then try combinatorial
 *         reconstruction.
 *      d. If that doesn't work, return an error.
 *   3. If there were unexpected errors or this is a resilver operation,
 *      rewrite the vdevs that had errors.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc = NULL;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, and the IO wasn't speculative.
		 */
		zio->io_error = SET_ERROR(ECKSUM);

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error == 0)
				continue;

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}
}
static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	if (faulted > vd->vdev_nparity)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
}
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	NULL,
	NULL,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};