4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
27 #include <sys/zfs_context.h>
29 #include <sys/vdev_impl.h>
31 #include <sys/zio_checksum.h>
32 #include <sys/fs/zfs.h>
33 #include <sys/fm/fs/zfs.h>
36 * Virtual device vector for RAID-Z.
38 * This vdev supports single, double, and triple parity. For single parity,
39 * we use a simple XOR of all the data columns. For double or triple parity,
40 * we use a special case of Reed-Solomon coding. This extends the
41 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
42 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
43 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
44 * former is also based. The latter is designed to provide higher performance
47 * Note that the Plank paper claimed to support arbitrary N+M, but was then
48 * amended six years later identifying a critical flaw that invalidates its
49 * claims. Nevertheless, the technique can be adapted to work for up to
50 * triple parity. For additional parity, the amendment "Note: Correction to
51 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
52 * is viable, but the additional complexity means that write performance will
55 * All of the methods above operate on a Galois field, defined over the
56 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements
57 * can be expressed with a single byte. Briefly, the operations on the
58 * field are defined as follows:
60 * o addition (+) is represented by a bitwise XOR
61 * o subtraction (-) is therefore identical to addition: A + B = A - B
62 * o multiplication of A by 2 is defined by the following bitwise expression:
67 * (A * 2)_4 = A_3 + A_7
68 * (A * 2)_3 = A_2 + A_7
69 * (A * 2)_2 = A_1 + A_7
73 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
74 * As an aside, this multiplication is derived from the error correcting
75 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
77 * Observe that any number in the field (except for 0) can be expressed as a
78 * power of 2 -- a generator for the field. We store a table of the powers of
79 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
80 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
81 * than field addition). The inverse of a field element A (A^-1) is therefore
82 * A ^ (255 - 1) = A^254.
84 * The up-to-three parity columns, P, Q, R over several data columns,
85 * D_0, ... D_n-1, can be expressed by field operations:
87 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
88 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
89 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
90 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
91 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
93 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
94 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
95 * independent coefficients. (There are no additional coefficients that have
96 * this property which is why the uncorrected Plank method breaks down.)
98 * See the reconstruction code below for how P, Q and R can be used individually
99 * or in concert to recover missing data columns.
102 typedef struct raidz_col
{
103 uint64_t rc_devidx
; /* child device index for I/O */
104 uint64_t rc_offset
; /* device offset */
105 uint64_t rc_size
; /* I/O size */
106 void *rc_data
; /* I/O data */
107 void *rc_gdata
; /* used to store the "good" version */
108 int rc_error
; /* I/O error for this device */
109 uint8_t rc_tried
; /* Did we attempt this I/O column? */
110 uint8_t rc_skipped
; /* Did we skip this I/O column? */
113 typedef struct raidz_map
{
114 uint64_t rm_cols
; /* Regular column count */
115 uint64_t rm_scols
; /* Count including skipped columns */
116 uint64_t rm_bigcols
; /* Number of oversized columns */
117 uint64_t rm_asize
; /* Actual total I/O size */
118 uint64_t rm_missingdata
; /* Count of missing data devices */
119 uint64_t rm_missingparity
; /* Count of missing parity devices */
120 uint64_t rm_firstdatacol
; /* First data column/parity count */
121 uint64_t rm_nskip
; /* Skipped sectors for padding */
122 uint64_t rm_skipstart
; /* Column index of padding start */
123 void *rm_datacopy
; /* rm_asize-buffer of copied data */
124 uintptr_t rm_reports
; /* # of referencing checksum reports */
125 uint8_t rm_freed
; /* map no longer has referencing ZIO */
126 uint8_t rm_ecksuminjected
; /* checksum error was injected */
127 raidz_col_t rm_col
[1]; /* Flexible array of I/O columns */
130 #define VDEV_RAIDZ_P 0
131 #define VDEV_RAIDZ_Q 1
132 #define VDEV_RAIDZ_R 2
134 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
135 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
138 * We provide a mechanism to perform the field multiplication operation on a
139 * 64-bit value all at once rather than a byte at a time. This works by
140 * creating a mask from the top bit in each byte and using that to
141 * conditionally apply the XOR of 0x1d.
143 #define VDEV_RAIDZ_64MUL_2(x, mask) \
145 (mask) = (x) & 0x8080808080808080ULL; \
146 (mask) = ((mask) << 1) - ((mask) >> 7); \
147 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
148 ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
151 #define VDEV_RAIDZ_64MUL_4(x, mask) \
153 VDEV_RAIDZ_64MUL_2((x), mask); \
154 VDEV_RAIDZ_64MUL_2((x), mask); \
158 * Force reconstruction to use the general purpose method.
160 int vdev_raidz_default_to_general
;
162 /* Powers of 2 in the Galois field defined above. */
163 static const uint8_t vdev_raidz_pow2
[256] = {
164 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
165 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
166 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
167 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
168 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
169 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
170 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
171 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
172 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
173 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
174 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
175 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
176 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
177 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
178 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
179 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
180 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
181 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
182 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
183 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
184 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
185 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
186 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
187 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
188 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
189 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
190 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
191 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
192 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
193 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
194 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
195 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
197 /* Logs of 2 in the Galois field defined above. */
198 static const uint8_t vdev_raidz_log2
[256] = {
199 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
200 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
201 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
202 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
203 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
204 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
205 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
206 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
207 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
208 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
209 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
210 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
211 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
212 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
213 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
214 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
215 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
216 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
217 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
218 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
219 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
220 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
221 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
222 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
223 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
224 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
225 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
226 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
227 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
228 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
229 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
230 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
233 static void vdev_raidz_generate_parity(raidz_map_t
*rm
);
236 * Multiply a given number by 2 raised to the given power.
239 vdev_raidz_exp2(uint_t a
, int exp
)
245 ASSERT(vdev_raidz_log2
[a
] > 0 || a
== 1);
247 exp
+= vdev_raidz_log2
[a
];
251 return (vdev_raidz_pow2
[exp
]);
255 vdev_raidz_map_free(raidz_map_t
*rm
)
260 for (c
= 0; c
< rm
->rm_firstdatacol
; c
++) {
261 zio_buf_free(rm
->rm_col
[c
].rc_data
, rm
->rm_col
[c
].rc_size
);
263 if (rm
->rm_col
[c
].rc_gdata
!= NULL
)
264 zio_buf_free(rm
->rm_col
[c
].rc_gdata
,
265 rm
->rm_col
[c
].rc_size
);
269 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++)
270 size
+= rm
->rm_col
[c
].rc_size
;
272 if (rm
->rm_datacopy
!= NULL
)
273 zio_buf_free(rm
->rm_datacopy
, size
);
275 kmem_free(rm
, offsetof(raidz_map_t
, rm_col
[rm
->rm_scols
]));
279 vdev_raidz_map_free_vsd(zio_t
*zio
)
281 raidz_map_t
*rm
= zio
->io_vsd
;
283 ASSERT0(rm
->rm_freed
);
286 if (rm
->rm_reports
== 0)
287 vdev_raidz_map_free(rm
);
292 vdev_raidz_cksum_free(void *arg
, size_t ignored
)
294 raidz_map_t
*rm
= arg
;
296 ASSERT3U(rm
->rm_reports
, >, 0);
298 if (--rm
->rm_reports
== 0 && rm
->rm_freed
!= 0)
299 vdev_raidz_map_free(rm
);
303 vdev_raidz_cksum_finish(zio_cksum_report_t
*zcr
, const void *good_data
)
305 raidz_map_t
*rm
= zcr
->zcr_cbdata
;
306 size_t c
= zcr
->zcr_cbinfo
;
309 const char *good
= NULL
;
310 const char *bad
= rm
->rm_col
[c
].rc_data
;
312 if (good_data
== NULL
) {
313 zfs_ereport_finish_checksum(zcr
, NULL
, NULL
, B_FALSE
);
317 if (c
< rm
->rm_firstdatacol
) {
319 * The first time through, calculate the parity blocks for
320 * the good data (this relies on the fact that the good
321 * data never changes for a given logical ZIO)
323 if (rm
->rm_col
[0].rc_gdata
== NULL
) {
324 char *bad_parity
[VDEV_RAIDZ_MAXPARITY
];
328 * Set up the rm_col[]s to generate the parity for
329 * good_data, first saving the parity bufs and
330 * replacing them with buffers to hold the result.
332 for (x
= 0; x
< rm
->rm_firstdatacol
; x
++) {
333 bad_parity
[x
] = rm
->rm_col
[x
].rc_data
;
334 rm
->rm_col
[x
].rc_data
= rm
->rm_col
[x
].rc_gdata
=
335 zio_buf_alloc(rm
->rm_col
[x
].rc_size
);
338 /* fill in the data columns from good_data */
339 buf
= (char *)good_data
;
340 for (; x
< rm
->rm_cols
; x
++) {
341 rm
->rm_col
[x
].rc_data
= buf
;
342 buf
+= rm
->rm_col
[x
].rc_size
;
346 * Construct the parity from the good data.
348 vdev_raidz_generate_parity(rm
);
350 /* restore everything back to its original state */
351 for (x
= 0; x
< rm
->rm_firstdatacol
; x
++)
352 rm
->rm_col
[x
].rc_data
= bad_parity
[x
];
354 buf
= rm
->rm_datacopy
;
355 for (x
= rm
->rm_firstdatacol
; x
< rm
->rm_cols
; x
++) {
356 rm
->rm_col
[x
].rc_data
= buf
;
357 buf
+= rm
->rm_col
[x
].rc_size
;
361 ASSERT3P(rm
->rm_col
[c
].rc_gdata
, !=, NULL
);
362 good
= rm
->rm_col
[c
].rc_gdata
;
364 /* adjust good_data to point at the start of our column */
367 for (x
= rm
->rm_firstdatacol
; x
< c
; x
++)
368 good
+= rm
->rm_col
[x
].rc_size
;
371 /* we drop the ereport if it ends up that the data was good */
372 zfs_ereport_finish_checksum(zcr
, good
, bad
, B_TRUE
);
376 * Invoked indirectly by zfs_ereport_start_checksum(), called
377 * below when our read operation fails completely. The main point
378 * is to keep a copy of everything we read from disk, so that at
379 * vdev_raidz_cksum_finish() time we can compare it with the good data.
382 vdev_raidz_cksum_report(zio_t
*zio
, zio_cksum_report_t
*zcr
, void *arg
)
384 size_t c
= (size_t)(uintptr_t)arg
;
387 raidz_map_t
*rm
= zio
->io_vsd
;
390 /* set up the report and bump the refcount */
391 zcr
->zcr_cbdata
= rm
;
393 zcr
->zcr_finish
= vdev_raidz_cksum_finish
;
394 zcr
->zcr_free
= vdev_raidz_cksum_free
;
397 ASSERT3U(rm
->rm_reports
, >, 0);
399 if (rm
->rm_datacopy
!= NULL
)
403 * It's the first time we're called for this raidz_map_t, so we need
404 * to copy the data aside; there's no guarantee that our zio's buffer
405 * won't be re-used for something else.
407 * Our parity data is already in separate buffers, so there's no need
412 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++)
413 size
+= rm
->rm_col
[c
].rc_size
;
415 buf
= rm
->rm_datacopy
= zio_buf_alloc(size
);
417 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
418 raidz_col_t
*col
= &rm
->rm_col
[c
];
420 bcopy(col
->rc_data
, buf
, col
->rc_size
);
425 ASSERT3P(buf
- (caddr_t
)rm
->rm_datacopy
, ==, size
);
428 static const zio_vsd_ops_t vdev_raidz_vsd_ops
= {
429 vdev_raidz_map_free_vsd
,
430 vdev_raidz_cksum_report
434 * Divides the IO evenly across all child vdevs; usually, dcols is
435 * the number of children in the target vdev.
437 * Avoid inlining the function to keep vdev_raidz_io_start(), which
438 * is this functions only caller, as small as possible on the stack.
440 noinline
static raidz_map_t
*
441 vdev_raidz_map_alloc(zio_t
*zio
, uint64_t unit_shift
, uint64_t dcols
,
445 /* The starting RAIDZ (parent) vdev sector of the block. */
446 uint64_t b
= zio
->io_offset
>> unit_shift
;
447 /* The zio's size in units of the vdev's minimum sector size. */
448 uint64_t s
= zio
->io_size
>> unit_shift
;
449 /* The first column for this stripe. */
450 uint64_t f
= b
% dcols
;
451 /* The starting byte offset on each child vdev. */
452 uint64_t o
= (b
/ dcols
) << unit_shift
;
453 uint64_t q
, r
, c
, bc
, col
, acols
, scols
, coff
, devidx
, asize
, tot
;
456 * "Quotient": The number of data sectors for this stripe on all but
457 * the "big column" child vdevs that also contain "remainder" data.
459 q
= s
/ (dcols
- nparity
);
462 * "Remainder": The number of partial stripe data sectors in this I/O.
463 * This will add a sector to some, but not all, child vdevs.
465 r
= s
- q
* (dcols
- nparity
);
467 /* The number of "big columns" - those which contain remainder data. */
468 bc
= (r
== 0 ? 0 : r
+ nparity
);
471 * The total number of data and parity sectors associated with
474 tot
= s
+ nparity
* (q
+ (r
== 0 ? 0 : 1));
476 /* acols: The columns that will be accessed. */
477 /* scols: The columns that will be accessed or skipped. */
479 /* Our I/O request doesn't span all child vdevs. */
481 scols
= MIN(dcols
, roundup(bc
, nparity
+ 1));
487 ASSERT3U(acols
, <=, scols
);
489 rm
= kmem_alloc(offsetof(raidz_map_t
, rm_col
[scols
]), KM_SLEEP
);
492 rm
->rm_scols
= scols
;
494 rm
->rm_skipstart
= bc
;
495 rm
->rm_missingdata
= 0;
496 rm
->rm_missingparity
= 0;
497 rm
->rm_firstdatacol
= nparity
;
498 rm
->rm_datacopy
= NULL
;
501 rm
->rm_ecksuminjected
= 0;
505 for (c
= 0; c
< scols
; c
++) {
510 coff
+= 1ULL << unit_shift
;
512 rm
->rm_col
[c
].rc_devidx
= col
;
513 rm
->rm_col
[c
].rc_offset
= coff
;
514 rm
->rm_col
[c
].rc_data
= NULL
;
515 rm
->rm_col
[c
].rc_gdata
= NULL
;
516 rm
->rm_col
[c
].rc_error
= 0;
517 rm
->rm_col
[c
].rc_tried
= 0;
518 rm
->rm_col
[c
].rc_skipped
= 0;
521 rm
->rm_col
[c
].rc_size
= 0;
523 rm
->rm_col
[c
].rc_size
= (q
+ 1) << unit_shift
;
525 rm
->rm_col
[c
].rc_size
= q
<< unit_shift
;
527 asize
+= rm
->rm_col
[c
].rc_size
;
530 ASSERT3U(asize
, ==, tot
<< unit_shift
);
531 rm
->rm_asize
= roundup(asize
, (nparity
+ 1) << unit_shift
);
532 rm
->rm_nskip
= roundup(tot
, nparity
+ 1) - tot
;
533 ASSERT3U(rm
->rm_asize
- asize
, ==, rm
->rm_nskip
<< unit_shift
);
534 ASSERT3U(rm
->rm_nskip
, <=, nparity
);
536 for (c
= 0; c
< rm
->rm_firstdatacol
; c
++)
537 rm
->rm_col
[c
].rc_data
= zio_buf_alloc(rm
->rm_col
[c
].rc_size
);
539 rm
->rm_col
[c
].rc_data
= zio
->io_data
;
541 for (c
= c
+ 1; c
< acols
; c
++)
542 rm
->rm_col
[c
].rc_data
= (char *)rm
->rm_col
[c
- 1].rc_data
+
543 rm
->rm_col
[c
- 1].rc_size
;
546 * If all data stored spans all columns, there's a danger that parity
547 * will always be on the same device and, since parity isn't read
548 * during normal operation, that that device's I/O bandwidth won't be
549 * used effectively. We therefore switch the parity every 1MB.
551 * ... at least that was, ostensibly, the theory. As a practical
552 * matter unless we juggle the parity between all devices evenly, we
553 * won't see any benefit. Further, occasional writes that aren't a
554 * multiple of the LCM of the number of children and the minimum
555 * stripe width are sufficient to avoid pessimal behavior.
556 * Unfortunately, this decision created an implicit on-disk format
557 * requirement that we need to support for all eternity, but only
558 * for single-parity RAID-Z.
560 * If we intend to skip a sector in the zeroth column for padding
561 * we must make sure to note this swap. We will never intend to
562 * skip the first column since at least one data and one parity
563 * column must appear in each row.
565 ASSERT(rm
->rm_cols
>= 2);
566 ASSERT(rm
->rm_col
[0].rc_size
== rm
->rm_col
[1].rc_size
);
568 if (rm
->rm_firstdatacol
== 1 && (zio
->io_offset
& (1ULL << 20))) {
569 devidx
= rm
->rm_col
[0].rc_devidx
;
570 o
= rm
->rm_col
[0].rc_offset
;
571 rm
->rm_col
[0].rc_devidx
= rm
->rm_col
[1].rc_devidx
;
572 rm
->rm_col
[0].rc_offset
= rm
->rm_col
[1].rc_offset
;
573 rm
->rm_col
[1].rc_devidx
= devidx
;
574 rm
->rm_col
[1].rc_offset
= o
;
576 if (rm
->rm_skipstart
== 0)
577 rm
->rm_skipstart
= 1;
581 zio
->io_vsd_ops
= &vdev_raidz_vsd_ops
;
586 vdev_raidz_generate_parity_p(raidz_map_t
*rm
)
588 uint64_t *p
, *src
, pcount
, ccount
, i
;
591 pcount
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
/ sizeof (src
[0]);
593 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
594 src
= rm
->rm_col
[c
].rc_data
;
595 p
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
;
596 ccount
= rm
->rm_col
[c
].rc_size
/ sizeof (src
[0]);
598 if (c
== rm
->rm_firstdatacol
) {
599 ASSERT(ccount
== pcount
);
600 for (i
= 0; i
< ccount
; i
++, src
++, p
++) {
604 ASSERT(ccount
<= pcount
);
605 for (i
= 0; i
< ccount
; i
++, src
++, p
++) {
613 vdev_raidz_generate_parity_pq(raidz_map_t
*rm
)
615 uint64_t *p
, *q
, *src
, pcnt
, ccnt
, mask
, i
;
618 pcnt
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
/ sizeof (src
[0]);
619 ASSERT(rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
==
620 rm
->rm_col
[VDEV_RAIDZ_Q
].rc_size
);
622 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
623 src
= rm
->rm_col
[c
].rc_data
;
624 p
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
;
625 q
= rm
->rm_col
[VDEV_RAIDZ_Q
].rc_data
;
627 ccnt
= rm
->rm_col
[c
].rc_size
/ sizeof (src
[0]);
629 if (c
== rm
->rm_firstdatacol
) {
630 ASSERT(ccnt
== pcnt
|| ccnt
== 0);
631 for (i
= 0; i
< ccnt
; i
++, src
++, p
++, q
++) {
635 for (; i
< pcnt
; i
++, src
++, p
++, q
++) {
640 ASSERT(ccnt
<= pcnt
);
643 * Apply the algorithm described above by multiplying
644 * the previous result and adding in the new value.
646 for (i
= 0; i
< ccnt
; i
++, src
++, p
++, q
++) {
649 VDEV_RAIDZ_64MUL_2(*q
, mask
);
654 * Treat short columns as though they are full of 0s.
655 * Note that there's therefore nothing needed for P.
657 for (; i
< pcnt
; i
++, q
++) {
658 VDEV_RAIDZ_64MUL_2(*q
, mask
);
665 vdev_raidz_generate_parity_pqr(raidz_map_t
*rm
)
667 uint64_t *p
, *q
, *r
, *src
, pcnt
, ccnt
, mask
, i
;
670 pcnt
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
/ sizeof (src
[0]);
671 ASSERT(rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
==
672 rm
->rm_col
[VDEV_RAIDZ_Q
].rc_size
);
673 ASSERT(rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
==
674 rm
->rm_col
[VDEV_RAIDZ_R
].rc_size
);
676 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
677 src
= rm
->rm_col
[c
].rc_data
;
678 p
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
;
679 q
= rm
->rm_col
[VDEV_RAIDZ_Q
].rc_data
;
680 r
= rm
->rm_col
[VDEV_RAIDZ_R
].rc_data
;
682 ccnt
= rm
->rm_col
[c
].rc_size
/ sizeof (src
[0]);
684 if (c
== rm
->rm_firstdatacol
) {
685 ASSERT(ccnt
== pcnt
|| ccnt
== 0);
686 for (i
= 0; i
< ccnt
; i
++, src
++, p
++, q
++, r
++) {
691 for (; i
< pcnt
; i
++, src
++, p
++, q
++, r
++) {
697 ASSERT(ccnt
<= pcnt
);
700 * Apply the algorithm described above by multiplying
701 * the previous result and adding in the new value.
703 for (i
= 0; i
< ccnt
; i
++, src
++, p
++, q
++, r
++) {
706 VDEV_RAIDZ_64MUL_2(*q
, mask
);
709 VDEV_RAIDZ_64MUL_4(*r
, mask
);
714 * Treat short columns as though they are full of 0s.
715 * Note that there's therefore nothing needed for P.
717 for (; i
< pcnt
; i
++, q
++, r
++) {
718 VDEV_RAIDZ_64MUL_2(*q
, mask
);
719 VDEV_RAIDZ_64MUL_4(*r
, mask
);
726 * Generate RAID parity in the first virtual columns according to the number of
727 * parity columns available.
730 vdev_raidz_generate_parity(raidz_map_t
*rm
)
732 switch (rm
->rm_firstdatacol
) {
734 vdev_raidz_generate_parity_p(rm
);
737 vdev_raidz_generate_parity_pq(rm
);
740 vdev_raidz_generate_parity_pqr(rm
);
743 cmn_err(CE_PANIC
, "invalid RAID-Z configuration");
748 vdev_raidz_reconstruct_p(raidz_map_t
*rm
, int *tgts
, int ntgts
)
750 uint64_t *dst
, *src
, xcount
, ccount
, count
, i
;
755 ASSERT(x
>= rm
->rm_firstdatacol
);
756 ASSERT(x
< rm
->rm_cols
);
758 xcount
= rm
->rm_col
[x
].rc_size
/ sizeof (src
[0]);
759 ASSERT(xcount
<= rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
/ sizeof (src
[0]));
762 src
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
;
763 dst
= rm
->rm_col
[x
].rc_data
;
764 for (i
= 0; i
< xcount
; i
++, dst
++, src
++) {
768 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
769 src
= rm
->rm_col
[c
].rc_data
;
770 dst
= rm
->rm_col
[x
].rc_data
;
775 ccount
= rm
->rm_col
[c
].rc_size
/ sizeof (src
[0]);
776 count
= MIN(ccount
, xcount
);
778 for (i
= 0; i
< count
; i
++, dst
++, src
++) {
783 return (1 << VDEV_RAIDZ_P
);
787 vdev_raidz_reconstruct_q(raidz_map_t
*rm
, int *tgts
, int ntgts
)
789 uint64_t *dst
, *src
, xcount
, ccount
, count
, mask
, i
;
796 xcount
= rm
->rm_col
[x
].rc_size
/ sizeof (src
[0]);
797 ASSERT(xcount
<= rm
->rm_col
[VDEV_RAIDZ_Q
].rc_size
/ sizeof (src
[0]));
799 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
800 src
= rm
->rm_col
[c
].rc_data
;
801 dst
= rm
->rm_col
[x
].rc_data
;
806 ccount
= rm
->rm_col
[c
].rc_size
/ sizeof (src
[0]);
808 count
= MIN(ccount
, xcount
);
810 if (c
== rm
->rm_firstdatacol
) {
811 for (i
= 0; i
< count
; i
++, dst
++, src
++) {
814 for (; i
< xcount
; i
++, dst
++) {
819 for (i
= 0; i
< count
; i
++, dst
++, src
++) {
820 VDEV_RAIDZ_64MUL_2(*dst
, mask
);
824 for (; i
< xcount
; i
++, dst
++) {
825 VDEV_RAIDZ_64MUL_2(*dst
, mask
);
830 src
= rm
->rm_col
[VDEV_RAIDZ_Q
].rc_data
;
831 dst
= rm
->rm_col
[x
].rc_data
;
832 exp
= 255 - (rm
->rm_cols
- 1 - x
);
834 for (i
= 0; i
< xcount
; i
++, dst
++, src
++) {
836 for (j
= 0, b
= (uint8_t *)dst
; j
< 8; j
++, b
++) {
837 *b
= vdev_raidz_exp2(*b
, exp
);
841 return (1 << VDEV_RAIDZ_Q
);
845 vdev_raidz_reconstruct_pq(raidz_map_t
*rm
, int *tgts
, int ntgts
)
847 uint8_t *p
, *q
, *pxy
, *qxy
, *xd
, *yd
, tmp
, a
, b
, aexp
, bexp
;
849 uint64_t xsize
, ysize
, i
;
855 ASSERT(x
>= rm
->rm_firstdatacol
);
856 ASSERT(y
< rm
->rm_cols
);
858 ASSERT(rm
->rm_col
[x
].rc_size
>= rm
->rm_col
[y
].rc_size
);
861 * Move the parity data aside -- we're going to compute parity as
862 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
863 * reuse the parity generation mechanism without trashing the actual
864 * parity so we make those columns appear to be full of zeros by
865 * setting their lengths to zero.
867 pdata
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
;
868 qdata
= rm
->rm_col
[VDEV_RAIDZ_Q
].rc_data
;
869 xsize
= rm
->rm_col
[x
].rc_size
;
870 ysize
= rm
->rm_col
[y
].rc_size
;
872 rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
=
873 zio_buf_alloc(rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
);
874 rm
->rm_col
[VDEV_RAIDZ_Q
].rc_data
=
875 zio_buf_alloc(rm
->rm_col
[VDEV_RAIDZ_Q
].rc_size
);
876 rm
->rm_col
[x
].rc_size
= 0;
877 rm
->rm_col
[y
].rc_size
= 0;
879 vdev_raidz_generate_parity_pq(rm
);
881 rm
->rm_col
[x
].rc_size
= xsize
;
882 rm
->rm_col
[y
].rc_size
= ysize
;
886 pxy
= rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
;
887 qxy
= rm
->rm_col
[VDEV_RAIDZ_Q
].rc_data
;
888 xd
= rm
->rm_col
[x
].rc_data
;
889 yd
= rm
->rm_col
[y
].rc_data
;
893 * Pxy = P + D_x + D_y
894 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
896 * We can then solve for D_x:
897 * D_x = A * (P + Pxy) + B * (Q + Qxy)
899 * A = 2^(x - y) * (2^(x - y) + 1)^-1
900 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
902 * With D_x in hand, we can easily solve for D_y:
903 * D_y = P + Pxy + D_x
906 a
= vdev_raidz_pow2
[255 + x
- y
];
907 b
= vdev_raidz_pow2
[255 - (rm
->rm_cols
- 1 - x
)];
908 tmp
= 255 - vdev_raidz_log2
[a
^ 1];
910 aexp
= vdev_raidz_log2
[vdev_raidz_exp2(a
, tmp
)];
911 bexp
= vdev_raidz_log2
[vdev_raidz_exp2(b
, tmp
)];
913 for (i
= 0; i
< xsize
; i
++, p
++, q
++, pxy
++, qxy
++, xd
++, yd
++) {
914 *xd
= vdev_raidz_exp2(*p
^ *pxy
, aexp
) ^
915 vdev_raidz_exp2(*q
^ *qxy
, bexp
);
918 *yd
= *p
^ *pxy
^ *xd
;
921 zio_buf_free(rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
,
922 rm
->rm_col
[VDEV_RAIDZ_P
].rc_size
);
923 zio_buf_free(rm
->rm_col
[VDEV_RAIDZ_Q
].rc_data
,
924 rm
->rm_col
[VDEV_RAIDZ_Q
].rc_size
);
927 * Restore the saved parity data.
929 rm
->rm_col
[VDEV_RAIDZ_P
].rc_data
= pdata
;
930 rm
->rm_col
[VDEV_RAIDZ_Q
].rc_data
= qdata
;
932 return ((1 << VDEV_RAIDZ_P
) | (1 << VDEV_RAIDZ_Q
));
937 * In the general case of reconstruction, we must solve the system of linear
938 * equations defined by the coefficients used to generate parity as well as
939 * the contents of the data and parity disks. This can be expressed with
940 * vectors for the original data (D) and the actual data (d) and parity (p)
941 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
945 * | V | | D_0 | | p_m-1 |
946 * | | x | : | = | d_0 |
947 * | I | | D_n-1 | | : |
948 * | | ~~ ~~ | d_n-1 |
951 * I is simply a square identity matrix of size n, and V is a vandermonde
952 * matrix defined by the coefficients we chose for the various parity columns
953 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
954 * computation as well as linear separability.
957 * | 1 .. 1 1 1 | | p_0 |
958 * | 2^n-1 .. 4 2 1 | __ __ | : |
959 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
960 * | 1 .. 0 0 0 | | D_1 | | d_0 |
961 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
962 * | : : : : | | : | | d_2 |
963 * | 0 .. 1 0 0 | | D_n-1 | | : |
964 * | 0 .. 0 1 0 | ~~ ~~ | : |
965 * | 0 .. 0 0 1 | | d_n-1 |
968 * Note that I, V, d, and p are known. To compute D, we must invert the
969 * matrix and use the known data and parity values to reconstruct the unknown
970 * data values. We begin by removing the rows in V|I and d|p that correspond
971 * to failed or missing columns; we then make V|I square (n x n) and d|p
972 * sized n by removing rows corresponding to unused parity from the bottom up
973 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
974 * using Gauss-Jordan elimination. In the example below we use m=3 parity
975 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
977 * | 1 1 1 1 1 1 1 1 |
978 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
979 * | 19 205 116 29 64 16 4 1 | / /
980 * | 1 0 0 0 0 0 0 0 | / /
981 * | 0 1 0 0 0 0 0 0 | <--' /
982 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
983 * | 0 0 0 1 0 0 0 0 |
984 * | 0 0 0 0 1 0 0 0 |
985 * | 0 0 0 0 0 1 0 0 |
986 * | 0 0 0 0 0 0 1 0 |
987 * | 0 0 0 0 0 0 0 1 |
990 * | 1 1 1 1 1 1 1 1 |
991 * | 128 64 32 16 8 4 2 1 |
992 * | 19 205 116 29 64 16 4 1 |
993 * | 1 0 0 0 0 0 0 0 |
994 * | 0 1 0 0 0 0 0 0 |
995 * (V|I)' = | 0 0 1 0 0 0 0 0 |
996 * | 0 0 0 1 0 0 0 0 |
997 * | 0 0 0 0 1 0 0 0 |
998 * | 0 0 0 0 0 1 0 0 |
999 * | 0 0 0 0 0 0 1 0 |
1000 * | 0 0 0 0 0 0 0 1 |
1003 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1004 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1005 * matrix is not singular.
1007 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1008 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1009 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1010 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1011 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1012 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1013 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1014 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1017 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1018 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1019 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1020 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1021 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1022 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1023 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1024 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1027 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1028 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1029 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1030 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1031 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1032 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1033 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1034 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1037 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1038 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1039 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1040 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1041 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1042 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1043 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1044 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1047 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1048 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1049 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1050 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1051 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1052 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1053 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1054 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1057 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1058 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1059 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1060 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1061 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1062 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1063 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1064 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1067 * | 0 0 1 0 0 0 0 0 |
1068 * | 167 100 5 41 159 169 217 208 |
1069 * | 166 100 4 40 158 168 216 209 |
1070 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1071 * | 0 0 0 0 1 0 0 0 |
1072 * | 0 0 0 0 0 1 0 0 |
1073 * | 0 0 0 0 0 0 1 0 |
1074 * | 0 0 0 0 0 0 0 1 |
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
1090 vdev_raidz_matrix_init(raidz_map_t
*rm
, int n
, int nmap
, int *map
,
1096 ASSERT(n
== rm
->rm_cols
- rm
->rm_firstdatacol
);
1099 * Fill in the missing rows of interest.
1101 for (i
= 0; i
< nmap
; i
++) {
1102 ASSERT3S(0, <=, map
[i
]);
1103 ASSERT3S(map
[i
], <=, 2);
1110 for (j
= 0; j
< n
; j
++) {
1114 rows
[i
][j
] = vdev_raidz_pow2
[pow
];
1120 vdev_raidz_matrix_invert(raidz_map_t
*rm
, int n
, int nmissing
, int *missing
,
1121 uint8_t **rows
, uint8_t **invrows
, const uint8_t *used
)
1127 * Assert that the first nmissing entries from the array of used
1128 * columns correspond to parity columns and that subsequent entries
1129 * correspond to data columns.
1131 for (i
= 0; i
< nmissing
; i
++) {
1132 ASSERT3S(used
[i
], <, rm
->rm_firstdatacol
);
1134 for (; i
< n
; i
++) {
1135 ASSERT3S(used
[i
], >=, rm
->rm_firstdatacol
);
1139 * First initialize the storage where we'll compute the inverse rows.
1141 for (i
= 0; i
< nmissing
; i
++) {
1142 for (j
= 0; j
< n
; j
++) {
1143 invrows
[i
][j
] = (i
== j
) ? 1 : 0;
1148 * Subtract all trivial rows from the rows of consequence.
1150 for (i
= 0; i
< nmissing
; i
++) {
1151 for (j
= nmissing
; j
< n
; j
++) {
1152 ASSERT3U(used
[j
], >=, rm
->rm_firstdatacol
);
1153 jj
= used
[j
] - rm
->rm_firstdatacol
;
1155 invrows
[i
][j
] = rows
[i
][jj
];
1161 * For each of the rows of interest, we must normalize it and subtract
1162 * a multiple of it from the other rows.
1164 for (i
= 0; i
< nmissing
; i
++) {
1165 for (j
= 0; j
< missing
[i
]; j
++) {
1166 ASSERT0(rows
[i
][j
]);
1168 ASSERT3U(rows
[i
][missing
[i
]], !=, 0);
1171 * Compute the inverse of the first element and multiply each
1172 * element in the row by that value.
1174 log
= 255 - vdev_raidz_log2
[rows
[i
][missing
[i
]]];
1176 for (j
= 0; j
< n
; j
++) {
1177 rows
[i
][j
] = vdev_raidz_exp2(rows
[i
][j
], log
);
1178 invrows
[i
][j
] = vdev_raidz_exp2(invrows
[i
][j
], log
);
1181 for (ii
= 0; ii
< nmissing
; ii
++) {
1185 ASSERT3U(rows
[ii
][missing
[i
]], !=, 0);
1187 log
= vdev_raidz_log2
[rows
[ii
][missing
[i
]]];
1189 for (j
= 0; j
< n
; j
++) {
1191 vdev_raidz_exp2(rows
[i
][j
], log
);
1193 vdev_raidz_exp2(invrows
[i
][j
], log
);
1199 * Verify that the data that is left in the rows are properly part of
1200 * an identity matrix.
1202 for (i
= 0; i
< nmissing
; i
++) {
1203 for (j
= 0; j
< n
; j
++) {
1204 if (j
== missing
[i
]) {
1205 ASSERT3U(rows
[i
][j
], ==, 1);
1207 ASSERT0(rows
[i
][j
]);
1214 vdev_raidz_matrix_reconstruct(raidz_map_t
*rm
, int n
, int nmissing
,
1215 int *missing
, uint8_t **invrows
, const uint8_t *used
)
1220 uint8_t *dst
[VDEV_RAIDZ_MAXPARITY
];
1221 uint64_t dcount
[VDEV_RAIDZ_MAXPARITY
];
1225 uint8_t *invlog
[VDEV_RAIDZ_MAXPARITY
];
1229 psize
= sizeof (invlog
[0][0]) * n
* nmissing
;
1230 p
= kmem_alloc(psize
, KM_SLEEP
);
1232 for (pp
= p
, i
= 0; i
< nmissing
; i
++) {
1237 for (i
= 0; i
< nmissing
; i
++) {
1238 for (j
= 0; j
< n
; j
++) {
1239 ASSERT3U(invrows
[i
][j
], !=, 0);
1240 invlog
[i
][j
] = vdev_raidz_log2
[invrows
[i
][j
]];
1244 for (i
= 0; i
< n
; i
++) {
1246 ASSERT3U(c
, <, rm
->rm_cols
);
1248 src
= rm
->rm_col
[c
].rc_data
;
1249 ccount
= rm
->rm_col
[c
].rc_size
;
1250 for (j
= 0; j
< nmissing
; j
++) {
1251 cc
= missing
[j
] + rm
->rm_firstdatacol
;
1252 ASSERT3U(cc
, >=, rm
->rm_firstdatacol
);
1253 ASSERT3U(cc
, <, rm
->rm_cols
);
1254 ASSERT3U(cc
, !=, c
);
1256 dst
[j
] = rm
->rm_col
[cc
].rc_data
;
1257 dcount
[j
] = rm
->rm_col
[cc
].rc_size
;
1260 ASSERT(ccount
>= rm
->rm_col
[missing
[0]].rc_size
|| i
> 0);
1262 for (x
= 0; x
< ccount
; x
++, src
++) {
1264 log
= vdev_raidz_log2
[*src
];
1266 for (cc
= 0; cc
< nmissing
; cc
++) {
1267 if (x
>= dcount
[cc
])
1273 if ((ll
= log
+ invlog
[cc
][i
]) >= 255)
1275 val
= vdev_raidz_pow2
[ll
];
1286 kmem_free(p
, psize
);
1290 vdev_raidz_reconstruct_general(raidz_map_t
*rm
, int *tgts
, int ntgts
)
1294 int missing_rows
[VDEV_RAIDZ_MAXPARITY
];
1295 int parity_map
[VDEV_RAIDZ_MAXPARITY
];
1300 uint8_t *rows
[VDEV_RAIDZ_MAXPARITY
];
1301 uint8_t *invrows
[VDEV_RAIDZ_MAXPARITY
];
1307 n
= rm
->rm_cols
- rm
->rm_firstdatacol
;
1310 * Figure out which data columns are missing.
1313 for (t
= 0; t
< ntgts
; t
++) {
1314 if (tgts
[t
] >= rm
->rm_firstdatacol
) {
1315 missing_rows
[nmissing_rows
++] =
1316 tgts
[t
] - rm
->rm_firstdatacol
;
1321 * Figure out which parity columns to use to help generate the missing
1324 for (tt
= 0, c
= 0, i
= 0; i
< nmissing_rows
; c
++) {
1326 ASSERT(c
< rm
->rm_firstdatacol
);
1329 * Skip any targeted parity columns.
1331 if (c
== tgts
[tt
]) {
1343 ASSERT3U(code
, <, 1 << VDEV_RAIDZ_MAXPARITY
);
1345 psize
= (sizeof (rows
[0][0]) + sizeof (invrows
[0][0])) *
1346 nmissing_rows
* n
+ sizeof (used
[0]) * n
;
1347 p
= kmem_alloc(psize
, KM_SLEEP
);
1349 for (pp
= p
, i
= 0; i
< nmissing_rows
; i
++) {
1357 for (i
= 0; i
< nmissing_rows
; i
++) {
1358 used
[i
] = parity_map
[i
];
1361 for (tt
= 0, c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
1362 if (tt
< nmissing_rows
&&
1363 c
== missing_rows
[tt
] + rm
->rm_firstdatacol
) {
1374 * Initialize the interesting rows of the matrix.
1376 vdev_raidz_matrix_init(rm
, n
, nmissing_rows
, parity_map
, rows
);
1379 * Invert the matrix.
1381 vdev_raidz_matrix_invert(rm
, n
, nmissing_rows
, missing_rows
, rows
,
1385 * Reconstruct the missing data using the generated matrix.
1387 vdev_raidz_matrix_reconstruct(rm
, n
, nmissing_rows
, missing_rows
,
1390 kmem_free(p
, psize
);
1396 vdev_raidz_reconstruct(raidz_map_t
*rm
, int *t
, int nt
)
1398 int tgts
[VDEV_RAIDZ_MAXPARITY
], *dt
;
1402 int nbadparity
, nbaddata
;
1403 int parity_valid
[VDEV_RAIDZ_MAXPARITY
];
1406 * The tgts list must already be sorted.
1408 for (i
= 1; i
< nt
; i
++) {
1409 ASSERT(t
[i
] > t
[i
- 1]);
1412 nbadparity
= rm
->rm_firstdatacol
;
1413 nbaddata
= rm
->rm_cols
- nbadparity
;
1415 for (i
= 0, c
= 0; c
< rm
->rm_cols
; c
++) {
1416 if (c
< rm
->rm_firstdatacol
)
1417 parity_valid
[c
] = B_FALSE
;
1419 if (i
< nt
&& c
== t
[i
]) {
1422 } else if (rm
->rm_col
[c
].rc_error
!= 0) {
1424 } else if (c
>= rm
->rm_firstdatacol
) {
1427 parity_valid
[c
] = B_TRUE
;
1432 ASSERT(ntgts
>= nt
);
1433 ASSERT(nbaddata
>= 0);
1434 ASSERT(nbaddata
+ nbadparity
== ntgts
);
1436 dt
= &tgts
[nbadparity
];
1439 * See if we can use any of our optimized reconstruction routines.
1441 if (!vdev_raidz_default_to_general
) {
1444 if (parity_valid
[VDEV_RAIDZ_P
])
1445 return (vdev_raidz_reconstruct_p(rm
, dt
, 1));
1447 ASSERT(rm
->rm_firstdatacol
> 1);
1449 if (parity_valid
[VDEV_RAIDZ_Q
])
1450 return (vdev_raidz_reconstruct_q(rm
, dt
, 1));
1452 ASSERT(rm
->rm_firstdatacol
> 2);
1456 ASSERT(rm
->rm_firstdatacol
> 1);
1458 if (parity_valid
[VDEV_RAIDZ_P
] &&
1459 parity_valid
[VDEV_RAIDZ_Q
])
1460 return (vdev_raidz_reconstruct_pq(rm
, dt
, 2));
1462 ASSERT(rm
->rm_firstdatacol
> 2);
1468 code
= vdev_raidz_reconstruct_general(rm
, tgts
, ntgts
);
1469 ASSERT(code
< (1 << VDEV_RAIDZ_MAXPARITY
));
1475 vdev_raidz_open(vdev_t
*vd
, uint64_t *asize
, uint64_t *max_asize
,
1479 uint64_t nparity
= vd
->vdev_nparity
;
1484 ASSERT(nparity
> 0);
1486 if (nparity
> VDEV_RAIDZ_MAXPARITY
||
1487 vd
->vdev_children
< nparity
+ 1) {
1488 vd
->vdev_stat
.vs_aux
= VDEV_AUX_BAD_LABEL
;
1489 return (SET_ERROR(EINVAL
));
1492 vdev_open_children(vd
);
1494 for (c
= 0; c
< vd
->vdev_children
; c
++) {
1495 cvd
= vd
->vdev_child
[c
];
1497 if (cvd
->vdev_open_error
!= 0) {
1498 lasterror
= cvd
->vdev_open_error
;
1503 *asize
= MIN(*asize
- 1, cvd
->vdev_asize
- 1) + 1;
1504 *max_asize
= MIN(*max_asize
- 1, cvd
->vdev_max_asize
- 1) + 1;
1505 *ashift
= MAX(*ashift
, cvd
->vdev_ashift
);
1508 *asize
*= vd
->vdev_children
;
1509 *max_asize
*= vd
->vdev_children
;
1511 if (numerrors
> nparity
) {
1512 vd
->vdev_stat
.vs_aux
= VDEV_AUX_NO_REPLICAS
;
1520 vdev_raidz_close(vdev_t
*vd
)
1524 for (c
= 0; c
< vd
->vdev_children
; c
++)
1525 vdev_close(vd
->vdev_child
[c
]);
1529 vdev_raidz_asize(vdev_t
*vd
, uint64_t psize
)
1532 uint64_t ashift
= vd
->vdev_top
->vdev_ashift
;
1533 uint64_t cols
= vd
->vdev_children
;
1534 uint64_t nparity
= vd
->vdev_nparity
;
1536 asize
= ((psize
- 1) >> ashift
) + 1;
1537 asize
+= nparity
* ((asize
+ cols
- nparity
- 1) / (cols
- nparity
));
1538 asize
= roundup(asize
, nparity
+ 1) << ashift
;
1544 vdev_raidz_child_done(zio_t
*zio
)
1546 raidz_col_t
*rc
= zio
->io_private
;
1548 rc
->rc_error
= zio
->io_error
;
1554 * Start an IO operation on a RAIDZ VDev
1557 * - For write operations:
1558 * 1. Generate the parity data
1559 * 2. Create child zio write operations to each column's vdev, for both
1561 * 3. If the column skips any sectors for padding, create optional dummy
1562 * write zio children for those areas to improve aggregation continuity.
1563 * - For read operations:
1564 * 1. Create child zio read operations to each data column's vdev to read
1565 * the range of data required for zio.
1566 * 2. If this is a scrub or resilver operation, or if any of the data
1567 * vdevs have had errors, then create zio read operations to the parity
1568 * columns' VDevs as well.
1571 vdev_raidz_io_start(zio_t
*zio
)
1573 vdev_t
*vd
= zio
->io_vd
;
1574 vdev_t
*tvd
= vd
->vdev_top
;
1580 rm
= vdev_raidz_map_alloc(zio
, tvd
->vdev_ashift
, vd
->vdev_children
,
1583 ASSERT3U(rm
->rm_asize
, ==, vdev_psize_to_asize(vd
, zio
->io_size
));
1585 if (zio
->io_type
== ZIO_TYPE_WRITE
) {
1586 vdev_raidz_generate_parity(rm
);
1588 for (c
= 0; c
< rm
->rm_cols
; c
++) {
1589 rc
= &rm
->rm_col
[c
];
1590 cvd
= vd
->vdev_child
[rc
->rc_devidx
];
1591 zio_nowait(zio_vdev_child_io(zio
, NULL
, cvd
,
1592 rc
->rc_offset
, rc
->rc_data
, rc
->rc_size
,
1593 zio
->io_type
, zio
->io_priority
, 0,
1594 vdev_raidz_child_done
, rc
));
1598 * Generate optional I/Os for any skipped sectors to improve
1599 * aggregation contiguity.
1601 for (c
= rm
->rm_skipstart
, i
= 0; i
< rm
->rm_nskip
; c
++, i
++) {
1602 ASSERT(c
<= rm
->rm_scols
);
1603 if (c
== rm
->rm_scols
)
1605 rc
= &rm
->rm_col
[c
];
1606 cvd
= vd
->vdev_child
[rc
->rc_devidx
];
1607 zio_nowait(zio_vdev_child_io(zio
, NULL
, cvd
,
1608 rc
->rc_offset
+ rc
->rc_size
, NULL
,
1609 1 << tvd
->vdev_ashift
,
1610 zio
->io_type
, zio
->io_priority
,
1611 ZIO_FLAG_NODATA
| ZIO_FLAG_OPTIONAL
, NULL
, NULL
));
1618 ASSERT(zio
->io_type
== ZIO_TYPE_READ
);
1621 * Iterate over the columns in reverse order so that we hit the parity
1622 * last -- any errors along the way will force us to read the parity.
1624 for (c
= rm
->rm_cols
- 1; c
>= 0; c
--) {
1625 rc
= &rm
->rm_col
[c
];
1626 cvd
= vd
->vdev_child
[rc
->rc_devidx
];
1627 if (!vdev_readable(cvd
)) {
1628 if (c
>= rm
->rm_firstdatacol
)
1629 rm
->rm_missingdata
++;
1631 rm
->rm_missingparity
++;
1632 rc
->rc_error
= SET_ERROR(ENXIO
);
1633 rc
->rc_tried
= 1; /* don't even try */
1637 if (vdev_dtl_contains(cvd
, DTL_MISSING
, zio
->io_txg
, 1)) {
1638 if (c
>= rm
->rm_firstdatacol
)
1639 rm
->rm_missingdata
++;
1641 rm
->rm_missingparity
++;
1642 rc
->rc_error
= SET_ERROR(ESTALE
);
1646 if (c
>= rm
->rm_firstdatacol
|| rm
->rm_missingdata
> 0 ||
1647 (zio
->io_flags
& (ZIO_FLAG_SCRUB
| ZIO_FLAG_RESILVER
))) {
1648 zio_nowait(zio_vdev_child_io(zio
, NULL
, cvd
,
1649 rc
->rc_offset
, rc
->rc_data
, rc
->rc_size
,
1650 zio
->io_type
, zio
->io_priority
, 0,
1651 vdev_raidz_child_done
, rc
));
1660 * Report a checksum error for a child of a RAID-Z device.
1663 raidz_checksum_error(zio_t
*zio
, raidz_col_t
*rc
, void *bad_data
)
1665 vdev_t
*vd
= zio
->io_vd
->vdev_child
[rc
->rc_devidx
];
1667 if (!(zio
->io_flags
& ZIO_FLAG_SPECULATIVE
)) {
1668 zio_bad_cksum_t zbc
;
1669 raidz_map_t
*rm
= zio
->io_vsd
;
1671 mutex_enter(&vd
->vdev_stat_lock
);
1672 vd
->vdev_stat
.vs_checksum_errors
++;
1673 mutex_exit(&vd
->vdev_stat_lock
);
1675 zbc
.zbc_has_cksum
= 0;
1676 zbc
.zbc_injected
= rm
->rm_ecksuminjected
;
1678 zfs_ereport_post_checksum(zio
->io_spa
, vd
, zio
,
1679 rc
->rc_offset
, rc
->rc_size
, rc
->rc_data
, bad_data
,
1685 * We keep track of whether or not there were any injected errors, so that
1686 * any ereports we generate can note it.
1689 raidz_checksum_verify(zio_t
*zio
)
1691 zio_bad_cksum_t zbc
;
1692 raidz_map_t
*rm
= zio
->io_vsd
;
1695 bzero(&zbc
, sizeof (zio_bad_cksum_t
));
1697 ret
= zio_checksum_error(zio
, &zbc
);
1698 if (ret
!= 0 && zbc
.zbc_injected
!= 0)
1699 rm
->rm_ecksuminjected
= 1;
1705 * Generate the parity from the data columns. If we tried and were able to
1706 * read the parity without error, verify that the generated parity matches the
1707 * data we read. If it doesn't, we fire off a checksum error. Return the
1708 * number such failures.
1711 raidz_parity_verify(zio_t
*zio
, raidz_map_t
*rm
)
1713 void *orig
[VDEV_RAIDZ_MAXPARITY
];
1717 for (c
= 0; c
< rm
->rm_firstdatacol
; c
++) {
1718 rc
= &rm
->rm_col
[c
];
1719 if (!rc
->rc_tried
|| rc
->rc_error
!= 0)
1721 orig
[c
] = zio_buf_alloc(rc
->rc_size
);
1722 bcopy(rc
->rc_data
, orig
[c
], rc
->rc_size
);
1725 vdev_raidz_generate_parity(rm
);
1727 for (c
= 0; c
< rm
->rm_firstdatacol
; c
++) {
1728 rc
= &rm
->rm_col
[c
];
1729 if (!rc
->rc_tried
|| rc
->rc_error
!= 0)
1731 if (bcmp(orig
[c
], rc
->rc_data
, rc
->rc_size
) != 0) {
1732 raidz_checksum_error(zio
, rc
, orig
[c
]);
1733 rc
->rc_error
= SET_ERROR(ECKSUM
);
1736 zio_buf_free(orig
[c
], rc
->rc_size
);
1743 * Keep statistics on all the ways that we used parity to correct data.
1745 static uint64_t raidz_corrected
[1 << VDEV_RAIDZ_MAXPARITY
];
1748 vdev_raidz_worst_error(raidz_map_t
*rm
)
1752 for (c
= 0; c
< rm
->rm_cols
; c
++)
1753 error
= zio_worst_error(error
, rm
->rm_col
[c
].rc_error
);
1759 * Iterate over all combinations of bad data and attempt a reconstruction.
1760 * Note that the algorithm below is non-optimal because it doesn't take into
1761 * account how reconstruction is actually performed. For example, with
1762 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1763 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1764 * cases we'd only use parity information in column 0.
1767 vdev_raidz_combrec(zio_t
*zio
, int total_errors
, int data_errors
)
1769 raidz_map_t
*rm
= zio
->io_vsd
;
1771 void *orig
[VDEV_RAIDZ_MAXPARITY
];
1772 int tstore
[VDEV_RAIDZ_MAXPARITY
+ 2];
1773 int *tgts
= &tstore
[1];
1774 int curr
, next
, i
, c
, n
;
1777 ASSERT(total_errors
< rm
->rm_firstdatacol
);
1780 * This simplifies one edge condition.
1784 for (n
= 1; n
<= rm
->rm_firstdatacol
- total_errors
; n
++) {
1786 * Initialize the targets array by finding the first n columns
1787 * that contain no error.
1789 * If there were no data errors, we need to ensure that we're
1790 * always explicitly attempting to reconstruct at least one
1791 * data column. To do this, we simply push the highest target
1792 * up into the data columns.
1794 for (c
= 0, i
= 0; i
< n
; i
++) {
1795 if (i
== n
- 1 && data_errors
== 0 &&
1796 c
< rm
->rm_firstdatacol
) {
1797 c
= rm
->rm_firstdatacol
;
1800 while (rm
->rm_col
[c
].rc_error
!= 0) {
1802 ASSERT3S(c
, <, rm
->rm_cols
);
1809 * Setting tgts[n] simplifies the other edge condition.
1811 tgts
[n
] = rm
->rm_cols
;
1814 * These buffers were allocated in previous iterations.
1816 for (i
= 0; i
< n
- 1; i
++) {
1817 ASSERT(orig
[i
] != NULL
);
1820 orig
[n
- 1] = zio_buf_alloc(rm
->rm_col
[0].rc_size
);
1830 * Save off the original data that we're going to
1831 * attempt to reconstruct.
1833 for (i
= 0; i
< n
; i
++) {
1834 ASSERT(orig
[i
] != NULL
);
1837 ASSERT3S(c
, <, rm
->rm_cols
);
1838 rc
= &rm
->rm_col
[c
];
1839 bcopy(rc
->rc_data
, orig
[i
], rc
->rc_size
);
1843 * Attempt a reconstruction and exit the outer loop on
1846 code
= vdev_raidz_reconstruct(rm
, tgts
, n
);
1847 if (raidz_checksum_verify(zio
) == 0) {
1848 atomic_inc_64(&raidz_corrected
[code
]);
1850 for (i
= 0; i
< n
; i
++) {
1852 rc
= &rm
->rm_col
[c
];
1853 ASSERT(rc
->rc_error
== 0);
1855 raidz_checksum_error(zio
, rc
,
1857 rc
->rc_error
= SET_ERROR(ECKSUM
);
1865 * Restore the original data.
1867 for (i
= 0; i
< n
; i
++) {
1869 rc
= &rm
->rm_col
[c
];
1870 bcopy(orig
[i
], rc
->rc_data
, rc
->rc_size
);
1875 * Find the next valid column after the curr
1878 for (next
= tgts
[curr
] + 1;
1879 next
< rm
->rm_cols
&&
1880 rm
->rm_col
[next
].rc_error
!= 0; next
++)
1883 ASSERT(next
<= tgts
[curr
+ 1]);
1886 * If that spot is available, we're done here.
1888 if (next
!= tgts
[curr
+ 1])
1892 * Otherwise, find the next valid column after
1893 * the previous position.
1895 for (c
= tgts
[curr
- 1] + 1;
1896 rm
->rm_col
[c
].rc_error
!= 0; c
++)
1902 } while (curr
!= n
);
1907 for (i
= 0; i
< n
; i
++) {
1908 zio_buf_free(orig
[i
], rm
->rm_col
[0].rc_size
);
1915 * Complete an IO operation on a RAIDZ VDev
1918 * - For write operations:
1919 * 1. Check for errors on the child IOs.
1920 * 2. Return, setting an error code if too few child VDevs were written
1921 * to reconstruct the data later. Note that partial writes are
1922 * considered successful if they can be reconstructed at all.
1923 * - For read operations:
1924 * 1. Check for errors on the child IOs.
1925 * 2. If data errors occurred:
1926 * a. Try to reassemble the data from the parity available.
1927 * b. If we haven't yet read the parity drives, read them now.
1928 * c. If all parity drives have been read but the data still doesn't
1929 * reassemble with a correct checksum, then try combinatorial
1931 * d. If that doesn't work, return an error.
1932 * 3. If there were unexpected errors or this is a resilver operation,
1933 * rewrite the vdevs that had errors.
1936 vdev_raidz_io_done(zio_t
*zio
)
1938 vdev_t
*vd
= zio
->io_vd
;
1940 raidz_map_t
*rm
= zio
->io_vsd
;
1941 raidz_col_t
*rc
= NULL
;
1942 int unexpected_errors
= 0;
1943 int parity_errors
= 0;
1944 int parity_untried
= 0;
1945 int data_errors
= 0;
1946 int total_errors
= 0;
1948 int tgts
[VDEV_RAIDZ_MAXPARITY
];
1951 ASSERT(zio
->io_bp
!= NULL
); /* XXX need to add code to enforce this */
1953 ASSERT(rm
->rm_missingparity
<= rm
->rm_firstdatacol
);
1954 ASSERT(rm
->rm_missingdata
<= rm
->rm_cols
- rm
->rm_firstdatacol
);
1956 for (c
= 0; c
< rm
->rm_cols
; c
++) {
1957 rc
= &rm
->rm_col
[c
];
1960 ASSERT(rc
->rc_error
!= ECKSUM
); /* child has no bp */
1962 if (c
< rm
->rm_firstdatacol
)
1967 if (!rc
->rc_skipped
)
1968 unexpected_errors
++;
1971 } else if (c
< rm
->rm_firstdatacol
&& !rc
->rc_tried
) {
1976 if (zio
->io_type
== ZIO_TYPE_WRITE
) {
1978 * XXX -- for now, treat partial writes as a success.
1979 * (If we couldn't write enough columns to reconstruct
1980 * the data, the I/O failed. Otherwise, good enough.)
1982 * Now that we support write reallocation, it would be better
1983 * to treat partial failure as real failure unless there are
1984 * no non-degraded top-level vdevs left, and not update DTLs
1985 * if we intend to reallocate.
1988 if (total_errors
> rm
->rm_firstdatacol
)
1989 zio
->io_error
= vdev_raidz_worst_error(rm
);
1994 ASSERT(zio
->io_type
== ZIO_TYPE_READ
);
1996 * There are three potential phases for a read:
1997 * 1. produce valid data from the columns read
1998 * 2. read all disks and try again
1999 * 3. perform combinatorial reconstruction
2001 * Each phase is progressively both more expensive and less likely to
2002 * occur. If we encounter more errors than we can repair or all phases
2003 * fail, we have no choice but to return an error.
2007 * If the number of errors we saw was correctable -- less than or equal
2008 * to the number of parity disks read -- attempt to produce data that
2009 * has a valid checksum. Naturally, this case applies in the absence of
2012 if (total_errors
<= rm
->rm_firstdatacol
- parity_untried
) {
2013 if (data_errors
== 0) {
2014 if (raidz_checksum_verify(zio
) == 0) {
2016 * If we read parity information (unnecessarily
2017 * as it happens since no reconstruction was
2018 * needed) regenerate and verify the parity.
2019 * We also regenerate parity when resilvering
2020 * so we can write it out to the failed device
2023 if (parity_errors
+ parity_untried
<
2024 rm
->rm_firstdatacol
||
2025 (zio
->io_flags
& ZIO_FLAG_RESILVER
)) {
2026 n
= raidz_parity_verify(zio
, rm
);
2027 unexpected_errors
+= n
;
2028 ASSERT(parity_errors
+ n
<=
2029 rm
->rm_firstdatacol
);
2035 * We either attempt to read all the parity columns or
2036 * none of them. If we didn't try to read parity, we
2037 * wouldn't be here in the correctable case. There must
2038 * also have been fewer parity errors than parity
2039 * columns or, again, we wouldn't be in this code path.
2041 ASSERT(parity_untried
== 0);
2042 ASSERT(parity_errors
< rm
->rm_firstdatacol
);
2045 * Identify the data columns that reported an error.
2048 for (c
= rm
->rm_firstdatacol
; c
< rm
->rm_cols
; c
++) {
2049 rc
= &rm
->rm_col
[c
];
2050 if (rc
->rc_error
!= 0) {
2051 ASSERT(n
< VDEV_RAIDZ_MAXPARITY
);
2056 ASSERT(rm
->rm_firstdatacol
>= n
);
2058 code
= vdev_raidz_reconstruct(rm
, tgts
, n
);
2060 if (raidz_checksum_verify(zio
) == 0) {
2061 atomic_inc_64(&raidz_corrected
[code
]);
2064 * If we read more parity disks than were used
2065 * for reconstruction, confirm that the other
2066 * parity disks produced correct data. This
2067 * routine is suboptimal in that it regenerates
2068 * the parity that we already used in addition
2069 * to the parity that we're attempting to
2070 * verify, but this should be a relatively
2071 * uncommon case, and can be optimized if it
2072 * becomes a problem. Note that we regenerate
2073 * parity when resilvering so we can write it
2074 * out to failed devices later.
2076 if (parity_errors
< rm
->rm_firstdatacol
- n
||
2077 (zio
->io_flags
& ZIO_FLAG_RESILVER
)) {
2078 n
= raidz_parity_verify(zio
, rm
);
2079 unexpected_errors
+= n
;
2080 ASSERT(parity_errors
+ n
<=
2081 rm
->rm_firstdatacol
);
2090 * This isn't a typical situation -- either we got a read error or
2091 * a child silently returned bad data. Read every block so we can
2092 * try again with as much data and parity as we can track down. If
2093 * we've already been through once before, all children will be marked
2094 * as tried so we'll proceed to combinatorial reconstruction.
2096 unexpected_errors
= 1;
2097 rm
->rm_missingdata
= 0;
2098 rm
->rm_missingparity
= 0;
2100 for (c
= 0; c
< rm
->rm_cols
; c
++) {
2101 if (rm
->rm_col
[c
].rc_tried
)
2104 zio_vdev_io_redone(zio
);
2106 rc
= &rm
->rm_col
[c
];
2109 zio_nowait(zio_vdev_child_io(zio
, NULL
,
2110 vd
->vdev_child
[rc
->rc_devidx
],
2111 rc
->rc_offset
, rc
->rc_data
, rc
->rc_size
,
2112 zio
->io_type
, zio
->io_priority
, 0,
2113 vdev_raidz_child_done
, rc
));
2114 } while (++c
< rm
->rm_cols
);
2120 * At this point we've attempted to reconstruct the data given the
2121 * errors we detected, and we've attempted to read all columns. There
2122 * must, therefore, be one or more additional problems -- silent errors
2123 * resulting in invalid data rather than explicit I/O errors resulting
2124 * in absent data. We check if there is enough additional data to
2125 * possibly reconstruct the data and then perform combinatorial
2126 * reconstruction over all possible combinations. If that fails,
2129 if (total_errors
> rm
->rm_firstdatacol
) {
2130 zio
->io_error
= vdev_raidz_worst_error(rm
);
2132 } else if (total_errors
< rm
->rm_firstdatacol
&&
2133 (code
= vdev_raidz_combrec(zio
, total_errors
, data_errors
)) != 0) {
2135 * If we didn't use all the available parity for the
2136 * combinatorial reconstruction, verify that the remaining
2137 * parity is correct.
2139 if (code
!= (1 << rm
->rm_firstdatacol
) - 1)
2140 (void) raidz_parity_verify(zio
, rm
);
2143 * We're here because either:
2145 * total_errors == rm_first_datacol, or
2146 * vdev_raidz_combrec() failed
2148 * In either case, there is enough bad data to prevent
2151 * Start checksum ereports for all children which haven't
2152 * failed, and the IO wasn't speculative.
2154 zio
->io_error
= SET_ERROR(ECKSUM
);
2156 if (!(zio
->io_flags
& ZIO_FLAG_SPECULATIVE
)) {
2157 for (c
= 0; c
< rm
->rm_cols
; c
++) {
2158 rc
= &rm
->rm_col
[c
];
2159 if (rc
->rc_error
== 0) {
2160 zio_bad_cksum_t zbc
;
2161 zbc
.zbc_has_cksum
= 0;
2163 rm
->rm_ecksuminjected
;
2165 zfs_ereport_start_checksum(
2167 vd
->vdev_child
[rc
->rc_devidx
],
2168 zio
, rc
->rc_offset
, rc
->rc_size
,
2169 (void *)(uintptr_t)c
, &zbc
);
2176 zio_checksum_verified(zio
);
2178 if (zio
->io_error
== 0 && spa_writeable(zio
->io_spa
) &&
2179 (unexpected_errors
|| (zio
->io_flags
& ZIO_FLAG_RESILVER
))) {
2181 * Use the good data we have in hand to repair damaged children.
2183 for (c
= 0; c
< rm
->rm_cols
; c
++) {
2184 rc
= &rm
->rm_col
[c
];
2185 cvd
= vd
->vdev_child
[rc
->rc_devidx
];
2187 if (rc
->rc_error
== 0)
2190 zio_nowait(zio_vdev_child_io(zio
, NULL
, cvd
,
2191 rc
->rc_offset
, rc
->rc_data
, rc
->rc_size
,
2192 ZIO_TYPE_WRITE
, ZIO_PRIORITY_ASYNC_WRITE
,
2193 ZIO_FLAG_IO_REPAIR
| (unexpected_errors
?
2194 ZIO_FLAG_SELF_HEAL
: 0), NULL
, NULL
));
2200 vdev_raidz_state_change(vdev_t
*vd
, int faulted
, int degraded
)
2202 if (faulted
> vd
->vdev_nparity
)
2203 vdev_set_state(vd
, B_FALSE
, VDEV_STATE_CANT_OPEN
,
2204 VDEV_AUX_NO_REPLICAS
);
2205 else if (degraded
+ faulted
!= 0)
2206 vdev_set_state(vd
, B_FALSE
, VDEV_STATE_DEGRADED
, VDEV_AUX_NONE
);
2208 vdev_set_state(vd
, B_FALSE
, VDEV_STATE_HEALTHY
, VDEV_AUX_NONE
);
2211 vdev_ops_t vdev_raidz_ops
= {
2215 vdev_raidz_io_start
,
2217 vdev_raidz_state_change
,
2220 VDEV_TYPE_RAIDZ
, /* name of this vdev type */
2221 B_FALSE
/* not a leaf vdev */