]> git.proxmox.com Git - mirror_zfs.git/blame - module/zfs/vdev_raidz.c
Rebase master to b121
[mirror_zfs.git] / module / zfs / vdev_raidz.c
CommitLineData
34dc7c2f
BB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
9babb374 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
34dc7c2f
BB
24 * Use is subject to license terms.
25 */
26
34dc7c2f
BB
27#include <sys/zfs_context.h>
28#include <sys/spa.h>
29#include <sys/vdev_impl.h>
30#include <sys/zio.h>
31#include <sys/zio_checksum.h>
32#include <sys/fs/zfs.h>
33#include <sys/fm/fs/zfs.h>
34
35/*
36 * Virtual device vector for RAID-Z.
37 *
45d1cae3
BB
38 * This vdev supports single, double, and triple parity. For single parity,
39 * we use a simple XOR of all the data columns. For double or triple parity,
40 * we use a special case of Reed-Solomon coding. This extends the
41 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
42 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
43 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
44 * former is also based. The latter is designed to provide higher performance
45 * for writes.
46 *
47 * Note that the Plank paper claimed to support arbitrary N+M, but was then
48 * amended six years later identifying a critical flaw that invalidates its
49 * claims. Nevertheless, the technique can be adapted to work for up to
50 * triple parity. For additional parity, the amendment "Note: Correction to
51 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
52 * is viable, but the additional complexity means that write performance will
53 * suffer.
54 *
55 * All of the methods above operate on a Galois field, defined over the
56 * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
57 * can be expressed with a single byte. Briefly, the operations on the
58 * field are defined as follows:
34dc7c2f
BB
59 *
60 * o addition (+) is represented by a bitwise XOR
61 * o subtraction (-) is therefore identical to addition: A + B = A - B
62 * o multiplication of A by 2 is defined by the following bitwise expression:
63 * (A * 2)_7 = A_6
64 * (A * 2)_6 = A_5
65 * (A * 2)_5 = A_4
66 * (A * 2)_4 = A_3 + A_7
67 * (A * 2)_3 = A_2 + A_7
68 * (A * 2)_2 = A_1 + A_7
69 * (A * 2)_1 = A_0
70 * (A * 2)_0 = A_7
71 *
72 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
45d1cae3
BB
73 * As an aside, this multiplication is derived from the error correcting
74 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
34dc7c2f
BB
75 *
76 * Observe that any number in the field (except for 0) can be expressed as a
77 * power of 2 -- a generator for the field. We store a table of the powers of
78 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
79 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
45d1cae3
BB
80 * than field addition). The inverse of a field element A (A^-1) is therefore
81 * A ^ (255 - 1) = A^254.
34dc7c2f 82 *
45d1cae3
BB
83 * The up-to-three parity columns, P, Q, R over several data columns,
84 * D_0, ... D_n-1, can be expressed by field operations:
34dc7c2f
BB
85 *
86 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
87 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
88 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
45d1cae3
BB
89 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
90 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
34dc7c2f 91 *
45d1cae3
BB
92 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
93 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
94 * independent coefficients. (There are no additional coefficients that have
95 * this property which is why the uncorrected Plank method breaks down.)
96 *
97 * See the reconstruction code below for how P, Q and R can used individually
98 * or in concert to recover missing data columns.
34dc7c2f
BB
99 */
100
101typedef struct raidz_col {
102 uint64_t rc_devidx; /* child device index for I/O */
103 uint64_t rc_offset; /* device offset */
104 uint64_t rc_size; /* I/O size */
105 void *rc_data; /* I/O data */
106 int rc_error; /* I/O error for this device */
107 uint8_t rc_tried; /* Did we attempt this I/O column? */
108 uint8_t rc_skipped; /* Did we skip this I/O column? */
109} raidz_col_t;
110
111typedef struct raidz_map {
45d1cae3
BB
112 uint64_t rm_cols; /* Regular column count */
113 uint64_t rm_scols; /* Count including skipped columns */
34dc7c2f
BB
114 uint64_t rm_bigcols; /* Number of oversized columns */
115 uint64_t rm_asize; /* Actual total I/O size */
116 uint64_t rm_missingdata; /* Count of missing data devices */
117 uint64_t rm_missingparity; /* Count of missing parity devices */
118 uint64_t rm_firstdatacol; /* First data column/parity count */
45d1cae3 119 uint64_t rm_skipped; /* Skipped sectors for padding */
34dc7c2f
BB
120 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
121} raidz_map_t;
122
123#define VDEV_RAIDZ_P 0
124#define VDEV_RAIDZ_Q 1
45d1cae3
BB
125#define VDEV_RAIDZ_R 2
126#define VDEV_RAIDZ_MAXPARITY 3
127
128#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
129#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
130
131/*
132 * We provide a mechanism to perform the field multiplication operation on a
133 * 64-bit value all at once rather than a byte at a time. This works by
134 * creating a mask from the top bit in each byte and using that to
135 * conditionally apply the XOR of 0x1d.
136 */
137#define VDEV_RAIDZ_64MUL_2(x, mask) \
138{ \
139 (mask) = (x) & 0x8080808080808080ULL; \
140 (mask) = ((mask) << 1) - ((mask) >> 7); \
141 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
142 ((mask) & 0x1d1d1d1d1d1d1d1d); \
143}
34dc7c2f 144
45d1cae3
BB
145#define VDEV_RAIDZ_64MUL_4(x, mask) \
146{ \
147 VDEV_RAIDZ_64MUL_2((x), mask); \
148 VDEV_RAIDZ_64MUL_2((x), mask); \
149}
34dc7c2f 150
45d1cae3
BB
151/*
152 * Force reconstruction to use the general purpose method.
153 */
154int vdev_raidz_default_to_general;
34dc7c2f
BB
155
156/*
157 * These two tables represent powers and logs of 2 in the Galois field defined
158 * above. These values were computed by repeatedly multiplying by 2 as above.
159 */
160static const uint8_t vdev_raidz_pow2[256] = {
161 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
162 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
163 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
164 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
165 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
166 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
167 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
168 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
169 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
170 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
171 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
172 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
173 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
174 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
175 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
176 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
177 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
178 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
179 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
180 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
181 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
182 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
183 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
184 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
185 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
186 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
187 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
188 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
189 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
190 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
191 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
192 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
193};
194static const uint8_t vdev_raidz_log2[256] = {
195 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
196 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
197 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
198 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
199 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
200 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
201 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
202 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
203 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
204 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
205 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
206 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
207 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
208 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
209 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
210 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
211 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
212 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
213 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
214 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
215 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
216 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
217 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
218 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
219 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
220 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
221 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
222 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
223 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
224 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
225 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
226 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
227};
228
229/*
230 * Multiply a given number by 2 raised to the given power.
231 */
232static uint8_t
233vdev_raidz_exp2(uint_t a, int exp)
234{
235 if (a == 0)
236 return (0);
237
238 ASSERT(exp >= 0);
239 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
240
241 exp += vdev_raidz_log2[a];
242 if (exp > 255)
243 exp -= 255;
244
245 return (vdev_raidz_pow2[exp]);
246}
247
b128c09f
BB
248static void
249vdev_raidz_map_free(zio_t *zio)
250{
251 raidz_map_t *rm = zio->io_vsd;
252 int c;
253
254 for (c = 0; c < rm->rm_firstdatacol; c++)
255 zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
256
45d1cae3 257 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
b128c09f
BB
258}
259
34dc7c2f
BB
260static raidz_map_t *
261vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
262 uint64_t nparity)
263{
264 raidz_map_t *rm;
265 uint64_t b = zio->io_offset >> unit_shift;
266 uint64_t s = zio->io_size >> unit_shift;
267 uint64_t f = b % dcols;
268 uint64_t o = (b / dcols) << unit_shift;
45d1cae3 269 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
34dc7c2f
BB
270
271 q = s / (dcols - nparity);
272 r = s - q * (dcols - nparity);
273 bc = (r == 0 ? 0 : r + nparity);
45d1cae3
BB
274 tot = s + nparity * (q + (r == 0 ? 0 : 1));
275
276 if (q == 0) {
277 acols = bc;
278 scols = MIN(dcols, roundup(bc, nparity + 1));
279 } else {
280 acols = dcols;
281 scols = dcols;
282 }
34dc7c2f 283
45d1cae3 284 ASSERT3U(acols, <=, scols);
34dc7c2f 285
45d1cae3 286 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
34dc7c2f
BB
287
288 rm->rm_cols = acols;
45d1cae3 289 rm->rm_scols = scols;
34dc7c2f 290 rm->rm_bigcols = bc;
34dc7c2f
BB
291 rm->rm_missingdata = 0;
292 rm->rm_missingparity = 0;
293 rm->rm_firstdatacol = nparity;
294
45d1cae3
BB
295 asize = 0;
296
297 for (c = 0; c < scols; c++) {
34dc7c2f
BB
298 col = f + c;
299 coff = o;
300 if (col >= dcols) {
301 col -= dcols;
302 coff += 1ULL << unit_shift;
303 }
304 rm->rm_col[c].rc_devidx = col;
305 rm->rm_col[c].rc_offset = coff;
34dc7c2f
BB
306 rm->rm_col[c].rc_data = NULL;
307 rm->rm_col[c].rc_error = 0;
308 rm->rm_col[c].rc_tried = 0;
309 rm->rm_col[c].rc_skipped = 0;
45d1cae3
BB
310
311 if (c >= acols)
312 rm->rm_col[c].rc_size = 0;
313 else if (c < bc)
314 rm->rm_col[c].rc_size = (q + 1) << unit_shift;
315 else
316 rm->rm_col[c].rc_size = q << unit_shift;
317
318 asize += rm->rm_col[c].rc_size;
34dc7c2f
BB
319 }
320
45d1cae3
BB
321 ASSERT3U(asize, ==, tot << unit_shift);
322 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
323 rm->rm_skipped = roundup(tot, nparity + 1) - tot;
324 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_skipped << unit_shift);
325 ASSERT3U(rm->rm_skipped, <=, nparity);
34dc7c2f
BB
326
327 for (c = 0; c < rm->rm_firstdatacol; c++)
328 rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
329
330 rm->rm_col[c].rc_data = zio->io_data;
331
332 for (c = c + 1; c < acols; c++)
333 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
334 rm->rm_col[c - 1].rc_size;
335
336 /*
337 * If all data stored spans all columns, there's a danger that parity
338 * will always be on the same device and, since parity isn't read
339 * during normal operation, that that device's I/O bandwidth won't be
340 * used effectively. We therefore switch the parity every 1MB.
341 *
342 * ... at least that was, ostensibly, the theory. As a practical
343 * matter unless we juggle the parity between all devices evenly, we
344 * won't see any benefit. Further, occasional writes that aren't a
345 * multiple of the LCM of the number of children and the minimum
346 * stripe width are sufficient to avoid pessimal behavior.
347 * Unfortunately, this decision created an implicit on-disk format
348 * requirement that we need to support for all eternity, but only
349 * for single-parity RAID-Z.
350 */
351 ASSERT(rm->rm_cols >= 2);
352 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
353
354 if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
355 devidx = rm->rm_col[0].rc_devidx;
356 o = rm->rm_col[0].rc_offset;
357 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
358 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
359 rm->rm_col[1].rc_devidx = devidx;
360 rm->rm_col[1].rc_offset = o;
361 }
362
363 zio->io_vsd = rm;
b128c09f 364 zio->io_vsd_free = vdev_raidz_map_free;
34dc7c2f
BB
365 return (rm);
366}
367
34dc7c2f
BB
368static void
369vdev_raidz_generate_parity_p(raidz_map_t *rm)
370{
371 uint64_t *p, *src, pcount, ccount, i;
372 int c;
373
374 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
375
376 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
377 src = rm->rm_col[c].rc_data;
378 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
379 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
380
381 if (c == rm->rm_firstdatacol) {
382 ASSERT(ccount == pcount);
45d1cae3 383 for (i = 0; i < ccount; i++, src++, p++) {
34dc7c2f
BB
384 *p = *src;
385 }
386 } else {
387 ASSERT(ccount <= pcount);
45d1cae3 388 for (i = 0; i < ccount; i++, src++, p++) {
34dc7c2f
BB
389 *p ^= *src;
390 }
391 }
392 }
393}
394
395static void
396vdev_raidz_generate_parity_pq(raidz_map_t *rm)
397{
45d1cae3 398 uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
34dc7c2f
BB
399 int c;
400
45d1cae3 401 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
34dc7c2f
BB
402 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
403 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
404
405 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
406 src = rm->rm_col[c].rc_data;
407 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
408 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
45d1cae3
BB
409
410 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
34dc7c2f
BB
411
412 if (c == rm->rm_firstdatacol) {
45d1cae3
BB
413 ASSERT(ccnt == pcnt || ccnt == 0);
414 for (i = 0; i < ccnt; i++, src++, p++, q++) {
34dc7c2f 415 *p = *src;
45d1cae3 416 *q = *src;
34dc7c2f 417 }
45d1cae3 418 for (; i < pcnt; i++, src++, p++, q++) {
34dc7c2f 419 *p = 0;
45d1cae3 420 *q = 0;
34dc7c2f
BB
421 }
422 } else {
45d1cae3 423 ASSERT(ccnt <= pcnt);
34dc7c2f
BB
424
425 /*
45d1cae3
BB
426 * Apply the algorithm described above by multiplying
427 * the previous result and adding in the new value.
34dc7c2f 428 */
45d1cae3
BB
429 for (i = 0; i < ccnt; i++, src++, p++, q++) {
430 *p ^= *src;
431
432 VDEV_RAIDZ_64MUL_2(*q, mask);
34dc7c2f 433 *q ^= *src;
45d1cae3
BB
434 }
435
436 /*
437 * Treat short columns as though they are full of 0s.
438 * Note that there's therefore nothing needed for P.
439 */
440 for (; i < pcnt; i++, q++) {
441 VDEV_RAIDZ_64MUL_2(*q, mask);
442 }
443 }
444 }
445}
446
447static void
448vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
449{
450 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
451 int c;
452
453 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
454 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
455 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
456 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
457 rm->rm_col[VDEV_RAIDZ_R].rc_size);
458
459 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
460 src = rm->rm_col[c].rc_data;
461 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
462 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
463 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
464
465 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
466
467 if (c == rm->rm_firstdatacol) {
468 ASSERT(ccnt == pcnt || ccnt == 0);
469 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
470 *p = *src;
471 *q = *src;
472 *r = *src;
473 }
474 for (; i < pcnt; i++, src++, p++, q++, r++) {
475 *p = 0;
476 *q = 0;
477 *r = 0;
478 }
479 } else {
480 ASSERT(ccnt <= pcnt);
481
482 /*
483 * Apply the algorithm described above by multiplying
484 * the previous result and adding in the new value.
485 */
486 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
34dc7c2f 487 *p ^= *src;
45d1cae3
BB
488
489 VDEV_RAIDZ_64MUL_2(*q, mask);
490 *q ^= *src;
491
492 VDEV_RAIDZ_64MUL_4(*r, mask);
493 *r ^= *src;
34dc7c2f
BB
494 }
495
496 /*
497 * Treat short columns as though they are full of 0s.
45d1cae3 498 * Note that there's therefore nothing needed for P.
34dc7c2f 499 */
45d1cae3
BB
500 for (; i < pcnt; i++, q++, r++) {
501 VDEV_RAIDZ_64MUL_2(*q, mask);
502 VDEV_RAIDZ_64MUL_4(*r, mask);
34dc7c2f
BB
503 }
504 }
505 }
506}
507
45d1cae3
BB
508/*
509 * Generate RAID parity in the first virtual columns according to the number of
510 * parity columns available.
511 */
34dc7c2f 512static void
45d1cae3
BB
513vdev_raidz_generate_parity(raidz_map_t *rm)
514{
515 switch (rm->rm_firstdatacol) {
516 case 1:
517 vdev_raidz_generate_parity_p(rm);
518 break;
519 case 2:
520 vdev_raidz_generate_parity_pq(rm);
521 break;
522 case 3:
523 vdev_raidz_generate_parity_pqr(rm);
524 break;
525 default:
526 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
527 }
528}
529
530static int
531vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
34dc7c2f
BB
532{
533 uint64_t *dst, *src, xcount, ccount, count, i;
45d1cae3 534 int x = tgts[0];
34dc7c2f
BB
535 int c;
536
45d1cae3
BB
537 ASSERT(ntgts == 1);
538 ASSERT(x >= rm->rm_firstdatacol);
539 ASSERT(x < rm->rm_cols);
540
34dc7c2f
BB
541 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
542 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
543 ASSERT(xcount > 0);
544
545 src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
546 dst = rm->rm_col[x].rc_data;
547 for (i = 0; i < xcount; i++, dst++, src++) {
548 *dst = *src;
549 }
550
551 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
552 src = rm->rm_col[c].rc_data;
553 dst = rm->rm_col[x].rc_data;
554
555 if (c == x)
556 continue;
557
558 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
559 count = MIN(ccount, xcount);
560
561 for (i = 0; i < count; i++, dst++, src++) {
562 *dst ^= *src;
563 }
564 }
45d1cae3
BB
565
566 return (1 << VDEV_RAIDZ_P);
34dc7c2f
BB
567}
568
45d1cae3
BB
569static int
570vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
34dc7c2f
BB
571{
572 uint64_t *dst, *src, xcount, ccount, count, mask, i;
573 uint8_t *b;
45d1cae3 574 int x = tgts[0];
34dc7c2f
BB
575 int c, j, exp;
576
45d1cae3
BB
577 ASSERT(ntgts == 1);
578
34dc7c2f
BB
579 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
580 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
581
582 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
583 src = rm->rm_col[c].rc_data;
584 dst = rm->rm_col[x].rc_data;
585
586 if (c == x)
587 ccount = 0;
588 else
589 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
590
591 count = MIN(ccount, xcount);
592
593 if (c == rm->rm_firstdatacol) {
594 for (i = 0; i < count; i++, dst++, src++) {
595 *dst = *src;
596 }
597 for (; i < xcount; i++, dst++) {
598 *dst = 0;
599 }
600
601 } else {
34dc7c2f 602 for (i = 0; i < count; i++, dst++, src++) {
45d1cae3 603 VDEV_RAIDZ_64MUL_2(*dst, mask);
34dc7c2f
BB
604 *dst ^= *src;
605 }
606
607 for (; i < xcount; i++, dst++) {
45d1cae3 608 VDEV_RAIDZ_64MUL_2(*dst, mask);
34dc7c2f
BB
609 }
610 }
611 }
612
613 src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
614 dst = rm->rm_col[x].rc_data;
615 exp = 255 - (rm->rm_cols - 1 - x);
616
617 for (i = 0; i < xcount; i++, dst++, src++) {
618 *dst ^= *src;
619 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
620 *b = vdev_raidz_exp2(*b, exp);
621 }
622 }
45d1cae3
BB
623
624 return (1 << VDEV_RAIDZ_Q);
34dc7c2f
BB
625}
626
45d1cae3
BB
627static int
628vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
34dc7c2f
BB
629{
630 uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
631 void *pdata, *qdata;
632 uint64_t xsize, ysize, i;
45d1cae3
BB
633 int x = tgts[0];
634 int y = tgts[1];
34dc7c2f 635
45d1cae3 636 ASSERT(ntgts == 2);
34dc7c2f
BB
637 ASSERT(x < y);
638 ASSERT(x >= rm->rm_firstdatacol);
639 ASSERT(y < rm->rm_cols);
640
641 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
642
643 /*
644 * Move the parity data aside -- we're going to compute parity as
645 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
646 * reuse the parity generation mechanism without trashing the actual
647 * parity so we make those columns appear to be full of zeros by
648 * setting their lengths to zero.
649 */
650 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
651 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
652 xsize = rm->rm_col[x].rc_size;
653 ysize = rm->rm_col[y].rc_size;
654
655 rm->rm_col[VDEV_RAIDZ_P].rc_data =
656 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
657 rm->rm_col[VDEV_RAIDZ_Q].rc_data =
658 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
659 rm->rm_col[x].rc_size = 0;
660 rm->rm_col[y].rc_size = 0;
661
662 vdev_raidz_generate_parity_pq(rm);
663
664 rm->rm_col[x].rc_size = xsize;
665 rm->rm_col[y].rc_size = ysize;
666
667 p = pdata;
668 q = qdata;
669 pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
670 qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
671 xd = rm->rm_col[x].rc_data;
672 yd = rm->rm_col[y].rc_data;
673
674 /*
675 * We now have:
676 * Pxy = P + D_x + D_y
677 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
678 *
679 * We can then solve for D_x:
680 * D_x = A * (P + Pxy) + B * (Q + Qxy)
681 * where
682 * A = 2^(x - y) * (2^(x - y) + 1)^-1
683 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
684 *
685 * With D_x in hand, we can easily solve for D_y:
686 * D_y = P + Pxy + D_x
687 */
688
689 a = vdev_raidz_pow2[255 + x - y];
690 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
691 tmp = 255 - vdev_raidz_log2[a ^ 1];
692
693 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
694 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
695
696 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
697 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
698 vdev_raidz_exp2(*q ^ *qxy, bexp);
699
700 if (i < ysize)
701 *yd = *p ^ *pxy ^ *xd;
702 }
703
704 zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
705 rm->rm_col[VDEV_RAIDZ_P].rc_size);
706 zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
707 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
708
709 /*
710 * Restore the saved parity data.
711 */
712 rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
713 rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
45d1cae3
BB
714
715 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
716}
717
718/* BEGIN CSTYLED */
719/*
720 * In the general case of reconstruction, we must solve the system of linear
721 * equations defined by the coeffecients used to generate parity as well as
722 * the contents of the data and parity disks. This can be expressed with
723 * vectors for the original data (D) and the actual data (d) and parity (p)
724 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
725 *
726 * __ __ __ __
727 * | | __ __ | p_0 |
728 * | V | | D_0 | | p_m-1 |
729 * | | x | : | = | d_0 |
730 * | I | | D_n-1 | | : |
731 * | | ~~ ~~ | d_n-1 |
732 * ~~ ~~ ~~ ~~
733 *
734 * I is simply a square identity matrix of size n, and V is a vandermonde
735 * matrix defined by the coeffecients we chose for the various parity columns
736 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
737 * computation as well as linear separability.
738 *
739 * __ __ __ __
740 * | 1 .. 1 1 1 | | p_0 |
741 * | 2^n-1 .. 4 2 1 | __ __ | : |
742 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
743 * | 1 .. 0 0 0 | | D_1 | | d_0 |
744 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
745 * | : : : : | | : | | d_2 |
746 * | 0 .. 1 0 0 | | D_n-1 | | : |
747 * | 0 .. 0 1 0 | ~~ ~~ | : |
748 * | 0 .. 0 0 1 | | d_n-1 |
749 * ~~ ~~ ~~ ~~
750 *
751 * Note that I, V, d, and p are known. To compute D, we must invert the
752 * matrix and use the known data and parity values to reconstruct the unknown
753 * data values. We begin by removing the rows in V|I and d|p that correspond
754 * to failed or missing columns; we then make V|I square (n x n) and d|p
755 * sized n by removing rows corresponding to unused parity from the bottom up
756 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
757 * using Gauss-Jordan elimination. In the example below we use m=3 parity
758 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
759 * __ __
760 * | 1 1 1 1 1 1 1 1 |
761 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
762 * | 19 205 116 29 64 16 4 1 | / /
763 * | 1 0 0 0 0 0 0 0 | / /
764 * | 0 1 0 0 0 0 0 0 | <--' /
765 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
766 * | 0 0 0 1 0 0 0 0 |
767 * | 0 0 0 0 1 0 0 0 |
768 * | 0 0 0 0 0 1 0 0 |
769 * | 0 0 0 0 0 0 1 0 |
770 * | 0 0 0 0 0 0 0 1 |
771 * ~~ ~~
772 * __ __
773 * | 1 1 1 1 1 1 1 1 |
774 * | 128 64 32 16 8 4 2 1 |
775 * | 19 205 116 29 64 16 4 1 |
776 * | 1 0 0 0 0 0 0 0 |
777 * | 0 1 0 0 0 0 0 0 |
778 * (V|I)' = | 0 0 1 0 0 0 0 0 |
779 * | 0 0 0 1 0 0 0 0 |
780 * | 0 0 0 0 1 0 0 0 |
781 * | 0 0 0 0 0 1 0 0 |
782 * | 0 0 0 0 0 0 1 0 |
783 * | 0 0 0 0 0 0 0 1 |
784 * ~~ ~~
785 *
786 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
787 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
788 * matrix is not singular.
789 * __ __
790 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
791 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
792 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
793 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
794 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
795 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
796 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
797 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
798 * ~~ ~~
799 * __ __
800 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
801 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
802 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
803 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
804 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
805 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
806 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
807 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
808 * ~~ ~~
809 * __ __
810 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
811 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
812 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
813 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
814 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
815 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
816 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
817 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
818 * ~~ ~~
819 * __ __
820 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
821 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
822 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
823 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
824 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
825 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
826 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
827 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
828 * ~~ ~~
829 * __ __
830 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
831 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
832 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
833 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
834 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
835 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
836 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
837 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
838 * ~~ ~~
839 * __ __
840 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
841 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
842 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
843 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
844 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
845 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
846 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
847 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
848 * ~~ ~~
849 * __ __
850 * | 0 0 1 0 0 0 0 0 |
851 * | 167 100 5 41 159 169 217 208 |
852 * | 166 100 4 40 158 168 216 209 |
853 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
854 * | 0 0 0 0 1 0 0 0 |
855 * | 0 0 0 0 0 1 0 0 |
856 * | 0 0 0 0 0 0 1 0 |
857 * | 0 0 0 0 0 0 0 1 |
858 * ~~ ~~
859 *
860 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
861 * of the missing data.
862 *
863 * As is apparent from the example above, the only non-trivial rows in the
864 * inverse matrix correspond to the data disks that we're trying to
865 * reconstruct. Indeed, those are the only rows we need as the others would
866 * only be useful for reconstructing data known or assumed to be valid. For
867 * that reason, we only build the coefficients in the rows that correspond to
868 * targeted columns.
869 */
870/* END CSTYLED */
871
872static void
873vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
874 uint8_t **rows)
875{
876 int i, j;
877 int pow;
878
879 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
880
881 /*
882 * Fill in the missing rows of interest.
883 */
884 for (i = 0; i < nmap; i++) {
885 ASSERT3S(0, <=, map[i]);
886 ASSERT3S(map[i], <=, 2);
887
888 pow = map[i] * n;
889 if (pow > 255)
890 pow -= 255;
891 ASSERT(pow <= 255);
892
893 for (j = 0; j < n; j++) {
894 pow -= map[i];
895 if (pow < 0)
896 pow += 255;
897 rows[i][j] = vdev_raidz_pow2[pow];
898 }
899 }
900}
901
902static void
903vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
904 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
905{
906 int i, j, ii, jj;
907 uint8_t log;
908
909 /*
910 * Assert that the first nmissing entries from the array of used
911 * columns correspond to parity columns and that subsequent entries
912 * correspond to data columns.
913 */
914 for (i = 0; i < nmissing; i++) {
915 ASSERT3S(used[i], <, rm->rm_firstdatacol);
916 }
917 for (; i < n; i++) {
918 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
919 }
920
921 /*
922 * First initialize the storage where we'll compute the inverse rows.
923 */
924 for (i = 0; i < nmissing; i++) {
925 for (j = 0; j < n; j++) {
926 invrows[i][j] = (i == j) ? 1 : 0;
927 }
928 }
929
930 /*
931 * Subtract all trivial rows from the rows of consequence.
932 */
933 for (i = 0; i < nmissing; i++) {
934 for (j = nmissing; j < n; j++) {
935 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
936 jj = used[j] - rm->rm_firstdatacol;
937 ASSERT3S(jj, <, n);
938 invrows[i][j] = rows[i][jj];
939 rows[i][jj] = 0;
940 }
941 }
942
943 /*
944 * For each of the rows of interest, we must normalize it and subtract
945 * a multiple of it from the other rows.
946 */
947 for (i = 0; i < nmissing; i++) {
948 for (j = 0; j < missing[i]; j++) {
949 ASSERT3U(rows[i][j], ==, 0);
950 }
951 ASSERT3U(rows[i][missing[i]], !=, 0);
952
953 /*
954 * Compute the inverse of the first element and multiply each
955 * element in the row by that value.
956 */
957 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
958
959 for (j = 0; j < n; j++) {
960 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
961 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
962 }
963
964 for (ii = 0; ii < nmissing; ii++) {
965 if (i == ii)
966 continue;
967
968 ASSERT3U(rows[ii][missing[i]], !=, 0);
969
970 log = vdev_raidz_log2[rows[ii][missing[i]]];
971
972 for (j = 0; j < n; j++) {
973 rows[ii][j] ^=
974 vdev_raidz_exp2(rows[i][j], log);
975 invrows[ii][j] ^=
976 vdev_raidz_exp2(invrows[i][j], log);
977 }
978 }
979 }
980
981 /*
982 * Verify that the data that is left in the rows are properly part of
983 * an identity matrix.
984 */
985 for (i = 0; i < nmissing; i++) {
986 for (j = 0; j < n; j++) {
987 if (j == missing[i]) {
988 ASSERT3U(rows[i][j], ==, 1);
989 } else {
990 ASSERT3U(rows[i][j], ==, 0);
991 }
992 }
993 }
994}
995
996static void
997vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
998 int *missing, uint8_t **invrows, const uint8_t *used)
999{
1000 int i, j, x, cc, c;
1001 uint8_t *src;
1002 uint64_t ccount;
1003 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1004 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1005 uint8_t log, val;
1006 int ll;
1007 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1008 uint8_t *p, *pp;
1009 size_t psize;
1010
1011 psize = sizeof (invlog[0][0]) * n * nmissing;
1012 p = kmem_alloc(psize, KM_SLEEP);
1013
1014 for (pp = p, i = 0; i < nmissing; i++) {
1015 invlog[i] = pp;
1016 pp += n;
1017 }
1018
1019 for (i = 0; i < nmissing; i++) {
1020 for (j = 0; j < n; j++) {
1021 ASSERT3U(invrows[i][j], !=, 0);
1022 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1023 }
1024 }
1025
1026 for (i = 0; i < n; i++) {
1027 c = used[i];
1028 ASSERT3U(c, <, rm->rm_cols);
1029
1030 src = rm->rm_col[c].rc_data;
1031 ccount = rm->rm_col[c].rc_size;
1032 for (j = 0; j < nmissing; j++) {
1033 cc = missing[j] + rm->rm_firstdatacol;
1034 ASSERT3U(cc, >=, rm->rm_firstdatacol);
1035 ASSERT3U(cc, <, rm->rm_cols);
1036 ASSERT3U(cc, !=, c);
1037
1038 dst[j] = rm->rm_col[cc].rc_data;
1039 dcount[j] = rm->rm_col[cc].rc_size;
1040 }
1041
1042 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1043
1044 for (x = 0; x < ccount; x++, src++) {
1045 if (*src != 0)
1046 log = vdev_raidz_log2[*src];
1047
1048 for (cc = 0; cc < nmissing; cc++) {
1049 if (x >= dcount[cc])
1050 continue;
1051
1052 if (*src == 0) {
1053 val = 0;
1054 } else {
1055 if ((ll = log + invlog[cc][i]) >= 255)
1056 ll -= 255;
1057 val = vdev_raidz_pow2[ll];
1058 }
1059
1060 if (i == 0)
1061 dst[cc][x] = val;
1062 else
1063 dst[cc][x] ^= val;
1064 }
1065 }
1066 }
1067
1068 kmem_free(p, psize);
1069}
1070
1071static int
1072vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1073{
1074 int n, i, c, t, tt;
1075 int nmissing_rows;
1076 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1077 int parity_map[VDEV_RAIDZ_MAXPARITY];
1078
1079 uint8_t *p, *pp;
1080 size_t psize;
1081
1082 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1083 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1084 uint8_t *used;
1085
1086 int code = 0;
1087
1088
1089 n = rm->rm_cols - rm->rm_firstdatacol;
1090
1091 /*
1092 * Figure out which data columns are missing.
1093 */
1094 nmissing_rows = 0;
1095 for (t = 0; t < ntgts; t++) {
1096 if (tgts[t] >= rm->rm_firstdatacol) {
1097 missing_rows[nmissing_rows++] =
1098 tgts[t] - rm->rm_firstdatacol;
1099 }
1100 }
1101
1102 /*
1103 * Figure out which parity columns to use to help generate the missing
1104 * data columns.
1105 */
1106 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1107 ASSERT(tt < ntgts);
1108 ASSERT(c < rm->rm_firstdatacol);
1109
1110 /*
1111 * Skip any targeted parity columns.
1112 */
1113 if (c == tgts[tt]) {
1114 tt++;
1115 continue;
1116 }
1117
1118 code |= 1 << c;
1119
1120 parity_map[i] = c;
1121 i++;
1122 }
1123
1124 ASSERT(code != 0);
1125 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1126
1127 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1128 nmissing_rows * n + sizeof (used[0]) * n;
1129 p = kmem_alloc(psize, KM_SLEEP);
1130
1131 for (pp = p, i = 0; i < nmissing_rows; i++) {
1132 rows[i] = pp;
1133 pp += n;
1134 invrows[i] = pp;
1135 pp += n;
1136 }
1137 used = pp;
1138
1139 for (i = 0; i < nmissing_rows; i++) {
1140 used[i] = parity_map[i];
1141 }
1142
1143 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1144 if (tt < nmissing_rows &&
1145 c == missing_rows[tt] + rm->rm_firstdatacol) {
1146 tt++;
1147 continue;
1148 }
1149
1150 ASSERT3S(i, <, n);
1151 used[i] = c;
1152 i++;
1153 }
1154
1155 /*
1156 * Initialize the interesting rows of the matrix.
1157 */
1158 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1159
1160 /*
1161 * Invert the matrix.
1162 */
1163 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1164 invrows, used);
1165
1166 /*
1167 * Reconstruct the missing data using the generated matrix.
1168 */
1169 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1170 invrows, used);
1171
1172 kmem_free(p, psize);
1173
1174 return (code);
34dc7c2f
BB
1175}
1176
45d1cae3
BB
1177static int
1178vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1179{
1180 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1181 int ntgts;
1182 int i, c;
1183 int code;
1184 int nbadparity, nbaddata;
1185 int parity_valid[VDEV_RAIDZ_MAXPARITY];
1186
1187 /*
1188 * The tgts list must already be sorted.
1189 */
1190 for (i = 1; i < nt; i++) {
1191 ASSERT(t[i] > t[i - 1]);
1192 }
1193
1194 nbadparity = rm->rm_firstdatacol;
1195 nbaddata = rm->rm_cols - nbadparity;
1196 ntgts = 0;
1197 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1198 if (c < rm->rm_firstdatacol)
1199 parity_valid[c] = B_FALSE;
1200
1201 if (i < nt && c == t[i]) {
1202 tgts[ntgts++] = c;
1203 i++;
1204 } else if (rm->rm_col[c].rc_error != 0) {
1205 tgts[ntgts++] = c;
1206 } else if (c >= rm->rm_firstdatacol) {
1207 nbaddata--;
1208 } else {
1209 parity_valid[c] = B_TRUE;
1210 nbadparity--;
1211 }
1212 }
1213
1214 ASSERT(ntgts >= nt);
1215 ASSERT(nbaddata >= 0);
1216 ASSERT(nbaddata + nbadparity == ntgts);
1217
1218 dt = &tgts[nbadparity];
1219
1220 /*
1221 * See if we can use any of our optimized reconstruction routines.
1222 */
1223 if (!vdev_raidz_default_to_general) {
1224 switch (nbaddata) {
1225 case 1:
1226 if (parity_valid[VDEV_RAIDZ_P])
1227 return (vdev_raidz_reconstruct_p(rm, dt, 1));
1228
1229 ASSERT(rm->rm_firstdatacol > 1);
1230
1231 if (parity_valid[VDEV_RAIDZ_Q])
1232 return (vdev_raidz_reconstruct_q(rm, dt, 1));
1233
1234 ASSERT(rm->rm_firstdatacol > 2);
1235 break;
1236
1237 case 2:
1238 ASSERT(rm->rm_firstdatacol > 1);
1239
1240 if (parity_valid[VDEV_RAIDZ_P] &&
1241 parity_valid[VDEV_RAIDZ_Q])
1242 return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1243
1244 ASSERT(rm->rm_firstdatacol > 2);
1245
1246 break;
1247 }
1248 }
1249
1250 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1251 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1252 ASSERT(code > 0);
1253 return (code);
1254}
34dc7c2f
BB
1255
1256static int
1257vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
1258{
1259 vdev_t *cvd;
1260 uint64_t nparity = vd->vdev_nparity;
45d1cae3 1261 int c;
34dc7c2f
BB
1262 int lasterror = 0;
1263 int numerrors = 0;
1264
1265 ASSERT(nparity > 0);
1266
1267 if (nparity > VDEV_RAIDZ_MAXPARITY ||
1268 vd->vdev_children < nparity + 1) {
1269 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1270 return (EINVAL);
1271 }
1272
45d1cae3
BB
1273 vdev_open_children(vd);
1274
34dc7c2f
BB
1275 for (c = 0; c < vd->vdev_children; c++) {
1276 cvd = vd->vdev_child[c];
1277
45d1cae3
BB
1278 if (cvd->vdev_open_error != 0) {
1279 lasterror = cvd->vdev_open_error;
34dc7c2f
BB
1280 numerrors++;
1281 continue;
1282 }
1283
1284 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1285 *ashift = MAX(*ashift, cvd->vdev_ashift);
1286 }
1287
1288 *asize *= vd->vdev_children;
1289
1290 if (numerrors > nparity) {
1291 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1292 return (lasterror);
1293 }
1294
1295 return (0);
1296}
1297
1298static void
1299vdev_raidz_close(vdev_t *vd)
1300{
1301 int c;
1302
1303 for (c = 0; c < vd->vdev_children; c++)
1304 vdev_close(vd->vdev_child[c]);
1305}
1306
1307static uint64_t
1308vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1309{
1310 uint64_t asize;
1311 uint64_t ashift = vd->vdev_top->vdev_ashift;
1312 uint64_t cols = vd->vdev_children;
1313 uint64_t nparity = vd->vdev_nparity;
1314
1315 asize = ((psize - 1) >> ashift) + 1;
1316 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1317 asize = roundup(asize, nparity + 1) << ashift;
1318
1319 return (asize);
1320}
1321
1322static void
1323vdev_raidz_child_done(zio_t *zio)
1324{
1325 raidz_col_t *rc = zio->io_private;
1326
1327 rc->rc_error = zio->io_error;
1328 rc->rc_tried = 1;
1329 rc->rc_skipped = 0;
1330}
1331
34dc7c2f
BB
1332static int
1333vdev_raidz_io_start(zio_t *zio)
1334{
1335 vdev_t *vd = zio->io_vd;
1336 vdev_t *tvd = vd->vdev_top;
1337 vdev_t *cvd;
1338 blkptr_t *bp = zio->io_bp;
1339 raidz_map_t *rm;
1340 raidz_col_t *rc;
45d1cae3 1341 int c, i;
34dc7c2f
BB
1342
1343 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1344 vd->vdev_nparity);
1345
1346 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1347
1348 if (zio->io_type == ZIO_TYPE_WRITE) {
45d1cae3 1349 vdev_raidz_generate_parity(rm);
34dc7c2f
BB
1350
1351 for (c = 0; c < rm->rm_cols; c++) {
1352 rc = &rm->rm_col[c];
1353 cvd = vd->vdev_child[rc->rc_devidx];
1354 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1355 rc->rc_offset, rc->rc_data, rc->rc_size,
b128c09f 1356 zio->io_type, zio->io_priority, 0,
34dc7c2f
BB
1357 vdev_raidz_child_done, rc));
1358 }
1359
45d1cae3
BB
1360 /*
1361 * Generate optional I/Os for any skipped sectors to improve
1362 * aggregation contiguity.
1363 */
1364 for (c = rm->rm_bigcols, i = 0; i < rm->rm_skipped; c++, i++) {
1365 ASSERT(c <= rm->rm_scols);
1366 if (c == rm->rm_scols)
1367 c = 0;
1368 rc = &rm->rm_col[c];
1369 cvd = vd->vdev_child[rc->rc_devidx];
1370 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1371 rc->rc_offset + rc->rc_size, NULL,
1372 1 << tvd->vdev_ashift,
1373 zio->io_type, zio->io_priority,
1374 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1375 }
1376
b128c09f 1377 return (ZIO_PIPELINE_CONTINUE);
34dc7c2f
BB
1378 }
1379
1380 ASSERT(zio->io_type == ZIO_TYPE_READ);
1381
1382 /*
1383 * Iterate over the columns in reverse order so that we hit the parity
45d1cae3 1384 * last -- any errors along the way will force us to read the parity.
34dc7c2f
BB
1385 */
1386 for (c = rm->rm_cols - 1; c >= 0; c--) {
1387 rc = &rm->rm_col[c];
1388 cvd = vd->vdev_child[rc->rc_devidx];
1389 if (!vdev_readable(cvd)) {
1390 if (c >= rm->rm_firstdatacol)
1391 rm->rm_missingdata++;
1392 else
1393 rm->rm_missingparity++;
1394 rc->rc_error = ENXIO;
1395 rc->rc_tried = 1; /* don't even try */
1396 rc->rc_skipped = 1;
1397 continue;
1398 }
fb5f0bc8 1399 if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) {
34dc7c2f
BB
1400 if (c >= rm->rm_firstdatacol)
1401 rm->rm_missingdata++;
1402 else
1403 rm->rm_missingparity++;
1404 rc->rc_error = ESTALE;
1405 rc->rc_skipped = 1;
1406 continue;
1407 }
1408 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
9babb374 1409 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
34dc7c2f
BB
1410 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1411 rc->rc_offset, rc->rc_data, rc->rc_size,
b128c09f 1412 zio->io_type, zio->io_priority, 0,
34dc7c2f
BB
1413 vdev_raidz_child_done, rc));
1414 }
1415 }
1416
b128c09f 1417 return (ZIO_PIPELINE_CONTINUE);
34dc7c2f
BB
1418}
1419
1420/*
1421 * Report a checksum error for a child of a RAID-Z device.
1422 */
1423static void
1424raidz_checksum_error(zio_t *zio, raidz_col_t *rc)
1425{
1426 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
34dc7c2f
BB
1427
1428 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1429 mutex_enter(&vd->vdev_stat_lock);
1430 vd->vdev_stat.vs_checksum_errors++;
1431 mutex_exit(&vd->vdev_stat_lock);
1432 }
1433
1434 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE))
1435 zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
1436 zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size);
1437}
1438
1439/*
1440 * Generate the parity from the data columns. If we tried and were able to
1441 * read the parity without error, verify that the generated parity matches the
1442 * data we read. If it doesn't, we fire off a checksum error. Return the
1443 * number such failures.
1444 */
1445static int
1446raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1447{
1448 void *orig[VDEV_RAIDZ_MAXPARITY];
1449 int c, ret = 0;
1450 raidz_col_t *rc;
1451
1452 for (c = 0; c < rm->rm_firstdatacol; c++) {
1453 rc = &rm->rm_col[c];
1454 if (!rc->rc_tried || rc->rc_error != 0)
1455 continue;
1456 orig[c] = zio_buf_alloc(rc->rc_size);
1457 bcopy(rc->rc_data, orig[c], rc->rc_size);
1458 }
1459
45d1cae3 1460 vdev_raidz_generate_parity(rm);
34dc7c2f
BB
1461
1462 for (c = 0; c < rm->rm_firstdatacol; c++) {
1463 rc = &rm->rm_col[c];
1464 if (!rc->rc_tried || rc->rc_error != 0)
1465 continue;
1466 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1467 raidz_checksum_error(zio, rc);
1468 rc->rc_error = ECKSUM;
1469 ret++;
1470 }
1471 zio_buf_free(orig[c], rc->rc_size);
1472 }
1473
1474 return (ret);
1475}
1476
45d1cae3
BB
1477/*
1478 * Keep statistics on all the ways that we used parity to correct data.
1479 */
1480static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
34dc7c2f
BB
1481
1482static int
b128c09f
BB
1483vdev_raidz_worst_error(raidz_map_t *rm)
1484{
1485 int error = 0;
1486
1487 for (int c = 0; c < rm->rm_cols; c++)
1488 error = zio_worst_error(error, rm->rm_col[c].rc_error);
1489
1490 return (error);
1491}
1492
45d1cae3
BB
1493/*
1494 * Iterate over all combinations of bad data and attempt a reconstruction.
1495 * Note that the algorithm below is non-optimal because it doesn't take into
1496 * account how reconstruction is actually performed. For example, with
1497 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1498 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1499 * cases we'd only use parity information in column 0.
1500 */
1501static int
1502vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1503{
1504 raidz_map_t *rm = zio->io_vsd;
1505 raidz_col_t *rc;
1506 void *orig[VDEV_RAIDZ_MAXPARITY];
1507 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1508 int *tgts = &tstore[1];
1509 int current, next, i, c, n;
1510 int code, ret = 0;
1511
1512 ASSERT(total_errors < rm->rm_firstdatacol);
1513
1514 /*
1515 * This simplifies one edge condition.
1516 */
1517 tgts[-1] = -1;
1518
1519 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1520 /*
1521 * Initialize the targets array by finding the first n columns
1522 * that contain no error.
1523 *
1524 * If there were no data errors, we need to ensure that we're
1525 * always explicitly attempting to reconstruct at least one
1526 * data column. To do this, we simply push the highest target
1527 * up into the data columns.
1528 */
1529 for (c = 0, i = 0; i < n; i++) {
1530 if (i == n - 1 && data_errors == 0 &&
1531 c < rm->rm_firstdatacol) {
1532 c = rm->rm_firstdatacol;
1533 }
1534
1535 while (rm->rm_col[c].rc_error != 0) {
1536 c++;
1537 ASSERT3S(c, <, rm->rm_cols);
1538 }
1539
1540 tgts[i] = c++;
1541 }
1542
1543 /*
1544 * Setting tgts[n] simplifies the other edge condition.
1545 */
1546 tgts[n] = rm->rm_cols;
1547
1548 /*
1549 * These buffers were allocated in previous iterations.
1550 */
1551 for (i = 0; i < n - 1; i++) {
1552 ASSERT(orig[i] != NULL);
1553 }
1554
1555 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1556
1557 current = 0;
1558 next = tgts[current];
1559
1560 while (current != n) {
1561 tgts[current] = next;
1562 current = 0;
1563
1564 /*
1565 * Save off the original data that we're going to
1566 * attempt to reconstruct.
1567 */
1568 for (i = 0; i < n; i++) {
1569 ASSERT(orig[i] != NULL);
1570 c = tgts[i];
1571 ASSERT3S(c, >=, 0);
1572 ASSERT3S(c, <, rm->rm_cols);
1573 rc = &rm->rm_col[c];
1574 bcopy(rc->rc_data, orig[i], rc->rc_size);
1575 }
1576
1577 /*
1578 * Attempt a reconstruction and exit the outer loop on
1579 * success.
1580 */
1581 code = vdev_raidz_reconstruct(rm, tgts, n);
1582 if (zio_checksum_error(zio) == 0) {
1583 atomic_inc_64(&raidz_corrected[code]);
1584
1585 for (i = 0; i < n; i++) {
1586 c = tgts[i];
1587 rc = &rm->rm_col[c];
1588 ASSERT(rc->rc_error == 0);
1589 if (rc->rc_tried)
1590 raidz_checksum_error(zio, rc);
1591 rc->rc_error = ECKSUM;
1592 }
1593
1594 ret = code;
1595 goto done;
1596 }
1597
1598 /*
1599 * Restore the original data.
1600 */
1601 for (i = 0; i < n; i++) {
1602 c = tgts[i];
1603 rc = &rm->rm_col[c];
1604 bcopy(orig[i], rc->rc_data, rc->rc_size);
1605 }
1606
1607 do {
1608 /*
1609 * Find the next valid column after the current
1610 * position..
1611 */
1612 for (next = tgts[current] + 1;
1613 next < rm->rm_cols &&
1614 rm->rm_col[next].rc_error != 0; next++)
1615 continue;
1616
1617 ASSERT(next <= tgts[current + 1]);
1618
1619 /*
1620 * If that spot is available, we're done here.
1621 */
1622 if (next != tgts[current + 1])
1623 break;
1624
1625 /*
1626 * Otherwise, find the next valid column after
1627 * the previous position.
1628 */
1629 for (c = tgts[current - 1] + 1;
1630 rm->rm_col[c].rc_error != 0; c++)
1631 continue;
1632
1633 tgts[current] = c;
1634 current++;
1635
1636 } while (current != n);
1637 }
1638 }
1639 n--;
1640done:
1641 for (i = 0; i < n; i++) {
1642 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
1643 }
1644
1645 return (ret);
1646}
1647
b128c09f 1648static void
34dc7c2f
BB
1649vdev_raidz_io_done(zio_t *zio)
1650{
1651 vdev_t *vd = zio->io_vd;
1652 vdev_t *cvd;
1653 raidz_map_t *rm = zio->io_vsd;
45d1cae3 1654 raidz_col_t *rc;
34dc7c2f
BB
1655 int unexpected_errors = 0;
1656 int parity_errors = 0;
1657 int parity_untried = 0;
1658 int data_errors = 0;
b128c09f 1659 int total_errors = 0;
45d1cae3
BB
1660 int n, c;
1661 int tgts[VDEV_RAIDZ_MAXPARITY];
1662 int code;
34dc7c2f
BB
1663
1664 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
1665
34dc7c2f
BB
1666 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1667 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1668
1669 for (c = 0; c < rm->rm_cols; c++) {
1670 rc = &rm->rm_col[c];
1671
34dc7c2f 1672 if (rc->rc_error) {
b128c09f 1673 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
34dc7c2f
BB
1674
1675 if (c < rm->rm_firstdatacol)
1676 parity_errors++;
1677 else
1678 data_errors++;
1679
1680 if (!rc->rc_skipped)
1681 unexpected_errors++;
1682
b128c09f 1683 total_errors++;
34dc7c2f
BB
1684 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1685 parity_untried++;
1686 }
1687 }
1688
1689 if (zio->io_type == ZIO_TYPE_WRITE) {
1690 /*
b128c09f
BB
1691 * XXX -- for now, treat partial writes as a success.
1692 * (If we couldn't write enough columns to reconstruct
1693 * the data, the I/O failed. Otherwise, good enough.)
1694 *
1695 * Now that we support write reallocation, it would be better
1696 * to treat partial failure as real failure unless there are
1697 * no non-degraded top-level vdevs left, and not update DTLs
1698 * if we intend to reallocate.
34dc7c2f
BB
1699 */
1700 /* XXPOLICY */
b128c09f
BB
1701 if (total_errors > rm->rm_firstdatacol)
1702 zio->io_error = vdev_raidz_worst_error(rm);
34dc7c2f 1703
b128c09f 1704 return;
34dc7c2f
BB
1705 }
1706
1707 ASSERT(zio->io_type == ZIO_TYPE_READ);
1708 /*
1709 * There are three potential phases for a read:
1710 * 1. produce valid data from the columns read
1711 * 2. read all disks and try again
1712 * 3. perform combinatorial reconstruction
1713 *
1714 * Each phase is progressively both more expensive and less likely to
1715 * occur. If we encounter more errors than we can repair or all phases
1716 * fail, we have no choice but to return an error.
1717 */
1718
1719 /*
1720 * If the number of errors we saw was correctable -- less than or equal
1721 * to the number of parity disks read -- attempt to produce data that
1722 * has a valid checksum. Naturally, this case applies in the absence of
1723 * any errors.
1724 */
b128c09f 1725 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
45d1cae3 1726 if (data_errors == 0) {
34dc7c2f 1727 if (zio_checksum_error(zio) == 0) {
34dc7c2f
BB
1728 /*
1729 * If we read parity information (unnecessarily
1730 * as it happens since no reconstruction was
1731 * needed) regenerate and verify the parity.
1732 * We also regenerate parity when resilvering
1733 * so we can write it out to the failed device
1734 * later.
1735 */
1736 if (parity_errors + parity_untried <
1737 rm->rm_firstdatacol ||
1738 (zio->io_flags & ZIO_FLAG_RESILVER)) {
1739 n = raidz_parity_verify(zio, rm);
1740 unexpected_errors += n;
1741 ASSERT(parity_errors + n <=
1742 rm->rm_firstdatacol);
1743 }
1744 goto done;
1745 }
45d1cae3 1746 } else {
34dc7c2f
BB
1747 /*
1748 * We either attempt to read all the parity columns or
1749 * none of them. If we didn't try to read parity, we
1750 * wouldn't be here in the correctable case. There must
1751 * also have been fewer parity errors than parity
1752 * columns or, again, we wouldn't be in this code path.
1753 */
1754 ASSERT(parity_untried == 0);
1755 ASSERT(parity_errors < rm->rm_firstdatacol);
1756
1757 /*
45d1cae3 1758 * Identify the data columns that reported an error.
34dc7c2f 1759 */
45d1cae3 1760 n = 0;
34dc7c2f
BB
1761 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1762 rc = &rm->rm_col[c];
45d1cae3
BB
1763 if (rc->rc_error != 0) {
1764 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1765 tgts[n++] = c;
1766 }
34dc7c2f 1767 }
34dc7c2f 1768
45d1cae3
BB
1769 ASSERT(rm->rm_firstdatacol >= n);
1770
1771 code = vdev_raidz_reconstruct(rm, tgts, n);
34dc7c2f
BB
1772
1773 if (zio_checksum_error(zio) == 0) {
45d1cae3 1774 atomic_inc_64(&raidz_corrected[code]);
34dc7c2f
BB
1775
1776 /*
45d1cae3
BB
1777 * If we read more parity disks than were used
1778 * for reconstruction, confirm that the other
1779 * parity disks produced correct data. This
1780 * routine is suboptimal in that it regenerates
1781 * the parity that we already used in addition
1782 * to the parity that we're attempting to
1783 * verify, but this should be a relatively
1784 * uncommon case, and can be optimized if it
1785 * becomes a problem. Note that we regenerate
1786 * parity when resilvering so we can write it
1787 * out to failed devices later.
34dc7c2f 1788 */
45d1cae3 1789 if (parity_errors < rm->rm_firstdatacol - n ||
34dc7c2f
BB
1790 (zio->io_flags & ZIO_FLAG_RESILVER)) {
1791 n = raidz_parity_verify(zio, rm);
1792 unexpected_errors += n;
1793 ASSERT(parity_errors + n <=
1794 rm->rm_firstdatacol);
1795 }
1796
1797 goto done;
1798 }
34dc7c2f
BB
1799 }
1800 }
1801
1802 /*
1803 * This isn't a typical situation -- either we got a read error or
1804 * a child silently returned bad data. Read every block so we can
1805 * try again with as much data and parity as we can track down. If
1806 * we've already been through once before, all children will be marked
1807 * as tried so we'll proceed to combinatorial reconstruction.
1808 */
1809 unexpected_errors = 1;
1810 rm->rm_missingdata = 0;
1811 rm->rm_missingparity = 0;
1812
1813 for (c = 0; c < rm->rm_cols; c++) {
1814 if (rm->rm_col[c].rc_tried)
1815 continue;
1816
34dc7c2f
BB
1817 zio_vdev_io_redone(zio);
1818 do {
1819 rc = &rm->rm_col[c];
1820 if (rc->rc_tried)
1821 continue;
1822 zio_nowait(zio_vdev_child_io(zio, NULL,
1823 vd->vdev_child[rc->rc_devidx],
1824 rc->rc_offset, rc->rc_data, rc->rc_size,
b128c09f 1825 zio->io_type, zio->io_priority, 0,
34dc7c2f
BB
1826 vdev_raidz_child_done, rc));
1827 } while (++c < rm->rm_cols);
34dc7c2f 1828
b128c09f 1829 return;
34dc7c2f
BB
1830 }
1831
1832 /*
1833 * At this point we've attempted to reconstruct the data given the
1834 * errors we detected, and we've attempted to read all columns. There
1835 * must, therefore, be one or more additional problems -- silent errors
1836 * resulting in invalid data rather than explicit I/O errors resulting
45d1cae3
BB
1837 * in absent data. We check if there is enough additional data to
1838 * possibly reconstruct the data and then perform combinatorial
1839 * reconstruction over all possible combinations. If that fails,
1840 * we're cooked.
34dc7c2f 1841 */
b128c09f
BB
1842 if (total_errors >= rm->rm_firstdatacol) {
1843 zio->io_error = vdev_raidz_worst_error(rm);
1844 /*
1845 * If there were exactly as many device errors as parity
1846 * columns, yet we couldn't reconstruct the data, then at
1847 * least one device must have returned bad data silently.
1848 */
1849 if (total_errors == rm->rm_firstdatacol)
1850 zio->io_error = zio_worst_error(zio->io_error, ECKSUM);
34dc7c2f 1851
45d1cae3
BB
1852 } else if ((code = vdev_raidz_combrec(zio, total_errors,
1853 data_errors)) != 0) {
34dc7c2f 1854 /*
45d1cae3
BB
1855 * If we didn't use all the available parity for the
1856 * combinatorial reconstruction, verify that the remaining
1857 * parity is correct.
34dc7c2f 1858 */
45d1cae3
BB
1859 if (code != (1 << rm->rm_firstdatacol) - 1)
1860 (void) raidz_parity_verify(zio, rm);
1861 } else {
34dc7c2f 1862 /*
45d1cae3
BB
1863 * All combinations failed to checksum. Generate checksum
1864 * ereports for all children.
34dc7c2f 1865 */
45d1cae3 1866 zio->io_error = ECKSUM;
34dc7c2f 1867
45d1cae3
BB
1868 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1869 for (c = 0; c < rm->rm_cols; c++) {
1870 rc = &rm->rm_col[c];
1871 zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM,
1872 zio->io_spa, vd->vdev_child[rc->rc_devidx],
1873 zio, rc->rc_offset, rc->rc_size);
34dc7c2f 1874 }
34dc7c2f
BB
1875 }
1876 }
1877
1878done:
1879 zio_checksum_verified(zio);
1880
fb5f0bc8 1881 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
34dc7c2f 1882 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
34dc7c2f
BB
1883 /*
1884 * Use the good data we have in hand to repair damaged children.
34dc7c2f 1885 */
34dc7c2f
BB
1886 for (c = 0; c < rm->rm_cols; c++) {
1887 rc = &rm->rm_col[c];
1888 cvd = vd->vdev_child[rc->rc_devidx];
1889
1890 if (rc->rc_error == 0)
1891 continue;
1892
b128c09f 1893 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
34dc7c2f
BB
1894 rc->rc_offset, rc->rc_data, rc->rc_size,
1895 ZIO_TYPE_WRITE, zio->io_priority,
fb5f0bc8
BB
1896 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
1897 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
34dc7c2f 1898 }
34dc7c2f 1899 }
34dc7c2f
BB
1900}
1901
1902static void
1903vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
1904{
1905 if (faulted > vd->vdev_nparity)
1906 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
1907 VDEV_AUX_NO_REPLICAS);
1908 else if (degraded + faulted != 0)
1909 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
1910 else
1911 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
1912}
1913
1914vdev_ops_t vdev_raidz_ops = {
1915 vdev_raidz_open,
1916 vdev_raidz_close,
34dc7c2f
BB
1917 vdev_raidz_asize,
1918 vdev_raidz_io_start,
1919 vdev_raidz_io_done,
1920 vdev_raidz_state_change,
1921 VDEV_TYPE_RAIDZ, /* name of this vdev type */
1922 B_FALSE /* not a leaf vdev */
1923};