1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 */
26
27#include <sys/zfs_context.h>
28#include <sys/spa.h>
29#include <sys/vdev_impl.h>
30#include <sys/zio.h>
31#include <sys/zio_checksum.h>
32#include <sys/fs/zfs.h>
33#include <sys/fm/fs/zfs.h>
34
35/*
36 * Virtual device vector for RAID-Z.
37 *
38 * This vdev supports single, double, and triple parity. For single parity,
39 * we use a simple XOR of all the data columns. For double or triple parity,
40 * we use a special case of Reed-Solomon coding. This extends the
41 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
42 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
43 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
44 * former is also based. The latter is designed to provide higher performance
45 * for writes.
46 *
47 * Note that the Plank paper claimed to support arbitrary N+M, but was then
48 * amended six years later identifying a critical flaw that invalidates its
49 * claims. Nevertheless, the technique can be adapted to work for up to
50 * triple parity. For additional parity, the amendment "Note: Correction to
51 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
52 * is viable, but the additional complexity means that write performance will
53 * suffer.
54 *
55 * All of the methods above operate on a Galois field with 2^N elements,
56 * GF(2^N). In our case we choose N=8 for GF(2^8) so that all elements
57 * can be expressed with a single byte. Briefly, the operations on the
58 * field are defined as follows:
59 *
60 * o addition (+) is represented by a bitwise XOR
61 * o subtraction (-) is therefore identical to addition: A + B = A - B
62 * o multiplication of A by 2 is defined by the following bitwise expression:
63 *
64 * (A * 2)_7 = A_6
65 * (A * 2)_6 = A_5
66 * (A * 2)_5 = A_4
67 * (A * 2)_4 = A_3 + A_7
68 * (A * 2)_3 = A_2 + A_7
69 * (A * 2)_2 = A_1 + A_7
70 * (A * 2)_1 = A_0
71 * (A * 2)_0 = A_7
72 *
73 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
74 * As an aside, this multiplication is derived from the error correcting
75 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
76 *
77 * Observe that any number in the field (except for 0) can be expressed as a
78 * power of 2 -- a generator for the field. We store a table of the powers of
79 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
80 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
81 * than field addition). The inverse of a field element A (A^-1) is therefore
82 * A ^ (255 - 1) = A^254.
83 *
84 * The up-to-three parity columns, P, Q, R over several data columns,
85 * D_0, ... D_n-1, can be expressed by field operations:
86 *
87 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
88 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
89 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
90 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
91 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
92 *
93 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
94 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
95 * independent coefficients. (There are no additional coefficients that have
96 * this property which is why the uncorrected Plank method breaks down.)
97 *
98 * See the reconstruction code below for how P, Q and R can be used individually
99 * or in concert to recover missing data columns.
100 */
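/*
 * A quick worked example of the arithmetic above (illustrative only, not
 * used by the code below): with the 0x1d reduction,
 *     0x80 * 2 = ((0x80 << 1) & 0xff) ^ 0x1d = 0x00 ^ 0x1d = 0x1d
 *     0x01 * 2 = 0x02, 0x01 * 4 = 0x04
 * and for two data bytes D_0 = 0x01, D_1 = 0x02 (n = 2):
 *     P = D_0 + D_1 = 0x01 ^ 0x02 = 0x03
 *     Q = 2 * D_0 + D_1 = 0x02 ^ 0x02 = 0x00
 *     R = 4 * D_0 + D_1 = 0x04 ^ 0x02 = 0x06
 */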
101
102typedef struct raidz_col {
103 uint64_t rc_devidx; /* child device index for I/O */
104 uint64_t rc_offset; /* device offset */
105 uint64_t rc_size; /* I/O size */
106 void *rc_data; /* I/O data */
107 void *rc_gdata; /* used to store the "good" version */
108 int rc_error; /* I/O error for this device */
109 uint8_t rc_tried; /* Did we attempt this I/O column? */
110 uint8_t rc_skipped; /* Did we skip this I/O column? */
111} raidz_col_t;
112
113typedef struct raidz_map {
114 uint64_t rm_cols; /* Regular column count */
115 uint64_t rm_scols; /* Count including skipped columns */
116 uint64_t rm_bigcols; /* Number of oversized columns */
117 uint64_t rm_asize; /* Actual total I/O size */
118 uint64_t rm_missingdata; /* Count of missing data devices */
119 uint64_t rm_missingparity; /* Count of missing parity devices */
120 uint64_t rm_firstdatacol; /* First data column/parity count */
121 uint64_t rm_nskip; /* Skipped sectors for padding */
122 uint64_t rm_skipstart; /* Column index of padding start */
123 void *rm_datacopy; /* rm_asize-buffer of copied data */
124 uintptr_t rm_reports; /* # of referencing checksum reports */
125 uint8_t rm_freed; /* map no longer has referencing ZIO */
126 uint8_t rm_ecksuminjected; /* checksum error was injected */
127 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
128} raidz_map_t;
129
130#define VDEV_RAIDZ_P 0
131#define VDEV_RAIDZ_Q 1
132#define VDEV_RAIDZ_R 2
133
134#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
135#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
136
137/*
138 * We provide a mechanism to perform the field multiplication operation on a
139 * 64-bit value all at once rather than a byte at a time. This works by
140 * creating a mask from the top bit in each byte and using that to
141 * conditionally apply the XOR of 0x1d.
142 */
143#define VDEV_RAIDZ_64MUL_2(x, mask) \
144{ \
145 (mask) = (x) & 0x8080808080808080ULL; \
146 (mask) = ((mask) << 1) - ((mask) >> 7); \
147 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
148 ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
149}
150
151#define VDEV_RAIDZ_64MUL_4(x, mask) \
152{ \
153 VDEV_RAIDZ_64MUL_2((x), mask); \
154 VDEV_RAIDZ_64MUL_2((x), mask); \
155}
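/*
 * Worked example (illustrative values only): applying VDEV_RAIDZ_64MUL_2 to
 * a word whose two low byte lanes hold 0x80 and 0x01,
 *
 *	uint64_t x = 0x0000000000008001ULL, mask;
 *	VDEV_RAIDZ_64MUL_2(x, mask);
 *
 * yields x == 0x0000000000001d02ULL: the 0x80 lane wraps to 0x1d (the XOR of
 * 0x1d selected by its top bit) while the 0x01 lane simply shifts to 0x02,
 * with no carry leaking between lanes.
 */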
156
157/*
158 * Force reconstruction to use the general purpose method.
159 */
160int vdev_raidz_default_to_general;
161
162/* Powers of 2 in the Galois field defined above. */
163static const uint8_t vdev_raidz_pow2[256] = {
164 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
165 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
166 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
167 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
168 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
169 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
170 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
171 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
172 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
173 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
174 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
175 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
176 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
177 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
178 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
179 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
180 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
181 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
182 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
183 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
184 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
185 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
186 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
187 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
188 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
189 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
190 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
191 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
192 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
193 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
194 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
195 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
196};
197/* Logs of 2 in the Galois field defined above. */
198static const uint8_t vdev_raidz_log2[256] = {
199 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
200 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
201 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
202 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
203 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
204 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
205 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
206 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
207 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
208 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
209 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
210 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
211 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
212 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
213 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
214 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
215 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
216 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
217 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
218 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
219 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
220 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
221 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
222 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
223 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
224 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
225 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
226 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
227 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
228 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
229 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
230 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
231};
232
233static void vdev_raidz_generate_parity(raidz_map_t *rm);
234
235/*
236 * Multiply a given number by 2 raised to the given power.
237 */
238static uint8_t
239vdev_raidz_exp2(uint_t a, int exp)
240{
241 if (a == 0)
242 return (0);
243
244 ASSERT(exp >= 0);
245 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
246
247 exp += vdev_raidz_log2[a];
248 if (exp > 255)
249 exp -= 255;
250
251 return (vdev_raidz_pow2[exp]);
252}
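/*
 * For instance (illustrative only), vdev_raidz_exp2(0x02, 7) looks up
 * vdev_raidz_log2[0x02] == 1, adds the exponent to get 8, and returns
 * vdev_raidz_pow2[8] == 0x1d, i.e. 2 * 2^7 = 2^8 = 0x1d in this field.
 * The same tables give a general multiply as
 * vdev_raidz_pow2[(vdev_raidz_log2[a] + vdev_raidz_log2[b]) % 255] for
 * nonzero a and b.
 */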
253
254static void
255vdev_raidz_map_free(raidz_map_t *rm)
256{
257 int c;
258 size_t size;
259
260 for (c = 0; c < rm->rm_firstdatacol; c++) {
261 zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
262
263 if (rm->rm_col[c].rc_gdata != NULL)
264 zio_buf_free(rm->rm_col[c].rc_gdata,
265 rm->rm_col[c].rc_size);
266 }
267
268 size = 0;
269 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
270 size += rm->rm_col[c].rc_size;
271
272 if (rm->rm_datacopy != NULL)
273 zio_buf_free(rm->rm_datacopy, size);
274
275 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
276}
277
278static void
279vdev_raidz_map_free_vsd(zio_t *zio)
280{
281 raidz_map_t *rm = zio->io_vsd;
282
283 ASSERT0(rm->rm_freed);
284 rm->rm_freed = 1;
285
286 if (rm->rm_reports == 0)
287 vdev_raidz_map_free(rm);
288}
289
290/*ARGSUSED*/
291static void
292vdev_raidz_cksum_free(void *arg, size_t ignored)
293{
294 raidz_map_t *rm = arg;
295
296 ASSERT3U(rm->rm_reports, >, 0);
297
298 if (--rm->rm_reports == 0 && rm->rm_freed != 0)
299 vdev_raidz_map_free(rm);
300}
301
302static void
303vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
304{
305 raidz_map_t *rm = zcr->zcr_cbdata;
306 size_t c = zcr->zcr_cbinfo;
307 size_t x;
308
309 const char *good = NULL;
310 const char *bad = rm->rm_col[c].rc_data;
311
312 if (good_data == NULL) {
313 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
314 return;
315 }
316
317 if (c < rm->rm_firstdatacol) {
318 /*
319 * The first time through, calculate the parity blocks for
320 * the good data (this relies on the fact that the good
321 * data never changes for a given logical ZIO)
322 */
323 if (rm->rm_col[0].rc_gdata == NULL) {
324 char *bad_parity[VDEV_RAIDZ_MAXPARITY];
325 char *buf;
326
327 /*
328 * Set up the rm_col[]s to generate the parity for
329 * good_data, first saving the parity bufs and
330 * replacing them with buffers to hold the result.
331 */
332 for (x = 0; x < rm->rm_firstdatacol; x++) {
333 bad_parity[x] = rm->rm_col[x].rc_data;
334 rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
335 zio_buf_alloc(rm->rm_col[x].rc_size);
336 }
337
338 /* fill in the data columns from good_data */
339 buf = (char *)good_data;
340 for (; x < rm->rm_cols; x++) {
341 rm->rm_col[x].rc_data = buf;
342 buf += rm->rm_col[x].rc_size;
343 }
344
345 /*
346 * Construct the parity from the good data.
347 */
348 vdev_raidz_generate_parity(rm);
349
350 /* restore everything back to its original state */
351 for (x = 0; x < rm->rm_firstdatacol; x++)
352 rm->rm_col[x].rc_data = bad_parity[x];
353
354 buf = rm->rm_datacopy;
355 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
356 rm->rm_col[x].rc_data = buf;
357 buf += rm->rm_col[x].rc_size;
358 }
359 }
360
361 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
362 good = rm->rm_col[c].rc_gdata;
363 } else {
364 /* adjust good_data to point at the start of our column */
365 good = good_data;
366
367 for (x = rm->rm_firstdatacol; x < c; x++)
368 good += rm->rm_col[x].rc_size;
369 }
370
371 /* we drop the ereport if it ends up that the data was good */
372 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
373}
374
375/*
376 * Invoked indirectly by zfs_ereport_start_checksum(), called
377 * below when our read operation fails completely. The main point
378 * is to keep a copy of everything we read from disk, so that at
379 * vdev_raidz_cksum_finish() time we can compare it with the good data.
380 */
381static void
382vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
383{
384 size_t c = (size_t)(uintptr_t)arg;
385 caddr_t buf;
386
387 raidz_map_t *rm = zio->io_vsd;
388 size_t size;
389
390 /* set up the report and bump the refcount */
391 zcr->zcr_cbdata = rm;
392 zcr->zcr_cbinfo = c;
393 zcr->zcr_finish = vdev_raidz_cksum_finish;
394 zcr->zcr_free = vdev_raidz_cksum_free;
395
396 rm->rm_reports++;
397 ASSERT3U(rm->rm_reports, >, 0);
398
399 if (rm->rm_datacopy != NULL)
400 return;
401
402 /*
403 * It's the first time we're called for this raidz_map_t, so we need
404 * to copy the data aside; there's no guarantee that our zio's buffer
405 * won't be re-used for something else.
406 *
407 * Our parity data is already in separate buffers, so there's no need
408 * to copy them.
409 */
410
411 size = 0;
412 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
413 size += rm->rm_col[c].rc_size;
414
415 buf = rm->rm_datacopy = zio_buf_alloc(size);
416
417 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
418 raidz_col_t *col = &rm->rm_col[c];
419
420 bcopy(col->rc_data, buf, col->rc_size);
421 col->rc_data = buf;
422
423 buf += col->rc_size;
424 }
425 ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
426}
427
428static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
429 vdev_raidz_map_free_vsd,
430 vdev_raidz_cksum_report
431};
432
433/*
434 * Divides the IO evenly across all child vdevs; usually, dcols is
435 * the number of children in the target vdev.
436 */
437static raidz_map_t *
438vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
439 uint64_t nparity)
440{
441 raidz_map_t *rm;
442 /* The starting RAIDZ (parent) vdev sector of the block. */
443 uint64_t b = zio->io_offset >> unit_shift;
444 /* The zio's size in units of the vdev's minimum sector size. */
445 uint64_t s = zio->io_size >> unit_shift;
446 /* The first column for this stripe. */
447 uint64_t f = b % dcols;
448 /* The starting byte offset on each child vdev. */
449 uint64_t o = (b / dcols) << unit_shift;
450 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
451
452 /*
453 * "Quotient": The number of data sectors for this stripe on all but
454 * the "big column" child vdevs that also contain "remainder" data.
455 */
456 q = s / (dcols - nparity);
457
458 /*
459 * "Remainder": The number of partial stripe data sectors in this I/O.
460 * This will add a sector to some, but not all, child vdevs.
461 */
462 r = s - q * (dcols - nparity);
463
464 /* The number of "big columns" - those which contain remainder data. */
465 bc = (r == 0 ? 0 : r + nparity);
466
467 /*
468 * The total number of data and parity sectors associated with
469 * this I/O.
470 */
471 tot = s + nparity * (q + (r == 0 ? 0 : 1));
472
473 /* acols: The columns that will be accessed. */
474 /* scols: The columns that will be accessed or skipped. */
475 if (q == 0) {
476 /* Our I/O request doesn't span all child vdevs. */
477 acols = bc;
478 scols = MIN(dcols, roundup(bc, nparity + 1));
479 } else {
480 acols = dcols;
481 scols = dcols;
482 }
483
484 ASSERT3U(acols, <=, scols);
485
486 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_PUSHPAGE);
487
488 rm->rm_cols = acols;
489 rm->rm_scols = scols;
490 rm->rm_bigcols = bc;
491 rm->rm_skipstart = bc;
492 rm->rm_missingdata = 0;
493 rm->rm_missingparity = 0;
494 rm->rm_firstdatacol = nparity;
495 rm->rm_datacopy = NULL;
496 rm->rm_reports = 0;
497 rm->rm_freed = 0;
498 rm->rm_ecksuminjected = 0;
499
500 asize = 0;
501
502 for (c = 0; c < scols; c++) {
503 col = f + c;
504 coff = o;
505 if (col >= dcols) {
506 col -= dcols;
507 coff += 1ULL << unit_shift;
508 }
509 rm->rm_col[c].rc_devidx = col;
510 rm->rm_col[c].rc_offset = coff;
511 rm->rm_col[c].rc_data = NULL;
512 rm->rm_col[c].rc_gdata = NULL;
513 rm->rm_col[c].rc_error = 0;
514 rm->rm_col[c].rc_tried = 0;
515 rm->rm_col[c].rc_skipped = 0;
516
517 if (c >= acols)
518 rm->rm_col[c].rc_size = 0;
519 else if (c < bc)
520 rm->rm_col[c].rc_size = (q + 1) << unit_shift;
521 else
522 rm->rm_col[c].rc_size = q << unit_shift;
523
524 asize += rm->rm_col[c].rc_size;
525 }
526
527 ASSERT3U(asize, ==, tot << unit_shift);
528 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
529 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
530 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
531 ASSERT3U(rm->rm_nskip, <=, nparity);
532
533 for (c = 0; c < rm->rm_firstdatacol; c++)
534 rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
535
536 rm->rm_col[c].rc_data = zio->io_data;
537
538 for (c = c + 1; c < acols; c++)
539 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
540 rm->rm_col[c - 1].rc_size;
541
542 /*
543 * If all data stored spans all columns, there's a danger that parity
544 * will always be on the same device and, since parity isn't read
545 * during normal operation, that that device's I/O bandwidth won't be
546 * used effectively. We therefore switch the parity every 1MB.
547 *
548 * ... at least that was, ostensibly, the theory. As a practical
549 * matter unless we juggle the parity between all devices evenly, we
550 * won't see any benefit. Further, occasional writes that aren't a
551 * multiple of the LCM of the number of children and the minimum
552 * stripe width are sufficient to avoid pessimal behavior.
553 * Unfortunately, this decision created an implicit on-disk format
554 * requirement that we need to support for all eternity, but only
555 * for single-parity RAID-Z.
556 *
557 * If we intend to skip a sector in the zeroth column for padding
558 * we must make sure to note this swap. We will never intend to
559 * skip the first column since at least one data and one parity
560 * column must appear in each row.
561 */
562 ASSERT(rm->rm_cols >= 2);
563 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
564
565 if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
566 devidx = rm->rm_col[0].rc_devidx;
567 o = rm->rm_col[0].rc_offset;
568 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
569 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
570 rm->rm_col[1].rc_devidx = devidx;
571 rm->rm_col[1].rc_offset = o;
572
573 if (rm->rm_skipstart == 0)
574 rm->rm_skipstart = 1;
575 }
576
577 zio->io_vsd = rm;
578 zio->io_vsd_ops = &vdev_raidz_vsd_ops;
579 return (rm);
580}
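/*
 * Worked example of the map geometry above (assumed, illustrative values):
 * for a single-parity group of dcols = 4 children with unit_shift = 9 and
 * an 8-sector (4K) write, q = 8 / 3 = 2, r = 2 and bc = 3, so tot = 11.
 * Column 0 holds parity, columns 0-2 are "big" columns of q + 1 = 3 sectors
 * and column 3 gets q = 2, giving asize = 11 sectors, rm_asize rounded up
 * to 12 sectors, rm_nskip = 1 and rm_skipstart = 3.
 */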
581
582static void
583vdev_raidz_generate_parity_p(raidz_map_t *rm)
584{
585 uint64_t *p, *src, pcount, ccount, i;
586 int c;
587
588 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
589
590 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
591 src = rm->rm_col[c].rc_data;
592 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
593 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
594
595 if (c == rm->rm_firstdatacol) {
596 ASSERT(ccount == pcount);
597 for (i = 0; i < ccount; i++, src++, p++) {
598 *p = *src;
599 }
600 } else {
601 ASSERT(ccount <= pcount);
602 for (i = 0; i < ccount; i++, src++, p++) {
603 *p ^= *src;
604 }
605 }
606 }
607}
608
609static void
610vdev_raidz_generate_parity_pq(raidz_map_t *rm)
611{
612 uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
613 int c;
614
615 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
616 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
617 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
618
619 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
620 src = rm->rm_col[c].rc_data;
621 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
622 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
623
624 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
625
626 if (c == rm->rm_firstdatacol) {
627 ASSERT(ccnt == pcnt || ccnt == 0);
628 for (i = 0; i < ccnt; i++, src++, p++, q++) {
629 *p = *src;
630 *q = *src;
631 }
632 for (; i < pcnt; i++, src++, p++, q++) {
633 *p = 0;
634 *q = 0;
635 }
636 } else {
637 ASSERT(ccnt <= pcnt);
638
639 /*
640 * Apply the algorithm described above by multiplying
641 * the previous result and adding in the new value.
642 */
643 for (i = 0; i < ccnt; i++, src++, p++, q++) {
644 *p ^= *src;
645
646 VDEV_RAIDZ_64MUL_2(*q, mask);
647 *q ^= *src;
648 }
649
650 /*
651 * Treat short columns as though they are full of 0s.
652 * Note that there's therefore nothing needed for P.
653 */
654 for (; i < pcnt; i++, q++) {
655 VDEV_RAIDZ_64MUL_2(*q, mask);
656 }
657 }
658 }
659}
660
661static void
662vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
663{
664 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
665 int c;
666
667 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
668 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
669 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
670 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
671 rm->rm_col[VDEV_RAIDZ_R].rc_size);
672
673 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
674 src = rm->rm_col[c].rc_data;
675 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
676 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
677 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
678
679 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
680
681 if (c == rm->rm_firstdatacol) {
682 ASSERT(ccnt == pcnt || ccnt == 0);
683 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
684 *p = *src;
685 *q = *src;
686 *r = *src;
687 }
688 for (; i < pcnt; i++, src++, p++, q++, r++) {
689 *p = 0;
690 *q = 0;
691 *r = 0;
692 }
693 } else {
694 ASSERT(ccnt <= pcnt);
695
696 /*
697 * Apply the algorithm described above by multiplying
698 * the previous result and adding in the new value.
699 */
700 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
701 *p ^= *src;
702
703 VDEV_RAIDZ_64MUL_2(*q, mask);
704 *q ^= *src;
705
706 VDEV_RAIDZ_64MUL_4(*r, mask);
707 *r ^= *src;
708 }
709
710 /*
711 * Treat short columns as though they are full of 0s.
712 * Note that there's therefore nothing needed for P.
713 */
714 for (; i < pcnt; i++, q++, r++) {
715 VDEV_RAIDZ_64MUL_2(*q, mask);
716 VDEV_RAIDZ_64MUL_4(*r, mask);
717 }
718 }
719 }
720}
721
722/*
723 * Generate RAID parity in the first virtual columns according to the number of
724 * parity columns available.
725 */
726static void
727vdev_raidz_generate_parity(raidz_map_t *rm)
728{
729 switch (rm->rm_firstdatacol) {
730 case 1:
731 vdev_raidz_generate_parity_p(rm);
732 break;
733 case 2:
734 vdev_raidz_generate_parity_pq(rm);
735 break;
736 case 3:
737 vdev_raidz_generate_parity_pqr(rm);
738 break;
739 default:
740 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
741 }
742}
743
744static int
745vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
746{
747 uint64_t *dst, *src, xcount, ccount, count, i;
748 int x = tgts[0];
749 int c;
750
751 ASSERT(ntgts == 1);
752 ASSERT(x >= rm->rm_firstdatacol);
753 ASSERT(x < rm->rm_cols);
754
755 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
756 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
757 ASSERT(xcount > 0);
758
759 src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
760 dst = rm->rm_col[x].rc_data;
761 for (i = 0; i < xcount; i++, dst++, src++) {
762 *dst = *src;
763 }
764
765 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
766 src = rm->rm_col[c].rc_data;
767 dst = rm->rm_col[x].rc_data;
768
769 if (c == x)
770 continue;
771
772 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
773 count = MIN(ccount, xcount);
774
775 for (i = 0; i < count; i++, dst++, src++) {
776 *dst ^= *src;
777 }
778 }
779
780 return (1 << VDEV_RAIDZ_P);
781}
782
783static int
784vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
785{
786 uint64_t *dst, *src, xcount, ccount, count, mask, i;
787 uint8_t *b;
788 int x = tgts[0];
789 int c, j, exp;
790
791 ASSERT(ntgts == 1);
792
793 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
794 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
795
796 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
797 src = rm->rm_col[c].rc_data;
798 dst = rm->rm_col[x].rc_data;
799
800 if (c == x)
801 ccount = 0;
802 else
803 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
804
805 count = MIN(ccount, xcount);
806
807 if (c == rm->rm_firstdatacol) {
808 for (i = 0; i < count; i++, dst++, src++) {
809 *dst = *src;
810 }
811 for (; i < xcount; i++, dst++) {
812 *dst = 0;
813 }
814
815 } else {
816 for (i = 0; i < count; i++, dst++, src++) {
817 VDEV_RAIDZ_64MUL_2(*dst, mask);
818 *dst ^= *src;
819 }
820
821 for (; i < xcount; i++, dst++) {
822 VDEV_RAIDZ_64MUL_2(*dst, mask);
823 }
824 }
825 }
826
827 src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
828 dst = rm->rm_col[x].rc_data;
829 exp = 255 - (rm->rm_cols - 1 - x);
830
831 for (i = 0; i < xcount; i++, dst++, src++) {
832 *dst ^= *src;
833 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
834 *b = vdev_raidz_exp2(*b, exp);
835 }
836 }
837
838 return (1 << VDEV_RAIDZ_Q);
839}
840
841static int
842vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
843{
844 uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
845 void *pdata, *qdata;
846 uint64_t xsize, ysize, i;
847 int x = tgts[0];
848 int y = tgts[1];
849
850 ASSERT(ntgts == 2);
851 ASSERT(x < y);
852 ASSERT(x >= rm->rm_firstdatacol);
853 ASSERT(y < rm->rm_cols);
854
855 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
856
857 /*
858 * Move the parity data aside -- we're going to compute parity as
859 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
860 * reuse the parity generation mechanism without trashing the actual
861 * parity so we make those columns appear to be full of zeros by
862 * setting their lengths to zero.
863 */
864 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
865 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
866 xsize = rm->rm_col[x].rc_size;
867 ysize = rm->rm_col[y].rc_size;
868
869 rm->rm_col[VDEV_RAIDZ_P].rc_data =
870 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
871 rm->rm_col[VDEV_RAIDZ_Q].rc_data =
872 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
873 rm->rm_col[x].rc_size = 0;
874 rm->rm_col[y].rc_size = 0;
875
876 vdev_raidz_generate_parity_pq(rm);
877
878 rm->rm_col[x].rc_size = xsize;
879 rm->rm_col[y].rc_size = ysize;
880
881 p = pdata;
882 q = qdata;
883 pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
884 qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
885 xd = rm->rm_col[x].rc_data;
886 yd = rm->rm_col[y].rc_data;
887
888 /*
889 * We now have:
890 * Pxy = P + D_x + D_y
891 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
892 *
893 * We can then solve for D_x:
894 * D_x = A * (P + Pxy) + B * (Q + Qxy)
895 * where
896 * A = 2^(x - y) * (2^(x - y) + 1)^-1
897 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
898 *
899 * With D_x in hand, we can easily solve for D_y:
900 * D_y = P + Pxy + D_x
901 */
902
903 a = vdev_raidz_pow2[255 + x - y];
904 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
905 tmp = 255 - vdev_raidz_log2[a ^ 1];
906
907 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
908 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
909
910 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
911 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
912 vdev_raidz_exp2(*q ^ *qxy, bexp);
913
914 if (i < ysize)
915 *yd = *p ^ *pxy ^ *xd;
916 }
917
918 zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
919 rm->rm_col[VDEV_RAIDZ_P].rc_size);
920 zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
921 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
922
923 /*
924 * Restore the saved parity data.
925 */
926 rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
927 rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
928
929 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
930}
931
932/* BEGIN CSTYLED */
933/*
934 * In the general case of reconstruction, we must solve the system of linear
935 * equations defined by the coefficients used to generate parity as well as
936 * the contents of the data and parity disks. This can be expressed with
937 * vectors for the original data (D) and the actual data (d) and parity (p)
938 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
939 *
940 * __ __ __ __
941 * | | __ __ | p_0 |
942 * | V | | D_0 | | p_m-1 |
943 * | | x | : | = | d_0 |
944 * | I | | D_n-1 | | : |
945 * | | ~~ ~~ | d_n-1 |
946 * ~~ ~~ ~~ ~~
947 *
948 * I is simply a square identity matrix of size n, and V is a Vandermonde
949 * matrix defined by the coefficients we chose for the various parity columns
950 * (1, 2, 4). Note that these values were chosen for simplicity and speed of
951 * computation, as well as for linear separability.
952 *
953 * __ __ __ __
954 * | 1 .. 1 1 1 | | p_0 |
955 * | 2^n-1 .. 4 2 1 | __ __ | : |
956 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
957 * | 1 .. 0 0 0 | | D_1 | | d_0 |
958 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
959 * | : : : : | | : | | d_2 |
960 * | 0 .. 1 0 0 | | D_n-1 | | : |
961 * | 0 .. 0 1 0 | ~~ ~~ | : |
962 * | 0 .. 0 0 1 | | d_n-1 |
963 * ~~ ~~ ~~ ~~
964 *
965 * Note that I, V, d, and p are known. To compute D, we must invert the
966 * matrix and use the known data and parity values to reconstruct the unknown
967 * data values. We begin by removing the rows in V|I and d|p that correspond
968 * to failed or missing columns; we then make V|I square (n x n) and d|p
969 * sized n by removing rows corresponding to unused parity from the bottom up
970 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
971 * using Gauss-Jordan elimination. In the example below we use m=3 parity
972 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
973 * __ __
974 * | 1 1 1 1 1 1 1 1 |
975 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
976 * | 19 205 116 29 64 16 4 1 | / /
977 * | 1 0 0 0 0 0 0 0 | / /
978 * | 0 1 0 0 0 0 0 0 | <--' /
979 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
980 * | 0 0 0 1 0 0 0 0 |
981 * | 0 0 0 0 1 0 0 0 |
982 * | 0 0 0 0 0 1 0 0 |
983 * | 0 0 0 0 0 0 1 0 |
984 * | 0 0 0 0 0 0 0 1 |
985 * ~~ ~~
986 * __ __
987 * | 1 1 1 1 1 1 1 1 |
988 * | 128 64 32 16 8 4 2 1 |
989 * | 19 205 116 29 64 16 4 1 |
990 * | 1 0 0 0 0 0 0 0 |
991 * | 0 1 0 0 0 0 0 0 |
992 * (V|I)' = | 0 0 1 0 0 0 0 0 |
993 * | 0 0 0 1 0 0 0 0 |
994 * | 0 0 0 0 1 0 0 0 |
995 * | 0 0 0 0 0 1 0 0 |
996 * | 0 0 0 0 0 0 1 0 |
997 * | 0 0 0 0 0 0 0 1 |
998 * ~~ ~~
999 *
1000 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1001 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1002 * matrix is not singular.
1003 * __ __
1004 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1005 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1006 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1007 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1008 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1009 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1010 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1011 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1012 * ~~ ~~
1013 * __ __
1014 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1015 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1016 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1017 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1018 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1019 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1020 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1021 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1022 * ~~ ~~
1023 * __ __
1024 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1025 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1026 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1027 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1028 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1029 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1030 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1031 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1032 * ~~ ~~
1033 * __ __
1034 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1035 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1036 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1037 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1038 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1039 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1040 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1041 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1042 * ~~ ~~
1043 * __ __
1044 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1045 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1046 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1047 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1048 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1049 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1050 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1051 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1052 * ~~ ~~
1053 * __ __
1054 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1055 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1056 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1057 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1058 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1059 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1060 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1061 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1062 * ~~ ~~
1063 * __ __
1064 * | 0 0 1 0 0 0 0 0 |
1065 * | 167 100 5 41 159 169 217 208 |
1066 * | 166 100 4 40 158 168 216 209 |
1067 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1068 * | 0 0 0 0 1 0 0 0 |
1069 * | 0 0 0 0 0 1 0 0 |
1070 * | 0 0 0 0 0 0 1 0 |
1071 * | 0 0 0 0 0 0 0 1 |
1072 * ~~ ~~
1073 *
1074 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1075 * of the missing data.
1076 *
1077 * As is apparent from the example above, the only non-trivial rows in the
1078 * inverse matrix correspond to the data disks that we're trying to
1079 * reconstruct. Indeed, those are the only rows we need as the others would
1080 * only be useful for reconstructing data known or assumed to be valid. For
1081 * that reason, we only build the coefficients in the rows that correspond to
1082 * targeted columns.
1083 */
1084/* END CSTYLED */
1085
1086static void
1087vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1088 uint8_t **rows)
1089{
1090 int i, j;
1091 int pow;
1092
1093 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1094
1095 /*
1096 * Fill in the missing rows of interest.
1097 */
1098 for (i = 0; i < nmap; i++) {
1099 ASSERT3S(0, <=, map[i]);
1100 ASSERT3S(map[i], <=, 2);
1101
1102 pow = map[i] * n;
1103 if (pow > 255)
1104 pow -= 255;
1105 ASSERT(pow <= 255);
1106
1107 for (j = 0; j < n; j++) {
1108 pow -= map[i];
1109 if (pow < 0)
1110 pow += 255;
1111 rows[i][j] = vdev_raidz_pow2[pow];
1112 }
1113 }
1114}
1115
1116static void
1117vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1118 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1119{
1120 int i, j, ii, jj;
1121 uint8_t log;
1122
1123 /*
1124 * Assert that the first nmissing entries from the array of used
1125 * columns correspond to parity columns and that subsequent entries
1126 * correspond to data columns.
1127 */
1128 for (i = 0; i < nmissing; i++) {
1129 ASSERT3S(used[i], <, rm->rm_firstdatacol);
1130 }
1131 for (; i < n; i++) {
1132 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1133 }
1134
1135 /*
1136 * First initialize the storage where we'll compute the inverse rows.
1137 */
1138 for (i = 0; i < nmissing; i++) {
1139 for (j = 0; j < n; j++) {
1140 invrows[i][j] = (i == j) ? 1 : 0;
1141 }
1142 }
1143
1144 /*
1145 * Subtract all trivial rows from the rows of consequence.
1146 */
1147 for (i = 0; i < nmissing; i++) {
1148 for (j = nmissing; j < n; j++) {
1149 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1150 jj = used[j] - rm->rm_firstdatacol;
1151 ASSERT3S(jj, <, n);
1152 invrows[i][j] = rows[i][jj];
1153 rows[i][jj] = 0;
1154 }
1155 }
1156
1157 /*
1158 * For each of the rows of interest, we must normalize it and subtract
1159 * a multiple of it from the other rows.
1160 */
1161 for (i = 0; i < nmissing; i++) {
1162 for (j = 0; j < missing[i]; j++) {
1163 ASSERT0(rows[i][j]);
1164 }
1165 ASSERT3U(rows[i][missing[i]], !=, 0);
1166
1167 /*
1168 * Compute the inverse of the first element and multiply each
1169 * element in the row by that value.
1170 */
1171 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1172
1173 for (j = 0; j < n; j++) {
1174 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1175 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1176 }
1177
1178 for (ii = 0; ii < nmissing; ii++) {
1179 if (i == ii)
1180 continue;
1181
1182 ASSERT3U(rows[ii][missing[i]], !=, 0);
1183
1184 log = vdev_raidz_log2[rows[ii][missing[i]]];
1185
1186 for (j = 0; j < n; j++) {
1187 rows[ii][j] ^=
1188 vdev_raidz_exp2(rows[i][j], log);
1189 invrows[ii][j] ^=
1190 vdev_raidz_exp2(invrows[i][j], log);
1191 }
1192 }
1193 }
1194
1195 /*
1196 * Verify that the data that is left in the rows are properly part of
1197 * an identity matrix.
1198 */
1199 for (i = 0; i < nmissing; i++) {
1200 for (j = 0; j < n; j++) {
1201 if (j == missing[i]) {
1202 ASSERT3U(rows[i][j], ==, 1);
1203 } else {
1204 ASSERT0(rows[i][j]);
1205 }
1206 }
1207 }
1208}
1209
1210static void
1211vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1212 int *missing, uint8_t **invrows, const uint8_t *used)
1213{
1214 int i, j, x, cc, c;
1215 uint8_t *src;
1216 uint64_t ccount;
1217 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1218 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1219 uint8_t log = 0;
1220 uint8_t val;
1221 int ll;
1222 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1223 uint8_t *p, *pp;
1224 size_t psize;
1225
1226 psize = sizeof (invlog[0][0]) * n * nmissing;
1227 p = kmem_alloc(psize, KM_PUSHPAGE);
1228
1229 for (pp = p, i = 0; i < nmissing; i++) {
1230 invlog[i] = pp;
1231 pp += n;
1232 }
1233
1234 for (i = 0; i < nmissing; i++) {
1235 for (j = 0; j < n; j++) {
1236 ASSERT3U(invrows[i][j], !=, 0);
1237 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1238 }
1239 }
1240
1241 for (i = 0; i < n; i++) {
1242 c = used[i];
1243 ASSERT3U(c, <, rm->rm_cols);
1244
1245 src = rm->rm_col[c].rc_data;
1246 ccount = rm->rm_col[c].rc_size;
1247 for (j = 0; j < nmissing; j++) {
1248 cc = missing[j] + rm->rm_firstdatacol;
1249 ASSERT3U(cc, >=, rm->rm_firstdatacol);
1250 ASSERT3U(cc, <, rm->rm_cols);
1251 ASSERT3U(cc, !=, c);
1252
1253 dst[j] = rm->rm_col[cc].rc_data;
1254 dcount[j] = rm->rm_col[cc].rc_size;
1255 }
1256
1257 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1258
1259 for (x = 0; x < ccount; x++, src++) {
1260 if (*src != 0)
1261 log = vdev_raidz_log2[*src];
1262
1263 for (cc = 0; cc < nmissing; cc++) {
1264 if (x >= dcount[cc])
1265 continue;
1266
1267 if (*src == 0) {
1268 val = 0;
1269 } else {
1270 if ((ll = log + invlog[cc][i]) >= 255)
1271 ll -= 255;
1272 val = vdev_raidz_pow2[ll];
1273 }
1274
1275 if (i == 0)
1276 dst[cc][x] = val;
1277 else
1278 dst[cc][x] ^= val;
1279 }
1280 }
1281 }
1282
1283 kmem_free(p, psize);
1284}
1285
1286static int
1287vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1288{
1289 int n, i, c, t, tt;
1290 int nmissing_rows;
1291 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1292 int parity_map[VDEV_RAIDZ_MAXPARITY];
1293
1294 uint8_t *p, *pp;
1295 size_t psize;
1296
1297 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1298 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1299 uint8_t *used;
1300
1301 int code = 0;
1302
1303
1304 n = rm->rm_cols - rm->rm_firstdatacol;
1305
1306 /*
1307 * Figure out which data columns are missing.
1308 */
1309 nmissing_rows = 0;
1310 for (t = 0; t < ntgts; t++) {
1311 if (tgts[t] >= rm->rm_firstdatacol) {
1312 missing_rows[nmissing_rows++] =
1313 tgts[t] - rm->rm_firstdatacol;
1314 }
1315 }
1316
1317 /*
1318 * Figure out which parity columns to use to help generate the missing
1319 * data columns.
1320 */
1321 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1322 ASSERT(tt < ntgts);
1323 ASSERT(c < rm->rm_firstdatacol);
1324
1325 /*
1326 * Skip any targeted parity columns.
1327 */
1328 if (c == tgts[tt]) {
1329 tt++;
1330 continue;
1331 }
1332
1333 code |= 1 << c;
1334
1335 parity_map[i] = c;
1336 i++;
1337 }
1338
1339 ASSERT(code != 0);
1340 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1341
1342 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1343 nmissing_rows * n + sizeof (used[0]) * n;
1344 p = kmem_alloc(psize, KM_PUSHPAGE);
1345
1346 for (pp = p, i = 0; i < nmissing_rows; i++) {
1347 rows[i] = pp;
1348 pp += n;
1349 invrows[i] = pp;
1350 pp += n;
1351 }
1352 used = pp;
1353
1354 for (i = 0; i < nmissing_rows; i++) {
1355 used[i] = parity_map[i];
1356 }
1357
1358 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1359 if (tt < nmissing_rows &&
1360 c == missing_rows[tt] + rm->rm_firstdatacol) {
1361 tt++;
1362 continue;
1363 }
1364
1365 ASSERT3S(i, <, n);
1366 used[i] = c;
1367 i++;
1368 }
1369
1370 /*
1371 * Initialize the interesting rows of the matrix.
1372 */
1373 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1374
1375 /*
1376 * Invert the matrix.
1377 */
1378 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1379 invrows, used);
1380
1381 /*
1382 * Reconstruct the missing data using the generated matrix.
1383 */
1384 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1385 invrows, used);
1386
1387 kmem_free(p, psize);
1388
1389 return (code);
1390}
1391
1392static int
1393vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1394{
1395 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1396 int ntgts;
1397 int i, c;
1398 int code;
1399 int nbadparity, nbaddata;
1400 int parity_valid[VDEV_RAIDZ_MAXPARITY];
1401
1402 /*
1403 * The tgts list must already be sorted.
1404 */
1405 for (i = 1; i < nt; i++) {
1406 ASSERT(t[i] > t[i - 1]);
1407 }
1408
1409 nbadparity = rm->rm_firstdatacol;
1410 nbaddata = rm->rm_cols - nbadparity;
1411 ntgts = 0;
1412 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1413 if (c < rm->rm_firstdatacol)
1414 parity_valid[c] = B_FALSE;
1415
1416 if (i < nt && c == t[i]) {
1417 tgts[ntgts++] = c;
1418 i++;
1419 } else if (rm->rm_col[c].rc_error != 0) {
1420 tgts[ntgts++] = c;
1421 } else if (c >= rm->rm_firstdatacol) {
1422 nbaddata--;
1423 } else {
1424 parity_valid[c] = B_TRUE;
1425 nbadparity--;
1426 }
1427 }
1428
1429 ASSERT(ntgts >= nt);
1430 ASSERT(nbaddata >= 0);
1431 ASSERT(nbaddata + nbadparity == ntgts);
1432
1433 dt = &tgts[nbadparity];
1434
1435 /*
1436 * See if we can use any of our optimized reconstruction routines.
1437 */
1438 if (!vdev_raidz_default_to_general) {
1439 switch (nbaddata) {
1440 case 1:
1441 if (parity_valid[VDEV_RAIDZ_P])
1442 return (vdev_raidz_reconstruct_p(rm, dt, 1));
1443
1444 ASSERT(rm->rm_firstdatacol > 1);
1445
1446 if (parity_valid[VDEV_RAIDZ_Q])
1447 return (vdev_raidz_reconstruct_q(rm, dt, 1));
1448
1449 ASSERT(rm->rm_firstdatacol > 2);
1450 break;
1451
1452 case 2:
1453 ASSERT(rm->rm_firstdatacol > 1);
1454
1455 if (parity_valid[VDEV_RAIDZ_P] &&
1456 parity_valid[VDEV_RAIDZ_Q])
1457 return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1458
1459 ASSERT(rm->rm_firstdatacol > 2);
1460
1461 break;
1462 }
1463 }
1464
1465 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1466 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1467 ASSERT(code > 0);
1468 return (code);
1469}
1470
1471static int
1472vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1473 uint64_t *ashift)
1474{
1475 vdev_t *cvd;
1476 uint64_t nparity = vd->vdev_nparity;
1477 int c;
1478 int lasterror = 0;
1479 int numerrors = 0;
1480
1481 ASSERT(nparity > 0);
1482
1483 if (nparity > VDEV_RAIDZ_MAXPARITY ||
1484 vd->vdev_children < nparity + 1) {
1485 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1486 return (SET_ERROR(EINVAL));
1487 }
1488
1489 vdev_open_children(vd);
1490
1491 for (c = 0; c < vd->vdev_children; c++) {
1492 cvd = vd->vdev_child[c];
1493
1494 if (cvd->vdev_open_error != 0) {
1495 lasterror = cvd->vdev_open_error;
1496 numerrors++;
1497 continue;
1498 }
1499
1500 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1501 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1502 *ashift = MAX(*ashift, cvd->vdev_ashift);
1503 }
1504
1505 *asize *= vd->vdev_children;
1506 *max_asize *= vd->vdev_children;
1507
1508 if (numerrors > nparity) {
1509 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1510 return (lasterror);
1511 }
1512
1513 return (0);
1514}
1515
1516static void
1517vdev_raidz_close(vdev_t *vd)
1518{
1519 int c;
1520
1521 for (c = 0; c < vd->vdev_children; c++)
1522 vdev_close(vd->vdev_child[c]);
1523}
1524
1525static uint64_t
1526vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1527{
1528 uint64_t asize;
1529 uint64_t ashift = vd->vdev_top->vdev_ashift;
1530 uint64_t cols = vd->vdev_children;
1531 uint64_t nparity = vd->vdev_nparity;
1532
1533 asize = ((psize - 1) >> ashift) + 1;
1534 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1535 asize = roundup(asize, nparity + 1) << ashift;
1536
1537 return (asize);
1538}
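/*
 * Worked example (assumed, illustrative values): with 4 children, nparity
 * = 1 and a 512-byte ashift, a 4K block needs 8 data sectors, plus one
 * parity sector for each of the ceil(8 / 3) = 3 rows, for 11 sectors;
 * rounding up to a multiple of nparity + 1 gives an asize of 12 sectors
 * (6K), matching the rm_asize computed by vdev_raidz_map_alloc() above.
 */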
1539
1540static void
1541vdev_raidz_child_done(zio_t *zio)
1542{
1543 raidz_col_t *rc = zio->io_private;
1544
1545 rc->rc_error = zio->io_error;
1546 rc->rc_tried = 1;
1547 rc->rc_skipped = 0;
1548}
1549
1550/*
1551 * Start an IO operation on a RAIDZ VDev
1552 *
1553 * Outline:
1554 * - For write operations:
1555 * 1. Generate the parity data
1556 * 2. Create child zio write operations to each column's vdev, for both
1557 * data and parity.
1558 * 3. If the column skips any sectors for padding, create optional dummy
1559 * write zio children for those areas to improve aggregation continuity.
1560 * - For read operations:
1561 * 1. Create child zio read operations to each data column's vdev to read
1562 * the range of data required for zio.
1563 * 2. If this is a scrub or resilver operation, or if any of the data
1564 * vdevs have had errors, then create zio read operations to the parity
1565 * columns' VDevs as well.
1566 */
1567static int
1568vdev_raidz_io_start(zio_t *zio)
1569{
1570 vdev_t *vd = zio->io_vd;
1571 vdev_t *tvd = vd->vdev_top;
1572 vdev_t *cvd;
1573 raidz_map_t *rm;
1574 raidz_col_t *rc;
1575 int c, i;
1576
1577 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1578 vd->vdev_nparity);
1579
1580 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1581
1582 if (zio->io_type == ZIO_TYPE_WRITE) {
1583 vdev_raidz_generate_parity(rm);
1584
1585 for (c = 0; c < rm->rm_cols; c++) {
1586 rc = &rm->rm_col[c];
1587 cvd = vd->vdev_child[rc->rc_devidx];
1588 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1589 rc->rc_offset, rc->rc_data, rc->rc_size,
1590 zio->io_type, zio->io_priority, 0,
1591 vdev_raidz_child_done, rc));
1592 }
1593
1594 /*
1595 * Generate optional I/Os for any skipped sectors to improve
1596 * aggregation contiguity.
1597 */
1598 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1599 ASSERT(c <= rm->rm_scols);
1600 if (c == rm->rm_scols)
1601 c = 0;
1602 rc = &rm->rm_col[c];
1603 cvd = vd->vdev_child[rc->rc_devidx];
1604 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1605 rc->rc_offset + rc->rc_size, NULL,
1606 1 << tvd->vdev_ashift,
1607 zio->io_type, zio->io_priority,
1608 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1609 }
1610
1611 return (ZIO_PIPELINE_CONTINUE);
1612 }
1613
1614 ASSERT(zio->io_type == ZIO_TYPE_READ);
1615
1616 /*
1617 * Iterate over the columns in reverse order so that we hit the parity
1618 * last -- any errors along the way will force us to read the parity.
1619 */
1620 for (c = rm->rm_cols - 1; c >= 0; c--) {
1621 rc = &rm->rm_col[c];
1622 cvd = vd->vdev_child[rc->rc_devidx];
1623 if (!vdev_readable(cvd)) {
1624 if (c >= rm->rm_firstdatacol)
1625 rm->rm_missingdata++;
1626 else
1627 rm->rm_missingparity++;
1628 rc->rc_error = SET_ERROR(ENXIO);
1629 rc->rc_tried = 1; /* don't even try */
1630 rc->rc_skipped = 1;
1631 continue;
1632 }
1633 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1634 if (c >= rm->rm_firstdatacol)
1635 rm->rm_missingdata++;
1636 else
1637 rm->rm_missingparity++;
1638 rc->rc_error = SET_ERROR(ESTALE);
1639 rc->rc_skipped = 1;
1640 continue;
1641 }
1642 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1643 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1644 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1645 rc->rc_offset, rc->rc_data, rc->rc_size,
1646 zio->io_type, zio->io_priority, 0,
1647 vdev_raidz_child_done, rc));
1648 }
1649 }
1650
1651 return (ZIO_PIPELINE_CONTINUE);
1652}
1653
1654
1655/*
1656 * Report a checksum error for a child of a RAID-Z device.
1657 */
1658static void
1659raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1660{
1661 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1662
1663 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1664 zio_bad_cksum_t zbc;
1665 raidz_map_t *rm = zio->io_vsd;
1666
1667 mutex_enter(&vd->vdev_stat_lock);
1668 vd->vdev_stat.vs_checksum_errors++;
1669 mutex_exit(&vd->vdev_stat_lock);
1670
1671 zbc.zbc_has_cksum = 0;
1672 zbc.zbc_injected = rm->rm_ecksuminjected;
1673
1674 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1675 rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1676 &zbc);
1677 }
1678}
1679
1680/*
1681 * We keep track of whether or not there were any injected errors, so that
1682 * any ereports we generate can note it.
1683 */
1684static int
1685raidz_checksum_verify(zio_t *zio)
1686{
1687 zio_bad_cksum_t zbc;
1688 raidz_map_t *rm = zio->io_vsd;
1689 int ret;
1690
1691 bzero(&zbc, sizeof (zio_bad_cksum_t));
1692
1693 ret = zio_checksum_error(zio, &zbc);
1694 if (ret != 0 && zbc.zbc_injected != 0)
1695 rm->rm_ecksuminjected = 1;
1696
1697 return (ret);
1698}
1699
1700/*
1701 * Generate the parity from the data columns. If we tried and were able to
1702 * read the parity without error, verify that the generated parity matches the
1703 * data we read. If it doesn't, we fire off a checksum error. Return the
1704 * number of such failures.
1705 */
1706static int
1707raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1708{
1709 void *orig[VDEV_RAIDZ_MAXPARITY];
1710 int c, ret = 0;
1711 raidz_col_t *rc;
1712
1713 for (c = 0; c < rm->rm_firstdatacol; c++) {
1714 rc = &rm->rm_col[c];
1715 if (!rc->rc_tried || rc->rc_error != 0)
1716 continue;
1717 orig[c] = zio_buf_alloc(rc->rc_size);
1718 bcopy(rc->rc_data, orig[c], rc->rc_size);
1719 }
1720
1721 vdev_raidz_generate_parity(rm);
1722
1723 for (c = 0; c < rm->rm_firstdatacol; c++) {
1724 rc = &rm->rm_col[c];
1725 if (!rc->rc_tried || rc->rc_error != 0)
1726 continue;
1727 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1728 raidz_checksum_error(zio, rc, orig[c]);
1729 rc->rc_error = SET_ERROR(ECKSUM);
1730 ret++;
1731 }
1732 zio_buf_free(orig[c], rc->rc_size);
1733 }
1734
1735 return (ret);
1736}
1737
1738/*
1739 * Keep statistics on all the ways that we used parity to correct data.
1740 */
1741static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1742
1743static int
1744vdev_raidz_worst_error(raidz_map_t *rm)
1745{
1746 int c, error = 0;
1747
1748 for (c = 0; c < rm->rm_cols; c++)
1749 error = zio_worst_error(error, rm->rm_col[c].rc_error);
1750
1751 return (error);
1752}
1753
1754/*
1755 * Iterate over all combinations of bad data and attempt a reconstruction.
1756 * Note that the algorithm below is non-optimal because it doesn't take into
1757 * account how reconstruction is actually performed. For example, with
1758 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1759 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1760 * cases we'd only use parity information in column 0.
1761 */
1762static int
1763vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1764{
1765 raidz_map_t *rm = zio->io_vsd;
1766 raidz_col_t *rc;
1767 void *orig[VDEV_RAIDZ_MAXPARITY];
1768 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1769 int *tgts = &tstore[1];
1770 int curr, next, i, c, n;
1771 int code, ret = 0;
1772
1773 ASSERT(total_errors < rm->rm_firstdatacol);
1774
1775 /*
1776 * This simplifies one edge condition.
1777 */
1778 tgts[-1] = -1;
1779
	for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
		/*
		 * Initialize the targets array by finding the first n columns
		 * that contain no error.
		 *
		 * If there were no data errors, we need to ensure that we're
		 * always explicitly attempting to reconstruct at least one
		 * data column. To do this, we simply push the highest target
		 * up into the data columns.
		 */
		for (c = 0, i = 0; i < n; i++) {
			if (i == n - 1 && data_errors == 0 &&
			    c < rm->rm_firstdatacol) {
				c = rm->rm_firstdatacol;
			}

			while (rm->rm_col[c].rc_error != 0) {
				c++;
				ASSERT3S(c, <, rm->rm_cols);
			}

			tgts[i] = c++;
		}

		/*
		 * Setting tgts[n] simplifies the other edge condition: the
		 * search for the next valid column is bounded by
		 * tgts[n] == rm_cols, so advancing the highest target past
		 * the last column is detected without a special case.
		 */
		tgts[n] = rm->rm_cols;

		/*
		 * These buffers were allocated in previous iterations.
		 */
		for (i = 0; i < n - 1; i++) {
			ASSERT(orig[i] != NULL);
		}

		orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);

		curr = 0;
		next = tgts[curr];

		while (curr != n) {
			tgts[curr] = next;
			curr = 0;

			/*
			 * Save off the original data that we're going to
			 * attempt to reconstruct.
			 */
			for (i = 0; i < n; i++) {
				ASSERT(orig[i] != NULL);
				c = tgts[i];
				ASSERT3S(c, >=, 0);
				ASSERT3S(c, <, rm->rm_cols);
				rc = &rm->rm_col[c];
				bcopy(rc->rc_data, orig[i], rc->rc_size);
			}

			/*
			 * Attempt a reconstruction and exit the outer loop on
			 * success.
			 */
			code = vdev_raidz_reconstruct(rm, tgts, n);
			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				for (i = 0; i < n; i++) {
					c = tgts[i];
					rc = &rm->rm_col[c];
					ASSERT(rc->rc_error == 0);
					if (rc->rc_tried)
						raidz_checksum_error(zio, rc,
						    orig[i]);
					rc->rc_error = SET_ERROR(ECKSUM);
				}

				ret = code;
				goto done;
			}

			/*
			 * Restore the original data.
			 */
			for (i = 0; i < n; i++) {
				c = tgts[i];
				rc = &rm->rm_col[c];
				bcopy(orig[i], rc->rc_data, rc->rc_size);
			}

			do {
				/*
				 * Find the next valid column after the
				 * current position.
				 */
				for (next = tgts[curr] + 1;
				    next < rm->rm_cols &&
				    rm->rm_col[next].rc_error != 0; next++)
					continue;

				ASSERT(next <= tgts[curr + 1]);

				/*
				 * If that spot is available, we're done here.
				 */
				if (next != tgts[curr + 1])
					break;

				/*
				 * Otherwise, find the next valid column after
				 * the previous position.
				 */
				for (c = tgts[curr - 1] + 1;
				    rm->rm_col[c].rc_error != 0; c++)
					continue;

				tgts[curr] = c;
				curr++;

			} while (curr != n);
		}
	}
	n--;
done:
	for (i = 0; i < n; i++) {
		zio_buf_free(orig[i], rm->rm_col[0].rc_size);
	}

	return (ret);
}

/*
 * Complete an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Check for errors on the child IOs.
 *   2. Return, setting an error code if too few child VDevs were written
 *      to reconstruct the data later.  Note that partial writes are
 *      considered successful if they can be reconstructed at all.
 * - For read operations:
 *   1. Check for errors on the child IOs.
 *   2. If data errors occurred:
 *      a. Try to reassemble the data from the parity available.
 *      b. If we haven't yet read the parity drives, read them now.
 *      c. If all parity drives have been read but the data still doesn't
 *         reassemble with a correct checksum, then try combinatorial
 *         reconstruction.
 *      d. If that doesn't work, return an error.
 *   3. If there were unexpected errors or this is a resilver operation,
 *      rewrite the vdevs that had errors.
 */
static void
vdev_raidz_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *cvd;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc = NULL;
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;
	int n, c;
	int tgts[VDEV_RAIDZ_MAXPARITY];
	int code;

	ASSERT(zio->io_bp != NULL);	/* XXX need to add code to enforce this */

	ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
	ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);

	for (c = 0; c < rm->rm_cols; c++) {
		rc = &rm->rm_col[c];

		if (rc->rc_error) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */

			if (c < rm->rm_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;

			total_errors++;
		} else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		/*
		 * XXX -- for now, treat partial writes as a success.
		 * (If we couldn't write enough columns to reconstruct
		 * the data, the I/O failed.  Otherwise, good enough.)
		 *
		 * Now that we support write reallocation, it would be better
		 * to treat partial failure as real failure unless there are
		 * no non-degraded top-level vdevs left, and not update DTLs
		 * if we intend to reallocate.
		 */
		/* XXPOLICY */
		if (total_errors > rm->rm_firstdatacol)
			zio->io_error = vdev_raidz_worst_error(rm);

		return;
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);
	/*
	 * There are three potential phases for a read:
	 *	1. produce valid data from the columns read
	 *	2. read all disks and try again
	 *	3. perform combinatorial reconstruction
	 *
	 * Each phase is progressively both more expensive and less likely to
	 * occur. If we encounter more errors than we can repair or all phases
	 * fail, we have no choice but to return an error.
	 */
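	/*
	 * A concrete (hypothetical) case: every data column is returned
	 * without an I/O error, yet the block checksum fails because one
	 * child silently returned bad data.  Phase 1 has nothing to
	 * reconstruct with, since no column reported an error; phase 2 issues
	 * reads for the parity columns that were skipped and retries; only if
	 * the checksum still fails does phase 3 hand the map to
	 * vdev_raidz_combrec() to try every candidate set of columns.
	 */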

	/*
	 * If the number of errors we saw was correctable -- less than or equal
	 * to the number of parity disks read -- attempt to produce data that
	 * has a valid checksum. Naturally, this case applies in the absence of
	 * any errors.
	 */
	if (total_errors <= rm->rm_firstdatacol - parity_untried) {
		if (data_errors == 0) {
			if (raidz_checksum_verify(zio) == 0) {
				/*
				 * If we read parity information (unnecessarily
				 * as it happens since no reconstruction was
				 * needed) regenerate and verify the parity.
				 * We also regenerate parity when resilvering
				 * so we can write it out to the failed device
				 * later.
				 */
				if (parity_errors + parity_untried <
				    rm->rm_firstdatacol ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}
				goto done;
			}
		} else {
			/*
			 * We either attempt to read all the parity columns or
			 * none of them. If we didn't try to read parity, we
			 * wouldn't be here in the correctable case. There must
			 * also have been fewer parity errors than parity
			 * columns or, again, we wouldn't be in this code path.
			 */
			ASSERT(parity_untried == 0);
			ASSERT(parity_errors < rm->rm_firstdatacol);

			/*
			 * Identify the data columns that reported an error.
			 */
			n = 0;
			for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error != 0) {
					ASSERT(n < VDEV_RAIDZ_MAXPARITY);
					tgts[n++] = c;
				}
			}

			ASSERT(rm->rm_firstdatacol >= n);

			code = vdev_raidz_reconstruct(rm, tgts, n);

			if (raidz_checksum_verify(zio) == 0) {
				atomic_inc_64(&raidz_corrected[code]);

				/*
				 * If we read more parity disks than were used
				 * for reconstruction, confirm that the other
				 * parity disks produced correct data. This
				 * routine is suboptimal in that it regenerates
				 * the parity that we already used in addition
				 * to the parity that we're attempting to
				 * verify, but this should be a relatively
				 * uncommon case, and can be optimized if it
				 * becomes a problem. Note that we regenerate
				 * parity when resilvering so we can write it
				 * out to failed devices later.
				 */
				if (parity_errors < rm->rm_firstdatacol - n ||
				    (zio->io_flags & ZIO_FLAG_RESILVER)) {
					n = raidz_parity_verify(zio, rm);
					unexpected_errors += n;
					ASSERT(parity_errors + n <=
					    rm->rm_firstdatacol);
				}

				goto done;
			}
		}
	}

	/*
	 * This isn't a typical situation -- either we got a read error or
	 * a child silently returned bad data. Read every block so we can
	 * try again with as much data and parity as we can track down. If
	 * we've already been through once before, all children will be marked
	 * as tried so we'll proceed to combinatorial reconstruction.
	 */
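	/*
	 * unexpected_errors is forced on here so that, if the retry or the
	 * combinatorial pass does recover the data, the repair loop after the
	 * done: label rewrites whichever children ended up flagged with an
	 * error.
	 */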
	unexpected_errors = 1;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;

	for (c = 0; c < rm->rm_cols; c++) {
		if (rm->rm_col[c].rc_tried)
			continue;

		zio_vdev_io_redone(zio);
		do {
			rc = &rm->rm_col[c];
			if (rc->rc_tried)
				continue;
			zio_nowait(zio_vdev_child_io(zio, NULL,
			    vd->vdev_child[rc->rc_devidx],
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		} while (++c < rm->rm_cols);

		return;
	}

	/*
	 * At this point we've attempted to reconstruct the data given the
	 * errors we detected, and we've attempted to read all columns. There
	 * must, therefore, be one or more additional problems -- silent errors
	 * resulting in invalid data rather than explicit I/O errors resulting
	 * in absent data. We check if there is enough additional data to
	 * possibly reconstruct the data and then perform combinatorial
	 * reconstruction over all possible combinations. If that fails,
	 * we're cooked.
	 */
	if (total_errors > rm->rm_firstdatacol) {
		zio->io_error = vdev_raidz_worst_error(rm);

	} else if (total_errors < rm->rm_firstdatacol &&
	    (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
		/*
		 * If we didn't use all the available parity for the
		 * combinatorial reconstruction, verify that the remaining
		 * parity is correct.
		 */
		if (code != (1 << rm->rm_firstdatacol) - 1)
			(void) raidz_parity_verify(zio, rm);
	} else {
		/*
		 * We're here because either:
		 *
		 *	total_errors == rm_firstdatacol, or
		 *	vdev_raidz_combrec() failed
		 *
		 * In either case, there is enough bad data to prevent
		 * reconstruction.
		 *
		 * Start checksum ereports for all children which haven't
		 * failed, provided the IO wasn't speculative.
		 */
		zio->io_error = SET_ERROR(ECKSUM);

		if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			for (c = 0; c < rm->rm_cols; c++) {
				rc = &rm->rm_col[c];
				if (rc->rc_error == 0) {
					zio_bad_cksum_t zbc;
					zbc.zbc_has_cksum = 0;
					zbc.zbc_injected =
					    rm->rm_ecksuminjected;

					zfs_ereport_start_checksum(
					    zio->io_spa,
					    vd->vdev_child[rc->rc_devidx],
					    zio, rc->rc_offset, rc->rc_size,
					    (void *)(uintptr_t)c, &zbc);
				}
			}
		}
	}

done:
	zio_checksum_verified(zio);

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
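		/*
		 * Repair writes are tagged ZIO_FLAG_SELF_HEAL only when the
		 * errors were unexpected; rewrites driven purely by a
		 * resilver are issued with ZIO_FLAG_IO_REPAIR alone.
		 */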
34dc7c2f
BB
2179 for (c = 0; c < rm->rm_cols; c++) {
2180 rc = &rm->rm_col[c];
2181 cvd = vd->vdev_child[rc->rc_devidx];
2182
2183 if (rc->rc_error == 0)
2184 continue;
2185
b128c09f 2186 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
34dc7c2f
BB
2187 rc->rc_offset, rc->rc_data, rc->rc_size,
2188 ZIO_TYPE_WRITE, zio->io_priority,
fb5f0bc8
BB
2189 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2190 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
34dc7c2f 2191 }
34dc7c2f 2192 }
34dc7c2f
BB
2193}
2194
2195static void
2196vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2197{
2198 if (faulted > vd->vdev_nparity)
2199 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2200 VDEV_AUX_NO_REPLICAS);
2201 else if (degraded + faulted != 0)
2202 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2203 else
2204 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2205}
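
/*
 * For example, on a raidz2 vdev (vdev_nparity == 2): three or more faulted
 * children mean the data cannot be reconstructed, so the vdev is marked
 * CANT_OPEN with VDEV_AUX_NO_REPLICAS; any other nonzero mix of faulted or
 * degraded children leaves it DEGRADED; with none, it is HEALTHY.
 */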

vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	NULL,
	NULL,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};