module/zfs/vdev_raidz.c
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field with 2^N elements. In
 * our case we choose N=8, i.e. GF(2^8), so that every element can be
 * expressed with a single byte. Briefly, the operations on the field are
 * defined as follows:
 *
 *	o addition (+) is represented by a bitwise XOR
 *	o subtraction (-) is therefore identical to addition: A + B = A - B
 *	o multiplication of A by 2 is defined by the following bitwise
 *	  expression:
 *
 *		(A * 2)_7 = A_6
 *		(A * 2)_6 = A_5
 *		(A * 2)_5 = A_4
 *		(A * 2)_4 = A_3 + A_7
 *		(A * 2)_3 = A_2 + A_7
 *		(A * 2)_2 = A_1 + A_7
 *		(A * 2)_1 = A_0
 *		(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R, computed over data columns
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^(n-1) * D_0 + 2^(n-2) * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^(n-1) * D_0 + 4^(n-2) * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */

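/*
 * Illustrative sketch (not used by the driver): byte-wide helpers showing
 * the multiply-by-2 rule above and the shift-and-add ("Russian peasant")
 * multiplication it induces.  gf256_mul2() and gf256_mul() are hypothetical
 * names introduced only for this example; the driver itself uses the
 * VDEV_RAIDZ_64MUL_*() macros and the log/exp tables defined below.
 */
static inline uint8_t
gf256_mul2(uint8_t a)
{
	/* Shift left and reduce by the primitive polynomial 0x11d. */
	return ((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
}

static inline uint8_t
gf256_mul(uint8_t a, uint8_t b)
{
	uint8_t p = 0;

	while (b != 0) {
		if (b & 1)
			p ^= a;		/* addition in GF(2^8) is XOR */
		a = gf256_mul2(a);
		b >>= 1;
	}
	return (p);
}
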
typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	void *rm_datacopy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
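
/*
 * Illustrative sketch (not used by the driver): verifies, for one 64-bit
 * word, that VDEV_RAIDZ_64MUL_2() matches applying VDEV_RAIDZ_MUL_2() to
 * each of the eight bytes independently.  check_64mul2() is a hypothetical
 * helper introduced only for this example.
 */
static int
check_64mul2(uint64_t x)
{
	uint64_t y = x, mask;
	int i;

	VDEV_RAIDZ_64MUL_2(y, mask);

	for (i = 0; i < 8; i++) {
		uint8_t b = (x >> (8 * i)) & 0xff;
		uint8_t e = VDEV_RAIDZ_MUL_2(b);

		if (((y >> (8 * i)) & 0xff) != e)
			return (0);
	}
	return (1);
}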

/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;

/* Powers of 2 in the Galois field defined above. */
163static const uint8_t vdev_raidz_pow2[256] = {
164 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
165 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
166 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
167 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
168 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
169 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
170 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
171 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
172 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
173 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
174 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
175 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
176 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
177 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
178 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
179 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
180 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
181 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
182 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
183 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
184 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
185 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
186 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
187 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
188 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
189 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
190 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
191 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
192 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
193 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
194 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
195 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
196};
/* Logs of 2 in the Galois field defined above. */
static const uint8_t vdev_raidz_log2[256] = {
199 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
200 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
201 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
202 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
203 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
204 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
205 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
206 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
207 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
208 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
209 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
210 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
211 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
212 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
213 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
214 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
215 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
216 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
217 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
218 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
219 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
220 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
221 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
222 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
223 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
224 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
225 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
226 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
227 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
228 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
229 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
230 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
231};

static void vdev_raidz_generate_parity(raidz_map_t *rm);

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}
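
/*
 * Illustrative sketch (not used by the driver): general multiplication via
 * the log/exp tables, i.e. A * B = 2^(log_2(A) + log_2(B)) as described in
 * the block comment above.  gf256_mul_tab() is a hypothetical helper
 * introduced only for this example.
 */
static uint8_t
gf256_mul_tab(uint8_t a, uint8_t b)
{
	if (a == 0 || b == 0)
		return (0);

	return (vdev_raidz_exp2(a, vdev_raidz_log2[b]));
}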

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;
	size_t size;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

		if (rm->rm_col[c].rc_gdata != NULL)
			zio_buf_free(rm->rm_col[c].rc_gdata,
			    rm->rm_col[c].rc_size);
	}

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	if (rm->rm_datacopy != NULL)
		zio_buf_free(rm->rm_datacopy, size);

	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}
277
278static void
279vdev_raidz_map_free_vsd(zio_t *zio)
280{
281 raidz_map_t *rm = zio->io_vsd;
282
c99c9001 283 ASSERT0(rm->rm_freed);
284 rm->rm_freed = 1;
285
286 if (rm->rm_reports == 0)
287 vdev_raidz_map_free(rm);
288}
289
290/*ARGSUSED*/
291static void
292vdev_raidz_cksum_free(void *arg, size_t ignored)
293{
294 raidz_map_t *rm = arg;
295
296 ASSERT3U(rm->rm_reports, >, 0);
297
298 if (--rm->rm_reports == 0 && rm->rm_freed != 0)
299 vdev_raidz_map_free(rm);
300}
301
302static void
303vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
304{
305 raidz_map_t *rm = zcr->zcr_cbdata;
306 size_t c = zcr->zcr_cbinfo;
307 size_t x;
308
309 const char *good = NULL;
310 const char *bad = rm->rm_col[c].rc_data;
311
312 if (good_data == NULL) {
313 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
314 return;
315 }
316
317 if (c < rm->rm_firstdatacol) {
318 /*
319 * The first time through, calculate the parity blocks for
320 * the good data (this relies on the fact that the good
321 * data never changes for a given logical ZIO)
322 */
323 if (rm->rm_col[0].rc_gdata == NULL) {
324 char *bad_parity[VDEV_RAIDZ_MAXPARITY];
325 char *buf;
326
327 /*
328 * Set up the rm_col[]s to generate the parity for
329 * good_data, first saving the parity bufs and
330 * replacing them with buffers to hold the result.
331 */
332 for (x = 0; x < rm->rm_firstdatacol; x++) {
333 bad_parity[x] = rm->rm_col[x].rc_data;
334 rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
335 zio_buf_alloc(rm->rm_col[x].rc_size);
336 }
337
338 /* fill in the data columns from good_data */
339 buf = (char *)good_data;
340 for (; x < rm->rm_cols; x++) {
341 rm->rm_col[x].rc_data = buf;
342 buf += rm->rm_col[x].rc_size;
343 }
344
345 /*
346 * Construct the parity from the good data.
347 */
348 vdev_raidz_generate_parity(rm);
349
350 /* restore everything back to its original state */
351 for (x = 0; x < rm->rm_firstdatacol; x++)
352 rm->rm_col[x].rc_data = bad_parity[x];
353
354 buf = rm->rm_datacopy;
355 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
356 rm->rm_col[x].rc_data = buf;
357 buf += rm->rm_col[x].rc_size;
358 }
359 }
360
361 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
362 good = rm->rm_col[c].rc_gdata;
363 } else {
364 /* adjust good_data to point at the start of our column */
365 good = good_data;
366
367 for (x = rm->rm_firstdatacol; x < c; x++)
368 good += rm->rm_col[x].rc_size;
369 }
370
371 /* we drop the ereport if it ends up that the data was good */
372 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
373}
374
375/*
376 * Invoked indirectly by zfs_ereport_start_checksum(), called
377 * below when our read operation fails completely. The main point
378 * is to keep a copy of everything we read from disk, so that at
379 * vdev_raidz_cksum_finish() time we can compare it with the good data.
380 */
381static void
382vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
383{
384 size_t c = (size_t)(uintptr_t)arg;
385 caddr_t buf;
386
387 raidz_map_t *rm = zio->io_vsd;
388 size_t size;
389
390 /* set up the report and bump the refcount */
391 zcr->zcr_cbdata = rm;
392 zcr->zcr_cbinfo = c;
393 zcr->zcr_finish = vdev_raidz_cksum_finish;
394 zcr->zcr_free = vdev_raidz_cksum_free;
395
396 rm->rm_reports++;
397 ASSERT3U(rm->rm_reports, >, 0);
398
399 if (rm->rm_datacopy != NULL)
400 return;
401
402 /*
403 * It's the first time we're called for this raidz_map_t, so we need
404 * to copy the data aside; there's no guarantee that our zio's buffer
405 * won't be re-used for something else.
406 *
407 * Our parity data is already in separate buffers, so there's no need
408 * to copy them.
409 */
410
411 size = 0;
412 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
413 size += rm->rm_col[c].rc_size;
414
415 buf = rm->rm_datacopy = zio_buf_alloc(size);
416
417 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
418 raidz_col_t *col = &rm->rm_col[c];
419
420 bcopy(col->rc_data, buf, col->rc_size);
421 col->rc_data = buf;
422
423 buf += col->rc_size;
424 }
425 ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
426}
427
428static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
429 vdev_raidz_map_free_vsd,
430 vdev_raidz_cksum_report
431};
432
/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * Avoid inlining the function to keep vdev_raidz_io_start(), which
 * is this function's only caller, as small as possible on the stack.
 */
noinline static raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_map_t *rm;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = zio->io_offset >> unit_shift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = zio->io_size >> unit_shift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* acols: The columns that will be accessed. */
	/* scols: The columns that will be accessed or skipped. */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_PUSHPAGE);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_datacopy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);

	rm->rm_col[c].rc_data = zio->io_data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
	return (rm);
}
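
/*
 * A worked example of the geometry above: a 16K write (zio->io_size = 16K,
 * unit_shift = 9, so s = 32 sectors) on a 6-wide raidz2 (dcols = 6,
 * nparity = 2) gives q = 32 / 4 = 8, r = 0, bc = 0 and tot = 32 + 2 * 8 = 48;
 * all six columns are accessed (acols = scols = 6) and each carries 8
 * sectors.  A 2.5K write (s = 5) on a 4-wide raidz1 gives q = 1, r = 2,
 * bc = 3, tot = 7; the three "big" columns carry 2 sectors, the last column
 * carries 1, and one skip sector pads rm_asize up to 8 sectors.
 */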
584
34dc7c2f
BB
585static void
586vdev_raidz_generate_parity_p(raidz_map_t *rm)
587{
588 uint64_t *p, *src, pcount, ccount, i;
589 int c;
590
591 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
592
593 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
594 src = rm->rm_col[c].rc_data;
595 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
596 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
597
598 if (c == rm->rm_firstdatacol) {
599 ASSERT(ccount == pcount);
45d1cae3 600 for (i = 0; i < ccount; i++, src++, p++) {
34dc7c2f
BB
601 *p = *src;
602 }
603 } else {
604 ASSERT(ccount <= pcount);
45d1cae3 605 for (i = 0; i < ccount; i++, src++, p++) {
34dc7c2f
BB
606 *p ^= *src;
607 }
608 }
609 }
610}
611
612static void
613vdev_raidz_generate_parity_pq(raidz_map_t *rm)
614{
45d1cae3 615 uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
34dc7c2f
BB
616 int c;
617
45d1cae3 618 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
34dc7c2f
BB
619 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
620 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
621
622 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
623 src = rm->rm_col[c].rc_data;
624 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
625 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
45d1cae3
BB
626
627 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
34dc7c2f
BB
628
629 if (c == rm->rm_firstdatacol) {
45d1cae3
BB
630 ASSERT(ccnt == pcnt || ccnt == 0);
631 for (i = 0; i < ccnt; i++, src++, p++, q++) {
34dc7c2f 632 *p = *src;
45d1cae3 633 *q = *src;
34dc7c2f 634 }
45d1cae3 635 for (; i < pcnt; i++, src++, p++, q++) {
34dc7c2f 636 *p = 0;
45d1cae3 637 *q = 0;
34dc7c2f
BB
638 }
639 } else {
45d1cae3 640 ASSERT(ccnt <= pcnt);
34dc7c2f
BB
641
642 /*
45d1cae3
BB
643 * Apply the algorithm described above by multiplying
644 * the previous result and adding in the new value.
34dc7c2f 645 */
45d1cae3
BB
646 for (i = 0; i < ccnt; i++, src++, p++, q++) {
647 *p ^= *src;
648
649 VDEV_RAIDZ_64MUL_2(*q, mask);
34dc7c2f 650 *q ^= *src;
45d1cae3
BB
651 }
652
653 /*
654 * Treat short columns as though they are full of 0s.
655 * Note that there's therefore nothing needed for P.
656 */
657 for (; i < pcnt; i++, q++) {
658 VDEV_RAIDZ_64MUL_2(*q, mask);
659 }
660 }
661 }
662}
663
664static void
665vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
666{
667 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
668 int c;
669
670 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
671 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
672 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
673 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
674 rm->rm_col[VDEV_RAIDZ_R].rc_size);
675
676 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
677 src = rm->rm_col[c].rc_data;
678 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
679 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
680 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
681
682 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
683
684 if (c == rm->rm_firstdatacol) {
685 ASSERT(ccnt == pcnt || ccnt == 0);
686 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
687 *p = *src;
688 *q = *src;
689 *r = *src;
690 }
691 for (; i < pcnt; i++, src++, p++, q++, r++) {
692 *p = 0;
693 *q = 0;
694 *r = 0;
695 }
696 } else {
697 ASSERT(ccnt <= pcnt);
698
699 /*
700 * Apply the algorithm described above by multiplying
701 * the previous result and adding in the new value.
702 */
703 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
34dc7c2f 704 *p ^= *src;
45d1cae3
BB
705
706 VDEV_RAIDZ_64MUL_2(*q, mask);
707 *q ^= *src;
708
709 VDEV_RAIDZ_64MUL_4(*r, mask);
710 *r ^= *src;
34dc7c2f
BB
711 }
712
713 /*
714 * Treat short columns as though they are full of 0s.
45d1cae3 715 * Note that there's therefore nothing needed for P.
34dc7c2f 716 */
45d1cae3
BB
717 for (; i < pcnt; i++, q++, r++) {
718 VDEV_RAIDZ_64MUL_2(*q, mask);
719 VDEV_RAIDZ_64MUL_4(*r, mask);
34dc7c2f
BB
720 }
721 }
722 }
723}
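
/*
 * Illustrative sketch (not used by the driver): the same P/Q/R recurrences
 * as above, written a byte at a time in the Horner form from the block
 * comment at the top of the file.  example_parity_pqr_bytes() and its
 * arguments are hypothetical and exist only for this example.
 */
static void
example_parity_pqr_bytes(const uint8_t **data, int ndata, size_t len,
    uint8_t *p, uint8_t *q, uint8_t *r)
{
	size_t i;
	int c;

	for (i = 0; i < len; i++) {
		uint8_t pv = 0, qv = 0, rv = 0;

		for (c = 0; c < ndata; c++) {
			pv ^= data[c][i];
			qv = VDEV_RAIDZ_MUL_2(qv) ^ data[c][i];
			rv = VDEV_RAIDZ_MUL_4(rv) ^ data[c][i];
		}
		p[i] = pv;
		q[i] = qv;
		r[i] = rv;
	}
}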
724
725/*
726 * Generate RAID parity in the first virtual columns according to the number of
727 * parity columns available.
728 */
34dc7c2f 729static void
730vdev_raidz_generate_parity(raidz_map_t *rm)
731{
732 switch (rm->rm_firstdatacol) {
733 case 1:
734 vdev_raidz_generate_parity_p(rm);
735 break;
736 case 2:
737 vdev_raidz_generate_parity_pq(rm);
738 break;
739 case 3:
740 vdev_raidz_generate_parity_pqr(rm);
741 break;
742 default:
743 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
744 }
745}
746
747static int
748vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
749{
750 uint64_t *dst, *src, xcount, ccount, count, i;
45d1cae3 751 int x = tgts[0];
34dc7c2f
BB
752 int c;
753
45d1cae3
BB
754 ASSERT(ntgts == 1);
755 ASSERT(x >= rm->rm_firstdatacol);
756 ASSERT(x < rm->rm_cols);
757
34dc7c2f
BB
758 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
759 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
760 ASSERT(xcount > 0);
761
762 src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
763 dst = rm->rm_col[x].rc_data;
764 for (i = 0; i < xcount; i++, dst++, src++) {
765 *dst = *src;
766 }
767
768 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
769 src = rm->rm_col[c].rc_data;
770 dst = rm->rm_col[x].rc_data;
771
772 if (c == x)
773 continue;
774
775 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
776 count = MIN(ccount, xcount);
777
778 for (i = 0; i < count; i++, dst++, src++) {
779 *dst ^= *src;
780 }
781 }
782
783 return (1 << VDEV_RAIDZ_P);
784}
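
/*
 * In other words, with a single missing data column the P parity alone
 * suffices: since P = D_0 + ... + D_n-1 (XOR), the missing column is simply
 * P XORed with every surviving data column, which is what the two loops
 * above compute.
 */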
785
786static int
787vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
788{
789 uint64_t *dst, *src, xcount, ccount, count, mask, i;
790 uint8_t *b;
45d1cae3 791 int x = tgts[0];
34dc7c2f
BB
792 int c, j, exp;
793
45d1cae3
BB
794 ASSERT(ntgts == 1);
795
34dc7c2f
BB
796 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
797 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
798
799 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
800 src = rm->rm_col[c].rc_data;
801 dst = rm->rm_col[x].rc_data;
802
803 if (c == x)
804 ccount = 0;
805 else
806 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
807
808 count = MIN(ccount, xcount);
809
810 if (c == rm->rm_firstdatacol) {
811 for (i = 0; i < count; i++, dst++, src++) {
812 *dst = *src;
813 }
814 for (; i < xcount; i++, dst++) {
815 *dst = 0;
816 }
817
818 } else {
34dc7c2f 819 for (i = 0; i < count; i++, dst++, src++) {
45d1cae3 820 VDEV_RAIDZ_64MUL_2(*dst, mask);
34dc7c2f
BB
821 *dst ^= *src;
822 }
823
824 for (; i < xcount; i++, dst++) {
45d1cae3 825 VDEV_RAIDZ_64MUL_2(*dst, mask);
34dc7c2f
BB
826 }
827 }
828 }
829
830 src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
831 dst = rm->rm_col[x].rc_data;
832 exp = 255 - (rm->rm_cols - 1 - x);
833
834 for (i = 0; i < xcount; i++, dst++, src++) {
835 *dst ^= *src;
836 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
837 *b = vdev_raidz_exp2(*b, exp);
838 }
839 }
840
841 return (1 << VDEV_RAIDZ_Q);
842}
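
/*
 * To see why this works: the column loop above accumulates, into the target
 * column, Q' = sum over c != x of 2^(rm_cols - 1 - c) * D_c using the same
 * Horner scheme as parity generation.  XORing in Q then leaves
 * 2^(rm_cols - 1 - x) * D_x in each byte, and multiplying by 2^exp with
 * exp = 255 - (rm_cols - 1 - x) divides out that coefficient (2^255 = 1 in
 * this field), recovering D_x.
 */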
843
844static int
845vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
846{
847 uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
848 void *pdata, *qdata;
849 uint64_t xsize, ysize, i;
45d1cae3
BB
850 int x = tgts[0];
851 int y = tgts[1];
34dc7c2f 852
45d1cae3 853 ASSERT(ntgts == 2);
34dc7c2f
BB
854 ASSERT(x < y);
855 ASSERT(x >= rm->rm_firstdatacol);
856 ASSERT(y < rm->rm_cols);
857
858 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
859
860 /*
861 * Move the parity data aside -- we're going to compute parity as
862 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
863 * reuse the parity generation mechanism without trashing the actual
864 * parity so we make those columns appear to be full of zeros by
865 * setting their lengths to zero.
866 */
867 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
868 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
869 xsize = rm->rm_col[x].rc_size;
870 ysize = rm->rm_col[y].rc_size;
871
872 rm->rm_col[VDEV_RAIDZ_P].rc_data =
873 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
874 rm->rm_col[VDEV_RAIDZ_Q].rc_data =
875 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
876 rm->rm_col[x].rc_size = 0;
877 rm->rm_col[y].rc_size = 0;
878
879 vdev_raidz_generate_parity_pq(rm);
880
881 rm->rm_col[x].rc_size = xsize;
882 rm->rm_col[y].rc_size = ysize;
883
884 p = pdata;
885 q = qdata;
886 pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
887 qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
888 xd = rm->rm_col[x].rc_data;
889 yd = rm->rm_col[y].rc_data;
890
	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^-(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */
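
	/*
	 * Deriving the coefficients (restating the identities above):
	 * substituting D_y = (P + Pxy) + D_x into the Qxy equation, with
	 * g = 2^(ndevs - 1 - x) and h = 2^(ndevs - 1 - y), gives
	 *
	 *	(Q + Qxy) = (g + h) * D_x + h * (P + Pxy)
	 * so
	 *	D_x = [h * (P + Pxy) + (Q + Qxy)] / (g + h)
	 *
	 * Below, a = 2^(x - y) = h/g and b = 2^-(ndevs - 1 - x) = g^-1;
	 * dividing each by (a + 1) yields A = h/(g+h) = a * (a + 1)^-1 and
	 * B = 1/(g+h) = b * (a + 1)^-1, whose logs are aexp and bexp.
	 */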
905
906 a = vdev_raidz_pow2[255 + x - y];
907 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
908 tmp = 255 - vdev_raidz_log2[a ^ 1];
909
910 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
911 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
912
913 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
914 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
915 vdev_raidz_exp2(*q ^ *qxy, bexp);
916
917 if (i < ysize)
918 *yd = *p ^ *pxy ^ *xd;
919 }
920
921 zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
922 rm->rm_col[VDEV_RAIDZ_P].rc_size);
923 zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
924 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
925
926 /*
927 * Restore the saved parity data.
928 */
929 rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
930 rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
45d1cae3
BB
931
932 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
933}
934
935/* BEGIN CSTYLED */
936/*
937 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
939 * the contents of the data and parity disks. This can be expressed with
940 * vectors for the original data (D) and the actual data (d) and parity (p)
941 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
942 *
943 * __ __ __ __
944 * | | __ __ | p_0 |
945 * | V | | D_0 | | p_m-1 |
946 * | | x | : | = | d_0 |
947 * | I | | D_n-1 | | : |
948 * | | ~~ ~~ | d_n-1 |
949 * ~~ ~~ ~~ ~~
950 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, for speedy
 * computation, and for linear separability.
955 *
956 * __ __ __ __
957 * | 1 .. 1 1 1 | | p_0 |
958 * | 2^n-1 .. 4 2 1 | __ __ | : |
959 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
960 * | 1 .. 0 0 0 | | D_1 | | d_0 |
961 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
962 * | : : : : | | : | | d_2 |
963 * | 0 .. 1 0 0 | | D_n-1 | | : |
964 * | 0 .. 0 1 0 | ~~ ~~ | : |
965 * | 0 .. 0 0 1 | | d_n-1 |
966 * ~~ ~~ ~~ ~~
967 *
968 * Note that I, V, d, and p are known. To compute D, we must invert the
969 * matrix and use the known data and parity values to reconstruct the unknown
970 * data values. We begin by removing the rows in V|I and d|p that correspond
971 * to failed or missing columns; we then make V|I square (n x n) and d|p
972 * sized n by removing rows corresponding to unused parity from the bottom up
973 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
974 * using Gauss-Jordan elimination. In the example below we use m=3 parity
975 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
976 * __ __
977 * | 1 1 1 1 1 1 1 1 |
978 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
979 * | 19 205 116 29 64 16 4 1 | / /
980 * | 1 0 0 0 0 0 0 0 | / /
981 * | 0 1 0 0 0 0 0 0 | <--' /
982 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
983 * | 0 0 0 1 0 0 0 0 |
984 * | 0 0 0 0 1 0 0 0 |
985 * | 0 0 0 0 0 1 0 0 |
986 * | 0 0 0 0 0 0 1 0 |
987 * | 0 0 0 0 0 0 0 1 |
988 * ~~ ~~
989 * __ __
990 * | 1 1 1 1 1 1 1 1 |
991 * | 128 64 32 16 8 4 2 1 |
992 * | 19 205 116 29 64 16 4 1 |
993 * | 1 0 0 0 0 0 0 0 |
994 * | 0 1 0 0 0 0 0 0 |
995 * (V|I)' = | 0 0 1 0 0 0 0 0 |
996 * | 0 0 0 1 0 0 0 0 |
997 * | 0 0 0 0 1 0 0 0 |
998 * | 0 0 0 0 0 1 0 0 |
999 * | 0 0 0 0 0 0 1 0 |
1000 * | 0 0 0 0 0 0 0 1 |
1001 * ~~ ~~
1002 *
1003 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1004 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1005 * matrix is not singular.
1006 * __ __
1007 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1008 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1009 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1010 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1011 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1012 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1013 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1014 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1015 * ~~ ~~
1016 * __ __
1017 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1018 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1019 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1020 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1021 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1022 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1023 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1024 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1025 * ~~ ~~
1026 * __ __
1027 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1028 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1029 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1030 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1031 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1032 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1033 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1034 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1035 * ~~ ~~
1036 * __ __
1037 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1038 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1039 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1040 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1041 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1042 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1043 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1044 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1045 * ~~ ~~
1046 * __ __
1047 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1048 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1049 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1050 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1051 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1052 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1053 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1054 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1055 * ~~ ~~
1056 * __ __
1057 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1058 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1059 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1060 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1061 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1062 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1063 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1064 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1065 * ~~ ~~
1066 * __ __
1067 * | 0 0 1 0 0 0 0 0 |
1068 * | 167 100 5 41 159 169 217 208 |
1069 * | 166 100 4 40 158 168 216 209 |
1070 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1071 * | 0 0 0 0 1 0 0 0 |
1072 * | 0 0 0 0 0 1 0 0 |
1073 * | 0 0 0 0 0 0 1 0 |
1074 * | 0 0 0 0 0 0 0 1 |
1075 * ~~ ~~
1076 *
1077 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1078 * of the missing data.
1079 *
1080 * As is apparent from the example above, the only non-trivial rows in the
1081 * inverse matrix correspond to the data disks that we're trying to
1082 * reconstruct. Indeed, those are the only rows we need as the others would
1083 * only be useful for reconstructing data known or assumed to be valid. For
1084 * that reason, we only build the coefficients in the rows that correspond to
1085 * targeted columns.
1086 */
1087/* END CSTYLED */
1088
1089static void
1090vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1091 uint8_t **rows)
1092{
1093 int i, j;
1094 int pow;
1095
1096 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1097
1098 /*
1099 * Fill in the missing rows of interest.
1100 */
1101 for (i = 0; i < nmap; i++) {
1102 ASSERT3S(0, <=, map[i]);
1103 ASSERT3S(map[i], <=, 2);
1104
1105 pow = map[i] * n;
1106 if (pow > 255)
1107 pow -= 255;
1108 ASSERT(pow <= 255);
1109
1110 for (j = 0; j < n; j++) {
1111 pow -= map[i];
1112 if (pow < 0)
1113 pow += 255;
1114 rows[i][j] = vdev_raidz_pow2[pow];
1115 }
1116 }
1117}
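
/*
 * For example, with map[i] = 1 (the Q row) and n = 8 data columns, the loop
 * above produces rows[i] = { 128, 64, 32, 16, 8, 4, 2, 1 }, i.e. the
 * coefficients 2^(n - 1 - j) -- matching the second row of the dispersal
 * matrix V shown in the block comment above.  In general,
 * rows[i][j] = vdev_raidz_pow2[(map[i] * (n - 1 - j)) % 255].
 */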
1118
1119static void
1120vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1121 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1122{
1123 int i, j, ii, jj;
1124 uint8_t log;
1125
1126 /*
1127 * Assert that the first nmissing entries from the array of used
1128 * columns correspond to parity columns and that subsequent entries
1129 * correspond to data columns.
1130 */
1131 for (i = 0; i < nmissing; i++) {
1132 ASSERT3S(used[i], <, rm->rm_firstdatacol);
1133 }
1134 for (; i < n; i++) {
1135 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1136 }
1137
1138 /*
1139 * First initialize the storage where we'll compute the inverse rows.
1140 */
1141 for (i = 0; i < nmissing; i++) {
1142 for (j = 0; j < n; j++) {
1143 invrows[i][j] = (i == j) ? 1 : 0;
1144 }
1145 }
1146
1147 /*
1148 * Subtract all trivial rows from the rows of consequence.
1149 */
1150 for (i = 0; i < nmissing; i++) {
1151 for (j = nmissing; j < n; j++) {
1152 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1153 jj = used[j] - rm->rm_firstdatacol;
1154 ASSERT3S(jj, <, n);
1155 invrows[i][j] = rows[i][jj];
1156 rows[i][jj] = 0;
1157 }
1158 }
1159
1160 /*
1161 * For each of the rows of interest, we must normalize it and subtract
1162 * a multiple of it from the other rows.
1163 */
1164 for (i = 0; i < nmissing; i++) {
1165 for (j = 0; j < missing[i]; j++) {
c99c9001 1166 ASSERT0(rows[i][j]);
1167 }
1168 ASSERT3U(rows[i][missing[i]], !=, 0);
1169
1170 /*
1171 * Compute the inverse of the first element and multiply each
1172 * element in the row by that value.
1173 */
1174 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1175
1176 for (j = 0; j < n; j++) {
1177 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1178 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1179 }
1180
1181 for (ii = 0; ii < nmissing; ii++) {
1182 if (i == ii)
1183 continue;
1184
1185 ASSERT3U(rows[ii][missing[i]], !=, 0);
1186
1187 log = vdev_raidz_log2[rows[ii][missing[i]]];
1188
1189 for (j = 0; j < n; j++) {
1190 rows[ii][j] ^=
1191 vdev_raidz_exp2(rows[i][j], log);
1192 invrows[ii][j] ^=
1193 vdev_raidz_exp2(invrows[i][j], log);
1194 }
1195 }
1196 }
1197
1198 /*
 * Verify that the data that is left in the rows is properly part of
1200 * an identity matrix.
1201 */
1202 for (i = 0; i < nmissing; i++) {
1203 for (j = 0; j < n; j++) {
1204 if (j == missing[i]) {
1205 ASSERT3U(rows[i][j], ==, 1);
1206 } else {
c99c9001 1207 ASSERT0(rows[i][j]);
1208 }
1209 }
1210 }
1211}
1212
1213static void
1214vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1215 int *missing, uint8_t **invrows, const uint8_t *used)
1216{
1217 int i, j, x, cc, c;
1218 uint8_t *src;
1219 uint64_t ccount;
1220 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1221 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1222 uint8_t log = 0;
1223 uint8_t val;
1224 int ll;
1225 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1226 uint8_t *p, *pp;
1227 size_t psize;
1228
1229 psize = sizeof (invlog[0][0]) * n * nmissing;
b8d06fca 1230 p = kmem_alloc(psize, KM_PUSHPAGE);
1231
1232 for (pp = p, i = 0; i < nmissing; i++) {
1233 invlog[i] = pp;
1234 pp += n;
1235 }
1236
1237 for (i = 0; i < nmissing; i++) {
1238 for (j = 0; j < n; j++) {
1239 ASSERT3U(invrows[i][j], !=, 0);
1240 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1241 }
1242 }
1243
1244 for (i = 0; i < n; i++) {
1245 c = used[i];
1246 ASSERT3U(c, <, rm->rm_cols);
1247
1248 src = rm->rm_col[c].rc_data;
1249 ccount = rm->rm_col[c].rc_size;
1250 for (j = 0; j < nmissing; j++) {
1251 cc = missing[j] + rm->rm_firstdatacol;
1252 ASSERT3U(cc, >=, rm->rm_firstdatacol);
1253 ASSERT3U(cc, <, rm->rm_cols);
1254 ASSERT3U(cc, !=, c);
1255
1256 dst[j] = rm->rm_col[cc].rc_data;
1257 dcount[j] = rm->rm_col[cc].rc_size;
1258 }
1259
1260 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1261
1262 for (x = 0; x < ccount; x++, src++) {
1263 if (*src != 0)
1264 log = vdev_raidz_log2[*src];
1265
1266 for (cc = 0; cc < nmissing; cc++) {
1267 if (x >= dcount[cc])
1268 continue;
1269
1270 if (*src == 0) {
1271 val = 0;
1272 } else {
1273 if ((ll = log + invlog[cc][i]) >= 255)
1274 ll -= 255;
1275 val = vdev_raidz_pow2[ll];
1276 }
1277
1278 if (i == 0)
1279 dst[cc][x] = val;
1280 else
1281 dst[cc][x] ^= val;
1282 }
1283 }
1284 }
1285
1286 kmem_free(p, psize);
1287}
1288
1289static int
1290vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1291{
1292 int n, i, c, t, tt;
1293 int nmissing_rows;
1294 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1295 int parity_map[VDEV_RAIDZ_MAXPARITY];
1296
1297 uint8_t *p, *pp;
1298 size_t psize;
1299
1300 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1301 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1302 uint8_t *used;
1303
1304 int code = 0;
1305
1306
1307 n = rm->rm_cols - rm->rm_firstdatacol;
1308
1309 /*
1310 * Figure out which data columns are missing.
1311 */
1312 nmissing_rows = 0;
1313 for (t = 0; t < ntgts; t++) {
1314 if (tgts[t] >= rm->rm_firstdatacol) {
1315 missing_rows[nmissing_rows++] =
1316 tgts[t] - rm->rm_firstdatacol;
1317 }
1318 }
1319
1320 /*
1321 * Figure out which parity columns to use to help generate the missing
1322 * data columns.
1323 */
1324 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1325 ASSERT(tt < ntgts);
1326 ASSERT(c < rm->rm_firstdatacol);
1327
1328 /*
1329 * Skip any targeted parity columns.
1330 */
1331 if (c == tgts[tt]) {
1332 tt++;
1333 continue;
1334 }
1335
1336 code |= 1 << c;
1337
1338 parity_map[i] = c;
1339 i++;
1340 }
1341
1342 ASSERT(code != 0);
1343 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1344
1345 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1346 nmissing_rows * n + sizeof (used[0]) * n;
b8d06fca 1347 p = kmem_alloc(psize, KM_PUSHPAGE);
1348
1349 for (pp = p, i = 0; i < nmissing_rows; i++) {
1350 rows[i] = pp;
1351 pp += n;
1352 invrows[i] = pp;
1353 pp += n;
1354 }
1355 used = pp;
1356
1357 for (i = 0; i < nmissing_rows; i++) {
1358 used[i] = parity_map[i];
1359 }
1360
1361 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1362 if (tt < nmissing_rows &&
1363 c == missing_rows[tt] + rm->rm_firstdatacol) {
1364 tt++;
1365 continue;
1366 }
1367
1368 ASSERT3S(i, <, n);
1369 used[i] = c;
1370 i++;
1371 }
1372
1373 /*
1374 * Initialize the interesting rows of the matrix.
1375 */
1376 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1377
1378 /*
1379 * Invert the matrix.
1380 */
1381 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1382 invrows, used);
1383
1384 /*
1385 * Reconstruct the missing data using the generated matrix.
1386 */
1387 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1388 invrows, used);
1389
1390 kmem_free(p, psize);
1391
1392 return (code);
1393}
1394
1395static int
1396vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1397{
1398 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1399 int ntgts;
1400 int i, c;
1401 int code;
1402 int nbadparity, nbaddata;
1403 int parity_valid[VDEV_RAIDZ_MAXPARITY];
1404
1405 /*
1406 * The tgts list must already be sorted.
1407 */
1408 for (i = 1; i < nt; i++) {
1409 ASSERT(t[i] > t[i - 1]);
1410 }
1411
1412 nbadparity = rm->rm_firstdatacol;
1413 nbaddata = rm->rm_cols - nbadparity;
1414 ntgts = 0;
1415 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1416 if (c < rm->rm_firstdatacol)
1417 parity_valid[c] = B_FALSE;
1418
1419 if (i < nt && c == t[i]) {
1420 tgts[ntgts++] = c;
1421 i++;
1422 } else if (rm->rm_col[c].rc_error != 0) {
1423 tgts[ntgts++] = c;
1424 } else if (c >= rm->rm_firstdatacol) {
1425 nbaddata--;
1426 } else {
1427 parity_valid[c] = B_TRUE;
1428 nbadparity--;
1429 }
1430 }
1431
1432 ASSERT(ntgts >= nt);
1433 ASSERT(nbaddata >= 0);
1434 ASSERT(nbaddata + nbadparity == ntgts);
1435
1436 dt = &tgts[nbadparity];
1437
1438 /*
1439 * See if we can use any of our optimized reconstruction routines.
1440 */
1441 if (!vdev_raidz_default_to_general) {
1442 switch (nbaddata) {
1443 case 1:
1444 if (parity_valid[VDEV_RAIDZ_P])
1445 return (vdev_raidz_reconstruct_p(rm, dt, 1));
1446
1447 ASSERT(rm->rm_firstdatacol > 1);
1448
1449 if (parity_valid[VDEV_RAIDZ_Q])
1450 return (vdev_raidz_reconstruct_q(rm, dt, 1));
1451
1452 ASSERT(rm->rm_firstdatacol > 2);
1453 break;
1454
1455 case 2:
1456 ASSERT(rm->rm_firstdatacol > 1);
1457
1458 if (parity_valid[VDEV_RAIDZ_P] &&
1459 parity_valid[VDEV_RAIDZ_Q])
1460 return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1461
1462 ASSERT(rm->rm_firstdatacol > 2);
1463
1464 break;
1465 }
1466 }
1467
1468 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1469 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1470 ASSERT(code > 0);
1471 return (code);
1472}
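
/*
 * The return value is a bitmask of the parity columns that were used for the
 * repair (e.g. (1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q) for a P+Q repair);
 * the combinatorial reconstruction path below uses it to index the
 * raidz_corrected[] counters.
 */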
1473
1474static int
1475vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1476 uint64_t *ashift)
1477{
1478 vdev_t *cvd;
1479 uint64_t nparity = vd->vdev_nparity;
45d1cae3 1480 int c;
34dc7c2f
BB
1481 int lasterror = 0;
1482 int numerrors = 0;
1483
1484 ASSERT(nparity > 0);
1485
1486 if (nparity > VDEV_RAIDZ_MAXPARITY ||
1487 vd->vdev_children < nparity + 1) {
1488 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
2e528b49 1489 return (SET_ERROR(EINVAL));
34dc7c2f
BB
1490 }
1491
45d1cae3
BB
1492 vdev_open_children(vd);
1493
34dc7c2f
BB
1494 for (c = 0; c < vd->vdev_children; c++) {
1495 cvd = vd->vdev_child[c];
1496
45d1cae3
BB
1497 if (cvd->vdev_open_error != 0) {
1498 lasterror = cvd->vdev_open_error;
34dc7c2f
BB
1499 numerrors++;
1500 continue;
1501 }
1502
1503 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1bd201e7 1504 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
34dc7c2f
BB
1505 *ashift = MAX(*ashift, cvd->vdev_ashift);
1506 }
1507
1508 *asize *= vd->vdev_children;
1bd201e7 1509 *max_asize *= vd->vdev_children;
34dc7c2f
BB
1510
1511 if (numerrors > nparity) {
1512 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1513 return (lasterror);
1514 }
1515
1516 return (0);
1517}
1518
1519static void
1520vdev_raidz_close(vdev_t *vd)
1521{
1522 int c;
1523
1524 for (c = 0; c < vd->vdev_children; c++)
1525 vdev_close(vd->vdev_child[c]);
1526}
1527
1528static uint64_t
1529vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1530{
1531 uint64_t asize;
1532 uint64_t ashift = vd->vdev_top->vdev_ashift;
1533 uint64_t cols = vd->vdev_children;
1534 uint64_t nparity = vd->vdev_nparity;
1535
1536 asize = ((psize - 1) >> ashift) + 1;
1537 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1538 asize = roundup(asize, nparity + 1) << ashift;
1539
1540 return (asize);
1541}
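
/*
 * For example, on a 6-wide raidz2 (cols = 6, nparity = 2) with ashift = 9,
 * a 16K psize maps to 32 sectors of data plus 2 * roundup(32 / 4) = 16
 * sectors of parity, and no padding (48 is already a multiple of
 * nparity + 1 = 3), for an asize of 48 sectors = 24K.
 */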
1542
1543static void
1544vdev_raidz_child_done(zio_t *zio)
1545{
1546 raidz_col_t *rc = zio->io_private;
1547
1548 rc->rc_error = zio->io_error;
1549 rc->rc_tried = 1;
1550 rc->rc_skipped = 0;
1551}
1552
1553/*
1554 * Start an IO operation on a RAIDZ VDev
1555 *
1556 * Outline:
1557 * - For write operations:
1558 * 1. Generate the parity data
1559 * 2. Create child zio write operations to each column's vdev, for both
1560 * data and parity.
1561 * 3. If the column skips any sectors for padding, create optional dummy
1562 * write zio children for those areas to improve aggregation continuity.
1563 * - For read operations:
1564 * 1. Create child zio read operations to each data column's vdev to read
1565 * the range of data required for zio.
1566 * 2. If this is a scrub or resilver operation, or if any of the data
1567 * vdevs have had errors, then create zio read operations to the parity
1568 * columns' VDevs as well.
1569 */
1570static int
1571vdev_raidz_io_start(zio_t *zio)
1572{
1573 vdev_t *vd = zio->io_vd;
1574 vdev_t *tvd = vd->vdev_top;
1575 vdev_t *cvd;
34dc7c2f
BB
1576 raidz_map_t *rm;
1577 raidz_col_t *rc;
45d1cae3 1578 int c, i;
34dc7c2f
BB
1579
1580 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1581 vd->vdev_nparity);
1582
1583 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1584
1585 if (zio->io_type == ZIO_TYPE_WRITE) {
45d1cae3 1586 vdev_raidz_generate_parity(rm);
34dc7c2f
BB
1587
1588 for (c = 0; c < rm->rm_cols; c++) {
1589 rc = &rm->rm_col[c];
1590 cvd = vd->vdev_child[rc->rc_devidx];
1591 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1592 rc->rc_offset, rc->rc_data, rc->rc_size,
b128c09f 1593 zio->io_type, zio->io_priority, 0,
34dc7c2f
BB
1594 vdev_raidz_child_done, rc));
1595 }
1596
45d1cae3
BB
1597 /*
1598 * Generate optional I/Os for any skipped sectors to improve
1599 * aggregation contiguity.
1600 */
428870ff 1601 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
45d1cae3
BB
1602 ASSERT(c <= rm->rm_scols);
1603 if (c == rm->rm_scols)
1604 c = 0;
1605 rc = &rm->rm_col[c];
1606 cvd = vd->vdev_child[rc->rc_devidx];
1607 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1608 rc->rc_offset + rc->rc_size, NULL,
1609 1 << tvd->vdev_ashift,
1610 zio->io_type, zio->io_priority,
1611 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1612 }
1613
b128c09f 1614 return (ZIO_PIPELINE_CONTINUE);
34dc7c2f
BB
1615 }
1616
1617 ASSERT(zio->io_type == ZIO_TYPE_READ);
1618
1619 /*
1620 * Iterate over the columns in reverse order so that we hit the parity
45d1cae3 1621 * last -- any errors along the way will force us to read the parity.
34dc7c2f
BB
1622 */
1623 for (c = rm->rm_cols - 1; c >= 0; c--) {
1624 rc = &rm->rm_col[c];
1625 cvd = vd->vdev_child[rc->rc_devidx];
1626 if (!vdev_readable(cvd)) {
1627 if (c >= rm->rm_firstdatacol)
1628 rm->rm_missingdata++;
1629 else
1630 rm->rm_missingparity++;
2e528b49 1631 rc->rc_error = SET_ERROR(ENXIO);
34dc7c2f
BB
1632 rc->rc_tried = 1; /* don't even try */
1633 rc->rc_skipped = 1;
1634 continue;
1635 }
428870ff 1636 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
34dc7c2f
BB
1637 if (c >= rm->rm_firstdatacol)
1638 rm->rm_missingdata++;
1639 else
1640 rm->rm_missingparity++;
2e528b49 1641 rc->rc_error = SET_ERROR(ESTALE);
34dc7c2f
BB
1642 rc->rc_skipped = 1;
1643 continue;
1644 }
1645 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
9babb374 1646 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
34dc7c2f
BB
1647 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1648 rc->rc_offset, rc->rc_data, rc->rc_size,
b128c09f 1649 zio->io_type, zio->io_priority, 0,
34dc7c2f
BB
1650 vdev_raidz_child_done, rc));
1651 }
1652 }
1653
b128c09f 1654 return (ZIO_PIPELINE_CONTINUE);
34dc7c2f
BB
1655}
1656
428870ff 1657
34dc7c2f
BB
1658/*
1659 * Report a checksum error for a child of a RAID-Z device.
1660 */
1661static void
428870ff 1662raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
34dc7c2f
BB
1663{
1664 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
34dc7c2f
BB
1665
1666 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1667 zio_bad_cksum_t zbc;
1668 raidz_map_t *rm = zio->io_vsd;
1669
34dc7c2f
BB
1670 mutex_enter(&vd->vdev_stat_lock);
1671 vd->vdev_stat.vs_checksum_errors++;
1672 mutex_exit(&vd->vdev_stat_lock);
1673
1674 zbc.zbc_has_cksum = 0;
1675 zbc.zbc_injected = rm->rm_ecksuminjected;
1676
1677 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1678 rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1679 &zbc);
34dc7c2f 1680 }
428870ff
BB
1681}
1682
1683/*
1684 * We keep track of whether or not there were any injected errors, so that
1685 * any ereports we generate can note it.
1686 */
1687static int
1688raidz_checksum_verify(zio_t *zio)
1689{
1690 zio_bad_cksum_t zbc;
1691 raidz_map_t *rm = zio->io_vsd;
d4ed6673 1692 int ret;
428870ff 1693
d4ed6673
BB
1694 bzero(&zbc, sizeof (zio_bad_cksum_t));
1695
1696 ret = zio_checksum_error(zio, &zbc);
428870ff
BB
1697 if (ret != 0 && zbc.zbc_injected != 0)
1698 rm->rm_ecksuminjected = 1;
34dc7c2f 1699
428870ff 1700 return (ret);
34dc7c2f
BB
1701}
1702
1703/*
1704 * Generate the parity from the data columns. If we tried and were able to
1705 * read the parity without error, verify that the generated parity matches the
1706 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
1708 */
1709static int
1710raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1711{
1712 void *orig[VDEV_RAIDZ_MAXPARITY];
1713 int c, ret = 0;
1714 raidz_col_t *rc;
1715
1716 for (c = 0; c < rm->rm_firstdatacol; c++) {
1717 rc = &rm->rm_col[c];
1718 if (!rc->rc_tried || rc->rc_error != 0)
1719 continue;
1720 orig[c] = zio_buf_alloc(rc->rc_size);
1721 bcopy(rc->rc_data, orig[c], rc->rc_size);
1722 }
1723
45d1cae3 1724 vdev_raidz_generate_parity(rm);
34dc7c2f
BB
1725
1726 for (c = 0; c < rm->rm_firstdatacol; c++) {
1727 rc = &rm->rm_col[c];
1728 if (!rc->rc_tried || rc->rc_error != 0)
1729 continue;
1730 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
428870ff 1731 raidz_checksum_error(zio, rc, orig[c]);
2e528b49 1732 rc->rc_error = SET_ERROR(ECKSUM);
34dc7c2f
BB
1733 ret++;
1734 }
1735 zio_buf_free(orig[c], rc->rc_size);
1736 }
1737
1738 return (ret);
1739}
1740
45d1cae3
BB
1741/*
1742 * Keep statistics on all the ways that we used parity to correct data.
1743 */
1744static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
34dc7c2f
BB
1745
1746static int
b128c09f
BB
1747vdev_raidz_worst_error(raidz_map_t *rm)
1748{
d6320ddb 1749 int c, error = 0;
b128c09f 1750
d6320ddb 1751 for (c = 0; c < rm->rm_cols; c++)
b128c09f
BB
1752 error = zio_worst_error(error, rm->rm_col[c].rc_error);
1753
1754 return (error);
1755}
1756
45d1cae3
BB
1757/*
1758 * Iterate over all combinations of bad data and attempt a reconstruction.
1759 * Note that the algorithm below is non-optimal because it doesn't take into
1760 * account how reconstruction is actually performed. For example, with
1761 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1762 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1763 * cases we'd only use parity information in column 0.
1764 */
1765static int
1766vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1767{
1768 raidz_map_t *rm = zio->io_vsd;
1769 raidz_col_t *rc;
1770 void *orig[VDEV_RAIDZ_MAXPARITY];
1771 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1772 int *tgts = &tstore[1];
5631c038 1773 int curr, next, i, c, n;
1774 int code, ret = 0;
1775
1776 ASSERT(total_errors < rm->rm_firstdatacol);
1777
1778 /*
1779 * This simplifies one edge condition.
1780 */
1781 tgts[-1] = -1;
1782
1783 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1784 /*
1785 * Initialize the targets array by finding the first n columns
1786 * that contain no error.
1787 *
1788 * If there were no data errors, we need to ensure that we're
1789 * always explicitly attempting to reconstruct at least one
1790 * data column. To do this, we simply push the highest target
1791 * up into the data columns.
1792 */
1793 for (c = 0, i = 0; i < n; i++) {
1794 if (i == n - 1 && data_errors == 0 &&
1795 c < rm->rm_firstdatacol) {
1796 c = rm->rm_firstdatacol;
1797 }
1798
1799 while (rm->rm_col[c].rc_error != 0) {
1800 c++;
1801 ASSERT3S(c, <, rm->rm_cols);
1802 }
1803
1804 tgts[i] = c++;
1805 }
1806
1807 /*
1808 * Setting tgts[n] simplifies the other edge condition.
1809 */
1810 tgts[n] = rm->rm_cols;
1811
1812 /*
1813 * These buffers were allocated in previous iterations.
1814 */
1815 for (i = 0; i < n - 1; i++) {
1816 ASSERT(orig[i] != NULL);
1817 }
1818
1819 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1820
1821 curr = 0;
1822 next = tgts[curr];
1823
1824 while (curr != n) {
1825 tgts[curr] = next;
1826 curr = 0;
1827
1828 /*
1829 * Save off the original data that we're going to
1830 * attempt to reconstruct.
1831 */
1832 for (i = 0; i < n; i++) {
1833 ASSERT(orig[i] != NULL);
1834 c = tgts[i];
1835 ASSERT3S(c, >=, 0);
1836 ASSERT3S(c, <, rm->rm_cols);
1837 rc = &rm->rm_col[c];
1838 bcopy(rc->rc_data, orig[i], rc->rc_size);
1839 }
1840
1841 /*
1842 * Attempt a reconstruction and exit the outer loop on
1843 * success.
1844 */
1845 code = vdev_raidz_reconstruct(rm, tgts, n);
1846 if (raidz_checksum_verify(zio) == 0) {
1847 atomic_inc_64(&raidz_corrected[code]);
1848
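/*
 * Mark each reconstructed column as having returned bad data; a
 * checksum ereport is only posted for columns we actually read
 * (rc_tried), since the others never produced data of their own.
 */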
1849 for (i = 0; i < n; i++) {
1850 c = tgts[i];
1851 rc = &rm->rm_col[c];
1852 ASSERT(rc->rc_error == 0);
1853 if (rc->rc_tried)
1854 raidz_checksum_error(zio, rc,
1855 orig[i]);
1856 rc->rc_error = SET_ERROR(ECKSUM);
1857 }
1858
1859 ret = code;
1860 goto done;
1861 }
1862
1863 /*
1864 * Restore the original data.
1865 */
1866 for (i = 0; i < n; i++) {
1867 c = tgts[i];
1868 rc = &rm->rm_col[c];
1869 bcopy(orig[i], rc->rc_data, rc->rc_size);
1870 }
1871
1872 do {
1873 /*
1874 * Find the next valid column after the curr
1875 * position.
1876 */
1877 for (next = tgts[curr] + 1;
1878 next < rm->rm_cols &&
1879 rm->rm_col[next].rc_error != 0; next++)
1880 continue;
1881
1882 ASSERT(next <= tgts[curr + 1]);
1883
1884 /*
1885 * If that spot is available, we're done here.
1886 */
1887 if (next != tgts[curr + 1])
1888 break;
1889
1890 /*
1891 * Otherwise, find the next valid column after
1892 * the previous position.
1893 */
1894 for (c = tgts[curr - 1] + 1;
1895 rm->rm_col[c].rc_error != 0; c++)
1896 continue;
1897
1898 tgts[curr] = c;
1899 curr++;
1900
1901 } while (curr != n);
1902 }
1903 }
1904 n--;
1905done:
1906 for (i = 0; i < n; i++) {
1907 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
1908 }
1909
1910 return (ret);
1911}
1912
1913/*
1914 * Complete an IO operation on a RAIDZ VDev
1915 *
1916 * Outline:
1917 * - For write operations:
1918 * 1. Check for errors on the child IOs.
1919 * 2. Return, setting an error code if too few child VDevs were written
1920 * to reconstruct the data later. Note that partial writes are
1921 * considered successful if they can be reconstructed at all.
1922 * - For read operations:
1923 * 1. Check for errors on the child IOs.
1924 * 2. If data errors occurred:
1925 * a. Try to reassemble the data from the parity available.
1926 * b. If we haven't yet read the parity drives, read them now.
1927 * c. If all parity drives have been read but the data still doesn't
1928 * reassemble with a correct checksum, then try combinatorial
1929 * reconstruction.
1930 * d. If that doesn't work, return an error.
1931 * 3. If there were unexpected errors or this is a resilver operation,
1932 * rewrite the vdevs that had errors.
1933 */
1934static void
1935vdev_raidz_io_done(zio_t *zio)
1936{
1937 vdev_t *vd = zio->io_vd;
1938 vdev_t *cvd;
1939 raidz_map_t *rm = zio->io_vsd;
1940 raidz_col_t *rc = NULL;
1941 int unexpected_errors = 0;
1942 int parity_errors = 0;
1943 int parity_untried = 0;
1944 int data_errors = 0;
1945 int total_errors = 0;
1946 int n, c;
1947 int tgts[VDEV_RAIDZ_MAXPARITY];
1948 int code;
1949
1950 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
1951
1952 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1953 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1954
1955 for (c = 0; c < rm->rm_cols; c++) {
1956 rc = &rm->rm_col[c];
1957
1958 if (rc->rc_error) {
1959 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
1960
1961 if (c < rm->rm_firstdatacol)
1962 parity_errors++;
1963 else
1964 data_errors++;
1965
1966 if (!rc->rc_skipped)
1967 unexpected_errors++;
1968
1969 total_errors++;
1970 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1971 parity_untried++;
1972 }
1973 }
1974
1975 if (zio->io_type == ZIO_TYPE_WRITE) {
1976 /*
1977 * XXX -- for now, treat partial writes as a success.
1978 * (If we couldn't write enough columns to reconstruct
1979 * the data, the I/O failed. Otherwise, good enough.)
1980 *
1981 * Now that we support write reallocation, it would be better
1982 * to treat partial failure as real failure unless there are
1983 * no non-degraded top-level vdevs left, and not update DTLs
1984 * if we intend to reallocate.
1985 */
1986 /* XXPOLICY */
1987 if (total_errors > rm->rm_firstdatacol)
1988 zio->io_error = vdev_raidz_worst_error(rm);
1989
1990 return;
1991 }
1992
1993 ASSERT(zio->io_type == ZIO_TYPE_READ);
1994 /*
1995 * There are three potential phases for a read:
1996 * 1. produce valid data from the columns read
1997 * 2. read all disks and try again
1998 * 3. perform combinatorial reconstruction
1999 *
2000 * Each phase is progressively both more expensive and less likely to
2001 * occur. If we encounter more errors than we can repair or all phases
2002 * fail, we have no choice but to return an error.
2003 */
2004
2005 /*
2006 * If the number of errors we saw was correctable -- less than or equal
2007 * to the number of parity disks read -- attempt to produce data that
2008 * has a valid checksum. Naturally, this case applies in the absence of
2009 * any errors.
2010 */
2011 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
2012 if (data_errors == 0) {
2013 if (raidz_checksum_verify(zio) == 0) {
2014 /*
2015 * If we read parity information (unnecessarily
2016 * as it happens since no reconstruction was
2017 * needed) regenerate and verify the parity.
2018 * We also regenerate parity when resilvering
2019 * so we can write it out to the failed device
2020 * later.
2021 */
2022 if (parity_errors + parity_untried <
2023 rm->rm_firstdatacol ||
2024 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2025 n = raidz_parity_verify(zio, rm);
2026 unexpected_errors += n;
2027 ASSERT(parity_errors + n <=
2028 rm->rm_firstdatacol);
2029 }
2030 goto done;
2031 }
2032 } else {
2033 /*
2034 * We either attempt to read all the parity columns or
2035 * none of them. If we didn't try to read parity, we
2036 * wouldn't be here in the correctable case. There must
2037 * also have been fewer parity errors than parity
2038 * columns or, again, we wouldn't be in this code path.
2039 */
2040 ASSERT(parity_untried == 0);
2041 ASSERT(parity_errors < rm->rm_firstdatacol);
2042
2043 /*
2044 * Identify the data columns that reported an error.
2045 */
2046 n = 0;
2047 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
2048 rc = &rm->rm_col[c];
2049 if (rc->rc_error != 0) {
2050 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
2051 tgts[n++] = c;
2052 }
2053 }
2054
2055 ASSERT(rm->rm_firstdatacol >= n);
2056
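/*
 * Reconstruct the columns identified in tgts[]; the returned code is
 * used below to credit the raidz_corrected[] statistics if the result
 * passes the checksum.
 */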
2057 code = vdev_raidz_reconstruct(rm, tgts, n);
2058
2059 if (raidz_checksum_verify(zio) == 0) {
2060 atomic_inc_64(&raidz_corrected[code]);
2061
2062 /*
2063 * If we read more parity disks than were used
2064 * for reconstruction, confirm that the other
2065 * parity disks produced correct data. This
2066 * routine is suboptimal in that it regenerates
2067 * the parity that we already used in addition
2068 * to the parity that we're attempting to
2069 * verify, but this should be a relatively
2070 * uncommon case, and can be optimized if it
2071 * becomes a problem. Note that we regenerate
2072 * parity when resilvering so we can write it
2073 * out to failed devices later.
2074 */
2075 if (parity_errors < rm->rm_firstdatacol - n ||
2076 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2077 n = raidz_parity_verify(zio, rm);
2078 unexpected_errors += n;
2079 ASSERT(parity_errors + n <=
2080 rm->rm_firstdatacol);
2081 }
2082
2083 goto done;
2084 }
2085 }
2086 }
2087
2088 /*
2089 * This isn't a typical situation -- either we got a read error or
2090 * a child silently returned bad data. Read every block so we can
2091 * try again with as much data and parity as we can track down. If
2092 * we've already been through once before, all children will be marked
2093 * as tried so we'll proceed to combinatorial reconstruction.
2094 */
2095 unexpected_errors = 1;
2096 rm->rm_missingdata = 0;
2097 rm->rm_missingparity = 0;
2098
2099 for (c = 0; c < rm->rm_cols; c++) {
2100 if (rm->rm_col[c].rc_tried)
2101 continue;
2102
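/*
 * Found a column that has not been read yet: re-queue the zio, issue
 * child reads for every remaining untried column, and return.  This
 * routine will run again once those reads complete.
 */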
2103 zio_vdev_io_redone(zio);
2104 do {
2105 rc = &rm->rm_col[c];
2106 if (rc->rc_tried)
2107 continue;
2108 zio_nowait(zio_vdev_child_io(zio, NULL,
2109 vd->vdev_child[rc->rc_devidx],
2110 rc->rc_offset, rc->rc_data, rc->rc_size,
2111 zio->io_type, zio->io_priority, 0,
2112 vdev_raidz_child_done, rc));
2113 } while (++c < rm->rm_cols);
2114
2115 return;
2116 }
2117
2118 /*
2119 * At this point we've attempted to reconstruct the data given the
2120 * errors we detected, and we've attempted to read all columns. There
2121 * must, therefore, be one or more additional problems -- silent errors
2122 * resulting in invalid data rather than explicit I/O errors resulting
2123 * in absent data. We check if there is enough additional data to
2124 * possibly reconstruct the data and then perform combinatorial
2125 * reconstruction over all possible combinations. If that fails,
2126 * we're cooked.
2127 */
2128 if (total_errors > rm->rm_firstdatacol) {
2129 zio->io_error = vdev_raidz_worst_error(rm);
2130
2131 } else if (total_errors < rm->rm_firstdatacol &&
2132 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2133 /*
2134 * If we didn't use all the available parity for the
2135 * combinatorial reconstruction, verify that the remaining
2136 * parity is correct.
2137 */
2138 if (code != (1 << rm->rm_firstdatacol) - 1)
2139 (void) raidz_parity_verify(zio, rm);
2140 } else {
2141 /*
2142 * We're here because either:
2143 *
2144 * total_errors == rm_firstdatacol, or
2145 * vdev_raidz_combrec() failed
2146 *
2147 * In either case, there is enough bad data to prevent
2148 * reconstruction.
2149 *
2150 * Start checksum ereports for all children which haven't
2151 * failed, and the IO wasn't speculative.
2152 */
2153 zio->io_error = SET_ERROR(ECKSUM);
2154
2155 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2156 for (c = 0; c < rm->rm_cols; c++) {
2157 rc = &rm->rm_col[c];
2158 if (rc->rc_error == 0) {
2159 zio_bad_cksum_t zbc;
2160 zbc.zbc_has_cksum = 0;
2161 zbc.zbc_injected =
2162 rm->rm_ecksuminjected;
2163
2164 zfs_ereport_start_checksum(
2165 zio->io_spa,
2166 vd->vdev_child[rc->rc_devidx],
2167 zio, rc->rc_offset, rc->rc_size,
2168 (void *)(uintptr_t)c, &zbc);
2169 }
2170 }
2171 }
2172 }
2173
2174done:
2175 zio_checksum_verified(zio);
2176
2177 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2178 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2179 /*
2180 * Use the good data we have in hand to repair damaged children.
2181 */
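/*
 * Repair writes are tagged ZIO_FLAG_IO_REPAIR; when the damage was
 * unexpected (rather than a resilver pass over known-missing data),
 * they are additionally tagged ZIO_FLAG_SELF_HEAL.
 */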
2182 for (c = 0; c < rm->rm_cols; c++) {
2183 rc = &rm->rm_col[c];
2184 cvd = vd->vdev_child[rc->rc_devidx];
2185
2186 if (rc->rc_error == 0)
2187 continue;
2188
2189 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2190 rc->rc_offset, rc->rc_data, rc->rc_size,
2191 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2192 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2193 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2194 }
2195 }
2196}
2197
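/*
 * Reflect the health of the children in the RAID-Z vdev's own state:
 * more faulted children than parity devices means the vdev cannot be
 * opened, any faulted or degraded child leaves it degraded, and
 * otherwise it is healthy.
 */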
2198static void
2199vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2200{
2201 if (faulted > vd->vdev_nparity)
2202 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2203 VDEV_AUX_NO_REPLICAS);
2204 else if (degraded + faulted != 0)
2205 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2206 else
2207 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2208}
2209
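/*
 * The two NULL entries are optional vdev_ops_t callbacks (presumably the
 * hold/rele hooks in this layout of the structure) that RAID-Z does not
 * need to implement.
 */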
2210vdev_ops_t vdev_raidz_ops = {
2211 vdev_raidz_open,
2212 vdev_raidz_close,
2213 vdev_raidz_asize,
2214 vdev_raidz_io_start,
2215 vdev_raidz_io_done,
2216 vdev_raidz_state_change,
2217 NULL,
2218 NULL,
2219 VDEV_TYPE_RAIDZ, /* name of this vdev type */
2220 B_FALSE /* not a leaf vdev */
2221};