]> git.proxmox.com Git - mirror_zfs.git/blob - module/zfs/vdev_raidz.c
Illumos #3598
[mirror_zfs.git] / module / zfs / vdev_raidz.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 * Copyright (c) 2013 by Delphix. All rights reserved.
25 */
26
27 #include <sys/zfs_context.h>
28 #include <sys/spa.h>
29 #include <sys/vdev_impl.h>
30 #include <sys/zio.h>
31 #include <sys/zio_checksum.h>
32 #include <sys/fs/zfs.h>
33 #include <sys/fm/fs/zfs.h>
34
35 /*
36 * Virtual device vector for RAID-Z.
37 *
38 * This vdev supports single, double, and triple parity. For single parity,
39 * we use a simple XOR of all the data columns. For double or triple parity,
40 * we use a special case of Reed-Solomon coding. This extends the
41 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
42 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
43 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
44 * former is also based. The latter is designed to provide higher performance
45 * for writes.
46 *
47 * Note that the Plank paper claimed to support arbitrary N+M, but was then
48 * amended six years later identifying a critical flaw that invalidates its
49 * claims. Nevertheless, the technique can be adapted to work for up to
50 * triple parity. For additional parity, the amendment "Note: Correction to
51 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
52 * is viable, but the additional complexity means that write performance will
53 * suffer.
54 *
55 * All of the methods above operate on a Galois field, defined over the
56 * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
57 * can be expressed with a single byte. Briefly, the operations on the
58 * field are defined as follows:
59 *
60 * o addition (+) is represented by a bitwise XOR
61 * o subtraction (-) is therefore identical to addition: A + B = A - B
62 * o multiplication of A by 2 is defined by the following bitwise expression:
63 * (A * 2)_7 = A_6
64 * (A * 2)_6 = A_5
65 * (A * 2)_5 = A_4
66 * (A * 2)_4 = A_3 + A_7
67 * (A * 2)_3 = A_2 + A_7
68 * (A * 2)_2 = A_1 + A_7
69 * (A * 2)_1 = A_0
70 * (A * 2)_0 = A_7
71 *
72 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
73 * As an aside, this multiplication is derived from the error correcting
74 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
75 *
76 * Observe that any number in the field (except for 0) can be expressed as a
77 * power of 2 -- a generator for the field. We store a table of the powers of
78 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
79 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
80 * than field addition). The inverse of a field element A (A^-1) is therefore
81 * A ^ (255 - 1) = A^254.
82 *
83 * The up-to-three parity columns, P, Q, R over several data columns,
84 * D_0, ... D_n-1, can be expressed by field operations:
85 *
86 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
87 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
88 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
89 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
90 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
91 *
92 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
93 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
94 * independent coefficients. (There are no additional coefficients that have
95 * this property which is why the uncorrected Plank method breaks down.)
96 *
97 * See the reconstruction code below for how P, Q and R can used individually
98 * or in concert to recover missing data columns.
99 */
100
101 typedef struct raidz_col {
102 uint64_t rc_devidx; /* child device index for I/O */
103 uint64_t rc_offset; /* device offset */
104 uint64_t rc_size; /* I/O size */
105 void *rc_data; /* I/O data */
106 void *rc_gdata; /* used to store the "good" version */
107 int rc_error; /* I/O error for this device */
108 uint8_t rc_tried; /* Did we attempt this I/O column? */
109 uint8_t rc_skipped; /* Did we skip this I/O column? */
110 } raidz_col_t;
111
112 typedef struct raidz_map {
113 uint64_t rm_cols; /* Regular column count */
114 uint64_t rm_scols; /* Count including skipped columns */
115 uint64_t rm_bigcols; /* Number of oversized columns */
116 uint64_t rm_asize; /* Actual total I/O size */
117 uint64_t rm_missingdata; /* Count of missing data devices */
118 uint64_t rm_missingparity; /* Count of missing parity devices */
119 uint64_t rm_firstdatacol; /* First data column/parity count */
120 uint64_t rm_nskip; /* Skipped sectors for padding */
121 uint64_t rm_skipstart; /* Column index of padding start */
122 void *rm_datacopy; /* rm_asize-buffer of copied data */
123 uintptr_t rm_reports; /* # of referencing checksum reports */
124 uint8_t rm_freed; /* map no longer has referencing ZIO */
125 uint8_t rm_ecksuminjected; /* checksum error was injected */
126 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
127 } raidz_map_t;
128
129 #define VDEV_RAIDZ_P 0
130 #define VDEV_RAIDZ_Q 1
131 #define VDEV_RAIDZ_R 2
132
133 #define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
134 #define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
135
136 /*
137 * We provide a mechanism to perform the field multiplication operation on a
138 * 64-bit value all at once rather than a byte at a time. This works by
139 * creating a mask from the top bit in each byte and using that to
140 * conditionally apply the XOR of 0x1d.
141 */
142 #define VDEV_RAIDZ_64MUL_2(x, mask) \
143 { \
144 (mask) = (x) & 0x8080808080808080ULL; \
145 (mask) = ((mask) << 1) - ((mask) >> 7); \
146 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
147 ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
148 }
149
150 #define VDEV_RAIDZ_64MUL_4(x, mask) \
151 { \
152 VDEV_RAIDZ_64MUL_2((x), mask); \
153 VDEV_RAIDZ_64MUL_2((x), mask); \
154 }
155
156 /*
157 * Force reconstruction to use the general purpose method.
158 */
159 int vdev_raidz_default_to_general;
160
161 /*
162 * These two tables represent powers and logs of 2 in the Galois field defined
163 * above. These values were computed by repeatedly multiplying by 2 as above.
164 */
165 static const uint8_t vdev_raidz_pow2[256] = {
166 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
167 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
168 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
169 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
170 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
171 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
172 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
173 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
174 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
175 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
176 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
177 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
178 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
179 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
180 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
181 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
182 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
183 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
184 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
185 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
186 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
187 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
188 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
189 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
190 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
191 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
192 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
193 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
194 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
195 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
196 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
197 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
198 };
199 static const uint8_t vdev_raidz_log2[256] = {
200 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
201 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
202 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
203 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
204 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
205 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
206 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
207 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
208 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
209 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
210 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
211 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
212 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
213 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
214 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
215 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
216 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
217 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
218 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
219 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
220 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
221 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
222 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
223 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
224 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
225 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
226 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
227 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
228 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
229 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
230 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
231 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
232 };
233
234 static void vdev_raidz_generate_parity(raidz_map_t *rm);
235
236 /*
237 * Multiply a given number by 2 raised to the given power.
238 */
239 static uint8_t
240 vdev_raidz_exp2(uint_t a, int exp)
241 {
242 if (a == 0)
243 return (0);
244
245 ASSERT(exp >= 0);
246 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
247
248 exp += vdev_raidz_log2[a];
249 if (exp > 255)
250 exp -= 255;
251
252 return (vdev_raidz_pow2[exp]);
253 }
254
255 static void
256 vdev_raidz_map_free(raidz_map_t *rm)
257 {
258 int c;
259 size_t size;
260
261 for (c = 0; c < rm->rm_firstdatacol; c++) {
262 zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
263
264 if (rm->rm_col[c].rc_gdata != NULL)
265 zio_buf_free(rm->rm_col[c].rc_gdata,
266 rm->rm_col[c].rc_size);
267 }
268
269 size = 0;
270 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
271 size += rm->rm_col[c].rc_size;
272
273 if (rm->rm_datacopy != NULL)
274 zio_buf_free(rm->rm_datacopy, size);
275
276 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
277 }
278
279 static void
280 vdev_raidz_map_free_vsd(zio_t *zio)
281 {
282 raidz_map_t *rm = zio->io_vsd;
283
284 ASSERT0(rm->rm_freed);
285 rm->rm_freed = 1;
286
287 if (rm->rm_reports == 0)
288 vdev_raidz_map_free(rm);
289 }
290
291 /*ARGSUSED*/
292 static void
293 vdev_raidz_cksum_free(void *arg, size_t ignored)
294 {
295 raidz_map_t *rm = arg;
296
297 ASSERT3U(rm->rm_reports, >, 0);
298
299 if (--rm->rm_reports == 0 && rm->rm_freed != 0)
300 vdev_raidz_map_free(rm);
301 }
302
303 static void
304 vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
305 {
306 raidz_map_t *rm = zcr->zcr_cbdata;
307 size_t c = zcr->zcr_cbinfo;
308 size_t x;
309
310 const char *good = NULL;
311 const char *bad = rm->rm_col[c].rc_data;
312
313 if (good_data == NULL) {
314 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
315 return;
316 }
317
318 if (c < rm->rm_firstdatacol) {
319 /*
320 * The first time through, calculate the parity blocks for
321 * the good data (this relies on the fact that the good
322 * data never changes for a given logical ZIO)
323 */
324 if (rm->rm_col[0].rc_gdata == NULL) {
325 char *bad_parity[VDEV_RAIDZ_MAXPARITY];
326 char *buf;
327
328 /*
329 * Set up the rm_col[]s to generate the parity for
330 * good_data, first saving the parity bufs and
331 * replacing them with buffers to hold the result.
332 */
333 for (x = 0; x < rm->rm_firstdatacol; x++) {
334 bad_parity[x] = rm->rm_col[x].rc_data;
335 rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
336 zio_buf_alloc(rm->rm_col[x].rc_size);
337 }
338
339 /* fill in the data columns from good_data */
340 buf = (char *)good_data;
341 for (; x < rm->rm_cols; x++) {
342 rm->rm_col[x].rc_data = buf;
343 buf += rm->rm_col[x].rc_size;
344 }
345
346 /*
347 * Construct the parity from the good data.
348 */
349 vdev_raidz_generate_parity(rm);
350
351 /* restore everything back to its original state */
352 for (x = 0; x < rm->rm_firstdatacol; x++)
353 rm->rm_col[x].rc_data = bad_parity[x];
354
355 buf = rm->rm_datacopy;
356 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
357 rm->rm_col[x].rc_data = buf;
358 buf += rm->rm_col[x].rc_size;
359 }
360 }
361
362 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
363 good = rm->rm_col[c].rc_gdata;
364 } else {
365 /* adjust good_data to point at the start of our column */
366 good = good_data;
367
368 for (x = rm->rm_firstdatacol; x < c; x++)
369 good += rm->rm_col[x].rc_size;
370 }
371
372 /* we drop the ereport if it ends up that the data was good */
373 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
374 }
375
376 /*
377 * Invoked indirectly by zfs_ereport_start_checksum(), called
378 * below when our read operation fails completely. The main point
379 * is to keep a copy of everything we read from disk, so that at
380 * vdev_raidz_cksum_finish() time we can compare it with the good data.
381 */
382 static void
383 vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
384 {
385 size_t c = (size_t)(uintptr_t)arg;
386 caddr_t buf;
387
388 raidz_map_t *rm = zio->io_vsd;
389 size_t size;
390
391 /* set up the report and bump the refcount */
392 zcr->zcr_cbdata = rm;
393 zcr->zcr_cbinfo = c;
394 zcr->zcr_finish = vdev_raidz_cksum_finish;
395 zcr->zcr_free = vdev_raidz_cksum_free;
396
397 rm->rm_reports++;
398 ASSERT3U(rm->rm_reports, >, 0);
399
400 if (rm->rm_datacopy != NULL)
401 return;
402
403 /*
404 * It's the first time we're called for this raidz_map_t, so we need
405 * to copy the data aside; there's no guarantee that our zio's buffer
406 * won't be re-used for something else.
407 *
408 * Our parity data is already in separate buffers, so there's no need
409 * to copy them.
410 */
411
412 size = 0;
413 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
414 size += rm->rm_col[c].rc_size;
415
416 buf = rm->rm_datacopy = zio_buf_alloc(size);
417
418 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
419 raidz_col_t *col = &rm->rm_col[c];
420
421 bcopy(col->rc_data, buf, col->rc_size);
422 col->rc_data = buf;
423
424 buf += col->rc_size;
425 }
426 ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
427 }
428
429 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
430 vdev_raidz_map_free_vsd,
431 vdev_raidz_cksum_report
432 };
433
434 static raidz_map_t *
435 vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
436 uint64_t nparity)
437 {
438 raidz_map_t *rm;
439 uint64_t b = zio->io_offset >> unit_shift;
440 uint64_t s = zio->io_size >> unit_shift;
441 uint64_t f = b % dcols;
442 uint64_t o = (b / dcols) << unit_shift;
443 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
444
445 q = s / (dcols - nparity);
446 r = s - q * (dcols - nparity);
447 bc = (r == 0 ? 0 : r + nparity);
448 tot = s + nparity * (q + (r == 0 ? 0 : 1));
449
450 if (q == 0) {
451 acols = bc;
452 scols = MIN(dcols, roundup(bc, nparity + 1));
453 } else {
454 acols = dcols;
455 scols = dcols;
456 }
457
458 ASSERT3U(acols, <=, scols);
459
460 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_PUSHPAGE);
461
462 rm->rm_cols = acols;
463 rm->rm_scols = scols;
464 rm->rm_bigcols = bc;
465 rm->rm_skipstart = bc;
466 rm->rm_missingdata = 0;
467 rm->rm_missingparity = 0;
468 rm->rm_firstdatacol = nparity;
469 rm->rm_datacopy = NULL;
470 rm->rm_reports = 0;
471 rm->rm_freed = 0;
472 rm->rm_ecksuminjected = 0;
473
474 asize = 0;
475
476 for (c = 0; c < scols; c++) {
477 col = f + c;
478 coff = o;
479 if (col >= dcols) {
480 col -= dcols;
481 coff += 1ULL << unit_shift;
482 }
483 rm->rm_col[c].rc_devidx = col;
484 rm->rm_col[c].rc_offset = coff;
485 rm->rm_col[c].rc_data = NULL;
486 rm->rm_col[c].rc_gdata = NULL;
487 rm->rm_col[c].rc_error = 0;
488 rm->rm_col[c].rc_tried = 0;
489 rm->rm_col[c].rc_skipped = 0;
490
491 if (c >= acols)
492 rm->rm_col[c].rc_size = 0;
493 else if (c < bc)
494 rm->rm_col[c].rc_size = (q + 1) << unit_shift;
495 else
496 rm->rm_col[c].rc_size = q << unit_shift;
497
498 asize += rm->rm_col[c].rc_size;
499 }
500
501 ASSERT3U(asize, ==, tot << unit_shift);
502 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
503 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
504 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
505 ASSERT3U(rm->rm_nskip, <=, nparity);
506
507 for (c = 0; c < rm->rm_firstdatacol; c++)
508 rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
509
510 rm->rm_col[c].rc_data = zio->io_data;
511
512 for (c = c + 1; c < acols; c++)
513 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
514 rm->rm_col[c - 1].rc_size;
515
516 /*
517 * If all data stored spans all columns, there's a danger that parity
518 * will always be on the same device and, since parity isn't read
519 * during normal operation, that that device's I/O bandwidth won't be
520 * used effectively. We therefore switch the parity every 1MB.
521 *
522 * ... at least that was, ostensibly, the theory. As a practical
523 * matter unless we juggle the parity between all devices evenly, we
524 * won't see any benefit. Further, occasional writes that aren't a
525 * multiple of the LCM of the number of children and the minimum
526 * stripe width are sufficient to avoid pessimal behavior.
527 * Unfortunately, this decision created an implicit on-disk format
528 * requirement that we need to support for all eternity, but only
529 * for single-parity RAID-Z.
530 *
531 * If we intend to skip a sector in the zeroth column for padding
532 * we must make sure to note this swap. We will never intend to
533 * skip the first column since at least one data and one parity
534 * column must appear in each row.
535 */
536 ASSERT(rm->rm_cols >= 2);
537 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
538
539 if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
540 devidx = rm->rm_col[0].rc_devidx;
541 o = rm->rm_col[0].rc_offset;
542 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
543 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
544 rm->rm_col[1].rc_devidx = devidx;
545 rm->rm_col[1].rc_offset = o;
546
547 if (rm->rm_skipstart == 0)
548 rm->rm_skipstart = 1;
549 }
550
551 zio->io_vsd = rm;
552 zio->io_vsd_ops = &vdev_raidz_vsd_ops;
553 return (rm);
554 }
555
556 static void
557 vdev_raidz_generate_parity_p(raidz_map_t *rm)
558 {
559 uint64_t *p, *src, pcount, ccount, i;
560 int c;
561
562 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
563
564 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
565 src = rm->rm_col[c].rc_data;
566 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
567 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
568
569 if (c == rm->rm_firstdatacol) {
570 ASSERT(ccount == pcount);
571 for (i = 0; i < ccount; i++, src++, p++) {
572 *p = *src;
573 }
574 } else {
575 ASSERT(ccount <= pcount);
576 for (i = 0; i < ccount; i++, src++, p++) {
577 *p ^= *src;
578 }
579 }
580 }
581 }
582
583 static void
584 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
585 {
586 uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
587 int c;
588
589 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
590 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
591 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
592
593 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
594 src = rm->rm_col[c].rc_data;
595 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
596 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
597
598 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
599
600 if (c == rm->rm_firstdatacol) {
601 ASSERT(ccnt == pcnt || ccnt == 0);
602 for (i = 0; i < ccnt; i++, src++, p++, q++) {
603 *p = *src;
604 *q = *src;
605 }
606 for (; i < pcnt; i++, src++, p++, q++) {
607 *p = 0;
608 *q = 0;
609 }
610 } else {
611 ASSERT(ccnt <= pcnt);
612
613 /*
614 * Apply the algorithm described above by multiplying
615 * the previous result and adding in the new value.
616 */
617 for (i = 0; i < ccnt; i++, src++, p++, q++) {
618 *p ^= *src;
619
620 VDEV_RAIDZ_64MUL_2(*q, mask);
621 *q ^= *src;
622 }
623
624 /*
625 * Treat short columns as though they are full of 0s.
626 * Note that there's therefore nothing needed for P.
627 */
628 for (; i < pcnt; i++, q++) {
629 VDEV_RAIDZ_64MUL_2(*q, mask);
630 }
631 }
632 }
633 }
634
635 static void
636 vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
637 {
638 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
639 int c;
640
641 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
642 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
643 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
644 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
645 rm->rm_col[VDEV_RAIDZ_R].rc_size);
646
647 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
648 src = rm->rm_col[c].rc_data;
649 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
650 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
651 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
652
653 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
654
655 if (c == rm->rm_firstdatacol) {
656 ASSERT(ccnt == pcnt || ccnt == 0);
657 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
658 *p = *src;
659 *q = *src;
660 *r = *src;
661 }
662 for (; i < pcnt; i++, src++, p++, q++, r++) {
663 *p = 0;
664 *q = 0;
665 *r = 0;
666 }
667 } else {
668 ASSERT(ccnt <= pcnt);
669
670 /*
671 * Apply the algorithm described above by multiplying
672 * the previous result and adding in the new value.
673 */
674 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
675 *p ^= *src;
676
677 VDEV_RAIDZ_64MUL_2(*q, mask);
678 *q ^= *src;
679
680 VDEV_RAIDZ_64MUL_4(*r, mask);
681 *r ^= *src;
682 }
683
684 /*
685 * Treat short columns as though they are full of 0s.
686 * Note that there's therefore nothing needed for P.
687 */
688 for (; i < pcnt; i++, q++, r++) {
689 VDEV_RAIDZ_64MUL_2(*q, mask);
690 VDEV_RAIDZ_64MUL_4(*r, mask);
691 }
692 }
693 }
694 }
695
696 /*
697 * Generate RAID parity in the first virtual columns according to the number of
698 * parity columns available.
699 */
700 static void
701 vdev_raidz_generate_parity(raidz_map_t *rm)
702 {
703 switch (rm->rm_firstdatacol) {
704 case 1:
705 vdev_raidz_generate_parity_p(rm);
706 break;
707 case 2:
708 vdev_raidz_generate_parity_pq(rm);
709 break;
710 case 3:
711 vdev_raidz_generate_parity_pqr(rm);
712 break;
713 default:
714 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
715 }
716 }
717
718 static int
719 vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
720 {
721 uint64_t *dst, *src, xcount, ccount, count, i;
722 int x = tgts[0];
723 int c;
724
725 ASSERT(ntgts == 1);
726 ASSERT(x >= rm->rm_firstdatacol);
727 ASSERT(x < rm->rm_cols);
728
729 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
730 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
731 ASSERT(xcount > 0);
732
733 src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
734 dst = rm->rm_col[x].rc_data;
735 for (i = 0; i < xcount; i++, dst++, src++) {
736 *dst = *src;
737 }
738
739 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
740 src = rm->rm_col[c].rc_data;
741 dst = rm->rm_col[x].rc_data;
742
743 if (c == x)
744 continue;
745
746 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
747 count = MIN(ccount, xcount);
748
749 for (i = 0; i < count; i++, dst++, src++) {
750 *dst ^= *src;
751 }
752 }
753
754 return (1 << VDEV_RAIDZ_P);
755 }
756
757 static int
758 vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
759 {
760 uint64_t *dst, *src, xcount, ccount, count, mask, i;
761 uint8_t *b;
762 int x = tgts[0];
763 int c, j, exp;
764
765 ASSERT(ntgts == 1);
766
767 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
768 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
769
770 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
771 src = rm->rm_col[c].rc_data;
772 dst = rm->rm_col[x].rc_data;
773
774 if (c == x)
775 ccount = 0;
776 else
777 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
778
779 count = MIN(ccount, xcount);
780
781 if (c == rm->rm_firstdatacol) {
782 for (i = 0; i < count; i++, dst++, src++) {
783 *dst = *src;
784 }
785 for (; i < xcount; i++, dst++) {
786 *dst = 0;
787 }
788
789 } else {
790 for (i = 0; i < count; i++, dst++, src++) {
791 VDEV_RAIDZ_64MUL_2(*dst, mask);
792 *dst ^= *src;
793 }
794
795 for (; i < xcount; i++, dst++) {
796 VDEV_RAIDZ_64MUL_2(*dst, mask);
797 }
798 }
799 }
800
801 src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
802 dst = rm->rm_col[x].rc_data;
803 exp = 255 - (rm->rm_cols - 1 - x);
804
805 for (i = 0; i < xcount; i++, dst++, src++) {
806 *dst ^= *src;
807 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
808 *b = vdev_raidz_exp2(*b, exp);
809 }
810 }
811
812 return (1 << VDEV_RAIDZ_Q);
813 }
814
815 static int
816 vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
817 {
818 uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
819 void *pdata, *qdata;
820 uint64_t xsize, ysize, i;
821 int x = tgts[0];
822 int y = tgts[1];
823
824 ASSERT(ntgts == 2);
825 ASSERT(x < y);
826 ASSERT(x >= rm->rm_firstdatacol);
827 ASSERT(y < rm->rm_cols);
828
829 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
830
831 /*
832 * Move the parity data aside -- we're going to compute parity as
833 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
834 * reuse the parity generation mechanism without trashing the actual
835 * parity so we make those columns appear to be full of zeros by
836 * setting their lengths to zero.
837 */
838 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
839 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
840 xsize = rm->rm_col[x].rc_size;
841 ysize = rm->rm_col[y].rc_size;
842
843 rm->rm_col[VDEV_RAIDZ_P].rc_data =
844 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
845 rm->rm_col[VDEV_RAIDZ_Q].rc_data =
846 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
847 rm->rm_col[x].rc_size = 0;
848 rm->rm_col[y].rc_size = 0;
849
850 vdev_raidz_generate_parity_pq(rm);
851
852 rm->rm_col[x].rc_size = xsize;
853 rm->rm_col[y].rc_size = ysize;
854
855 p = pdata;
856 q = qdata;
857 pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
858 qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
859 xd = rm->rm_col[x].rc_data;
860 yd = rm->rm_col[y].rc_data;
861
862 /*
863 * We now have:
864 * Pxy = P + D_x + D_y
865 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
866 *
867 * We can then solve for D_x:
868 * D_x = A * (P + Pxy) + B * (Q + Qxy)
869 * where
870 * A = 2^(x - y) * (2^(x - y) + 1)^-1
871 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
872 *
873 * With D_x in hand, we can easily solve for D_y:
874 * D_y = P + Pxy + D_x
875 */
876
877 a = vdev_raidz_pow2[255 + x - y];
878 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
879 tmp = 255 - vdev_raidz_log2[a ^ 1];
880
881 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
882 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
883
884 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
885 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
886 vdev_raidz_exp2(*q ^ *qxy, bexp);
887
888 if (i < ysize)
889 *yd = *p ^ *pxy ^ *xd;
890 }
891
892 zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
893 rm->rm_col[VDEV_RAIDZ_P].rc_size);
894 zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
895 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
896
897 /*
898 * Restore the saved parity data.
899 */
900 rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
901 rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
902
903 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
904 }
905
906 /* BEGIN CSTYLED */
907 /*
908 * In the general case of reconstruction, we must solve the system of linear
909 * equations defined by the coeffecients used to generate parity as well as
910 * the contents of the data and parity disks. This can be expressed with
911 * vectors for the original data (D) and the actual data (d) and parity (p)
912 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
913 *
914 * __ __ __ __
915 * | | __ __ | p_0 |
916 * | V | | D_0 | | p_m-1 |
917 * | | x | : | = | d_0 |
918 * | I | | D_n-1 | | : |
919 * | | ~~ ~~ | d_n-1 |
920 * ~~ ~~ ~~ ~~
921 *
922 * I is simply a square identity matrix of size n, and V is a vandermonde
923 * matrix defined by the coeffecients we chose for the various parity columns
924 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
925 * computation as well as linear separability.
926 *
927 * __ __ __ __
928 * | 1 .. 1 1 1 | | p_0 |
929 * | 2^n-1 .. 4 2 1 | __ __ | : |
930 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
931 * | 1 .. 0 0 0 | | D_1 | | d_0 |
932 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
933 * | : : : : | | : | | d_2 |
934 * | 0 .. 1 0 0 | | D_n-1 | | : |
935 * | 0 .. 0 1 0 | ~~ ~~ | : |
936 * | 0 .. 0 0 1 | | d_n-1 |
937 * ~~ ~~ ~~ ~~
938 *
939 * Note that I, V, d, and p are known. To compute D, we must invert the
940 * matrix and use the known data and parity values to reconstruct the unknown
941 * data values. We begin by removing the rows in V|I and d|p that correspond
942 * to failed or missing columns; we then make V|I square (n x n) and d|p
943 * sized n by removing rows corresponding to unused parity from the bottom up
944 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
945 * using Gauss-Jordan elimination. In the example below we use m=3 parity
946 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
947 * __ __
948 * | 1 1 1 1 1 1 1 1 |
949 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
950 * | 19 205 116 29 64 16 4 1 | / /
951 * | 1 0 0 0 0 0 0 0 | / /
952 * | 0 1 0 0 0 0 0 0 | <--' /
953 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
954 * | 0 0 0 1 0 0 0 0 |
955 * | 0 0 0 0 1 0 0 0 |
956 * | 0 0 0 0 0 1 0 0 |
957 * | 0 0 0 0 0 0 1 0 |
958 * | 0 0 0 0 0 0 0 1 |
959 * ~~ ~~
960 * __ __
961 * | 1 1 1 1 1 1 1 1 |
962 * | 128 64 32 16 8 4 2 1 |
963 * | 19 205 116 29 64 16 4 1 |
964 * | 1 0 0 0 0 0 0 0 |
965 * | 0 1 0 0 0 0 0 0 |
966 * (V|I)' = | 0 0 1 0 0 0 0 0 |
967 * | 0 0 0 1 0 0 0 0 |
968 * | 0 0 0 0 1 0 0 0 |
969 * | 0 0 0 0 0 1 0 0 |
970 * | 0 0 0 0 0 0 1 0 |
971 * | 0 0 0 0 0 0 0 1 |
972 * ~~ ~~
973 *
974 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
975 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
976 * matrix is not singular.
977 * __ __
978 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
979 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
980 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
981 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
982 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
983 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
984 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
985 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
986 * ~~ ~~
987 * __ __
988 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
989 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
990 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
991 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
992 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
993 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
994 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
995 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
996 * ~~ ~~
997 * __ __
998 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
999 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1000 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1001 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1002 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1003 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1004 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1005 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1006 * ~~ ~~
1007 * __ __
1008 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1009 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1010 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1011 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1012 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1013 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1014 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1015 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1016 * ~~ ~~
1017 * __ __
1018 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1019 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1020 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1021 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1022 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1023 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1024 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1025 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1026 * ~~ ~~
1027 * __ __
1028 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1029 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1030 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1031 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1032 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1033 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1034 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1035 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1036 * ~~ ~~
1037 * __ __
1038 * | 0 0 1 0 0 0 0 0 |
1039 * | 167 100 5 41 159 169 217 208 |
1040 * | 166 100 4 40 158 168 216 209 |
1041 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1042 * | 0 0 0 0 1 0 0 0 |
1043 * | 0 0 0 0 0 1 0 0 |
1044 * | 0 0 0 0 0 0 1 0 |
1045 * | 0 0 0 0 0 0 0 1 |
1046 * ~~ ~~
1047 *
1048 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1049 * of the missing data.
1050 *
1051 * As is apparent from the example above, the only non-trivial rows in the
1052 * inverse matrix correspond to the data disks that we're trying to
1053 * reconstruct. Indeed, those are the only rows we need as the others would
1054 * only be useful for reconstructing data known or assumed to be valid. For
1055 * that reason, we only build the coefficients in the rows that correspond to
1056 * targeted columns.
1057 */
1058 /* END CSTYLED */
1059
1060 static void
1061 vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1062 uint8_t **rows)
1063 {
1064 int i, j;
1065 int pow;
1066
1067 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1068
1069 /*
1070 * Fill in the missing rows of interest.
1071 */
1072 for (i = 0; i < nmap; i++) {
1073 ASSERT3S(0, <=, map[i]);
1074 ASSERT3S(map[i], <=, 2);
1075
1076 pow = map[i] * n;
1077 if (pow > 255)
1078 pow -= 255;
1079 ASSERT(pow <= 255);
1080
1081 for (j = 0; j < n; j++) {
1082 pow -= map[i];
1083 if (pow < 0)
1084 pow += 255;
1085 rows[i][j] = vdev_raidz_pow2[pow];
1086 }
1087 }
1088 }
1089
1090 static void
1091 vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1092 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1093 {
1094 int i, j, ii, jj;
1095 uint8_t log;
1096
1097 /*
1098 * Assert that the first nmissing entries from the array of used
1099 * columns correspond to parity columns and that subsequent entries
1100 * correspond to data columns.
1101 */
1102 for (i = 0; i < nmissing; i++) {
1103 ASSERT3S(used[i], <, rm->rm_firstdatacol);
1104 }
1105 for (; i < n; i++) {
1106 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1107 }
1108
1109 /*
1110 * First initialize the storage where we'll compute the inverse rows.
1111 */
1112 for (i = 0; i < nmissing; i++) {
1113 for (j = 0; j < n; j++) {
1114 invrows[i][j] = (i == j) ? 1 : 0;
1115 }
1116 }
1117
1118 /*
1119 * Subtract all trivial rows from the rows of consequence.
1120 */
1121 for (i = 0; i < nmissing; i++) {
1122 for (j = nmissing; j < n; j++) {
1123 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1124 jj = used[j] - rm->rm_firstdatacol;
1125 ASSERT3S(jj, <, n);
1126 invrows[i][j] = rows[i][jj];
1127 rows[i][jj] = 0;
1128 }
1129 }
1130
1131 /*
1132 * For each of the rows of interest, we must normalize it and subtract
1133 * a multiple of it from the other rows.
1134 */
1135 for (i = 0; i < nmissing; i++) {
1136 for (j = 0; j < missing[i]; j++) {
1137 ASSERT0(rows[i][j]);
1138 }
1139 ASSERT3U(rows[i][missing[i]], !=, 0);
1140
1141 /*
1142 * Compute the inverse of the first element and multiply each
1143 * element in the row by that value.
1144 */
1145 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1146
1147 for (j = 0; j < n; j++) {
1148 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1149 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1150 }
1151
1152 for (ii = 0; ii < nmissing; ii++) {
1153 if (i == ii)
1154 continue;
1155
1156 ASSERT3U(rows[ii][missing[i]], !=, 0);
1157
1158 log = vdev_raidz_log2[rows[ii][missing[i]]];
1159
1160 for (j = 0; j < n; j++) {
1161 rows[ii][j] ^=
1162 vdev_raidz_exp2(rows[i][j], log);
1163 invrows[ii][j] ^=
1164 vdev_raidz_exp2(invrows[i][j], log);
1165 }
1166 }
1167 }
1168
1169 /*
1170 * Verify that the data that is left in the rows are properly part of
1171 * an identity matrix.
1172 */
1173 for (i = 0; i < nmissing; i++) {
1174 for (j = 0; j < n; j++) {
1175 if (j == missing[i]) {
1176 ASSERT3U(rows[i][j], ==, 1);
1177 } else {
1178 ASSERT0(rows[i][j]);
1179 }
1180 }
1181 }
1182 }
1183
1184 static void
1185 vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1186 int *missing, uint8_t **invrows, const uint8_t *used)
1187 {
1188 int i, j, x, cc, c;
1189 uint8_t *src;
1190 uint64_t ccount;
1191 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1192 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1193 uint8_t log = 0;
1194 uint8_t val;
1195 int ll;
1196 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1197 uint8_t *p, *pp;
1198 size_t psize;
1199
1200 psize = sizeof (invlog[0][0]) * n * nmissing;
1201 p = kmem_alloc(psize, KM_PUSHPAGE);
1202
1203 for (pp = p, i = 0; i < nmissing; i++) {
1204 invlog[i] = pp;
1205 pp += n;
1206 }
1207
1208 for (i = 0; i < nmissing; i++) {
1209 for (j = 0; j < n; j++) {
1210 ASSERT3U(invrows[i][j], !=, 0);
1211 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1212 }
1213 }
1214
1215 for (i = 0; i < n; i++) {
1216 c = used[i];
1217 ASSERT3U(c, <, rm->rm_cols);
1218
1219 src = rm->rm_col[c].rc_data;
1220 ccount = rm->rm_col[c].rc_size;
1221 for (j = 0; j < nmissing; j++) {
1222 cc = missing[j] + rm->rm_firstdatacol;
1223 ASSERT3U(cc, >=, rm->rm_firstdatacol);
1224 ASSERT3U(cc, <, rm->rm_cols);
1225 ASSERT3U(cc, !=, c);
1226
1227 dst[j] = rm->rm_col[cc].rc_data;
1228 dcount[j] = rm->rm_col[cc].rc_size;
1229 }
1230
1231 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1232
1233 for (x = 0; x < ccount; x++, src++) {
1234 if (*src != 0)
1235 log = vdev_raidz_log2[*src];
1236
1237 for (cc = 0; cc < nmissing; cc++) {
1238 if (x >= dcount[cc])
1239 continue;
1240
1241 if (*src == 0) {
1242 val = 0;
1243 } else {
1244 if ((ll = log + invlog[cc][i]) >= 255)
1245 ll -= 255;
1246 val = vdev_raidz_pow2[ll];
1247 }
1248
1249 if (i == 0)
1250 dst[cc][x] = val;
1251 else
1252 dst[cc][x] ^= val;
1253 }
1254 }
1255 }
1256
1257 kmem_free(p, psize);
1258 }
1259
1260 static int
1261 vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1262 {
1263 int n, i, c, t, tt;
1264 int nmissing_rows;
1265 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1266 int parity_map[VDEV_RAIDZ_MAXPARITY];
1267
1268 uint8_t *p, *pp;
1269 size_t psize;
1270
1271 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1272 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1273 uint8_t *used;
1274
1275 int code = 0;
1276
1277
1278 n = rm->rm_cols - rm->rm_firstdatacol;
1279
1280 /*
1281 * Figure out which data columns are missing.
1282 */
1283 nmissing_rows = 0;
1284 for (t = 0; t < ntgts; t++) {
1285 if (tgts[t] >= rm->rm_firstdatacol) {
1286 missing_rows[nmissing_rows++] =
1287 tgts[t] - rm->rm_firstdatacol;
1288 }
1289 }
1290
1291 /*
1292 * Figure out which parity columns to use to help generate the missing
1293 * data columns.
1294 */
1295 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1296 ASSERT(tt < ntgts);
1297 ASSERT(c < rm->rm_firstdatacol);
1298
1299 /*
1300 * Skip any targeted parity columns.
1301 */
1302 if (c == tgts[tt]) {
1303 tt++;
1304 continue;
1305 }
1306
1307 code |= 1 << c;
1308
1309 parity_map[i] = c;
1310 i++;
1311 }
1312
1313 ASSERT(code != 0);
1314 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1315
1316 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1317 nmissing_rows * n + sizeof (used[0]) * n;
1318 p = kmem_alloc(psize, KM_PUSHPAGE);
1319
1320 for (pp = p, i = 0; i < nmissing_rows; i++) {
1321 rows[i] = pp;
1322 pp += n;
1323 invrows[i] = pp;
1324 pp += n;
1325 }
1326 used = pp;
1327
1328 for (i = 0; i < nmissing_rows; i++) {
1329 used[i] = parity_map[i];
1330 }
1331
1332 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1333 if (tt < nmissing_rows &&
1334 c == missing_rows[tt] + rm->rm_firstdatacol) {
1335 tt++;
1336 continue;
1337 }
1338
1339 ASSERT3S(i, <, n);
1340 used[i] = c;
1341 i++;
1342 }
1343
1344 /*
1345 * Initialize the interesting rows of the matrix.
1346 */
1347 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1348
1349 /*
1350 * Invert the matrix.
1351 */
1352 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1353 invrows, used);
1354
1355 /*
1356 * Reconstruct the missing data using the generated matrix.
1357 */
1358 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1359 invrows, used);
1360
1361 kmem_free(p, psize);
1362
1363 return (code);
1364 }
1365
1366 static int
1367 vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1368 {
1369 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1370 int ntgts;
1371 int i, c;
1372 int code;
1373 int nbadparity, nbaddata;
1374 int parity_valid[VDEV_RAIDZ_MAXPARITY];
1375
1376 /*
1377 * The tgts list must already be sorted.
1378 */
1379 for (i = 1; i < nt; i++) {
1380 ASSERT(t[i] > t[i - 1]);
1381 }
1382
1383 nbadparity = rm->rm_firstdatacol;
1384 nbaddata = rm->rm_cols - nbadparity;
1385 ntgts = 0;
1386 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1387 if (c < rm->rm_firstdatacol)
1388 parity_valid[c] = B_FALSE;
1389
1390 if (i < nt && c == t[i]) {
1391 tgts[ntgts++] = c;
1392 i++;
1393 } else if (rm->rm_col[c].rc_error != 0) {
1394 tgts[ntgts++] = c;
1395 } else if (c >= rm->rm_firstdatacol) {
1396 nbaddata--;
1397 } else {
1398 parity_valid[c] = B_TRUE;
1399 nbadparity--;
1400 }
1401 }
1402
1403 ASSERT(ntgts >= nt);
1404 ASSERT(nbaddata >= 0);
1405 ASSERT(nbaddata + nbadparity == ntgts);
1406
1407 dt = &tgts[nbadparity];
1408
1409 /*
1410 * See if we can use any of our optimized reconstruction routines.
1411 */
1412 if (!vdev_raidz_default_to_general) {
1413 switch (nbaddata) {
1414 case 1:
1415 if (parity_valid[VDEV_RAIDZ_P])
1416 return (vdev_raidz_reconstruct_p(rm, dt, 1));
1417
1418 ASSERT(rm->rm_firstdatacol > 1);
1419
1420 if (parity_valid[VDEV_RAIDZ_Q])
1421 return (vdev_raidz_reconstruct_q(rm, dt, 1));
1422
1423 ASSERT(rm->rm_firstdatacol > 2);
1424 break;
1425
1426 case 2:
1427 ASSERT(rm->rm_firstdatacol > 1);
1428
1429 if (parity_valid[VDEV_RAIDZ_P] &&
1430 parity_valid[VDEV_RAIDZ_Q])
1431 return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1432
1433 ASSERT(rm->rm_firstdatacol > 2);
1434
1435 break;
1436 }
1437 }
1438
1439 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1440 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1441 ASSERT(code > 0);
1442 return (code);
1443 }
1444
1445 static int
1446 vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
1447 uint64_t *ashift)
1448 {
1449 vdev_t *cvd;
1450 uint64_t nparity = vd->vdev_nparity;
1451 int c;
1452 int lasterror = 0;
1453 int numerrors = 0;
1454
1455 ASSERT(nparity > 0);
1456
1457 if (nparity > VDEV_RAIDZ_MAXPARITY ||
1458 vd->vdev_children < nparity + 1) {
1459 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1460 return (SET_ERROR(EINVAL));
1461 }
1462
1463 vdev_open_children(vd);
1464
1465 for (c = 0; c < vd->vdev_children; c++) {
1466 cvd = vd->vdev_child[c];
1467
1468 if (cvd->vdev_open_error != 0) {
1469 lasterror = cvd->vdev_open_error;
1470 numerrors++;
1471 continue;
1472 }
1473
1474 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1475 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
1476 *ashift = MAX(*ashift, cvd->vdev_ashift);
1477 }
1478
1479 *asize *= vd->vdev_children;
1480 *max_asize *= vd->vdev_children;
1481
1482 if (numerrors > nparity) {
1483 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1484 return (lasterror);
1485 }
1486
1487 return (0);
1488 }
1489
1490 static void
1491 vdev_raidz_close(vdev_t *vd)
1492 {
1493 int c;
1494
1495 for (c = 0; c < vd->vdev_children; c++)
1496 vdev_close(vd->vdev_child[c]);
1497 }
1498
1499 static uint64_t
1500 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1501 {
1502 uint64_t asize;
1503 uint64_t ashift = vd->vdev_top->vdev_ashift;
1504 uint64_t cols = vd->vdev_children;
1505 uint64_t nparity = vd->vdev_nparity;
1506
1507 asize = ((psize - 1) >> ashift) + 1;
1508 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1509 asize = roundup(asize, nparity + 1) << ashift;
1510
1511 return (asize);
1512 }
1513
1514 static void
1515 vdev_raidz_child_done(zio_t *zio)
1516 {
1517 raidz_col_t *rc = zio->io_private;
1518
1519 rc->rc_error = zio->io_error;
1520 rc->rc_tried = 1;
1521 rc->rc_skipped = 0;
1522 }
1523
1524 static int
1525 vdev_raidz_io_start(zio_t *zio)
1526 {
1527 vdev_t *vd = zio->io_vd;
1528 vdev_t *tvd = vd->vdev_top;
1529 vdev_t *cvd;
1530 raidz_map_t *rm;
1531 raidz_col_t *rc;
1532 int c, i;
1533
1534 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1535 vd->vdev_nparity);
1536
1537 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1538
1539 if (zio->io_type == ZIO_TYPE_WRITE) {
1540 vdev_raidz_generate_parity(rm);
1541
1542 for (c = 0; c < rm->rm_cols; c++) {
1543 rc = &rm->rm_col[c];
1544 cvd = vd->vdev_child[rc->rc_devidx];
1545 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1546 rc->rc_offset, rc->rc_data, rc->rc_size,
1547 zio->io_type, zio->io_priority, 0,
1548 vdev_raidz_child_done, rc));
1549 }
1550
1551 /*
1552 * Generate optional I/Os for any skipped sectors to improve
1553 * aggregation contiguity.
1554 */
1555 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
1556 ASSERT(c <= rm->rm_scols);
1557 if (c == rm->rm_scols)
1558 c = 0;
1559 rc = &rm->rm_col[c];
1560 cvd = vd->vdev_child[rc->rc_devidx];
1561 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1562 rc->rc_offset + rc->rc_size, NULL,
1563 1 << tvd->vdev_ashift,
1564 zio->io_type, zio->io_priority,
1565 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1566 }
1567
1568 return (ZIO_PIPELINE_CONTINUE);
1569 }
1570
1571 ASSERT(zio->io_type == ZIO_TYPE_READ);
1572
1573 /*
1574 * Iterate over the columns in reverse order so that we hit the parity
1575 * last -- any errors along the way will force us to read the parity.
1576 */
1577 for (c = rm->rm_cols - 1; c >= 0; c--) {
1578 rc = &rm->rm_col[c];
1579 cvd = vd->vdev_child[rc->rc_devidx];
1580 if (!vdev_readable(cvd)) {
1581 if (c >= rm->rm_firstdatacol)
1582 rm->rm_missingdata++;
1583 else
1584 rm->rm_missingparity++;
1585 rc->rc_error = SET_ERROR(ENXIO);
1586 rc->rc_tried = 1; /* don't even try */
1587 rc->rc_skipped = 1;
1588 continue;
1589 }
1590 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
1591 if (c >= rm->rm_firstdatacol)
1592 rm->rm_missingdata++;
1593 else
1594 rm->rm_missingparity++;
1595 rc->rc_error = SET_ERROR(ESTALE);
1596 rc->rc_skipped = 1;
1597 continue;
1598 }
1599 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
1600 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1601 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1602 rc->rc_offset, rc->rc_data, rc->rc_size,
1603 zio->io_type, zio->io_priority, 0,
1604 vdev_raidz_child_done, rc));
1605 }
1606 }
1607
1608 return (ZIO_PIPELINE_CONTINUE);
1609 }
1610
1611
1612 /*
1613 * Report a checksum error for a child of a RAID-Z device.
1614 */
1615 static void
1616 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
1617 {
1618 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
1619
1620 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
1621 zio_bad_cksum_t zbc;
1622 raidz_map_t *rm = zio->io_vsd;
1623
1624 mutex_enter(&vd->vdev_stat_lock);
1625 vd->vdev_stat.vs_checksum_errors++;
1626 mutex_exit(&vd->vdev_stat_lock);
1627
1628 zbc.zbc_has_cksum = 0;
1629 zbc.zbc_injected = rm->rm_ecksuminjected;
1630
1631 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1632 rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1633 &zbc);
1634 }
1635 }
1636
1637 /*
1638 * We keep track of whether or not there were any injected errors, so that
1639 * any ereports we generate can note it.
1640 */
1641 static int
1642 raidz_checksum_verify(zio_t *zio)
1643 {
1644 zio_bad_cksum_t zbc;
1645 raidz_map_t *rm = zio->io_vsd;
1646 int ret;
1647
1648 bzero(&zbc, sizeof (zio_bad_cksum_t));
1649
1650 ret = zio_checksum_error(zio, &zbc);
1651 if (ret != 0 && zbc.zbc_injected != 0)
1652 rm->rm_ecksuminjected = 1;
1653
1654 return (ret);
1655 }
1656
1657 /*
1658 * Generate the parity from the data columns. If we tried and were able to
1659 * read the parity without error, verify that the generated parity matches the
1660 * data we read. If it doesn't, we fire off a checksum error. Return the
1661 * number such failures.
1662 */
1663 static int
1664 raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1665 {
1666 void *orig[VDEV_RAIDZ_MAXPARITY];
1667 int c, ret = 0;
1668 raidz_col_t *rc;
1669
1670 for (c = 0; c < rm->rm_firstdatacol; c++) {
1671 rc = &rm->rm_col[c];
1672 if (!rc->rc_tried || rc->rc_error != 0)
1673 continue;
1674 orig[c] = zio_buf_alloc(rc->rc_size);
1675 bcopy(rc->rc_data, orig[c], rc->rc_size);
1676 }
1677
1678 vdev_raidz_generate_parity(rm);
1679
1680 for (c = 0; c < rm->rm_firstdatacol; c++) {
1681 rc = &rm->rm_col[c];
1682 if (!rc->rc_tried || rc->rc_error != 0)
1683 continue;
1684 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
1685 raidz_checksum_error(zio, rc, orig[c]);
1686 rc->rc_error = SET_ERROR(ECKSUM);
1687 ret++;
1688 }
1689 zio_buf_free(orig[c], rc->rc_size);
1690 }
1691
1692 return (ret);
1693 }
1694
1695 /*
1696 * Keep statistics on all the ways that we used parity to correct data.
1697 */
1698 static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
1699
1700 static int
1701 vdev_raidz_worst_error(raidz_map_t *rm)
1702 {
1703 int c, error = 0;
1704
1705 for (c = 0; c < rm->rm_cols; c++)
1706 error = zio_worst_error(error, rm->rm_col[c].rc_error);
1707
1708 return (error);
1709 }
1710
1711 /*
1712 * Iterate over all combinations of bad data and attempt a reconstruction.
1713 * Note that the algorithm below is non-optimal because it doesn't take into
1714 * account how reconstruction is actually performed. For example, with
1715 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1716 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1717 * cases we'd only use parity information in column 0.
1718 */
1719 static int
1720 vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1721 {
1722 raidz_map_t *rm = zio->io_vsd;
1723 raidz_col_t *rc;
1724 void *orig[VDEV_RAIDZ_MAXPARITY];
1725 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1726 int *tgts = &tstore[1];
1727 int curr, next, i, c, n;
1728 int code, ret = 0;
1729
1730 ASSERT(total_errors < rm->rm_firstdatacol);
1731
1732 /*
1733 * This simplifies one edge condition.
1734 */
1735 tgts[-1] = -1;
1736
1737 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1738 /*
1739 * Initialize the targets array by finding the first n columns
1740 * that contain no error.
1741 *
1742 * If there were no data errors, we need to ensure that we're
1743 * always explicitly attempting to reconstruct at least one
1744 * data column. To do this, we simply push the highest target
1745 * up into the data columns.
1746 */
1747 for (c = 0, i = 0; i < n; i++) {
1748 if (i == n - 1 && data_errors == 0 &&
1749 c < rm->rm_firstdatacol) {
1750 c = rm->rm_firstdatacol;
1751 }
1752
1753 while (rm->rm_col[c].rc_error != 0) {
1754 c++;
1755 ASSERT3S(c, <, rm->rm_cols);
1756 }
1757
1758 tgts[i] = c++;
1759 }
1760
1761 /*
1762 * Setting tgts[n] simplifies the other edge condition.
1763 */
1764 tgts[n] = rm->rm_cols;
1765
1766 /*
1767 * These buffers were allocated in previous iterations.
1768 */
1769 for (i = 0; i < n - 1; i++) {
1770 ASSERT(orig[i] != NULL);
1771 }
1772
1773 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1774
1775 curr = 0;
1776 next = tgts[curr];
1777
1778 while (curr != n) {
1779 tgts[curr] = next;
1780 curr = 0;
1781
1782 /*
1783 * Save off the original data that we're going to
1784 * attempt to reconstruct.
1785 */
1786 for (i = 0; i < n; i++) {
1787 ASSERT(orig[i] != NULL);
1788 c = tgts[i];
1789 ASSERT3S(c, >=, 0);
1790 ASSERT3S(c, <, rm->rm_cols);
1791 rc = &rm->rm_col[c];
1792 bcopy(rc->rc_data, orig[i], rc->rc_size);
1793 }
1794
1795 /*
1796 * Attempt a reconstruction and exit the outer loop on
1797 * success.
1798 */
1799 code = vdev_raidz_reconstruct(rm, tgts, n);
1800 if (raidz_checksum_verify(zio) == 0) {
1801 atomic_inc_64(&raidz_corrected[code]);
1802
1803 for (i = 0; i < n; i++) {
1804 c = tgts[i];
1805 rc = &rm->rm_col[c];
1806 ASSERT(rc->rc_error == 0);
1807 if (rc->rc_tried)
1808 raidz_checksum_error(zio, rc,
1809 orig[i]);
1810 rc->rc_error = SET_ERROR(ECKSUM);
1811 }
1812
1813 ret = code;
1814 goto done;
1815 }
1816
1817 /*
1818 * Restore the original data.
1819 */
1820 for (i = 0; i < n; i++) {
1821 c = tgts[i];
1822 rc = &rm->rm_col[c];
1823 bcopy(orig[i], rc->rc_data, rc->rc_size);
1824 }
1825
1826 do {
1827 /*
1828 * Find the next valid column after the curr
1829 * position..
1830 */
1831 for (next = tgts[curr] + 1;
1832 next < rm->rm_cols &&
1833 rm->rm_col[next].rc_error != 0; next++)
1834 continue;
1835
1836 ASSERT(next <= tgts[curr + 1]);
1837
1838 /*
1839 * If that spot is available, we're done here.
1840 */
1841 if (next != tgts[curr + 1])
1842 break;
1843
1844 /*
1845 * Otherwise, find the next valid column after
1846 * the previous position.
1847 */
1848 for (c = tgts[curr - 1] + 1;
1849 rm->rm_col[c].rc_error != 0; c++)
1850 continue;
1851
1852 tgts[curr] = c;
1853 curr++;
1854
1855 } while (curr != n);
1856 }
1857 }
1858 n--;
1859 done:
1860 for (i = 0; i < n; i++) {
1861 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
1862 }
1863
1864 return (ret);
1865 }
1866
1867 static void
1868 vdev_raidz_io_done(zio_t *zio)
1869 {
1870 vdev_t *vd = zio->io_vd;
1871 vdev_t *cvd;
1872 raidz_map_t *rm = zio->io_vsd;
1873 raidz_col_t *rc = NULL;
1874 int unexpected_errors = 0;
1875 int parity_errors = 0;
1876 int parity_untried = 0;
1877 int data_errors = 0;
1878 int total_errors = 0;
1879 int n, c;
1880 int tgts[VDEV_RAIDZ_MAXPARITY];
1881 int code;
1882
1883 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
1884
1885 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1886 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1887
1888 for (c = 0; c < rm->rm_cols; c++) {
1889 rc = &rm->rm_col[c];
1890
1891 if (rc->rc_error) {
1892 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
1893
1894 if (c < rm->rm_firstdatacol)
1895 parity_errors++;
1896 else
1897 data_errors++;
1898
1899 if (!rc->rc_skipped)
1900 unexpected_errors++;
1901
1902 total_errors++;
1903 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1904 parity_untried++;
1905 }
1906 }
1907
1908 if (zio->io_type == ZIO_TYPE_WRITE) {
1909 /*
1910 * XXX -- for now, treat partial writes as a success.
1911 * (If we couldn't write enough columns to reconstruct
1912 * the data, the I/O failed. Otherwise, good enough.)
1913 *
1914 * Now that we support write reallocation, it would be better
1915 * to treat partial failure as real failure unless there are
1916 * no non-degraded top-level vdevs left, and not update DTLs
1917 * if we intend to reallocate.
1918 */
1919 /* XXPOLICY */
1920 if (total_errors > rm->rm_firstdatacol)
1921 zio->io_error = vdev_raidz_worst_error(rm);
1922
1923 return;
1924 }
1925
1926 ASSERT(zio->io_type == ZIO_TYPE_READ);
1927 /*
1928 * There are three potential phases for a read:
1929 * 1. produce valid data from the columns read
1930 * 2. read all disks and try again
1931 * 3. perform combinatorial reconstruction
1932 *
1933 * Each phase is progressively both more expensive and less likely to
1934 * occur. If we encounter more errors than we can repair or all phases
1935 * fail, we have no choice but to return an error.
1936 */
1937
1938 /*
1939 * If the number of errors we saw was correctable -- less than or equal
1940 * to the number of parity disks read -- attempt to produce data that
1941 * has a valid checksum. Naturally, this case applies in the absence of
1942 * any errors.
1943 */
1944 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
1945 if (data_errors == 0) {
1946 if (raidz_checksum_verify(zio) == 0) {
1947 /*
1948 * If we read parity information (unnecessarily
1949 * as it happens since no reconstruction was
1950 * needed) regenerate and verify the parity.
1951 * We also regenerate parity when resilvering
1952 * so we can write it out to the failed device
1953 * later.
1954 */
1955 if (parity_errors + parity_untried <
1956 rm->rm_firstdatacol ||
1957 (zio->io_flags & ZIO_FLAG_RESILVER)) {
1958 n = raidz_parity_verify(zio, rm);
1959 unexpected_errors += n;
1960 ASSERT(parity_errors + n <=
1961 rm->rm_firstdatacol);
1962 }
1963 goto done;
1964 }
1965 } else {
1966 /*
1967 * We either attempt to read all the parity columns or
1968 * none of them. If we didn't try to read parity, we
1969 * wouldn't be here in the correctable case. There must
1970 * also have been fewer parity errors than parity
1971 * columns or, again, we wouldn't be in this code path.
1972 */
1973 ASSERT(parity_untried == 0);
1974 ASSERT(parity_errors < rm->rm_firstdatacol);
1975
1976 /*
1977 * Identify the data columns that reported an error.
1978 */
1979 n = 0;
1980 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1981 rc = &rm->rm_col[c];
1982 if (rc->rc_error != 0) {
1983 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1984 tgts[n++] = c;
1985 }
1986 }
1987
1988 ASSERT(rm->rm_firstdatacol >= n);
1989
1990 code = vdev_raidz_reconstruct(rm, tgts, n);
1991
1992 if (raidz_checksum_verify(zio) == 0) {
1993 atomic_inc_64(&raidz_corrected[code]);
1994
1995 /*
1996 * If we read more parity disks than were used
1997 * for reconstruction, confirm that the other
1998 * parity disks produced correct data. This
1999 * routine is suboptimal in that it regenerates
2000 * the parity that we already used in addition
2001 * to the parity that we're attempting to
2002 * verify, but this should be a relatively
2003 * uncommon case, and can be optimized if it
2004 * becomes a problem. Note that we regenerate
2005 * parity when resilvering so we can write it
2006 * out to failed devices later.
2007 */
2008 if (parity_errors < rm->rm_firstdatacol - n ||
2009 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2010 n = raidz_parity_verify(zio, rm);
2011 unexpected_errors += n;
2012 ASSERT(parity_errors + n <=
2013 rm->rm_firstdatacol);
2014 }
2015
2016 goto done;
2017 }
2018 }
2019 }
2020
2021 /*
2022 * This isn't a typical situation -- either we got a read error or
2023 * a child silently returned bad data. Read every block so we can
2024 * try again with as much data and parity as we can track down. If
2025 * we've already been through once before, all children will be marked
2026 * as tried so we'll proceed to combinatorial reconstruction.
2027 */
2028 unexpected_errors = 1;
2029 rm->rm_missingdata = 0;
2030 rm->rm_missingparity = 0;
2031
2032 for (c = 0; c < rm->rm_cols; c++) {
2033 if (rm->rm_col[c].rc_tried)
2034 continue;
2035
2036 zio_vdev_io_redone(zio);
2037 do {
2038 rc = &rm->rm_col[c];
2039 if (rc->rc_tried)
2040 continue;
2041 zio_nowait(zio_vdev_child_io(zio, NULL,
2042 vd->vdev_child[rc->rc_devidx],
2043 rc->rc_offset, rc->rc_data, rc->rc_size,
2044 zio->io_type, zio->io_priority, 0,
2045 vdev_raidz_child_done, rc));
2046 } while (++c < rm->rm_cols);
2047
2048 return;
2049 }
2050
2051 /*
2052 * At this point we've attempted to reconstruct the data given the
2053 * errors we detected, and we've attempted to read all columns. There
2054 * must, therefore, be one or more additional problems -- silent errors
2055 * resulting in invalid data rather than explicit I/O errors resulting
2056 * in absent data. We check if there is enough additional data to
2057 * possibly reconstruct the data and then perform combinatorial
2058 * reconstruction over all possible combinations. If that fails,
2059 * we're cooked.
2060 */
2061 if (total_errors > rm->rm_firstdatacol) {
2062 zio->io_error = vdev_raidz_worst_error(rm);
2063
2064 } else if (total_errors < rm->rm_firstdatacol &&
2065 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
2066 /*
2067 * If we didn't use all the available parity for the
2068 * combinatorial reconstruction, verify that the remaining
2069 * parity is correct.
2070 */
2071 if (code != (1 << rm->rm_firstdatacol) - 1)
2072 (void) raidz_parity_verify(zio, rm);
2073 } else {
2074 /*
2075 * We're here because either:
2076 *
2077 * total_errors == rm_first_datacol, or
2078 * vdev_raidz_combrec() failed
2079 *
2080 * In either case, there is enough bad data to prevent
2081 * reconstruction.
2082 *
2083 * Start checksum ereports for all children which haven't
2084 * failed, and the IO wasn't speculative.
2085 */
2086 zio->io_error = SET_ERROR(ECKSUM);
2087
2088 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2089 for (c = 0; c < rm->rm_cols; c++) {
2090 rc = &rm->rm_col[c];
2091 if (rc->rc_error == 0) {
2092 zio_bad_cksum_t zbc;
2093 zbc.zbc_has_cksum = 0;
2094 zbc.zbc_injected =
2095 rm->rm_ecksuminjected;
2096
2097 zfs_ereport_start_checksum(
2098 zio->io_spa,
2099 vd->vdev_child[rc->rc_devidx],
2100 zio, rc->rc_offset, rc->rc_size,
2101 (void *)(uintptr_t)c, &zbc);
2102 }
2103 }
2104 }
2105 }
2106
2107 done:
2108 zio_checksum_verified(zio);
2109
2110 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2111 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2112 /*
2113 * Use the good data we have in hand to repair damaged children.
2114 */
2115 for (c = 0; c < rm->rm_cols; c++) {
2116 rc = &rm->rm_col[c];
2117 cvd = vd->vdev_child[rc->rc_devidx];
2118
2119 if (rc->rc_error == 0)
2120 continue;
2121
2122 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2123 rc->rc_offset, rc->rc_data, rc->rc_size,
2124 ZIO_TYPE_WRITE, zio->io_priority,
2125 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2126 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2127 }
2128 }
2129 }
2130
2131 static void
2132 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2133 {
2134 if (faulted > vd->vdev_nparity)
2135 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2136 VDEV_AUX_NO_REPLICAS);
2137 else if (degraded + faulted != 0)
2138 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2139 else
2140 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2141 }
2142
2143 vdev_ops_t vdev_raidz_ops = {
2144 vdev_raidz_open,
2145 vdev_raidz_close,
2146 vdev_raidz_asize,
2147 vdev_raidz_io_start,
2148 vdev_raidz_io_done,
2149 vdev_raidz_state_change,
2150 NULL,
2151 NULL,
2152 VDEV_TYPE_RAIDZ, /* name of this vdev type */
2153 B_FALSE /* not a leaf vdev */
2154 };