]> git.proxmox.com Git - mirror_zfs.git/blame - module/zfs/vdev_raidz.c
Fix gcc c90 compliance warnings
[mirror_zfs.git] / module / zfs / vdev_raidz.c
CommitLineData
34dc7c2f
BB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
428870ff 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
34dc7c2f
BB
24 */
25
34dc7c2f
BB
26#include <sys/zfs_context.h>
27#include <sys/spa.h>
28#include <sys/vdev_impl.h>
29#include <sys/zio.h>
30#include <sys/zio_checksum.h>
31#include <sys/fs/zfs.h>
32#include <sys/fm/fs/zfs.h>
33
34/*
35 * Virtual device vector for RAID-Z.
36 *
45d1cae3
BB
37 * This vdev supports single, double, and triple parity. For single parity,
38 * we use a simple XOR of all the data columns. For double or triple parity,
39 * we use a special case of Reed-Solomon coding. This extends the
40 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
41 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
42 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
43 * former is also based. The latter is designed to provide higher performance
44 * for writes.
45 *
46 * Note that the Plank paper claimed to support arbitrary N+M, but was then
47 * amended six years later identifying a critical flaw that invalidates its
48 * claims. Nevertheless, the technique can be adapted to work for up to
49 * triple parity. For additional parity, the amendment "Note: Correction to
50 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
51 * is viable, but the additional complexity means that write performance will
52 * suffer.
53 *
54 * All of the methods above operate on a Galois field, defined over the
55 * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements
56 * can be expressed with a single byte. Briefly, the operations on the
57 * field are defined as follows:
34dc7c2f
BB
58 *
59 * o addition (+) is represented by a bitwise XOR
60 * o subtraction (-) is therefore identical to addition: A + B = A - B
61 * o multiplication of A by 2 is defined by the following bitwise expression:
62 * (A * 2)_7 = A_6
63 * (A * 2)_6 = A_5
64 * (A * 2)_5 = A_4
65 * (A * 2)_4 = A_3 + A_7
66 * (A * 2)_3 = A_2 + A_7
67 * (A * 2)_2 = A_1 + A_7
68 * (A * 2)_1 = A_0
69 * (A * 2)_0 = A_7
70 *
71 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
45d1cae3
BB
72 * As an aside, this multiplication is derived from the error correcting
73 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
34dc7c2f
BB
74 *
75 * Observe that any number in the field (except for 0) can be expressed as a
76 * power of 2 -- a generator for the field. We store a table of the powers of
77 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
78 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
45d1cae3
BB
79 * than field addition). The inverse of a field element A (A^-1) is therefore
80 * A ^ (255 - 1) = A^254.
34dc7c2f 81 *
45d1cae3
BB
82 * The up-to-three parity columns, P, Q, R over several data columns,
83 * D_0, ... D_n-1, can be expressed by field operations:
34dc7c2f
BB
84 *
85 * P = D_0 + D_1 + ... + D_n-2 + D_n-1
86 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
87 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
45d1cae3
BB
88 * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
89 * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
34dc7c2f 90 *
45d1cae3
BB
91 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival
92 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
93 * independent coefficients. (There are no additional coefficients that have
94 * this property which is why the uncorrected Plank method breaks down.)
95 *
96 * See the reconstruction code below for how P, Q and R can used individually
97 * or in concert to recover missing data columns.
34dc7c2f
BB
98 */
99
100typedef struct raidz_col {
101 uint64_t rc_devidx; /* child device index for I/O */
102 uint64_t rc_offset; /* device offset */
103 uint64_t rc_size; /* I/O size */
104 void *rc_data; /* I/O data */
428870ff 105 void *rc_gdata; /* used to store the "good" version */
34dc7c2f
BB
106 int rc_error; /* I/O error for this device */
107 uint8_t rc_tried; /* Did we attempt this I/O column? */
108 uint8_t rc_skipped; /* Did we skip this I/O column? */
109} raidz_col_t;
110
111typedef struct raidz_map {
45d1cae3
BB
112 uint64_t rm_cols; /* Regular column count */
113 uint64_t rm_scols; /* Count including skipped columns */
34dc7c2f
BB
114 uint64_t rm_bigcols; /* Number of oversized columns */
115 uint64_t rm_asize; /* Actual total I/O size */
116 uint64_t rm_missingdata; /* Count of missing data devices */
117 uint64_t rm_missingparity; /* Count of missing parity devices */
118 uint64_t rm_firstdatacol; /* First data column/parity count */
428870ff
BB
119 uint64_t rm_nskip; /* Skipped sectors for padding */
120 uint64_t rm_skipstart; /* Column index of padding start */
121 void *rm_datacopy; /* rm_asize-buffer of copied data */
122 uintptr_t rm_reports; /* # of referencing checksum reports */
123 uint8_t rm_freed; /* map no longer has referencing ZIO */
124 uint8_t rm_ecksuminjected; /* checksum error was injected */
34dc7c2f
BB
125 raidz_col_t rm_col[1]; /* Flexible array of I/O columns */
126} raidz_map_t;
127
128#define VDEV_RAIDZ_P 0
129#define VDEV_RAIDZ_Q 1
45d1cae3 130#define VDEV_RAIDZ_R 2
45d1cae3
BB
131
132#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
133#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
134
135/*
136 * We provide a mechanism to perform the field multiplication operation on a
137 * 64-bit value all at once rather than a byte at a time. This works by
138 * creating a mask from the top bit in each byte and using that to
139 * conditionally apply the XOR of 0x1d.
140 */
141#define VDEV_RAIDZ_64MUL_2(x, mask) \
142{ \
143 (mask) = (x) & 0x8080808080808080ULL; \
144 (mask) = ((mask) << 1) - ((mask) >> 7); \
145 (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
c5b3a7bb 146 ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
45d1cae3 147}
34dc7c2f 148
45d1cae3
BB
149#define VDEV_RAIDZ_64MUL_4(x, mask) \
150{ \
151 VDEV_RAIDZ_64MUL_2((x), mask); \
152 VDEV_RAIDZ_64MUL_2((x), mask); \
153}
34dc7c2f 154
45d1cae3
BB
155/*
156 * Force reconstruction to use the general purpose method.
157 */
158int vdev_raidz_default_to_general;
34dc7c2f
BB
159
160/*
161 * These two tables represent powers and logs of 2 in the Galois field defined
162 * above. These values were computed by repeatedly multiplying by 2 as above.
163 */
164static const uint8_t vdev_raidz_pow2[256] = {
165 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
166 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
167 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
168 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
169 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
170 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
171 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
172 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
173 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
174 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
175 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
176 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
177 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
178 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
179 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
180 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
181 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
182 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
183 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
184 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
185 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
186 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
187 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
188 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
189 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
190 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
191 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
192 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
193 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
194 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
195 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
196 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
197};
198static const uint8_t vdev_raidz_log2[256] = {
199 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
200 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
201 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
202 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
203 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
204 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
205 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
206 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
207 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
208 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
209 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
210 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
211 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
212 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
213 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
214 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
215 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
216 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
217 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
218 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
219 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
220 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
221 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
222 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
223 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
224 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
225 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
226 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
227 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
228 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
229 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
230 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
231};
232
428870ff
BB
233static void vdev_raidz_generate_parity(raidz_map_t *rm);
234
34dc7c2f
BB
235/*
236 * Multiply a given number by 2 raised to the given power.
237 */
238static uint8_t
239vdev_raidz_exp2(uint_t a, int exp)
240{
241 if (a == 0)
242 return (0);
243
244 ASSERT(exp >= 0);
245 ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
246
247 exp += vdev_raidz_log2[a];
248 if (exp > 255)
249 exp -= 255;
250
251 return (vdev_raidz_pow2[exp]);
252}
253
b128c09f 254static void
428870ff 255vdev_raidz_map_free(raidz_map_t *rm)
b128c09f 256{
b128c09f 257 int c;
428870ff 258 size_t size;
b128c09f 259
428870ff 260 for (c = 0; c < rm->rm_firstdatacol; c++) {
b128c09f
BB
261 zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);
262
428870ff
BB
263 if (rm->rm_col[c].rc_gdata != NULL)
264 zio_buf_free(rm->rm_col[c].rc_gdata,
265 rm->rm_col[c].rc_size);
266 }
267
268 size = 0;
269 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
270 size += rm->rm_col[c].rc_size;
271
272 if (rm->rm_datacopy != NULL)
273 zio_buf_free(rm->rm_datacopy, size);
274
45d1cae3 275 kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
b128c09f
BB
276}
277
428870ff
BB
278static void
279vdev_raidz_map_free_vsd(zio_t *zio)
280{
281 raidz_map_t *rm = zio->io_vsd;
282
283 ASSERT3U(rm->rm_freed, ==, 0);
284 rm->rm_freed = 1;
285
286 if (rm->rm_reports == 0)
287 vdev_raidz_map_free(rm);
288}
289
290/*ARGSUSED*/
291static void
292vdev_raidz_cksum_free(void *arg, size_t ignored)
293{
294 raidz_map_t *rm = arg;
295
296 ASSERT3U(rm->rm_reports, >, 0);
297
298 if (--rm->rm_reports == 0 && rm->rm_freed != 0)
299 vdev_raidz_map_free(rm);
300}
301
302static void
303vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
304{
305 raidz_map_t *rm = zcr->zcr_cbdata;
306 size_t c = zcr->zcr_cbinfo;
307 size_t x;
308
309 const char *good = NULL;
310 const char *bad = rm->rm_col[c].rc_data;
311
312 if (good_data == NULL) {
313 zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
314 return;
315 }
316
317 if (c < rm->rm_firstdatacol) {
318 /*
319 * The first time through, calculate the parity blocks for
320 * the good data (this relies on the fact that the good
321 * data never changes for a given logical ZIO)
322 */
323 if (rm->rm_col[0].rc_gdata == NULL) {
324 char *bad_parity[VDEV_RAIDZ_MAXPARITY];
325 char *buf;
326
327 /*
328 * Set up the rm_col[]s to generate the parity for
329 * good_data, first saving the parity bufs and
330 * replacing them with buffers to hold the result.
331 */
332 for (x = 0; x < rm->rm_firstdatacol; x++) {
333 bad_parity[x] = rm->rm_col[x].rc_data;
334 rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
335 zio_buf_alloc(rm->rm_col[x].rc_size);
336 }
337
338 /* fill in the data columns from good_data */
339 buf = (char *)good_data;
340 for (; x < rm->rm_cols; x++) {
341 rm->rm_col[x].rc_data = buf;
342 buf += rm->rm_col[x].rc_size;
343 }
344
345 /*
346 * Construct the parity from the good data.
347 */
348 vdev_raidz_generate_parity(rm);
349
350 /* restore everything back to its original state */
351 for (x = 0; x < rm->rm_firstdatacol; x++)
352 rm->rm_col[x].rc_data = bad_parity[x];
353
354 buf = rm->rm_datacopy;
355 for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
356 rm->rm_col[x].rc_data = buf;
357 buf += rm->rm_col[x].rc_size;
358 }
359 }
360
361 ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
362 good = rm->rm_col[c].rc_gdata;
363 } else {
364 /* adjust good_data to point at the start of our column */
365 good = good_data;
366
367 for (x = rm->rm_firstdatacol; x < c; x++)
368 good += rm->rm_col[x].rc_size;
369 }
370
371 /* we drop the ereport if it ends up that the data was good */
372 zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
373}
374
375/*
376 * Invoked indirectly by zfs_ereport_start_checksum(), called
377 * below when our read operation fails completely. The main point
378 * is to keep a copy of everything we read from disk, so that at
379 * vdev_raidz_cksum_finish() time we can compare it with the good data.
380 */
381static void
382vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
383{
384 size_t c = (size_t)(uintptr_t)arg;
385 caddr_t buf;
386
387 raidz_map_t *rm = zio->io_vsd;
388 size_t size;
389
390 /* set up the report and bump the refcount */
391 zcr->zcr_cbdata = rm;
392 zcr->zcr_cbinfo = c;
393 zcr->zcr_finish = vdev_raidz_cksum_finish;
394 zcr->zcr_free = vdev_raidz_cksum_free;
395
396 rm->rm_reports++;
397 ASSERT3U(rm->rm_reports, >, 0);
398
399 if (rm->rm_datacopy != NULL)
400 return;
401
402 /*
403 * It's the first time we're called for this raidz_map_t, so we need
404 * to copy the data aside; there's no guarantee that our zio's buffer
405 * won't be re-used for something else.
406 *
407 * Our parity data is already in separate buffers, so there's no need
408 * to copy them.
409 */
410
411 size = 0;
412 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
413 size += rm->rm_col[c].rc_size;
414
415 buf = rm->rm_datacopy = zio_buf_alloc(size);
416
417 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
418 raidz_col_t *col = &rm->rm_col[c];
419
420 bcopy(col->rc_data, buf, col->rc_size);
421 col->rc_data = buf;
422
423 buf += col->rc_size;
424 }
425 ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
426}
427
428static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
429 vdev_raidz_map_free_vsd,
430 vdev_raidz_cksum_report
431};
432
34dc7c2f
BB
433static raidz_map_t *
434vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
435 uint64_t nparity)
436{
437 raidz_map_t *rm;
438 uint64_t b = zio->io_offset >> unit_shift;
439 uint64_t s = zio->io_size >> unit_shift;
440 uint64_t f = b % dcols;
441 uint64_t o = (b / dcols) << unit_shift;
45d1cae3 442 uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
34dc7c2f
BB
443
444 q = s / (dcols - nparity);
445 r = s - q * (dcols - nparity);
446 bc = (r == 0 ? 0 : r + nparity);
45d1cae3
BB
447 tot = s + nparity * (q + (r == 0 ? 0 : 1));
448
449 if (q == 0) {
450 acols = bc;
451 scols = MIN(dcols, roundup(bc, nparity + 1));
452 } else {
453 acols = dcols;
454 scols = dcols;
455 }
34dc7c2f 456
45d1cae3 457 ASSERT3U(acols, <=, scols);
34dc7c2f 458
45d1cae3 459 rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP);
34dc7c2f
BB
460
461 rm->rm_cols = acols;
45d1cae3 462 rm->rm_scols = scols;
34dc7c2f 463 rm->rm_bigcols = bc;
428870ff 464 rm->rm_skipstart = bc;
34dc7c2f
BB
465 rm->rm_missingdata = 0;
466 rm->rm_missingparity = 0;
467 rm->rm_firstdatacol = nparity;
428870ff
BB
468 rm->rm_datacopy = NULL;
469 rm->rm_reports = 0;
470 rm->rm_freed = 0;
471 rm->rm_ecksuminjected = 0;
34dc7c2f 472
45d1cae3
BB
473 asize = 0;
474
475 for (c = 0; c < scols; c++) {
34dc7c2f
BB
476 col = f + c;
477 coff = o;
478 if (col >= dcols) {
479 col -= dcols;
480 coff += 1ULL << unit_shift;
481 }
482 rm->rm_col[c].rc_devidx = col;
483 rm->rm_col[c].rc_offset = coff;
34dc7c2f 484 rm->rm_col[c].rc_data = NULL;
428870ff 485 rm->rm_col[c].rc_gdata = NULL;
34dc7c2f
BB
486 rm->rm_col[c].rc_error = 0;
487 rm->rm_col[c].rc_tried = 0;
488 rm->rm_col[c].rc_skipped = 0;
45d1cae3
BB
489
490 if (c >= acols)
491 rm->rm_col[c].rc_size = 0;
492 else if (c < bc)
493 rm->rm_col[c].rc_size = (q + 1) << unit_shift;
494 else
495 rm->rm_col[c].rc_size = q << unit_shift;
496
497 asize += rm->rm_col[c].rc_size;
34dc7c2f
BB
498 }
499
45d1cae3
BB
500 ASSERT3U(asize, ==, tot << unit_shift);
501 rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
428870ff
BB
502 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
503 ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
504 ASSERT3U(rm->rm_nskip, <=, nparity);
34dc7c2f
BB
505
506 for (c = 0; c < rm->rm_firstdatacol; c++)
507 rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);
508
509 rm->rm_col[c].rc_data = zio->io_data;
510
511 for (c = c + 1; c < acols; c++)
512 rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
513 rm->rm_col[c - 1].rc_size;
514
515 /*
516 * If all data stored spans all columns, there's a danger that parity
517 * will always be on the same device and, since parity isn't read
518 * during normal operation, that that device's I/O bandwidth won't be
519 * used effectively. We therefore switch the parity every 1MB.
520 *
521 * ... at least that was, ostensibly, the theory. As a practical
522 * matter unless we juggle the parity between all devices evenly, we
523 * won't see any benefit. Further, occasional writes that aren't a
524 * multiple of the LCM of the number of children and the minimum
525 * stripe width are sufficient to avoid pessimal behavior.
526 * Unfortunately, this decision created an implicit on-disk format
527 * requirement that we need to support for all eternity, but only
528 * for single-parity RAID-Z.
428870ff
BB
529 *
530 * If we intend to skip a sector in the zeroth column for padding
531 * we must make sure to note this swap. We will never intend to
532 * skip the first column since at least one data and one parity
533 * column must appear in each row.
34dc7c2f
BB
534 */
535 ASSERT(rm->rm_cols >= 2);
536 ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);
537
538 if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
539 devidx = rm->rm_col[0].rc_devidx;
540 o = rm->rm_col[0].rc_offset;
541 rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
542 rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
543 rm->rm_col[1].rc_devidx = devidx;
544 rm->rm_col[1].rc_offset = o;
428870ff
BB
545
546 if (rm->rm_skipstart == 0)
547 rm->rm_skipstart = 1;
34dc7c2f
BB
548 }
549
550 zio->io_vsd = rm;
428870ff 551 zio->io_vsd_ops = &vdev_raidz_vsd_ops;
34dc7c2f
BB
552 return (rm);
553}
554
34dc7c2f
BB
555static void
556vdev_raidz_generate_parity_p(raidz_map_t *rm)
557{
558 uint64_t *p, *src, pcount, ccount, i;
559 int c;
560
561 pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
562
563 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
564 src = rm->rm_col[c].rc_data;
565 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
566 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
567
568 if (c == rm->rm_firstdatacol) {
569 ASSERT(ccount == pcount);
45d1cae3 570 for (i = 0; i < ccount; i++, src++, p++) {
34dc7c2f
BB
571 *p = *src;
572 }
573 } else {
574 ASSERT(ccount <= pcount);
45d1cae3 575 for (i = 0; i < ccount; i++, src++, p++) {
34dc7c2f
BB
576 *p ^= *src;
577 }
578 }
579 }
580}
581
582static void
583vdev_raidz_generate_parity_pq(raidz_map_t *rm)
584{
45d1cae3 585 uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
34dc7c2f
BB
586 int c;
587
45d1cae3 588 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
34dc7c2f
BB
589 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
590 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
591
592 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
593 src = rm->rm_col[c].rc_data;
594 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
595 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
45d1cae3
BB
596
597 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
34dc7c2f
BB
598
599 if (c == rm->rm_firstdatacol) {
45d1cae3
BB
600 ASSERT(ccnt == pcnt || ccnt == 0);
601 for (i = 0; i < ccnt; i++, src++, p++, q++) {
34dc7c2f 602 *p = *src;
45d1cae3 603 *q = *src;
34dc7c2f 604 }
45d1cae3 605 for (; i < pcnt; i++, src++, p++, q++) {
34dc7c2f 606 *p = 0;
45d1cae3 607 *q = 0;
34dc7c2f
BB
608 }
609 } else {
45d1cae3 610 ASSERT(ccnt <= pcnt);
34dc7c2f
BB
611
612 /*
45d1cae3
BB
613 * Apply the algorithm described above by multiplying
614 * the previous result and adding in the new value.
34dc7c2f 615 */
45d1cae3
BB
616 for (i = 0; i < ccnt; i++, src++, p++, q++) {
617 *p ^= *src;
618
619 VDEV_RAIDZ_64MUL_2(*q, mask);
34dc7c2f 620 *q ^= *src;
45d1cae3
BB
621 }
622
623 /*
624 * Treat short columns as though they are full of 0s.
625 * Note that there's therefore nothing needed for P.
626 */
627 for (; i < pcnt; i++, q++) {
628 VDEV_RAIDZ_64MUL_2(*q, mask);
629 }
630 }
631 }
632}
633
634static void
635vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
636{
637 uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
638 int c;
639
640 pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
641 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
642 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
643 ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
644 rm->rm_col[VDEV_RAIDZ_R].rc_size);
645
646 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
647 src = rm->rm_col[c].rc_data;
648 p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
649 q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
650 r = rm->rm_col[VDEV_RAIDZ_R].rc_data;
651
652 ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);
653
654 if (c == rm->rm_firstdatacol) {
655 ASSERT(ccnt == pcnt || ccnt == 0);
656 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
657 *p = *src;
658 *q = *src;
659 *r = *src;
660 }
661 for (; i < pcnt; i++, src++, p++, q++, r++) {
662 *p = 0;
663 *q = 0;
664 *r = 0;
665 }
666 } else {
667 ASSERT(ccnt <= pcnt);
668
669 /*
670 * Apply the algorithm described above by multiplying
671 * the previous result and adding in the new value.
672 */
673 for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
34dc7c2f 674 *p ^= *src;
45d1cae3
BB
675
676 VDEV_RAIDZ_64MUL_2(*q, mask);
677 *q ^= *src;
678
679 VDEV_RAIDZ_64MUL_4(*r, mask);
680 *r ^= *src;
34dc7c2f
BB
681 }
682
683 /*
684 * Treat short columns as though they are full of 0s.
45d1cae3 685 * Note that there's therefore nothing needed for P.
34dc7c2f 686 */
45d1cae3
BB
687 for (; i < pcnt; i++, q++, r++) {
688 VDEV_RAIDZ_64MUL_2(*q, mask);
689 VDEV_RAIDZ_64MUL_4(*r, mask);
34dc7c2f
BB
690 }
691 }
692 }
693}
694
45d1cae3
BB
695/*
696 * Generate RAID parity in the first virtual columns according to the number of
697 * parity columns available.
698 */
34dc7c2f 699static void
45d1cae3
BB
700vdev_raidz_generate_parity(raidz_map_t *rm)
701{
702 switch (rm->rm_firstdatacol) {
703 case 1:
704 vdev_raidz_generate_parity_p(rm);
705 break;
706 case 2:
707 vdev_raidz_generate_parity_pq(rm);
708 break;
709 case 3:
710 vdev_raidz_generate_parity_pqr(rm);
711 break;
712 default:
713 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
714 }
715}
716
717static int
718vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
34dc7c2f
BB
719{
720 uint64_t *dst, *src, xcount, ccount, count, i;
45d1cae3 721 int x = tgts[0];
34dc7c2f
BB
722 int c;
723
45d1cae3
BB
724 ASSERT(ntgts == 1);
725 ASSERT(x >= rm->rm_firstdatacol);
726 ASSERT(x < rm->rm_cols);
727
34dc7c2f
BB
728 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
729 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
730 ASSERT(xcount > 0);
731
732 src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
733 dst = rm->rm_col[x].rc_data;
734 for (i = 0; i < xcount; i++, dst++, src++) {
735 *dst = *src;
736 }
737
738 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
739 src = rm->rm_col[c].rc_data;
740 dst = rm->rm_col[x].rc_data;
741
742 if (c == x)
743 continue;
744
745 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
746 count = MIN(ccount, xcount);
747
748 for (i = 0; i < count; i++, dst++, src++) {
749 *dst ^= *src;
750 }
751 }
45d1cae3
BB
752
753 return (1 << VDEV_RAIDZ_P);
34dc7c2f
BB
754}
755
45d1cae3
BB
756static int
757vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
34dc7c2f
BB
758{
759 uint64_t *dst, *src, xcount, ccount, count, mask, i;
760 uint8_t *b;
45d1cae3 761 int x = tgts[0];
34dc7c2f
BB
762 int c, j, exp;
763
45d1cae3
BB
764 ASSERT(ntgts == 1);
765
34dc7c2f
BB
766 xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
767 ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));
768
769 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
770 src = rm->rm_col[c].rc_data;
771 dst = rm->rm_col[x].rc_data;
772
773 if (c == x)
774 ccount = 0;
775 else
776 ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
777
778 count = MIN(ccount, xcount);
779
780 if (c == rm->rm_firstdatacol) {
781 for (i = 0; i < count; i++, dst++, src++) {
782 *dst = *src;
783 }
784 for (; i < xcount; i++, dst++) {
785 *dst = 0;
786 }
787
788 } else {
34dc7c2f 789 for (i = 0; i < count; i++, dst++, src++) {
45d1cae3 790 VDEV_RAIDZ_64MUL_2(*dst, mask);
34dc7c2f
BB
791 *dst ^= *src;
792 }
793
794 for (; i < xcount; i++, dst++) {
45d1cae3 795 VDEV_RAIDZ_64MUL_2(*dst, mask);
34dc7c2f
BB
796 }
797 }
798 }
799
800 src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
801 dst = rm->rm_col[x].rc_data;
802 exp = 255 - (rm->rm_cols - 1 - x);
803
804 for (i = 0; i < xcount; i++, dst++, src++) {
805 *dst ^= *src;
806 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
807 *b = vdev_raidz_exp2(*b, exp);
808 }
809 }
45d1cae3
BB
810
811 return (1 << VDEV_RAIDZ_Q);
34dc7c2f
BB
812}
813
45d1cae3
BB
814static int
815vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
34dc7c2f
BB
816{
817 uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
818 void *pdata, *qdata;
819 uint64_t xsize, ysize, i;
45d1cae3
BB
820 int x = tgts[0];
821 int y = tgts[1];
34dc7c2f 822
45d1cae3 823 ASSERT(ntgts == 2);
34dc7c2f
BB
824 ASSERT(x < y);
825 ASSERT(x >= rm->rm_firstdatacol);
826 ASSERT(y < rm->rm_cols);
827
828 ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);
829
830 /*
831 * Move the parity data aside -- we're going to compute parity as
832 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
833 * reuse the parity generation mechanism without trashing the actual
834 * parity so we make those columns appear to be full of zeros by
835 * setting their lengths to zero.
836 */
837 pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
838 qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
839 xsize = rm->rm_col[x].rc_size;
840 ysize = rm->rm_col[y].rc_size;
841
842 rm->rm_col[VDEV_RAIDZ_P].rc_data =
843 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
844 rm->rm_col[VDEV_RAIDZ_Q].rc_data =
845 zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
846 rm->rm_col[x].rc_size = 0;
847 rm->rm_col[y].rc_size = 0;
848
849 vdev_raidz_generate_parity_pq(rm);
850
851 rm->rm_col[x].rc_size = xsize;
852 rm->rm_col[y].rc_size = ysize;
853
854 p = pdata;
855 q = qdata;
856 pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
857 qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
858 xd = rm->rm_col[x].rc_data;
859 yd = rm->rm_col[y].rc_data;
860
861 /*
862 * We now have:
863 * Pxy = P + D_x + D_y
864 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
865 *
866 * We can then solve for D_x:
867 * D_x = A * (P + Pxy) + B * (Q + Qxy)
868 * where
869 * A = 2^(x - y) * (2^(x - y) + 1)^-1
870 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
871 *
872 * With D_x in hand, we can easily solve for D_y:
873 * D_y = P + Pxy + D_x
874 */
875
876 a = vdev_raidz_pow2[255 + x - y];
877 b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
878 tmp = 255 - vdev_raidz_log2[a ^ 1];
879
880 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
881 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
882
883 for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
884 *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
885 vdev_raidz_exp2(*q ^ *qxy, bexp);
886
887 if (i < ysize)
888 *yd = *p ^ *pxy ^ *xd;
889 }
890
891 zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
892 rm->rm_col[VDEV_RAIDZ_P].rc_size);
893 zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
894 rm->rm_col[VDEV_RAIDZ_Q].rc_size);
895
896 /*
897 * Restore the saved parity data.
898 */
899 rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
900 rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;
45d1cae3
BB
901
902 return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
903}
904
905/* BEGIN CSTYLED */
906/*
907 * In the general case of reconstruction, we must solve the system of linear
908 * equations defined by the coeffecients used to generate parity as well as
909 * the contents of the data and parity disks. This can be expressed with
910 * vectors for the original data (D) and the actual data (d) and parity (p)
911 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
912 *
913 * __ __ __ __
914 * | | __ __ | p_0 |
915 * | V | | D_0 | | p_m-1 |
916 * | | x | : | = | d_0 |
917 * | I | | D_n-1 | | : |
918 * | | ~~ ~~ | d_n-1 |
919 * ~~ ~~ ~~ ~~
920 *
921 * I is simply a square identity matrix of size n, and V is a vandermonde
922 * matrix defined by the coeffecients we chose for the various parity columns
923 * (1, 2, 4). Note that these values were chosen both for simplicity, speedy
924 * computation as well as linear separability.
925 *
926 * __ __ __ __
927 * | 1 .. 1 1 1 | | p_0 |
928 * | 2^n-1 .. 4 2 1 | __ __ | : |
929 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
930 * | 1 .. 0 0 0 | | D_1 | | d_0 |
931 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
932 * | : : : : | | : | | d_2 |
933 * | 0 .. 1 0 0 | | D_n-1 | | : |
934 * | 0 .. 0 1 0 | ~~ ~~ | : |
935 * | 0 .. 0 0 1 | | d_n-1 |
936 * ~~ ~~ ~~ ~~
937 *
938 * Note that I, V, d, and p are known. To compute D, we must invert the
939 * matrix and use the known data and parity values to reconstruct the unknown
940 * data values. We begin by removing the rows in V|I and d|p that correspond
941 * to failed or missing columns; we then make V|I square (n x n) and d|p
942 * sized n by removing rows corresponding to unused parity from the bottom up
943 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
944 * using Gauss-Jordan elimination. In the example below we use m=3 parity
945 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
946 * __ __
947 * | 1 1 1 1 1 1 1 1 |
948 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
949 * | 19 205 116 29 64 16 4 1 | / /
950 * | 1 0 0 0 0 0 0 0 | / /
951 * | 0 1 0 0 0 0 0 0 | <--' /
952 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
953 * | 0 0 0 1 0 0 0 0 |
954 * | 0 0 0 0 1 0 0 0 |
955 * | 0 0 0 0 0 1 0 0 |
956 * | 0 0 0 0 0 0 1 0 |
957 * | 0 0 0 0 0 0 0 1 |
958 * ~~ ~~
959 * __ __
960 * | 1 1 1 1 1 1 1 1 |
961 * | 128 64 32 16 8 4 2 1 |
962 * | 19 205 116 29 64 16 4 1 |
963 * | 1 0 0 0 0 0 0 0 |
964 * | 0 1 0 0 0 0 0 0 |
965 * (V|I)' = | 0 0 1 0 0 0 0 0 |
966 * | 0 0 0 1 0 0 0 0 |
967 * | 0 0 0 0 1 0 0 0 |
968 * | 0 0 0 0 0 1 0 0 |
969 * | 0 0 0 0 0 0 1 0 |
970 * | 0 0 0 0 0 0 0 1 |
971 * ~~ ~~
972 *
973 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
974 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
975 * matrix is not singular.
976 * __ __
977 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
978 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
979 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
980 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
981 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
982 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
983 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
984 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
985 * ~~ ~~
986 * __ __
987 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
988 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
989 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
990 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
991 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
992 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
993 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
994 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
995 * ~~ ~~
996 * __ __
997 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
998 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
999 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1000 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1001 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1002 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1003 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1004 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1005 * ~~ ~~
1006 * __ __
1007 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1008 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1009 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1010 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1011 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1012 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1013 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1014 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1015 * ~~ ~~
1016 * __ __
1017 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1018 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1019 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1020 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1021 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1022 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1023 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1024 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1025 * ~~ ~~
1026 * __ __
1027 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1028 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1029 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1030 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1031 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1032 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1033 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1034 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1035 * ~~ ~~
1036 * __ __
1037 * | 0 0 1 0 0 0 0 0 |
1038 * | 167 100 5 41 159 169 217 208 |
1039 * | 166 100 4 40 158 168 216 209 |
1040 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1041 * | 0 0 0 0 1 0 0 0 |
1042 * | 0 0 0 0 0 1 0 0 |
1043 * | 0 0 0 0 0 0 1 0 |
1044 * | 0 0 0 0 0 0 0 1 |
1045 * ~~ ~~
1046 *
1047 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1048 * of the missing data.
1049 *
1050 * As is apparent from the example above, the only non-trivial rows in the
1051 * inverse matrix correspond to the data disks that we're trying to
1052 * reconstruct. Indeed, those are the only rows we need as the others would
1053 * only be useful for reconstructing data known or assumed to be valid. For
1054 * that reason, we only build the coefficients in the rows that correspond to
1055 * targeted columns.
1056 */
1057/* END CSTYLED */
1058
1059static void
1060vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
1061 uint8_t **rows)
1062{
1063 int i, j;
1064 int pow;
1065
1066 ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);
1067
1068 /*
1069 * Fill in the missing rows of interest.
1070 */
1071 for (i = 0; i < nmap; i++) {
1072 ASSERT3S(0, <=, map[i]);
1073 ASSERT3S(map[i], <=, 2);
1074
1075 pow = map[i] * n;
1076 if (pow > 255)
1077 pow -= 255;
1078 ASSERT(pow <= 255);
1079
1080 for (j = 0; j < n; j++) {
1081 pow -= map[i];
1082 if (pow < 0)
1083 pow += 255;
1084 rows[i][j] = vdev_raidz_pow2[pow];
1085 }
1086 }
1087}
1088
1089static void
1090vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
1091 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1092{
1093 int i, j, ii, jj;
1094 uint8_t log;
1095
1096 /*
1097 * Assert that the first nmissing entries from the array of used
1098 * columns correspond to parity columns and that subsequent entries
1099 * correspond to data columns.
1100 */
1101 for (i = 0; i < nmissing; i++) {
1102 ASSERT3S(used[i], <, rm->rm_firstdatacol);
1103 }
1104 for (; i < n; i++) {
1105 ASSERT3S(used[i], >=, rm->rm_firstdatacol);
1106 }
1107
1108 /*
1109 * First initialize the storage where we'll compute the inverse rows.
1110 */
1111 for (i = 0; i < nmissing; i++) {
1112 for (j = 0; j < n; j++) {
1113 invrows[i][j] = (i == j) ? 1 : 0;
1114 }
1115 }
1116
1117 /*
1118 * Subtract all trivial rows from the rows of consequence.
1119 */
1120 for (i = 0; i < nmissing; i++) {
1121 for (j = nmissing; j < n; j++) {
1122 ASSERT3U(used[j], >=, rm->rm_firstdatacol);
1123 jj = used[j] - rm->rm_firstdatacol;
1124 ASSERT3S(jj, <, n);
1125 invrows[i][j] = rows[i][jj];
1126 rows[i][jj] = 0;
1127 }
1128 }
1129
1130 /*
1131 * For each of the rows of interest, we must normalize it and subtract
1132 * a multiple of it from the other rows.
1133 */
1134 for (i = 0; i < nmissing; i++) {
1135 for (j = 0; j < missing[i]; j++) {
1136 ASSERT3U(rows[i][j], ==, 0);
1137 }
1138 ASSERT3U(rows[i][missing[i]], !=, 0);
1139
1140 /*
1141 * Compute the inverse of the first element and multiply each
1142 * element in the row by that value.
1143 */
1144 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1145
1146 for (j = 0; j < n; j++) {
1147 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1148 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1149 }
1150
1151 for (ii = 0; ii < nmissing; ii++) {
1152 if (i == ii)
1153 continue;
1154
1155 ASSERT3U(rows[ii][missing[i]], !=, 0);
1156
1157 log = vdev_raidz_log2[rows[ii][missing[i]]];
1158
1159 for (j = 0; j < n; j++) {
1160 rows[ii][j] ^=
1161 vdev_raidz_exp2(rows[i][j], log);
1162 invrows[ii][j] ^=
1163 vdev_raidz_exp2(invrows[i][j], log);
1164 }
1165 }
1166 }
1167
1168 /*
1169 * Verify that the data that is left in the rows are properly part of
1170 * an identity matrix.
1171 */
1172 for (i = 0; i < nmissing; i++) {
1173 for (j = 0; j < n; j++) {
1174 if (j == missing[i]) {
1175 ASSERT3U(rows[i][j], ==, 1);
1176 } else {
1177 ASSERT3U(rows[i][j], ==, 0);
1178 }
1179 }
1180 }
1181}
1182
1183static void
1184vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
1185 int *missing, uint8_t **invrows, const uint8_t *used)
1186{
1187 int i, j, x, cc, c;
1188 uint8_t *src;
1189 uint64_t ccount;
1190 uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
1191 uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
1192 uint8_t log, val;
1193 int ll;
1194 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1195 uint8_t *p, *pp;
1196 size_t psize;
1197
1198 psize = sizeof (invlog[0][0]) * n * nmissing;
1199 p = kmem_alloc(psize, KM_SLEEP);
1200
1201 for (pp = p, i = 0; i < nmissing; i++) {
1202 invlog[i] = pp;
1203 pp += n;
1204 }
1205
1206 for (i = 0; i < nmissing; i++) {
1207 for (j = 0; j < n; j++) {
1208 ASSERT3U(invrows[i][j], !=, 0);
1209 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1210 }
1211 }
1212
1213 for (i = 0; i < n; i++) {
1214 c = used[i];
1215 ASSERT3U(c, <, rm->rm_cols);
1216
1217 src = rm->rm_col[c].rc_data;
1218 ccount = rm->rm_col[c].rc_size;
1219 for (j = 0; j < nmissing; j++) {
1220 cc = missing[j] + rm->rm_firstdatacol;
1221 ASSERT3U(cc, >=, rm->rm_firstdatacol);
1222 ASSERT3U(cc, <, rm->rm_cols);
1223 ASSERT3U(cc, !=, c);
1224
1225 dst[j] = rm->rm_col[cc].rc_data;
1226 dcount[j] = rm->rm_col[cc].rc_size;
1227 }
1228
1229 ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);
1230
1231 for (x = 0; x < ccount; x++, src++) {
1232 if (*src != 0)
1233 log = vdev_raidz_log2[*src];
1234
1235 for (cc = 0; cc < nmissing; cc++) {
1236 if (x >= dcount[cc])
1237 continue;
1238
1239 if (*src == 0) {
1240 val = 0;
1241 } else {
1242 if ((ll = log + invlog[cc][i]) >= 255)
1243 ll -= 255;
1244 val = vdev_raidz_pow2[ll];
1245 }
1246
1247 if (i == 0)
1248 dst[cc][x] = val;
1249 else
1250 dst[cc][x] ^= val;
1251 }
1252 }
1253 }
1254
1255 kmem_free(p, psize);
1256}
1257
1258static int
1259vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
1260{
1261 int n, i, c, t, tt;
1262 int nmissing_rows;
1263 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1264 int parity_map[VDEV_RAIDZ_MAXPARITY];
1265
1266 uint8_t *p, *pp;
1267 size_t psize;
1268
1269 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1270 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1271 uint8_t *used;
1272
1273 int code = 0;
1274
1275
1276 n = rm->rm_cols - rm->rm_firstdatacol;
1277
1278 /*
1279 * Figure out which data columns are missing.
1280 */
1281 nmissing_rows = 0;
1282 for (t = 0; t < ntgts; t++) {
1283 if (tgts[t] >= rm->rm_firstdatacol) {
1284 missing_rows[nmissing_rows++] =
1285 tgts[t] - rm->rm_firstdatacol;
1286 }
1287 }
1288
1289 /*
1290 * Figure out which parity columns to use to help generate the missing
1291 * data columns.
1292 */
1293 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1294 ASSERT(tt < ntgts);
1295 ASSERT(c < rm->rm_firstdatacol);
1296
1297 /*
1298 * Skip any targeted parity columns.
1299 */
1300 if (c == tgts[tt]) {
1301 tt++;
1302 continue;
1303 }
1304
1305 code |= 1 << c;
1306
1307 parity_map[i] = c;
1308 i++;
1309 }
1310
1311 ASSERT(code != 0);
1312 ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);
1313
1314 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1315 nmissing_rows * n + sizeof (used[0]) * n;
1316 p = kmem_alloc(psize, KM_SLEEP);
1317
1318 for (pp = p, i = 0; i < nmissing_rows; i++) {
1319 rows[i] = pp;
1320 pp += n;
1321 invrows[i] = pp;
1322 pp += n;
1323 }
1324 used = pp;
1325
1326 for (i = 0; i < nmissing_rows; i++) {
1327 used[i] = parity_map[i];
1328 }
1329
1330 for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1331 if (tt < nmissing_rows &&
1332 c == missing_rows[tt] + rm->rm_firstdatacol) {
1333 tt++;
1334 continue;
1335 }
1336
1337 ASSERT3S(i, <, n);
1338 used[i] = c;
1339 i++;
1340 }
1341
1342 /*
1343 * Initialize the interesting rows of the matrix.
1344 */
1345 vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);
1346
1347 /*
1348 * Invert the matrix.
1349 */
1350 vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
1351 invrows, used);
1352
1353 /*
1354 * Reconstruct the missing data using the generated matrix.
1355 */
1356 vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
1357 invrows, used);
1358
1359 kmem_free(p, psize);
1360
1361 return (code);
34dc7c2f
BB
1362}
1363
45d1cae3
BB
1364static int
1365vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
1366{
1367 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
1368 int ntgts;
1369 int i, c;
1370 int code;
1371 int nbadparity, nbaddata;
1372 int parity_valid[VDEV_RAIDZ_MAXPARITY];
1373
1374 /*
1375 * The tgts list must already be sorted.
1376 */
1377 for (i = 1; i < nt; i++) {
1378 ASSERT(t[i] > t[i - 1]);
1379 }
1380
1381 nbadparity = rm->rm_firstdatacol;
1382 nbaddata = rm->rm_cols - nbadparity;
1383 ntgts = 0;
1384 for (i = 0, c = 0; c < rm->rm_cols; c++) {
1385 if (c < rm->rm_firstdatacol)
1386 parity_valid[c] = B_FALSE;
1387
1388 if (i < nt && c == t[i]) {
1389 tgts[ntgts++] = c;
1390 i++;
1391 } else if (rm->rm_col[c].rc_error != 0) {
1392 tgts[ntgts++] = c;
1393 } else if (c >= rm->rm_firstdatacol) {
1394 nbaddata--;
1395 } else {
1396 parity_valid[c] = B_TRUE;
1397 nbadparity--;
1398 }
1399 }
1400
1401 ASSERT(ntgts >= nt);
1402 ASSERT(nbaddata >= 0);
1403 ASSERT(nbaddata + nbadparity == ntgts);
1404
1405 dt = &tgts[nbadparity];
1406
1407 /*
1408 * See if we can use any of our optimized reconstruction routines.
1409 */
1410 if (!vdev_raidz_default_to_general) {
1411 switch (nbaddata) {
1412 case 1:
1413 if (parity_valid[VDEV_RAIDZ_P])
1414 return (vdev_raidz_reconstruct_p(rm, dt, 1));
1415
1416 ASSERT(rm->rm_firstdatacol > 1);
1417
1418 if (parity_valid[VDEV_RAIDZ_Q])
1419 return (vdev_raidz_reconstruct_q(rm, dt, 1));
1420
1421 ASSERT(rm->rm_firstdatacol > 2);
1422 break;
1423
1424 case 2:
1425 ASSERT(rm->rm_firstdatacol > 1);
1426
1427 if (parity_valid[VDEV_RAIDZ_P] &&
1428 parity_valid[VDEV_RAIDZ_Q])
1429 return (vdev_raidz_reconstruct_pq(rm, dt, 2));
1430
1431 ASSERT(rm->rm_firstdatacol > 2);
1432
1433 break;
1434 }
1435 }
1436
1437 code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
1438 ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
1439 ASSERT(code > 0);
1440 return (code);
1441}
34dc7c2f
BB
1442
1443static int
1444vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift)
1445{
1446 vdev_t *cvd;
1447 uint64_t nparity = vd->vdev_nparity;
45d1cae3 1448 int c;
34dc7c2f
BB
1449 int lasterror = 0;
1450 int numerrors = 0;
1451
1452 ASSERT(nparity > 0);
1453
1454 if (nparity > VDEV_RAIDZ_MAXPARITY ||
1455 vd->vdev_children < nparity + 1) {
1456 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
1457 return (EINVAL);
1458 }
1459
45d1cae3
BB
1460 vdev_open_children(vd);
1461
34dc7c2f
BB
1462 for (c = 0; c < vd->vdev_children; c++) {
1463 cvd = vd->vdev_child[c];
1464
45d1cae3
BB
1465 if (cvd->vdev_open_error != 0) {
1466 lasterror = cvd->vdev_open_error;
34dc7c2f
BB
1467 numerrors++;
1468 continue;
1469 }
1470
1471 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1472 *ashift = MAX(*ashift, cvd->vdev_ashift);
1473 }
1474
1475 *asize *= vd->vdev_children;
1476
1477 if (numerrors > nparity) {
1478 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
1479 return (lasterror);
1480 }
1481
1482 return (0);
1483}
1484
1485static void
1486vdev_raidz_close(vdev_t *vd)
1487{
1488 int c;
1489
1490 for (c = 0; c < vd->vdev_children; c++)
1491 vdev_close(vd->vdev_child[c]);
1492}
1493
1494static uint64_t
1495vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1496{
1497 uint64_t asize;
1498 uint64_t ashift = vd->vdev_top->vdev_ashift;
1499 uint64_t cols = vd->vdev_children;
1500 uint64_t nparity = vd->vdev_nparity;
1501
1502 asize = ((psize - 1) >> ashift) + 1;
1503 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1504 asize = roundup(asize, nparity + 1) << ashift;
1505
1506 return (asize);
1507}
1508
1509static void
1510vdev_raidz_child_done(zio_t *zio)
1511{
1512 raidz_col_t *rc = zio->io_private;
1513
1514 rc->rc_error = zio->io_error;
1515 rc->rc_tried = 1;
1516 rc->rc_skipped = 0;
1517}
1518
34dc7c2f
BB
1519static int
1520vdev_raidz_io_start(zio_t *zio)
1521{
1522 vdev_t *vd = zio->io_vd;
1523 vdev_t *tvd = vd->vdev_top;
1524 vdev_t *cvd;
34dc7c2f
BB
1525 raidz_map_t *rm;
1526 raidz_col_t *rc;
45d1cae3 1527 int c, i;
34dc7c2f
BB
1528
1529 rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
1530 vd->vdev_nparity);
1531
1532 ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1533
1534 if (zio->io_type == ZIO_TYPE_WRITE) {
45d1cae3 1535 vdev_raidz_generate_parity(rm);
34dc7c2f
BB
1536
1537 for (c = 0; c < rm->rm_cols; c++) {
1538 rc = &rm->rm_col[c];
1539 cvd = vd->vdev_child[rc->rc_devidx];
1540 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1541 rc->rc_offset, rc->rc_data, rc->rc_size,
b128c09f 1542 zio->io_type, zio->io_priority, 0,
34dc7c2f
BB
1543 vdev_raidz_child_done, rc));
1544 }
1545
45d1cae3
BB
1546 /*
1547 * Generate optional I/Os for any skipped sectors to improve
1548 * aggregation contiguity.
1549 */
428870ff 1550 for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
45d1cae3
BB
1551 ASSERT(c <= rm->rm_scols);
1552 if (c == rm->rm_scols)
1553 c = 0;
1554 rc = &rm->rm_col[c];
1555 cvd = vd->vdev_child[rc->rc_devidx];
1556 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1557 rc->rc_offset + rc->rc_size, NULL,
1558 1 << tvd->vdev_ashift,
1559 zio->io_type, zio->io_priority,
1560 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
1561 }
1562
b128c09f 1563 return (ZIO_PIPELINE_CONTINUE);
34dc7c2f
BB
1564 }
1565
1566 ASSERT(zio->io_type == ZIO_TYPE_READ);
1567
1568 /*
1569 * Iterate over the columns in reverse order so that we hit the parity
45d1cae3 1570 * last -- any errors along the way will force us to read the parity.
34dc7c2f
BB
1571 */
1572 for (c = rm->rm_cols - 1; c >= 0; c--) {
1573 rc = &rm->rm_col[c];
1574 cvd = vd->vdev_child[rc->rc_devidx];
1575 if (!vdev_readable(cvd)) {
1576 if (c >= rm->rm_firstdatacol)
1577 rm->rm_missingdata++;
1578 else
1579 rm->rm_missingparity++;
1580 rc->rc_error = ENXIO;
1581 rc->rc_tried = 1; /* don't even try */
1582 rc->rc_skipped = 1;
1583 continue;
1584 }
428870ff 1585 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
34dc7c2f
BB
1586 if (c >= rm->rm_firstdatacol)
1587 rm->rm_missingdata++;
1588 else
1589 rm->rm_missingparity++;
1590 rc->rc_error = ESTALE;
1591 rc->rc_skipped = 1;
1592 continue;
1593 }
1594 if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
9babb374 1595 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
34dc7c2f
BB
1596 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1597 rc->rc_offset, rc->rc_data, rc->rc_size,
b128c09f 1598 zio->io_type, zio->io_priority, 0,
34dc7c2f
BB
1599 vdev_raidz_child_done, rc));
1600 }
1601 }
1602
b128c09f 1603 return (ZIO_PIPELINE_CONTINUE);
34dc7c2f
BB
1604}
1605
428870ff 1606
34dc7c2f
BB
1607/*
1608 * Report a checksum error for a child of a RAID-Z device.
1609 */
1610static void
428870ff 1611raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
34dc7c2f
BB
1612{
1613 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
34dc7c2f
BB
1614
1615 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
428870ff
BB
1616 zio_bad_cksum_t zbc;
1617 raidz_map_t *rm = zio->io_vsd;
1618
34dc7c2f
BB
1619 mutex_enter(&vd->vdev_stat_lock);
1620 vd->vdev_stat.vs_checksum_errors++;
1621 mutex_exit(&vd->vdev_stat_lock);
428870ff
BB
1622
1623 zbc.zbc_has_cksum = 0;
1624 zbc.zbc_injected = rm->rm_ecksuminjected;
1625
1626 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
1627 rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
1628 &zbc);
34dc7c2f 1629 }
428870ff
BB
1630}
1631
1632/*
1633 * We keep track of whether or not there were any injected errors, so that
1634 * any ereports we generate can note it.
1635 */
1636static int
1637raidz_checksum_verify(zio_t *zio)
1638{
1639 zio_bad_cksum_t zbc;
1640 raidz_map_t *rm = zio->io_vsd;
1641
1642 int ret = zio_checksum_error(zio, &zbc);
1643 if (ret != 0 && zbc.zbc_injected != 0)
1644 rm->rm_ecksuminjected = 1;
34dc7c2f 1645
428870ff 1646 return (ret);
34dc7c2f
BB
1647}
1648
1649/*
1650 * Generate the parity from the data columns. If we tried and were able to
1651 * read the parity without error, verify that the generated parity matches the
1652 * data we read. If it doesn't, we fire off a checksum error. Return the
1653 * number such failures.
1654 */
1655static int
1656raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
1657{
1658 void *orig[VDEV_RAIDZ_MAXPARITY];
1659 int c, ret = 0;
1660 raidz_col_t *rc;
1661
1662 for (c = 0; c < rm->rm_firstdatacol; c++) {
1663 rc = &rm->rm_col[c];
1664 if (!rc->rc_tried || rc->rc_error != 0)
1665 continue;
1666 orig[c] = zio_buf_alloc(rc->rc_size);
1667 bcopy(rc->rc_data, orig[c], rc->rc_size);
1668 }
1669
45d1cae3 1670 vdev_raidz_generate_parity(rm);
34dc7c2f
BB
1671
1672 for (c = 0; c < rm->rm_firstdatacol; c++) {
1673 rc = &rm->rm_col[c];
1674 if (!rc->rc_tried || rc->rc_error != 0)
1675 continue;
1676 if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
428870ff 1677 raidz_checksum_error(zio, rc, orig[c]);
34dc7c2f
BB
1678 rc->rc_error = ECKSUM;
1679 ret++;
1680 }
1681 zio_buf_free(orig[c], rc->rc_size);
1682 }
1683
1684 return (ret);
1685}
1686
45d1cae3
BB
1687/*
1688 * Keep statistics on all the ways that we used parity to correct data.
1689 */
1690static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];
34dc7c2f
BB
1691
1692static int
b128c09f
BB
1693vdev_raidz_worst_error(raidz_map_t *rm)
1694{
d6320ddb 1695 int c, error = 0;
b128c09f 1696
d6320ddb 1697 for (c = 0; c < rm->rm_cols; c++)
b128c09f
BB
1698 error = zio_worst_error(error, rm->rm_col[c].rc_error);
1699
1700 return (error);
1701}
1702
45d1cae3
BB
1703/*
1704 * Iterate over all combinations of bad data and attempt a reconstruction.
1705 * Note that the algorithm below is non-optimal because it doesn't take into
1706 * account how reconstruction is actually performed. For example, with
1707 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1708 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1709 * cases we'd only use parity information in column 0.
1710 */
1711static int
1712vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1713{
1714 raidz_map_t *rm = zio->io_vsd;
1715 raidz_col_t *rc;
1716 void *orig[VDEV_RAIDZ_MAXPARITY];
1717 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1718 int *tgts = &tstore[1];
1719 int current, next, i, c, n;
1720 int code, ret = 0;
1721
1722 ASSERT(total_errors < rm->rm_firstdatacol);
1723
1724 /*
1725 * This simplifies one edge condition.
1726 */
1727 tgts[-1] = -1;
1728
1729 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1730 /*
1731 * Initialize the targets array by finding the first n columns
1732 * that contain no error.
1733 *
1734 * If there were no data errors, we need to ensure that we're
1735 * always explicitly attempting to reconstruct at least one
1736 * data column. To do this, we simply push the highest target
1737 * up into the data columns.
1738 */
1739 for (c = 0, i = 0; i < n; i++) {
1740 if (i == n - 1 && data_errors == 0 &&
1741 c < rm->rm_firstdatacol) {
1742 c = rm->rm_firstdatacol;
1743 }
1744
1745 while (rm->rm_col[c].rc_error != 0) {
1746 c++;
1747 ASSERT3S(c, <, rm->rm_cols);
1748 }
1749
1750 tgts[i] = c++;
1751 }
1752
1753 /*
1754 * Setting tgts[n] simplifies the other edge condition.
1755 */
1756 tgts[n] = rm->rm_cols;
1757
1758 /*
1759 * These buffers were allocated in previous iterations.
1760 */
1761 for (i = 0; i < n - 1; i++) {
1762 ASSERT(orig[i] != NULL);
1763 }
1764
1765 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1766
1767 current = 0;
1768 next = tgts[current];
1769
1770 while (current != n) {
1771 tgts[current] = next;
1772 current = 0;
1773
1774 /*
1775 * Save off the original data that we're going to
1776 * attempt to reconstruct.
1777 */
1778 for (i = 0; i < n; i++) {
1779 ASSERT(orig[i] != NULL);
1780 c = tgts[i];
1781 ASSERT3S(c, >=, 0);
1782 ASSERT3S(c, <, rm->rm_cols);
1783 rc = &rm->rm_col[c];
1784 bcopy(rc->rc_data, orig[i], rc->rc_size);
1785 }
1786
1787 /*
1788 * Attempt a reconstruction and exit the outer loop on
1789 * success.
1790 */
1791 code = vdev_raidz_reconstruct(rm, tgts, n);
428870ff 1792 if (raidz_checksum_verify(zio) == 0) {
45d1cae3
BB
1793 atomic_inc_64(&raidz_corrected[code]);
1794
1795 for (i = 0; i < n; i++) {
1796 c = tgts[i];
1797 rc = &rm->rm_col[c];
1798 ASSERT(rc->rc_error == 0);
1799 if (rc->rc_tried)
428870ff
BB
1800 raidz_checksum_error(zio, rc,
1801 orig[i]);
45d1cae3
BB
1802 rc->rc_error = ECKSUM;
1803 }
1804
1805 ret = code;
1806 goto done;
1807 }
1808
1809 /*
1810 * Restore the original data.
1811 */
1812 for (i = 0; i < n; i++) {
1813 c = tgts[i];
1814 rc = &rm->rm_col[c];
1815 bcopy(orig[i], rc->rc_data, rc->rc_size);
1816 }
1817
1818 do {
1819 /*
1820 * Find the next valid column after the current
1821 * position..
1822 */
1823 for (next = tgts[current] + 1;
1824 next < rm->rm_cols &&
1825 rm->rm_col[next].rc_error != 0; next++)
1826 continue;
1827
1828 ASSERT(next <= tgts[current + 1]);
1829
1830 /*
1831 * If that spot is available, we're done here.
1832 */
1833 if (next != tgts[current + 1])
1834 break;
1835
1836 /*
1837 * Otherwise, find the next valid column after
1838 * the previous position.
1839 */
1840 for (c = tgts[current - 1] + 1;
1841 rm->rm_col[c].rc_error != 0; c++)
1842 continue;
1843
1844 tgts[current] = c;
1845 current++;
1846
1847 } while (current != n);
1848 }
1849 }
1850 n--;
1851done:
1852 for (i = 0; i < n; i++) {
1853 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
1854 }
1855
1856 return (ret);
1857}
1858
b128c09f 1859static void
34dc7c2f
BB
1860vdev_raidz_io_done(zio_t *zio)
1861{
1862 vdev_t *vd = zio->io_vd;
1863 vdev_t *cvd;
1864 raidz_map_t *rm = zio->io_vsd;
45d1cae3 1865 raidz_col_t *rc;
34dc7c2f
BB
1866 int unexpected_errors = 0;
1867 int parity_errors = 0;
1868 int parity_untried = 0;
1869 int data_errors = 0;
b128c09f 1870 int total_errors = 0;
45d1cae3
BB
1871 int n, c;
1872 int tgts[VDEV_RAIDZ_MAXPARITY];
1873 int code;
34dc7c2f
BB
1874
1875 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
1876
34dc7c2f
BB
1877 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1878 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1879
1880 for (c = 0; c < rm->rm_cols; c++) {
1881 rc = &rm->rm_col[c];
1882
34dc7c2f 1883 if (rc->rc_error) {
b128c09f 1884 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
34dc7c2f
BB
1885
1886 if (c < rm->rm_firstdatacol)
1887 parity_errors++;
1888 else
1889 data_errors++;
1890
1891 if (!rc->rc_skipped)
1892 unexpected_errors++;
1893
b128c09f 1894 total_errors++;
34dc7c2f
BB
1895 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1896 parity_untried++;
1897 }
1898 }
1899
1900 if (zio->io_type == ZIO_TYPE_WRITE) {
1901 /*
b128c09f
BB
1902 * XXX -- for now, treat partial writes as a success.
1903 * (If we couldn't write enough columns to reconstruct
1904 * the data, the I/O failed. Otherwise, good enough.)
1905 *
1906 * Now that we support write reallocation, it would be better
1907 * to treat partial failure as real failure unless there are
1908 * no non-degraded top-level vdevs left, and not update DTLs
1909 * if we intend to reallocate.
34dc7c2f
BB
1910 */
1911 /* XXPOLICY */
b128c09f
BB
1912 if (total_errors > rm->rm_firstdatacol)
1913 zio->io_error = vdev_raidz_worst_error(rm);
34dc7c2f 1914
b128c09f 1915 return;
34dc7c2f
BB
1916 }
1917
1918 ASSERT(zio->io_type == ZIO_TYPE_READ);
1919 /*
1920 * There are three potential phases for a read:
1921 * 1. produce valid data from the columns read
1922 * 2. read all disks and try again
1923 * 3. perform combinatorial reconstruction
1924 *
1925 * Each phase is progressively both more expensive and less likely to
1926 * occur. If we encounter more errors than we can repair or all phases
1927 * fail, we have no choice but to return an error.
1928 */
1929
1930 /*
1931 * If the number of errors we saw was correctable -- less than or equal
1932 * to the number of parity disks read -- attempt to produce data that
1933 * has a valid checksum. Naturally, this case applies in the absence of
1934 * any errors.
1935 */
b128c09f 1936 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
45d1cae3 1937 if (data_errors == 0) {
428870ff 1938 if (raidz_checksum_verify(zio) == 0) {
34dc7c2f
BB
1939 /*
1940 * If we read parity information (unnecessarily
1941 * as it happens since no reconstruction was
1942 * needed) regenerate and verify the parity.
1943 * We also regenerate parity when resilvering
1944 * so we can write it out to the failed device
1945 * later.
1946 */
1947 if (parity_errors + parity_untried <
1948 rm->rm_firstdatacol ||
1949 (zio->io_flags & ZIO_FLAG_RESILVER)) {
1950 n = raidz_parity_verify(zio, rm);
1951 unexpected_errors += n;
1952 ASSERT(parity_errors + n <=
1953 rm->rm_firstdatacol);
1954 }
1955 goto done;
1956 }
45d1cae3 1957 } else {
34dc7c2f
BB
1958 /*
1959 * We either attempt to read all the parity columns or
1960 * none of them. If we didn't try to read parity, we
1961 * wouldn't be here in the correctable case. There must
1962 * also have been fewer parity errors than parity
1963 * columns or, again, we wouldn't be in this code path.
1964 */
1965 ASSERT(parity_untried == 0);
1966 ASSERT(parity_errors < rm->rm_firstdatacol);
1967
1968 /*
45d1cae3 1969 * Identify the data columns that reported an error.
34dc7c2f 1970 */
45d1cae3 1971 n = 0;
34dc7c2f
BB
1972 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1973 rc = &rm->rm_col[c];
45d1cae3
BB
1974 if (rc->rc_error != 0) {
1975 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1976 tgts[n++] = c;
1977 }
34dc7c2f 1978 }
34dc7c2f 1979
45d1cae3
BB
1980 ASSERT(rm->rm_firstdatacol >= n);
1981
1982 code = vdev_raidz_reconstruct(rm, tgts, n);
34dc7c2f 1983
428870ff 1984 if (raidz_checksum_verify(zio) == 0) {
45d1cae3 1985 atomic_inc_64(&raidz_corrected[code]);
34dc7c2f
BB
1986
1987 /*
45d1cae3
BB
1988 * If we read more parity disks than were used
1989 * for reconstruction, confirm that the other
1990 * parity disks produced correct data. This
1991 * routine is suboptimal in that it regenerates
1992 * the parity that we already used in addition
1993 * to the parity that we're attempting to
1994 * verify, but this should be a relatively
1995 * uncommon case, and can be optimized if it
1996 * becomes a problem. Note that we regenerate
1997 * parity when resilvering so we can write it
1998 * out to failed devices later.
34dc7c2f 1999 */
45d1cae3 2000 if (parity_errors < rm->rm_firstdatacol - n ||
34dc7c2f
BB
2001 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2002 n = raidz_parity_verify(zio, rm);
2003 unexpected_errors += n;
2004 ASSERT(parity_errors + n <=
2005 rm->rm_firstdatacol);
2006 }
2007
2008 goto done;
2009 }
34dc7c2f
BB
2010 }
2011 }
2012
2013 /*
2014 * This isn't a typical situation -- either we got a read error or
2015 * a child silently returned bad data. Read every block so we can
2016 * try again with as much data and parity as we can track down. If
2017 * we've already been through once before, all children will be marked
2018 * as tried so we'll proceed to combinatorial reconstruction.
2019 */
2020 unexpected_errors = 1;
2021 rm->rm_missingdata = 0;
2022 rm->rm_missingparity = 0;
2023
2024 for (c = 0; c < rm->rm_cols; c++) {
2025 if (rm->rm_col[c].rc_tried)
2026 continue;
2027
34dc7c2f
BB
2028 zio_vdev_io_redone(zio);
2029 do {
2030 rc = &rm->rm_col[c];
2031 if (rc->rc_tried)
2032 continue;
2033 zio_nowait(zio_vdev_child_io(zio, NULL,
2034 vd->vdev_child[rc->rc_devidx],
2035 rc->rc_offset, rc->rc_data, rc->rc_size,
b128c09f 2036 zio->io_type, zio->io_priority, 0,
34dc7c2f
BB
2037 vdev_raidz_child_done, rc));
2038 } while (++c < rm->rm_cols);
34dc7c2f 2039
b128c09f 2040 return;
34dc7c2f
BB
2041 }
2042
2043 /*
2044 * At this point we've attempted to reconstruct the data given the
2045 * errors we detected, and we've attempted to read all columns. There
2046 * must, therefore, be one or more additional problems -- silent errors
2047 * resulting in invalid data rather than explicit I/O errors resulting
45d1cae3
BB
2048 * in absent data. We check if there is enough additional data to
2049 * possibly reconstruct the data and then perform combinatorial
2050 * reconstruction over all possible combinations. If that fails,
2051 * we're cooked.
34dc7c2f 2052 */
428870ff 2053 if (total_errors > rm->rm_firstdatacol) {
b128c09f 2054 zio->io_error = vdev_raidz_worst_error(rm);
34dc7c2f 2055
428870ff
BB
2056 } else if (total_errors < rm->rm_firstdatacol &&
2057 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
34dc7c2f 2058 /*
45d1cae3
BB
2059 * If we didn't use all the available parity for the
2060 * combinatorial reconstruction, verify that the remaining
2061 * parity is correct.
34dc7c2f 2062 */
45d1cae3
BB
2063 if (code != (1 << rm->rm_firstdatacol) - 1)
2064 (void) raidz_parity_verify(zio, rm);
2065 } else {
34dc7c2f 2066 /*
428870ff
BB
2067 * We're here because either:
2068 *
2069 * total_errors == rm_first_datacol, or
2070 * vdev_raidz_combrec() failed
2071 *
2072 * In either case, there is enough bad data to prevent
2073 * reconstruction.
2074 *
2075 * Start checksum ereports for all children which haven't
2076 * failed, and the IO wasn't speculative.
34dc7c2f 2077 */
45d1cae3 2078 zio->io_error = ECKSUM;
34dc7c2f 2079
45d1cae3
BB
2080 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2081 for (c = 0; c < rm->rm_cols; c++) {
2082 rc = &rm->rm_col[c];
428870ff
BB
2083 if (rc->rc_error == 0) {
2084 zio_bad_cksum_t zbc;
2085 zbc.zbc_has_cksum = 0;
2086 zbc.zbc_injected =
2087 rm->rm_ecksuminjected;
2088
2089 zfs_ereport_start_checksum(
2090 zio->io_spa,
2091 vd->vdev_child[rc->rc_devidx],
2092 zio, rc->rc_offset, rc->rc_size,
2093 (void *)(uintptr_t)c, &zbc);
2094 }
34dc7c2f 2095 }
34dc7c2f
BB
2096 }
2097 }
2098
2099done:
2100 zio_checksum_verified(zio);
2101
fb5f0bc8 2102 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
34dc7c2f 2103 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
34dc7c2f
BB
2104 /*
2105 * Use the good data we have in hand to repair damaged children.
34dc7c2f 2106 */
34dc7c2f
BB
2107 for (c = 0; c < rm->rm_cols; c++) {
2108 rc = &rm->rm_col[c];
2109 cvd = vd->vdev_child[rc->rc_devidx];
2110
2111 if (rc->rc_error == 0)
2112 continue;
2113
b128c09f 2114 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
34dc7c2f
BB
2115 rc->rc_offset, rc->rc_data, rc->rc_size,
2116 ZIO_TYPE_WRITE, zio->io_priority,
fb5f0bc8
BB
2117 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2118 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
34dc7c2f 2119 }
34dc7c2f 2120 }
34dc7c2f
BB
2121}
2122
2123static void
2124vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2125{
2126 if (faulted > vd->vdev_nparity)
2127 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2128 VDEV_AUX_NO_REPLICAS);
2129 else if (degraded + faulted != 0)
2130 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2131 else
2132 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2133}
2134
2135vdev_ops_t vdev_raidz_ops = {
2136 vdev_raidz_open,
2137 vdev_raidz_close,
34dc7c2f
BB
2138 vdev_raidz_asize,
2139 vdev_raidz_io_start,
2140 vdev_raidz_io_done,
2141 vdev_raidz_state_change,
428870ff
BB
2142 NULL,
2143 NULL,
34dc7c2f
BB
2144 VDEV_TYPE_RAIDZ, /* name of this vdev type */
2145 B_FALSE /* not a leaf vdev */
2146};