/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/vdev_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 * o addition (+) is represented by a bitwise XOR
 * o subtraction (-) is therefore identical to addition: A + B = A - B
 * o multiplication of A by 2 is defined by the following bitwise expression:
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A^(255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */
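/*
 * A worked example of the arithmetic above (an illustrative aside; the
 * concrete values are assumptions chosen for the example, not driver state).
 * Multiplying A = 0x80 by 2 shifts the high bit out, so the result, truncated
 * to a byte, is XORed with 0x1d (the low byte of the primitive polynomial):
 *
 *	(0x80 * 2) = ((0x80 << 1) & 0xff) ^ 0x1d = 0x00 ^ 0x1d = 0x1d
 *
 * A general multiply uses the vdev_raidz_pow2[]/vdev_raidz_log2[] tables
 * defined below; for nonzero A and B:
 *
 *	A * B = vdev_raidz_pow2[(vdev_raidz_log2[A] + vdev_raidz_log2[B]) % 255]
 *
 * e.g. 0x02 * 0x80 = pow2[(1 + 7) % 255] = pow2[8] = 0x1d, matching the
 * single-step multiply above.
 */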

typedef struct raidz_col {
	uint64_t rc_devidx;		/* child device index for I/O */
	uint64_t rc_offset;		/* device offset */
	uint64_t rc_size;		/* I/O size */
	void *rc_data;			/* I/O data */
	void *rc_gdata;			/* used to store the "good" version */
	int rc_error;			/* I/O error for this device */
	uint8_t rc_tried;		/* Did we attempt this I/O column? */
	uint8_t rc_skipped;		/* Did we skip this I/O column? */
} raidz_col_t;

typedef struct raidz_map {
	uint64_t rm_cols;		/* Regular column count */
	uint64_t rm_scols;		/* Count including skipped columns */
	uint64_t rm_bigcols;		/* Number of oversized columns */
	uint64_t rm_asize;		/* Actual total I/O size */
	uint64_t rm_missingdata;	/* Count of missing data devices */
	uint64_t rm_missingparity;	/* Count of missing parity devices */
	uint64_t rm_firstdatacol;	/* First data column/parity count */
	uint64_t rm_nskip;		/* Skipped sectors for padding */
	uint64_t rm_skipstart;		/* Column index of padding start */
	void *rm_datacopy;		/* rm_asize-buffer of copied data */
	uintptr_t rm_reports;		/* # of referencing checksum reports */
	uint8_t rm_freed;		/* map no longer has referencing ZIO */
	uint8_t rm_ecksuminjected;	/* checksum error was injected */
	raidz_col_t rm_col[1];		/* Flexible array of I/O columns */
} raidz_map_t;

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
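/*
 * A worked example of the 64-bit multiply above (illustrative values only,
 * assumed for the example). With x = 0x0102408000000000ULL only the 0x80
 * byte has its top bit set, so only that byte picks up the 0x1d reduction:
 *
 *	mask = x & 0x8080808080808080ULL	= 0x0000008000000000
 *	mask = (mask << 1) - (mask >> 7)	= 0x000000ff00000000
 *	x = ((x << 1) & 0xfefefefefefefefeULL) ^
 *	    (mask & 0x1d1d1d1d1d1d1d1dULL)	= 0x0204801d00000000
 *
 * Each byte of the result is that byte of the input multiplied by 2 in the
 * field: 0x01 -> 0x02, 0x02 -> 0x04, 0x40 -> 0x80, 0x80 -> 0x1d.
 */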

/*
 * Force reconstruction to use the general purpose method.
 */
int vdev_raidz_default_to_general;

/*
 * These two tables represent powers and logs of 2 in the Galois field defined
 * above. These values were computed by repeatedly multiplying by 2 as above.
 */
static const uint8_t vdev_raidz_pow2[256] = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
static const uint8_t vdev_raidz_log2[256] = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};

static void vdev_raidz_generate_parity(raidz_map_t *rm);

/*
 * Multiply a given number by 2 raised to the given power.
 */
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}
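/*
 * Illustrative note: vdev_raidz_exp2() also serves as a general field
 * multiply. A hypothetical helper (not present in this file) could be:
 *
 *	static uint8_t
 *	vdev_raidz_mul(uint8_t a, uint8_t b)
 *	{
 *		if (a == 0 || b == 0)
 *			return (0);
 *		return (vdev_raidz_exp2(a, vdev_raidz_log2[b]));
 *	}
 *
 * since multiplying by B is the same as multiplying by 2^log_2(B); the
 * reconstruction routines below use exactly this identity with precomputed
 * exponents (aexp, bexp, invlog).
 */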

static void
vdev_raidz_map_free(raidz_map_t *rm)
{
	int c;
	size_t size;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size);

		if (rm->rm_col[c].rc_gdata != NULL)
			zio_buf_free(rm->rm_col[c].rc_gdata,
			    rm->rm_col[c].rc_size);
	}

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	if (rm->rm_datacopy != NULL)
		zio_buf_free(rm->rm_datacopy, size);

	kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
}

static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	ASSERT0(rm->rm_freed);
	rm->rm_freed = 1;

	if (rm->rm_reports == 0)
		vdev_raidz_map_free(rm);
}

/*ARGSUSED*/
static void
vdev_raidz_cksum_free(void *arg, size_t ignored)
{
	raidz_map_t *rm = arg;

	ASSERT3U(rm->rm_reports, >, 0);

	if (--rm->rm_reports == 0 && rm->rm_freed != 0)
		vdev_raidz_map_free(rm);
}

static void
vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data)
{
	raidz_map_t *rm = zcr->zcr_cbdata;
	size_t c = zcr->zcr_cbinfo;
	size_t x;

	const char *good = NULL;
	const char *bad = rm->rm_col[c].rc_data;

	if (good_data == NULL) {
		zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE);
		return;
	}

	if (c < rm->rm_firstdatacol) {
		/*
		 * The first time through, calculate the parity blocks for
		 * the good data (this relies on the fact that the good
		 * data never changes for a given logical ZIO)
		 */
		if (rm->rm_col[0].rc_gdata == NULL) {
			char *bad_parity[VDEV_RAIDZ_MAXPARITY];
			char *buf;

			/*
			 * Set up the rm_col[]s to generate the parity for
			 * good_data, first saving the parity bufs and
			 * replacing them with buffers to hold the result.
			 */
			for (x = 0; x < rm->rm_firstdatacol; x++) {
				bad_parity[x] = rm->rm_col[x].rc_data;
				rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata =
				    zio_buf_alloc(rm->rm_col[x].rc_size);
			}

			/* fill in the data columns from good_data */
			buf = (char *)good_data;
			for (; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}

			/*
			 * Construct the parity from the good data.
			 */
			vdev_raidz_generate_parity(rm);

			/* restore everything back to its original state */
			for (x = 0; x < rm->rm_firstdatacol; x++)
				rm->rm_col[x].rc_data = bad_parity[x];

			buf = rm->rm_datacopy;
			for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) {
				rm->rm_col[x].rc_data = buf;
				buf += rm->rm_col[x].rc_size;
			}
		}

		ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL);
		good = rm->rm_col[c].rc_gdata;
	} else {
		/* adjust good_data to point at the start of our column */
		good = good_data;

		for (x = rm->rm_firstdatacol; x < c; x++)
			good += rm->rm_col[x].rc_size;
	}

	/* we drop the ereport if it ends up that the data was good */
	zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE);
}

/*
 * Invoked indirectly by zfs_ereport_start_checksum(), called
 * below when our read operation fails completely. The main point
 * is to keep a copy of everything we read from disk, so that at
 * vdev_raidz_cksum_finish() time we can compare it with the good data.
 */
static void
vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg)
{
	size_t c = (size_t)(uintptr_t)arg;
	caddr_t buf;

	raidz_map_t *rm = zio->io_vsd;
	size_t size;

	/* set up the report and bump the refcount */
	zcr->zcr_cbdata = rm;
	zcr->zcr_cbinfo = c;
	zcr->zcr_finish = vdev_raidz_cksum_finish;
	zcr->zcr_free = vdev_raidz_cksum_free;

	rm->rm_reports++;
	ASSERT3U(rm->rm_reports, >, 0);

	if (rm->rm_datacopy != NULL)
		return;

	/*
	 * It's the first time we're called for this raidz_map_t, so we need
	 * to copy the data aside; there's no guarantee that our zio's buffer
	 * won't be re-used for something else.
	 *
	 * Our parity data is already in separate buffers, so there's no need
	 * to copy them.
	 */

	size = 0;
	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++)
		size += rm->rm_col[c].rc_size;

	buf = rm->rm_datacopy = zio_buf_alloc(size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		raidz_col_t *col = &rm->rm_col[c];

		bcopy(col->rc_data, buf, col->rc_size);
		col->rc_data = buf;

		buf += col->rc_size;
	}
	ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size);
}

static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,
	vdev_raidz_cksum_report
};

static raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_map_t *rm;
	uint64_t b = zio->io_offset >> unit_shift;
	uint64_t s = zio->io_size >> unit_shift;
	uint64_t f = b % dcols;
	uint64_t o = (b / dcols) << unit_shift;
	uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;

	q = s / (dcols - nparity);
	r = s - q * (dcols - nparity);
	bc = (r == 0 ? 0 : r + nparity);
	tot = s + nparity * (q + (r == 0 ? 0 : 1));

	if (q == 0) {
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);

	rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_PUSHPAGE);

	rm->rm_cols = acols;
	rm->rm_scols = scols;
	rm->rm_bigcols = bc;
	rm->rm_skipstart = bc;
	rm->rm_missingdata = 0;
	rm->rm_missingparity = 0;
	rm->rm_firstdatacol = nparity;
	rm->rm_datacopy = NULL;
	rm->rm_reports = 0;
	rm->rm_freed = 0;
	rm->rm_ecksuminjected = 0;

	asize = 0;

	for (c = 0; c < scols; c++) {
		col = f + c;
		coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << unit_shift;
		}
		rm->rm_col[c].rc_devidx = col;
		rm->rm_col[c].rc_offset = coff;
		rm->rm_col[c].rc_data = NULL;
		rm->rm_col[c].rc_gdata = NULL;
		rm->rm_col[c].rc_error = 0;
		rm->rm_col[c].rc_tried = 0;
		rm->rm_col[c].rc_skipped = 0;

		if (c >= acols)
			rm->rm_col[c].rc_size = 0;
		else if (c < bc)
			rm->rm_col[c].rc_size = (q + 1) << unit_shift;
		else
			rm->rm_col[c].rc_size = q << unit_shift;

		asize += rm->rm_col[c].rc_size;
	}

	ASSERT3U(asize, ==, tot << unit_shift);
	rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
	ASSERT3U(rm->rm_nskip, <=, nparity);

	for (c = 0; c < rm->rm_firstdatacol; c++)
		rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size);

	rm->rm_col[c].rc_data = zio->io_data;

	for (c = c + 1; c < acols; c++)
		rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data +
		    rm->rm_col[c - 1].rc_size;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rm->rm_cols >= 2);
	ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size);

	if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		devidx = rm->rm_col[0].rc_devidx;
		o = rm->rm_col[0].rc_offset;
		rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx;
		rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset;
		rm->rm_col[1].rc_devidx = devidx;
		rm->rm_col[1].rc_offset = o;

		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
	return (rm);
}
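/*
 * A worked example of the geometry computed above (assumed parameters, for
 * illustration only): a raidz1 vdev with dcols = 5 children, 512-byte
 * sectors (unit_shift = 9), and a 2.5K write (s = 5 sectors) at offset 0:
 *
 *	q   = 5 / (5 - 1)	= 1	full rows
 *	r   = 5 - 1 * 4		= 1	remaining sector
 *	bc  = r + nparity	= 2	"big" columns of q + 1 sectors
 *	tot = 5 + 1 * (1 + 1)	= 7	data plus parity sectors
 *
 * Columns 0 (parity) and 1 get two sectors each, columns 2-4 get one, so
 * asize is 7 sectors; rm_asize rounds up to 8 sectors and rm_nskip =
 * roundup(7, 2) - 7 = 1 padding sector.
 */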

static void
vdev_raidz_generate_parity_p(raidz_map_t *rm)
{
	uint64_t *p, *src, pcount, ccount, i;
	int c;

	pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccount == pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p = *src;
			}
		} else {
			ASSERT(ccount <= pcount);
			for (i = 0; i < ccount; i++, src++, p++) {
				*p ^= *src;
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pq(raidz_map_t *rm)
{
	uint64_t *p, *q, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p = *src;
				*q = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++) {
				*p = 0;
				*q = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
			}
		}
	}
}

static void
vdev_raidz_generate_parity_pqr(raidz_map_t *rm)
{
	uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i;
	int c;

	pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size ==
	    rm->rm_col[VDEV_RAIDZ_R].rc_size);

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		p = rm->rm_col[VDEV_RAIDZ_P].rc_data;
		q = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
		r = rm->rm_col[VDEV_RAIDZ_R].rc_data;

		ccnt = rm->rm_col[c].rc_size / sizeof (src[0]);

		if (c == rm->rm_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p = *src;
				*q = *src;
				*r = *src;
			}
			for (; i < pcnt; i++, src++, p++, q++, r++) {
				*p = 0;
				*q = 0;
				*r = 0;
			}
		} else {
			ASSERT(ccnt <= pcnt);

			/*
			 * Apply the algorithm described above by multiplying
			 * the previous result and adding in the new value.
			 */
			for (i = 0; i < ccnt; i++, src++, p++, q++, r++) {
				*p ^= *src;

				VDEV_RAIDZ_64MUL_2(*q, mask);
				*q ^= *src;

				VDEV_RAIDZ_64MUL_4(*r, mask);
				*r ^= *src;
			}

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			for (; i < pcnt; i++, q++, r++) {
				VDEV_RAIDZ_64MUL_2(*q, mask);
				VDEV_RAIDZ_64MUL_4(*r, mask);
			}
		}
	}
}

/*
 * Generate RAID parity in the first virtual columns according to the number of
 * parity columns available.
 */
static void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	switch (rm->rm_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rm);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rm);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rm);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}

static int
vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, i;
	int x = tgts[0];
	int c;

	ASSERT(ntgts == 1);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(x < rm->rm_cols);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]));
	ASSERT(xcount > 0);

	src = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	dst = rm->rm_col[x].rc_data;
	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst = *src;
	}

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			continue;

		ccount = rm->rm_col[c].rc_size / sizeof (src[0]);
		count = MIN(ccount, xcount);

		for (i = 0; i < count; i++, dst++, src++) {
			*dst ^= *src;
		}
	}

	return (1 << VDEV_RAIDZ_P);
}

static int
vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint64_t *dst, *src, xcount, ccount, count, mask, i;
	uint8_t *b;
	int x = tgts[0];
	int c, j, exp;

	ASSERT(ntgts == 1);

	xcount = rm->rm_col[x].rc_size / sizeof (src[0]);
	ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0]));

	for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		src = rm->rm_col[c].rc_data;
		dst = rm->rm_col[x].rc_data;

		if (c == x)
			ccount = 0;
		else
			ccount = rm->rm_col[c].rc_size / sizeof (src[0]);

		count = MIN(ccount, xcount);

		if (c == rm->rm_firstdatacol) {
			for (i = 0; i < count; i++, dst++, src++) {
				*dst = *src;
			}
			for (; i < xcount; i++, dst++) {
				*dst = 0;
			}

		} else {
			for (i = 0; i < count; i++, dst++, src++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
				*dst ^= *src;
			}

			for (; i < xcount; i++, dst++) {
				VDEV_RAIDZ_64MUL_2(*dst, mask);
			}
		}
	}

	src = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	dst = rm->rm_col[x].rc_data;
	exp = 255 - (rm->rm_cols - 1 - x);

	for (i = 0; i < xcount; i++, dst++, src++) {
		*dst ^= *src;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, exp);
		}
	}

	return (1 << VDEV_RAIDZ_Q);
}

static int
vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp;
	void *pdata, *qdata;
	uint64_t xsize, ysize, i;
	int x = tgts[0];
	int y = tgts[1];

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rm->rm_firstdatacol);
	ASSERT(y < rm->rm_cols);

	ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xsize = rm->rm_col[x].rc_size;
	ysize = rm->rm_col[y].rc_size;

	rm->rm_col[VDEV_RAIDZ_P].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size);
	rm->rm_col[VDEV_RAIDZ_Q].rc_data =
	    zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size);
	rm->rm_col[x].rc_size = 0;
	rm->rm_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rm);

	rm->rm_col[x].rc_size = xsize;
	rm->rm_col[y].rc_size = ysize;

	p = pdata;
	q = qdata;
	pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data;
	qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data;
	xd = rm->rm_col[x].rc_data;
	yd = rm->rm_col[y].rc_data;

	/*
	 * We now have:
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */

	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^
		    vdev_raidz_exp2(*q ^ *qxy, bexp);

		if (i < ysize)
			*yd = *p ^ *pxy ^ *xd;
	}

	zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data,
	    rm->rm_col[VDEV_RAIDZ_P].rc_size);
	zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data,
	    rm->rm_col[VDEV_RAIDZ_Q].rc_size);

	/*
	 * Restore the saved parity data.
	 */
	rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata;
	rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata;

	return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q));
}

/* BEGIN CSTYLED */
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
 *
 *            __   __                     __     __
 *            |     |       __     __     |  p_0  |
 *            |  V  |       |  D_0  |     | p_m-1 |
 *            |     |   x   |   :   |  =  |  d_0  |
 *            |  I  |       | D_n-1 |     |   :   |
 *            |     |       ~~     ~~     | d_n-1 |
 *            ~~   ~~                     ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen for simplicity, speedy
 * computation, and linear separability.
 *
 *           __                   __                   __     __
 *           | 1     ..    1 1 1  |                    |  p_0  |
 *           | 2^n-1 ..    4 2 1  |    __     __       |   :   |
 *           | 4^n-1 ..   16 4 1  |    |  D_0  |       | p_m-1 |
 *           | 1     ..    0 0 0  |    |  D_1  |       |  d_0  |
 *           | 0     ..    0 0 0  | x  |  D_2  |   =   |  d_1  |
 *           | :           : : :  |    |   :   |       |  d_2  |
 *           | 0     ..    1 0 0  |    | D_n-1 |       |   :   |
 *           | 0     ..    0 1 0  |    ~~     ~~       |   :   |
 *           | 0     ..    0 0 1  |                    | d_n-1 |
 *           ~~                   ~~                   ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *           __                                __
 *           |   1   1   1   1   1   1   1   1 |
 *           | 128  64  32  16   8   4   2   1 | <-----+-+-- missing disks
 *           |  19 205 116  29  64  16   4   1 |      / /
 *           |   1   0   0   0   0   0   0   0 |     / /
 *           |   0   1   0   0   0   0   0   0 | <--' /
 *  (V|I)  = |   0   0   1   0   0   0   0   0 | <---'
 *           |   0   0   0   1   0   0   0   0 |
 *           |   0   0   0   0   1   0   0   0 |
 *           |   0   0   0   0   0   1   0   0 |
 *           |   0   0   0   0   0   0   1   0 |
 *           |   0   0   0   0   0   0   0   1 |
 *           ~~                                ~~
 *           __                                __
 *           |   1   1   1   1   1   1   1   1 |
 *           |  19 205 116  29  64  16   4   1 |
 *           |   1   0   0   0   0   0   0   0 |
 *  (V|I)' = |   0   0   0   1   0   0   0   0 |
 *           |   0   0   0   0   1   0   0   0 |
 *           |   0   0   0   0   0   1   0   0 |
 *           |   0   0   0   0   0   0   1   0 |
 *           |   0   0   0   0   0   0   0   1 |
 *           ~~                                ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 *      __                                                                 __
 *      |   1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0 |
 *      |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0 |
 *      |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *      |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *      |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *      |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *      |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *      |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *      ~~                                                                 ~~
 *      __                                                                 __
 *      |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *      |   1   1   1   1   1   1   1   1     1   0   0   0   0   0   0   0 |
 *      |  19 205 116  29  64  16   4   1     0   1   0   0   0   0   0   0 |
 *      |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *      |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *      |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *      |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *      |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *      ~~                                                                 ~~
 *      __                                                                 __
 *      |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *      |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 *      |   0 205 116   0   0   0   0   0     0   1  19  29  64  16   4   1 |
 *      |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *      |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *      |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *      |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *      |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *      ~~                                                                 ~~
 *      __                                                                 __
 *      |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *      |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 *      |   0   0 185   0   0   0   0   0   205   1 222 208 141 221 201 204 |
 *      |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *      |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *      |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *      |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *      |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *      ~~                                                                 ~~
 *      __                                                                 __
 *      |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *      |   0   1   1   0   0   0   0   0     1   0   1   1   1   1   1   1 |
 *      |   0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209 |
 *      |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *      |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *      |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *      |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *      |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *      ~~                                                                 ~~
 *      __                                                                 __
 *      |   1   0   0   0   0   0   0   0     0   0   1   0   0   0   0   0 |
 *      |   0   1   0   0   0   0   0   0   167 100   5  41 159 169 217 208 |
 *      |   0   0   1   0   0   0   0   0   166 100   4  40 158 168 216 209 |
 *      |   0   0   0   1   0   0   0   0     0   0   0   1   0   0   0   0 |
 *      |   0   0   0   0   1   0   0   0     0   0   0   0   1   0   0   0 |
 *      |   0   0   0   0   0   1   0   0     0   0   0   0   0   1   0   0 |
 *      |   0   0   0   0   0   0   1   0     0   0   0   0   0   0   1   0 |
 *      |   0   0   0   0   0   0   0   1     0   0   0   0   0   0   0   1 |
 *      ~~                                                                 ~~
 *
 *                   __                                __
 *                   |   0   0   1   0   0   0   0   0 |
 *                   | 167 100   5  41 159 169 217 208 |
 *                   | 166 100   4  40 158 168 216 209 |
 *       (V|I)'^-1 = |   0   0   0   1   0   0   0   0 |
 *                   |   0   0   0   0   1   0   0   0 |
 *                   |   0   0   0   0   0   1   0   0 |
 *                   |   0   0   0   0   0   0   1   0 |
 *                   |   0   0   0   0   0   0   0   1 |
 *                   ~~                                ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
/* END CSTYLED */

static void
vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rm->rm_cols - rm->rm_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}

static void
vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rm->rm_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rm->rm_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rm->rm_firstdatacol);
			jj = used[j] - rm->rm_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows is properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}

static void
vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY];
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY];
	uint8_t log = 0, val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_PUSHPAGE);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rm->rm_cols);

		src = rm->rm_col[c].rc_data;
		ccount = rm->rm_col[c].rc_size;
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rm->rm_firstdatacol;
			ASSERT3U(cc, >=, rm->rm_firstdatacol);
			ASSERT3U(cc, <, rm->rm_cols);
			ASSERT3U(cc, !=, c);

			dst[j] = rm->rm_col[cc].rc_data;
			dcount[j] = rm->rm_col[cc].rc_size;
		}

		ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0);

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}

static int
vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts)
{
	int n, i, c, t, tt;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];

	uint8_t *p, *pp;
	size_t psize;

	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	int code = 0;


	n = rm->rm_cols - rm->rm_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rm->rm_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rm->rm_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rm->rm_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		code |= 1 << c;

		parity_map[i] = c;
		i++;
	}

	ASSERT(code != 0);
	ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY);

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_PUSHPAGE);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rm->rm_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	return (code);
}

static int
vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c;
	int code;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	/*
	 * The tgts list must already be sorted.
	 */
	for (i = 1; i < nt; i++) {
		ASSERT(t[i] > t[i - 1]);
	}

	nbadparity = rm->rm_firstdatacol;
	nbaddata = rm->rm_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rm->rm_cols; c++) {
		if (c < rm->rm_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rm->rm_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rm->rm_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	dt = &tgts[nbadparity];

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	if (!vdev_raidz_default_to_general) {
		switch (nbaddata) {
		case 1:
			if (parity_valid[VDEV_RAIDZ_P])
				return (vdev_raidz_reconstruct_p(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_q(rm, dt, 1));

			ASSERT(rm->rm_firstdatacol > 2);
			break;

		case 2:
			ASSERT(rm->rm_firstdatacol > 1);

			if (parity_valid[VDEV_RAIDZ_P] &&
			    parity_valid[VDEV_RAIDZ_Q])
				return (vdev_raidz_reconstruct_pq(rm, dt, 2));

			ASSERT(rm->rm_firstdatacol > 2);

			break;
		}
	}

	code = vdev_raidz_reconstruct_general(rm, tgts, ntgts);
	ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY));
	ASSERT(code > 0);
	return (code);
}
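/*
 * Illustrative note: the value returned above is a bitmask of the parity
 * columns actually used, e.g. (1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q) == 3
 * when both P and Q contributed. vdev_raidz_combrec() below uses this code
 * to index the raidz_corrected[] counters.
 */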

static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *ashift)
{
	vdev_t *cvd;
	uint64_t nparity = vd->vdev_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (EINVAL);
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*ashift = MAX(*ashift, cvd->vdev_ashift);
	}

	*asize *= vd->vdev_children;
	*max_asize *= vd->vdev_children;

	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

static void
vdev_raidz_close(vdev_t *vd)
{
	int c;

	for (c = 0; c < vd->vdev_children; c++)
		vdev_close(vd->vdev_child[c]);
}

static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize)
{
	uint64_t asize;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	uint64_t cols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;

	asize = ((psize - 1) >> ashift) + 1;
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = roundup(asize, nparity + 1) << ashift;

	return (asize);
}
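/*
 * A worked example of the asize computation above (assumed geometry, for
 * illustration only): a 6-wide raidz2 (cols = 6, nparity = 2) with
 * ashift = 12 and psize = 128K:
 *
 *	asize = ((131072 - 1) >> 12) + 1		= 32 data sectors
 *	asize += 2 * ((32 + 6 - 2 - 1) / (6 - 2))	= 32 + 16 = 48
 *	asize = roundup(48, 3) << 12			= 48 * 4K = 192K
 *
 * i.e. two parity sectors are charged for every four data sectors, and the
 * total is rounded up to a multiple of nparity + 1 sectors (48 already is).
 */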

static void
vdev_raidz_child_done(zio_t *zio)
{
	raidz_col_t *rc = zio->io_private;

	rc->rc_error = zio->io_error;
	rc->rc_tried = 1;
	rc->rc_skipped = 0;
}

static int
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_t *cvd;
	raidz_map_t *rm;
	raidz_col_t *rc;
	int c, i;

	rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children,
	    vd->vdev_nparity);

	ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_generate_parity(rm);

		for (c = 0; c < rm->rm_cols; c++) {
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}

		/*
		 * Generate optional I/Os for any skipped sectors to improve
		 * aggregation contiguity.
		 */
		for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) {
			ASSERT(c <= rm->rm_scols);
			if (c == rm->rm_scols)
				c = 0;
			rc = &rm->rm_col[c];
			cvd = vd->vdev_child[rc->rc_devidx];
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset + rc->rc_size, NULL,
			    1 << tvd->vdev_ashift,
			    zio->io_type, zio->io_priority,
			    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
		}

		return (ZIO_PIPELINE_CONTINUE);
	}

	ASSERT(zio->io_type == ZIO_TYPE_READ);

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (c = rm->rm_cols - 1; c >= 0; c--) {
		rc = &rm->rm_col[c];
		cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ENXIO;
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rm->rm_firstdatacol)
				rm->rm_missingdata++;
			else
				rm->rm_missingparity++;
			rc->rc_error = ESTALE;
			rc->rc_skipped = 1;
			continue;
		}
		if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_data, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}


/*
 * Report a checksum error for a child of a RAID-Z device.
 */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
{
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
		zio_bad_cksum_t zbc;
		raidz_map_t *rm = zio->io_vsd;

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);

		zbc.zbc_has_cksum = 0;
		zbc.zbc_injected = rm->rm_ecksuminjected;

		zfs_ereport_post_checksum(zio->io_spa, vd, zio,
		    rc->rc_offset, rc->rc_size, rc->rc_data, bad_data,
		    &zbc);
	}
}

/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t zbc;
	raidz_map_t *rm = zio->io_vsd;
	int ret;

	bzero(&zbc, sizeof (zio_bad_cksum_t));

	ret = zio_checksum_error(zio, &zbc);
	if (ret != 0 && zbc.zbc_injected != 0)
		rm->rm_ecksuminjected = 1;

	return (ret);
}

/*
 * Generate the parity from the data columns. If we tried and were able to
 * read the parity without error, verify that the generated parity matches the
 * data we read. If it doesn't, we fire off a checksum error. Return the
 * number of such failures.
 */
static int
raidz_parity_verify(zio_t *zio, raidz_map_t *rm)
{
	void *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_col_t *rc;

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		orig[c] = zio_buf_alloc(rc->rc_size);
		bcopy(rc->rc_data, orig[c], rc->rc_size);
	}

	vdev_raidz_generate_parity(rm);

	for (c = 0; c < rm->rm_firstdatacol; c++) {
		rc = &rm->rm_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;
		if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) {
			raidz_checksum_error(zio, rc, orig[c]);
			rc->rc_error = ECKSUM;
			ret++;
		}
		zio_buf_free(orig[c], rc->rc_size);
	}

	return (ret);
}

/*
 * Keep statistics on all the ways that we used parity to correct data.
 */
static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY];

static int
vdev_raidz_worst_error(raidz_map_t *rm)
{
	int c, error = 0;

	for (c = 0; c < rm->rm_cols; c++)
		error = zio_worst_error(error, rm->rm_col[c].rc_error);

	return (error);
}
1709
45d1cae3
BB
1710/*
1711 * Iterate over all combinations of bad data and attempt a reconstruction.
1712 * Note that the algorithm below is non-optimal because it doesn't take into
1713 * account how reconstruction is actually performed. For example, with
1714 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
1715 * is targeted as invalid as if columns 1 and 4 are targeted since in both
1716 * cases we'd only use parity information in column 0.
1717 */
1718static int
1719vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors)
1720{
1721 raidz_map_t *rm = zio->io_vsd;
1722 raidz_col_t *rc;
1723 void *orig[VDEV_RAIDZ_MAXPARITY];
1724 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
1725 int *tgts = &tstore[1];
5631c038 1726 int curr, next, i, c, n;
45d1cae3
BB
1727 int code, ret = 0;
1728
1729 ASSERT(total_errors < rm->rm_firstdatacol);
1730
1731 /*
1732 * This simplifies one edge condition.
1733 */
1734 tgts[-1] = -1;
1735
1736 for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) {
1737 /*
1738 * Initialize the targets array by finding the first n columns
1739 * that contain no error.
1740 *
1741 * If there were no data errors, we need to ensure that we're
1742 * always explicitly attempting to reconstruct at least one
1743 * data column. To do this, we simply push the highest target
1744 * up into the data columns.
1745 */
1746 for (c = 0, i = 0; i < n; i++) {
1747 if (i == n - 1 && data_errors == 0 &&
1748 c < rm->rm_firstdatacol) {
1749 c = rm->rm_firstdatacol;
1750 }
1751
1752 while (rm->rm_col[c].rc_error != 0) {
1753 c++;
1754 ASSERT3S(c, <, rm->rm_cols);
1755 }
1756
1757 tgts[i] = c++;
1758 }
1759
1760 /*
1761 * Setting tgts[n] simplifies the other edge condition.
1762 */
1763 tgts[n] = rm->rm_cols;
1764
1765 /*
1766 * These buffers were allocated in previous iterations.
1767 */
1768 for (i = 0; i < n - 1; i++) {
1769 ASSERT(orig[i] != NULL);
1770 }
1771
1772 orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size);
1773
5631c038
BB
1774 curr = 0;
1775 next = tgts[curr];
45d1cae3 1776
5631c038
BB
1777 while (curr != n) {
1778 tgts[curr] = next;
1779 curr = 0;
45d1cae3
BB
1780
1781 /*
1782 * Save off the original data that we're going to
1783 * attempt to reconstruct.
1784 */
1785 for (i = 0; i < n; i++) {
1786 ASSERT(orig[i] != NULL);
1787 c = tgts[i];
1788 ASSERT3S(c, >=, 0);
1789 ASSERT3S(c, <, rm->rm_cols);
1790 rc = &rm->rm_col[c];
1791 bcopy(rc->rc_data, orig[i], rc->rc_size);
1792 }
1793
1794 /*
1795 * Attempt a reconstruction and exit the outer loop on
1796 * success.
1797 */
1798 code = vdev_raidz_reconstruct(rm, tgts, n);
428870ff 1799 if (raidz_checksum_verify(zio) == 0) {
45d1cae3
BB
1800 atomic_inc_64(&raidz_corrected[code]);
1801
1802 for (i = 0; i < n; i++) {
1803 c = tgts[i];
1804 rc = &rm->rm_col[c];
1805 ASSERT(rc->rc_error == 0);
1806 if (rc->rc_tried)
428870ff
BB
1807 raidz_checksum_error(zio, rc,
1808 orig[i]);
45d1cae3
BB
1809 rc->rc_error = ECKSUM;
1810 }
1811
1812 ret = code;
1813 goto done;
1814 }
1815
1816 /*
1817 * Restore the original data.
1818 */
1819 for (i = 0; i < n; i++) {
1820 c = tgts[i];
1821 rc = &rm->rm_col[c];
1822 bcopy(orig[i], rc->rc_data, rc->rc_size);
1823 }
1824
1825 do {
1826 /*
5631c038 1827 * Find the next valid column after the curr
45d1cae3
BB
1828 * position..
1829 */
5631c038 1830 for (next = tgts[curr] + 1;
45d1cae3
BB
1831 next < rm->rm_cols &&
1832 rm->rm_col[next].rc_error != 0; next++)
1833 continue;
1834
5631c038 1835 ASSERT(next <= tgts[curr + 1]);
45d1cae3
BB
1836
1837 /*
1838 * If that spot is available, we're done here.
1839 */
5631c038 1840 if (next != tgts[curr + 1])
1841 break;
1842
1843 /*
1844 * Otherwise, find the next valid column after
1845 * the previous position.
1846 */
5631c038 1847 for (c = tgts[curr - 1] + 1;
1848 rm->rm_col[c].rc_error != 0; c++)
1849 continue;
1850
1851 tgts[curr] = c;
1852 curr++;
45d1cae3 1853
5631c038 1854 } while (curr != n);
1855 }
1856 }
1857 n--;
1858done:
1859 for (i = 0; i < n; i++) {
1860 zio_buf_free(orig[i], rm->rm_col[0].rc_size);
1861 }
1862
1863 return (ret);
1864}
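
/*
 * Illustrative sketch (not part of the original source): the loop in
 * vdev_raidz_combrec() above amounts to enumerating, for n = 1 up to the
 * parity count, every n-element subset of the columns that did not report
 * an error and treating that subset as the presumed-bad columns. A minimal
 * user-space rendering of that enumeration follows; combrec_sketch,
 * choose_targets, try_reconstruct and col_errored are hypothetical names,
 * and the real code additionally forces the last target into the data
 * columns when no data column reported an error, which this sketch omits.
 */
#include <stdbool.h>

bool try_reconstruct(const int *tgts, int n);	/* stand-in: rebuild + verify checksum */

static bool
choose_targets(const int *cand, int ncand, int start, int *tgts, int k, int n)
{
	if (k == n)
		return (try_reconstruct(tgts, n));
	/* Pick candidate i for slot k, leaving enough candidates for the rest. */
	for (int i = start; i <= ncand - (n - k); i++) {
		tgts[k] = cand[i];
		if (choose_targets(cand, ncand, i + 1, tgts, k + 1, n))
			return (true);
	}
	return (false);
}

static bool
combrec_sketch(int ncols, int nparity, const bool *col_errored)
{
	int cand[64], tgts[8];	/* assumes ncols <= 64 and nparity <= 8 */
	int ncand = 0;

	/* Only columns without a known error are candidates for "silently bad". */
	for (int c = 0; c < ncols; c++)
		if (!col_errored[c])
			cand[ncand++] = c;

	/* Try the cheapest assumption first: one bad column, then two, ... */
	for (int n = 1; n <= nparity; n++)
		if (choose_targets(cand, ncand, 0, tgts, 0, n))
			return (true);

	return (false);
}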
1865
b128c09f 1866static void
1867vdev_raidz_io_done(zio_t *zio)
1868{
1869 vdev_t *vd = zio->io_vd;
1870 vdev_t *cvd;
1871 raidz_map_t *rm = zio->io_vsd;
d4ed6673 1872 raidz_col_t *rc = NULL;
1873 int unexpected_errors = 0;
1874 int parity_errors = 0;
1875 int parity_untried = 0;
1876 int data_errors = 0;
b128c09f 1877 int total_errors = 0;
1878 int n, c;
1879 int tgts[VDEV_RAIDZ_MAXPARITY];
1880 int code;
1881
1882 ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */
1883
1884 ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
1885 ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
1886
1887 for (c = 0; c < rm->rm_cols; c++) {
1888 rc = &rm->rm_col[c];
1889
34dc7c2f 1890 if (rc->rc_error) {
b128c09f 1891 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
1892
1893 if (c < rm->rm_firstdatacol)
1894 parity_errors++;
1895 else
1896 data_errors++;
1897
1898 if (!rc->rc_skipped)
1899 unexpected_errors++;
1900
b128c09f 1901 total_errors++;
1902 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
1903 parity_untried++;
1904 }
1905 }
1906
1907 if (zio->io_type == ZIO_TYPE_WRITE) {
1908 /*
1909 * XXX -- for now, treat partial writes as a success.
1910 * (If we couldn't write enough columns to reconstruct
1911 * the data, the I/O failed. Otherwise, good enough.)
1912 *
1913 * Now that we support write reallocation, it would be better
1914 * to treat partial failure as real failure unless there are
1915 * no non-degraded top-level vdevs left, and not update DTLs
1916 * if we intend to reallocate.
1917 */
1918 /* XXPOLICY */
1919 if (total_errors > rm->rm_firstdatacol)
1920 zio->io_error = vdev_raidz_worst_error(rm);
34dc7c2f 1921
b128c09f 1922 return;
1923 }
1924
1925 ASSERT(zio->io_type == ZIO_TYPE_READ);
1926 /*
1927 * There are three potential phases for a read:
1928 * 1. produce valid data from the columns read
1929 * 2. read all disks and try again
1930 * 3. perform combinatorial reconstruction
1931 *
1932 * Each phase is progressively both more expensive and less likely to
1933 * occur. If we encounter more errors than we can repair or all phases
1934 * fail, we have no choice but to return an error.
1935 */
1936
1937 /*
1938 * If the number of errors we saw was correctable -- less than or equal
1939 * to the number of parity disks read -- attempt to produce data that
1940 * has a valid checksum. Naturally, this case applies in the absence of
1941 * any errors.
1942 */
b128c09f 1943 if (total_errors <= rm->rm_firstdatacol - parity_untried) {
45d1cae3 1944 if (data_errors == 0) {
428870ff 1945 if (raidz_checksum_verify(zio) == 0) {
1946 /*
1947 * If we read parity information (unnecessarily
1948 * as it happens since no reconstruction was
1949 * needed) regenerate and verify the parity.
1950 * We also regenerate parity when resilvering
1951 * so we can write it out to the failed device
1952 * later.
1953 */
1954 if (parity_errors + parity_untried <
1955 rm->rm_firstdatacol ||
1956 (zio->io_flags & ZIO_FLAG_RESILVER)) {
1957 n = raidz_parity_verify(zio, rm);
1958 unexpected_errors += n;
1959 ASSERT(parity_errors + n <=
1960 rm->rm_firstdatacol);
1961 }
1962 goto done;
1963 }
45d1cae3 1964 } else {
1965 /*
1966 * We either attempt to read all the parity columns or
1967 * none of them. If we didn't try to read parity, we
1968 * wouldn't be here in the correctable case. There must
1969 * also have been fewer parity errors than parity
1970 * columns or, again, we wouldn't be in this code path.
1971 */
1972 ASSERT(parity_untried == 0);
1973 ASSERT(parity_errors < rm->rm_firstdatacol);
1974
1975 /*
45d1cae3 1976 * Identify the data columns that reported an error.
34dc7c2f 1977 */
45d1cae3 1978 n = 0;
1979 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
1980 rc = &rm->rm_col[c];
1981 if (rc->rc_error != 0) {
1982 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
1983 tgts[n++] = c;
1984 }
34dc7c2f 1985 }
34dc7c2f 1986
1987 ASSERT(rm->rm_firstdatacol >= n);
1988
1989 code = vdev_raidz_reconstruct(rm, tgts, n);
34dc7c2f 1990
428870ff 1991 if (raidz_checksum_verify(zio) == 0) {
45d1cae3 1992 atomic_inc_64(&raidz_corrected[code]);
1993
1994 /*
1995 * If we read more parity disks than were used
1996 * for reconstruction, confirm that the other
1997 * parity disks produced correct data. This
1998 * routine is suboptimal in that it regenerates
1999 * the parity that we already used in addition
2000 * to the parity that we're attempting to
2001 * verify, but this should be a relatively
2002 * uncommon case, and can be optimized if it
2003 * becomes a problem. Note that we regenerate
2004 * parity when resilvering so we can write it
2005 * out to failed devices later.
34dc7c2f 2006 */
45d1cae3 2007 if (parity_errors < rm->rm_firstdatacol - n ||
2008 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2009 n = raidz_parity_verify(zio, rm);
2010 unexpected_errors += n;
2011 ASSERT(parity_errors + n <=
2012 rm->rm_firstdatacol);
2013 }
2014
2015 goto done;
2016 }
2017 }
2018 }
2019
2020 /*
2021 * This isn't a typical situation -- either we got a read error or
2022 * a child silently returned bad data. Read every block so we can
2023 * try again with as much data and parity as we can track down. If
2024 * we've already been through once before, all children will be marked
2025 * as tried so we'll proceed to combinatorial reconstruction.
2026 */
2027 unexpected_errors = 1;
2028 rm->rm_missingdata = 0;
2029 rm->rm_missingparity = 0;
2030
2031 for (c = 0; c < rm->rm_cols; c++) {
2032 if (rm->rm_col[c].rc_tried)
2033 continue;
2034
2035 zio_vdev_io_redone(zio);
2036 do {
2037 rc = &rm->rm_col[c];
2038 if (rc->rc_tried)
2039 continue;
2040 zio_nowait(zio_vdev_child_io(zio, NULL,
2041 vd->vdev_child[rc->rc_devidx],
2042 rc->rc_offset, rc->rc_data, rc->rc_size,
b128c09f 2043 zio->io_type, zio->io_priority, 0,
2044 vdev_raidz_child_done, rc));
2045 } while (++c < rm->rm_cols);
34dc7c2f 2046
b128c09f 2047 return;
2048 }
2049
2050 /*
2051 * At this point we've attempted to reconstruct the data given the
2052 * errors we detected, and we've attempted to read all columns. There
2053 * must, therefore, be one or more additional problems -- silent errors
2054 * resulting in invalid data rather than explicit I/O errors resulting
2055 * in absent data. We check if there is enough additional data to
2056 * possibly reconstruct the data and then perform combinatorial
2057 * reconstruction over all possible combinations. If that fails,
2058 * we're cooked.
34dc7c2f 2059 */
428870ff 2060 if (total_errors > rm->rm_firstdatacol) {
b128c09f 2061 zio->io_error = vdev_raidz_worst_error(rm);
34dc7c2f 2062
2063 } else if (total_errors < rm->rm_firstdatacol &&
2064 (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) {
34dc7c2f 2065 /*
2066 * If we didn't use all the available parity for the
2067 * combinatorial reconstruction, verify that the remaining
2068 * parity is correct.
34dc7c2f 2069 */
2070 if (code != (1 << rm->rm_firstdatacol) - 1)
2071 (void) raidz_parity_verify(zio, rm);
2072 } else {
34dc7c2f 2073 /*
2074 * We're here because either:
2075 *
 2076		 * total_errors == rm_firstdatacol, or
2077 * vdev_raidz_combrec() failed
2078 *
2079 * In either case, there is enough bad data to prevent
2080 * reconstruction.
2081 *
2082 * Start checksum ereports for all children which haven't
2083 * failed, and the IO wasn't speculative.
34dc7c2f 2084 */
45d1cae3 2085 zio->io_error = ECKSUM;
34dc7c2f 2086
2087 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2088 for (c = 0; c < rm->rm_cols; c++) {
2089 rc = &rm->rm_col[c];
2090 if (rc->rc_error == 0) {
2091 zio_bad_cksum_t zbc;
2092 zbc.zbc_has_cksum = 0;
2093 zbc.zbc_injected =
2094 rm->rm_ecksuminjected;
2095
2096 zfs_ereport_start_checksum(
2097 zio->io_spa,
2098 vd->vdev_child[rc->rc_devidx],
2099 zio, rc->rc_offset, rc->rc_size,
2100 (void *)(uintptr_t)c, &zbc);
2101 }
34dc7c2f 2102 }
2103 }
2104 }
2105
2106done:
2107 zio_checksum_verified(zio);
2108
fb5f0bc8 2109 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
34dc7c2f 2110 (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) {
2111 /*
2112 * Use the good data we have in hand to repair damaged children.
34dc7c2f 2113 */
2114 for (c = 0; c < rm->rm_cols; c++) {
2115 rc = &rm->rm_col[c];
2116 cvd = vd->vdev_child[rc->rc_devidx];
2117
2118 if (rc->rc_error == 0)
2119 continue;
2120
b128c09f 2121 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2122 rc->rc_offset, rc->rc_data, rc->rc_size,
2123 ZIO_TYPE_WRITE, zio->io_priority,
2124 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2125 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
34dc7c2f 2126 }
34dc7c2f 2127 }
2128}
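
/*
 * Illustrative sketch (not part of the original source): the read half of
 * vdev_raidz_io_done() above reduces to the three-phase decision order
 * described in its comment. The helpers named here (errors_correctable,
 * reconstruct_and_verify, all_columns_tried, reread_all_columns,
 * combinatorial_reconstruct) are hypothetical stand-ins; the real function
 * also verifies otherwise-unused parity and issues repair writes, which
 * this sketch leaves out.
 */
#include <stdbool.h>

bool errors_correctable(void);		/* total errors <= parity actually read */
bool reconstruct_and_verify(void);	/* phase 1: rebuild from the columns in hand */
bool all_columns_tried(void);
void reread_all_columns(void);		/* phase 2: issue reads for untried children */
bool combinatorial_reconstruct(void);	/* phase 3: vdev_raidz_combrec() */

typedef enum { READ_OK, READ_RETRY, READ_FAIL } read_outcome_t;

static read_outcome_t
raidz_read_recovery_sketch(void)
{
	/* Phase 1: cheapest path, uses only what was already read. */
	if (errors_correctable() && reconstruct_and_verify())
		return (READ_OK);

	/* Phase 2: read every remaining column and run the pipeline again. */
	if (!all_columns_tried()) {
		reread_all_columns();
		return (READ_RETRY);
	}

	/* Phase 3: exhaustively guess which columns are silently bad. */
	if (combinatorial_reconstruct())
		return (READ_OK);

	/* All phases failed: the caller sees a checksum error. */
	return (READ_FAIL);
}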
2129
2130static void
2131vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2132{
2133 if (faulted > vd->vdev_nparity)
2134 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2135 VDEV_AUX_NO_REPLICAS);
2136 else if (degraded + faulted != 0)
2137 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2138 else
2139 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2140}
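
/*
 * Worked example (not part of the original source): the state mapping in
 * vdev_raidz_state_change() above, restated for a hypothetical raidz2 vdev
 * where vdev_nparity == 2. The names below (raidz_state_sketch, ST_*) are
 * illustrative only.
 */
#include <assert.h>

typedef enum { ST_CANT_OPEN, ST_DEGRADED, ST_HEALTHY } state_sketch_t;

static state_sketch_t
raidz_state_sketch(int nparity, int faulted, int degraded)
{
	if (faulted > nparity)
		return (ST_CANT_OPEN);	/* more failures than parity can cover */
	if (degraded + faulted != 0)
		return (ST_DEGRADED);	/* still readable, redundancy reduced */
	return (ST_HEALTHY);
}

static void
raidz_state_sketch_example(void)
{
	assert(raidz_state_sketch(2, 3, 0) == ST_CANT_OPEN);	/* three faulted children is fatal */
	assert(raidz_state_sketch(2, 2, 0) == ST_DEGRADED);	/* two faulted: at the parity limit */
	assert(raidz_state_sketch(2, 0, 1) == ST_DEGRADED);
	assert(raidz_state_sketch(2, 0, 0) == ST_HEALTHY);
}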
2141
2142vdev_ops_t vdev_raidz_ops = {
2143 vdev_raidz_open,
2144 vdev_raidz_close,
2145 vdev_raidz_asize,
2146 vdev_raidz_io_start,
2147 vdev_raidz_io_done,
2148 vdev_raidz_state_change,
2149 NULL,
2150 NULL,
2151 VDEV_TYPE_RAIDZ, /* name of this vdev type */
2152 B_FALSE /* not a leaf vdev */
2153};