]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | ||
22 | /* | |
23 | * Copyright 2007 Sun Microsystems, Inc. All rights reserved. | |
24 | * Use is subject to license terms. | |
25 | */ | |
26 | ||
27 | #pragma ident "@(#)vdev_raidz.c 1.10 07/11/27 SMI" | |
28 | ||
29 | #include <sys/zfs_context.h> | |
30 | #include <sys/spa.h> | |
31 | #include <sys/vdev_impl.h> | |
32 | #include <sys/zio.h> | |
33 | #include <sys/zio_checksum.h> | |
34 | #include <sys/fs/zfs.h> | |
35 | #include <sys/fm/fs/zfs.h> | |
36 | ||
37 | /* | |
38 | * Virtual device vector for RAID-Z. | |
39 | * | |
40 | * This vdev supports both single and double parity. For single parity, we | |
41 | * use a simple XOR of all the data columns. For double parity, we use both | |
42 | * the simple XOR as well as a technique described in "The mathematics of | |
43 | * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8), | |
44 | * over the integers expressible in a single byte. Briefly, the operations on | |
45 | * the field are defined as follows: | |
46 | * | |
47 | * o addition (+) is represented by a bitwise XOR | |
48 | * o subtraction (-) is therefore identical to addition: A + B = A - B | |
49 | * o multiplication of A by 2 is defined by the following bitwise expression: | |
50 | * (A * 2)_7 = A_6 | |
51 | * (A * 2)_6 = A_5 | |
52 | * (A * 2)_5 = A_4 | |
53 | * (A * 2)_4 = A_3 + A_7 | |
54 | * (A * 2)_3 = A_2 + A_7 | |
55 | * (A * 2)_2 = A_1 + A_7 | |
56 | * (A * 2)_1 = A_0 | |
57 | * (A * 2)_0 = A_7 | |
58 | * | |
59 | * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). | |
60 | * | |
61 | * Observe that any number in the field (except for 0) can be expressed as a | |
62 | * power of 2 -- a generator for the field. We store a table of the powers of | |
63 | * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can | |
64 | * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather | |
65 | * than field addition). The inverse of a field element A (A^-1) is A^254. | |
66 | * | |
67 | * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1, | |
68 | * can be expressed by field operations: | |
69 | * | |
70 | * P = D_0 + D_1 + ... + D_n-2 + D_n-1 | |
71 | * Q = 2^(n-1) * D_0 + 2^(n-2) * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 | |
72 | * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 | |
73 | * | |
74 | * See the reconstruction code below for how P and Q can be used individually or | |
75 | * in concert to recover missing data columns. | |
76 | */ | |
77 | ||
78 | typedef struct raidz_col { /* State for one column of a RAID-Z I/O map */ | |
79 | uint64_t rc_devidx; /* index into the parent's vdev_child[] for this column's I/O */ | |
80 | uint64_t rc_offset; /* byte offset of this column on the child device */ | |
81 | uint64_t rc_size; /* I/O size in bytes (may be shorter than the parity column's) */ | |
82 | void *rc_data; /* I/O buffer: allocated for parity columns, a slice of zio->io_data for data columns */ | |
83 | int rc_error; /* I/O error for this device; 0 when none was recorded */ | |
84 | uint8_t rc_tried; /* Set when the child I/O completed, or when the device was unreadable and not attempted */ | |
85 | uint8_t rc_skipped; /* Set when no I/O was issued (device unreadable or stale per its DTL) */ | |
86 | } raidz_col_t; | |
87 | ||
88 | typedef struct raidz_map { /* Layout of one zio across the RAID-Z children */ | |
89 | uint64_t rm_cols; /* Column count (number of rm_col[] entries allocated) */ | |
90 | uint64_t rm_bigcols; /* Number of oversized columns, i.e. those one unit longer than the rest */ | |
91 | uint64_t rm_asize; /* Actual total I/O size, rounded up to a multiple of (nparity + 1) units */ | |
92 | uint64_t rm_missingdata; /* Count of data columns skipped at read time */ | |
93 | uint64_t rm_missingparity; /* Count of parity columns skipped at read time */ | |
94 | uint64_t rm_firstdatacol; /* First data column/parity count */ | |
95 | raidz_col_t rm_col[1]; /* Variable-length array of I/O columns; sized via offsetof() at allocation */ | |
96 | } raidz_map_t; | |
97 | ||
98 | #define VDEV_RAIDZ_P 0 /* rm_col[] index of the P (plain XOR) parity column */ | |
99 | #define VDEV_RAIDZ_Q 1 /* rm_col[] index of the Q parity column (double parity only) */ | |
100 | ||
101 | #define VDEV_RAIDZ_MAXPARITY 2 /* largest parity count accepted by vdev_raidz_open() */ | |
102 | ||
103 | #define VDEV_RAIDZ_MUL_2(a) (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0)) /* field multiply by 2; evaluates 'a' twice and does not truncate the result to 8 bits */ | |
104 | ||
105 | /* | |
106 | * These two tables represent powers and logs of 2 in the Galois field defined | |
107 | * above. These values were computed by repeatedly multiplying by 2 as above. | |
108 | */ | |
109 | static const uint8_t vdev_raidz_pow2[256] = { /* vdev_raidz_pow2[i] is 2^i in the field; entry 255 wraps back to 2^0 = 0x01 */ | |
110 | 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, | |
111 | 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, | |
112 | 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, | |
113 | 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, | |
114 | 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, | |
115 | 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, | |
116 | 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, | |
117 | 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, | |
118 | 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, | |
119 | 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, | |
120 | 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, | |
121 | 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, | |
122 | 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, | |
123 | 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, | |
124 | 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, | |
125 | 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, | |
126 | 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, | |
127 | 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, | |
128 | 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, | |
129 | 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, | |
130 | 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, | |
131 | 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, | |
132 | 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, | |
133 | 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, | |
134 | 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, | |
135 | 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, | |
136 | 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, | |
137 | 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, | |
138 | 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, | |
139 | 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, | |
140 | 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, | |
141 | 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 | |
142 | }; | |
143 | static const uint8_t vdev_raidz_log2[256] = { /* vdev_raidz_log2[v] is the i with 2^i == v; index 0 has no logarithm and holds 0x00 */ | |
144 | 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, | |
145 | 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, | |
146 | 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, | |
147 | 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, | |
148 | 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, | |
149 | 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, | |
150 | 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, | |
151 | 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, | |
152 | 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, | |
153 | 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, | |
154 | 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, | |
155 | 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, | |
156 | 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, | |
157 | 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, | |
158 | 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, | |
159 | 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, | |
160 | 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, | |
161 | 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, | |
162 | 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, | |
163 | 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, | |
164 | 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, | |
165 | 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, | |
166 | 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, | |
167 | 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, | |
168 | 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, | |
169 | 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, | |
170 | 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, | |
171 | 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, | |
172 | 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, | |
173 | 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, | |
174 | 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, | |
175 | 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, | |
176 | }; | |
177 | ||
178 | /* | |
179 | * Multiply a given number by 2 raised to the given power. | |
180 | */ | |
181 | static uint8_t | |
182 | vdev_raidz_exp2(uint_t a, int exp) | |
183 | { | |
184 | if (a == 0) | |
185 | return (0); | |
186 | ||
187 | ASSERT(exp >= 0); | |
188 | ASSERT(vdev_raidz_log2[a] > 0 || a == 1); | |
189 | ||
190 | exp += vdev_raidz_log2[a]; | |
191 | if (exp > 255) | |
192 | exp -= 255; | |
193 | ||
194 | return (vdev_raidz_pow2[exp]); | |
195 | } | |
196 | ||
197 | static raidz_map_t * | |
198 | vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, | |
199 | uint64_t nparity) | |
200 | { | |
201 | raidz_map_t *rm; | |
202 | uint64_t b = zio->io_offset >> unit_shift; | |
203 | uint64_t s = zio->io_size >> unit_shift; | |
204 | uint64_t f = b % dcols; | |
205 | uint64_t o = (b / dcols) << unit_shift; | |
206 | uint64_t q, r, c, bc, col, acols, coff, devidx; | |
207 | ||
208 | q = s / (dcols - nparity); | |
209 | r = s - q * (dcols - nparity); | |
210 | bc = (r == 0 ? 0 : r + nparity); | |
211 | ||
212 | acols = (q == 0 ? bc : dcols); | |
213 | ||
214 | rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP); | |
215 | ||
216 | rm->rm_cols = acols; | |
217 | rm->rm_bigcols = bc; | |
218 | rm->rm_asize = 0; | |
219 | rm->rm_missingdata = 0; | |
220 | rm->rm_missingparity = 0; | |
221 | rm->rm_firstdatacol = nparity; | |
222 | ||
223 | for (c = 0; c < acols; c++) { | |
224 | col = f + c; | |
225 | coff = o; | |
226 | if (col >= dcols) { | |
227 | col -= dcols; | |
228 | coff += 1ULL << unit_shift; | |
229 | } | |
230 | rm->rm_col[c].rc_devidx = col; | |
231 | rm->rm_col[c].rc_offset = coff; | |
232 | rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift; | |
233 | rm->rm_col[c].rc_data = NULL; | |
234 | rm->rm_col[c].rc_error = 0; | |
235 | rm->rm_col[c].rc_tried = 0; | |
236 | rm->rm_col[c].rc_skipped = 0; | |
237 | rm->rm_asize += rm->rm_col[c].rc_size; | |
238 | } | |
239 | ||
240 | rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift); | |
241 | ||
242 | for (c = 0; c < rm->rm_firstdatacol; c++) | |
243 | rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); | |
244 | ||
245 | rm->rm_col[c].rc_data = zio->io_data; | |
246 | ||
247 | for (c = c + 1; c < acols; c++) | |
248 | rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + | |
249 | rm->rm_col[c - 1].rc_size; | |
250 | ||
251 | /* | |
252 | * If all data stored spans all columns, there's a danger that parity | |
253 | * will always be on the same device and, since parity isn't read | |
254 | * during normal operation, that that device's I/O bandwidth won't be | |
255 | * used effectively. We therefore switch the parity every 1MB. | |
256 | * | |
257 | * ... at least that was, ostensibly, the theory. As a practical | |
258 | * matter unless we juggle the parity between all devices evenly, we | |
259 | * won't see any benefit. Further, occasional writes that aren't a | |
260 | * multiple of the LCM of the number of children and the minimum | |
261 | * stripe width are sufficient to avoid pessimal behavior. | |
262 | * Unfortunately, this decision created an implicit on-disk format | |
263 | * requirement that we need to support for all eternity, but only | |
264 | * for single-parity RAID-Z. | |
265 | */ | |
266 | ASSERT(rm->rm_cols >= 2); | |
267 | ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); | |
268 | ||
269 | if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { | |
270 | devidx = rm->rm_col[0].rc_devidx; | |
271 | o = rm->rm_col[0].rc_offset; | |
272 | rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; | |
273 | rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; | |
274 | rm->rm_col[1].rc_devidx = devidx; | |
275 | rm->rm_col[1].rc_offset = o; | |
276 | } | |
277 | ||
278 | zio->io_vsd = rm; | |
279 | return (rm); | |
280 | } | |
281 | ||
282 | static void | |
283 | vdev_raidz_map_free(zio_t *zio) | |
284 | { | |
285 | raidz_map_t *rm = zio->io_vsd; | |
286 | int c; | |
287 | ||
288 | for (c = 0; c < rm->rm_firstdatacol; c++) | |
289 | zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); | |
290 | ||
291 | kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); | |
292 | zio->io_vsd = NULL; | |
293 | } | |
294 | ||
295 | static void | |
296 | vdev_raidz_generate_parity_p(raidz_map_t *rm) | |
297 | { | |
298 | uint64_t *p, *src, pcount, ccount, i; | |
299 | int c; | |
300 | ||
301 | pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); | |
302 | ||
303 | for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { | |
304 | src = rm->rm_col[c].rc_data; | |
305 | p = rm->rm_col[VDEV_RAIDZ_P].rc_data; | |
306 | ccount = rm->rm_col[c].rc_size / sizeof (src[0]); | |
307 | ||
308 | if (c == rm->rm_firstdatacol) { | |
309 | ASSERT(ccount == pcount); | |
310 | for (i = 0; i < ccount; i++, p++, src++) { | |
311 | *p = *src; | |
312 | } | |
313 | } else { | |
314 | ASSERT(ccount <= pcount); | |
315 | for (i = 0; i < ccount; i++, p++, src++) { | |
316 | *p ^= *src; | |
317 | } | |
318 | } | |
319 | } | |
320 | } | |
321 | ||
322 | static void | |
323 | vdev_raidz_generate_parity_pq(raidz_map_t *rm) | |
324 | { | |
325 | uint64_t *q, *p, *src, pcount, ccount, mask, i; | |
326 | int c; | |
327 | ||
328 | pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); | |
329 | ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == | |
330 | rm->rm_col[VDEV_RAIDZ_Q].rc_size); | |
331 | ||
332 | for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { | |
333 | src = rm->rm_col[c].rc_data; | |
334 | p = rm->rm_col[VDEV_RAIDZ_P].rc_data; | |
335 | q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; | |
336 | ccount = rm->rm_col[c].rc_size / sizeof (src[0]); | |
337 | ||
338 | if (c == rm->rm_firstdatacol) { | |
339 | ASSERT(ccount == pcount || ccount == 0); | |
340 | for (i = 0; i < ccount; i++, p++, q++, src++) { | |
341 | *q = *src; | |
342 | *p = *src; | |
343 | } | |
344 | for (; i < pcount; i++, p++, q++, src++) { | |
345 | *q = 0; | |
346 | *p = 0; | |
347 | } | |
348 | } else { | |
349 | ASSERT(ccount <= pcount); | |
350 | ||
351 | /* | |
352 | * Rather than multiplying each byte individually (as | |
353 | * described above), we are able to handle 8 at once | |
354 | * by generating a mask based on the high bit in each | |
355 | * byte and using that to conditionally XOR in 0x1d. | |
356 | */ | |
357 | for (i = 0; i < ccount; i++, p++, q++, src++) { | |
358 | mask = *q & 0x8080808080808080ULL; | |
359 | mask = (mask << 1) - (mask >> 7); | |
360 | *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ | |
361 | (mask & 0x1d1d1d1d1d1d1d1dULL); | |
362 | *q ^= *src; | |
363 | *p ^= *src; | |
364 | } | |
365 | ||
366 | /* | |
367 | * Treat short columns as though they are full of 0s. | |
368 | */ | |
369 | for (; i < pcount; i++, q++) { | |
370 | mask = *q & 0x8080808080808080ULL; | |
371 | mask = (mask << 1) - (mask >> 7); | |
372 | *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ | |
373 | (mask & 0x1d1d1d1d1d1d1d1dULL); | |
374 | } | |
375 | } | |
376 | } | |
377 | } | |
378 | ||
379 | static void | |
380 | vdev_raidz_reconstruct_p(raidz_map_t *rm, int x) | |
381 | { | |
382 | uint64_t *dst, *src, xcount, ccount, count, i; | |
383 | int c; | |
384 | ||
385 | xcount = rm->rm_col[x].rc_size / sizeof (src[0]); | |
386 | ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); | |
387 | ASSERT(xcount > 0); | |
388 | ||
389 | src = rm->rm_col[VDEV_RAIDZ_P].rc_data; | |
390 | dst = rm->rm_col[x].rc_data; | |
391 | for (i = 0; i < xcount; i++, dst++, src++) { | |
392 | *dst = *src; | |
393 | } | |
394 | ||
395 | for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { | |
396 | src = rm->rm_col[c].rc_data; | |
397 | dst = rm->rm_col[x].rc_data; | |
398 | ||
399 | if (c == x) | |
400 | continue; | |
401 | ||
402 | ccount = rm->rm_col[c].rc_size / sizeof (src[0]); | |
403 | count = MIN(ccount, xcount); | |
404 | ||
405 | for (i = 0; i < count; i++, dst++, src++) { | |
406 | *dst ^= *src; | |
407 | } | |
408 | } | |
409 | } | |
410 | ||
411 | static void | |
412 | vdev_raidz_reconstruct_q(raidz_map_t *rm, int x) | |
413 | { | |
414 | uint64_t *dst, *src, xcount, ccount, count, mask, i; | |
415 | uint8_t *b; | |
416 | int c, j, exp; | |
417 | ||
418 | xcount = rm->rm_col[x].rc_size / sizeof (src[0]); | |
419 | ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); | |
420 | ||
421 | for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { | |
422 | src = rm->rm_col[c].rc_data; | |
423 | dst = rm->rm_col[x].rc_data; | |
424 | ||
425 | if (c == x) | |
426 | ccount = 0; | |
427 | else | |
428 | ccount = rm->rm_col[c].rc_size / sizeof (src[0]); | |
429 | ||
430 | count = MIN(ccount, xcount); | |
431 | ||
432 | if (c == rm->rm_firstdatacol) { | |
433 | for (i = 0; i < count; i++, dst++, src++) { | |
434 | *dst = *src; | |
435 | } | |
436 | for (; i < xcount; i++, dst++) { | |
437 | *dst = 0; | |
438 | } | |
439 | ||
440 | } else { | |
441 | /* | |
442 | * For an explanation of this, see the comment in | |
443 | * vdev_raidz_generate_parity_pq() above. | |
444 | */ | |
445 | for (i = 0; i < count; i++, dst++, src++) { | |
446 | mask = *dst & 0x8080808080808080ULL; | |
447 | mask = (mask << 1) - (mask >> 7); | |
448 | *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ | |
449 | (mask & 0x1d1d1d1d1d1d1d1dULL); | |
450 | *dst ^= *src; | |
451 | } | |
452 | ||
453 | for (; i < xcount; i++, dst++) { | |
454 | mask = *dst & 0x8080808080808080ULL; | |
455 | mask = (mask << 1) - (mask >> 7); | |
456 | *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ | |
457 | (mask & 0x1d1d1d1d1d1d1d1dULL); | |
458 | } | |
459 | } | |
460 | } | |
461 | ||
462 | src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; | |
463 | dst = rm->rm_col[x].rc_data; | |
464 | exp = 255 - (rm->rm_cols - 1 - x); | |
465 | ||
466 | for (i = 0; i < xcount; i++, dst++, src++) { | |
467 | *dst ^= *src; | |
468 | for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { | |
469 | *b = vdev_raidz_exp2(*b, exp); | |
470 | } | |
471 | } | |
472 | } | |
473 | ||
474 | static void | |
475 | vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y) | |
476 | { | |
477 | uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; | |
478 | void *pdata, *qdata; | |
479 | uint64_t xsize, ysize, i; | |
480 | ||
481 | ASSERT(x < y); | |
482 | ASSERT(x >= rm->rm_firstdatacol); | |
483 | ASSERT(y < rm->rm_cols); | |
484 | ||
485 | ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); | |
486 | ||
487 | /* | |
488 | * Move the parity data aside -- we're going to compute parity as | |
489 | * though columns x and y were full of zeros -- Pxy and Qxy. We want to | |
490 | * reuse the parity generation mechanism without trashing the actual | |
491 | * parity so we make those columns appear to be full of zeros by | |
492 | * setting their lengths to zero. | |
493 | */ | |
494 | pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; | |
495 | qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; | |
496 | xsize = rm->rm_col[x].rc_size; | |
497 | ysize = rm->rm_col[y].rc_size; | |
498 | ||
499 | rm->rm_col[VDEV_RAIDZ_P].rc_data = | |
500 | zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); | |
501 | rm->rm_col[VDEV_RAIDZ_Q].rc_data = | |
502 | zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); | |
503 | rm->rm_col[x].rc_size = 0; | |
504 | rm->rm_col[y].rc_size = 0; | |
505 | ||
506 | vdev_raidz_generate_parity_pq(rm); | |
507 | ||
508 | rm->rm_col[x].rc_size = xsize; | |
509 | rm->rm_col[y].rc_size = ysize; | |
510 | ||
511 | p = pdata; | |
512 | q = qdata; | |
513 | pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; | |
514 | qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; | |
515 | xd = rm->rm_col[x].rc_data; | |
516 | yd = rm->rm_col[y].rc_data; | |
517 | ||
518 | /* | |
519 | * We now have: | |
520 | * Pxy = P + D_x + D_y | |
521 | * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y | |
522 | * | |
523 | * We can then solve for D_x: | |
524 | * D_x = A * (P + Pxy) + B * (Q + Qxy) | |
525 | * where | |
526 | * A = 2^(x - y) * (2^(x - y) + 1)^-1 | |
527 | * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 | |
528 | * | |
529 | * With D_x in hand, we can easily solve for D_y: | |
530 | * D_y = P + Pxy + D_x | |
531 | */ | |
532 | ||
533 | a = vdev_raidz_pow2[255 + x - y]; | |
534 | b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; | |
535 | tmp = 255 - vdev_raidz_log2[a ^ 1]; | |
536 | ||
537 | aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; | |
538 | bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; | |
539 | ||
540 | for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { | |
541 | *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ | |
542 | vdev_raidz_exp2(*q ^ *qxy, bexp); | |
543 | ||
544 | if (i < ysize) | |
545 | *yd = *p ^ *pxy ^ *xd; | |
546 | } | |
547 | ||
548 | zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, | |
549 | rm->rm_col[VDEV_RAIDZ_P].rc_size); | |
550 | zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, | |
551 | rm->rm_col[VDEV_RAIDZ_Q].rc_size); | |
552 | ||
553 | /* | |
554 | * Restore the saved parity data. | |
555 | */ | |
556 | rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; | |
557 | rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; | |
558 | } | |
559 | ||
560 | ||
561 | static int | |
562 | vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) | |
563 | { | |
564 | vdev_t *cvd; | |
565 | uint64_t nparity = vd->vdev_nparity; | |
566 | int c, error; | |
567 | int lasterror = 0; | |
568 | int numerrors = 0; | |
569 | ||
570 | ASSERT(nparity > 0); | |
571 | ||
572 | if (nparity > VDEV_RAIDZ_MAXPARITY || | |
573 | vd->vdev_children < nparity + 1) { | |
574 | vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; | |
575 | return (EINVAL); | |
576 | } | |
577 | ||
578 | for (c = 0; c < vd->vdev_children; c++) { | |
579 | cvd = vd->vdev_child[c]; | |
580 | ||
581 | if ((error = vdev_open(cvd)) != 0) { | |
582 | lasterror = error; | |
583 | numerrors++; | |
584 | continue; | |
585 | } | |
586 | ||
587 | *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; | |
588 | *ashift = MAX(*ashift, cvd->vdev_ashift); | |
589 | } | |
590 | ||
591 | *asize *= vd->vdev_children; | |
592 | ||
593 | if (numerrors > nparity) { | |
594 | vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; | |
595 | return (lasterror); | |
596 | } | |
597 | ||
598 | return (0); | |
599 | } | |
600 | ||
601 | static void | |
602 | vdev_raidz_close(vdev_t *vd) | |
603 | { | |
604 | int c; | |
605 | ||
606 | for (c = 0; c < vd->vdev_children; c++) | |
607 | vdev_close(vd->vdev_child[c]); | |
608 | } | |
609 | ||
610 | static uint64_t | |
611 | vdev_raidz_asize(vdev_t *vd, uint64_t psize) | |
612 | { | |
613 | uint64_t asize; | |
614 | uint64_t ashift = vd->vdev_top->vdev_ashift; | |
615 | uint64_t cols = vd->vdev_children; | |
616 | uint64_t nparity = vd->vdev_nparity; | |
617 | ||
618 | asize = ((psize - 1) >> ashift) + 1; | |
619 | asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); | |
620 | asize = roundup(asize, nparity + 1) << ashift; | |
621 | ||
622 | return (asize); | |
623 | } | |
624 | ||
625 | static void | |
626 | vdev_raidz_child_done(zio_t *zio) | |
627 | { | |
628 | raidz_col_t *rc = zio->io_private; | |
629 | ||
630 | rc->rc_error = zio->io_error; | |
631 | rc->rc_tried = 1; | |
632 | rc->rc_skipped = 0; | |
633 | } | |
634 | ||
635 | static void | |
636 | vdev_raidz_repair_done(zio_t *zio) | |
637 | { | |
638 | ASSERT(zio->io_private == zio->io_parent); | |
639 | vdev_raidz_map_free(zio->io_private); | |
640 | } | |
641 | ||
642 | static int | |
643 | vdev_raidz_io_start(zio_t *zio) | |
644 | { | |
645 | vdev_t *vd = zio->io_vd; | |
646 | vdev_t *tvd = vd->vdev_top; | |
647 | vdev_t *cvd; | |
648 | blkptr_t *bp = zio->io_bp; | |
649 | raidz_map_t *rm; | |
650 | raidz_col_t *rc; | |
651 | int c; | |
652 | ||
653 | rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, | |
654 | vd->vdev_nparity); | |
655 | ||
656 | ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); | |
657 | ||
658 | if (zio->io_type == ZIO_TYPE_WRITE) { | |
659 | /* | |
660 | * Generate RAID parity in the first virtual columns. | |
661 | */ | |
662 | if (rm->rm_firstdatacol == 1) | |
663 | vdev_raidz_generate_parity_p(rm); | |
664 | else | |
665 | vdev_raidz_generate_parity_pq(rm); | |
666 | ||
667 | for (c = 0; c < rm->rm_cols; c++) { | |
668 | rc = &rm->rm_col[c]; | |
669 | cvd = vd->vdev_child[rc->rc_devidx]; | |
670 | zio_nowait(zio_vdev_child_io(zio, NULL, cvd, | |
671 | rc->rc_offset, rc->rc_data, rc->rc_size, | |
672 | zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, | |
673 | vdev_raidz_child_done, rc)); | |
674 | } | |
675 | ||
676 | return (zio_wait_for_children_done(zio)); | |
677 | } | |
678 | ||
679 | ASSERT(zio->io_type == ZIO_TYPE_READ); | |
680 | ||
681 | /* | |
682 | * Iterate over the columns in reverse order so that we hit the parity | |
683 | * last -- any errors along the way will force us to read the parity | |
684 | * data. | |
685 | */ | |
686 | for (c = rm->rm_cols - 1; c >= 0; c--) { | |
687 | rc = &rm->rm_col[c]; | |
688 | cvd = vd->vdev_child[rc->rc_devidx]; | |
689 | if (!vdev_readable(cvd)) { | |
690 | if (c >= rm->rm_firstdatacol) | |
691 | rm->rm_missingdata++; | |
692 | else | |
693 | rm->rm_missingparity++; | |
694 | rc->rc_error = ENXIO; | |
695 | rc->rc_tried = 1; /* don't even try */ | |
696 | rc->rc_skipped = 1; | |
697 | continue; | |
698 | } | |
699 | if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) { | |
700 | if (c >= rm->rm_firstdatacol) | |
701 | rm->rm_missingdata++; | |
702 | else | |
703 | rm->rm_missingparity++; | |
704 | rc->rc_error = ESTALE; | |
705 | rc->rc_skipped = 1; | |
706 | continue; | |
707 | } | |
708 | if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || | |
709 | (zio->io_flags & ZIO_FLAG_SCRUB)) { | |
710 | zio_nowait(zio_vdev_child_io(zio, NULL, cvd, | |
711 | rc->rc_offset, rc->rc_data, rc->rc_size, | |
712 | zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, | |
713 | vdev_raidz_child_done, rc)); | |
714 | } | |
715 | } | |
716 | ||
717 | return (zio_wait_for_children_done(zio)); | |
718 | } | |
719 | ||
720 | /* | |
721 | * Report a checksum error for a child of a RAID-Z device. | |
722 | */ | |
723 | static void | |
724 | raidz_checksum_error(zio_t *zio, raidz_col_t *rc) | |
725 | { | |
726 | vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; | |
727 | dprintf_bp(zio->io_bp, "imputed checksum error on %s: ", | |
728 | vdev_description(vd)); | |
729 | ||
730 | if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { | |
731 | mutex_enter(&vd->vdev_stat_lock); | |
732 | vd->vdev_stat.vs_checksum_errors++; | |
733 | mutex_exit(&vd->vdev_stat_lock); | |
734 | } | |
735 | ||
736 | if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) | |
737 | zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, | |
738 | zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size); | |
739 | } | |
740 | ||
741 | /* | |
742 | * Generate the parity from the data columns. If we tried and were able to | |
743 | * read the parity without error, verify that the generated parity matches the | |
744 | * data we read. If it doesn't, we fire off a checksum error. Return the | |
745 | * number such failures. | |
746 | */ | |
747 | static int | |
748 | raidz_parity_verify(zio_t *zio, raidz_map_t *rm) | |
749 | { | |
750 | void *orig[VDEV_RAIDZ_MAXPARITY]; | |
751 | int c, ret = 0; | |
752 | raidz_col_t *rc; | |
753 | ||
754 | for (c = 0; c < rm->rm_firstdatacol; c++) { | |
755 | rc = &rm->rm_col[c]; | |
756 | if (!rc->rc_tried || rc->rc_error != 0) | |
757 | continue; | |
758 | orig[c] = zio_buf_alloc(rc->rc_size); | |
759 | bcopy(rc->rc_data, orig[c], rc->rc_size); | |
760 | } | |
761 | ||
762 | if (rm->rm_firstdatacol == 1) | |
763 | vdev_raidz_generate_parity_p(rm); | |
764 | else | |
765 | vdev_raidz_generate_parity_pq(rm); | |
766 | ||
767 | for (c = 0; c < rm->rm_firstdatacol; c++) { | |
768 | rc = &rm->rm_col[c]; | |
769 | if (!rc->rc_tried || rc->rc_error != 0) | |
770 | continue; | |
771 | if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { | |
772 | raidz_checksum_error(zio, rc); | |
773 | rc->rc_error = ECKSUM; | |
774 | ret++; | |
775 | } | |
776 | zio_buf_free(orig[c], rc->rc_size); | |
777 | } | |
778 | ||
779 | return (ret); | |
780 | } | |
781 | ||
782 | static uint64_t raidz_corrected_p; /* reads repaired by single-column reconstruction from P */ | |
783 | static uint64_t raidz_corrected_q; /* reads repaired by single-column reconstruction from Q */ | |
784 | static uint64_t raidz_corrected_pq; /* presumably reads repaired using both P and Q -- TODO confirm against vdev_raidz_io_done() */ | |
785 | ||
786 | static int | |
787 | vdev_raidz_io_done(zio_t *zio) | |
788 | { | |
789 | vdev_t *vd = zio->io_vd; | |
790 | vdev_t *cvd; | |
791 | raidz_map_t *rm = zio->io_vsd; | |
792 | raidz_col_t *rc, *rc1; | |
793 | int unexpected_errors = 0; | |
794 | int parity_errors = 0; | |
795 | int parity_untried = 0; | |
796 | int data_errors = 0; | |
797 | int n, c, c1; | |
798 | ||
799 | ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ | |
800 | ||
801 | zio->io_error = 0; | |
802 | zio->io_numerrors = 0; | |
803 | ||
804 | ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); | |
805 | ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); | |
806 | ||
807 | for (c = 0; c < rm->rm_cols; c++) { | |
808 | rc = &rm->rm_col[c]; | |
809 | ||
810 | /* | |
811 | * We preserve any EIOs because those may be worth retrying; | |
812 | * whereas ECKSUM and ENXIO are more likely to be persistent. | |
813 | */ | |
814 | if (rc->rc_error) { | |
815 | if (zio->io_error != EIO) | |
816 | zio->io_error = rc->rc_error; | |
817 | ||
818 | if (c < rm->rm_firstdatacol) | |
819 | parity_errors++; | |
820 | else | |
821 | data_errors++; | |
822 | ||
823 | if (!rc->rc_skipped) | |
824 | unexpected_errors++; | |
825 | ||
826 | zio->io_numerrors++; | |
827 | } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { | |
828 | parity_untried++; | |
829 | } | |
830 | } | |
831 | ||
832 | if (zio->io_type == ZIO_TYPE_WRITE) { | |
833 | /* | |
834 | * If this is not a failfast write, and we were able to | |
835 | * write enough columns to reconstruct the data, good enough. | |
836 | */ | |
837 | /* XXPOLICY */ | |
838 | if (zio->io_numerrors <= rm->rm_firstdatacol && | |
839 | !(zio->io_flags & ZIO_FLAG_FAILFAST)) | |
840 | zio->io_error = 0; | |
841 | ||
842 | vdev_raidz_map_free(zio); | |
843 | ||
844 | return (ZIO_PIPELINE_CONTINUE); | |
845 | } | |
846 | ||
847 | ASSERT(zio->io_type == ZIO_TYPE_READ); | |
848 | /* | |
849 | * There are three potential phases for a read: | |
850 | * 1. produce valid data from the columns read | |
851 | * 2. read all disks and try again | |
852 | * 3. perform combinatorial reconstruction | |
853 | * | |
854 | * Each phase is progressively both more expensive and less likely to | |
855 | * occur. If we encounter more errors than we can repair or all phases | |
856 | * fail, we have no choice but to return an error. | |
857 | */ | |
858 | ||
859 | /* | |
860 | * If the number of errors we saw was correctable -- less than or equal | |
861 | * to the number of parity disks read -- attempt to produce data that | |
862 | * has a valid checksum. Naturally, this case applies in the absence of | |
863 | * any errors. | |
864 | */ | |
865 | if (zio->io_numerrors <= rm->rm_firstdatacol - parity_untried) { | |
866 | switch (data_errors) { | |
867 | case 0: | |
868 | if (zio_checksum_error(zio) == 0) { | |
869 | zio->io_error = 0; | |
870 | ||
871 | /* | |
872 | * If we read parity information (unnecessarily | |
873 | * as it happens since no reconstruction was | |
874 | * needed) regenerate and verify the parity. | |
875 | * We also regenerate parity when resilvering | |
876 | * so we can write it out to the failed device | |
877 | * later. | |
878 | */ | |
879 | if (parity_errors + parity_untried < | |
880 | rm->rm_firstdatacol || | |
881 | (zio->io_flags & ZIO_FLAG_RESILVER)) { | |
882 | n = raidz_parity_verify(zio, rm); | |
883 | unexpected_errors += n; | |
884 | ASSERT(parity_errors + n <= | |
885 | rm->rm_firstdatacol); | |
886 | } | |
887 | goto done; | |
888 | } | |
889 | break; | |
890 | ||
891 | case 1: | |
892 | /* | |
893 | * We either attempt to read all the parity columns or | |
894 | * none of them. If we didn't try to read parity, we | |
895 | * wouldn't be here in the correctable case. There must | |
896 | * also have been fewer parity errors than parity | |
897 | * columns or, again, we wouldn't be in this code path. | |
898 | */ | |
899 | ASSERT(parity_untried == 0); | |
900 | ASSERT(parity_errors < rm->rm_firstdatacol); | |
901 | ||
902 | /* | |
903 | * Find the column that reported the error. | |
904 | */ | |
905 | for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { | |
906 | rc = &rm->rm_col[c]; | |
907 | if (rc->rc_error != 0) | |
908 | break; | |
909 | } | |
910 | ASSERT(c != rm->rm_cols); | |
911 | ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || | |
912 | rc->rc_error == ESTALE); | |
913 | ||
914 | if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { | |
915 | vdev_raidz_reconstruct_p(rm, c); | |
916 | } else { | |
917 | ASSERT(rm->rm_firstdatacol > 1); | |
918 | vdev_raidz_reconstruct_q(rm, c); | |
919 | } | |
920 | ||
921 | if (zio_checksum_error(zio) == 0) { | |
922 | zio->io_error = 0; | |
923 | if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) | |
924 | atomic_inc_64(&raidz_corrected_p); | |
925 | else | |
926 | atomic_inc_64(&raidz_corrected_q); | |
927 | ||
928 | /* | |
929 | * If there's more than one parity disk that | |
930 | * was successfully read, confirm that the | |
931 | * other parity disk produced the correct data. | |
932 | * This routine is suboptimal in that it | |
933 | * regenerates both the parity we wish to test | |
934 | * as well as the parity we just used to | |
935 | * perform the reconstruction, but this should | |
936 | * be a relatively uncommon case, and can be | |
937 | * optimized if it becomes a problem. | |
938 | * We also regenerate parity when resilvering | |
939 | * so we can write it out to the failed device | |
940 | * later. | |
941 | */ | |
942 | if (parity_errors < rm->rm_firstdatacol - 1 || | |
943 | (zio->io_flags & ZIO_FLAG_RESILVER)) { | |
944 | n = raidz_parity_verify(zio, rm); | |
945 | unexpected_errors += n; | |
946 | ASSERT(parity_errors + n <= | |
947 | rm->rm_firstdatacol); | |
948 | } | |
949 | ||
950 | goto done; | |
951 | } | |
952 | break; | |
953 | ||
954 | case 2: | |
955 | /* | |
956 | * Two data column errors require double parity. | |
957 | */ | |
958 | ASSERT(rm->rm_firstdatacol == 2); | |
959 | ||
960 | /* | |
961 | * Find the two columns that reported errors. | |
962 | */ | |
963 | for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { | |
964 | rc = &rm->rm_col[c]; | |
965 | if (rc->rc_error != 0) | |
966 | break; | |
967 | } | |
968 | ASSERT(c != rm->rm_cols); | |
969 | ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || | |
970 | rc->rc_error == ESTALE); | |
971 | ||
972 | for (c1 = c++; c < rm->rm_cols; c++) { | |
973 | rc = &rm->rm_col[c]; | |
974 | if (rc->rc_error != 0) | |
975 | break; | |
976 | } | |
977 | ASSERT(c != rm->rm_cols); | |
978 | ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || | |
979 | rc->rc_error == ESTALE); | |
980 | ||
981 | vdev_raidz_reconstruct_pq(rm, c1, c); | |
982 | ||
983 | if (zio_checksum_error(zio) == 0) { | |
984 | zio->io_error = 0; | |
985 | atomic_inc_64(&raidz_corrected_pq); | |
986 | ||
987 | goto done; | |
988 | } | |
989 | break; | |
990 | ||
991 | default: | |
992 | ASSERT(rm->rm_firstdatacol <= 2); | |
993 | ASSERT(0); | |
994 | } | |
995 | } | |
996 | ||
997 | /* | |
998 | * This isn't a typical situation -- either we got a read error or | |
999 | * a child silently returned bad data. Read every block so we can | |
1000 | * try again with as much data and parity as we can track down. If | |
1001 | * we've already been through once before, all children will be marked | |
1002 | * as tried so we'll proceed to combinatorial reconstruction. | |
1003 | */ | |
1004 | unexpected_errors = 1; | |
1005 | rm->rm_missingdata = 0; | |
1006 | rm->rm_missingparity = 0; | |
1007 | ||
1008 | for (c = 0; c < rm->rm_cols; c++) { | |
1009 | if (rm->rm_col[c].rc_tried) | |
1010 | continue; | |
1011 | ||
1012 | zio->io_error = 0; | |
1013 | zio_vdev_io_redone(zio); | |
1014 | do { | |
1015 | rc = &rm->rm_col[c]; | |
1016 | if (rc->rc_tried) | |
1017 | continue; | |
1018 | zio_nowait(zio_vdev_child_io(zio, NULL, | |
1019 | vd->vdev_child[rc->rc_devidx], | |
1020 | rc->rc_offset, rc->rc_data, rc->rc_size, | |
1021 | zio->io_type, zio->io_priority, ZIO_FLAG_CANFAIL, | |
1022 | vdev_raidz_child_done, rc)); | |
1023 | } while (++c < rm->rm_cols); | |
1024 | dprintf("rereading\n"); | |
1025 | ||
1026 | return (zio_wait_for_children_done(zio)); | |
1027 | } | |
1028 | ||
1029 | /* | |
1030 | * At this point we've attempted to reconstruct the data given the | |
1031 | * errors we detected, and we've attempted to read all columns. There | |
1032 | * must, therefore, be one or more additional problems -- silent errors | |
1033 | * resulting in invalid data rather than explicit I/O errors resulting | |
1034 | * in absent data. Before we attempt combinatorial reconstruction make | |
1035 | * sure we have a chance of coming up with the right answer. | |
1036 | */ | |
1037 | if (zio->io_numerrors >= rm->rm_firstdatacol) { | |
1038 | ASSERT(zio->io_error != 0); | |
1039 | goto done; | |
1040 | } | |
1041 | ||
1042 | if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { | |
1043 | /* | |
1044 | * Attempt to reconstruct the data from parity P. | |
1045 | */ | |
1046 | for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { | |
1047 | void *orig; | |
1048 | rc = &rm->rm_col[c]; | |
1049 | ||
1050 | orig = zio_buf_alloc(rc->rc_size); | |
1051 | bcopy(rc->rc_data, orig, rc->rc_size); | |
1052 | vdev_raidz_reconstruct_p(rm, c); | |
1053 | ||
1054 | if (zio_checksum_error(zio) == 0) { | |
1055 | zio_buf_free(orig, rc->rc_size); | |
1056 | zio->io_error = 0; | |
1057 | atomic_inc_64(&raidz_corrected_p); | |
1058 | ||
1059 | /* | |
1060 | * If this child didn't know that it returned | |
1061 | * bad data, inform it. | |
1062 | */ | |
1063 | if (rc->rc_tried && rc->rc_error == 0) | |
1064 | raidz_checksum_error(zio, rc); | |
1065 | rc->rc_error = ECKSUM; | |
1066 | goto done; | |
1067 | } | |
1068 | ||
1069 | bcopy(orig, rc->rc_data, rc->rc_size); | |
1070 | zio_buf_free(orig, rc->rc_size); | |
1071 | } | |
1072 | } | |
1073 | ||
1074 | if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { | |
1075 | /* | |
1076 | * Attempt to reconstruct the data from parity Q. | |
1077 | */ | |
1078 | for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { | |
1079 | void *orig; | |
1080 | rc = &rm->rm_col[c]; | |
1081 | ||
1082 | orig = zio_buf_alloc(rc->rc_size); | |
1083 | bcopy(rc->rc_data, orig, rc->rc_size); | |
1084 | vdev_raidz_reconstruct_q(rm, c); | |
1085 | ||
1086 | if (zio_checksum_error(zio) == 0) { | |
1087 | zio_buf_free(orig, rc->rc_size); | |
1088 | zio->io_error = 0; | |
1089 | atomic_inc_64(&raidz_corrected_q); | |
1090 | ||
1091 | /* | |
1092 | * If this child didn't know that it returned | |
1093 | * bad data, inform it. | |
1094 | */ | |
1095 | if (rc->rc_tried && rc->rc_error == 0) | |
1096 | raidz_checksum_error(zio, rc); | |
1097 | rc->rc_error = ECKSUM; | |
1098 | goto done; | |
1099 | } | |
1100 | ||
1101 | bcopy(orig, rc->rc_data, rc->rc_size); | |
1102 | zio_buf_free(orig, rc->rc_size); | |
1103 | } | |
1104 | } | |
1105 | ||
1106 | if (rm->rm_firstdatacol > 1 && | |
1107 | rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 && | |
1108 | rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { | |
1109 | /* | |
1110 | * Attempt to reconstruct the data from both P and Q. | |
1111 | */ | |
1112 | for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) { | |
1113 | void *orig, *orig1; | |
1114 | rc = &rm->rm_col[c]; | |
1115 | ||
1116 | orig = zio_buf_alloc(rc->rc_size); | |
1117 | bcopy(rc->rc_data, orig, rc->rc_size); | |
1118 | ||
1119 | for (c1 = c + 1; c1 < rm->rm_cols; c1++) { | |
1120 | rc1 = &rm->rm_col[c1]; | |
1121 | ||
1122 | orig1 = zio_buf_alloc(rc1->rc_size); | |
1123 | bcopy(rc1->rc_data, orig1, rc1->rc_size); | |
1124 | ||
1125 | vdev_raidz_reconstruct_pq(rm, c, c1); | |
1126 | ||
1127 | if (zio_checksum_error(zio) == 0) { | |
1128 | zio_buf_free(orig, rc->rc_size); | |
1129 | zio_buf_free(orig1, rc1->rc_size); | |
1130 | zio->io_error = 0; | |
1131 | atomic_inc_64(&raidz_corrected_pq); | |
1132 | ||
1133 | /* | |
1134 | * If these children didn't know they | |
1135 | * returned bad data, inform them. | |
1136 | */ | |
1137 | if (rc->rc_tried && rc->rc_error == 0) | |
1138 | raidz_checksum_error(zio, rc); | |
1139 | if (rc1->rc_tried && rc1->rc_error == 0) | |
1140 | raidz_checksum_error(zio, rc1); | |
1141 | ||
1142 | rc->rc_error = ECKSUM; | |
1143 | rc1->rc_error = ECKSUM; | |
1144 | ||
1145 | goto done; | |
1146 | } | |
1147 | ||
1148 | bcopy(orig1, rc1->rc_data, rc1->rc_size); | |
1149 | zio_buf_free(orig1, rc1->rc_size); | |
1150 | } | |
1151 | ||
1152 | bcopy(orig, rc->rc_data, rc->rc_size); | |
1153 | zio_buf_free(orig, rc->rc_size); | |
1154 | } | |
1155 | } | |
1156 | ||
1157 | /* | |
1158 | * All combinations failed to checksum. Generate checksum ereports for | |
1159 | * all children. | |
1160 | */ | |
1161 | zio->io_error = ECKSUM; | |
1162 | if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { | |
1163 | for (c = 0; c < rm->rm_cols; c++) { | |
1164 | rc = &rm->rm_col[c]; | |
1165 | zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, | |
1166 | zio->io_spa, vd->vdev_child[rc->rc_devidx], zio, | |
1167 | rc->rc_offset, rc->rc_size); | |
1168 | } | |
1169 | } | |
1170 | ||
1171 | done: | |
1172 | zio_checksum_verified(zio); | |
1173 | ||
1174 | if (zio->io_error == 0 && (spa_mode & FWRITE) && | |
1175 | (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { | |
1176 | zio_t *rio; | |
1177 | ||
1178 | /* | |
1179 | * Use the good data we have in hand to repair damaged children. | |
1180 | * | |
1181 | * We issue all repair I/Os as children of 'rio' to arrange | |
1182 | * that vdev_raidz_map_free(zio) will be invoked after all | |
1183 | * repairs complete, but before we advance to the next stage. | |
1184 | */ | |
1185 | rio = zio_null(zio, zio->io_spa, | |
1186 | vdev_raidz_repair_done, zio, ZIO_FLAG_CANFAIL); | |
1187 | ||
1188 | for (c = 0; c < rm->rm_cols; c++) { | |
1189 | rc = &rm->rm_col[c]; | |
1190 | cvd = vd->vdev_child[rc->rc_devidx]; | |
1191 | ||
1192 | if (rc->rc_error == 0) | |
1193 | continue; | |
1194 | ||
1195 | dprintf("%s resilvered %s @ 0x%llx error %d\n", | |
1196 | vdev_description(vd), | |
1197 | vdev_description(cvd), | |
1198 | zio->io_offset, rc->rc_error); | |
1199 | ||
1200 | zio_nowait(zio_vdev_child_io(rio, NULL, cvd, | |
1201 | rc->rc_offset, rc->rc_data, rc->rc_size, | |
1202 | ZIO_TYPE_WRITE, zio->io_priority, | |
1203 | ZIO_FLAG_IO_REPAIR | ZIO_FLAG_DONT_PROPAGATE | | |
1204 | ZIO_FLAG_CANFAIL, NULL, NULL)); | |
1205 | } | |
1206 | ||
1207 | zio_nowait(rio); | |
1208 | ||
1209 | return (zio_wait_for_children_done(zio)); | |
1210 | } | |
1211 | ||
1212 | vdev_raidz_map_free(zio); | |
1213 | ||
1214 | return (ZIO_PIPELINE_CONTINUE); | |
1215 | } | |
1216 | ||
1217 | static void | |
1218 | vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) | |
1219 | { | |
1220 | if (faulted > vd->vdev_nparity) | |
1221 | vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, | |
1222 | VDEV_AUX_NO_REPLICAS); | |
1223 | else if (degraded + faulted != 0) | |
1224 | vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); | |
1225 | else | |
1226 | vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); | |
1227 | } | |
1228 | ||
1229 | vdev_ops_t vdev_raidz_ops = { /* RAID-Z ops table; entries are positional, so order must match the vdev_ops_t declaration (not visible in this chunk) */ | |
1230 | vdev_raidz_open, /* defined outside this chunk; presumably the open entry point -- confirm against vdev_ops_t */ | |
1231 | vdev_raidz_close, /* defined outside this chunk; presumably the close entry point -- confirm against vdev_ops_t */ | |
1232 | NULL, /* this slot is deliberately left empty; see vdev_ops_t for what it would hold */ | |
1233 | vdev_raidz_asize, /* defined outside this chunk; presumably the allocated-size calculation -- confirm */ | |
1234 | vdev_raidz_io_start, /* defined outside this chunk; presumably the I/O start entry point -- confirm */ | |
1235 | vdev_raidz_io_done, /* presumably the I/O completion entry point -- confirm against vdev_ops_t */ | |
1236 | vdev_raidz_state_change, /* static function defined immediately above this table */ | |
1237 | VDEV_TYPE_RAIDZ, /* name of this vdev type */ | |
1238 | B_FALSE /* not a leaf vdev */ | |
1239 | }; |