]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | ||
22 | /* | |
b128c09f | 23 | * Copyright 2008 Sun Microsystems, Inc. All rights reserved. |
34dc7c2f BB |
24 | * Use is subject to license terms. |
25 | */ | |
26 | ||
34dc7c2f BB |
27 | #include <sys/zfs_context.h> |
28 | #include <sys/spa.h> | |
29 | #include <sys/vdev_impl.h> | |
30 | #include <sys/zio.h> | |
31 | #include <sys/zio_checksum.h> | |
32 | #include <sys/fs/zfs.h> | |
33 | #include <sys/fm/fs/zfs.h> | |
34 | ||
35 | /* | |
36 | * Virtual device vector for RAID-Z. | |
37 | * | |
38 | * This vdev supports both single and double parity. For single parity, we | |
39 | * use a simple XOR of all the data columns. For double parity, we use both | |
40 | * the simple XOR as well as a technique described in "The mathematics of | |
41 | * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8), | |
42 | * over the integers expressible in a single byte. Briefly, the operations on | |
43 | * the field are defined as follows: | |
44 | * | |
45 | * o addition (+) is represented by a bitwise XOR | |
46 | * o subtraction (-) is therefore identical to addition: A + B = A - B | |
47 | * o multiplication of A by 2 is defined by the following bitwise expression: | |
48 | * (A * 2)_7 = A_6 | |
49 | * (A * 2)_6 = A_5 | |
50 | * (A * 2)_5 = A_4 | |
51 | * (A * 2)_4 = A_3 + A_7 | |
52 | * (A * 2)_3 = A_2 + A_7 | |
53 | * (A * 2)_2 = A_1 + A_7 | |
54 | * (A * 2)_1 = A_0 | |
55 | * (A * 2)_0 = A_7 | |
56 | * | |
57 | * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). | |
58 | * | |
59 | * Observe that any number in the field (except for 0) can be expressed as a | |
60 | * power of 2 -- a generator for the field. We store a table of the powers of | |
61 | * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can | |
62 | * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather | |
63 | * than field addition). The inverse of a field element A (A^-1) is A^254. | |
64 | * | |
65 | * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1, | |
66 | * can be expressed by field operations: | |
67 | * | |
68 | * P = D_0 + D_1 + ... + D_n-2 + D_n-1 | |
69 | * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 | |
70 | * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 | |
71 | * | |
72 | * See the reconstruction code below for how P and Q can be used individually or | |
73 | * in concert to recover missing data columns. | |
74 | */ | |
75 | ||
/*
 * State for a single column of a RAID-Z I/O: which child it targets, where
 * on that child the column lives, the buffer backing it, and the outcome of
 * the child I/O.
 */
typedef struct raidz_col {
	uint64_t rc_devidx;		/* index of the child vdev to use */
	uint64_t rc_offset;		/* offset of the column on that child */
	uint64_t rc_size;		/* number of bytes in this column */
	void *rc_data;			/* buffer holding the column's bytes */
	int rc_error;			/* error recorded for this column */
	uint8_t rc_tried;		/* nonzero once the column was attempted */
	uint8_t rc_skipped;		/* nonzero if the column I/O was skipped */
} raidz_col_t;
85 | ||
86 | typedef struct raidz_map { | |
87 | uint64_t rm_cols; /* Column count */ | |
88 | uint64_t rm_bigcols; /* Number of oversized columns */ | |
89 | uint64_t rm_asize; /* Actual total I/O size */ | |
90 | uint64_t rm_missingdata; /* Count of missing data devices */ | |
91 | uint64_t rm_missingparity; /* Count of missing parity devices */ | |
92 | uint64_t rm_firstdatacol; /* First data column/parity count */ | |
93 | raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ | |
94 | } raidz_map_t; | |
95 | ||
/* Positions of the P and Q parity columns within raidz_map_t's rm_col[]. */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1

/* Highest parity level accepted by this implementation. */
#define	VDEV_RAIDZ_MAXPARITY	2

/*
 * Multiply a field element by 2: shift left by one and fold in 0x1d when the
 * high bit (0x80) was set.  The argument is evaluated twice, and the result
 * is not truncated to 8 bits -- callers narrow it themselves.
 */
#define	VDEV_RAIDZ_MUL_2(a)	(((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0))
102 | ||
103 | /* | |
104 | * These two tables represent powers and logs of 2 in the Galois field defined | |
105 | * above. These values were computed by repeatedly multiplying by 2 as above. | |
106 | */ | |
107 | static const uint8_t vdev_raidz_pow2[256] = { | |
108 | 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, | |
109 | 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, | |
110 | 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, | |
111 | 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, | |
112 | 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, | |
113 | 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, | |
114 | 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, | |
115 | 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, | |
116 | 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, | |
117 | 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, | |
118 | 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, | |
119 | 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, | |
120 | 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, | |
121 | 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, | |
122 | 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, | |
123 | 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, | |
124 | 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, | |
125 | 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, | |
126 | 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, | |
127 | 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, | |
128 | 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, | |
129 | 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, | |
130 | 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, | |
131 | 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, | |
132 | 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, | |
133 | 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, | |
134 | 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, | |
135 | 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, | |
136 | 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, | |
137 | 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, | |
138 | 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, | |
139 | 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 | |
140 | }; | |
141 | static const uint8_t vdev_raidz_log2[256] = { | |
142 | 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, | |
143 | 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, | |
144 | 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, | |
145 | 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, | |
146 | 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, | |
147 | 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, | |
148 | 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, | |
149 | 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, | |
150 | 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, | |
151 | 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, | |
152 | 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, | |
153 | 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, | |
154 | 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, | |
155 | 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, | |
156 | 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, | |
157 | 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, | |
158 | 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, | |
159 | 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, | |
160 | 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, | |
161 | 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, | |
162 | 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, | |
163 | 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, | |
164 | 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, | |
165 | 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, | |
166 | 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, | |
167 | 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, | |
168 | 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, | |
169 | 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, | |
170 | 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, | |
171 | 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, | |
172 | 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, | |
173 | 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, | |
174 | }; | |
175 | ||
176 | /* | |
177 | * Multiply a given number by 2 raised to the given power. | |
178 | */ | |
179 | static uint8_t | |
180 | vdev_raidz_exp2(uint_t a, int exp) | |
181 | { | |
182 | if (a == 0) | |
183 | return (0); | |
184 | ||
185 | ASSERT(exp >= 0); | |
186 | ASSERT(vdev_raidz_log2[a] > 0 || a == 1); | |
187 | ||
188 | exp += vdev_raidz_log2[a]; | |
189 | if (exp > 255) | |
190 | exp -= 255; | |
191 | ||
192 | return (vdev_raidz_pow2[exp]); | |
193 | } | |
194 | ||
b128c09f BB |
195 | static void |
196 | vdev_raidz_map_free(zio_t *zio) | |
197 | { | |
198 | raidz_map_t *rm = zio->io_vsd; | |
199 | int c; | |
200 | ||
201 | for (c = 0; c < rm->rm_firstdatacol; c++) | |
202 | zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); | |
203 | ||
204 | kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); | |
205 | } | |
206 | ||
34dc7c2f BB |
207 | static raidz_map_t * |
208 | vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, | |
209 | uint64_t nparity) | |
210 | { | |
211 | raidz_map_t *rm; | |
212 | uint64_t b = zio->io_offset >> unit_shift; | |
213 | uint64_t s = zio->io_size >> unit_shift; | |
214 | uint64_t f = b % dcols; | |
215 | uint64_t o = (b / dcols) << unit_shift; | |
216 | uint64_t q, r, c, bc, col, acols, coff, devidx; | |
217 | ||
218 | q = s / (dcols - nparity); | |
219 | r = s - q * (dcols - nparity); | |
220 | bc = (r == 0 ? 0 : r + nparity); | |
221 | ||
222 | acols = (q == 0 ? bc : dcols); | |
223 | ||
224 | rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP); | |
225 | ||
226 | rm->rm_cols = acols; | |
227 | rm->rm_bigcols = bc; | |
228 | rm->rm_asize = 0; | |
229 | rm->rm_missingdata = 0; | |
230 | rm->rm_missingparity = 0; | |
231 | rm->rm_firstdatacol = nparity; | |
232 | ||
233 | for (c = 0; c < acols; c++) { | |
234 | col = f + c; | |
235 | coff = o; | |
236 | if (col >= dcols) { | |
237 | col -= dcols; | |
238 | coff += 1ULL << unit_shift; | |
239 | } | |
240 | rm->rm_col[c].rc_devidx = col; | |
241 | rm->rm_col[c].rc_offset = coff; | |
242 | rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift; | |
243 | rm->rm_col[c].rc_data = NULL; | |
244 | rm->rm_col[c].rc_error = 0; | |
245 | rm->rm_col[c].rc_tried = 0; | |
246 | rm->rm_col[c].rc_skipped = 0; | |
247 | rm->rm_asize += rm->rm_col[c].rc_size; | |
248 | } | |
249 | ||
250 | rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift); | |
251 | ||
252 | for (c = 0; c < rm->rm_firstdatacol; c++) | |
253 | rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); | |
254 | ||
255 | rm->rm_col[c].rc_data = zio->io_data; | |
256 | ||
257 | for (c = c + 1; c < acols; c++) | |
258 | rm->rm_col[c].rc_data = (char *)rm->rm_col[c - 1].rc_data + | |
259 | rm->rm_col[c - 1].rc_size; | |
260 | ||
261 | /* | |
262 | * If all data stored spans all columns, there's a danger that parity | |
263 | * will always be on the same device and, since parity isn't read | |
264 | * during normal operation, that that device's I/O bandwidth won't be | |
265 | * used effectively. We therefore switch the parity every 1MB. | |
266 | * | |
267 | * ... at least that was, ostensibly, the theory. As a practical | |
268 | * matter unless we juggle the parity between all devices evenly, we | |
269 | * won't see any benefit. Further, occasional writes that aren't a | |
270 | * multiple of the LCM of the number of children and the minimum | |
271 | * stripe width are sufficient to avoid pessimal behavior. | |
272 | * Unfortunately, this decision created an implicit on-disk format | |
273 | * requirement that we need to support for all eternity, but only | |
274 | * for single-parity RAID-Z. | |
275 | */ | |
276 | ASSERT(rm->rm_cols >= 2); | |
277 | ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); | |
278 | ||
279 | if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { | |
280 | devidx = rm->rm_col[0].rc_devidx; | |
281 | o = rm->rm_col[0].rc_offset; | |
282 | rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; | |
283 | rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; | |
284 | rm->rm_col[1].rc_devidx = devidx; | |
285 | rm->rm_col[1].rc_offset = o; | |
286 | } | |
287 | ||
288 | zio->io_vsd = rm; | |
b128c09f | 289 | zio->io_vsd_free = vdev_raidz_map_free; |
34dc7c2f BB |
290 | return (rm); |
291 | } | |
292 | ||
34dc7c2f BB |
293 | static void |
294 | vdev_raidz_generate_parity_p(raidz_map_t *rm) | |
295 | { | |
296 | uint64_t *p, *src, pcount, ccount, i; | |
297 | int c; | |
298 | ||
299 | pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); | |
300 | ||
301 | for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { | |
302 | src = rm->rm_col[c].rc_data; | |
303 | p = rm->rm_col[VDEV_RAIDZ_P].rc_data; | |
304 | ccount = rm->rm_col[c].rc_size / sizeof (src[0]); | |
305 | ||
306 | if (c == rm->rm_firstdatacol) { | |
307 | ASSERT(ccount == pcount); | |
308 | for (i = 0; i < ccount; i++, p++, src++) { | |
309 | *p = *src; | |
310 | } | |
311 | } else { | |
312 | ASSERT(ccount <= pcount); | |
313 | for (i = 0; i < ccount; i++, p++, src++) { | |
314 | *p ^= *src; | |
315 | } | |
316 | } | |
317 | } | |
318 | } | |
319 | ||
320 | static void | |
321 | vdev_raidz_generate_parity_pq(raidz_map_t *rm) | |
322 | { | |
323 | uint64_t *q, *p, *src, pcount, ccount, mask, i; | |
324 | int c; | |
325 | ||
326 | pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); | |
327 | ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == | |
328 | rm->rm_col[VDEV_RAIDZ_Q].rc_size); | |
329 | ||
330 | for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { | |
331 | src = rm->rm_col[c].rc_data; | |
332 | p = rm->rm_col[VDEV_RAIDZ_P].rc_data; | |
333 | q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; | |
334 | ccount = rm->rm_col[c].rc_size / sizeof (src[0]); | |
335 | ||
336 | if (c == rm->rm_firstdatacol) { | |
337 | ASSERT(ccount == pcount || ccount == 0); | |
338 | for (i = 0; i < ccount; i++, p++, q++, src++) { | |
339 | *q = *src; | |
340 | *p = *src; | |
341 | } | |
342 | for (; i < pcount; i++, p++, q++, src++) { | |
343 | *q = 0; | |
344 | *p = 0; | |
345 | } | |
346 | } else { | |
347 | ASSERT(ccount <= pcount); | |
348 | ||
349 | /* | |
350 | * Rather than multiplying each byte individually (as | |
351 | * described above), we are able to handle 8 at once | |
352 | * by generating a mask based on the high bit in each | |
353 | * byte and using that to conditionally XOR in 0x1d. | |
354 | */ | |
355 | for (i = 0; i < ccount; i++, p++, q++, src++) { | |
356 | mask = *q & 0x8080808080808080ULL; | |
357 | mask = (mask << 1) - (mask >> 7); | |
358 | *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ | |
359 | (mask & 0x1d1d1d1d1d1d1d1dULL); | |
360 | *q ^= *src; | |
361 | *p ^= *src; | |
362 | } | |
363 | ||
364 | /* | |
365 | * Treat short columns as though they are full of 0s. | |
366 | */ | |
367 | for (; i < pcount; i++, q++) { | |
368 | mask = *q & 0x8080808080808080ULL; | |
369 | mask = (mask << 1) - (mask >> 7); | |
370 | *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ | |
371 | (mask & 0x1d1d1d1d1d1d1d1dULL); | |
372 | } | |
373 | } | |
374 | } | |
375 | } | |
376 | ||
377 | static void | |
378 | vdev_raidz_reconstruct_p(raidz_map_t *rm, int x) | |
379 | { | |
380 | uint64_t *dst, *src, xcount, ccount, count, i; | |
381 | int c; | |
382 | ||
383 | xcount = rm->rm_col[x].rc_size / sizeof (src[0]); | |
384 | ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); | |
385 | ASSERT(xcount > 0); | |
386 | ||
387 | src = rm->rm_col[VDEV_RAIDZ_P].rc_data; | |
388 | dst = rm->rm_col[x].rc_data; | |
389 | for (i = 0; i < xcount; i++, dst++, src++) { | |
390 | *dst = *src; | |
391 | } | |
392 | ||
393 | for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { | |
394 | src = rm->rm_col[c].rc_data; | |
395 | dst = rm->rm_col[x].rc_data; | |
396 | ||
397 | if (c == x) | |
398 | continue; | |
399 | ||
400 | ccount = rm->rm_col[c].rc_size / sizeof (src[0]); | |
401 | count = MIN(ccount, xcount); | |
402 | ||
403 | for (i = 0; i < count; i++, dst++, src++) { | |
404 | *dst ^= *src; | |
405 | } | |
406 | } | |
407 | } | |
408 | ||
409 | static void | |
410 | vdev_raidz_reconstruct_q(raidz_map_t *rm, int x) | |
411 | { | |
412 | uint64_t *dst, *src, xcount, ccount, count, mask, i; | |
413 | uint8_t *b; | |
414 | int c, j, exp; | |
415 | ||
416 | xcount = rm->rm_col[x].rc_size / sizeof (src[0]); | |
417 | ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); | |
418 | ||
419 | for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { | |
420 | src = rm->rm_col[c].rc_data; | |
421 | dst = rm->rm_col[x].rc_data; | |
422 | ||
423 | if (c == x) | |
424 | ccount = 0; | |
425 | else | |
426 | ccount = rm->rm_col[c].rc_size / sizeof (src[0]); | |
427 | ||
428 | count = MIN(ccount, xcount); | |
429 | ||
430 | if (c == rm->rm_firstdatacol) { | |
431 | for (i = 0; i < count; i++, dst++, src++) { | |
432 | *dst = *src; | |
433 | } | |
434 | for (; i < xcount; i++, dst++) { | |
435 | *dst = 0; | |
436 | } | |
437 | ||
438 | } else { | |
439 | /* | |
440 | * For an explanation of this, see the comment in | |
441 | * vdev_raidz_generate_parity_pq() above. | |
442 | */ | |
443 | for (i = 0; i < count; i++, dst++, src++) { | |
444 | mask = *dst & 0x8080808080808080ULL; | |
445 | mask = (mask << 1) - (mask >> 7); | |
446 | *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ | |
447 | (mask & 0x1d1d1d1d1d1d1d1dULL); | |
448 | *dst ^= *src; | |
449 | } | |
450 | ||
451 | for (; i < xcount; i++, dst++) { | |
452 | mask = *dst & 0x8080808080808080ULL; | |
453 | mask = (mask << 1) - (mask >> 7); | |
454 | *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ | |
455 | (mask & 0x1d1d1d1d1d1d1d1dULL); | |
456 | } | |
457 | } | |
458 | } | |
459 | ||
460 | src = rm->rm_col[VDEV_RAIDZ_Q].rc_data; | |
461 | dst = rm->rm_col[x].rc_data; | |
462 | exp = 255 - (rm->rm_cols - 1 - x); | |
463 | ||
464 | for (i = 0; i < xcount; i++, dst++, src++) { | |
465 | *dst ^= *src; | |
466 | for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { | |
467 | *b = vdev_raidz_exp2(*b, exp); | |
468 | } | |
469 | } | |
470 | } | |
471 | ||
472 | static void | |
473 | vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y) | |
474 | { | |
475 | uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; | |
476 | void *pdata, *qdata; | |
477 | uint64_t xsize, ysize, i; | |
478 | ||
479 | ASSERT(x < y); | |
480 | ASSERT(x >= rm->rm_firstdatacol); | |
481 | ASSERT(y < rm->rm_cols); | |
482 | ||
483 | ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); | |
484 | ||
485 | /* | |
486 | * Move the parity data aside -- we're going to compute parity as | |
487 | * though columns x and y were full of zeros -- Pxy and Qxy. We want to | |
488 | * reuse the parity generation mechanism without trashing the actual | |
489 | * parity so we make those columns appear to be full of zeros by | |
490 | * setting their lengths to zero. | |
491 | */ | |
492 | pdata = rm->rm_col[VDEV_RAIDZ_P].rc_data; | |
493 | qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_data; | |
494 | xsize = rm->rm_col[x].rc_size; | |
495 | ysize = rm->rm_col[y].rc_size; | |
496 | ||
497 | rm->rm_col[VDEV_RAIDZ_P].rc_data = | |
498 | zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_P].rc_size); | |
499 | rm->rm_col[VDEV_RAIDZ_Q].rc_data = | |
500 | zio_buf_alloc(rm->rm_col[VDEV_RAIDZ_Q].rc_size); | |
501 | rm->rm_col[x].rc_size = 0; | |
502 | rm->rm_col[y].rc_size = 0; | |
503 | ||
504 | vdev_raidz_generate_parity_pq(rm); | |
505 | ||
506 | rm->rm_col[x].rc_size = xsize; | |
507 | rm->rm_col[y].rc_size = ysize; | |
508 | ||
509 | p = pdata; | |
510 | q = qdata; | |
511 | pxy = rm->rm_col[VDEV_RAIDZ_P].rc_data; | |
512 | qxy = rm->rm_col[VDEV_RAIDZ_Q].rc_data; | |
513 | xd = rm->rm_col[x].rc_data; | |
514 | yd = rm->rm_col[y].rc_data; | |
515 | ||
516 | /* | |
517 | * We now have: | |
518 | * Pxy = P + D_x + D_y | |
519 | * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y | |
520 | * | |
521 | * We can then solve for D_x: | |
522 | * D_x = A * (P + Pxy) + B * (Q + Qxy) | |
523 | * where | |
524 | * A = 2^(x - y) * (2^(x - y) + 1)^-1 | |
525 | * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 | |
526 | * | |
527 | * With D_x in hand, we can easily solve for D_y: | |
528 | * D_y = P + Pxy + D_x | |
529 | */ | |
530 | ||
531 | a = vdev_raidz_pow2[255 + x - y]; | |
532 | b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; | |
533 | tmp = 255 - vdev_raidz_log2[a ^ 1]; | |
534 | ||
535 | aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; | |
536 | bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; | |
537 | ||
538 | for (i = 0; i < xsize; i++, p++, q++, pxy++, qxy++, xd++, yd++) { | |
539 | *xd = vdev_raidz_exp2(*p ^ *pxy, aexp) ^ | |
540 | vdev_raidz_exp2(*q ^ *qxy, bexp); | |
541 | ||
542 | if (i < ysize) | |
543 | *yd = *p ^ *pxy ^ *xd; | |
544 | } | |
545 | ||
546 | zio_buf_free(rm->rm_col[VDEV_RAIDZ_P].rc_data, | |
547 | rm->rm_col[VDEV_RAIDZ_P].rc_size); | |
548 | zio_buf_free(rm->rm_col[VDEV_RAIDZ_Q].rc_data, | |
549 | rm->rm_col[VDEV_RAIDZ_Q].rc_size); | |
550 | ||
551 | /* | |
552 | * Restore the saved parity data. | |
553 | */ | |
554 | rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; | |
555 | rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; | |
556 | } | |
557 | ||
558 | ||
559 | static int | |
560 | vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) | |
561 | { | |
562 | vdev_t *cvd; | |
563 | uint64_t nparity = vd->vdev_nparity; | |
564 | int c, error; | |
565 | int lasterror = 0; | |
566 | int numerrors = 0; | |
567 | ||
568 | ASSERT(nparity > 0); | |
569 | ||
570 | if (nparity > VDEV_RAIDZ_MAXPARITY || | |
571 | vd->vdev_children < nparity + 1) { | |
572 | vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; | |
573 | return (EINVAL); | |
574 | } | |
575 | ||
576 | for (c = 0; c < vd->vdev_children; c++) { | |
577 | cvd = vd->vdev_child[c]; | |
578 | ||
579 | if ((error = vdev_open(cvd)) != 0) { | |
580 | lasterror = error; | |
581 | numerrors++; | |
582 | continue; | |
583 | } | |
584 | ||
585 | *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; | |
586 | *ashift = MAX(*ashift, cvd->vdev_ashift); | |
587 | } | |
588 | ||
589 | *asize *= vd->vdev_children; | |
590 | ||
591 | if (numerrors > nparity) { | |
592 | vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; | |
593 | return (lasterror); | |
594 | } | |
595 | ||
596 | return (0); | |
597 | } | |
598 | ||
599 | static void | |
600 | vdev_raidz_close(vdev_t *vd) | |
601 | { | |
602 | int c; | |
603 | ||
604 | for (c = 0; c < vd->vdev_children; c++) | |
605 | vdev_close(vd->vdev_child[c]); | |
606 | } | |
607 | ||
608 | static uint64_t | |
609 | vdev_raidz_asize(vdev_t *vd, uint64_t psize) | |
610 | { | |
611 | uint64_t asize; | |
612 | uint64_t ashift = vd->vdev_top->vdev_ashift; | |
613 | uint64_t cols = vd->vdev_children; | |
614 | uint64_t nparity = vd->vdev_nparity; | |
615 | ||
616 | asize = ((psize - 1) >> ashift) + 1; | |
617 | asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); | |
618 | asize = roundup(asize, nparity + 1) << ashift; | |
619 | ||
620 | return (asize); | |
621 | } | |
622 | ||
623 | static void | |
624 | vdev_raidz_child_done(zio_t *zio) | |
625 | { | |
626 | raidz_col_t *rc = zio->io_private; | |
627 | ||
628 | rc->rc_error = zio->io_error; | |
629 | rc->rc_tried = 1; | |
630 | rc->rc_skipped = 0; | |
631 | } | |
632 | ||
34dc7c2f BB |
633 | static int |
634 | vdev_raidz_io_start(zio_t *zio) | |
635 | { | |
636 | vdev_t *vd = zio->io_vd; | |
637 | vdev_t *tvd = vd->vdev_top; | |
638 | vdev_t *cvd; | |
639 | blkptr_t *bp = zio->io_bp; | |
640 | raidz_map_t *rm; | |
641 | raidz_col_t *rc; | |
642 | int c; | |
643 | ||
644 | rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, | |
645 | vd->vdev_nparity); | |
646 | ||
647 | ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); | |
648 | ||
649 | if (zio->io_type == ZIO_TYPE_WRITE) { | |
650 | /* | |
651 | * Generate RAID parity in the first virtual columns. | |
652 | */ | |
653 | if (rm->rm_firstdatacol == 1) | |
654 | vdev_raidz_generate_parity_p(rm); | |
655 | else | |
656 | vdev_raidz_generate_parity_pq(rm); | |
657 | ||
658 | for (c = 0; c < rm->rm_cols; c++) { | |
659 | rc = &rm->rm_col[c]; | |
660 | cvd = vd->vdev_child[rc->rc_devidx]; | |
661 | zio_nowait(zio_vdev_child_io(zio, NULL, cvd, | |
662 | rc->rc_offset, rc->rc_data, rc->rc_size, | |
b128c09f | 663 | zio->io_type, zio->io_priority, 0, |
34dc7c2f BB |
664 | vdev_raidz_child_done, rc)); |
665 | } | |
666 | ||
b128c09f | 667 | return (ZIO_PIPELINE_CONTINUE); |
34dc7c2f BB |
668 | } |
669 | ||
670 | ASSERT(zio->io_type == ZIO_TYPE_READ); | |
671 | ||
672 | /* | |
673 | * Iterate over the columns in reverse order so that we hit the parity | |
674 | * last -- any errors along the way will force us to read the parity | |
675 | * data. | |
676 | */ | |
677 | for (c = rm->rm_cols - 1; c >= 0; c--) { | |
678 | rc = &rm->rm_col[c]; | |
679 | cvd = vd->vdev_child[rc->rc_devidx]; | |
680 | if (!vdev_readable(cvd)) { | |
681 | if (c >= rm->rm_firstdatacol) | |
682 | rm->rm_missingdata++; | |
683 | else | |
684 | rm->rm_missingparity++; | |
685 | rc->rc_error = ENXIO; | |
686 | rc->rc_tried = 1; /* don't even try */ | |
687 | rc->rc_skipped = 1; | |
688 | continue; | |
689 | } | |
fb5f0bc8 | 690 | if (vdev_dtl_contains(cvd, DTL_MISSING, bp->blk_birth, 1)) { |
34dc7c2f BB |
691 | if (c >= rm->rm_firstdatacol) |
692 | rm->rm_missingdata++; | |
693 | else | |
694 | rm->rm_missingparity++; | |
695 | rc->rc_error = ESTALE; | |
696 | rc->rc_skipped = 1; | |
697 | continue; | |
698 | } | |
699 | if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || | |
700 | (zio->io_flags & ZIO_FLAG_SCRUB)) { | |
701 | zio_nowait(zio_vdev_child_io(zio, NULL, cvd, | |
702 | rc->rc_offset, rc->rc_data, rc->rc_size, | |
b128c09f | 703 | zio->io_type, zio->io_priority, 0, |
34dc7c2f BB |
704 | vdev_raidz_child_done, rc)); |
705 | } | |
706 | } | |
707 | ||
b128c09f | 708 | return (ZIO_PIPELINE_CONTINUE); |
34dc7c2f BB |
709 | } |
710 | ||
711 | /* | |
712 | * Report a checksum error for a child of a RAID-Z device. | |
713 | */ | |
714 | static void | |
715 | raidz_checksum_error(zio_t *zio, raidz_col_t *rc) | |
716 | { | |
717 | vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; | |
34dc7c2f BB |
718 | |
719 | if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { | |
720 | mutex_enter(&vd->vdev_stat_lock); | |
721 | vd->vdev_stat.vs_checksum_errors++; | |
722 | mutex_exit(&vd->vdev_stat_lock); | |
723 | } | |
724 | ||
725 | if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) | |
726 | zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, | |
727 | zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size); | |
728 | } | |
729 | ||
730 | /* | |
731 | * Generate the parity from the data columns. If we tried and were able to | |
732 | * read the parity without error, verify that the generated parity matches the | |
733 | * data we read. If it doesn't, we fire off a checksum error. Return the | |
734 | * number such failures. | |
735 | */ | |
736 | static int | |
737 | raidz_parity_verify(zio_t *zio, raidz_map_t *rm) | |
738 | { | |
739 | void *orig[VDEV_RAIDZ_MAXPARITY]; | |
740 | int c, ret = 0; | |
741 | raidz_col_t *rc; | |
742 | ||
743 | for (c = 0; c < rm->rm_firstdatacol; c++) { | |
744 | rc = &rm->rm_col[c]; | |
745 | if (!rc->rc_tried || rc->rc_error != 0) | |
746 | continue; | |
747 | orig[c] = zio_buf_alloc(rc->rc_size); | |
748 | bcopy(rc->rc_data, orig[c], rc->rc_size); | |
749 | } | |
750 | ||
751 | if (rm->rm_firstdatacol == 1) | |
752 | vdev_raidz_generate_parity_p(rm); | |
753 | else | |
754 | vdev_raidz_generate_parity_pq(rm); | |
755 | ||
756 | for (c = 0; c < rm->rm_firstdatacol; c++) { | |
757 | rc = &rm->rm_col[c]; | |
758 | if (!rc->rc_tried || rc->rc_error != 0) | |
759 | continue; | |
760 | if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { | |
761 | raidz_checksum_error(zio, rc); | |
762 | rc->rc_error = ECKSUM; | |
763 | ret++; | |
764 | } | |
765 | zio_buf_free(orig[c], rc->rc_size); | |
766 | } | |
767 | ||
768 | return (ret); | |
769 | } | |
770 | ||
/*
 * File-scope counters, zero-initialized.
 * NOTE(review): presumably these count successful reconstructions using P,
 * Q, or both -- confirm against the code that increments them.
 */
static uint64_t raidz_corrected_p;
static uint64_t raidz_corrected_q;
static uint64_t raidz_corrected_pq;
774 | ||
775 | static int | |
b128c09f BB |
776 | vdev_raidz_worst_error(raidz_map_t *rm) |
777 | { | |
778 | int error = 0; | |
779 | ||
780 | for (int c = 0; c < rm->rm_cols; c++) | |
781 | error = zio_worst_error(error, rm->rm_col[c].rc_error); | |
782 | ||
783 | return (error); | |
784 | } | |
785 | ||
786 | static void | |
34dc7c2f BB |
787 | vdev_raidz_io_done(zio_t *zio) |
788 | { | |
789 | vdev_t *vd = zio->io_vd; | |
790 | vdev_t *cvd; | |
791 | raidz_map_t *rm = zio->io_vsd; | |
792 | raidz_col_t *rc, *rc1; | |
793 | int unexpected_errors = 0; | |
794 | int parity_errors = 0; | |
795 | int parity_untried = 0; | |
796 | int data_errors = 0; | |
b128c09f | 797 | int total_errors = 0; |
34dc7c2f BB |
798 | int n, c, c1; |
799 | ||
800 | ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ | |
801 | ||
34dc7c2f BB |
802 | ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); |
803 | ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); | |
804 | ||
805 | for (c = 0; c < rm->rm_cols; c++) { | |
806 | rc = &rm->rm_col[c]; | |
807 | ||
34dc7c2f | 808 | if (rc->rc_error) { |
b128c09f | 809 | ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ |
34dc7c2f BB |
810 | |
811 | if (c < rm->rm_firstdatacol) | |
812 | parity_errors++; | |
813 | else | |
814 | data_errors++; | |
815 | ||
816 | if (!rc->rc_skipped) | |
817 | unexpected_errors++; | |
818 | ||
b128c09f | 819 | total_errors++; |
34dc7c2f BB |
820 | } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { |
821 | parity_untried++; | |
822 | } | |
823 | } | |
824 | ||
825 | if (zio->io_type == ZIO_TYPE_WRITE) { | |
826 | /* | |
b128c09f BB |
827 | * XXX -- for now, treat partial writes as a success. |
828 | * (If we couldn't write enough columns to reconstruct | |
829 | * the data, the I/O failed. Otherwise, good enough.) | |
830 | * | |
831 | * Now that we support write reallocation, it would be better | |
832 | * to treat partial failure as real failure unless there are | |
833 | * no non-degraded top-level vdevs left, and not update DTLs | |
834 | * if we intend to reallocate. | |
34dc7c2f BB |
835 | */ |
836 | /* XXPOLICY */ | |
b128c09f BB |
837 | if (total_errors > rm->rm_firstdatacol) |
838 | zio->io_error = vdev_raidz_worst_error(rm); | |
34dc7c2f | 839 | |
b128c09f | 840 | return; |
34dc7c2f BB |
841 | } |
842 | ||
843 | ASSERT(zio->io_type == ZIO_TYPE_READ); | |
844 | /* | |
845 | * There are three potential phases for a read: | |
846 | * 1. produce valid data from the columns read | |
847 | * 2. read all disks and try again | |
848 | * 3. perform combinatorial reconstruction | |
849 | * | |
850 | * Each phase is progressively both more expensive and less likely to | |
851 | * occur. If we encounter more errors than we can repair or all phases | |
852 | * fail, we have no choice but to return an error. | |
853 | */ | |
854 | ||
855 | /* | |
856 | * If the number of errors we saw was correctable -- less than or equal | |
857 | * to the number of parity disks read -- attempt to produce data that | |
858 | * has a valid checksum. Naturally, this case applies in the absence of | |
859 | * any errors. | |
860 | */ | |
b128c09f | 861 | if (total_errors <= rm->rm_firstdatacol - parity_untried) { |
34dc7c2f BB |
862 | switch (data_errors) { |
863 | case 0: | |
864 | if (zio_checksum_error(zio) == 0) { | |
34dc7c2f BB |
865 | /* |
866 | * If we read parity information (unnecessarily | |
867 | * as it happens since no reconstruction was | |
868 | * needed) regenerate and verify the parity. | |
869 | * We also regenerate parity when resilvering | |
870 | * so we can write it out to the failed device | |
871 | * later. | |
872 | */ | |
873 | if (parity_errors + parity_untried < | |
874 | rm->rm_firstdatacol || | |
875 | (zio->io_flags & ZIO_FLAG_RESILVER)) { | |
876 | n = raidz_parity_verify(zio, rm); | |
877 | unexpected_errors += n; | |
878 | ASSERT(parity_errors + n <= | |
879 | rm->rm_firstdatacol); | |
880 | } | |
881 | goto done; | |
882 | } | |
883 | break; | |
884 | ||
885 | case 1: | |
886 | /* | |
887 | * We either attempt to read all the parity columns or | |
888 | * none of them. If we didn't try to read parity, we | |
889 | * wouldn't be here in the correctable case. There must | |
890 | * also have been fewer parity errors than parity | |
891 | * columns or, again, we wouldn't be in this code path. | |
892 | */ | |
893 | ASSERT(parity_untried == 0); | |
894 | ASSERT(parity_errors < rm->rm_firstdatacol); | |
895 | ||
896 | /* | |
897 | * Find the column that reported the error. | |
898 | */ | |
899 | for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { | |
900 | rc = &rm->rm_col[c]; | |
901 | if (rc->rc_error != 0) | |
902 | break; | |
903 | } | |
904 | ASSERT(c != rm->rm_cols); | |
905 | ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || | |
906 | rc->rc_error == ESTALE); | |
907 | ||
908 | if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { | |
909 | vdev_raidz_reconstruct_p(rm, c); | |
910 | } else { | |
911 | ASSERT(rm->rm_firstdatacol > 1); | |
912 | vdev_raidz_reconstruct_q(rm, c); | |
913 | } | |
914 | ||
915 | if (zio_checksum_error(zio) == 0) { | |
34dc7c2f BB |
916 | if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) |
917 | atomic_inc_64(&raidz_corrected_p); | |
918 | else | |
919 | atomic_inc_64(&raidz_corrected_q); | |
920 | ||
921 | /* | |
922 | * If there's more than one parity disk that | |
923 | * was successfully read, confirm that the | |
924 | * other parity disk produced the correct data. | |
925 | * This routine is suboptimal in that it | |
926 | * regenerates both the parity we wish to test | |
927 | * as well as the parity we just used to | |
928 | * perform the reconstruction, but this should | |
929 | * be a relatively uncommon case, and can be | |
930 | * optimized if it becomes a problem. | |
931 | * We also regenerate parity when resilvering | |
932 | * so we can write it out to the failed device | |
933 | * later. | |
934 | */ | |
935 | if (parity_errors < rm->rm_firstdatacol - 1 || | |
936 | (zio->io_flags & ZIO_FLAG_RESILVER)) { | |
937 | n = raidz_parity_verify(zio, rm); | |
938 | unexpected_errors += n; | |
939 | ASSERT(parity_errors + n <= | |
940 | rm->rm_firstdatacol); | |
941 | } | |
942 | ||
943 | goto done; | |
944 | } | |
945 | break; | |
946 | ||
947 | case 2: | |
948 | /* | |
949 | * Two data column errors require double parity. | |
950 | */ | |
951 | ASSERT(rm->rm_firstdatacol == 2); | |
952 | ||
953 | /* | |
954 | * Find the two columns that reported errors. | |
955 | */ | |
956 | for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { | |
957 | rc = &rm->rm_col[c]; | |
958 | if (rc->rc_error != 0) | |
959 | break; | |
960 | } | |
961 | ASSERT(c != rm->rm_cols); | |
962 | ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || | |
963 | rc->rc_error == ESTALE); | |
964 | ||
965 | for (c1 = c++; c < rm->rm_cols; c++) { | |
966 | rc = &rm->rm_col[c]; | |
967 | if (rc->rc_error != 0) | |
968 | break; | |
969 | } | |
970 | ASSERT(c != rm->rm_cols); | |
971 | ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || | |
972 | rc->rc_error == ESTALE); | |
973 | ||
974 | vdev_raidz_reconstruct_pq(rm, c1, c); | |
975 | ||
976 | if (zio_checksum_error(zio) == 0) { | |
34dc7c2f | 977 | atomic_inc_64(&raidz_corrected_pq); |
34dc7c2f BB |
978 | goto done; |
979 | } | |
980 | break; | |
981 | ||
982 | default: | |
983 | ASSERT(rm->rm_firstdatacol <= 2); | |
984 | ASSERT(0); | |
985 | } | |
986 | } | |
987 | ||
988 | /* | |
989 | * This isn't a typical situation -- either we got a read error or | |
990 | * a child silently returned bad data. Read every block so we can | |
991 | * try again with as much data and parity as we can track down. If | |
992 | * we've already been through once before, all children will be marked | |
993 | * as tried so we'll proceed to combinatorial reconstruction. | |
994 | */ | |
995 | unexpected_errors = 1; | |
996 | rm->rm_missingdata = 0; | |
997 | rm->rm_missingparity = 0; | |
998 | ||
999 | for (c = 0; c < rm->rm_cols; c++) { | |
1000 | if (rm->rm_col[c].rc_tried) | |
1001 | continue; | |
1002 | ||
34dc7c2f BB |
1003 | zio_vdev_io_redone(zio); |
1004 | do { | |
1005 | rc = &rm->rm_col[c]; | |
1006 | if (rc->rc_tried) | |
1007 | continue; | |
1008 | zio_nowait(zio_vdev_child_io(zio, NULL, | |
1009 | vd->vdev_child[rc->rc_devidx], | |
1010 | rc->rc_offset, rc->rc_data, rc->rc_size, | |
b128c09f | 1011 | zio->io_type, zio->io_priority, 0, |
34dc7c2f BB |
1012 | vdev_raidz_child_done, rc)); |
1013 | } while (++c < rm->rm_cols); | |
34dc7c2f | 1014 | |
b128c09f | 1015 | return; |
34dc7c2f BB |
1016 | } |
1017 | ||
1018 | /* | |
1019 | * At this point we've attempted to reconstruct the data given the | |
1020 | * errors we detected, and we've attempted to read all columns. There | |
1021 | * must, therefore, be one or more additional problems -- silent errors | |
1022 | * resulting in invalid data rather than explicit I/O errors resulting | |
1023 | * in absent data. Before we attempt combinatorial reconstruction make | |
1024 | * sure we have a chance of coming up with the right answer. | |
1025 | */ | |
b128c09f BB |
1026 | if (total_errors >= rm->rm_firstdatacol) { |
1027 | zio->io_error = vdev_raidz_worst_error(rm); | |
1028 | /* | |
1029 | * If there were exactly as many device errors as parity | |
1030 | * columns, yet we couldn't reconstruct the data, then at | |
1031 | * least one device must have returned bad data silently. | |
1032 | */ | |
1033 | if (total_errors == rm->rm_firstdatacol) | |
1034 | zio->io_error = zio_worst_error(zio->io_error, ECKSUM); | |
34dc7c2f BB |
1035 | goto done; |
1036 | } | |
1037 | ||
1038 | if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { | |
1039 | /* | |
1040 | * Attempt to reconstruct the data from parity P. | |
1041 | */ | |
1042 | for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { | |
1043 | void *orig; | |
1044 | rc = &rm->rm_col[c]; | |
1045 | ||
1046 | orig = zio_buf_alloc(rc->rc_size); | |
1047 | bcopy(rc->rc_data, orig, rc->rc_size); | |
1048 | vdev_raidz_reconstruct_p(rm, c); | |
1049 | ||
1050 | if (zio_checksum_error(zio) == 0) { | |
1051 | zio_buf_free(orig, rc->rc_size); | |
34dc7c2f BB |
1052 | atomic_inc_64(&raidz_corrected_p); |
1053 | ||
1054 | /* | |
1055 | * If this child didn't know that it returned | |
1056 | * bad data, inform it. | |
1057 | */ | |
1058 | if (rc->rc_tried && rc->rc_error == 0) | |
1059 | raidz_checksum_error(zio, rc); | |
1060 | rc->rc_error = ECKSUM; | |
1061 | goto done; | |
1062 | } | |
1063 | ||
1064 | bcopy(orig, rc->rc_data, rc->rc_size); | |
1065 | zio_buf_free(orig, rc->rc_size); | |
1066 | } | |
1067 | } | |
1068 | ||
1069 | if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { | |
1070 | /* | |
1071 | * Attempt to reconstruct the data from parity Q. | |
1072 | */ | |
1073 | for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { | |
1074 | void *orig; | |
1075 | rc = &rm->rm_col[c]; | |
1076 | ||
1077 | orig = zio_buf_alloc(rc->rc_size); | |
1078 | bcopy(rc->rc_data, orig, rc->rc_size); | |
1079 | vdev_raidz_reconstruct_q(rm, c); | |
1080 | ||
1081 | if (zio_checksum_error(zio) == 0) { | |
1082 | zio_buf_free(orig, rc->rc_size); | |
34dc7c2f BB |
1083 | atomic_inc_64(&raidz_corrected_q); |
1084 | ||
1085 | /* | |
1086 | * If this child didn't know that it returned | |
1087 | * bad data, inform it. | |
1088 | */ | |
1089 | if (rc->rc_tried && rc->rc_error == 0) | |
1090 | raidz_checksum_error(zio, rc); | |
1091 | rc->rc_error = ECKSUM; | |
1092 | goto done; | |
1093 | } | |
1094 | ||
1095 | bcopy(orig, rc->rc_data, rc->rc_size); | |
1096 | zio_buf_free(orig, rc->rc_size); | |
1097 | } | |
1098 | } | |
1099 | ||
1100 | if (rm->rm_firstdatacol > 1 && | |
1101 | rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 && | |
1102 | rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { | |
1103 | /* | |
1104 | * Attempt to reconstruct the data from both P and Q. | |
1105 | */ | |
1106 | for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) { | |
1107 | void *orig, *orig1; | |
1108 | rc = &rm->rm_col[c]; | |
1109 | ||
1110 | orig = zio_buf_alloc(rc->rc_size); | |
1111 | bcopy(rc->rc_data, orig, rc->rc_size); | |
1112 | ||
1113 | for (c1 = c + 1; c1 < rm->rm_cols; c1++) { | |
1114 | rc1 = &rm->rm_col[c1]; | |
1115 | ||
1116 | orig1 = zio_buf_alloc(rc1->rc_size); | |
1117 | bcopy(rc1->rc_data, orig1, rc1->rc_size); | |
1118 | ||
1119 | vdev_raidz_reconstruct_pq(rm, c, c1); | |
1120 | ||
1121 | if (zio_checksum_error(zio) == 0) { | |
1122 | zio_buf_free(orig, rc->rc_size); | |
1123 | zio_buf_free(orig1, rc1->rc_size); | |
34dc7c2f BB |
1124 | atomic_inc_64(&raidz_corrected_pq); |
1125 | ||
1126 | /* | |
1127 | * If these children didn't know they | |
1128 | * returned bad data, inform them. | |
1129 | */ | |
1130 | if (rc->rc_tried && rc->rc_error == 0) | |
1131 | raidz_checksum_error(zio, rc); | |
1132 | if (rc1->rc_tried && rc1->rc_error == 0) | |
1133 | raidz_checksum_error(zio, rc1); | |
1134 | ||
1135 | rc->rc_error = ECKSUM; | |
1136 | rc1->rc_error = ECKSUM; | |
1137 | ||
1138 | goto done; | |
1139 | } | |
1140 | ||
1141 | bcopy(orig1, rc1->rc_data, rc1->rc_size); | |
1142 | zio_buf_free(orig1, rc1->rc_size); | |
1143 | } | |
1144 | ||
1145 | bcopy(orig, rc->rc_data, rc->rc_size); | |
1146 | zio_buf_free(orig, rc->rc_size); | |
1147 | } | |
1148 | } | |
1149 | ||
1150 | /* | |
1151 | * All combinations failed to checksum. Generate checksum ereports for | |
1152 | * all children. | |
1153 | */ | |
1154 | zio->io_error = ECKSUM; | |
b128c09f | 1155 | |
34dc7c2f BB |
1156 | if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { |
1157 | for (c = 0; c < rm->rm_cols; c++) { | |
1158 | rc = &rm->rm_col[c]; | |
1159 | zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, | |
1160 | zio->io_spa, vd->vdev_child[rc->rc_devidx], zio, | |
1161 | rc->rc_offset, rc->rc_size); | |
1162 | } | |
1163 | } | |
1164 | ||
1165 | done: | |
1166 | zio_checksum_verified(zio); | |
1167 | ||
fb5f0bc8 | 1168 | if (zio->io_error == 0 && spa_writeable(zio->io_spa) && |
34dc7c2f | 1169 | (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { |
34dc7c2f BB |
1170 | /* |
1171 | * Use the good data we have in hand to repair damaged children. | |
34dc7c2f | 1172 | */ |
34dc7c2f BB |
1173 | for (c = 0; c < rm->rm_cols; c++) { |
1174 | rc = &rm->rm_col[c]; | |
1175 | cvd = vd->vdev_child[rc->rc_devidx]; | |
1176 | ||
1177 | if (rc->rc_error == 0) | |
1178 | continue; | |
1179 | ||
b128c09f | 1180 | zio_nowait(zio_vdev_child_io(zio, NULL, cvd, |
34dc7c2f BB |
1181 | rc->rc_offset, rc->rc_data, rc->rc_size, |
1182 | ZIO_TYPE_WRITE, zio->io_priority, | |
fb5f0bc8 BB |
1183 | ZIO_FLAG_IO_REPAIR | (unexpected_errors ? |
1184 | ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); | |
34dc7c2f | 1185 | } |
34dc7c2f | 1186 | } |
34dc7c2f BB |
1187 | } |
1188 | ||
1189 | static void | |
1190 | vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) | |
1191 | { | |
1192 | if (faulted > vd->vdev_nparity) | |
1193 | vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, | |
1194 | VDEV_AUX_NO_REPLICAS); | |
1195 | else if (degraded + faulted != 0) | |
1196 | vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); | |
1197 | else | |
1198 | vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); | |
1199 | } | |
1200 | ||
1201 | vdev_ops_t vdev_raidz_ops = { | |
1202 | vdev_raidz_open, | |
1203 | vdev_raidz_close, | |
34dc7c2f BB |
1204 | vdev_raidz_asize, |
1205 | vdev_raidz_io_start, | |
1206 | vdev_raidz_io_done, | |
1207 | vdev_raidz_state_change, | |
1208 | VDEV_TYPE_RAIDZ, /* name of this vdev type */ | |
1209 | B_FALSE /* not a leaf vdev */ | |
1210 | }; |