]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
1d3ba0bf | 9 | * or https://opensource.org/licenses/CDDL-1.0. |
34dc7c2f BB |
10 | * See the License for the specific language governing permissions |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | ||
22 | /* | |
428870ff | 23 | * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. |
4f072827 | 24 | * Copyright (c) 2012, 2020 by Delphix. All rights reserved. |
ab9f4b0b | 25 | * Copyright (c) 2016 Gvozden Nešković. All rights reserved. |
34dc7c2f BB |
26 | */ |
27 | ||
34dc7c2f BB |
28 | #include <sys/zfs_context.h> |
29 | #include <sys/spa.h> | |
5caeef02 DB |
30 | #include <sys/spa_impl.h> |
31 | #include <sys/zap.h> | |
34dc7c2f | 32 | #include <sys/vdev_impl.h> |
5caeef02 | 33 | #include <sys/metaslab_impl.h> |
34dc7c2f BB |
34 | #include <sys/zio.h> |
35 | #include <sys/zio_checksum.h> | |
5caeef02 | 36 | #include <sys/dmu_tx.h> |
a6255b7f | 37 | #include <sys/abd.h> |
5caeef02 | 38 | #include <sys/zfs_rlock.h> |
34dc7c2f BB |
39 | #include <sys/fs/zfs.h> |
40 | #include <sys/fm/fs/zfs.h> | |
ab9f4b0b GN |
41 | #include <sys/vdev_raidz.h> |
42 | #include <sys/vdev_raidz_impl.h> | |
b2255edc | 43 | #include <sys/vdev_draid.h> |
5caeef02 DB |
44 | #include <sys/uberblock_impl.h> |
45 | #include <sys/dsl_scan.h> | |
34dc7c2f | 46 | |
619f0976 | 47 | #ifdef ZFS_DEBUG |
1b939560 | 48 | #include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */ |
619f0976 GW |
49 | #endif |
50 | ||
34dc7c2f BB |
51 | /* |
52 | * Virtual device vector for RAID-Z. | |
53 | * | |
45d1cae3 BB |
54 | * This vdev supports single, double, and triple parity. For single parity, |
55 | * we use a simple XOR of all the data columns. For double or triple parity, | |
56 | * we use a special case of Reed-Solomon coding. This extends the | |
57 | * technique described in "The mathematics of RAID-6" by H. Peter Anvin by | |
58 | * drawing on the system described in "A Tutorial on Reed-Solomon Coding for | |
59 | * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the | |
60 | * former is also based. The latter is designed to provide higher performance | |
61 | * for writes. | |
62 | * | |
63 | * Note that the Plank paper claimed to support arbitrary N+M, but was then | |
64 | * amended six years later identifying a critical flaw that invalidates its | |
65 | * claims. Nevertheless, the technique can be adapted to work for up to | |
66 | * triple parity. For additional parity, the amendment "Note: Correction to | |
67 | * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding | |
68 | * is viable, but the additional complexity means that write performance will | |
69 | * suffer. | |
70 | * | |
71 | * All of the methods above operate on a Galois field, defined over the | |
72 | * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements | |
73 | * can be expressed with a single byte. Briefly, the operations on the | |
74 | * field are defined as follows: | |
34dc7c2f BB |
75 | * |
76 | * o addition (+) is represented by a bitwise XOR | |
77 | * o subtraction (-) is therefore identical to addition: A + B = A - B | |
78 | * o multiplication of A by 2 is defined by the following bitwise expression: | |
d3cc8b15 | 79 | * |
34dc7c2f BB |
80 | * (A * 2)_7 = A_6 |
81 | * (A * 2)_6 = A_5 | |
82 | * (A * 2)_5 = A_4 | |
83 | * (A * 2)_4 = A_3 + A_7 | |
84 | * (A * 2)_3 = A_2 + A_7 | |
85 | * (A * 2)_2 = A_1 + A_7 | |
86 | * (A * 2)_1 = A_0 | |
87 | * (A * 2)_0 = A_7 | |
88 | * | |
89 | * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). | |
45d1cae3 BB |
90 | * As an aside, this multiplication is derived from the error correcting |
91 | * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. | |
34dc7c2f BB |
92 | * |
93 | * Observe that any number in the field (except for 0) can be expressed as a | |
94 | * power of 2 -- a generator for the field. We store a table of the powers of | |
95 | * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can | |
96 | * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather | |
45d1cae3 BB |
97 | * than field addition). The inverse of a field element A (A^-1) is therefore |
98 | * A ^ (255 - 1) = A^254. | |
34dc7c2f | 99 | * |
45d1cae3 BB |
100 | * The up-to-three parity columns, P, Q, R over several data columns, |
101 | * D_0, ... D_n-1, can be expressed by field operations: | |
34dc7c2f BB |
102 | * |
103 | * P = D_0 + D_1 + ... + D_n-2 + D_n-1 | |
104 | * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 | |
105 | * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 | |
45d1cae3 BB |
106 | * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 |
107 | * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 | |
34dc7c2f | 108 | * |
e1cfd73f | 109 | * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial |
45d1cae3 BB |
110 | * XOR operation, and 2 and 4 can be computed quickly and generate linearly- |
111 | * independent coefficients. (There are no additional coefficients that have | |
112 | * this property which is why the uncorrected Plank method breaks down.) | |
113 | * | |
114 | * See the reconstruction code below for how P, Q and R can be used individually | |
115 | * or in concert to recover missing data columns. | |
34dc7c2f BB |
116 | */ |
117 | ||
34dc7c2f BB |
/*
 * Constants naming the P, Q and R parity columns.
 * NOTE(review): presumably used as column indices; confirm against the users.
 */
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

/*
 * Byte-wise field multiplication by 2 and by 4: shift left by one bit and
 * XOR in 0x1d when bit 7 of the operand was set.  The expansion is not
 * truncated to 8 bits, so only the low byte of the result is the field
 * element.  Note that (x) is evaluated more than once.
 */
#define	VDEV_RAIDZ_MUL_2(x)	((((x) & 0x80) ? 0x1d : 0) ^ ((x) << 1))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))
124 | ||
/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 *
 * Both macros update (x) in place and use (mask) as scratch storage, so both
 * arguments must be modifiable 64-bit lvalues; each is evaluated more than
 * once.
 *
 * The bodies are wrapped in do { } while (0) so that an invocation followed
 * by a semicolon forms exactly one statement.  A bare { } block would leave a
 * stray empty statement behind and break an unbraced if/else around the call.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
do { \
	/* Isolate the top bit of every byte ... */ \
	(mask) = (x) & 0x8080808080808080ULL; \
	/* ... and widen each set top bit into a full 0xff byte. */ \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	/* Shift each byte left by one and fold 0x1d in where needed. */ \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
} while (0)

/* Multiply every byte of (x) by 4, i.e. by 2 twice. */
#define	VDEV_RAIDZ_64MUL_4(x, mask) \
do { \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
} while (0)
34dc7c2f | 144 | |
5caeef02 DB |
145 | |
146 | /* | |
147 | * Big Theory Statement for how a RAIDZ VDEV is expanded | |
148 | * | |
149 | * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion | |
150 | * works with all three RAIDZ parity choices, including RAIDZ1, 2, or 3. VDEVs | |
151 | * that have been previously expanded can be expanded again. | |
152 | * | |
153 | * The RAIDZ VDEV must be healthy (must be able to write to all the drives in | |
154 | * the VDEV) when an expansion starts. And the expansion will pause if any | |
155 | * disk in the VDEV fails, and resume once the VDEV is healthy again. All other | |
156 | * operations on the pool can continue while an expansion is in progress (e.g. | |
157 | * read/write, snapshot, zpool add, etc). Except zpool checkpoint, zpool trim, | |
158 | * and zpool initialize which can't be run during an expansion. Following a | |
159 | * reboot or export/import, the expansion resumes where it left off. | |
160 | * | |
161 | * == Reflowing the Data == | |
162 | * | |
163 | * The expansion involves reflowing (copying) the data from the current set | |
164 | * of disks to spread it across the new set which now has one more disk. This | |
165 | * reflow operation is similar to reflowing text when the column width of a | |
166 | * text editor window is expanded. The text doesn’t change but the location of | |
167 | * the text changes to accommodate the new width. An example reflow result for | |
168 | * a 4-wide RAIDZ1 to a 5-wide is shown below. | |
169 | * | |
170 | * Reflow End State | |
171 | * Each letter indicates a parity group (logical stripe) | |
172 | * | |
173 | * Before expansion After Expansion | |
174 | * D1 D2 D3 D4 D1 D2 D3 D4 D5 | |
175 | * +------+------+------+------+ +------+------+------+------+------+ | |
176 | * | | | | | | | | | | | | |
177 | * | A | A | A | A | | A | A | A | A | B | | |
178 | * | 1| 2| 3| 4| | 1| 2| 3| 4| 5| | |
179 | * +------+------+------+------+ +------+------+------+------+------+ | |
180 | * | | | | | | | | | | | | |
181 | * | B | B | C | C | | B | C | C | C | C | | |
182 | * | 5| 6| 7| 8| | 6| 7| 8| 9| 10| | |
183 | * +------+------+------+------+ +------+------+------+------+------+ | |
184 | * | | | | | | | | | | | | |
185 | * | C | C | D | D | | D | D | E | E | E | | |
186 | * | 9| 10| 11| 12| | 11| 12| 13| 14| 15| | |
187 | * +------+------+------+------+ +------+------+------+------+------+ | |
188 | * | | | | | | | | | | | | |
189 | * | E | E | E | E | --> | E | F | F | G | G | | |
190 | * | 13| 14| 15| 16| | 16| 17| 18|p 19| 20| | |
191 | * +------+------+------+------+ +------+------+------+------+------+ | |
192 | * | | | | | | | | | | | | |
193 | * | F | F | G | G | | G | G | H | H | H | | |
194 | * | 17| 18| 19| 20| | 21| 22| 23| 24| 25| | |
195 | * +------+------+------+------+ +------+------+------+------+------+ | |
196 | * | | | | | | | | | | | | |
197 | * | G | G | H | H | | H | I | I | J | J | | |
198 | * | 21| 22| 23| 24| | 26| 27| 28| 29| 30| | |
199 | * +------+------+------+------+ +------+------+------+------+------+ | |
200 | * | | | | | | | | | | | | |
201 | * | H | H | I | I | | J | J | | | K | | |
202 | * | 25| 26| 27| 28| | 31| 32| 33| 34| 35| | |
203 | * +------+------+------+------+ +------+------+------+------+------+ | |
204 | * | |
205 | * This reflow approach has several advantages. There is no need to read or | |
206 | * modify the block pointers or recompute any block checksums. The reflow | |
207 | * doesn’t need to know where the parity sectors reside. We can read and write | |
208 | * data sequentially and the copy can occur in a background thread in open | |
209 | * context. The design also allows for fast discovery of what data to copy. | |
210 | * | |
211 | * The VDEV metaslabs are processed, one at a time, to copy the block data to | |
212 | * have it flow across all the disks. The metaslab is disabled for allocations | |
213 | * during the copy. As an optimization, we only copy the allocated data which | |
214 | * can be determined by looking at the metaslab range tree. During the copy we | |
215 | * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still | |
216 | * need to be able to survive losing parity count disks). This means we | |
217 | * cannot overwrite data during the reflow that would be needed if a disk is | |
218 | * lost. | |
219 | * | |
220 | * After the reflow completes, all newly-written blocks will have the new | |
221 | * layout, i.e., they will have the parity to data ratio implied by the new | |
222 | * number of disks in the RAIDZ group. Even though the reflow copies all of | |
223 | * the allocated space (data and parity), it is only rearranged, not changed. | |
224 | * | |
225 | * This act of reflowing the data has a few implications about blocks | |
226 | * that were written before the reflow completes: | |
227 | * | |
228 | * - Old blocks will still use the same amount of space (i.e., they will have | |
229 | * the parity to data ratio implied by the old number of disks in the RAIDZ | |
230 | * group). | |
231 | * - Reading old blocks will be slightly slower than before the reflow, for | |
232 | * two reasons. First, we will have to read from all disks in the RAIDZ | |
233 | * VDEV, rather than being able to skip the children that contain only | |
234 | * parity of this block (because the data of a single block is now spread | |
235 | * out across all the disks). Second, in most cases there will be an extra | |
236 | * bcopy, needed to rearrange the data back to its original layout in memory. | |
237 | * | |
238 | * == Scratch Area == | |
239 | * | |
240 | * As we copy the block data, we can only progress to the point that writes | |
241 | * will not overlap with blocks whose progress has not yet been recorded on | |
242 | * disk. Since partially-copied rows are always read from the old location, | |
243 | * we need to stop one row before the sector-wise overlap, to prevent any | |
244 | * row-wise overlap. For example, in the diagram above, when we reflow sector | |
245 | * B6 it will overwrite the original location for B5. | |
246 | * | |
247 | * To get around this, a scratch space is used so that we can start copying | |
248 | * without risking data loss by overlapping the row. As an added benefit, it | |
249 | * improves performance at the beginning of the reflow, but that small perf | |
250 | * boost wouldn't be worth the complexity on its own. | |
251 | * | |
252 | * Ideally we want to copy at least 2 * (new_width)^2 so that we have a | |
253 | * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max | |
254 | * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice | |
255 | * the widths will likely be single digits so we can get a substantial chunk | |
256 | * size using only a few MB of scratch per disk. | |
257 | * | |
258 | * The scratch area is persisted to disk which holds a large amount of reflowed | |
259 | * state. We can always read the partially written stripes when a disk fails or | |
260 | * the copy is interrupted (crash) during the initial copying phase and also | |
261 | * get past a small chunk size restriction. At a minimum, the scratch space | |
262 | * must be large enough to get us to the point that one row does not overlap | |
263 | * itself when moved (i.e new_width^2). But going larger is even better. We | |
264 | * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels | |
265 | * as our scratch space to handle overwriting the initial part of the VDEV. | |
266 | * | |
267 | * 0 256K 512K 4M | |
268 | * +------+------+-----------------------+----------------------------- | |
269 | * | VDEV | VDEV | Boot Block (3.5M) | Allocatable space ... | |
270 | * | L0 | L1 | Reserved | (Metaslabs) | |
271 | * +------+------+-----------------------+------------------------------- | |
272 | * Scratch Area | |
273 | * | |
274 | * == Reflow Progress Updates == | |
275 | * After the initial scratch-based reflow, the expansion process works | |
276 | * similarly to device removal. We create a new open context thread which | |
277 | * reflows the data, and periodically kicks off sync tasks to update logical | |
278 | * state. In this case, state is the committed progress (offset of next data | |
279 | * to copy). We need to persist the completed offset on disk, so that if we | |
280 | * crash we know which format each VDEV offset is in. | |
281 | * | |
282 | * == Time Dependent Geometry == | |
283 | * | |
284 | * In non-expanded RAIDZ, blocks are read from disk in a column by column | |
285 | * fashion. For a multi-row block, the second sector is in the first column | |
286 | * not in the second column. This allows us to issue full reads for each | |
287 | * column directly into the request buffer. The block data is thus laid out | |
288 | * sequentially in a column-by-column fashion. | |
289 | * | |
290 | * For example, in the before expansion diagram above, one logical block might | |
291 | * be sectors G19-H26. The parity is in G19,H23; and the data is in | |
292 | * G20,H24,G21,H25,G22,H26. | |
293 | * | |
294 | * After a block is reflowed, the sectors that were all in the original column | |
295 | * data can now reside in different columns. When reading from an expanded | |
296 | * VDEV, we need to know the logical stripe width for each block so we can | |
297 | * reconstitute the block’s data after the reads are completed. Likewise, | |
298 | * when we perform the combinatorial reconstruction we need to know the | |
299 | * original width so we can retry combinations from the past layouts. | |
300 | * | |
301 | * Time dependent geometry is what we call having blocks with different layouts | |
302 | * (stripe widths) in the same VDEV. This time-dependent geometry uses the | |
303 | * block’s birth time (+ the time expansion ended) to establish the correct | |
304 | * width for a given block. After an expansion completes, we record the time | |
305 | * for blocks written with a particular width (geometry). | |
306 | * | |
307 | * == On Disk Format Changes == | |
308 | * | |
309 | * New pool feature flag, 'raidz_expansion' whose reference count is the number | |
310 | * of RAIDZ VDEVs that have been expanded. | |
311 | * | |
312 | * The blocks on expanded RAIDZ VDEV can have different logical stripe widths. | |
313 | * | |
314 | * Since the uberblock can point to arbitrary blocks, which might be on the | |
315 | * expanding RAIDZ, and might or might not have been expanded, we need to know | |
316 | * which way a block is laid out before reading it. This info is the next | |
317 | * offset that needs to be reflowed and we persist that in the uberblock, in | |
318 | * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label. | |
319 | * After the expansion is complete, we then use the raidz_expand_txgs array | |
320 | * (see below) to determine how to read a block and the ub_raidz_reflow_info | |
321 | * field is no longer required. | |
322 | * | |
323 | * The uberblock's ub_raidz_reflow_info field also holds the scratch space | |
324 | * state (i.e., active or not) which is also required before reading a block | |
325 | * during the initial phase of reflowing the data. | |
326 | * | |
327 | * The top-level RAIDZ VDEV has two new entries in the nvlist: | |
328 | * | |
329 | * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here | |
330 | * and used after the expansion is complete to | |
331 | * determine how to read a raidz block | |
332 | * 'raidz_expanding' boolean: present during reflow and removed after completion | |
333 | * used during a spa import to resume an unfinished | |
334 | * expansion | |
335 | * | |
336 | * And finally the VDEVs top zap adds the following informational entries: | |
337 | * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE | |
338 | * VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME | |
339 | * VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME | |
340 | * VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED | |
341 | */ | |
342 | ||
343 | /* | |
344 | * For testing only: pause the raidz expansion after reflowing this amount. | |
345 | * (accessed by ZTS and ztest) | |
346 | */ | |
347 | #ifdef _KERNEL | |
348 | static | |
349 | #endif /* _KERNEL */ | |
350 | unsigned long raidz_expand_max_reflow_bytes = 0; | |
351 | ||
352 | /* | |
353 | * For testing only: pause the raidz expansion at a certain point. | |
354 | */ | |
355 | uint_t raidz_expand_pause_point = 0; | |
356 | ||
357 | /* | |
358 | * Maximum amount of copy io's outstanding at once. | |
359 | */ | |
360 | static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE; | |
361 | ||
362 | /* | |
363 | * Apply raidz map abds aggregation if the number of rows in the map is equal | |
364 | * or greater than the value below. | |
365 | */ | |
366 | static unsigned long raidz_io_aggregate_rows = 4; | |
367 | ||
368 | /* | |
369 | * Automatically start a pool scrub when a RAIDZ expansion completes in | |
370 | * order to verify the checksums of all blocks which have been copied | |
371 | * during the expansion. Automatic scrubbing is enabled by default and | |
372 | * is strongly recommended. | |
373 | */ | |
374 | static int zfs_scrub_after_expand = 1; | |
375 | ||
b2255edc BB |
376 | static void |
377 | vdev_raidz_row_free(raidz_row_t *rr) | |
b128c09f | 378 | { |
e2af2acc MA |
379 | for (int c = 0; c < rr->rr_cols; c++) { |
380 | raidz_col_t *rc = &rr->rr_col[c]; | |
b128c09f | 381 | |
e2af2acc MA |
382 | if (rc->rc_size != 0) |
383 | abd_free(rc->rc_abd); | |
e2af2acc | 384 | if (rc->rc_orig_data != NULL) |
330c6c05 | 385 | abd_free(rc->rc_orig_data); |
b2255edc BB |
386 | } |
387 | ||
b2255edc BB |
388 | if (rr->rr_abd_empty != NULL) |
389 | abd_free(rr->rr_abd_empty); | |
390 | ||
391 | kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols])); | |
392 | } | |
428870ff | 393 | |
b2255edc BB |
394 | void |
395 | vdev_raidz_map_free(raidz_map_t *rm) | |
396 | { | |
397 | for (int i = 0; i < rm->rm_nrows; i++) | |
398 | vdev_raidz_row_free(rm->rm_row[i]); | |
428870ff | 399 | |
5caeef02 DB |
400 | if (rm->rm_nphys_cols) { |
401 | for (int i = 0; i < rm->rm_nphys_cols; i++) { | |
402 | if (rm->rm_phys_col[i].rc_abd != NULL) | |
403 | abd_free(rm->rm_phys_col[i].rc_abd); | |
404 | } | |
405 | ||
406 | kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) * | |
407 | rm->rm_nphys_cols); | |
408 | } | |
409 | ||
410 | ASSERT3P(rm->rm_lr, ==, NULL); | |
b2255edc | 411 | kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows])); |
b128c09f BB |
412 | } |
413 | ||
428870ff BB |
414 | static void |
415 | vdev_raidz_map_free_vsd(zio_t *zio) | |
416 | { | |
417 | raidz_map_t *rm = zio->io_vsd; | |
418 | ||
330c6c05 | 419 | vdev_raidz_map_free(rm); |
428870ff BB |
420 | } |
421 | ||
5caeef02 DB |
422 | static int |
423 | vdev_raidz_reflow_compare(const void *x1, const void *x2) | |
424 | { | |
425 | const reflow_node_t *l = x1; | |
426 | const reflow_node_t *r = x2; | |
427 | ||
428 | return (TREE_CMP(l->re_txg, r->re_txg)); | |
429 | } | |
430 | ||
330c6c05 | 431 | const zio_vsd_ops_t vdev_raidz_vsd_ops = { |
56d8d8ac | 432 | .vsd_free = vdev_raidz_map_free_vsd, |
428870ff BB |
433 | }; |
434 | ||
5caeef02 DB |
435 | raidz_row_t * |
436 | vdev_raidz_row_alloc(int cols) | |
437 | { | |
438 | raidz_row_t *rr = | |
439 | kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP); | |
440 | ||
441 | rr->rr_cols = cols; | |
442 | rr->rr_scols = cols; | |
443 | ||
444 | for (int c = 0; c < cols; c++) { | |
445 | raidz_col_t *rc = &rr->rr_col[c]; | |
446 | rc->rc_shadow_devidx = INT_MAX; | |
447 | rc->rc_shadow_offset = UINT64_MAX; | |
448 | rc->rc_allow_repair = 1; | |
449 | } | |
450 | return (rr); | |
451 | } | |
452 | ||
345196be BA |
453 | static void |
454 | vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift) | |
455 | { | |
456 | int c; | |
457 | int nwrapped = 0; | |
458 | uint64_t off = 0; | |
459 | raidz_row_t *rr = rm->rm_row[0]; | |
460 | ||
461 | ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); | |
462 | ASSERT3U(rm->rm_nrows, ==, 1); | |
463 | ||
464 | /* | |
465 | * Pad any parity columns with additional space to account for skip | |
466 | * sectors. | |
467 | */ | |
468 | if (rm->rm_skipstart < rr->rr_firstdatacol) { | |
469 | ASSERT0(rm->rm_skipstart); | |
470 | nwrapped = rm->rm_nskip; | |
471 | } else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) { | |
472 | nwrapped = | |
473 | (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols; | |
474 | } | |
475 | ||
476 | /* | |
477 | * Optional single skip sectors (rc_size == 0) will be handled in | |
478 | * vdev_raidz_io_start_write(). | |
479 | */ | |
480 | int skipped = rr->rr_scols - rr->rr_cols; | |
481 | ||
482 | /* Allocate buffers for the parity columns */ | |
483 | for (c = 0; c < rr->rr_firstdatacol; c++) { | |
484 | raidz_col_t *rc = &rr->rr_col[c]; | |
485 | ||
486 | /* | |
487 | * Parity columns will pad out a linear ABD to account for | |
488 | * the skip sector. A linear ABD is used here because | |
489 | * parity calculations use the ABD buffer directly to calculate | |
490 | * parity. This avoids doing a memcpy back to the ABD after the | |
491 | * parity has been calculated. By issuing the parity column | |
492 | * with the skip sector we can reduce contention on the child | |
493 | * VDEV queue locks (vq_lock). | |
494 | */ | |
495 | if (c < nwrapped) { | |
496 | rc->rc_abd = abd_alloc_linear( | |
497 | rc->rc_size + (1ULL << ashift), B_FALSE); | |
498 | abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift); | |
499 | skipped++; | |
500 | } else { | |
501 | rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); | |
502 | } | |
503 | } | |
504 | ||
505 | for (off = 0; c < rr->rr_cols; c++) { | |
506 | raidz_col_t *rc = &rr->rr_col[c]; | |
507 | abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct, | |
508 | zio->io_abd, off, rc->rc_size); | |
509 | ||
510 | /* | |
511 | * Generate I/O for skip sectors to improve aggregation | |
512 | * continuity. We will use gang ABD's to reduce contention | |
513 | * on the child VDEV queue locks (vq_lock) by issuing | |
514 | * a single I/O that contains the data and skip sector. | |
515 | * | |
516 | * It is important to make sure that rc_size is not updated | |
517 | * even though we are adding a skip sector to the ABD. When | |
518 | * calculating the parity in vdev_raidz_generate_parity_row() | |
519 | * the rc_size is used to iterate through the ABD's. We can | |
520 | * not have zero'd out skip sectors used for calculating | |
521 | * parity for raidz, because those same sectors are not used | |
522 | * during reconstruction. | |
523 | */ | |
524 | if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) { | |
525 | rc->rc_abd = abd_alloc_gang(); | |
526 | abd_gang_add(rc->rc_abd, abd, B_TRUE); | |
527 | abd_gang_add(rc->rc_abd, | |
528 | abd_get_zeros(1ULL << ashift), B_TRUE); | |
529 | skipped++; | |
530 | } else { | |
531 | rc->rc_abd = abd; | |
532 | } | |
533 | off += rc->rc_size; | |
534 | } | |
535 | ||
536 | ASSERT3U(off, ==, zio->io_size); | |
537 | ASSERT3S(skipped, ==, rm->rm_nskip); | |
538 | } | |
539 | ||
540 | static void | |
541 | vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm) | |
542 | { | |
543 | int c; | |
544 | raidz_row_t *rr = rm->rm_row[0]; | |
545 | ||
546 | ASSERT3U(rm->rm_nrows, ==, 1); | |
547 | ||
548 | /* Allocate buffers for the parity columns */ | |
549 | for (c = 0; c < rr->rr_firstdatacol; c++) | |
550 | rr->rr_col[c].rc_abd = | |
551 | abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE); | |
552 | ||
553 | for (uint64_t off = 0; c < rr->rr_cols; c++) { | |
554 | raidz_col_t *rc = &rr->rr_col[c]; | |
555 | rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct, | |
556 | zio->io_abd, off, rc->rc_size); | |
557 | off += rc->rc_size; | |
558 | } | |
559 | } | |
560 | ||
e49f1e20 WA |
561 | /* |
562 | * Divides the IO evenly across all child vdevs; usually, dcols is | |
563 | * the number of children in the target vdev. | |
a1687880 BB |
564 | * |
565 | * Avoid inlining the function to keep vdev_raidz_io_start(), which | |
566 | * is this functions only caller, as small as possible on the stack. | |
e49f1e20 | 567 | */ |
ab9f4b0b | 568 | noinline raidz_map_t * |
3d6da72d | 569 | vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, |
34dc7c2f BB |
570 | uint64_t nparity) |
571 | { | |
b2255edc | 572 | raidz_row_t *rr; |
e49f1e20 | 573 | /* The starting RAIDZ (parent) vdev sector of the block. */ |
3d6da72d | 574 | uint64_t b = zio->io_offset >> ashift; |
e49f1e20 | 575 | /* The zio's size in units of the vdev's minimum sector size. */ |
3d6da72d | 576 | uint64_t s = zio->io_size >> ashift; |
e49f1e20 | 577 | /* The first column for this stripe. */ |
34dc7c2f | 578 | uint64_t f = b % dcols; |
e49f1e20 | 579 | /* The starting byte offset on each child vdev. */ |
3d6da72d | 580 | uint64_t o = (b / dcols) << ashift; |
5caeef02 | 581 | uint64_t acols, scols; |
34dc7c2f | 582 | |
b2255edc BB |
583 | raidz_map_t *rm = |
584 | kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP); | |
585 | rm->rm_nrows = 1; | |
586 | ||
e49f1e20 WA |
587 | /* |
588 | * "Quotient": The number of data sectors for this stripe on all but | |
589 | * the "big column" child vdevs that also contain "remainder" data. | |
590 | */ | |
5caeef02 | 591 | uint64_t q = s / (dcols - nparity); |
e49f1e20 WA |
592 | |
593 | /* | |
594 | * "Remainder": The number of partial stripe data sectors in this I/O. | |
595 | * This will add a sector to some, but not all, child vdevs. | |
596 | */ | |
5caeef02 | 597 | uint64_t r = s - q * (dcols - nparity); |
e49f1e20 WA |
598 | |
599 | /* The number of "big columns" - those which contain remainder data. */ | |
5caeef02 | 600 | uint64_t bc = (r == 0 ? 0 : r + nparity); |
e49f1e20 WA |
601 | |
602 | /* | |
603 | * The total number of data and parity sectors associated with | |
604 | * this I/O. | |
605 | */ | |
5caeef02 | 606 | uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1)); |
45d1cae3 | 607 | |
b2255edc BB |
608 | /* |
609 | * acols: The columns that will be accessed. | |
610 | * scols: The columns that will be accessed or skipped. | |
611 | */ | |
45d1cae3 | 612 | if (q == 0) { |
e49f1e20 | 613 | /* Our I/O request doesn't span all child vdevs. */ |
45d1cae3 BB |
614 | acols = bc; |
615 | scols = MIN(dcols, roundup(bc, nparity + 1)); | |
616 | } else { | |
617 | acols = dcols; | |
618 | scols = dcols; | |
619 | } | |
34dc7c2f | 620 | |
45d1cae3 | 621 | ASSERT3U(acols, <=, scols); |
5caeef02 | 622 | rr = vdev_raidz_row_alloc(scols); |
b2255edc | 623 | rm->rm_row[0] = rr; |
b2255edc | 624 | rr->rr_cols = acols; |
b2255edc | 625 | rr->rr_bigcols = bc; |
b2255edc | 626 | rr->rr_firstdatacol = nparity; |
b2255edc BB |
627 | #ifdef ZFS_DEBUG |
628 | rr->rr_offset = zio->io_offset; | |
629 | rr->rr_size = zio->io_size; | |
630 | #endif | |
34dc7c2f | 631 | |
5caeef02 | 632 | uint64_t asize = 0; |
45d1cae3 | 633 | |
5caeef02 | 634 | for (uint64_t c = 0; c < scols; c++) { |
b2255edc | 635 | raidz_col_t *rc = &rr->rr_col[c]; |
5caeef02 DB |
636 | uint64_t col = f + c; |
637 | uint64_t coff = o; | |
34dc7c2f BB |
638 | if (col >= dcols) { |
639 | col -= dcols; | |
3d6da72d | 640 | coff += 1ULL << ashift; |
34dc7c2f | 641 | } |
b2255edc BB |
642 | rc->rc_devidx = col; |
643 | rc->rc_offset = coff; | |
45d1cae3 BB |
644 | |
645 | if (c >= acols) | |
b2255edc | 646 | rc->rc_size = 0; |
45d1cae3 | 647 | else if (c < bc) |
b2255edc | 648 | rc->rc_size = (q + 1) << ashift; |
45d1cae3 | 649 | else |
b2255edc | 650 | rc->rc_size = q << ashift; |
45d1cae3 | 651 | |
b2255edc | 652 | asize += rc->rc_size; |
34dc7c2f BB |
653 | } |
654 | ||
3d6da72d | 655 | ASSERT3U(asize, ==, tot << ashift); |
428870ff | 656 | rm->rm_nskip = roundup(tot, nparity + 1) - tot; |
b2255edc | 657 | rm->rm_skipstart = bc; |
34dc7c2f | 658 | |
34dc7c2f BB |
659 | /* |
660 | * If all data stored spans all columns, there's a danger that parity | |
661 | * will always be on the same device and, since parity isn't read | |
e1cfd73f | 662 | * during normal operation, that device's I/O bandwidth won't be |
34dc7c2f BB |
663 | * used effectively. We therefore switch the parity every 1MB. |
664 | * | |
665 | * ... at least that was, ostensibly, the theory. As a practical | |
666 | * matter unless we juggle the parity between all devices evenly, we | |
667 | * won't see any benefit. Further, occasional writes that aren't a | |
668 | * multiple of the LCM of the number of children and the minimum | |
669 | * stripe width are sufficient to avoid pessimal behavior. | |
670 | * Unfortunately, this decision created an implicit on-disk format | |
671 | * requirement that we need to support for all eternity, but only | |
672 | * for single-parity RAID-Z. | |
428870ff BB |
673 | * |
674 | * If we intend to skip a sector in the zeroth column for padding | |
675 | * we must make sure to note this swap. We will never intend to | |
676 | * skip the first column since at least one data and one parity | |
677 | * column must appear in each row. | |
34dc7c2f | 678 | */ |
b2255edc BB |
679 | ASSERT(rr->rr_cols >= 2); |
680 | ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size); | |
34dc7c2f | 681 | |
b2255edc | 682 | if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { |
5caeef02 | 683 | uint64_t devidx = rr->rr_col[0].rc_devidx; |
b2255edc BB |
684 | o = rr->rr_col[0].rc_offset; |
685 | rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx; | |
686 | rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset; | |
687 | rr->rr_col[1].rc_devidx = devidx; | |
688 | rr->rr_col[1].rc_offset = o; | |
428870ff BB |
689 | if (rm->rm_skipstart == 0) |
690 | rm->rm_skipstart = 1; | |
34dc7c2f BB |
691 | } |
692 | ||
345196be BA |
693 | if (zio->io_type == ZIO_TYPE_WRITE) { |
694 | vdev_raidz_map_alloc_write(zio, rm, ashift); | |
695 | } else { | |
696 | vdev_raidz_map_alloc_read(zio, rm); | |
697 | } | |
5caeef02 DB |
698 | /* init RAIDZ parity ops */ |
699 | rm->rm_ops = vdev_raidz_math_get_ops(); | |
700 | ||
701 | return (rm); | |
702 | } | |
703 | ||
/*
 * Build the raidz_map_t for a zio whose block was laid out logical_cols wide
 * on a RAIDZ vdev that now has physical_cols children. The map holds one
 * raidz_row_t per logical row of the block, and every non-empty column in a
 * row is exactly one sector (1 << ashift bytes).
 *
 * Everything before reflow_offset_synced should have been moved to the new
 * location (read and write completed). However, this may not yet be reflected
 * in the on-disk format (e.g. raidz_reflow_sync() has been called but the
 * uberblock has not yet been written). If reflow is not in progress,
 * reflow_offset_synced should be UINT64_MAX. For each row, if the row is
 * entirely before reflow_offset_synced, it will come from the new location.
 * Otherwise this row will come from the old location. Therefore, rows that
 * straddle the reflow_offset_synced will come from the old location.
 *
 * For writes, reflow_offset_next is the next offset to copy. If a sector has
 * been copied, but not yet reflected in the on-disk progress
 * (reflow_offset_synced), it will also be written to the new (already copied)
 * offset.
 *
 * Arguments:
 *	zio			supplies io_abd, io_offset and io_size
 *	ashift			log2 of the sector size
 *	physical_cols		child count used for rows at the new location;
 *				rows at the old location use physical_cols - 1
 *	logical_cols		width (data + parity) the block was laid out
 *				with
 *	nparity			number of parity columns at the start of
 *				each row
 *	reflow_offset_synced	see above
 *	reflow_offset_next	see above
 *	use_scratch		if set, rows that lie entirely before
 *				reflow_offset_synced have VDEV_BOOT_SIZE
 *				subtracted from their primary and shadow
 *				offsets
 *
 * Returns the newly allocated map (allocations use KM_SLEEP). Data columns
 * reference zio->io_abd; parity columns get their own buffers, either carved
 * out of per-child aggregate buffers (rm_phys_col) or allocated individually.
 */
noinline raidz_map_t *
vdev_raidz_map_alloc_expanded(zio_t *zio,
    uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
    uint64_t nparity, uint64_t reflow_offset_synced,
    uint64_t reflow_offset_next, boolean_t use_scratch)
{
	abd_t *abd = zio->io_abd;
	uint64_t offset = zio->io_offset;
	uint64_t size = zio->io_size;

	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> ashift;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 * AKA "full rows"
	 */
	uint64_t q = s / (logical_cols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	uint64_t r = s - q * (logical_cols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	uint64_t bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* How many rows contain data (not skip) */
	uint64_t rows = howmany(tot, logical_cols);
	/*
	 * Width of every row's column array: the full logical width, unless
	 * the whole block has fewer sectors than that.
	 */
	int cols = MIN(tot, logical_cols);

	/*
	 * The allocation is sized with offsetof() so that it covers exactly
	 * `rows` entries of rm_row[].
	 */
	raidz_map_t *rm =
	    kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
	    KM_SLEEP);
	rm->rm_nrows = rows;
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	rm->rm_skipstart = bc;
	/* Running byte total of mapped sectors; checked against tot below. */
	uint64_t asize = 0;

	for (uint64_t row = 0; row < rows; row++) {
		boolean_t row_use_scratch = B_FALSE;
		raidz_row_t *rr = vdev_raidz_row_alloc(cols);
		rm->rm_row[row] = rr;

		/* The starting RAIDZ (parent) vdev sector of the row. */
		uint64_t b = (offset >> ashift) + row * logical_cols;

		/*
		 * If we are in the middle of a reflow, and the copying has
		 * not yet completed for any part of this row, then use the
		 * old location of this row.  Note that reflow_offset_synced
		 * reflects the i/o that's been completed, because it's
		 * updated by a synctask, after zio_wait(spa_txg_zio[]).
		 * This is sufficient for our check, even if that progress
		 * has not yet been recorded to disk (reflected in
		 * spa_ubsync).  Also note that we consider the last row to
		 * be "full width" (`cols`-wide rather than `bc`-wide) for
		 * this calculation. This causes a tiny bit of unnecessary
		 * double-writes but is safe and simpler to calculate.
		 */
		int row_phys_cols = physical_cols;
		if (b + cols > reflow_offset_synced >> ashift)
			row_phys_cols--;
		else if (use_scratch)
			row_use_scratch = B_TRUE;

		/* starting child of this row */
		uint64_t child_id = b % row_phys_cols;
		/* The starting byte offset on each child vdev. */
		uint64_t child_offset = (b / row_phys_cols) << ashift;

		/*
		 * Note, rr_cols is the entire width of the block, even
		 * if this row is shorter.  This is needed because parity
		 * generation (for Q and R) needs to know the entire width,
		 * because it treats the short row as though it was
		 * full-width (and the "phantom" sectors were zero-filled).
		 *
		 * Another approach to this would be to set cols shorter
		 * (to just the number of columns that we might do i/o to)
		 * and have another mechanism to tell the parity generation
		 * about the "entire width".  Reconstruction (at least
		 * vdev_raidz_reconstruct_general()) would also need to
		 * know about the "entire width".
		 */
		rr->rr_firstdatacol = nparity;
#ifdef ZFS_DEBUG
		/*
		 * note: rr_size is PSIZE, not ASIZE
		 */
		rr->rr_offset = b << ashift;
		rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
#endif

		/*
		 * Walk the row's columns, advancing across the children and
		 * wrapping to the next sector on child 0 after the last one.
		 */
		for (int c = 0; c < rr->rr_cols; c++, child_id++) {
			if (child_id >= row_phys_cols) {
				child_id -= row_phys_cols;
				child_offset += 1ULL << ashift;
			}
			raidz_col_t *rc = &rr->rr_col[c];
			rc->rc_devidx = child_id;
			rc->rc_offset = child_offset;

			/*
			 * Get this from the scratch space if appropriate.
			 * This only happens if we crashed in the middle of
			 * raidz_reflow_scratch_sync() (while it's running,
			 * the rangelock prevents us from doing concurrent
			 * io), and even then only during zpool import or
			 * when the pool is imported readonly.
			 */
			if (row_use_scratch)
				rc->rc_offset -= VDEV_BOOT_SIZE;

			/*
			 * Index among the data columns; only used in the
			 * data branch below, where c >= rr_firstdatacol.
			 */
			uint64_t dc = c - rr->rr_firstdatacol;
			if (c < rr->rr_firstdatacol) {
				rc->rc_size = 1ULL << ashift;

				/*
				 * Parity sectors' rc_abd's are set below
				 * after determining if this is an aggregation.
				 */
			} else if (row == rows - 1 && bc != 0 && c >= bc) {
				/*
				 * Past the end of the block (even including
				 * skip sectors).  This sector is part of the
				 * map so that we have full rows for p/q parity
				 * generation.
				 */
				rc->rc_size = 0;
				rc->rc_abd = NULL;
			} else {
				/* "data column" (col excluding parity) */
				uint64_t off;

				/*
				 * `off` is the sector index of this column's
				 * data within the zio's abd: data columns in
				 * the first `r` positions contribute `rows`
				 * sectors each, later ones `rows - 1`.
				 */
				if (c < bc || r == 0) {
					off = dc * rows + row;
				} else {
					off = r * rows +
					    (dc - r) * (rows - 1) + row;
				}
				rc->rc_size = 1ULL << ashift;
				rc->rc_abd = abd_get_offset_struct(
				    &rc->rc_abdstruct, abd, off << ashift,
				    rc->rc_size);
			}

			if (rc->rc_size == 0)
				continue;

			/*
			 * If any part of this row is in both old and new
			 * locations, the primary location is the old
			 * location. If this sector was already copied to the
			 * new location, we need to also write to the new,
			 * "shadow" location.
			 *
			 * Note, `row_phys_cols != physical_cols` indicates
			 * that the primary location is the old location.
			 * `b+c < reflow_offset_next` indicates that the copy
			 * to the new location has been initiated. We know
			 * that the copy has completed because we have the
			 * rangelock, which is held exclusively while the
			 * copy is in progress.
			 */
			if (row_use_scratch ||
			    (row_phys_cols != physical_cols &&
			    b + c < reflow_offset_next >> ashift)) {
				rc->rc_shadow_devidx = (b + c) % physical_cols;
				rc->rc_shadow_offset =
				    ((b + c) / physical_cols) << ashift;
				if (row_use_scratch)
					rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
			}

			asize += rc->rc_size;
		}

		/*
		 * See comment in vdev_raidz_map_alloc()
		 */
		if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
		    (offset & (1ULL << 20))) {
			ASSERT(rr->rr_cols >= 2);
			ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);

			/*
			 * Swap the device location (primary and shadow) of
			 * columns 0 and 1; sizes and abds stay where they are.
			 */
			int devidx0 = rr->rr_col[0].rc_devidx;
			uint64_t offset0 = rr->rr_col[0].rc_offset;
			int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
			uint64_t shadow_offset0 =
			    rr->rr_col[0].rc_shadow_offset;

			rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
			rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
			rr->rr_col[0].rc_shadow_devidx =
			    rr->rr_col[1].rc_shadow_devidx;
			rr->rr_col[0].rc_shadow_offset =
			    rr->rr_col[1].rc_shadow_offset;

			rr->rr_col[1].rc_devidx = devidx0;
			rr->rr_col[1].rc_offset = offset0;
			rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
			rr->rr_col[1].rc_shadow_offset = shadow_offset0;
		}
	}
	ASSERT3U(asize, ==, tot << ashift);

	/*
	 * Determine if the block is contiguous, in which case we can use
	 * an aggregation.
	 */
	if (rows >= raidz_io_aggregate_rows) {
		rm->rm_nphys_cols = physical_cols;
		rm->rm_phys_col =
		    kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
		    KM_SLEEP);

		/*
		 * Determine the aggregate io's offset and size, and check
		 * that the io is contiguous.
		 *
		 * The outer loop also stops once rm_phys_col has been
		 * released by the discontiguity branch below (the `break`
		 * there only leaves the inner loop).
		 */
		for (int i = 0;
		    i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) {
			raidz_row_t *rr = rm->rm_row[i];
			for (int c = 0; c < rr->rr_cols; c++) {
				raidz_col_t *rc = &rr->rr_col[c];
				raidz_col_t *prc =
				    &rm->rm_phys_col[rc->rc_devidx];

				if (rc->rc_size == 0)
					continue;

				/*
				 * rm_phys_col was zero-filled, so a zero
				 * rc_size means this is the first sector
				 * seen for that child.
				 */
				if (prc->rc_size == 0) {
					ASSERT0(prc->rc_offset);
					prc->rc_offset = rc->rc_offset;
				} else if (prc->rc_offset + prc->rc_size !=
				    rc->rc_offset) {
					/*
					 * This block is not contiguous and
					 * therefore can't be aggregated.
					 * This is expected to be rare, so
					 * the cost of allocating and then
					 * freeing rm_phys_col is not
					 * significant.
					 */
					kmem_free(rm->rm_phys_col,
					    sizeof (raidz_col_t) *
					    rm->rm_nphys_cols);
					rm->rm_phys_col = NULL;
					rm->rm_nphys_cols = 0;
					break;
				}
				prc->rc_size += rc->rc_size;
			}
		}
	}
	/*
	 * rm was zero-filled, so rm_phys_col is non-NULL here only if the
	 * aggregation above was attempted and found every child contiguous.
	 */
	if (rm->rm_phys_col != NULL) {
		/*
		 * Allocate aggregate ABD's.
		 */
		for (int i = 0; i < rm->rm_nphys_cols; i++) {
			raidz_col_t *prc = &rm->rm_phys_col[i];

			prc->rc_devidx = i;

			if (prc->rc_size == 0)
				continue;

			prc->rc_abd =
			    abd_alloc_linear(rm->rm_phys_col[i].rc_size,
			    B_FALSE);
		}

		/*
		 * Point the parity abd's into the aggregate abd's.
		 */
		for (int i = 0; i < rm->rm_nrows; i++) {
			raidz_row_t *rr = rm->rm_row[i];
			for (int c = 0; c < rr->rr_firstdatacol; c++) {
				raidz_col_t *rc = &rr->rr_col[c];
				raidz_col_t *prc =
				    &rm->rm_phys_col[rc->rc_devidx];
				rc->rc_abd =
				    abd_get_offset_struct(&rc->rc_abdstruct,
				    prc->rc_abd,
				    rc->rc_offset - prc->rc_offset,
				    rc->rc_size);
			}
		}
	} else {
		/*
		 * Allocate new abd's for the parity sectors.
		 */
		for (int i = 0; i < rm->rm_nrows; i++) {
			raidz_row_t *rr = rm->rm_row[i];
			for (int c = 0; c < rr->rr_firstdatacol; c++) {
				raidz_col_t *rc = &rr->rr_col[c];
				rc->rc_abd =
				    abd_alloc_linear(rc->rc_size,
				    B_TRUE);
			}
		}
	}
	/* init RAIDZ parity ops */
	rm->rm_ops = vdev_raidz_math_get_ops();

	return (rm);
}
1035 | ||
a6255b7f DQ |
/*
 * Write cursors into the P, Q and R parity buffers, handed as the private
 * argument to the vdev_raidz_p*_func() callbacks. Unused cursors are NULL.
 * Member order matters: users initialize this struct positionally.
 */
struct pqr_struct {
	uint64_t *p, *q, *r;
};
1041 | ||
1042 | static int | |
1043 | vdev_raidz_p_func(void *buf, size_t size, void *private) | |
1044 | { | |
1045 | struct pqr_struct *pqr = private; | |
1046 | const uint64_t *src = buf; | |
5caeef02 | 1047 | int cnt = size / sizeof (src[0]); |
a6255b7f DQ |
1048 | |
1049 | ASSERT(pqr->p && !pqr->q && !pqr->r); | |
1050 | ||
5caeef02 | 1051 | for (int i = 0; i < cnt; i++, src++, pqr->p++) |
a6255b7f DQ |
1052 | *pqr->p ^= *src; |
1053 | ||
1054 | return (0); | |
1055 | } | |
1056 | ||
1057 | static int | |
1058 | vdev_raidz_pq_func(void *buf, size_t size, void *private) | |
1059 | { | |
1060 | struct pqr_struct *pqr = private; | |
1061 | const uint64_t *src = buf; | |
1062 | uint64_t mask; | |
5caeef02 | 1063 | int cnt = size / sizeof (src[0]); |
a6255b7f DQ |
1064 | |
1065 | ASSERT(pqr->p && pqr->q && !pqr->r); | |
1066 | ||
5caeef02 | 1067 | for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { |
a6255b7f DQ |
1068 | *pqr->p ^= *src; |
1069 | VDEV_RAIDZ_64MUL_2(*pqr->q, mask); | |
1070 | *pqr->q ^= *src; | |
1071 | } | |
1072 | ||
1073 | return (0); | |
1074 | } | |
1075 | ||
1076 | static int | |
1077 | vdev_raidz_pqr_func(void *buf, size_t size, void *private) | |
1078 | { | |
1079 | struct pqr_struct *pqr = private; | |
1080 | const uint64_t *src = buf; | |
1081 | uint64_t mask; | |
5caeef02 | 1082 | int cnt = size / sizeof (src[0]); |
a6255b7f DQ |
1083 | |
1084 | ASSERT(pqr->p && pqr->q && pqr->r); | |
1085 | ||
5caeef02 | 1086 | for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { |
a6255b7f DQ |
1087 | *pqr->p ^= *src; |
1088 | VDEV_RAIDZ_64MUL_2(*pqr->q, mask); | |
1089 | *pqr->q ^= *src; | |
1090 | VDEV_RAIDZ_64MUL_4(*pqr->r, mask); | |
1091 | *pqr->r ^= *src; | |
1092 | } | |
1093 | ||
1094 | return (0); | |
1095 | } | |
1096 | ||
34dc7c2f | 1097 | static void |
b2255edc | 1098 | vdev_raidz_generate_parity_p(raidz_row_t *rr) |
34dc7c2f | 1099 | { |
b2255edc | 1100 | uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); |
34dc7c2f | 1101 | |
b2255edc BB |
1102 | for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { |
1103 | abd_t *src = rr->rr_col[c].rc_abd; | |
34dc7c2f | 1104 | |
b2255edc BB |
1105 | if (c == rr->rr_firstdatacol) { |
1106 | abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); | |
34dc7c2f | 1107 | } else { |
a6255b7f | 1108 | struct pqr_struct pqr = { p, NULL, NULL }; |
b2255edc | 1109 | (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, |
a6255b7f | 1110 | vdev_raidz_p_func, &pqr); |
34dc7c2f BB |
1111 | } |
1112 | } | |
1113 | } | |
1114 | ||
1115 | static void | |
b2255edc | 1116 | vdev_raidz_generate_parity_pq(raidz_row_t *rr) |
34dc7c2f | 1117 | { |
b2255edc BB |
1118 | uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); |
1119 | uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); | |
1120 | uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); | |
1121 | ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == | |
1122 | rr->rr_col[VDEV_RAIDZ_Q].rc_size); | |
34dc7c2f | 1123 | |
b2255edc BB |
1124 | for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { |
1125 | abd_t *src = rr->rr_col[c].rc_abd; | |
45d1cae3 | 1126 | |
b2255edc | 1127 | uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); |
34dc7c2f | 1128 | |
b2255edc | 1129 | if (c == rr->rr_firstdatacol) { |
f7e76821 | 1130 | ASSERT(ccnt == pcnt || ccnt == 0); |
b2255edc BB |
1131 | abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); |
1132 | (void) memcpy(q, p, rr->rr_col[c].rc_size); | |
45d1cae3 | 1133 | |
b2255edc | 1134 | for (uint64_t i = ccnt; i < pcnt; i++) { |
a6255b7f DQ |
1135 | p[i] = 0; |
1136 | q[i] = 0; | |
45d1cae3 | 1137 | } |
a6255b7f | 1138 | } else { |
f7e76821 IH |
1139 | struct pqr_struct pqr = { p, q, NULL }; |
1140 | ||
1141 | ASSERT(ccnt <= pcnt); | |
b2255edc | 1142 | (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, |
f7e76821 | 1143 | vdev_raidz_pq_func, &pqr); |
45d1cae3 BB |
1144 | |
1145 | /* | |
1146 | * Treat short columns as though they are full of 0s. | |
1147 | * Note that there's therefore nothing needed for P. | |
1148 | */ | |
b2255edc BB |
1149 | uint64_t mask; |
1150 | for (uint64_t i = ccnt; i < pcnt; i++) { | |
a6255b7f | 1151 | VDEV_RAIDZ_64MUL_2(q[i], mask); |
45d1cae3 BB |
1152 | } |
1153 | } | |
1154 | } | |
1155 | } | |
1156 | ||
1157 | static void | |
b2255edc | 1158 | vdev_raidz_generate_parity_pqr(raidz_row_t *rr) |
45d1cae3 | 1159 | { |
b2255edc BB |
1160 | uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); |
1161 | uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); | |
1162 | uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd); | |
1163 | uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); | |
1164 | ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == | |
1165 | rr->rr_col[VDEV_RAIDZ_Q].rc_size); | |
1166 | ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size == | |
1167 | rr->rr_col[VDEV_RAIDZ_R].rc_size); | |
45d1cae3 | 1168 | |
b2255edc BB |
1169 | for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { |
1170 | abd_t *src = rr->rr_col[c].rc_abd; | |
45d1cae3 | 1171 | |
b2255edc | 1172 | uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]); |
45d1cae3 | 1173 | |
b2255edc | 1174 | if (c == rr->rr_firstdatacol) { |
f7e76821 | 1175 | ASSERT(ccnt == pcnt || ccnt == 0); |
b2255edc BB |
1176 | abd_copy_to_buf(p, src, rr->rr_col[c].rc_size); |
1177 | (void) memcpy(q, p, rr->rr_col[c].rc_size); | |
1178 | (void) memcpy(r, p, rr->rr_col[c].rc_size); | |
45d1cae3 | 1179 | |
b2255edc | 1180 | for (uint64_t i = ccnt; i < pcnt; i++) { |
a6255b7f DQ |
1181 | p[i] = 0; |
1182 | q[i] = 0; | |
1183 | r[i] = 0; | |
34dc7c2f | 1184 | } |
a6255b7f | 1185 | } else { |
f7e76821 IH |
1186 | struct pqr_struct pqr = { p, q, r }; |
1187 | ||
1188 | ASSERT(ccnt <= pcnt); | |
b2255edc | 1189 | (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size, |
f7e76821 IH |
1190 | vdev_raidz_pqr_func, &pqr); |
1191 | ||
34dc7c2f BB |
1192 | /* |
1193 | * Treat short columns as though they are full of 0s. | |
45d1cae3 | 1194 | * Note that there's therefore nothing needed for P. |
34dc7c2f | 1195 | */ |
b2255edc BB |
1196 | uint64_t mask; |
1197 | for (uint64_t i = ccnt; i < pcnt; i++) { | |
a6255b7f DQ |
1198 | VDEV_RAIDZ_64MUL_2(q[i], mask); |
1199 | VDEV_RAIDZ_64MUL_4(r[i], mask); | |
34dc7c2f BB |
1200 | } |
1201 | } | |
1202 | } | |
1203 | } | |
1204 | ||
45d1cae3 BB |
1205 | /* |
1206 | * Generate RAID parity in the first virtual columns according to the number of | |
1207 | * parity columns available. | |
1208 | */ | |
ab9f4b0b | 1209 | void |
b2255edc | 1210 | vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr) |
45d1cae3 | 1211 | { |
5caeef02 DB |
1212 | if (rr->rr_cols == 0) { |
1213 | /* | |
1214 | * We are handling this block one row at a time (because | |
1215 | * this block has a different logical vs physical width, | |
1216 | * due to RAIDZ expansion), and this is a pad-only row, | |
1217 | * which has no parity. | |
1218 | */ | |
1219 | return; | |
1220 | } | |
b2255edc | 1221 | |
c9187d86 | 1222 | /* Generate using the new math implementation */ |
b2255edc | 1223 | if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL) |
ab9f4b0b | 1224 | return; |
ab9f4b0b | 1225 | |
b2255edc | 1226 | switch (rr->rr_firstdatacol) { |
45d1cae3 | 1227 | case 1: |
b2255edc | 1228 | vdev_raidz_generate_parity_p(rr); |
45d1cae3 BB |
1229 | break; |
1230 | case 2: | |
b2255edc | 1231 | vdev_raidz_generate_parity_pq(rr); |
45d1cae3 BB |
1232 | break; |
1233 | case 3: | |
b2255edc | 1234 | vdev_raidz_generate_parity_pqr(rr); |
45d1cae3 BB |
1235 | break; |
1236 | default: | |
1237 | cmn_err(CE_PANIC, "invalid RAID-Z configuration"); | |
1238 | } | |
1239 | } | |
1240 | ||
b2255edc BB |
1241 | void |
1242 | vdev_raidz_generate_parity(raidz_map_t *rm) | |
1243 | { | |
1244 | for (int i = 0; i < rm->rm_nrows; i++) { | |
1245 | raidz_row_t *rr = rm->rm_row[i]; | |
1246 | vdev_raidz_generate_parity_row(rm, rr); | |
1247 | } | |
1248 | } | |
1249 | ||
a6255b7f DQ |
/*
 * abd_iterate_func2() callback: XOR the source chunk into the destination
 * chunk, 64 bits at a time. `private` is unused. Always returns 0.
 */
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
	(void) private;
	uint64_t *out = dbuf;
	uint64_t *in = sbuf;
	int nwords = size / sizeof (*in);

	while (nwords-- > 0)
		*out++ ^= *in++;

	return (0);
}
1264 | ||
a6255b7f DQ |
/*
 * abd_iterate_func2() callback used while rebuilding a column from Q: for
 * each 64-bit word, apply VDEV_RAIDZ_64MUL_2 to the destination and then XOR
 * in the matching source word. `private` is unused. Always returns 0.
 */
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
	(void) private;
	uint64_t *out = dbuf;
	uint64_t *in = sbuf;
	uint64_t mask;
	int nwords = size / sizeof (out[0]);

	for (int w = 0; w < nwords; w++) {
		VDEV_RAIDZ_64MUL_2(out[w], mask);
		out[w] ^= in[w];
	}

	return (0);
}
1282 | ||
a6255b7f DQ |
/*
 * abd_iterate_func() callback for the part of the destination that has no
 * matching source data: apply VDEV_RAIDZ_64MUL_2 to every 64-bit word, i.e.
 * what vdev_raidz_reconst_q_pre_func() does minus the XOR. `private` is
 * unused. Always returns 0.
 */
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
	(void) private;
	uint64_t *out = buf;
	uint64_t mask;
	int nwords = size / sizeof (out[0]);

	for (int w = 0; w < nwords; w++) {
		VDEV_RAIDZ_64MUL_2(out[w], mask);
	}

	return (0);
}
1298 | ||
/*
 * Private state for vdev_raidz_reconst_q_post_func(). Member order matters:
 * users initialize this struct positionally.
 */
struct reconst_q_struct {
	uint64_t *q;	/* read cursor into the Q parity buffer */
	int exp;	/* exponent handed to vdev_raidz_exp2() */
};
1303 | ||
1304 | static int | |
1305 | vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) | |
1306 | { | |
1307 | struct reconst_q_struct *rq = private; | |
1308 | uint64_t *dst = buf; | |
1309 | int cnt = size / sizeof (dst[0]); | |
a6255b7f | 1310 | |
1c27024e | 1311 | for (int i = 0; i < cnt; i++, dst++, rq->q++) { |
a6255b7f DQ |
1312 | int j; |
1313 | uint8_t *b; | |
1314 | ||
1315 | *dst ^= *rq->q; | |
1316 | for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { | |
1317 | *b = vdev_raidz_exp2(*b, rq->exp); | |
1318 | } | |
1319 | } | |
1320 | ||
1321 | return (0); | |
1322 | } | |
1323 | ||
/*
 * Private state for the vdev_raidz_reconst_pq*_func() callbacks: byte
 * cursors into the saved parity (p, q) and into the parity recomputed with
 * the missing columns treated as empty (pxy, qxy), plus the two exponents
 * handed to vdev_raidz_exp2(). Member order matters: users initialize this
 * struct positionally.
 */
struct reconst_pq_struct {
	uint8_t *p, *q, *pxy, *qxy;
	int aexp, bexp;
};
1332 | ||
1333 | static int | |
1334 | vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) | |
1335 | { | |
1336 | struct reconst_pq_struct *rpq = private; | |
1337 | uint8_t *xd = xbuf; | |
1338 | uint8_t *yd = ybuf; | |
a6255b7f | 1339 | |
1c27024e | 1340 | for (int i = 0; i < size; |
a6255b7f DQ |
1341 | i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { |
1342 | *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ | |
1343 | vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); | |
1344 | *yd = *rpq->p ^ *rpq->pxy ^ *xd; | |
1345 | } | |
1346 | ||
1347 | return (0); | |
1348 | } | |
1349 | ||
1350 | static int | |
1351 | vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) | |
1352 | { | |
1353 | struct reconst_pq_struct *rpq = private; | |
1354 | uint8_t *xd = xbuf; | |
a6255b7f | 1355 | |
1c27024e | 1356 | for (int i = 0; i < size; |
a6255b7f DQ |
1357 | i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { |
1358 | /* same operation as vdev_raidz_reconst_pq_func() on xd */ | |
1359 | *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ | |
1360 | vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); | |
1361 | } | |
1362 | ||
1363 | return (0); | |
1364 | } | |
1365 | ||
46df6e98 | 1366 | static void |
b2255edc | 1367 | vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts) |
34dc7c2f | 1368 | { |
45d1cae3 | 1369 | int x = tgts[0]; |
a6255b7f | 1370 | abd_t *dst, *src; |
34dc7c2f | 1371 | |
5caeef02 DB |
1372 | if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) |
1373 | zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x); | |
1374 | ||
b2255edc BB |
1375 | ASSERT3U(ntgts, ==, 1); |
1376 | ASSERT3U(x, >=, rr->rr_firstdatacol); | |
1377 | ASSERT3U(x, <, rr->rr_cols); | |
45d1cae3 | 1378 | |
b2255edc | 1379 | ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size); |
34dc7c2f | 1380 | |
b2255edc BB |
1381 | src = rr->rr_col[VDEV_RAIDZ_P].rc_abd; |
1382 | dst = rr->rr_col[x].rc_abd; | |
a6255b7f | 1383 | |
b2255edc | 1384 | abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size); |
34dc7c2f | 1385 | |
b2255edc BB |
1386 | for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { |
1387 | uint64_t size = MIN(rr->rr_col[x].rc_size, | |
1388 | rr->rr_col[c].rc_size); | |
a6255b7f | 1389 | |
b2255edc | 1390 | src = rr->rr_col[c].rc_abd; |
34dc7c2f BB |
1391 | |
1392 | if (c == x) | |
1393 | continue; | |
1394 | ||
a6255b7f DQ |
1395 | (void) abd_iterate_func2(dst, src, 0, 0, size, |
1396 | vdev_raidz_reconst_p_func, NULL); | |
34dc7c2f BB |
1397 | } |
1398 | } | |
1399 | ||
46df6e98 | 1400 | static void |
b2255edc | 1401 | vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts) |
34dc7c2f | 1402 | { |
45d1cae3 | 1403 | int x = tgts[0]; |
a6255b7f DQ |
1404 | int c, exp; |
1405 | abd_t *dst, *src; | |
34dc7c2f | 1406 | |
5caeef02 DB |
1407 | if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) |
1408 | zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x); | |
1409 | ||
45d1cae3 BB |
1410 | ASSERT(ntgts == 1); |
1411 | ||
b2255edc | 1412 | ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size); |
34dc7c2f | 1413 | |
b2255edc BB |
1414 | for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { |
1415 | uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size, | |
1416 | rr->rr_col[c].rc_size); | |
34dc7c2f | 1417 | |
b2255edc BB |
1418 | src = rr->rr_col[c].rc_abd; |
1419 | dst = rr->rr_col[x].rc_abd; | |
34dc7c2f | 1420 | |
b2255edc | 1421 | if (c == rr->rr_firstdatacol) { |
a6255b7f | 1422 | abd_copy(dst, src, size); |
b2255edc | 1423 | if (rr->rr_col[x].rc_size > size) { |
a6255b7f | 1424 | abd_zero_off(dst, size, |
b2255edc BB |
1425 | rr->rr_col[x].rc_size - size); |
1426 | } | |
34dc7c2f | 1427 | } else { |
b2255edc | 1428 | ASSERT3U(size, <=, rr->rr_col[x].rc_size); |
a6255b7f DQ |
1429 | (void) abd_iterate_func2(dst, src, 0, 0, size, |
1430 | vdev_raidz_reconst_q_pre_func, NULL); | |
1431 | (void) abd_iterate_func(dst, | |
b2255edc | 1432 | size, rr->rr_col[x].rc_size - size, |
a6255b7f | 1433 | vdev_raidz_reconst_q_pre_tail_func, NULL); |
34dc7c2f BB |
1434 | } |
1435 | } | |
1436 | ||
b2255edc BB |
1437 | src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; |
1438 | dst = rr->rr_col[x].rc_abd; | |
1439 | exp = 255 - (rr->rr_cols - 1 - x); | |
34dc7c2f | 1440 | |
1c27024e | 1441 | struct reconst_q_struct rq = { abd_to_buf(src), exp }; |
b2255edc | 1442 | (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size, |
a6255b7f | 1443 | vdev_raidz_reconst_q_post_func, &rq); |
34dc7c2f BB |
1444 | } |
1445 | ||
46df6e98 | 1446 | static void |
b2255edc | 1447 | vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts) |
34dc7c2f | 1448 | { |
a6255b7f DQ |
1449 | uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; |
1450 | abd_t *pdata, *qdata; | |
1451 | uint64_t xsize, ysize; | |
45d1cae3 BB |
1452 | int x = tgts[0]; |
1453 | int y = tgts[1]; | |
a6255b7f | 1454 | abd_t *xd, *yd; |
34dc7c2f | 1455 | |
5caeef02 DB |
1456 | if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) |
1457 | zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y); | |
1458 | ||
45d1cae3 | 1459 | ASSERT(ntgts == 2); |
34dc7c2f | 1460 | ASSERT(x < y); |
b2255edc BB |
1461 | ASSERT(x >= rr->rr_firstdatacol); |
1462 | ASSERT(y < rr->rr_cols); | |
34dc7c2f | 1463 | |
b2255edc | 1464 | ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size); |
34dc7c2f BB |
1465 | |
1466 | /* | |
1467 | * Move the parity data aside -- we're going to compute parity as | |
1468 | * though columns x and y were full of zeros -- Pxy and Qxy. We want to | |
1469 | * reuse the parity generation mechanism without trashing the actual | |
1470 | * parity so we make those columns appear to be full of zeros by | |
1471 | * setting their lengths to zero. | |
1472 | */ | |
b2255edc BB |
1473 | pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd; |
1474 | qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd; | |
1475 | xsize = rr->rr_col[x].rc_size; | |
1476 | ysize = rr->rr_col[y].rc_size; | |
34dc7c2f | 1477 | |
b2255edc BB |
1478 | rr->rr_col[VDEV_RAIDZ_P].rc_abd = |
1479 | abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE); | |
1480 | rr->rr_col[VDEV_RAIDZ_Q].rc_abd = | |
1481 | abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); | |
1482 | rr->rr_col[x].rc_size = 0; | |
1483 | rr->rr_col[y].rc_size = 0; | |
34dc7c2f | 1484 | |
b2255edc | 1485 | vdev_raidz_generate_parity_pq(rr); |
34dc7c2f | 1486 | |
b2255edc BB |
1487 | rr->rr_col[x].rc_size = xsize; |
1488 | rr->rr_col[y].rc_size = ysize; | |
34dc7c2f | 1489 | |
a6255b7f DQ |
1490 | p = abd_to_buf(pdata); |
1491 | q = abd_to_buf(qdata); | |
b2255edc BB |
1492 | pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd); |
1493 | qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); | |
1494 | xd = rr->rr_col[x].rc_abd; | |
1495 | yd = rr->rr_col[y].rc_abd; | |
34dc7c2f BB |
1496 | |
1497 | /* | |
1498 | * We now have: | |
1499 | * Pxy = P + D_x + D_y | |
1500 | * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y | |
1501 | * | |
1502 | * We can then solve for D_x: | |
1503 | * D_x = A * (P + Pxy) + B * (Q + Qxy) | |
1504 | * where | |
1505 | * A = 2^(x - y) * (2^(x - y) + 1)^-1 | |
1506 | * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 | |
1507 | * | |
1508 | * With D_x in hand, we can easily solve for D_y: | |
1509 | * D_y = P + Pxy + D_x | |
1510 | */ | |
1511 | ||
1512 | a = vdev_raidz_pow2[255 + x - y]; | |
b2255edc | 1513 | b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)]; |
34dc7c2f BB |
1514 | tmp = 255 - vdev_raidz_log2[a ^ 1]; |
1515 | ||
1516 | aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; | |
1517 | bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; | |
1518 | ||
a6255b7f | 1519 | ASSERT3U(xsize, >=, ysize); |
1c27024e | 1520 | struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp }; |
34dc7c2f | 1521 | |
a6255b7f DQ |
1522 | (void) abd_iterate_func2(xd, yd, 0, 0, ysize, |
1523 | vdev_raidz_reconst_pq_func, &rpq); | |
1524 | (void) abd_iterate_func(xd, ysize, xsize - ysize, | |
1525 | vdev_raidz_reconst_pq_tail_func, &rpq); | |
34dc7c2f | 1526 | |
b2255edc BB |
1527 | abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd); |
1528 | abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd); | |
34dc7c2f BB |
1529 | |
1530 | /* | |
1531 | * Restore the saved parity data. | |
1532 | */ | |
b2255edc BB |
1533 | rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata; |
1534 | rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata; | |
45d1cae3 BB |
1535 | } |
1536 | ||
45d1cae3 BB |
1537 | /* |
1538 | * In the general case of reconstruction, we must solve the system of linear | |
dd4bc569 | 1539 | * equations defined by the coefficients used to generate parity as well as |
45d1cae3 BB |
1540 | * the contents of the data and parity disks. This can be expressed with |
1541 | * vectors for the original data (D) and the actual data (d) and parity (p) | |
1542 | * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): | |
1543 | * | |
1544 | *           __   __                     __     __ |
1545 | *           |     |         __     __   |  p_0  | |
1546 | *           |  V  |         |  D_0  |   | p_m-1 | |
1547 | *           |     |    x    |   :   | = |  d_0  | |
1548 | *           |  I  |         | D_n-1 |   |   :   | |
1549 | *           |     |         ~~     ~~   | d_n-1 | |
1550 | *           ~~   ~~                     ~~     ~~ |
1551 | * | |
1552 | * I is simply a square identity matrix of size n, and V is a vandermonde | |
dd4bc569 | 1553 | * matrix defined by the coefficients we chose for the various parity columns |
45d1cae3 BB |
1554 | * (1, 2, 4). Note that these values were chosen for simplicity, speedy |
1555 | * computation, and linear separability. |
1556 | * | |
1557 | *    __                 __               __     __ |
1558 | *    |   1   ..  1 1 1 |               |  p_0  | |
1559 | *    | 2^n-1 ..  4 2 1 |   __     __   |   :   | |
1560 | *    | 4^n-1 .. 16 4 1 |   |  D_0  |   | p_m-1 | |
1561 | *    |   1   ..  0 0 0 |   |  D_1  |   |  d_0  | |
1562 | *    |   0   ..  0 0 0 | x |  D_2  | = |  d_1  | |
1563 | *    |   :       : : : |   |   :   |   |  d_2  | |
1564 | *    |   0   ..  1 0 0 |   | D_n-1 |   |   :   | |
1565 | *    |   0   ..  0 1 0 |   ~~     ~~   |   :   | |
1566 | *    |   0   ..  0 0 1 |               | d_n-1 | |
1567 | *    ~~                 ~~               ~~     ~~ |
1568 | * | |
1569 | * Note that I, V, d, and p are known. To compute D, we must invert the | |
1570 | * matrix and use the known data and parity values to reconstruct the unknown | |
1571 | * data values. We begin by removing the rows in V|I and d|p that correspond | |
1572 | * to failed or missing columns; we then make V|I square (n x n) and d|p | |
1573 | * sized n by removing rows corresponding to unused parity from the bottom up | |
1574 | * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' | |
1575 | * using Gauss-Jordan elimination. In the example below we use m=3 parity | |
1576 | * columns, n=8 data columns, with errors in d_1, d_2, and p_1: | |
1577 | * __ __ | |
1578 | * | 1 1 1 1 1 1 1 1 | | |
1579 | * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks | |
1580 | * | 19 205 116 29 64 16 4 1 | / / | |
1581 | * | 1 0 0 0 0 0 0 0 | / / | |
1582 | * | 0 1 0 0 0 0 0 0 | <--' / | |
1583 | * (V|I) = | 0 0 1 0 0 0 0 0 | <---' | |
1584 | * | 0 0 0 1 0 0 0 0 | | |
1585 | * | 0 0 0 0 1 0 0 0 | | |
1586 | * | 0 0 0 0 0 1 0 0 | | |
1587 | * | 0 0 0 0 0 0 1 0 | | |
1588 | * | 0 0 0 0 0 0 0 1 | | |
1589 | * ~~ ~~ | |
1590 | * __ __ | |
1591 | * | 1 1 1 1 1 1 1 1 | | |
1592 | * | 128 64 32 16 8 4 2 1 | | |
1593 | * | 19 205 116 29 64 16 4 1 | | |
1594 | * | 1 0 0 0 0 0 0 0 | | |
1595 | * | 0 1 0 0 0 0 0 0 | | |
1596 | * (V|I)' = | 0 0 1 0 0 0 0 0 | | |
1597 | * | 0 0 0 1 0 0 0 0 | | |
1598 | * | 0 0 0 0 1 0 0 0 | | |
1599 | * | 0 0 0 0 0 1 0 0 | | |
1600 | * | 0 0 0 0 0 0 1 0 | | |
1601 | * | 0 0 0 0 0 0 0 1 | | |
1602 | * ~~ ~~ | |
1603 | * | |
1604 | * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We | |
1605 | * have carefully chosen the seed values 1, 2, and 4 to ensure that this | |
1606 | * matrix is not singular. | |
1607 | * __ __ | |
1608 | * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | | |
1609 | * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | | |
1610 | * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | | |
1611 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | | |
1612 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | | |
1613 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | | |
1614 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | | |
1615 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | | |
1616 | * ~~ ~~ | |
1617 | * __ __ | |
1618 | * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | | |
1619 | * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | | |
1620 | * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | | |
1621 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | | |
1622 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | | |
1623 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | | |
1624 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | | |
1625 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | | |
1626 | * ~~ ~~ | |
1627 | * __ __ | |
1628 | * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | | |
1629 | * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | | |
1630 | * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | | |
1631 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | | |
1632 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | | |
1633 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | | |
1634 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | | |
1635 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | | |
1636 | * ~~ ~~ | |
1637 | * __ __ | |
1638 | * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | | |
1639 | * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | | |
1640 | * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | | |
1641 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | | |
1642 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | | |
1643 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | | |
1644 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | | |
1645 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | | |
1646 | * ~~ ~~ | |
1647 | * __ __ | |
1648 | * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | | |
1649 | * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | | |
1650 | * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | | |
1651 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | | |
1652 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | | |
1653 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | | |
1654 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | | |
1655 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | | |
1656 | * ~~ ~~ | |
1657 | * __ __ | |
1658 | * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | | |
1659 | * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | | |
1660 | * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | | |
1661 | * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | | |
1662 | * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | | |
1663 | * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | | |
1664 | * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | | |
1665 | * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | | |
1666 | * ~~ ~~ | |
1667 | * __ __ | |
1668 | * | 0 0 1 0 0 0 0 0 | | |
1669 | * | 167 100 5 41 159 169 217 208 | | |
1670 | * | 166 100 4 40 158 168 216 209 | | |
1671 | * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | | |
1672 | * | 0 0 0 0 1 0 0 0 | | |
1673 | * | 0 0 0 0 0 1 0 0 | | |
1674 | * | 0 0 0 0 0 0 1 0 | | |
1675 | * | 0 0 0 0 0 0 0 1 | | |
1676 | * ~~ ~~ | |
1677 | * | |
1678 | * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values | |
1679 | * of the missing data. | |
1680 | * | |
1681 | * As is apparent from the example above, the only non-trivial rows in the | |
1682 | * inverse matrix correspond to the data disks that we're trying to | |
1683 | * reconstruct. Indeed, those are the only rows we need as the others would | |
1684 | * only be useful for reconstructing data known or assumed to be valid. For | |
1685 | * that reason, we only build the coefficients in the rows that correspond to | |
1686 | * targeted columns. | |
1687 | */ | |
45d1cae3 BB |
1688 | |
1689 | static void | |
b2255edc | 1690 | vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map, |
45d1cae3 BB |
1691 | uint8_t **rows) |
1692 | { | |
1693 | int i, j; | |
1694 | int pow; | |
1695 | ||
b2255edc | 1696 | ASSERT(n == rr->rr_cols - rr->rr_firstdatacol); |
45d1cae3 BB |
1697 | |
1698 | /* | |
1699 | * Fill in the missing rows of interest. | |
1700 | */ | |
1701 | for (i = 0; i < nmap; i++) { | |
1702 | ASSERT3S(0, <=, map[i]); | |
1703 | ASSERT3S(map[i], <=, 2); | |
1704 | ||
1705 | pow = map[i] * n; | |
1706 | if (pow > 255) | |
1707 | pow -= 255; | |
1708 | ASSERT(pow <= 255); | |
1709 | ||
1710 | for (j = 0; j < n; j++) { | |
1711 | pow -= map[i]; | |
1712 | if (pow < 0) | |
1713 | pow += 255; | |
1714 | rows[i][j] = vdev_raidz_pow2[pow]; | |
1715 | } | |
1716 | } | |
1717 | } | |
1718 | ||
1719 | static void | |
b2255edc | 1720 | vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing, |
45d1cae3 BB |
1721 | uint8_t **rows, uint8_t **invrows, const uint8_t *used) |
1722 | { | |
1723 | int i, j, ii, jj; | |
1724 | uint8_t log; | |
1725 | ||
1726 | /* | |
1727 | * Assert that the first nmissing entries from the array of used | |
1728 | * columns correspond to parity columns and that subsequent entries | |
1729 | * correspond to data columns. | |
1730 | */ | |
1731 | for (i = 0; i < nmissing; i++) { | |
b2255edc | 1732 | ASSERT3S(used[i], <, rr->rr_firstdatacol); |
45d1cae3 BB |
1733 | } |
1734 | for (; i < n; i++) { | |
b2255edc | 1735 | ASSERT3S(used[i], >=, rr->rr_firstdatacol); |
45d1cae3 BB |
1736 | } |
1737 | ||
1738 | /* | |
1739 | * First initialize the storage where we'll compute the inverse rows. | |
1740 | */ | |
1741 | for (i = 0; i < nmissing; i++) { | |
1742 | for (j = 0; j < n; j++) { | |
1743 | invrows[i][j] = (i == j) ? 1 : 0; | |
1744 | } | |
1745 | } | |
1746 | ||
1747 | /* | |
1748 | * Subtract all trivial rows from the rows of consequence. | |
1749 | */ | |
1750 | for (i = 0; i < nmissing; i++) { | |
1751 | for (j = nmissing; j < n; j++) { | |
b2255edc BB |
1752 | ASSERT3U(used[j], >=, rr->rr_firstdatacol); |
1753 | jj = used[j] - rr->rr_firstdatacol; | |
45d1cae3 BB |
1754 | ASSERT3S(jj, <, n); |
1755 | invrows[i][j] = rows[i][jj]; | |
1756 | rows[i][jj] = 0; | |
1757 | } | |
1758 | } | |
1759 | ||
1760 | /* | |
1761 | * For each of the rows of interest, we must normalize it and subtract | |
1762 | * a multiple of it from the other rows. | |
1763 | */ | |
1764 | for (i = 0; i < nmissing; i++) { | |
1765 | for (j = 0; j < missing[i]; j++) { | |
c99c9001 | 1766 | ASSERT0(rows[i][j]); |
45d1cae3 BB |
1767 | } |
1768 | ASSERT3U(rows[i][missing[i]], !=, 0); | |
1769 | ||
1770 | /* | |
1771 | * Compute the inverse of the first element and multiply each | |
1772 | * element in the row by that value. | |
1773 | */ | |
1774 | log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; | |
1775 | ||
1776 | for (j = 0; j < n; j++) { | |
1777 | rows[i][j] = vdev_raidz_exp2(rows[i][j], log); | |
1778 | invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); | |
1779 | } | |
1780 | ||
1781 | for (ii = 0; ii < nmissing; ii++) { | |
1782 | if (i == ii) | |
1783 | continue; | |
1784 | ||
1785 | ASSERT3U(rows[ii][missing[i]], !=, 0); | |
1786 | ||
1787 | log = vdev_raidz_log2[rows[ii][missing[i]]]; | |
1788 | ||
1789 | for (j = 0; j < n; j++) { | |
1790 | rows[ii][j] ^= | |
1791 | vdev_raidz_exp2(rows[i][j], log); | |
1792 | invrows[ii][j] ^= | |
1793 | vdev_raidz_exp2(invrows[i][j], log); | |
1794 | } | |
1795 | } | |
1796 | } | |
1797 | ||
1798 | /* | |
1799 | * Verify that the data that is left in the rows are properly part of | |
1800 | * an identity matrix. | |
1801 | */ | |
1802 | for (i = 0; i < nmissing; i++) { | |
1803 | for (j = 0; j < n; j++) { | |
1804 | if (j == missing[i]) { | |
1805 | ASSERT3U(rows[i][j], ==, 1); | |
1806 | } else { | |
c99c9001 | 1807 | ASSERT0(rows[i][j]); |
45d1cae3 BB |
1808 | } |
1809 | } | |
1810 | } | |
1811 | } | |
1812 | ||
1813 | static void | |
b2255edc | 1814 | vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing, |
45d1cae3 BB |
1815 | int *missing, uint8_t **invrows, const uint8_t *used) |
1816 | { | |
1817 | int i, j, x, cc, c; | |
1818 | uint8_t *src; | |
1819 | uint64_t ccount; | |
689f093e GN |
1820 | uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL }; |
1821 | uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 }; | |
a117a6d6 GW |
1822 | uint8_t log = 0; |
1823 | uint8_t val; | |
45d1cae3 BB |
1824 | int ll; |
1825 | uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; | |
1826 | uint8_t *p, *pp; | |
1827 | size_t psize; | |
1828 | ||
1829 | psize = sizeof (invlog[0][0]) * n * nmissing; | |
79c76d5b | 1830 | p = kmem_alloc(psize, KM_SLEEP); |
45d1cae3 BB |
1831 | |
1832 | for (pp = p, i = 0; i < nmissing; i++) { | |
1833 | invlog[i] = pp; | |
1834 | pp += n; | |
1835 | } | |
1836 | ||
1837 | for (i = 0; i < nmissing; i++) { | |
1838 | for (j = 0; j < n; j++) { | |
1839 | ASSERT3U(invrows[i][j], !=, 0); | |
1840 | invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; | |
1841 | } | |
1842 | } | |
1843 | ||
1844 | for (i = 0; i < n; i++) { | |
1845 | c = used[i]; | |
b2255edc | 1846 | ASSERT3U(c, <, rr->rr_cols); |
45d1cae3 | 1847 | |
b2255edc BB |
1848 | ccount = rr->rr_col[c].rc_size; |
1849 | ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0); | |
1850 | if (ccount == 0) | |
1851 | continue; | |
1852 | src = abd_to_buf(rr->rr_col[c].rc_abd); | |
45d1cae3 | 1853 | for (j = 0; j < nmissing; j++) { |
b2255edc BB |
1854 | cc = missing[j] + rr->rr_firstdatacol; |
1855 | ASSERT3U(cc, >=, rr->rr_firstdatacol); | |
1856 | ASSERT3U(cc, <, rr->rr_cols); | |
45d1cae3 BB |
1857 | ASSERT3U(cc, !=, c); |
1858 | ||
b2255edc BB |
1859 | dcount[j] = rr->rr_col[cc].rc_size; |
1860 | if (dcount[j] != 0) | |
1861 | dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd); | |
45d1cae3 BB |
1862 | } |
1863 | ||
45d1cae3 BB |
1864 | for (x = 0; x < ccount; x++, src++) { |
1865 | if (*src != 0) | |
1866 | log = vdev_raidz_log2[*src]; | |
1867 | ||
1868 | for (cc = 0; cc < nmissing; cc++) { | |
1869 | if (x >= dcount[cc]) | |
1870 | continue; | |
1871 | ||
1872 | if (*src == 0) { | |
1873 | val = 0; | |
1874 | } else { | |
1875 | if ((ll = log + invlog[cc][i]) >= 255) | |
1876 | ll -= 255; | |
1877 | val = vdev_raidz_pow2[ll]; | |
1878 | } | |
1879 | ||
1880 | if (i == 0) | |
1881 | dst[cc][x] = val; | |
1882 | else | |
1883 | dst[cc][x] ^= val; | |
1884 | } | |
1885 | } | |
1886 | } | |
1887 | ||
1888 | kmem_free(p, psize); | |
1889 | } | |
1890 | ||
46df6e98 | 1891 | static void |
b2255edc | 1892 | vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts) |
45d1cae3 BB |
1893 | { |
1894 | int n, i, c, t, tt; | |
1895 | int nmissing_rows; | |
1896 | int missing_rows[VDEV_RAIDZ_MAXPARITY]; | |
1897 | int parity_map[VDEV_RAIDZ_MAXPARITY]; | |
45d1cae3 BB |
1898 | uint8_t *p, *pp; |
1899 | size_t psize; | |
45d1cae3 BB |
1900 | uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; |
1901 | uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; | |
1902 | uint8_t *used; | |
1903 | ||
a6255b7f DQ |
1904 | abd_t **bufs = NULL; |
1905 | ||
5caeef02 DB |
1906 | if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) |
1907 | zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts); | |
a6255b7f DQ |
1908 | /* |
1909 | * Matrix reconstruction can't use scatter ABDs yet, so we allocate | |
b2255edc | 1910 | * temporary linear ABDs if any non-linear ABDs are found. |
a6255b7f | 1911 | */ |
b2255edc | 1912 | for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) { |
5caeef02 | 1913 | ASSERT(rr->rr_col[i].rc_abd != NULL); |
b2255edc BB |
1914 | if (!abd_is_linear(rr->rr_col[i].rc_abd)) { |
1915 | bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *), | |
1916 | KM_PUSHPAGE); | |
1917 | ||
1918 | for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { | |
1919 | raidz_col_t *col = &rr->rr_col[c]; | |
1920 | ||
1921 | bufs[c] = col->rc_abd; | |
1922 | if (bufs[c] != NULL) { | |
1923 | col->rc_abd = abd_alloc_linear( | |
1924 | col->rc_size, B_TRUE); | |
1925 | abd_copy(col->rc_abd, bufs[c], | |
1926 | col->rc_size); | |
1927 | } | |
1928 | } | |
a6255b7f | 1929 | |
b2255edc | 1930 | break; |
a6255b7f DQ |
1931 | } |
1932 | } | |
45d1cae3 | 1933 | |
b2255edc | 1934 | n = rr->rr_cols - rr->rr_firstdatacol; |
45d1cae3 BB |
1935 | |
1936 | /* | |
1937 | * Figure out which data columns are missing. | |
1938 | */ | |
1939 | nmissing_rows = 0; | |
1940 | for (t = 0; t < ntgts; t++) { | |
b2255edc | 1941 | if (tgts[t] >= rr->rr_firstdatacol) { |
45d1cae3 | 1942 | missing_rows[nmissing_rows++] = |
b2255edc | 1943 | tgts[t] - rr->rr_firstdatacol; |
45d1cae3 BB |
1944 | } |
1945 | } | |
1946 | ||
1947 | /* | |
1948 | * Figure out which parity columns to use to help generate the missing | |
1949 | * data columns. | |
1950 | */ | |
1951 | for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { | |
1952 | ASSERT(tt < ntgts); | |
b2255edc | 1953 | ASSERT(c < rr->rr_firstdatacol); |
45d1cae3 BB |
1954 | |
1955 | /* | |
1956 | * Skip any targeted parity columns. | |
1957 | */ | |
1958 | if (c == tgts[tt]) { | |
1959 | tt++; | |
1960 | continue; | |
1961 | } | |
1962 | ||
45d1cae3 BB |
1963 | parity_map[i] = c; |
1964 | i++; | |
1965 | } | |
1966 | ||
45d1cae3 BB |
1967 | psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * |
1968 | nmissing_rows * n + sizeof (used[0]) * n; | |
79c76d5b | 1969 | p = kmem_alloc(psize, KM_SLEEP); |
45d1cae3 BB |
1970 | |
1971 | for (pp = p, i = 0; i < nmissing_rows; i++) { | |
1972 | rows[i] = pp; | |
1973 | pp += n; | |
1974 | invrows[i] = pp; | |
1975 | pp += n; | |
1976 | } | |
1977 | used = pp; | |
1978 | ||
1979 | for (i = 0; i < nmissing_rows; i++) { | |
1980 | used[i] = parity_map[i]; | |
1981 | } | |
1982 | ||
b2255edc | 1983 | for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { |
45d1cae3 | 1984 | if (tt < nmissing_rows && |
b2255edc | 1985 | c == missing_rows[tt] + rr->rr_firstdatacol) { |
45d1cae3 BB |
1986 | tt++; |
1987 | continue; | |
1988 | } | |
1989 | ||
1990 | ASSERT3S(i, <, n); | |
1991 | used[i] = c; | |
1992 | i++; | |
1993 | } | |
1994 | ||
1995 | /* | |
1996 | * Initialize the interesting rows of the matrix. | |
1997 | */ | |
b2255edc | 1998 | vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows); |
45d1cae3 BB |
1999 | |
2000 | /* | |
2001 | * Invert the matrix. | |
2002 | */ | |
b2255edc | 2003 | vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows, |
45d1cae3 BB |
2004 | invrows, used); |
2005 | ||
2006 | /* | |
2007 | * Reconstruct the missing data using the generated matrix. | |
2008 | */ | |
b2255edc | 2009 | vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows, |
45d1cae3 BB |
2010 | invrows, used); |
2011 | ||
2012 | kmem_free(p, psize); | |
2013 | ||
a6255b7f DQ |
2014 | /* |
2015 | * copy back from temporary linear abds and free them | |
2016 | */ | |
2017 | if (bufs) { | |
b2255edc BB |
2018 | for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { |
2019 | raidz_col_t *col = &rr->rr_col[c]; | |
a6255b7f | 2020 | |
b2255edc BB |
2021 | if (bufs[c] != NULL) { |
2022 | abd_copy(bufs[c], col->rc_abd, col->rc_size); | |
2023 | abd_free(col->rc_abd); | |
2024 | } | |
a6255b7f DQ |
2025 | col->rc_abd = bufs[c]; |
2026 | } | |
b2255edc | 2027 | kmem_free(bufs, rr->rr_cols * sizeof (abd_t *)); |
a6255b7f | 2028 | } |
34dc7c2f BB |
2029 | } |
2030 | ||
46df6e98 | 2031 | static void |
b2255edc BB |
2032 | vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr, |
2033 | const int *t, int nt) | |
45d1cae3 BB |
2034 | { |
2035 | int tgts[VDEV_RAIDZ_MAXPARITY], *dt; | |
2036 | int ntgts; | |
c9187d86 | 2037 | int i, c, ret; |
45d1cae3 BB |
2038 | int nbadparity, nbaddata; |
2039 | int parity_valid[VDEV_RAIDZ_MAXPARITY]; | |
2040 | ||
5caeef02 DB |
2041 | if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { |
2042 | zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)", | |
2043 | rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata, | |
2044 | (int)rr->rr_missingparity); | |
2045 | } | |
2046 | ||
b2255edc BB |
2047 | nbadparity = rr->rr_firstdatacol; |
2048 | nbaddata = rr->rr_cols - nbadparity; | |
45d1cae3 | 2049 | ntgts = 0; |
b2255edc | 2050 | for (i = 0, c = 0; c < rr->rr_cols; c++) { |
5caeef02 DB |
2051 | if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) { |
2052 | zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u " | |
2053 | "offset=%llx error=%u)", | |
2054 | rr, c, (int)rr->rr_col[c].rc_devidx, | |
2055 | (long long)rr->rr_col[c].rc_offset, | |
2056 | (int)rr->rr_col[c].rc_error); | |
2057 | } | |
b2255edc | 2058 | if (c < rr->rr_firstdatacol) |
45d1cae3 BB |
2059 | parity_valid[c] = B_FALSE; |
2060 | ||
2061 | if (i < nt && c == t[i]) { | |
2062 | tgts[ntgts++] = c; | |
2063 | i++; | |
b2255edc | 2064 | } else if (rr->rr_col[c].rc_error != 0) { |
45d1cae3 | 2065 | tgts[ntgts++] = c; |
b2255edc | 2066 | } else if (c >= rr->rr_firstdatacol) { |
45d1cae3 BB |
2067 | nbaddata--; |
2068 | } else { | |
2069 | parity_valid[c] = B_TRUE; | |
2070 | nbadparity--; | |
2071 | } | |
2072 | } | |
2073 | ||
2074 | ASSERT(ntgts >= nt); | |
2075 | ASSERT(nbaddata >= 0); | |
2076 | ASSERT(nbaddata + nbadparity == ntgts); | |
2077 | ||
2078 | dt = &tgts[nbadparity]; | |
2079 | ||
c9187d86 | 2080 | /* Reconstruct using the new math implementation */ |
b2255edc | 2081 | ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata); |
c9187d86 | 2082 | if (ret != RAIDZ_ORIGINAL_IMPL) |
46df6e98 | 2083 | return; |
ab9f4b0b | 2084 | |
45d1cae3 BB |
2085 | /* |
2086 | * See if we can use any of our optimized reconstruction routines. | |
2087 | */ | |
ab9f4b0b GN |
2088 | switch (nbaddata) { |
2089 | case 1: | |
46df6e98 MA |
2090 | if (parity_valid[VDEV_RAIDZ_P]) { |
2091 | vdev_raidz_reconstruct_p(rr, dt, 1); | |
2092 | return; | |
2093 | } | |
45d1cae3 | 2094 | |
b2255edc | 2095 | ASSERT(rr->rr_firstdatacol > 1); |
45d1cae3 | 2096 | |
46df6e98 MA |
2097 | if (parity_valid[VDEV_RAIDZ_Q]) { |
2098 | vdev_raidz_reconstruct_q(rr, dt, 1); | |
2099 | return; | |
2100 | } | |
45d1cae3 | 2101 | |
b2255edc | 2102 | ASSERT(rr->rr_firstdatacol > 2); |
ab9f4b0b | 2103 | break; |
45d1cae3 | 2104 | |
ab9f4b0b | 2105 | case 2: |
b2255edc | 2106 | ASSERT(rr->rr_firstdatacol > 1); |
45d1cae3 | 2107 | |
ab9f4b0b | 2108 | if (parity_valid[VDEV_RAIDZ_P] && |
46df6e98 MA |
2109 | parity_valid[VDEV_RAIDZ_Q]) { |
2110 | vdev_raidz_reconstruct_pq(rr, dt, 2); | |
2111 | return; | |
2112 | } | |
45d1cae3 | 2113 | |
b2255edc | 2114 | ASSERT(rr->rr_firstdatacol > 2); |
45d1cae3 | 2115 | |
ab9f4b0b | 2116 | break; |
45d1cae3 BB |
2117 | } |
2118 | ||
46df6e98 | 2119 | vdev_raidz_reconstruct_general(rr, tgts, ntgts); |
45d1cae3 | 2120 | } |
34dc7c2f BB |
2121 | |
2122 | static int | |
1bd201e7 | 2123 | vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, |
6fe3498c | 2124 | uint64_t *logical_ashift, uint64_t *physical_ashift) |
34dc7c2f | 2125 | { |
b2255edc BB |
2126 | vdev_raidz_t *vdrz = vd->vdev_tsd; |
2127 | uint64_t nparity = vdrz->vd_nparity; | |
45d1cae3 | 2128 | int c; |
34dc7c2f BB |
2129 | int lasterror = 0; |
2130 | int numerrors = 0; | |
2131 | ||
2132 | ASSERT(nparity > 0); | |
2133 | ||
2134 | if (nparity > VDEV_RAIDZ_MAXPARITY || | |
2135 | vd->vdev_children < nparity + 1) { | |
2136 | vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; | |
2e528b49 | 2137 | return (SET_ERROR(EINVAL)); |
34dc7c2f BB |
2138 | } |
2139 | ||
45d1cae3 BB |
2140 | vdev_open_children(vd); |
2141 | ||
34dc7c2f | 2142 | for (c = 0; c < vd->vdev_children; c++) { |
b2255edc | 2143 | vdev_t *cvd = vd->vdev_child[c]; |
34dc7c2f | 2144 | |
45d1cae3 BB |
2145 | if (cvd->vdev_open_error != 0) { |
2146 | lasterror = cvd->vdev_open_error; | |
34dc7c2f BB |
2147 | numerrors++; |
2148 | continue; | |
2149 | } | |
2150 | ||
2151 | *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; | |
1bd201e7 | 2152 | *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; |
6fe3498c | 2153 | *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); |
37f6845c AM |
2154 | } |
2155 | for (c = 0; c < vd->vdev_children; c++) { | |
2156 | vdev_t *cvd = vd->vdev_child[c]; | |
2157 | ||
2158 | if (cvd->vdev_open_error != 0) | |
2159 | continue; | |
2160 | *physical_ashift = vdev_best_ashift(*logical_ashift, | |
2161 | *physical_ashift, cvd->vdev_physical_ashift); | |
34dc7c2f BB |
2162 | } |
2163 | ||
5caeef02 DB |
2164 | if (vd->vdev_rz_expanding) { |
2165 | *asize *= vd->vdev_children - 1; | |
2166 | *max_asize *= vd->vdev_children - 1; | |
2167 | ||
2168 | vd->vdev_min_asize = *asize; | |
2169 | } else { | |
2170 | *asize *= vd->vdev_children; | |
2171 | *max_asize *= vd->vdev_children; | |
2172 | } | |
34dc7c2f BB |
2173 | |
2174 | if (numerrors > nparity) { | |
2175 | vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; | |
2176 | return (lasterror); | |
2177 | } | |
2178 | ||
2179 | return (0); | |
2180 | } | |
2181 | ||
2182 | static void | |
2183 | vdev_raidz_close(vdev_t *vd) | |
2184 | { | |
b2255edc BB |
2185 | for (int c = 0; c < vd->vdev_children; c++) { |
2186 | if (vd->vdev_child[c] != NULL) | |
2187 | vdev_close(vd->vdev_child[c]); | |
2188 | } | |
34dc7c2f BB |
2189 | } |
2190 | ||
5caeef02 DB |
2191 | /* |
2192 | * Return the logical width to use, given the txg in which the allocation | |
2193 | * happened. Note that BP_PHYSICAL_BIRTH() is usually the txg in which the | |
2194 | * BP was allocated. Remapped BP's (that were relocated due to device | |
2195 | * removal, see remap_blkptr_cb()), will have a more recent | |
2196 | * BP_PHYSICAL_BIRTH() which reflects when the BP was relocated, but we can | |
2197 | * ignore these because they can't be on RAIDZ (device removal doesn't | |
2198 | * support RAIDZ). | |
2199 | */ | |
2200 | static uint64_t | |
2201 | vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg) | |
2202 | { | |
2203 | reflow_node_t lookup = { | |
2204 | .re_txg = txg, | |
2205 | }; | |
2206 | avl_index_t where; | |
2207 | ||
2208 | uint64_t width; | |
2209 | mutex_enter(&vdrz->vd_expand_lock); | |
2210 | reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where); | |
2211 | if (re != NULL) { | |
2212 | width = re->re_logical_width; | |
2213 | } else { | |
2214 | re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE); | |
2215 | if (re != NULL) | |
2216 | width = re->re_logical_width; | |
2217 | else | |
2218 | width = vdrz->vd_original_width; | |
2219 | } | |
2220 | mutex_exit(&vdrz->vd_expand_lock); | |
2221 | return (width); | |
2222 | } | |
2223 | ||
2224 | /* | |
2225 | * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated | |
2226 | * more space due to the lower data-to-parity ratio. In this case it's | |
2227 | * important to pass in the correct txg. Note that vdev_gang_header_asize() | |
2228 | * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE, | |
2229 | * regardless of txg. This is assured because for a single data sector, we | |
2230 | * allocate P+1 sectors regardless of width ("cols", which is at least P+1). | |
2231 | */ | |
34dc7c2f | 2232 | static uint64_t |
5caeef02 | 2233 | vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg) |
34dc7c2f | 2234 | { |
b2255edc | 2235 | vdev_raidz_t *vdrz = vd->vdev_tsd; |
34dc7c2f BB |
2236 | uint64_t asize; |
2237 | uint64_t ashift = vd->vdev_top->vdev_ashift; | |
5caeef02 | 2238 | uint64_t cols = vdrz->vd_original_width; |
b2255edc | 2239 | uint64_t nparity = vdrz->vd_nparity; |
34dc7c2f | 2240 | |
5caeef02 DB |
2241 | cols = vdev_raidz_get_logical_width(vdrz, txg); |
2242 | ||
34dc7c2f BB |
2243 | asize = ((psize - 1) >> ashift) + 1; |
2244 | asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); | |
2245 | asize = roundup(asize, nparity + 1) << ashift; | |
2246 | ||
5caeef02 DB |
2247 | #ifdef ZFS_DEBUG |
2248 | uint64_t asize_new = ((psize - 1) >> ashift) + 1; | |
2249 | uint64_t ncols_new = vdrz->vd_physical_width; | |
2250 | asize_new += nparity * ((asize_new + ncols_new - nparity - 1) / | |
2251 | (ncols_new - nparity)); | |
2252 | asize_new = roundup(asize_new, nparity + 1) << ashift; | |
2253 | VERIFY3U(asize_new, <=, asize); | |
2254 | #endif | |
2255 | ||
34dc7c2f BB |
2256 | return (asize); |
2257 | } | |
2258 | ||
b2255edc BB |
2259 | /* |
2260 | * The allocatable space for a raidz vdev is N * sizeof(smallest child) | |
2261 | * so each child must provide at least 1/Nth of its asize. | |
2262 | */ | |
2263 | static uint64_t | |
2264 | vdev_raidz_min_asize(vdev_t *vd) | |
2265 | { | |
2266 | return ((vd->vdev_min_asize + vd->vdev_children - 1) / | |
2267 | vd->vdev_children); | |
2268 | } | |
2269 | ||
2270 | void | |
34dc7c2f BB |
2271 | vdev_raidz_child_done(zio_t *zio) |
2272 | { | |
2273 | raidz_col_t *rc = zio->io_private; | |
2274 | ||
345196be | 2275 | ASSERT3P(rc->rc_abd, !=, NULL); |
34dc7c2f BB |
2276 | rc->rc_error = zio->io_error; |
2277 | rc->rc_tried = 1; | |
2278 | rc->rc_skipped = 0; | |
2279 | } | |
2280 | ||
619f0976 | 2281 | static void |
5caeef02 | 2282 | vdev_raidz_shadow_child_done(zio_t *zio) |
619f0976 | 2283 | { |
5caeef02 DB |
2284 | raidz_col_t *rc = zio->io_private; |
2285 | ||
2286 | rc->rc_shadow_error = zio->io_error; | |
2287 | } | |
619f0976 | 2288 | |
5caeef02 DB |
2289 | static void |
2290 | vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col) | |
2291 | { | |
2292 | (void) rm; | |
2293 | #ifdef ZFS_DEBUG | |
b2255edc BB |
2294 | range_seg64_t logical_rs, physical_rs, remain_rs; |
2295 | logical_rs.rs_start = rr->rr_offset; | |
619f0976 | 2296 | logical_rs.rs_end = logical_rs.rs_start + |
5caeef02 DB |
2297 | vdev_raidz_asize(zio->io_vd, rr->rr_size, |
2298 | BP_PHYSICAL_BIRTH(zio->io_bp)); | |
619f0976 | 2299 | |
b2255edc | 2300 | raidz_col_t *rc = &rr->rr_col[col]; |
5caeef02 | 2301 | vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; |
619f0976 | 2302 | |
b2255edc BB |
2303 | vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs); |
2304 | ASSERT(vdev_xlate_is_empty(&remain_rs)); | |
5caeef02 DB |
2305 | if (vdev_xlate_is_empty(&physical_rs)) { |
2306 | /* | |
2307 | * If we are in the middle of expansion, the | |
2308 | * physical->logical mapping is changing so vdev_xlate() | |
2309 | * can't give us a reliable answer. | |
2310 | */ | |
2311 | return; | |
2312 | } | |
619f0976 GW |
2313 | ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); |
2314 | ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); | |
2315 | /* | |
2316 | * It would be nice to assert that rs_end is equal | |
2317 | * to rc_offset + rc_size but there might be an | |
2318 | * optional I/O at the end that is not accounted in | |
2319 | * rc_size. | |
2320 | */ | |
2321 | if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { | |
2322 | ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + | |
5caeef02 | 2323 | rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift)); |
619f0976 GW |
2324 | } else { |
2325 | ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); | |
2326 | } | |
2327 | #endif | |
2328 | } | |
2329 | ||
98b25418 | 2330 | static void |
5caeef02 | 2331 | vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr) |
34dc7c2f BB |
2332 | { |
2333 | vdev_t *vd = zio->io_vd; | |
b2255edc | 2334 | raidz_map_t *rm = zio->io_vsd; |
34dc7c2f | 2335 | |
b2255edc | 2336 | vdev_raidz_generate_parity_row(rm, rr); |
34dc7c2f | 2337 | |
345196be | 2338 | for (int c = 0; c < rr->rr_scols; c++) { |
b2255edc | 2339 | raidz_col_t *rc = &rr->rr_col[c]; |
345196be | 2340 | vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; |
619f0976 | 2341 | |
b2255edc | 2342 | /* Verify physical to logical translation */ |
5caeef02 | 2343 | vdev_raidz_io_verify(zio, rm, rr, c); |
34dc7c2f | 2344 | |
5caeef02 DB |
2345 | if (rc->rc_size == 0) |
2346 | continue; | |
2347 | ||
2348 | ASSERT3U(rc->rc_offset + rc->rc_size, <, | |
2349 | cvd->vdev_psize - VDEV_LABEL_END_SIZE); | |
2350 | ||
2351 | ASSERT3P(rc->rc_abd, !=, NULL); | |
2352 | zio_nowait(zio_vdev_child_io(zio, NULL, cvd, | |
2353 | rc->rc_offset, rc->rc_abd, | |
2354 | abd_get_size(rc->rc_abd), zio->io_type, | |
2355 | zio->io_priority, 0, vdev_raidz_child_done, rc)); | |
2356 | ||
2357 | if (rc->rc_shadow_devidx != INT_MAX) { | |
2358 | vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx]; | |
2359 | ||
2360 | ASSERT3U( | |
2361 | rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <, | |
2362 | cvd2->vdev_psize - VDEV_LABEL_END_SIZE); | |
2363 | ||
2364 | zio_nowait(zio_vdev_child_io(zio, NULL, cvd2, | |
2365 | rc->rc_shadow_offset, rc->rc_abd, | |
2366 | abd_get_size(rc->rc_abd), | |
2367 | zio->io_type, zio->io_priority, 0, | |
2368 | vdev_raidz_shadow_child_done, rc)); | |
345196be | 2369 | } |
34dc7c2f | 2370 | } |
b2255edc | 2371 | } |
34dc7c2f | 2372 | |
5caeef02 DB |
2373 | /* |
2374 | * Generate optional I/Os for skip sectors to improve aggregation contiguity. | |
2375 | * This only works for vdev_raidz_map_alloc() (not _expanded()). | |
2376 | */ | |
b2255edc | 2377 | static void |
5caeef02 | 2378 | raidz_start_skip_writes(zio_t *zio) |
b2255edc BB |
2379 | { |
2380 | vdev_t *vd = zio->io_vd; | |
5caeef02 DB |
2381 | uint64_t ashift = vd->vdev_top->vdev_ashift; |
2382 | raidz_map_t *rm = zio->io_vsd; | |
2383 | ASSERT3U(rm->rm_nrows, ==, 1); | |
2384 | raidz_row_t *rr = rm->rm_row[0]; | |
2385 | for (int c = 0; c < rr->rr_scols; c++) { | |
2386 | raidz_col_t *rc = &rr->rr_col[c]; | |
2387 | vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; | |
2388 | if (rc->rc_size != 0) | |
2389 | continue; | |
2390 | ASSERT3P(rc->rc_abd, ==, NULL); | |
2391 | ||
2392 | ASSERT3U(rc->rc_offset, <, | |
2393 | cvd->vdev_psize - VDEV_LABEL_END_SIZE); | |
2394 | ||
2395 | zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, | |
2396 | NULL, 1ULL << ashift, zio->io_type, zio->io_priority, | |
2397 | ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); | |
2398 | } | |
2399 | } | |
2400 | ||
2401 | static void | |
2402 | vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity) | |
2403 | { | |
2404 | vdev_t *vd = zio->io_vd; | |
2405 | ||
2406 | /* | |
34dc7c2f | 2407 | * Iterate over the columns in reverse order so that we hit the parity |
45d1cae3 | 2408 | * last -- any errors along the way will force us to read the parity. |
34dc7c2f | 2409 | */ |
b2255edc BB |
2410 | for (int c = rr->rr_cols - 1; c >= 0; c--) { |
2411 | raidz_col_t *rc = &rr->rr_col[c]; | |
2412 | if (rc->rc_size == 0) | |
2413 | continue; | |
2414 | vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; | |
34dc7c2f | 2415 | if (!vdev_readable(cvd)) { |
b2255edc BB |
2416 | if (c >= rr->rr_firstdatacol) |
2417 | rr->rr_missingdata++; | |
34dc7c2f | 2418 | else |
b2255edc | 2419 | rr->rr_missingparity++; |
2e528b49 | 2420 | rc->rc_error = SET_ERROR(ENXIO); |
34dc7c2f BB |
2421 | rc->rc_tried = 1; /* don't even try */ |
2422 | rc->rc_skipped = 1; | |
2423 | continue; | |
2424 | } | |
428870ff | 2425 | if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { |
b2255edc BB |
2426 | if (c >= rr->rr_firstdatacol) |
2427 | rr->rr_missingdata++; | |
34dc7c2f | 2428 | else |
b2255edc | 2429 | rr->rr_missingparity++; |
2e528b49 | 2430 | rc->rc_error = SET_ERROR(ESTALE); |
34dc7c2f BB |
2431 | rc->rc_skipped = 1; |
2432 | continue; | |
2433 | } | |
5caeef02 DB |
2434 | if (forceparity || |
2435 | c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 || | |
9babb374 | 2436 | (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { |
34dc7c2f | 2437 | zio_nowait(zio_vdev_child_io(zio, NULL, cvd, |
a6255b7f | 2438 | rc->rc_offset, rc->rc_abd, rc->rc_size, |
b128c09f | 2439 | zio->io_type, zio->io_priority, 0, |
34dc7c2f BB |
2440 | vdev_raidz_child_done, rc)); |
2441 | } | |
2442 | } | |
b2255edc BB |
2443 | } |
2444 | ||
5caeef02 DB |
2445 | static void |
2446 | vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm) | |
2447 | { | |
2448 | vdev_t *vd = zio->io_vd; | |
2449 | ||
2450 | for (int i = 0; i < rm->rm_nphys_cols; i++) { | |
2451 | raidz_col_t *prc = &rm->rm_phys_col[i]; | |
2452 | if (prc->rc_size == 0) | |
2453 | continue; | |
2454 | ||
2455 | ASSERT3U(prc->rc_devidx, ==, i); | |
2456 | vdev_t *cvd = vd->vdev_child[i]; | |
2457 | if (!vdev_readable(cvd)) { | |
2458 | prc->rc_error = SET_ERROR(ENXIO); | |
2459 | prc->rc_tried = 1; /* don't even try */ | |
2460 | prc->rc_skipped = 1; | |
2461 | continue; | |
2462 | } | |
2463 | if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { | |
2464 | prc->rc_error = SET_ERROR(ESTALE); | |
2465 | prc->rc_skipped = 1; | |
2466 | continue; | |
2467 | } | |
2468 | zio_nowait(zio_vdev_child_io(zio, NULL, cvd, | |
2469 | prc->rc_offset, prc->rc_abd, prc->rc_size, | |
2470 | zio->io_type, zio->io_priority, 0, | |
2471 | vdev_raidz_child_done, prc)); | |
2472 | } | |
2473 | } | |
2474 | ||
2475 | static void | |
2476 | vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm) | |
2477 | { | |
2478 | /* | |
2479 | * If there are multiple rows, we will be hitting | |
2480 | * all disks, so go ahead and read the parity so | |
2481 | * that we are reading in decent size chunks. | |
2482 | */ | |
2483 | boolean_t forceparity = rm->rm_nrows > 1; | |
2484 | ||
2485 | if (rm->rm_phys_col) { | |
2486 | vdev_raidz_io_start_read_phys_cols(zio, rm); | |
2487 | } else { | |
2488 | for (int i = 0; i < rm->rm_nrows; i++) { | |
2489 | raidz_row_t *rr = rm->rm_row[i]; | |
2490 | vdev_raidz_io_start_read_row(zio, rr, forceparity); | |
2491 | } | |
2492 | } | |
2493 | } | |
2494 | ||
b2255edc BB |
2495 | /* |
2496 | * Start an IO operation on a RAIDZ VDev | |
2497 | * | |
2498 | * Outline: | |
2499 | * - For write operations: | |
2500 | * 1. Generate the parity data | |
2501 | * 2. Create child zio write operations to each column's vdev, for both | |
2502 | * data and parity. | |
2503 | * 3. If the column skips any sectors for padding, create optional dummy | |
2504 | * write zio children for those areas to improve aggregation continuity. | |
2505 | * - For read operations: | |
2506 | * 1. Create child zio read operations to each data column's vdev to read | |
2507 | * the range of data required for zio. | |
2508 | * 2. If this is a scrub or resilver operation, or if any of the data | |
2509 | * vdevs have had errors, then create zio read operations to the parity | |
2510 | * columns' VDevs as well. | |
2511 | */ | |
2512 | static void | |
2513 | vdev_raidz_io_start(zio_t *zio) | |
2514 | { | |
2515 | vdev_t *vd = zio->io_vd; | |
2516 | vdev_t *tvd = vd->vdev_top; | |
2517 | vdev_raidz_t *vdrz = vd->vdev_tsd; | |
5caeef02 DB |
2518 | raidz_map_t *rm; |
2519 | ||
2520 | uint64_t logical_width = vdev_raidz_get_logical_width(vdrz, | |
2521 | BP_PHYSICAL_BIRTH(zio->io_bp)); | |
2522 | if (logical_width != vdrz->vd_physical_width) { | |
2523 | zfs_locked_range_t *lr = NULL; | |
2524 | uint64_t synced_offset = UINT64_MAX; | |
2525 | uint64_t next_offset = UINT64_MAX; | |
2526 | boolean_t use_scratch = B_FALSE; | |
2527 | /* | |
2528 | * Note: when the expansion is completing, we set | |
2529 | * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync()) | |
2530 | * in a later txg than when we last update spa_ubsync's state | |
2531 | * (see the end of spa_raidz_expand_thread()). Therefore we | |
2532 | * may see vre_state!=SCANNING before | |
2533 | * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected | |
2534 | * on disk, but the copying progress has been synced to disk | |
2535 | * (and reflected in spa_ubsync). In this case it's fine to | |
2536 | * treat the expansion as completed, since if we crash there's | |
2537 | * no additional copying to do. | |
2538 | */ | |
2539 | if (vdrz->vn_vre.vre_state == DSS_SCANNING) { | |
2540 | ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==, | |
2541 | &vdrz->vn_vre); | |
2542 | lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock, | |
2543 | zio->io_offset, zio->io_size, RL_READER); | |
2544 | use_scratch = | |
2545 | (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) == | |
2546 | RRSS_SCRATCH_VALID); | |
2547 | synced_offset = | |
2548 | RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync); | |
2549 | next_offset = vdrz->vn_vre.vre_offset; | |
2550 | /* | |
2551 | * If we haven't resumed expanding since importing the | |
2552 | * pool, vre_offset won't have been set yet. In | |
2553 | * this case the next offset to be copied is the same | |
2554 | * as what was synced. | |
2555 | */ | |
2556 | if (next_offset == UINT64_MAX) { | |
2557 | next_offset = synced_offset; | |
2558 | } | |
2559 | } | |
2560 | if (use_scratch) { | |
2561 | zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced=" | |
2562 | "%lld next_offset=%lld use_scratch=%u", | |
2563 | zio, | |
2564 | zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ", | |
2565 | (long long)zio->io_offset, | |
2566 | (long long)synced_offset, | |
2567 | (long long)next_offset, | |
2568 | use_scratch); | |
2569 | } | |
2570 | ||
2571 | rm = vdev_raidz_map_alloc_expanded(zio, | |
2572 | tvd->vdev_ashift, vdrz->vd_physical_width, | |
2573 | logical_width, vdrz->vd_nparity, | |
2574 | synced_offset, next_offset, use_scratch); | |
2575 | rm->rm_lr = lr; | |
2576 | } else { | |
2577 | rm = vdev_raidz_map_alloc(zio, | |
2578 | tvd->vdev_ashift, logical_width, vdrz->vd_nparity); | |
2579 | } | |
2580 | rm->rm_original_width = vdrz->vd_original_width; | |
b2255edc | 2581 | |
330c6c05 MA |
2582 | zio->io_vsd = rm; |
2583 | zio->io_vsd_ops = &vdev_raidz_vsd_ops; | |
b2255edc | 2584 | if (zio->io_type == ZIO_TYPE_WRITE) { |
5caeef02 DB |
2585 | for (int i = 0; i < rm->rm_nrows; i++) { |
2586 | vdev_raidz_io_start_write(zio, rm->rm_row[i]); | |
2587 | } | |
2588 | ||
2589 | if (logical_width == vdrz->vd_physical_width) { | |
2590 | raidz_start_skip_writes(zio); | |
2591 | } | |
b2255edc BB |
2592 | } else { |
2593 | ASSERT(zio->io_type == ZIO_TYPE_READ); | |
5caeef02 | 2594 | vdev_raidz_io_start_read(zio, rm); |
b2255edc | 2595 | } |
34dc7c2f | 2596 | |
98b25418 | 2597 | zio_execute(zio); |
34dc7c2f BB |
2598 | } |
2599 | ||
2600 | /* | |
2601 | * Report a checksum error for a child of a RAID-Z device. | |
2602 | */ | |
3c80e074 BB |
2603 | void |
2604 | vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) | |
34dc7c2f BB |
2605 | { |
2606 | vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; | |
34dc7c2f | 2607 | |
b2255edc BB |
2608 | if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) && |
2609 | zio->io_priority != ZIO_PRIORITY_REBUILD) { | |
428870ff BB |
2610 | zio_bad_cksum_t zbc; |
2611 | raidz_map_t *rm = zio->io_vsd; | |
2612 | ||
428870ff BB |
2613 | zbc.zbc_has_cksum = 0; |
2614 | zbc.zbc_injected = rm->rm_ecksuminjected; | |
2615 | ||
03e02e5b DB |
2616 | mutex_enter(&vd->vdev_stat_lock); |
2617 | vd->vdev_stat.vs_checksum_errors++; | |
2618 | mutex_exit(&vd->vdev_stat_lock); | |
7a75f74c RW |
2619 | (void) zfs_ereport_post_checksum(zio->io_spa, vd, |
2620 | &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, | |
2621 | rc->rc_abd, bad_data, &zbc); | |
34dc7c2f | 2622 | } |
428870ff BB |
2623 | } |
2624 | ||
2625 | /* | |
2626 | * We keep track of whether or not there were any injected errors, so that | |
2627 | * any ereports we generate can note it. | |
2628 | */ | |
2629 | static int | |
2630 | raidz_checksum_verify(zio_t *zio) | |
2631 | { | |
6fd87e1d | 2632 | zio_bad_cksum_t zbc = {0}; |
428870ff BB |
2633 | raidz_map_t *rm = zio->io_vsd; |
2634 | ||
1c27024e | 2635 | int ret = zio_checksum_error(zio, &zbc); |
428870ff BB |
2636 | if (ret != 0 && zbc.zbc_injected != 0) |
2637 | rm->rm_ecksuminjected = 1; | |
34dc7c2f | 2638 | |
428870ff | 2639 | return (ret); |
34dc7c2f BB |
2640 | } |
2641 | ||
2642 | /* | |
2643 | * Generate the parity from the data columns. If we tried and were able to | |
2644 | * read the parity without error, verify that the generated parity matches the | |
2645 | * data we read. If it doesn't, we fire off a checksum error. Return the | |
b2255edc | 2646 | * number of such failures. |
34dc7c2f BB |
2647 | */ |
2648 | static int | |
b2255edc | 2649 | raidz_parity_verify(zio_t *zio, raidz_row_t *rr) |
34dc7c2f | 2650 | { |
84c07ada | 2651 | abd_t *orig[VDEV_RAIDZ_MAXPARITY]; |
34dc7c2f | 2652 | int c, ret = 0; |
b2255edc | 2653 | raidz_map_t *rm = zio->io_vsd; |
34dc7c2f BB |
2654 | raidz_col_t *rc; |
2655 | ||
3c67d83a TH |
2656 | blkptr_t *bp = zio->io_bp; |
2657 | enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : | |
2658 | (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); | |
2659 | ||
2660 | if (checksum == ZIO_CHECKSUM_NOPARITY) | |
2661 | return (ret); | |
2662 | ||
b2255edc BB |
2663 | for (c = 0; c < rr->rr_firstdatacol; c++) { |
2664 | rc = &rr->rr_col[c]; | |
34dc7c2f BB |
2665 | if (!rc->rc_tried || rc->rc_error != 0) |
2666 | continue; | |
84c07ada | 2667 | |
74230a5b AM |
2668 | orig[c] = rc->rc_abd; |
2669 | ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size); | |
2670 | rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE); | |
34dc7c2f BB |
2671 | } |
2672 | ||
3c80e074 BB |
2673 | /* |
2674 | * Verify any empty sectors are zero filled to ensure the parity | |
2675 | * is calculated correctly even if these non-data sectors are damaged. | |
2676 | */ | |
2677 | if (rr->rr_nempty && rr->rr_abd_empty != NULL) | |
2678 | ret += vdev_draid_map_verify_empty(zio, rr); | |
2679 | ||
b2255edc BB |
2680 | /* |
2681 | * Regenerates parity even for !tried||rc_error!=0 columns. This | |
2682 | * isn't harmful but it does have the side effect of fixing stuff | |
2683 | * we didn't realize was necessary (i.e. even if we return 0). | |
2684 | */ | |
2685 | vdev_raidz_generate_parity_row(rm, rr); | |
2686 | ||
2687 | for (c = 0; c < rr->rr_firstdatacol; c++) { | |
2688 | rc = &rr->rr_col[c]; | |
34dc7c2f | 2689 | |
34dc7c2f BB |
2690 | if (!rc->rc_tried || rc->rc_error != 0) |
2691 | continue; | |
b2255edc | 2692 | |
84c07ada | 2693 | if (abd_cmp(orig[c], rc->rc_abd) != 0) { |
5caeef02 DB |
2694 | zfs_dbgmsg("found error on col=%u devidx=%u off %llx", |
2695 | c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset); | |
3c80e074 | 2696 | vdev_raidz_checksum_error(zio, rc, orig[c]); |
2e528b49 | 2697 | rc->rc_error = SET_ERROR(ECKSUM); |
34dc7c2f BB |
2698 | ret++; |
2699 | } | |
84c07ada | 2700 | abd_free(orig[c]); |
34dc7c2f BB |
2701 | } |
2702 | ||
2703 | return (ret); | |
2704 | } | |
2705 | ||
34dc7c2f | 2706 | static int |
b2255edc | 2707 | vdev_raidz_worst_error(raidz_row_t *rr) |
b128c09f | 2708 | { |
1c27024e | 2709 | int error = 0; |
b128c09f | 2710 | |
5caeef02 | 2711 | for (int c = 0; c < rr->rr_cols; c++) { |
b2255edc | 2712 | error = zio_worst_error(error, rr->rr_col[c].rc_error); |
5caeef02 DB |
2713 | error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error); |
2714 | } | |
b128c09f BB |
2715 | |
2716 | return (error); | |
2717 | } | |
2718 | ||
b2255edc BB |
2719 | static void |
2720 | vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr) | |
45d1cae3 | 2721 | { |
b2255edc BB |
2722 | int unexpected_errors = 0; |
2723 | int parity_errors = 0; | |
2724 | int parity_untried = 0; | |
2725 | int data_errors = 0; | |
45d1cae3 | 2726 | |
b2255edc BB |
2727 | ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); |
2728 | ||
2729 | for (int c = 0; c < rr->rr_cols; c++) { | |
2730 | raidz_col_t *rc = &rr->rr_col[c]; | |
2731 | ||
2732 | if (rc->rc_error) { | |
2733 | if (c < rr->rr_firstdatacol) | |
2734 | parity_errors++; | |
2735 | else | |
2736 | data_errors++; | |
2737 | ||
2738 | if (!rc->rc_skipped) | |
2739 | unexpected_errors++; | |
2740 | } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { | |
2741 | parity_untried++; | |
2742 | } | |
ad8b9f94 BB |
2743 | |
2744 | if (rc->rc_force_repair) | |
2745 | unexpected_errors++; | |
b2255edc | 2746 | } |
45d1cae3 BB |
2747 | |
2748 | /* | |
b2255edc BB |
2749 | * If we read more parity disks than were used for |
2750 | * reconstruction, confirm that the other parity disks produced | |
2751 | * correct data. | |
2752 | * | |
2753 | * Note that we also regenerate parity when resilvering so we | |
2754 | * can write it out to failed devices later. | |
45d1cae3 | 2755 | */ |
b2255edc BB |
2756 | if (parity_errors + parity_untried < |
2757 | rr->rr_firstdatacol - data_errors || | |
2758 | (zio->io_flags & ZIO_FLAG_RESILVER)) { | |
2759 | int n = raidz_parity_verify(zio, rr); | |
2760 | unexpected_errors += n; | |
b2255edc | 2761 | } |
45d1cae3 | 2762 | |
b2255edc BB |
2763 | if (zio->io_error == 0 && spa_writeable(zio->io_spa) && |
2764 | (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) { | |
45d1cae3 | 2765 | /* |
b2255edc | 2766 | * Use the good data we have in hand to repair damaged children. |
45d1cae3 | 2767 | */ |
b2255edc BB |
2768 | for (int c = 0; c < rr->rr_cols; c++) { |
2769 | raidz_col_t *rc = &rr->rr_col[c]; | |
2770 | vdev_t *vd = zio->io_vd; | |
2771 | vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; | |
2772 | ||
8fb577ae BB |
2773 | if (!rc->rc_allow_repair) { |
2774 | continue; | |
2775 | } else if (!rc->rc_force_repair && | |
2776 | (rc->rc_error == 0 || rc->rc_size == 0)) { | |
b2255edc | 2777 | continue; |
45d1cae3 BB |
2778 | } |
2779 | ||
5caeef02 DB |
2780 | zfs_dbgmsg("zio=%px repairing c=%u devidx=%u " |
2781 | "offset=%llx", | |
2782 | zio, c, rc->rc_devidx, (long long)rc->rc_offset); | |
2783 | ||
b2255edc BB |
2784 | zio_nowait(zio_vdev_child_io(zio, NULL, cvd, |
2785 | rc->rc_offset, rc->rc_abd, rc->rc_size, | |
2786 | ZIO_TYPE_WRITE, | |
2787 | zio->io_priority == ZIO_PRIORITY_REBUILD ? | |
2788 | ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, | |
2789 | ZIO_FLAG_IO_REPAIR | (unexpected_errors ? | |
2790 | ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); | |
2791 | } | |
2792 | } | |
5caeef02 DB |
2793 | |
2794 | /* | |
2795 | * Scrub or resilver i/o's: overwrite any shadow locations with the | |
2796 | * good data. This ensures that if we've already copied this sector, | |
2797 | * it will be corrected if it was damaged. This writes more than is | |
2798 | * necessary, but since expansion is paused during scrub/resilver, at | |
2799 | * most a single row will have a shadow location. | |
2800 | */ | |
2801 | if (zio->io_error == 0 && spa_writeable(zio->io_spa) && | |
2802 | (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) { | |
2803 | for (int c = 0; c < rr->rr_cols; c++) { | |
2804 | raidz_col_t *rc = &rr->rr_col[c]; | |
2805 | vdev_t *vd = zio->io_vd; | |
2806 | ||
2807 | if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0) | |
2808 | continue; | |
2809 | vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx]; | |
2810 | ||
2811 | /* | |
2812 | * Note: We don't want to update the repair stats | |
2813 | * because that would incorrectly indicate that there | |
2814 | * was bad data to repair, which we aren't sure about. | |
2815 | * By clearing the SCAN_THREAD flag, we prevent this | |
2816 | * from happening, despite having the REPAIR flag set. | |
2817 | * We need to set SELF_HEAL so that this i/o can't be | |
2818 | * bypassed by zio_vdev_io_start(). | |
2819 | */ | |
2820 | zio_t *cio = zio_vdev_child_io(zio, NULL, cvd, | |
2821 | rc->rc_shadow_offset, rc->rc_abd, rc->rc_size, | |
2822 | ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, | |
2823 | ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, | |
2824 | NULL, NULL); | |
2825 | cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD; | |
2826 | zio_nowait(cio); | |
2827 | } | |
2828 | } | |
b2255edc BB |
2829 | } |
2830 | ||
2831 | static void | |
2832 | raidz_restore_orig_data(raidz_map_t *rm) | |
2833 | { | |
2834 | for (int i = 0; i < rm->rm_nrows; i++) { | |
2835 | raidz_row_t *rr = rm->rm_row[i]; | |
2836 | for (int c = 0; c < rr->rr_cols; c++) { | |
2837 | raidz_col_t *rc = &rr->rr_col[c]; | |
2838 | if (rc->rc_need_orig_restore) { | |
330c6c05 | 2839 | abd_copy(rc->rc_abd, |
b2255edc BB |
2840 | rc->rc_orig_data, rc->rc_size); |
2841 | rc->rc_need_orig_restore = B_FALSE; | |
45d1cae3 | 2842 | } |
b2255edc BB |
2843 | } |
2844 | } | |
2845 | } | |
2846 | ||
5caeef02 DB |
2847 | /* |
2848 | * During raidz_reconstruct() for expanded VDEV, we need special consideration | |
2849 | * failure simulations. See note in raidz_reconstruct() on simulating failure | |
2850 | * of a pre-expansion device. | |
2851 | * | |
2852 | * Treating logical child i as failed, return TRUE if the given column should | |
2853 | * be treated as failed. The idea of logical children allows us to imagine | |
2854 | * that a disk silently failed before a RAIDZ expansion (reads from this disk | |
2855 | * succeed but return the wrong data). Since the expansion doesn't verify | |
2856 | * checksums, the incorrect data will be moved to new locations spread among | |
2857 | * the children (going diagonally across them). | |
2858 | * | |
2859 | * Higher "logical child failures" (values of `i`) indicate these | |
2860 | * "pre-expansion failures". The first physical_width values imagine that a | |
2861 | * current child failed; the next physical_width-1 values imagine that a | |
2862 | * child failed before the most recent expansion; the next physical_width-2 | |
2863 | * values imagine a child failed in the expansion before that, etc. | |
2864 | */ | |
2865 | static boolean_t | |
2866 | raidz_simulate_failure(int physical_width, int original_width, int ashift, | |
2867 | int i, raidz_col_t *rc) | |
2868 | { | |
2869 | uint64_t sector_id = | |
2870 | physical_width * (rc->rc_offset >> ashift) + | |
2871 | rc->rc_devidx; | |
2872 | ||
2873 | for (int w = physical_width; w >= original_width; w--) { | |
2874 | if (i < w) { | |
2875 | return (sector_id % w == i); | |
2876 | } else { | |
2877 | i -= w; | |
2878 | } | |
2879 | } | |
2880 | ASSERT(!"invalid logical child id"); | |
2881 | return (B_FALSE); | |
2882 | } | |
2883 | ||
b2255edc BB |
2884 | /* |
2885 | * returns EINVAL if reconstruction of the block will not be possible | |
2886 | * returns ECKSUM if this specific reconstruction failed | |
2887 | * returns 0 on successful reconstruction | |
2888 | */ | |
2889 | static int | |
2890 | raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity) | |
2891 | { | |
2892 | raidz_map_t *rm = zio->io_vsd; | |
5caeef02 DB |
2893 | int physical_width = zio->io_vd->vdev_children; |
2894 | int original_width = (rm->rm_original_width != 0) ? | |
2895 | rm->rm_original_width : physical_width; | |
2896 | int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT; | |
2897 | ||
2898 | if (dbgmsg) { | |
2899 | zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u " | |
2900 | "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts); | |
2901 | } | |
45d1cae3 | 2902 | |
b2255edc BB |
2903 | /* Reconstruct each row */ |
2904 | for (int r = 0; r < rm->rm_nrows; r++) { | |
2905 | raidz_row_t *rr = rm->rm_row[r]; | |
2906 | int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */ | |
2907 | int t = 0; | |
2908 | int dead = 0; | |
2909 | int dead_data = 0; | |
2910 | ||
5caeef02 DB |
2911 | if (dbgmsg) |
2912 | zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r); | |
2913 | ||
b2255edc BB |
2914 | for (int c = 0; c < rr->rr_cols; c++) { |
2915 | raidz_col_t *rc = &rr->rr_col[c]; | |
2916 | ASSERT0(rc->rc_need_orig_restore); | |
2917 | if (rc->rc_error != 0) { | |
2918 | dead++; | |
2919 | if (c >= nparity) | |
2920 | dead_data++; | |
2921 | continue; | |
2922 | } | |
2923 | if (rc->rc_size == 0) | |
2924 | continue; | |
2925 | for (int lt = 0; lt < ntgts; lt++) { | |
5caeef02 DB |
2926 | if (raidz_simulate_failure(physical_width, |
2927 | original_width, | |
2928 | zio->io_vd->vdev_top->vdev_ashift, | |
2929 | ltgts[lt], rc)) { | |
b2255edc BB |
2930 | if (rc->rc_orig_data == NULL) { |
2931 | rc->rc_orig_data = | |
330c6c05 MA |
2932 | abd_alloc_linear( |
2933 | rc->rc_size, B_TRUE); | |
2934 | abd_copy(rc->rc_orig_data, | |
b2255edc BB |
2935 | rc->rc_abd, rc->rc_size); |
2936 | } | |
2937 | rc->rc_need_orig_restore = B_TRUE; | |
2938 | ||
2939 | dead++; | |
2940 | if (c >= nparity) | |
2941 | dead_data++; | |
5caeef02 DB |
2942 | /* |
2943 | * Note: simulating failure of a | |
2944 | * pre-expansion device can hit more | |
2945 | * than one column, in which case we | |
2946 | * might try to simulate more failures | |
2947 | * than can be reconstructed, which is | |
2948 | * also more than the size of my_tgts. | |
2949 | * This check prevents accessing past | |
2950 | * the end of my_tgts. The "dead > | |
2951 | * nparity" check below will fail this | |
2952 | * reconstruction attempt. | |
2953 | */ | |
2954 | if (t < VDEV_RAIDZ_MAXPARITY) { | |
2955 | my_tgts[t++] = c; | |
2956 | if (dbgmsg) { | |
2957 | zfs_dbgmsg("simulating " | |
2958 | "failure of col %u " | |
2959 | "devidx %u", c, | |
2960 | (int)rc->rc_devidx); | |
2961 | } | |
2962 | } | |
b2255edc BB |
2963 | break; |
2964 | } | |
2965 | } | |
2966 | } | |
2967 | if (dead > nparity) { | |
2968 | /* reconstruction not possible */ | |
5caeef02 DB |
2969 | if (dbgmsg) { |
2970 | zfs_dbgmsg("reconstruction not possible; " | |
2971 | "too many failures"); | |
2972 | } | |
b2255edc BB |
2973 | raidz_restore_orig_data(rm); |
2974 | return (EINVAL); | |
45d1cae3 | 2975 | } |
b2255edc | 2976 | if (dead_data > 0) |
46df6e98 | 2977 | vdev_raidz_reconstruct_row(rm, rr, my_tgts, t); |
b2255edc | 2978 | } |
45d1cae3 | 2979 | |
b2255edc BB |
2980 | /* Check for success */ |
2981 | if (raidz_checksum_verify(zio) == 0) { | |
2982 | ||
2983 | /* Reconstruction succeeded - report errors */ | |
2984 | for (int i = 0; i < rm->rm_nrows; i++) { | |
2985 | raidz_row_t *rr = rm->rm_row[i]; | |
2986 | ||
2987 | for (int c = 0; c < rr->rr_cols; c++) { | |
2988 | raidz_col_t *rc = &rr->rr_col[c]; | |
2989 | if (rc->rc_need_orig_restore) { | |
2990 | /* | |
2991 | * Note: if this is a parity column, | |
2992 | * we don't really know if it's wrong. | |
2993 | * We need to let | |
2994 | * vdev_raidz_io_done_verified() check | |
2995 | * it, and if we set rc_error, it will | |
2996 | * think that it is a "known" error | |
2997 | * that doesn't need to be checked | |
2998 | * or corrected. | |
2999 | */ | |
3000 | if (rc->rc_error == 0 && | |
3001 | c >= rr->rr_firstdatacol) { | |
3c80e074 | 3002 | vdev_raidz_checksum_error(zio, |
330c6c05 | 3003 | rc, rc->rc_orig_data); |
b2255edc BB |
3004 | rc->rc_error = |
3005 | SET_ERROR(ECKSUM); | |
3006 | } | |
3007 | rc->rc_need_orig_restore = B_FALSE; | |
3008 | } | |
3009 | } | |
45d1cae3 | 3010 | |
b2255edc | 3011 | vdev_raidz_io_done_verified(zio, rr); |
45d1cae3 BB |
3012 | } |
3013 | ||
b2255edc | 3014 | zio_checksum_verified(zio); |
45d1cae3 | 3015 | |
5caeef02 DB |
3016 | if (dbgmsg) { |
3017 | zfs_dbgmsg("reconstruction successful " | |
3018 | "(checksum verified)"); | |
3019 | } | |
b2255edc BB |
3020 | return (0); |
3021 | } | |
45d1cae3 | 3022 | |
b2255edc BB |
3023 | /* Reconstruction failed - restore original data */ |
3024 | raidz_restore_orig_data(rm); | |
5caeef02 DB |
3025 | if (dbgmsg) { |
3026 | zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum " | |
3027 | "failed", zio); | |
3028 | } | |
b2255edc BB |
3029 | return (ECKSUM); |
3030 | } | |
45d1cae3 | 3031 | |
b2255edc BB |
3032 | /* |
3033 | * Iterate over all combinations of N bad vdevs and attempt a reconstruction. | |
3034 | * Note that the algorithm below is non-optimal because it doesn't take into | |
3035 | * account how reconstruction is actually performed. For example, with | |
3036 | * triple-parity RAID-Z the reconstruction procedure is the same if column 4 | |
3037 | * is targeted as invalid as if columns 1 and 4 are targeted since in both | |
3038 | * cases we'd only use parity information in column 0. | |
3039 | * | |
3040 | * The order that we find the various possible combinations of failed | |
3041 | * disks is dictated by these rules: | |
3042 | * - Examine each "slot" (the "i" in tgts[i]) | |
5caeef02 | 3043 | * - Try to increment this slot (tgts[i] += 1) |
b2255edc BB |
3044 | * - if we can't increment because it runs into the next slot, |
3045 | * reset our slot to the minimum, and examine the next slot | |
3046 | * | |
3047 | * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose | |
3048 | * 3 columns to reconstruct), we will generate the following sequence: | |
3049 | * | |
3050 | * STATE ACTION | |
3051 | * 0 1 2 special case: skip since these are all parity | |
3052 | * 0 1 3 first slot: reset to 0; middle slot: increment to 2 | |
3053 | * 0 2 3 first slot: increment to 1 | |
3054 | * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4 | |
3055 | * 0 1 4 first: reset to 0; middle: increment to 2 | |
3056 | * 0 2 4 first: increment to 1 | |
3057 | * 1 2 4 first: reset to 0; middle: increment to 3 | |
3058 | * 0 3 4 first: increment to 1 | |
3059 | * 1 3 4 first: increment to 2 | |
3060 | * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5 | |
3061 | * 0 1 5 first: reset to 0; middle: increment to 2 | |
3062 | * 0 2 5 first: increment to 1 | |
3063 | * 1 2 5 first: reset to 0; middle: increment to 3 | |
3064 | * 0 3 5 first: increment to 1 | |
3065 | * 1 3 5 first: increment to 2 | |
3066 | * 2 3 5 first: reset to 0; middle: increment to 4 | |
3067 | * 0 4 5 first: increment to 1 | |
3068 | * 1 4 5 first: increment to 2 | |
3069 | * 2 4 5 first: increment to 3 | |
3070 | * 3 4 5 done | |
3071 | * | |
bf169e9f | 3072 | * This strategy works for dRAID but is less efficient when there are a large |
b2255edc | 3073 | * number of child vdevs and therefore permutations to check. Furthermore, |
5caeef02 | 3074 | * since the raidz_map_t rows likely do not overlap, reconstruction would be |
b2255edc BB |
3075 | * possible as long as there are no more than nparity data errors per row. |
3076 | * These additional permutations are not currently checked but could be as | |
3077 | * a future improvement. | |
5caeef02 DB |
3078 | * |
3079 | * Returns 0 on success. On failure returns ECKSUM, or the worst column error of a row that already has more than nparity known errors. |
b2255edc BB |
3080 | */ |
3081 | static int | |
3082 | vdev_raidz_combrec(zio_t *zio) | |
3083 | { | |
3084 | int nparity = vdev_get_nparity(zio->io_vd); | |
3085 | raidz_map_t *rm = zio->io_vsd; | |
5caeef02 DB |
3086 | int physical_width = zio->io_vd->vdev_children; |
3087 | int original_width = (rm->rm_original_width != 0) ? | |
3088 | rm->rm_original_width : physical_width; | |
45d1cae3 | 3089 | |
b2255edc BB |
3090 | for (int i = 0; i < rm->rm_nrows; i++) { |
3091 | raidz_row_t *rr = rm->rm_row[i]; | |
3092 | int total_errors = 0; | |
45d1cae3 | 3093 | |
b2255edc BB |
3094 | for (int c = 0; c < rr->rr_cols; c++) { |
3095 | if (rr->rr_col[c].rc_error) | |
3096 | total_errors++; | |
3097 | } | |
45d1cae3 | 3098 | |
b2255edc BB |
3099 | if (total_errors > nparity) |
3100 | return (vdev_raidz_worst_error(rr)); | |
3101 | } | |
45d1cae3 | 3102 | |
b2255edc BB |
3103 | for (int num_failures = 1; num_failures <= nparity; num_failures++) { |
3104 | int tstore[VDEV_RAIDZ_MAXPARITY + 2]; | |
3105 | int *ltgts = &tstore[1]; /* value is logical child ID */ | |
3106 | ||
5caeef02 DB |
3107 | |
3108 | /* | |
3109 | * Determine number of logical children, n. See comment | |
3110 | * above raidz_simulate_failure(). | |
3111 | */ | |
3112 | int n = 0; | |
3113 | for (int w = physical_width; | |
3114 | w >= original_width; w--) { | |
3115 | n += w; | |
3116 | } | |
b2255edc BB |
3117 | |
3118 | ASSERT3U(num_failures, <=, nparity); | |
3119 | ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY); | |
3120 | ||
3121 | /* Handle corner cases in combrec logic */ | |
3122 | ltgts[-1] = -1; | |
3123 | for (int i = 0; i < num_failures; i++) { | |
3124 | ltgts[i] = i; | |
3125 | } | |
3126 | ltgts[num_failures] = n; | |
3127 | ||
3128 | for (;;) { | |
3129 | int err = raidz_reconstruct(zio, ltgts, num_failures, | |
3130 | nparity); | |
3131 | if (err == EINVAL) { | |
45d1cae3 | 3132 | /* |
b2255edc BB |
3133 | * Reconstruction not possible with this # |
3134 | * failures; try more failures. | |
45d1cae3 | 3135 | */ |
b2255edc BB |
3136 | break; |
3137 | } else if (err == 0) | |
3138 | return (0); | |
3139 | ||
3140 | /* Compute next targets to try */ | |
3141 | for (int t = 0; ; t++) { | |
3142 | ASSERT3U(t, <, num_failures); | |
3143 | ltgts[t]++; | |
3144 | if (ltgts[t] == n) { | |
3145 | /* try more failures */ | |
3146 | ASSERT3U(t, ==, num_failures - 1); | |
5caeef02 DB |
3147 | if (zfs_flags & |
3148 | ZFS_DEBUG_RAIDZ_RECONSTRUCT) { | |
3149 | zfs_dbgmsg("reconstruction " | |
3150 | "failed for num_failures=" | |
3151 | "%u; tried all " | |
3152 | "combinations", | |
3153 | num_failures); | |
3154 | } | |
b2255edc BB |
3155 | break; |
3156 | } | |
45d1cae3 | 3157 | |
b2255edc BB |
3158 | ASSERT3U(ltgts[t], <, n); |
3159 | ASSERT3U(ltgts[t], <=, ltgts[t + 1]); | |
45d1cae3 BB |
3160 | |
3161 | /* | |
3162 | * If that spot is available, we're done here. | |
b2255edc | 3163 | * Try the next combination. |
45d1cae3 | 3164 | */ |
b2255edc | 3165 | if (ltgts[t] != ltgts[t + 1]) |
5caeef02 | 3166 | break; // found next combination |
45d1cae3 BB |
3167 | |
3168 | /* | |
b2255edc BB |
3169 | * Otherwise, reset this tgt to the minimum, |
3170 | * and move on to the next tgt. | |
45d1cae3 | 3171 | */ |
b2255edc BB |
3172 | ltgts[t] = ltgts[t - 1] + 1; |
3173 | ASSERT3U(ltgts[t], ==, t); | |
3174 | } | |
45d1cae3 | 3175 | |
b2255edc BB |
3176 | /* Increase the number of failures and keep trying. */ |
3177 | if (ltgts[num_failures - 1] == n) | |
3178 | break; | |
45d1cae3 BB |
3179 | } |
3180 | } | |
5caeef02 DB |
3181 | if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) |
3182 | zfs_dbgmsg("reconstruction failed for all num_failures"); | |
b2255edc BB |
3183 | return (ECKSUM); |
3184 | } | |
3185 | ||
3186 | void | |
3187 | vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) | |
3188 | { | |
3189 | for (uint64_t row = 0; row < rm->rm_nrows; row++) { | |
3190 | raidz_row_t *rr = rm->rm_row[row]; | |
3191 | vdev_raidz_reconstruct_row(rm, rr, t, nt); | |
3192 | } | |
45d1cae3 BB |
3193 | } |
3194 | ||
e49f1e20 | 3195 | /* |
b2255edc | 3196 | * Complete a write IO operation on a RAIDZ VDev |
e49f1e20 WA |
3197 | * |
3198 | * Outline: | |
e49f1e20 WA |
3199 | * 1. Check for errors on the child IOs. |
3200 | * 2. Return, setting an error code if too few child VDevs were written | |
3201 | * to reconstruct the data later. Note that partial writes are | |
3202 | * considered successful if they can be reconstructed at all. | |
e49f1e20 | 3203 | */ |
b128c09f | 3204 | static void |
b2255edc BB |
3205 | vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr) |
3206 | { | |
5caeef02 DB |
3207 | int normal_errors = 0; |
3208 | int shadow_errors = 0; | |
b2255edc BB |
3209 | |
3210 | ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); | |
3211 | ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); | |
3212 | ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); | |
3213 | ||
3214 | for (int c = 0; c < rr->rr_cols; c++) { | |
3215 | raidz_col_t *rc = &rr->rr_col[c]; | |
3216 | ||
5caeef02 | 3217 | if (rc->rc_error != 0) { |
b2255edc | 3218 | ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ |
5caeef02 DB |
3219 | normal_errors++; |
3220 | } | |
3221 | if (rc->rc_shadow_error != 0) { | |
3222 | ASSERT(rc->rc_shadow_error != ECKSUM); | |
3223 | shadow_errors++; | |
b2255edc BB |
3224 | } |
3225 | } | |
3226 | ||
3227 | /* | |
3228 | * Treat partial writes as a success. If we couldn't write enough | |
5caeef02 DB |
3229 | * columns to reconstruct the data, the I/O failed. Otherwise, good |
3230 | * enough. Note that in the case of a shadow write (during raidz | |
3231 | * expansion), depending on if we crash, either the normal (old) or | |
3232 | * shadow (new) location may become the "real" version of the block, | |
3233 | * so both locations must have sufficient redundancy. | |
b2255edc BB |
3234 | * |
3235 | * Now that we support write reallocation, it would be better | |
3236 | * to treat partial failure as real failure unless there are | |
3237 | * no non-degraded top-level vdevs left, and not update DTLs | |
3238 | * if we intend to reallocate. | |
3239 | */ | |
5caeef02 DB |
3240 | if (normal_errors > rr->rr_firstdatacol || |
3241 | shadow_errors > rr->rr_firstdatacol) { | |
b2255edc BB |
3242 | zio->io_error = zio_worst_error(zio->io_error, |
3243 | vdev_raidz_worst_error(rr)); | |
3244 | } | |
3245 | } | |
3246 | ||
46df6e98 | 3247 | static void |
b2255edc BB |
3248 | vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm, |
3249 | raidz_row_t *rr) | |
34dc7c2f | 3250 | { |
34dc7c2f BB |
3251 | int parity_errors = 0; |
3252 | int parity_untried = 0; | |
3253 | int data_errors = 0; | |
b128c09f | 3254 | int total_errors = 0; |
34dc7c2f | 3255 | |
b2255edc BB |
3256 | ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol); |
3257 | ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol); | |
34dc7c2f | 3258 | |
b2255edc BB |
3259 | for (int c = 0; c < rr->rr_cols; c++) { |
3260 | raidz_col_t *rc = &rr->rr_col[c]; | |
34dc7c2f | 3261 | |
ad8b9f94 BB |
3262 | /* |
3263 | * If scrubbing and a replacing/sparing child vdev determined | |
3264 | * that not all of its children have an identical copy of the | |
3265 | * data, then clear the error so the column is treated like | |
3266 | * any other read and force a repair to correct the damage. | |
3267 | */ | |
3268 | if (rc->rc_error == ECKSUM) { | |
3269 | ASSERT(zio->io_flags & ZIO_FLAG_SCRUB); | |
3270 | vdev_raidz_checksum_error(zio, rc, rc->rc_abd); | |
3271 | rc->rc_force_repair = 1; | |
3272 | rc->rc_error = 0; | |
3273 | } | |
34dc7c2f | 3274 | |
ad8b9f94 | 3275 | if (rc->rc_error) { |
b2255edc | 3276 | if (c < rr->rr_firstdatacol) |
34dc7c2f BB |
3277 | parity_errors++; |
3278 | else | |
3279 | data_errors++; | |
3280 | ||
b128c09f | 3281 | total_errors++; |
b2255edc | 3282 | } else if (c < rr->rr_firstdatacol && !rc->rc_tried) { |
34dc7c2f BB |
3283 | parity_untried++; |
3284 | } | |
3285 | } | |
3286 | ||
34dc7c2f | 3287 | /* |
b2255edc BB |
3288 | * If there were data errors and the number of errors we saw was |
3289 | * correctable -- less than or equal to the number of parity disks read | |
3290 | * -- reconstruct based on the missing data. | |
34dc7c2f | 3291 | */ |
b2255edc BB |
3292 | if (data_errors != 0 && |
3293 | total_errors <= rr->rr_firstdatacol - parity_untried) { | |
3294 | /* | |
3295 | * We either attempt to read all the parity columns or | |
3296 | * none of them. If we didn't try to read parity, we | |
3297 | * wouldn't be here in the correctable case. There must | |
3298 | * also have been fewer parity errors than parity | |
3299 | * columns or, again, we wouldn't be in this code path. | |
3300 | */ | |
3301 | ASSERT(parity_untried == 0); | |
3302 | ASSERT(parity_errors < rr->rr_firstdatacol); | |
34dc7c2f | 3303 | |
b2255edc BB |
3304 | /* |
3305 | * Identify the data columns that reported an error. | |
3306 | */ | |
3307 | int n = 0; | |
3308 | int tgts[VDEV_RAIDZ_MAXPARITY]; | |
3309 | for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) { | |
3310 | raidz_col_t *rc = &rr->rr_col[c]; | |
3311 | if (rc->rc_error != 0) { | |
3312 | ASSERT(n < VDEV_RAIDZ_MAXPARITY); | |
3313 | tgts[n++] = c; | |
34dc7c2f | 3314 | } |
b2255edc | 3315 | } |
34dc7c2f | 3316 | |
b2255edc | 3317 | ASSERT(rr->rr_firstdatacol >= n); |
34dc7c2f | 3318 | |
46df6e98 | 3319 | vdev_raidz_reconstruct_row(rm, rr, tgts, n); |
b2255edc | 3320 | } |
b2255edc | 3321 | } |
34dc7c2f | 3322 | |
b2255edc BB |
3323 | /* |
3324 | * Return the number of reads issued. | |
3325 | */ | |
3326 | static int | |
3327 | vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr) | |
3328 | { | |
3329 | vdev_t *vd = zio->io_vd; | |
3330 | int nread = 0; | |
34dc7c2f | 3331 | |
b2255edc BB |
3332 | rr->rr_missingdata = 0; |
3333 | rr->rr_missingparity = 0; | |
34dc7c2f BB |
3334 | |
3335 | /* | |
b2255edc BB |
3336 | * If this rows contains empty sectors which are not required |
3337 | * for a normal read then allocate an ABD for them now so they | |
3338 | * may be read, verified, and any needed repairs performed. | |
34dc7c2f | 3339 | */ |
5caeef02 | 3340 | if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL) |
b2255edc | 3341 | vdev_draid_map_alloc_empty(zio, rr); |
34dc7c2f | 3342 | |
b2255edc BB |
3343 | for (int c = 0; c < rr->rr_cols; c++) { |
3344 | raidz_col_t *rc = &rr->rr_col[c]; | |
3345 | if (rc->rc_tried || rc->rc_size == 0) | |
34dc7c2f BB |
3346 | continue; |
3347 | ||
b2255edc BB |
3348 | zio_nowait(zio_vdev_child_io(zio, NULL, |
3349 | vd->vdev_child[rc->rc_devidx], | |
3350 | rc->rc_offset, rc->rc_abd, rc->rc_size, | |
3351 | zio->io_type, zio->io_priority, 0, | |
3352 | vdev_raidz_child_done, rc)); | |
3353 | nread++; | |
34dc7c2f | 3354 | } |
b2255edc BB |
3355 | return (nread); |
3356 | } | |
34dc7c2f | 3357 | |
b2255edc BB |
3358 | /* |
3359 | * We're here because either there were too many errors to even attempt | |
3360 | * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec() | |
3361 | * failed. In either case, there is enough bad data to prevent reconstruction. | |
3362 | * Start checksum ereports for all children which haven't failed. | |
3363 | */ | |
3364 | static void | |
3365 | vdev_raidz_io_done_unrecoverable(zio_t *zio) | |
3366 | { | |
3367 | raidz_map_t *rm = zio->io_vsd; | |
34dc7c2f | 3368 | |
b2255edc BB |
3369 | for (int i = 0; i < rm->rm_nrows; i++) { |
3370 | raidz_row_t *rr = rm->rm_row[i]; | |
428870ff | 3371 | |
b2255edc BB |
3372 | for (int c = 0; c < rr->rr_cols; c++) { |
3373 | raidz_col_t *rc = &rr->rr_col[c]; | |
3374 | vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx]; | |
3375 | ||
3376 | if (rc->rc_error != 0) | |
3377 | continue; | |
3378 | ||
3379 | zio_bad_cksum_t zbc; | |
3380 | zbc.zbc_has_cksum = 0; | |
3381 | zbc.zbc_injected = rm->rm_ecksuminjected; | |
3382 | ||
03e02e5b DB |
3383 | mutex_enter(&cvd->vdev_stat_lock); |
3384 | cvd->vdev_stat.vs_checksum_errors++; | |
3385 | mutex_exit(&cvd->vdev_stat_lock); | |
7a75f74c RW |
3386 | (void) zfs_ereport_start_checksum(zio->io_spa, |
3387 | cvd, &zio->io_bookmark, zio, rc->rc_offset, | |
3388 | rc->rc_size, &zbc); | |
34dc7c2f BB |
3389 | } |
3390 | } | |
b2255edc | 3391 | } |
34dc7c2f | 3392 | |
b2255edc BB |
3393 | void |
3394 | vdev_raidz_io_done(zio_t *zio) | |
3395 | { | |
3396 | raidz_map_t *rm = zio->io_vsd; | |
34dc7c2f | 3397 | |
5caeef02 | 3398 | ASSERT(zio->io_bp != NULL); |
b2255edc BB |
3399 | if (zio->io_type == ZIO_TYPE_WRITE) { |
3400 | for (int i = 0; i < rm->rm_nrows; i++) { | |
3401 | vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]); | |
3402 | } | |
3403 | } else { | |
5caeef02 DB |
3404 | if (rm->rm_phys_col) { |
3405 | /* | |
3406 | * This is an aggregated read. Copy the data and status | |
3407 | * from the aggregate abd's to the individual rows. | |
3408 | */ | |
3409 | for (int i = 0; i < rm->rm_nrows; i++) { | |
3410 | raidz_row_t *rr = rm->rm_row[i]; | |
3411 | ||
3412 | for (int c = 0; c < rr->rr_cols; c++) { | |
3413 | raidz_col_t *rc = &rr->rr_col[c]; | |
3414 | if (rc->rc_tried || rc->rc_size == 0) | |
3415 | continue; | |
3416 | ||
3417 | raidz_col_t *prc = | |
3418 | &rm->rm_phys_col[rc->rc_devidx]; | |
3419 | rc->rc_error = prc->rc_error; | |
3420 | rc->rc_tried = prc->rc_tried; | |
3421 | rc->rc_skipped = prc->rc_skipped; | |
3422 | if (c >= rr->rr_firstdatacol) { | |
3423 | /* | |
3424 | * Note: this is slightly faster | |
3425 | * than using abd_copy_off(). | |
3426 | */ | |
3427 | char *physbuf = abd_to_buf( | |
3428 | prc->rc_abd); | |
3429 | void *physloc = physbuf + | |
3430 | rc->rc_offset - | |
3431 | prc->rc_offset; | |
3432 | ||
3433 | abd_copy_from_buf(rc->rc_abd, | |
3434 | physloc, rc->rc_size); | |
3435 | } | |
3436 | } | |
3437 | } | |
3438 | } | |
3439 | ||
b2255edc BB |
3440 | for (int i = 0; i < rm->rm_nrows; i++) { |
3441 | raidz_row_t *rr = rm->rm_row[i]; | |
46df6e98 | 3442 | vdev_raidz_io_done_reconstruct_known_missing(zio, |
b2255edc BB |
3443 | rm, rr); |
3444 | } | |
34dc7c2f | 3445 | |
b2255edc BB |
3446 | if (raidz_checksum_verify(zio) == 0) { |
3447 | for (int i = 0; i < rm->rm_nrows; i++) { | |
3448 | raidz_row_t *rr = rm->rm_row[i]; | |
3449 | vdev_raidz_io_done_verified(zio, rr); | |
3450 | } | |
3451 | zio_checksum_verified(zio); | |
3452 | } else { | |
3453 | /* | |
3454 | * A sequential resilver has no checksum which makes | |
3455 | * combinatoral reconstruction impossible. This code | |
3456 | * path is unreachable since raidz_checksum_verify() | |
3457 | * has no checksum to verify and must succeed. | |
3458 | */ | |
3459 | ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD); | |
34dc7c2f | 3460 | |
b2255edc BB |
3461 | /* |
3462 | * This isn't a typical situation -- either we got a | |
3463 | * read error or a child silently returned bad data. | |
3464 | * Read every block so we can try again with as much | |
3465 | * data and parity as we can track down. If we've | |
3466 | * already been through once before, all children will | |
3467 | * be marked as tried so we'll proceed to combinatorial | |
3468 | * reconstruction. | |
3469 | */ | |
3470 | int nread = 0; | |
3471 | for (int i = 0; i < rm->rm_nrows; i++) { | |
3472 | nread += vdev_raidz_read_all(zio, | |
3473 | rm->rm_row[i]); | |
3474 | } | |
3475 | if (nread != 0) { | |
3476 | /* | |
3477 | * Normally our stage is VDEV_IO_DONE, but if | |
3478 | * we've already called redone(), it will have | |
3479 | * changed to VDEV_IO_START, in which case we | |
3480 | * don't want to call redone() again. | |
3481 | */ | |
3482 | if (zio->io_stage != ZIO_STAGE_VDEV_IO_START) | |
3483 | zio_vdev_io_redone(zio); | |
3484 | return; | |
3485 | } | |
5caeef02 DB |
3486 | /* |
3487 | * It would be too expensive to try every possible | |
3488 | * combination of failed sectors in every row, so | |
3489 | * instead we try every combination of failed current or | |
3490 | * past physical disk. This means that if the incorrect | |
3491 | * sectors were all on Nparity disks at any point in the | |
3492 | * past, we will find the correct data. The only known | |
3493 | * case where this is less durable than a non-expanded | |
3494 | * RAIDZ, is if we have a silent failure during | |
3495 | * expansion. In that case, one block could be | |
3496 | * partially in the old format and partially in the | |
3497 | * new format, so we'd lost some sectors from the old | |
3498 | * format and some from the new format. | |
3499 | * | |
3500 | * e.g. logical_width=4 physical_width=6 | |
3501 | * the 15 (6+5+4) possible failed disks are: | |
3502 | * width=6 child=0 | |
3503 | * width=6 child=1 | |
3504 | * width=6 child=2 | |
3505 | * width=6 child=3 | |
3506 | * width=6 child=4 | |
3507 | * width=6 child=5 | |
3508 | * width=5 child=0 | |
3509 | * width=5 child=1 | |
3510 | * width=5 child=2 | |
3511 | * width=5 child=3 | |
3512 | * width=5 child=4 | |
3513 | * width=4 child=0 | |
3514 | * width=4 child=1 | |
3515 | * width=4 child=2 | |
3516 | * width=4 child=3 | |
3517 | * And we will try every combination of Nparity of these | |
3518 | * failing. | |
3519 | * | |
3520 | * As a first pass, we can generate every combo, | |
3521 | * and try reconstructing, ignoring any known | |
3522 | * failures. If any row has too many known + simulated | |
3523 | * failures, then we bail on reconstructing with this | |
3524 | * number of simulated failures. As an improvement, | |
3525 | * we could detect the number of whole known failures | |
3526 | * (i.e. we have known failures on these disks for | |
3527 | * every row; the disks never succeeded), and | |
3528 | * subtract that from the max # failures to simulate. | |
3529 | * We could go even further like the current | |
3530 | * combrec code, but that doesn't seem like it | |
3531 | * gains us very much. If we simulate a failure | |
3532 | * that is also a known failure, that's fine. | |
3533 | */ | |
b2255edc BB |
3534 | zio->io_error = vdev_raidz_combrec(zio); |
3535 | if (zio->io_error == ECKSUM && | |
3536 | !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { | |
3537 | vdev_raidz_io_done_unrecoverable(zio); | |
3538 | } | |
34dc7c2f | 3539 | } |
34dc7c2f | 3540 | } |
5caeef02 DB |
3541 | if (rm->rm_lr != NULL) { |
3542 | zfs_rangelock_exit(rm->rm_lr); | |
3543 | rm->rm_lr = NULL; | |
3544 | } | |
34dc7c2f BB |
3545 | } |
3546 | ||
3547 | static void | |
3548 | vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) | |
3549 | { | |
b2255edc BB |
3550 | vdev_raidz_t *vdrz = vd->vdev_tsd; |
3551 | if (faulted > vdrz->vd_nparity) | |
34dc7c2f BB |
3552 | vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, |
3553 | VDEV_AUX_NO_REPLICAS); | |
3554 | else if (degraded + faulted != 0) | |
3555 | vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); | |
3556 | else | |
3557 | vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); | |
3558 | } | |
3559 | ||
3d6da72d IH |
3560 | /* |
3561 | * Determine if any portion of the provided block resides on a child vdev | |
3562 | * with a dirty DTL and therefore needs to be resilvered. The function | |
e1cfd73f | 3563 | * assumes that at least one DTL is dirty which implies that full stripe |
3d6da72d IH |
3564 | * width blocks must be resilvered. |
3565 | */ | |
3566 | static boolean_t | |
b2255edc BB |
3567 | vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize, |
3568 | uint64_t phys_birth) | |
3d6da72d | 3569 | { |
b2255edc | 3570 | vdev_raidz_t *vdrz = vd->vdev_tsd; |
5caeef02 DB |
3571 | |
3572 | /* | |
3573 | * If we're in the middle of a RAIDZ expansion, this block may be in | |
3574 | * the old and/or new location. For simplicity, always resilver it. | |
3575 | */ | |
3576 | if (vdrz->vn_vre.vre_state == DSS_SCANNING) | |
3577 | return (B_TRUE); | |
3578 | ||
3d6da72d | 3579 | uint64_t dcols = vd->vdev_children; |
b2255edc | 3580 | uint64_t nparity = vdrz->vd_nparity; |
3d6da72d IH |
3581 | uint64_t ashift = vd->vdev_top->vdev_ashift; |
3582 | /* The starting RAIDZ (parent) vdev sector of the block. */ | |
b2255edc | 3583 | uint64_t b = DVA_GET_OFFSET(dva) >> ashift; |
3d6da72d IH |
3584 | /* The zio's size in units of the vdev's minimum sector size. */ |
3585 | uint64_t s = ((psize - 1) >> ashift) + 1; | |
3586 | /* The first column for this stripe. */ | |
3587 | uint64_t f = b % dcols; | |
3588 | ||
b2255edc BB |
3589 | /* Unreachable by sequential resilver. */ |
3590 | ASSERT3U(phys_birth, !=, TXG_UNKNOWN); | |
3591 | ||
3592 | if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) | |
3593 | return (B_FALSE); | |
3594 | ||
3d6da72d IH |
3595 | if (s + nparity >= dcols) |
3596 | return (B_TRUE); | |
3597 | ||
3598 | for (uint64_t c = 0; c < s + nparity; c++) { | |
3599 | uint64_t devidx = (f + c) % dcols; | |
3600 | vdev_t *cvd = vd->vdev_child[devidx]; | |
3601 | ||
3602 | /* | |
3603 | * dsl_scan_need_resilver() already checked vd with | |
3604 | * vdev_dtl_contains(). So here just check cvd with | |
3605 | * vdev_dtl_empty(), cheaper and a good approximation. | |
3606 | */ | |
3607 | if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) | |
3608 | return (B_TRUE); | |
3609 | } | |
3610 | ||
3611 | return (B_FALSE); | |
3612 | } | |
3613 | ||
619f0976 | 3614 | static void |
b2255edc BB |
3615 | vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs, |
3616 | range_seg64_t *physical_rs, range_seg64_t *remain_rs) | |
619f0976 | 3617 | { |
14e4e3cb AZ |
3618 | (void) remain_rs; |
3619 | ||
619f0976 GW |
3620 | vdev_t *raidvd = cvd->vdev_parent; |
3621 | ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); | |
3622 | ||
5caeef02 DB |
3623 | vdev_raidz_t *vdrz = raidvd->vdev_tsd; |
3624 | ||
3625 | if (vdrz->vn_vre.vre_state == DSS_SCANNING) { | |
3626 | /* | |
3627 | * We're in the middle of expansion, in which case the | |
3628 | * translation is in flux. Any answer we give may be wrong | |
3629 | * by the time we return, so it isn't safe for the caller to | |
3630 | * act on it. Therefore we say that this range isn't present | |
3631 | * on any children. The only consumers of this are "zpool | |
3632 | * initialize" and trimming, both of which are "best effort" | |
3633 | * anyway. | |
3634 | */ | |
3635 | physical_rs->rs_start = physical_rs->rs_end = 0; | |
3636 | remain_rs->rs_start = remain_rs->rs_end = 0; | |
3637 | return; | |
3638 | } | |
3639 | ||
3640 | uint64_t width = vdrz->vd_physical_width; | |
619f0976 GW |
3641 | uint64_t tgt_col = cvd->vdev_id; |
3642 | uint64_t ashift = raidvd->vdev_top->vdev_ashift; | |
3643 | ||
3644 | /* make sure the offsets are block-aligned */ | |
b2255edc BB |
3645 | ASSERT0(logical_rs->rs_start % (1 << ashift)); |
3646 | ASSERT0(logical_rs->rs_end % (1 << ashift)); | |
3647 | uint64_t b_start = logical_rs->rs_start >> ashift; | |
3648 | uint64_t b_end = logical_rs->rs_end >> ashift; | |
619f0976 GW |
3649 | |
3650 | uint64_t start_row = 0; | |
3651 | if (b_start > tgt_col) /* avoid underflow */ | |
3652 | start_row = ((b_start - tgt_col - 1) / width) + 1; | |
3653 | ||
3654 | uint64_t end_row = 0; | |
3655 | if (b_end > tgt_col) | |
3656 | end_row = ((b_end - tgt_col - 1) / width) + 1; | |
3657 | ||
b2255edc BB |
3658 | physical_rs->rs_start = start_row << ashift; |
3659 | physical_rs->rs_end = end_row << ashift; | |
619f0976 | 3660 | |
b2255edc BB |
3661 | ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start); |
3662 | ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=, | |
3663 | logical_rs->rs_end - logical_rs->rs_start); | |
3664 | } | |
3665 | ||
5caeef02 DB |
3666 | static void |
3667 | raidz_reflow_sync(void *arg, dmu_tx_t *tx) | |
b2255edc | 3668 | { |
5caeef02 DB |
3669 | spa_t *spa = arg; |
3670 | int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; | |
3671 | vdev_raidz_expand_t *vre = spa->spa_raidz_expand; | |
b2255edc | 3672 | |
5caeef02 DB |
3673 | /* |
3674 | * Ensure there are no i/os to the range that is being committed. | |
3675 | */ | |
3676 | uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock); | |
3677 | ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset); | |
b2255edc | 3678 | |
5caeef02 DB |
3679 | mutex_enter(&vre->vre_lock); |
3680 | uint64_t new_offset = | |
3681 | MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset); | |
3682 | /* | |
3683 | * We should not have committed anything that failed. | |
3684 | */ | |
3685 | VERIFY3U(vre->vre_failed_offset, >=, old_offset); | |
3686 | mutex_exit(&vre->vre_lock); | |
b2255edc | 3687 | |
5caeef02 DB |
3688 | zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, |
3689 | old_offset, new_offset - old_offset, | |
3690 | RL_WRITER); | |
b2255edc | 3691 | |
5caeef02 DB |
3692 | /* |
3693 | * Update the uberblock that will be written when this txg completes. | |
3694 | */ | |
3695 | RAIDZ_REFLOW_SET(&spa->spa_uberblock, | |
3696 | RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset); | |
3697 | vre->vre_offset_pertxg[txgoff] = 0; | |
3698 | zfs_rangelock_exit(lr); | |
3699 | ||
3700 | mutex_enter(&vre->vre_lock); | |
3701 | vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff]; | |
3702 | vre->vre_bytes_copied_pertxg[txgoff] = 0; | |
3703 | mutex_exit(&vre->vre_lock); | |
3704 | ||
3705 | vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); | |
3706 | VERIFY0(zap_update(spa->spa_meta_objset, | |
3707 | vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, | |
3708 | sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx)); | |
3709 | } | |
b2255edc | 3710 | |
5caeef02 DB |
3711 | static void |
3712 | raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx) | |
3713 | { | |
3714 | spa_t *spa = arg; | |
3715 | vdev_raidz_expand_t *vre = spa->spa_raidz_expand; | |
3716 | vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); | |
3717 | vdev_raidz_t *vdrz = raidvd->vdev_tsd; | |
b2255edc | 3718 | |
5caeef02 DB |
3719 | for (int i = 0; i < TXG_SIZE; i++) |
3720 | VERIFY0(vre->vre_offset_pertxg[i]); | |
b2255edc | 3721 | |
5caeef02 DB |
3722 | reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); |
3723 | re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES; | |
3724 | re->re_logical_width = vdrz->vd_physical_width; | |
3725 | mutex_enter(&vdrz->vd_expand_lock); | |
3726 | avl_add(&vdrz->vd_expand_txgs, re); | |
3727 | mutex_exit(&vdrz->vd_expand_lock); | |
3728 | ||
3729 | vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); | |
3730 | ||
3731 | /* | |
3732 | * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS | |
3733 | * will get written (based on vd_expand_txgs). | |
3734 | */ | |
3735 | vdev_config_dirty(vd); | |
3736 | ||
3737 | /* | |
3738 | * Before we change vre_state, the on-disk state must reflect that we | |
3739 | * have completed all copying, so that vdev_raidz_io_start() can use | |
3740 | * vre_state to determine if the reflow is in progress. See also the | |
3741 | * end of spa_raidz_expand_thread(). | |
3742 | */ | |
3743 | VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, | |
3744 | raidvd->vdev_ms_count << raidvd->vdev_ms_shift); | |
3745 | ||
3746 | vre->vre_end_time = gethrestime_sec(); | |
3747 | vre->vre_state = DSS_FINISHED; | |
3748 | ||
3749 | uint64_t state = vre->vre_state; | |
3750 | VERIFY0(zap_update(spa->spa_meta_objset, | |
3751 | vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, | |
3752 | sizeof (state), 1, &state, tx)); | |
3753 | ||
3754 | uint64_t end_time = vre->vre_end_time; | |
3755 | VERIFY0(zap_update(spa->spa_meta_objset, | |
3756 | vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, | |
3757 | sizeof (end_time), 1, &end_time, tx)); | |
3758 | ||
3759 | spa->spa_uberblock.ub_raidz_reflow_info = 0; | |
3760 | ||
3761 | spa_history_log_internal(spa, "raidz vdev expansion completed", tx, | |
3762 | "%s vdev %llu new width %llu", spa_name(spa), | |
3763 | (unsigned long long)vd->vdev_id, | |
3764 | (unsigned long long)vd->vdev_children); | |
3765 | ||
3766 | spa->spa_raidz_expand = NULL; | |
3767 | raidvd->vdev_rz_expanding = B_FALSE; | |
3768 | ||
3769 | spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); | |
3770 | spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); | |
3771 | spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); | |
3772 | ||
3773 | spa_notify_waiters(spa); | |
3774 | ||
3775 | /* | |
3776 | * While we're in syncing context take the opportunity to | |
3777 | * setup a scrub. All the data has been sucessfully copied | |
3778 | * but we have not validated any checksums. | |
3779 | */ | |
3780 | pool_scan_func_t func = POOL_SCAN_SCRUB; | |
3781 | if (zfs_scrub_after_expand && dsl_scan_setup_check(&func, tx) == 0) | |
3782 | dsl_scan_setup_sync(&func, tx); | |
b2255edc BB |
3783 | } |
3784 | ||
5caeef02 DB |
3785 | /* |
3786 | * Struct for one copy zio. | |
3787 | */ | |
3788 | typedef struct raidz_reflow_arg { | |
3789 | vdev_raidz_expand_t *rra_vre; | |
3790 | zfs_locked_range_t *rra_lr; | |
3791 | uint64_t rra_txg; | |
3792 | } raidz_reflow_arg_t; | |
3793 | ||
3794 | /* | |
3795 | * The write of the new location is done. | |
3796 | */ | |
b2255edc | 3797 | static void |
5caeef02 | 3798 | raidz_reflow_write_done(zio_t *zio) |
b2255edc | 3799 | { |
5caeef02 DB |
3800 | raidz_reflow_arg_t *rra = zio->io_private; |
3801 | vdev_raidz_expand_t *vre = rra->rra_vre; | |
3802 | ||
3803 | abd_free(zio->io_abd); | |
3804 | ||
3805 | mutex_enter(&vre->vre_lock); | |
3806 | if (zio->io_error != 0) { | |
3807 | /* Force a reflow pause on errors */ | |
3808 | vre->vre_failed_offset = | |
3809 | MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); | |
3810 | } | |
3811 | ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size); | |
3812 | vre->vre_outstanding_bytes -= zio->io_size; | |
3813 | if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length < | |
3814 | vre->vre_failed_offset) { | |
3815 | vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] += | |
3816 | zio->io_size; | |
3817 | } | |
3818 | cv_signal(&vre->vre_cv); | |
3819 | mutex_exit(&vre->vre_lock); | |
3820 | ||
3821 | zfs_rangelock_exit(rra->rra_lr); | |
3822 | ||
3823 | kmem_free(rra, sizeof (*rra)); | |
3824 | spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); | |
b2255edc BB |
3825 | } |
3826 | ||
3827 | /* | |
5caeef02 DB |
3828 | * The read of the old location is done. The parent zio is the write to |
3829 | * the new location. Allow it to start. | |
b2255edc BB |
3830 | */ |
3831 | static void | |
5caeef02 | 3832 | raidz_reflow_read_done(zio_t *zio) |
b2255edc | 3833 | { |
5caeef02 DB |
3834 | raidz_reflow_arg_t *rra = zio->io_private; |
3835 | vdev_raidz_expand_t *vre = rra->rra_vre; | |
b2255edc BB |
3836 | |
3837 | /* | |
5caeef02 DB |
3838 | * If the read failed, or if it was done on a vdev that is not fully |
3839 | * healthy (e.g. a child that has a resilver in progress), we may not | |
3840 | * have the correct data. Note that it's OK if the write proceeds. | |
3841 | * It may write garbage but the location is otherwise unused and we | |
3842 | * will retry later due to vre_failed_offset. | |
b2255edc | 3843 | */ |
5caeef02 DB |
3844 | if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) { |
3845 | zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu " | |
3846 | "err=%u partial_dtl_empty=%u missing_dtl_empty=%u", | |
3847 | (long long)rra->rra_lr->lr_offset, | |
3848 | (long long)rra->rra_lr->lr_length, | |
3849 | (long long)rra->rra_txg, | |
3850 | zio->io_error, | |
3851 | vdev_dtl_empty(zio->io_vd, DTL_PARTIAL), | |
3852 | vdev_dtl_empty(zio->io_vd, DTL_MISSING)); | |
3853 | mutex_enter(&vre->vre_lock); | |
3854 | /* Force a reflow pause on errors */ | |
3855 | vre->vre_failed_offset = | |
3856 | MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); | |
3857 | mutex_exit(&vre->vre_lock); | |
3858 | } | |
b2255edc | 3859 | |
5caeef02 | 3860 | zio_nowait(zio_unique_parent(zio)); |
b2255edc BB |
3861 | } |
3862 | ||
5caeef02 DB |
3863 | static void |
3864 | raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset, | |
3865 | dmu_tx_t *tx) | |
b2255edc | 3866 | { |
5caeef02 DB |
3867 | int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; |
3868 | spa_t *spa = dmu_tx_pool(tx)->dp_spa; | |
b2255edc | 3869 | |
5caeef02 DB |
3870 | if (offset == 0) |
3871 | return; | |
3872 | ||
3873 | mutex_enter(&vre->vre_lock); | |
3874 | ASSERT3U(vre->vre_offset, <=, offset); | |
3875 | vre->vre_offset = offset; | |
3876 | mutex_exit(&vre->vre_lock); | |
3877 | ||
3878 | if (vre->vre_offset_pertxg[txgoff] == 0) { | |
3879 | dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync, | |
3880 | spa, tx); | |
3881 | } | |
3882 | vre->vre_offset_pertxg[txgoff] = offset; | |
619f0976 GW |
3883 | } |
3884 | ||
5caeef02 DB |
3885 | static boolean_t |
3886 | vdev_raidz_expand_child_replacing(vdev_t *raidz_vd) | |
3887 | { | |
3888 | for (int i = 0; i < raidz_vd->vdev_children; i++) { | |
3889 | /* Quick check if a child is being replaced */ | |
3890 | if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf) | |
3891 | return (B_TRUE); | |
3892 | } | |
3893 | return (B_FALSE); | |
3894 | } | |
3895 | ||
3896 | static boolean_t | |
3897 | raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt, | |
3898 | dmu_tx_t *tx) | |
3899 | { | |
3900 | spa_t *spa = vd->vdev_spa; | |
3901 | int ashift = vd->vdev_top->vdev_ashift; | |
3902 | uint64_t offset, size; | |
3903 | ||
3904 | if (!range_tree_find_in(rt, 0, vd->vdev_top->vdev_asize, | |
3905 | &offset, &size)) { | |
3906 | return (B_FALSE); | |
3907 | } | |
3908 | ASSERT(IS_P2ALIGNED(offset, 1 << ashift)); | |
3909 | ASSERT3U(size, >=, 1 << ashift); | |
3910 | uint64_t length = 1 << ashift; | |
3911 | int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; | |
3912 | ||
3913 | uint64_t blkid = offset >> ashift; | |
3914 | ||
3915 | int old_children = vd->vdev_children - 1; | |
3916 | ||
3917 | /* | |
3918 | * We can only progress to the point that writes will not overlap | |
3919 | * with blocks whose progress has not yet been recorded on disk. | |
3920 | * Since partially-copied rows are still read from the old location, | |
3921 | * we need to stop one row before the sector-wise overlap, to prevent | |
3922 | * row-wise overlap. | |
3923 | * | |
3924 | * Note that even if we are skipping over a large unallocated region, | |
3925 | * we can't move the on-disk progress to `offset`, because concurrent | |
3926 | * writes/allocations could still use the currently-unallocated | |
3927 | * region. | |
3928 | */ | |
3929 | uint64_t ubsync_blkid = | |
3930 | RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift; | |
3931 | uint64_t next_overwrite_blkid = ubsync_blkid + | |
3932 | ubsync_blkid / old_children - old_children; | |
3933 | VERIFY3U(next_overwrite_blkid, >, ubsync_blkid); | |
3934 | ||
3935 | if (blkid >= next_overwrite_blkid) { | |
3936 | raidz_reflow_record_progress(vre, | |
3937 | next_overwrite_blkid << ashift, tx); | |
3938 | return (B_TRUE); | |
3939 | } | |
3940 | ||
3941 | range_tree_remove(rt, offset, length); | |
3942 | ||
3943 | raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra), KM_SLEEP); | |
3944 | rra->rra_vre = vre; | |
3945 | rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock, | |
3946 | offset, length, RL_WRITER); | |
3947 | rra->rra_txg = dmu_tx_get_txg(tx); | |
3948 | ||
3949 | raidz_reflow_record_progress(vre, offset + length, tx); | |
3950 | ||
3951 | mutex_enter(&vre->vre_lock); | |
3952 | vre->vre_outstanding_bytes += length; | |
3953 | mutex_exit(&vre->vre_lock); | |
3954 | ||
3955 | /* | |
3956 | * SCL_STATE will be released when the read and write are done, | |
3957 | * by raidz_reflow_write_done(). | |
3958 | */ | |
3959 | spa_config_enter(spa, SCL_STATE, spa, RW_READER); | |
3960 | ||
3961 | /* check if a replacing vdev was added, if so treat it as an error */ | |
3962 | if (vdev_raidz_expand_child_replacing(vd)) { | |
3963 | zfs_dbgmsg("replacing vdev encountered, reflow paused at " | |
3964 | "offset=%llu txg=%llu", | |
3965 | (long long)rra->rra_lr->lr_offset, | |
3966 | (long long)rra->rra_txg); | |
3967 | ||
3968 | mutex_enter(&vre->vre_lock); | |
3969 | vre->vre_failed_offset = | |
3970 | MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset); | |
3971 | cv_signal(&vre->vre_cv); | |
3972 | mutex_exit(&vre->vre_lock); | |
3973 | ||
3974 | /* drop everything we acquired */ | |
3975 | zfs_rangelock_exit(rra->rra_lr); | |
3976 | kmem_free(rra, sizeof (*rra)); | |
3977 | spa_config_exit(spa, SCL_STATE, spa); | |
3978 | return (B_TRUE); | |
3979 | } | |
3980 | ||
3981 | zio_t *pio = spa->spa_txg_zio[txgoff]; | |
3982 | abd_t *abd = abd_alloc_for_io(length, B_FALSE); | |
3983 | zio_t *write_zio = zio_vdev_child_io(pio, NULL, | |
3984 | vd->vdev_child[blkid % vd->vdev_children], | |
3985 | (blkid / vd->vdev_children) << ashift, | |
3986 | abd, length, | |
3987 | ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, | |
3988 | ZIO_FLAG_CANFAIL, | |
3989 | raidz_reflow_write_done, rra); | |
3990 | ||
3991 | zio_nowait(zio_vdev_child_io(write_zio, NULL, | |
3992 | vd->vdev_child[blkid % old_children], | |
3993 | (blkid / old_children) << ashift, | |
3994 | abd, length, | |
3995 | ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, | |
3996 | ZIO_FLAG_CANFAIL, | |
3997 | raidz_reflow_read_done, rra)); | |
3998 | ||
3999 | return (B_FALSE); | |
4000 | } | |
4001 | ||
4002 | /* | |
4003 | * For testing (ztest specific) | |
4004 | */ | |
4005 | static void | |
4006 | raidz_expand_pause(uint_t pause_point) | |
4007 | { | |
4008 | while (raidz_expand_pause_point != 0 && | |
4009 | raidz_expand_pause_point <= pause_point) | |
4010 | delay(hz); | |
4011 | } | |
4012 | ||
4013 | static void | |
4014 | raidz_scratch_child_done(zio_t *zio) | |
4015 | { | |
4016 | zio_t *pio = zio->io_private; | |
4017 | ||
4018 | mutex_enter(&pio->io_lock); | |
4019 | pio->io_error = zio_worst_error(pio->io_error, zio->io_error); | |
4020 | mutex_exit(&pio->io_lock); | |
4021 | } | |
4022 | ||
4023 | /* | |
4024 | * Reflow the beginning portion of the vdev into an intermediate scratch area | |
4025 | * in memory and on disk. This operation must be persisted on disk before we | |
4026 | * proceed to overwrite the beginning portion with the reflowed data. | |
4027 | * | |
4028 | * This multi-step task can fail to complete if disk errors are encountered | |
4029 | * and we can return here after a pause (waiting for disk to become healthy). | |
4030 | */ | |
4031 | static void | |
4032 | raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx) | |
4033 | { | |
4034 | vdev_raidz_expand_t *vre = arg; | |
4035 | spa_t *spa = dmu_tx_pool(tx)->dp_spa; | |
4036 | zio_t *pio; | |
4037 | int error; | |
4038 | ||
4039 | spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); | |
4040 | vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); | |
4041 | int ashift = raidvd->vdev_ashift; | |
4042 | uint64_t write_size = P2ALIGN(VDEV_BOOT_SIZE, 1 << ashift); | |
4043 | uint64_t logical_size = write_size * raidvd->vdev_children; | |
4044 | uint64_t read_size = | |
4045 | P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)), | |
4046 | 1 << ashift); | |
4047 | ||
4048 | /* | |
4049 | * The scratch space must be large enough to get us to the point | |
4050 | * that one row does not overlap itself when moved. This is checked | |
4051 | * by vdev_raidz_attach_check(). | |
4052 | */ | |
4053 | VERIFY3U(write_size, >=, raidvd->vdev_children << ashift); | |
4054 | VERIFY3U(write_size, <=, VDEV_BOOT_SIZE); | |
4055 | VERIFY3U(write_size, <=, read_size); | |
4056 | ||
4057 | zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock, | |
4058 | 0, logical_size, RL_WRITER); | |
4059 | ||
4060 | abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), | |
4061 | KM_SLEEP); | |
4062 | for (int i = 0; i < raidvd->vdev_children; i++) { | |
4063 | abds[i] = abd_alloc_linear(read_size, B_FALSE); | |
4064 | } | |
4065 | ||
4066 | raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1); | |
4067 | ||
4068 | /* | |
4069 | * If we have already written the scratch area then we must read from | |
4070 | * there, since new writes were redirected there while we were paused | |
4071 | * or the original location may have been partially overwritten with | |
4072 | * reflowed data. | |
4073 | */ | |
4074 | if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) { | |
4075 | VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size); | |
4076 | /* | |
4077 | * Read from scratch space. | |
4078 | */ | |
4079 | pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); | |
4080 | for (int i = 0; i < raidvd->vdev_children; i++) { | |
4081 | /* | |
4082 | * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE | |
4083 | * to the offset to calculate the physical offset to | |
4084 | * write to. Passing in a negative offset makes us | |
4085 | * access the scratch area. | |
4086 | */ | |
4087 | zio_nowait(zio_vdev_child_io(pio, NULL, | |
4088 | raidvd->vdev_child[i], | |
4089 | VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], | |
4090 | write_size, ZIO_TYPE_READ, ZIO_PRIORITY_ASYNC_READ, | |
4091 | ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); | |
4092 | } | |
4093 | error = zio_wait(pio); | |
4094 | if (error != 0) { | |
4095 | zfs_dbgmsg("reflow: error %d reading scratch location", | |
4096 | error); | |
4097 | goto io_error_exit; | |
4098 | } | |
4099 | goto overwrite; | |
4100 | } | |
4101 | ||
4102 | /* | |
4103 | * Read from original location. | |
4104 | */ | |
4105 | pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); | |
4106 | for (int i = 0; i < raidvd->vdev_children - 1; i++) { | |
4107 | ASSERT0(vdev_is_dead(raidvd->vdev_child[i])); | |
4108 | zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], | |
4109 | 0, abds[i], read_size, ZIO_TYPE_READ, | |
4110 | ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, | |
4111 | raidz_scratch_child_done, pio)); | |
4112 | } | |
4113 | error = zio_wait(pio); | |
4114 | if (error != 0) { | |
4115 | zfs_dbgmsg("reflow: error %d reading original location", error); | |
4116 | io_error_exit: | |
4117 | for (int i = 0; i < raidvd->vdev_children; i++) | |
4118 | abd_free(abds[i]); | |
4119 | kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); | |
4120 | zfs_rangelock_exit(lr); | |
4121 | spa_config_exit(spa, SCL_STATE, FTAG); | |
4122 | return; | |
4123 | } | |
4124 | ||
4125 | raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2); | |
4126 | ||
4127 | /* | |
4128 | * Reflow in memory. | |
4129 | */ | |
4130 | uint64_t logical_sectors = logical_size >> ashift; | |
4131 | for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) { | |
4132 | int oldchild = i % (raidvd->vdev_children - 1); | |
4133 | uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift; | |
4134 | ||
4135 | int newchild = i % raidvd->vdev_children; | |
4136 | uint64_t newoff = (i / raidvd->vdev_children) << ashift; | |
4137 | ||
4138 | /* a single sector should not be copying over itself */ | |
4139 | ASSERT(!(newchild == oldchild && newoff == oldoff)); | |
4140 | ||
4141 | abd_copy_off(abds[newchild], abds[oldchild], | |
4142 | newoff, oldoff, 1 << ashift); | |
4143 | } | |
4144 | ||
4145 | /* | |
4146 | * Verify that we filled in everything we intended to (write_size on | |
4147 | * each child). | |
4148 | */ | |
4149 | VERIFY0(logical_sectors % raidvd->vdev_children); | |
4150 | VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==, | |
4151 | write_size); | |
4152 | ||
4153 | /* | |
4154 | * Write to scratch location (boot area). | |
4155 | */ | |
4156 | pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); | |
4157 | for (int i = 0; i < raidvd->vdev_children; i++) { | |
4158 | /* | |
4159 | * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to | |
4160 | * the offset to calculate the physical offset to write to. | |
4161 | * Passing in a negative offset lets us access the boot area. | |
4162 | */ | |
4163 | zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], | |
4164 | VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], | |
4165 | write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, | |
4166 | ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio)); | |
4167 | } | |
4168 | error = zio_wait(pio); | |
4169 | if (error != 0) { | |
4170 | zfs_dbgmsg("reflow: error %d writing scratch location", error); | |
4171 | goto io_error_exit; | |
4172 | } | |
4173 | pio = zio_root(spa, NULL, NULL, 0); | |
4174 | zio_flush(pio, raidvd); | |
4175 | zio_wait(pio); | |
4176 | ||
4177 | zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area", | |
4178 | (long long)logical_size); | |
4179 | ||
4180 | raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3); | |
4181 | ||
4182 | /* | |
4183 | * Update uberblock to indicate that scratch space is valid. This is | |
4184 | * needed because after this point, the real location may be | |
4185 | * overwritten. If we crash, we need to get the data from the | |
4186 | * scratch space, rather than the real location. | |
4187 | * | |
4188 | * Note: ub_timestamp is bumped so that vdev_uberblock_compare() | |
4189 | * will prefer this uberblock. | |
4190 | */ | |
4191 | RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size); | |
4192 | spa->spa_ubsync.ub_timestamp++; | |
4193 | ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, | |
4194 | &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); | |
4195 | if (spa_multihost(spa)) | |
4196 | mmp_update_uberblock(spa, &spa->spa_ubsync); | |
4197 | ||
4198 | zfs_dbgmsg("reflow: uberblock updated " | |
4199 | "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)", | |
4200 | (long long)spa->spa_ubsync.ub_txg, | |
4201 | (long long)logical_size, | |
4202 | (long long)spa->spa_ubsync.ub_timestamp); | |
4203 | ||
4204 | raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID); | |
4205 | ||
4206 | /* | |
4207 | * Overwrite with reflow'ed data. | |
4208 | */ | |
4209 | overwrite: | |
4210 | pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); | |
4211 | for (int i = 0; i < raidvd->vdev_children; i++) { | |
4212 | zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], | |
4213 | 0, abds[i], write_size, ZIO_TYPE_WRITE, | |
4214 | ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, | |
4215 | raidz_scratch_child_done, pio)); | |
4216 | } | |
4217 | error = zio_wait(pio); | |
4218 | if (error != 0) { | |
4219 | /* | |
4220 | * When we exit early here and drop the range lock, new | |
4221 | * writes will go into the scratch area so we'll need to | |
4222 | * read from there when we return after pausing. | |
4223 | */ | |
4224 | zfs_dbgmsg("reflow: error %d writing real location", error); | |
4225 | /* | |
4226 | * Update the uberblock that is written when this txg completes. | |
4227 | */ | |
4228 | RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID, | |
4229 | logical_size); | |
4230 | goto io_error_exit; | |
4231 | } | |
4232 | pio = zio_root(spa, NULL, NULL, 0); | |
4233 | zio_flush(pio, raidvd); | |
4234 | zio_wait(pio); | |
4235 | ||
4236 | zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location", | |
4237 | (long long)logical_size); | |
4238 | for (int i = 0; i < raidvd->vdev_children; i++) | |
4239 | abd_free(abds[i]); | |
4240 | kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); | |
4241 | ||
4242 | raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED); | |
4243 | ||
4244 | /* | |
4245 | * Update uberblock to indicate that the initial part has been | |
4246 | * reflow'ed. This is needed because after this point (when we exit | |
4247 | * the rangelock), we allow regular writes to this region, which will | |
4248 | * be written to the new location only (because reflow_offset_next == | |
4249 | * reflow_offset_synced). If we crashed and re-copied from the | |
4250 | * scratch space, we would lose the regular writes. | |
4251 | */ | |
4252 | RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED, | |
4253 | logical_size); | |
4254 | spa->spa_ubsync.ub_timestamp++; | |
4255 | ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, | |
4256 | &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); | |
4257 | if (spa_multihost(spa)) | |
4258 | mmp_update_uberblock(spa, &spa->spa_ubsync); | |
4259 | ||
4260 | zfs_dbgmsg("reflow: uberblock updated " | |
4261 | "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", | |
4262 | (long long)spa->spa_ubsync.ub_txg, | |
4263 | (long long)logical_size, | |
4264 | (long long)spa->spa_ubsync.ub_timestamp); | |
4265 | ||
4266 | raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1); | |
4267 | ||
4268 | /* | |
4269 | * Update progress. | |
4270 | */ | |
4271 | vre->vre_offset = logical_size; | |
4272 | zfs_rangelock_exit(lr); | |
4273 | spa_config_exit(spa, SCL_STATE, FTAG); | |
4274 | ||
4275 | int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; | |
4276 | vre->vre_offset_pertxg[txgoff] = vre->vre_offset; | |
4277 | vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; | |
4278 | /* | |
4279 | * Note - raidz_reflow_sync() will update the uberblock state to | |
4280 | * RRSS_SCRATCH_INVALID_SYNCED_REFLOW | |
4281 | */ | |
4282 | raidz_reflow_sync(spa, tx); | |
4283 | ||
4284 | raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2); | |
4285 | } | |
4286 | ||
4287 | /* | |
4288 | * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work | |
4289 | * here. No other i/o can be in progress, so we don't need the vre_rangelock. | |
4290 | */ | |
4291 | void | |
4292 | vdev_raidz_reflow_copy_scratch(spa_t *spa) | |
4293 | { | |
4294 | vdev_raidz_expand_t *vre = spa->spa_raidz_expand; | |
4295 | uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock); | |
4296 | ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID); | |
4297 | ||
4298 | spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); | |
4299 | vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); | |
4300 | ASSERT0(logical_size % raidvd->vdev_children); | |
4301 | uint64_t write_size = logical_size / raidvd->vdev_children; | |
4302 | ||
4303 | zio_t *pio; | |
4304 | ||
4305 | /* | |
4306 | * Read from scratch space. | |
4307 | */ | |
4308 | abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *), | |
4309 | KM_SLEEP); | |
4310 | for (int i = 0; i < raidvd->vdev_children; i++) { | |
4311 | abds[i] = abd_alloc_linear(write_size, B_FALSE); | |
4312 | } | |
4313 | ||
4314 | pio = zio_root(spa, NULL, NULL, 0); | |
4315 | for (int i = 0; i < raidvd->vdev_children; i++) { | |
4316 | /* | |
4317 | * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to | |
4318 | * the offset to calculate the physical offset to write to. | |
4319 | * Passing in a negative offset lets us access the boot area. | |
4320 | */ | |
4321 | zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], | |
4322 | VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i], | |
4323 | write_size, ZIO_TYPE_READ, | |
4324 | ZIO_PRIORITY_ASYNC_READ, 0, | |
4325 | raidz_scratch_child_done, pio)); | |
4326 | } | |
4327 | zio_wait(pio); | |
4328 | ||
4329 | /* | |
4330 | * Overwrite real location with reflow'ed data. | |
4331 | */ | |
4332 | pio = zio_root(spa, NULL, NULL, 0); | |
4333 | for (int i = 0; i < raidvd->vdev_children; i++) { | |
4334 | zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i], | |
4335 | 0, abds[i], write_size, ZIO_TYPE_WRITE, | |
4336 | ZIO_PRIORITY_ASYNC_WRITE, 0, | |
4337 | raidz_scratch_child_done, pio)); | |
4338 | } | |
4339 | zio_wait(pio); | |
4340 | pio = zio_root(spa, NULL, NULL, 0); | |
4341 | zio_flush(pio, raidvd); | |
4342 | zio_wait(pio); | |
4343 | ||
4344 | zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) " | |
4345 | "to real location", (long long)logical_size); | |
4346 | ||
4347 | for (int i = 0; i < raidvd->vdev_children; i++) | |
4348 | abd_free(abds[i]); | |
4349 | kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *)); | |
4350 | ||
4351 | /* | |
4352 | * Update uberblock. | |
4353 | */ | |
4354 | RAIDZ_REFLOW_SET(&spa->spa_ubsync, | |
4355 | RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size); | |
4356 | spa->spa_ubsync.ub_timestamp++; | |
4357 | VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1, | |
4358 | &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER)); | |
4359 | if (spa_multihost(spa)) | |
4360 | mmp_update_uberblock(spa, &spa->spa_ubsync); | |
4361 | ||
4362 | zfs_dbgmsg("reflow recovery: uberblock updated " | |
4363 | "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)", | |
4364 | (long long)spa->spa_ubsync.ub_txg, | |
4365 | (long long)logical_size, | |
4366 | (long long)spa->spa_ubsync.ub_timestamp); | |
4367 | ||
4368 | dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, | |
4369 | spa_first_txg(spa)); | |
4370 | int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; | |
4371 | vre->vre_offset = logical_size; | |
4372 | vre->vre_offset_pertxg[txgoff] = vre->vre_offset; | |
4373 | vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied; | |
4374 | /* | |
4375 | * Note that raidz_reflow_sync() will update the uberblock once more | |
4376 | */ | |
4377 | raidz_reflow_sync(spa, tx); | |
4378 | ||
4379 | dmu_tx_commit(tx); | |
4380 | ||
4381 | spa_config_exit(spa, SCL_STATE, FTAG); | |
4382 | } | |
4383 | ||
4384 | static boolean_t | |
4385 | spa_raidz_expand_thread_check(void *arg, zthr_t *zthr) | |
4386 | { | |
4387 | (void) zthr; | |
4388 | spa_t *spa = arg; | |
4389 | ||
4390 | return (spa->spa_raidz_expand != NULL && | |
4391 | !spa->spa_raidz_expand->vre_waiting_for_resilver); | |
4392 | } | |
4393 | ||
4394 | /* | |
4395 | * RAIDZ expansion background thread | |
4396 | * | |
4397 | * Can be called multiple times if the reflow is paused | |
4398 | */ | |
4399 | static void | |
4400 | spa_raidz_expand_thread(void *arg, zthr_t *zthr) | |
4401 | { | |
4402 | spa_t *spa = arg; | |
4403 | vdev_raidz_expand_t *vre = spa->spa_raidz_expand; | |
4404 | ||
4405 | if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) | |
4406 | vre->vre_offset = 0; | |
4407 | else | |
4408 | vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync); | |
4409 | ||
4410 | /* Reflow the begining portion using the scratch area */ | |
4411 | if (vre->vre_offset == 0) { | |
4412 | VERIFY0(dsl_sync_task(spa_name(spa), | |
4413 | NULL, raidz_reflow_scratch_sync, | |
4414 | vre, 0, ZFS_SPACE_CHECK_NONE)); | |
4415 | ||
4416 | /* if we encountered errors then pause */ | |
4417 | if (vre->vre_offset == 0) { | |
4418 | mutex_enter(&vre->vre_lock); | |
4419 | vre->vre_waiting_for_resilver = B_TRUE; | |
4420 | mutex_exit(&vre->vre_lock); | |
4421 | return; | |
4422 | } | |
4423 | } | |
4424 | ||
4425 | spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); | |
4426 | vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); | |
4427 | ||
4428 | uint64_t guid = raidvd->vdev_guid; | |
4429 | ||
4430 | /* Iterate over all the remaining metaslabs */ | |
4431 | for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift; | |
4432 | i < raidvd->vdev_ms_count && | |
4433 | !zthr_iscancelled(zthr) && | |
4434 | vre->vre_failed_offset == UINT64_MAX; i++) { | |
4435 | metaslab_t *msp = raidvd->vdev_ms[i]; | |
4436 | ||
4437 | metaslab_disable(msp); | |
4438 | mutex_enter(&msp->ms_lock); | |
4439 | ||
4440 | /* | |
4441 | * The metaslab may be newly created (for the expanded | |
4442 | * space), in which case its trees won't exist yet, | |
4443 | * so we need to bail out early. | |
4444 | */ | |
4445 | if (msp->ms_new) { | |
4446 | mutex_exit(&msp->ms_lock); | |
4447 | metaslab_enable(msp, B_FALSE, B_FALSE); | |
4448 | continue; | |
4449 | } | |
4450 | ||
4451 | VERIFY0(metaslab_load(msp)); | |
4452 | ||
4453 | /* | |
4454 | * We want to copy everything except the free (allocatable) | |
4455 | * space. Note that there may be a little bit more free | |
4456 | * space (e.g. in ms_defer), and it's fine to copy that too. | |
4457 | */ | |
4458 | range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64, | |
4459 | NULL, 0, 0); | |
4460 | range_tree_add(rt, msp->ms_start, msp->ms_size); | |
4461 | range_tree_walk(msp->ms_allocatable, range_tree_remove, rt); | |
4462 | mutex_exit(&msp->ms_lock); | |
4463 | ||
4464 | /* | |
4465 | * Force the last sector of each metaslab to be copied. This | |
4466 | * ensures that we advance the on-disk progress to the end of | |
4467 | * this metaslab while the metaslab is disabled. Otherwise, we | |
4468 | * could move past this metaslab without advancing the on-disk | |
4469 | * progress, and then an allocation to this metaslab would not | |
4470 | * be copied. | |
4471 | */ | |
4472 | int sectorsz = 1 << raidvd->vdev_ashift; | |
4473 | uint64_t ms_last_offset = msp->ms_start + | |
4474 | msp->ms_size - sectorsz; | |
4475 | if (!range_tree_contains(rt, ms_last_offset, sectorsz)) { | |
4476 | range_tree_add(rt, ms_last_offset, sectorsz); | |
4477 | } | |
4478 | ||
4479 | /* | |
4480 | * When we are resuming from a paused expansion (i.e. | |
4481 | * when importing a pool with a expansion in progress), | |
4482 | * discard any state that we have already processed. | |
4483 | */ | |
4484 | range_tree_clear(rt, 0, vre->vre_offset); | |
4485 | ||
4486 | while (!zthr_iscancelled(zthr) && | |
4487 | !range_tree_is_empty(rt) && | |
4488 | vre->vre_failed_offset == UINT64_MAX) { | |
4489 | ||
4490 | /* | |
4491 | * We need to periodically drop the config lock so that | |
4492 | * writers can get in. Additionally, we can't wait | |
4493 | * for a txg to sync while holding a config lock | |
4494 | * (since a waiting writer could cause a 3-way deadlock | |
4495 | * with the sync thread, which also gets a config | |
4496 | * lock for reader). So we can't hold the config lock | |
4497 | * while calling dmu_tx_assign(). | |
4498 | */ | |
4499 | spa_config_exit(spa, SCL_CONFIG, FTAG); | |
4500 | ||
4501 | /* | |
4502 | * If requested, pause the reflow when the amount | |
4503 | * specified by raidz_expand_max_reflow_bytes is reached | |
4504 | * | |
4505 | * This pause is only used during testing or debugging. | |
4506 | */ | |
4507 | while (raidz_expand_max_reflow_bytes != 0 && | |
4508 | raidz_expand_max_reflow_bytes <= | |
4509 | vre->vre_bytes_copied && !zthr_iscancelled(zthr)) { | |
4510 | delay(hz); | |
4511 | } | |
4512 | ||
4513 | mutex_enter(&vre->vre_lock); | |
4514 | while (vre->vre_outstanding_bytes > | |
4515 | raidz_expand_max_copy_bytes) { | |
4516 | cv_wait(&vre->vre_cv, &vre->vre_lock); | |
4517 | } | |
4518 | mutex_exit(&vre->vre_lock); | |
4519 | ||
4520 | dmu_tx_t *tx = | |
4521 | dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); | |
4522 | ||
4523 | VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); | |
4524 | uint64_t txg = dmu_tx_get_txg(tx); | |
4525 | ||
4526 | /* | |
4527 | * Reacquire the vdev_config lock. Theoretically, the | |
4528 | * vdev_t that we're expanding may have changed. | |
4529 | */ | |
4530 | spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); | |
4531 | raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); | |
4532 | ||
4533 | boolean_t needsync = | |
4534 | raidz_reflow_impl(raidvd, vre, rt, tx); | |
4535 | ||
4536 | dmu_tx_commit(tx); | |
4537 | ||
4538 | if (needsync) { | |
4539 | spa_config_exit(spa, SCL_CONFIG, FTAG); | |
4540 | txg_wait_synced(spa->spa_dsl_pool, txg); | |
4541 | spa_config_enter(spa, SCL_CONFIG, FTAG, | |
4542 | RW_READER); | |
4543 | } | |
4544 | } | |
4545 | ||
4546 | spa_config_exit(spa, SCL_CONFIG, FTAG); | |
4547 | ||
4548 | metaslab_enable(msp, B_FALSE, B_FALSE); | |
4549 | range_tree_vacate(rt, NULL, NULL); | |
4550 | range_tree_destroy(rt); | |
4551 | ||
4552 | spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); | |
4553 | raidvd = vdev_lookup_top(spa, vre->vre_vdev_id); | |
4554 | } | |
4555 | ||
4556 | spa_config_exit(spa, SCL_CONFIG, FTAG); | |
4557 | ||
4558 | /* | |
4559 | * The txg_wait_synced() here ensures that all reflow zio's have | |
4560 | * completed, and vre_failed_offset has been set if necessary. It | |
4561 | * also ensures that the progress of the last raidz_reflow_sync() is | |
4562 | * written to disk before raidz_reflow_complete_sync() changes the | |
4563 | * in-memory vre_state. vdev_raidz_io_start() uses vre_state to | |
4564 | * determine if a reflow is in progress, in which case we may need to | |
4565 | * write to both old and new locations. Therefore we can only change | |
4566 | * vre_state once this is not necessary, which is once the on-disk | |
4567 | * progress (in spa_ubsync) has been set past any possible writes (to | |
4568 | * the end of the last metaslab). | |
4569 | */ | |
4570 | txg_wait_synced(spa->spa_dsl_pool, 0); | |
4571 | ||
4572 | if (!zthr_iscancelled(zthr) && | |
4573 | vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) { | |
4574 | /* | |
4575 | * We are not being canceled or paused, so the reflow must be | |
4576 | * complete. In that case also mark it as completed on disk. | |
4577 | */ | |
4578 | ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX); | |
4579 | VERIFY0(dsl_sync_task(spa_name(spa), NULL, | |
4580 | raidz_reflow_complete_sync, spa, | |
4581 | 0, ZFS_SPACE_CHECK_NONE)); | |
4582 | (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL); | |
4583 | } else { | |
4584 | /* | |
4585 | * Wait for all copy zio's to complete and for all the | |
4586 | * raidz_reflow_sync() synctasks to be run. | |
4587 | */ | |
4588 | spa_history_log_internal(spa, "reflow pause", | |
4589 | NULL, "offset=%llu failed_offset=%lld", | |
4590 | (long long)vre->vre_offset, | |
4591 | (long long)vre->vre_failed_offset); | |
4592 | mutex_enter(&vre->vre_lock); | |
4593 | if (vre->vre_failed_offset != UINT64_MAX) { | |
4594 | /* | |
4595 | * Reset progress so that we will retry everything | |
4596 | * after the point that something failed. | |
4597 | */ | |
4598 | vre->vre_offset = vre->vre_failed_offset; | |
4599 | vre->vre_failed_offset = UINT64_MAX; | |
4600 | vre->vre_waiting_for_resilver = B_TRUE; | |
4601 | } | |
4602 | mutex_exit(&vre->vre_lock); | |
4603 | } | |
4604 | } | |
4605 | ||
4606 | void | |
4607 | spa_start_raidz_expansion_thread(spa_t *spa) | |
4608 | { | |
4609 | ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL); | |
4610 | spa->spa_raidz_expand_zthr = zthr_create("raidz_expand", | |
4611 | spa_raidz_expand_thread_check, spa_raidz_expand_thread, | |
4612 | spa, defclsyspri); | |
4613 | } | |
4614 | ||
4615 | void | |
4616 | raidz_dtl_reassessed(vdev_t *vd) | |
4617 | { | |
4618 | spa_t *spa = vd->vdev_spa; | |
4619 | if (spa->spa_raidz_expand != NULL) { | |
4620 | vdev_raidz_expand_t *vre = spa->spa_raidz_expand; | |
4621 | /* | |
4622 | * we get called often from vdev_dtl_reassess() so make | |
4623 | * sure it's our vdev and any replacing is complete | |
4624 | */ | |
4625 | if (vd->vdev_top->vdev_id == vre->vre_vdev_id && | |
4626 | !vdev_raidz_expand_child_replacing(vd->vdev_top)) { | |
4627 | mutex_enter(&vre->vre_lock); | |
4628 | if (vre->vre_waiting_for_resilver) { | |
4629 | vdev_dbgmsg(vd, "DTL reassessed, " | |
4630 | "continuing raidz expansion"); | |
4631 | vre->vre_waiting_for_resilver = B_FALSE; | |
4632 | zthr_wakeup(spa->spa_raidz_expand_zthr); | |
4633 | } | |
4634 | mutex_exit(&vre->vre_lock); | |
4635 | } | |
4636 | } | |
4637 | } | |
4638 | ||
4639 | int | |
4640 | vdev_raidz_attach_check(vdev_t *new_child) | |
4641 | { | |
4642 | vdev_t *raidvd = new_child->vdev_parent; | |
4643 | uint64_t new_children = raidvd->vdev_children; | |
4644 | ||
4645 | /* | |
4646 | * We use the "boot" space as scratch space to handle overwriting the | |
4647 | * initial part of the vdev. If it is too small, then this expansion | |
4648 | * is not allowed. This would be very unusual (e.g. ashift > 13 and | |
4649 | * >200 children). | |
4650 | */ | |
4651 | if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) { | |
4652 | return (EINVAL); | |
4653 | } | |
4654 | return (0); | |
4655 | } | |
4656 | ||
4657 | void | |
4658 | vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx) | |
4659 | { | |
4660 | vdev_t *new_child = arg; | |
4661 | spa_t *spa = new_child->vdev_spa; | |
4662 | vdev_t *raidvd = new_child->vdev_parent; | |
4663 | vdev_raidz_t *vdrz = raidvd->vdev_tsd; | |
4664 | ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops); | |
4665 | ASSERT3P(raidvd->vdev_top, ==, raidvd); | |
4666 | ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width); | |
4667 | ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1); | |
4668 | ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==, | |
4669 | new_child); | |
4670 | ||
4671 | spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx); | |
4672 | ||
4673 | vdrz->vd_physical_width++; | |
4674 | ||
4675 | VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info); | |
4676 | vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id; | |
4677 | vdrz->vn_vre.vre_offset = 0; | |
4678 | vdrz->vn_vre.vre_failed_offset = UINT64_MAX; | |
4679 | spa->spa_raidz_expand = &vdrz->vn_vre; | |
4680 | zthr_wakeup(spa->spa_raidz_expand_zthr); | |
4681 | ||
4682 | /* | |
4683 | * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get | |
4684 | * written to the config. | |
4685 | */ | |
4686 | vdev_config_dirty(raidvd); | |
4687 | ||
4688 | vdrz->vn_vre.vre_start_time = gethrestime_sec(); | |
4689 | vdrz->vn_vre.vre_end_time = 0; | |
4690 | vdrz->vn_vre.vre_state = DSS_SCANNING; | |
4691 | vdrz->vn_vre.vre_bytes_copied = 0; | |
4692 | ||
4693 | uint64_t state = vdrz->vn_vre.vre_state; | |
4694 | VERIFY0(zap_update(spa->spa_meta_objset, | |
4695 | raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, | |
4696 | sizeof (state), 1, &state, tx)); | |
4697 | ||
4698 | uint64_t start_time = vdrz->vn_vre.vre_start_time; | |
4699 | VERIFY0(zap_update(spa->spa_meta_objset, | |
4700 | raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, | |
4701 | sizeof (start_time), 1, &start_time, tx)); | |
4702 | ||
4703 | (void) zap_remove(spa->spa_meta_objset, | |
4704 | raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx); | |
4705 | (void) zap_remove(spa->spa_meta_objset, | |
4706 | raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx); | |
4707 | ||
4708 | spa_history_log_internal(spa, "raidz vdev expansion started", tx, | |
4709 | "%s vdev %llu new width %llu", spa_name(spa), | |
4710 | (unsigned long long)raidvd->vdev_id, | |
4711 | (unsigned long long)raidvd->vdev_children); | |
4712 | } | |
4713 | ||
4714 | int | |
4715 | vdev_raidz_load(vdev_t *vd) | |
4716 | { | |
4717 | vdev_raidz_t *vdrz = vd->vdev_tsd; | |
4718 | int err; | |
4719 | ||
4720 | uint64_t state = DSS_NONE; | |
4721 | uint64_t start_time = 0; | |
4722 | uint64_t end_time = 0; | |
4723 | uint64_t bytes_copied = 0; | |
4724 | ||
4725 | if (vd->vdev_top_zap != 0) { | |
4726 | err = zap_lookup(vd->vdev_spa->spa_meta_objset, | |
4727 | vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE, | |
4728 | sizeof (state), 1, &state); | |
4729 | if (err != 0 && err != ENOENT) | |
4730 | return (err); | |
4731 | ||
4732 | err = zap_lookup(vd->vdev_spa->spa_meta_objset, | |
4733 | vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME, | |
4734 | sizeof (start_time), 1, &start_time); | |
4735 | if (err != 0 && err != ENOENT) | |
4736 | return (err); | |
4737 | ||
4738 | err = zap_lookup(vd->vdev_spa->spa_meta_objset, | |
4739 | vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, | |
4740 | sizeof (end_time), 1, &end_time); | |
4741 | if (err != 0 && err != ENOENT) | |
4742 | return (err); | |
4743 | ||
4744 | err = zap_lookup(vd->vdev_spa->spa_meta_objset, | |
4745 | vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, | |
4746 | sizeof (bytes_copied), 1, &bytes_copied); | |
4747 | if (err != 0 && err != ENOENT) | |
4748 | return (err); | |
4749 | } | |
4750 | ||
4751 | /* | |
4752 | * If we are in the middle of expansion, vre_state should have | |
4753 | * already been set by vdev_raidz_init(). | |
4754 | */ | |
4755 | EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING); | |
4756 | vdrz->vn_vre.vre_state = (dsl_scan_state_t)state; | |
4757 | vdrz->vn_vre.vre_start_time = start_time; | |
4758 | vdrz->vn_vre.vre_end_time = end_time; | |
4759 | vdrz->vn_vre.vre_bytes_copied = bytes_copied; | |
4760 | ||
4761 | return (0); | |
4762 | } | |
4763 | ||
4764 | int | |
4765 | spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres) | |
4766 | { | |
4767 | vdev_raidz_expand_t *vre = spa->spa_raidz_expand; | |
4768 | ||
4769 | if (vre == NULL) { | |
4770 | /* no removal in progress; find most recent completed */ | |
4771 | for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { | |
4772 | vdev_t *vd = spa->spa_root_vdev->vdev_child[c]; | |
4773 | if (vd->vdev_ops == &vdev_raidz_ops) { | |
4774 | vdev_raidz_t *vdrz = vd->vdev_tsd; | |
4775 | ||
4776 | if (vdrz->vn_vre.vre_end_time != 0 && | |
4777 | (vre == NULL || | |
4778 | vdrz->vn_vre.vre_end_time > | |
4779 | vre->vre_end_time)) { | |
4780 | vre = &vdrz->vn_vre; | |
4781 | } | |
4782 | } | |
4783 | } | |
4784 | } | |
4785 | ||
4786 | if (vre == NULL) { | |
4787 | return (SET_ERROR(ENOENT)); | |
4788 | } | |
4789 | ||
4790 | pres->pres_state = vre->vre_state; | |
4791 | pres->pres_expanding_vdev = vre->vre_vdev_id; | |
4792 | ||
4793 | vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id); | |
4794 | pres->pres_to_reflow = vd->vdev_stat.vs_alloc; | |
4795 | ||
4796 | mutex_enter(&vre->vre_lock); | |
4797 | pres->pres_reflowed = vre->vre_bytes_copied; | |
4798 | for (int i = 0; i < TXG_SIZE; i++) | |
4799 | pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i]; | |
4800 | mutex_exit(&vre->vre_lock); | |
4801 | ||
4802 | pres->pres_start_time = vre->vre_start_time; | |
4803 | pres->pres_end_time = vre->vre_end_time; | |
4804 | pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver; | |
4805 | ||
4806 | return (0); | |
4807 | } | |
4808 | ||
4809 | /* | |
4810 | * Initialize private RAIDZ specific fields from the nvlist. | |
4811 | */ | |
4812 | static int | |
4813 | vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd) | |
4814 | { | |
4815 | uint_t children; | |
4816 | nvlist_t **child; | |
4817 | int error = nvlist_lookup_nvlist_array(nv, | |
4818 | ZPOOL_CONFIG_CHILDREN, &child, &children); | |
4819 | if (error != 0) | |
4820 | return (SET_ERROR(EINVAL)); | |
4821 | ||
4822 | uint64_t nparity; | |
4823 | if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { | |
4824 | if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) | |
4825 | return (SET_ERROR(EINVAL)); | |
4826 | ||
4827 | /* | |
4828 | * Previous versions could only support 1 or 2 parity | |
4829 | * device. | |
4830 | */ | |
4831 | if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2) | |
4832 | return (SET_ERROR(EINVAL)); | |
4833 | else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3) | |
4834 | return (SET_ERROR(EINVAL)); | |
4835 | } else { | |
4836 | /* | |
4837 | * We require the parity to be specified for SPAs that | |
4838 | * support multiple parity levels. | |
4839 | */ | |
4840 | if (spa_version(spa) >= SPA_VERSION_RAIDZ2) | |
4841 | return (SET_ERROR(EINVAL)); | |
4842 | ||
4843 | /* | |
4844 | * Otherwise, we default to 1 parity device for RAID-Z. | |
4845 | */ | |
4846 | nparity = 1; | |
4847 | } | |
4848 | ||
4849 | vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP); | |
4850 | vdrz->vn_vre.vre_vdev_id = -1; | |
4851 | vdrz->vn_vre.vre_offset = UINT64_MAX; | |
4852 | vdrz->vn_vre.vre_failed_offset = UINT64_MAX; | |
4853 | mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL); | |
4854 | cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL); | |
4855 | zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL); | |
4856 | mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL); | |
4857 | avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare, | |
4858 | sizeof (reflow_node_t), offsetof(reflow_node_t, re_link)); | |
4859 | ||
4860 | vdrz->vd_physical_width = children; | |
4861 | vdrz->vd_nparity = nparity; | |
4862 | ||
4863 | /* note, the ID does not exist when creating a pool */ | |
4864 | (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, | |
4865 | &vdrz->vn_vre.vre_vdev_id); | |
4866 | ||
4867 | boolean_t reflow_in_progress = | |
4868 | nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); | |
4869 | if (reflow_in_progress) { | |
4870 | spa->spa_raidz_expand = &vdrz->vn_vre; | |
4871 | vdrz->vn_vre.vre_state = DSS_SCANNING; | |
4872 | } | |
4873 | ||
4874 | vdrz->vd_original_width = children; | |
4875 | uint64_t *txgs; | |
4876 | unsigned int txgs_size = 0; | |
4877 | error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, | |
4878 | &txgs, &txgs_size); | |
4879 | if (error == 0) { | |
4880 | for (int i = 0; i < txgs_size; i++) { | |
4881 | reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP); | |
4882 | re->re_txg = txgs[txgs_size - i - 1]; | |
4883 | re->re_logical_width = vdrz->vd_physical_width - i; | |
4884 | ||
4885 | if (reflow_in_progress) | |
4886 | re->re_logical_width--; | |
4887 | ||
4888 | avl_add(&vdrz->vd_expand_txgs, re); | |
4889 | } | |
4890 | ||
4891 | vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size; | |
4892 | } | |
4893 | if (reflow_in_progress) { | |
4894 | vdrz->vd_original_width--; | |
4895 | zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions", | |
4896 | children, txgs_size); | |
4897 | } | |
4898 | ||
4899 | *tsd = vdrz; | |
4900 | ||
4901 | return (0); | |
4902 | } | |
4903 | ||
4904 | static void | |
4905 | vdev_raidz_fini(vdev_t *vd) | |
4906 | { | |
4907 | vdev_raidz_t *vdrz = vd->vdev_tsd; | |
4908 | if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre) | |
4909 | vd->vdev_spa->spa_raidz_expand = NULL; | |
4910 | reflow_node_t *re; | |
4911 | void *cookie = NULL; | |
4912 | avl_tree_t *tree = &vdrz->vd_expand_txgs; | |
4913 | while ((re = avl_destroy_nodes(tree, &cookie)) != NULL) | |
4914 | kmem_free(re, sizeof (*re)); | |
4915 | avl_destroy(&vdrz->vd_expand_txgs); | |
4916 | mutex_destroy(&vdrz->vd_expand_lock); | |
4917 | mutex_destroy(&vdrz->vn_vre.vre_lock); | |
4918 | cv_destroy(&vdrz->vn_vre.vre_cv); | |
4919 | zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock); | |
4920 | kmem_free(vdrz, sizeof (*vdrz)); | |
4921 | } | |
4922 | ||
4923 | /* | |
4924 | * Add RAIDZ specific fields to the config nvlist. | |
4925 | */ | |
4926 | static void | |
4927 | vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv) | |
4928 | { | |
4929 | ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops); | |
4930 | vdev_raidz_t *vdrz = vd->vdev_tsd; | |
4931 | ||
4932 | /* | |
4933 | * Make sure someone hasn't managed to sneak a fancy new vdev | |
4934 | * into a crufty old storage pool. | |
4935 | */ | |
4936 | ASSERT(vdrz->vd_nparity == 1 || | |
4937 | (vdrz->vd_nparity <= 2 && | |
4938 | spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) || | |
4939 | (vdrz->vd_nparity <= 3 && | |
4940 | spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3)); | |
4941 | ||
4942 | /* | |
4943 | * Note that we'll add these even on storage pools where they | |
4944 | * aren't strictly required -- older software will just ignore | |
4945 | * it. | |
4946 | */ | |
4947 | fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity); | |
4948 | ||
4949 | if (vdrz->vn_vre.vre_state == DSS_SCANNING) { | |
4950 | fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING); | |
4951 | } | |
4952 | ||
4953 | mutex_enter(&vdrz->vd_expand_lock); | |
4954 | if (!avl_is_empty(&vdrz->vd_expand_txgs)) { | |
4955 | uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs); | |
4956 | uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count, | |
4957 | KM_SLEEP); | |
4958 | uint64_t i = 0; | |
4959 | ||
4960 | for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs); | |
4961 | re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) { | |
4962 | txgs[i++] = re->re_txg; | |
4963 | } | |
4964 | ||
4965 | fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS, | |
4966 | txgs, count); | |
4967 | ||
4968 | kmem_free(txgs, sizeof (uint64_t) * count); | |
4969 | } | |
4970 | mutex_exit(&vdrz->vd_expand_lock); | |
4971 | } | |
4972 | ||
4973 | static uint64_t | |
4974 | vdev_raidz_nparity(vdev_t *vd) | |
4975 | { | |
4976 | vdev_raidz_t *vdrz = vd->vdev_tsd; | |
4977 | return (vdrz->vd_nparity); | |
4978 | } | |
4979 | ||
4980 | static uint64_t | |
4981 | vdev_raidz_ndisks(vdev_t *vd) | |
4982 | { | |
4983 | return (vd->vdev_children); | |
4984 | } | |
4985 | ||
4986 | vdev_ops_t vdev_raidz_ops = { | |
b2255edc BB |
4987 | .vdev_op_init = vdev_raidz_init, |
4988 | .vdev_op_fini = vdev_raidz_fini, | |
a64f8276 I |
4989 | .vdev_op_open = vdev_raidz_open, |
4990 | .vdev_op_close = vdev_raidz_close, | |
4991 | .vdev_op_asize = vdev_raidz_asize, | |
b2255edc BB |
4992 | .vdev_op_min_asize = vdev_raidz_min_asize, |
4993 | .vdev_op_min_alloc = NULL, | |
a64f8276 I |
4994 | .vdev_op_io_start = vdev_raidz_io_start, |
4995 | .vdev_op_io_done = vdev_raidz_io_done, | |
4996 | .vdev_op_state_change = vdev_raidz_state_change, | |
4997 | .vdev_op_need_resilver = vdev_raidz_need_resilver, | |
4998 | .vdev_op_hold = NULL, | |
4999 | .vdev_op_rele = NULL, | |
5000 | .vdev_op_remap = NULL, | |
5001 | .vdev_op_xlate = vdev_raidz_xlate, | |
b2255edc BB |
5002 | .vdev_op_rebuild_asize = NULL, |
5003 | .vdev_op_metaslab_init = NULL, | |
5004 | .vdev_op_config_generate = vdev_raidz_config_generate, | |
5005 | .vdev_op_nparity = vdev_raidz_nparity, | |
5006 | .vdev_op_ndisks = vdev_raidz_ndisks, | |
a64f8276 I |
5007 | .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ |
5008 | .vdev_op_leaf = B_FALSE /* not a leaf vdev */ | |
34dc7c2f | 5009 | }; |
5caeef02 DB |
5010 | |
5011 | /* BEGIN CSTYLED */ | |
5012 | ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW, | |
5013 | "For testing, pause RAIDZ expansion after reflowing this many bytes"); | |
5014 | ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW, | |
5015 | "Max amount of concurrent i/o for RAIDZ expansion"); | |
5016 | ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW, | |
5017 | "For expanded RAIDZ, aggregate reads that have more rows than this"); | |
5018 | ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW, | |
5019 | "For expanded RAIDZ, automatically start a pool scrub when expansion " | |
5020 | "completes"); | |
5021 | /* END CSTYLED */ |