/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
 */
#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zap.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_tx.h>
#include <sys/abd.h>
#include <sys/zfs_rlock.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_raidz_impl.h>
#include <sys/vdev_draid.h>
#include <sys/uberblock_impl.h>
#include <sys/dsl_scan.h>
#include <sys/vdev.h> /* For vdev_xlate() in vdev_raidz_io_verify() */
/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A ^ (255 - 1) = A^254.
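 *
 * As an illustrative sketch (not part of the driver), a table-driven
 * multiply following the rules above could look like this, given
 * hypothetical gf_exp[]/gf_log[] tables analogous to vdev_raidz_pow2[]
 * and vdev_raidz_log2[]:
 *
 *	static inline uint8_t
 *	gf256_mul(uint8_t a, uint8_t b)
 *	{
 *		if (a == 0 || b == 0)
 *			return (0);
 *		int sum = gf_log[a] + gf_log[b];
 *		return (gf_exp[sum >= 255 ? sum - 255 : sum]);
 *	}
 *
 * The index arithmetic uses ordinary (not field) addition, wrapped mod 255
 * because the multiplicative group has 255 elements.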
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */
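
/*
 * For illustration only (not used by the driver): with the one-byte
 * multiply-by-2 expression given above, P and Q for a stripe can be
 * accumulated in the Horner style shown in the formulas (sketch;
 * gf256_mul2() is a hypothetical helper wrapping that expression):
 *
 *	uint8_t p = 0, q = 0;
 *	for (int d = 0; d < ndata; d++) {
 *		p ^= data[d];
 *		q = gf256_mul2(q) ^ data[d];
 *	}
 *
 * Processing D_0 first, each pass multiplies the running Q by 2 and adds
 * the next data byte, yielding Q = 2^(n-1) * D_0 + ... + 2^0 * D_n-1.
 * This is the math only; the driver's actual parity generation is the
 * vectorized vdev_raidz_math_generate() and the word-wide loops below.
 */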
#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}
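
/*
 * For illustration: the 64-bit multiply above lets the parity loops below
 * (e.g. vdev_raidz_pq_func()) advance Q eight field elements at a time.
 * Conceptually (variable names hypothetical):
 *
 *	uint64_t q_word, d_word, mask;
 *	...
 *	VDEV_RAIDZ_64MUL_2(q_word, mask);
 *	q_word ^= d_word;
 *
 * The first statement multiplies all eight bytes of q_word by 2 in GF(2^8);
 * the XOR then adds the corresponding eight data bytes.
 */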
/*
 * Big Theory Statement for how a RAIDZ VDEV is expanded
 *
 * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
 * works with all three RAIDZ parity choices (RAIDZ1, 2, or 3). VDEVs that
 * have been previously expanded can be expanded again.
 *
 * The RAIDZ VDEV must be healthy (must be able to write to all the drives in
 * the VDEV) when an expansion starts. The expansion will pause if any disk
 * in the VDEV fails, and resume once the VDEV is healthy again. All other
 * operations on the pool can continue while an expansion is in progress
 * (e.g. read/write, snapshot, zpool add, etc.), except for zpool checkpoint,
 * zpool trim, and zpool initialize, which can't be run during an expansion.
 * Following a reboot or export/import, the expansion resumes where it left
 * off.
 *
 * == Reflowing the Data ==
 *
 * The expansion involves reflowing (copying) the data from the current set
 * of disks to spread it across the new set which now has one more disk. This
 * reflow operation is similar to reflowing text when the column width of a
 * text editor window is expanded. The text doesn't change but the location
 * of the text changes to accommodate the new width. An example reflow result
 * for a 4-wide RAIDZ1 to a 5-wide is shown below.
 *
 * Each letter indicates a parity group (logical stripe)
 *
 *      Before expansion                    After Expansion
 * D1     D2     D3     D4           D1     D2     D3     D4     D5
 * +------+------+------+------+     +------+------+------+------+------+
 * |      |      |      |      |     |      |      |      |      |      |
 * |  A   |  A   |  A   |  A   |     |  A   |  A   |  A   |  A   |  B   |
 * |     1|     2|     3|     4|     |     1|     2|     3|     4|     5|
 * +------+------+------+------+     +------+------+------+------+------+
 * |      |      |      |      |     |      |      |      |      |      |
 * |  B   |  B   |  C   |  C   |     |  B   |  C   |  C   |  C   |  C   |
 * |     5|     6|     7|     8|     |     6|     7|     8|     9|    10|
 * +------+------+------+------+     +------+------+------+------+------+
 * |      |      |      |      |     |      |      |      |      |      |
 * |  C   |  C   |  D   |  D   |     |  D   |  D   |  E   |  E   |  E   |
 * |     9|    10|    11|    12|     |    11|    12|    13|    14|    15|
 * +------+------+------+------+     +------+------+------+------+------+
 * |      |      |      |      |     |      |      |      |      |      |
 * |  E   |  E   |  E   |  E   | --> |  E   |  F   |  F   |  G   |  G   |
 * |    13|    14|    15|    16|     |    16|    17|    18|    19|    20|
 * +------+------+------+------+     +------+------+------+------+------+
 * |      |      |      |      |     |      |      |      |      |      |
 * |  F   |  F   |  G   |  G   |     |  G   |  G   |  H   |  H   |  H   |
 * |    17|    18|    19|    20|     |    21|    22|    23|    24|    25|
 * +------+------+------+------+     +------+------+------+------+------+
 * |      |      |      |      |     |      |      |      |      |      |
 * |  G   |  G   |  H   |  H   |     |  H   |  I   |  I   |  J   |  J   |
 * |    21|    22|    23|    24|     |    26|    27|    28|    29|    30|
 * +------+------+------+------+     +------+------+------+------+------+
 * |      |      |      |      |     |      |      |      |      |      |
 * |  H   |  H   |  I   |  I   |     |  J   |  J   |      |      |  K   |
 * |    25|    26|    27|    28|     |    31|    32|    33|    34|    35|
 * +------+------+------+------+     +------+------+------+------+------+
 *
 * This reflow approach has several advantages. There is no need to read or
 * modify the block pointers or recompute any block checksums. The reflow
 * doesn't need to know where the parity sectors reside. We can read and
 * write data sequentially and the copy can occur in a background thread in
 * open context. The design also allows for fast discovery of what data to
 * copy.
 *
 * The VDEV metaslabs are processed, one at a time, to copy the block data to
 * have it flow across all the disks. The metaslab is disabled for allocations
 * during the copy. As an optimization, we only copy the allocated data, which
 * can be determined by looking at the metaslab range tree. During the copy we
 * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still
 * need to be able to survive losing parity count disks). This means we
 * cannot overwrite data during the reflow that would be needed if a disk is
 * lost.
 *
 * After the reflow completes, all newly-written blocks will have the new
 * layout, i.e., they will have the parity to data ratio implied by the new
 * number of disks in the RAIDZ group. Even though the reflow copies all of
 * the allocated space (data and parity), it is only rearranged, not changed.
 *
 * This act of reflowing the data has a few implications about blocks
 * that were written before the reflow completes:
 *
 * - Old blocks will still use the same amount of space (i.e., they will have
 *   the parity to data ratio implied by the old number of disks in the RAIDZ
 *   group).
 * - Reading old blocks will be slightly slower than before the reflow, for
 *   two reasons. First, we will have to read from all disks in the RAIDZ
 *   VDEV, rather than being able to skip the children that contain only
 *   parity of this block (because the data of a single block is now spread
 *   out across all the disks). Second, in most cases there will be an extra
 *   bcopy, needed to rearrange the data back to its original layout in
 *   memory.
 *
 * == Scratch Area ==
 *
 * As we copy the block data, we can only progress to the point that writes
 * will not overlap with blocks whose progress has not yet been recorded on
 * disk. Since partially-copied rows are always read from the old location,
 * we need to stop one row before the sector-wise overlap, to prevent any
 * row-wise overlap. For example, in the diagram above, when we reflow sector
 * B6 it will overwrite the original location for B5.
 *
 * To get around this, a scratch space is used so that we can start copying
 * without risking data loss by overlapping the row. As an added benefit, it
 * improves performance at the beginning of the reflow, but that small perf
 * boost wouldn't be worth the complexity on its own.
 *
 * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
 * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
 * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
 * the widths will likely be single digits so we can get a substantial chunk
 * size using only a few MB of scratch per disk.
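 *
 * A quick worked example (illustrative numbers): for new_width = 10 with
 * 4K sectors, 2 * new_width^2 = 200 sectors in total, i.e. 2 * new_width =
 * 20 sectors (80 KiB) per disk -- far below the 2MB per-disk worst case
 * quoted above for the maximum width.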
 *
 * The scratch area is persisted to disk and holds a large amount of reflowed
 * state. We can always read the partially written stripes when a disk fails
 * or the copy is interrupted (crash) during the initial copying phase, and
 * also get past a small chunk size restriction. At a minimum, the scratch
 * space must be large enough to get us to the point that one row does not
 * overlap itself when moved (i.e. new_width^2). But going larger is even
 * better. We use the 3.5 MiB reserved "boot" space that resides after the
 * ZFS disk labels as our scratch space to handle overwriting the initial
 * part of the VDEV.
 *
 *	+------+------+-----------------------+---------------------------
 *	| VDEV | VDEV |   Boot Block (3.5M)   |  Allocatable space ...
 *	|  L0  |  L1  |       Reserved        |      (Metaslabs)
 *	+------+------+-----------------------+---------------------------
 *
 * == Reflow Progress Updates ==
 *
 * After the initial scratch-based reflow, the expansion process works
 * similarly to device removal. We create a new open context thread which
 * reflows the data, and periodically kicks off sync tasks to update logical
 * state. In this case, state is the committed progress (offset of next data
 * to copy). We need to persist the completed offset on disk, so that if we
 * crash we know which format each VDEV offset is in.
 *
 * == Time Dependent Geometry ==
 *
 * In non-expanded RAIDZ, blocks are read from disk in a column by column
 * fashion. For a multi-row block, the second sector is in the first column,
 * not in the second column. This allows us to issue full reads for each
 * column directly into the request buffer. The block data is thus laid out
 * sequentially in a column-by-column fashion.
 *
 * For example, in the before expansion diagram above, one logical block might
 * be sectors G19-H26. The parity is in G19,H23; and the data is in
 * G20,H24,G21,H25,G22,H26.
 *
 * After a block is reflowed, the sectors that were all in the original column
 * data can now reside in different columns. When reading from an expanded
 * VDEV, we need to know the logical stripe width for each block so we can
 * reconstitute the block's data after the reads are completed. Likewise,
 * when we perform the combinatorial reconstruction we need to know the
 * original width so we can retry combinations from the past layouts.
 *
 * Time dependent geometry is what we call having blocks with different
 * layouts (stripe widths) in the same VDEV. This time-dependent geometry
 * uses the block's birth time (+ the time expansion ended) to establish the
 * correct width for a given block. After an expansion completes, we record
 * the time for blocks written with a particular width (geometry).
 *
 * == On Disk Format Changes ==
 *
 * A new pool feature flag, 'raidz_expansion', whose reference count is the
 * number of RAIDZ VDEVs that have been expanded.
 *
 * The blocks on an expanded RAIDZ VDEV can have different logical stripe
 * widths.
 *
 * Since the uberblock can point to arbitrary blocks, which might be on the
 * expanding RAIDZ and might or might not have been reflowed yet, we need to
 * know which way a block is laid out before reading it. This info is the
 * next offset that needs to be reflowed, and we persist it in the uberblock,
 * in the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev
 * label. After the expansion is complete, we then use the raidz_expand_txgs
 * array (see below) to determine how to read a block, and the
 * ub_raidz_reflow_info field is no longer required.
 *
 * The uberblock's ub_raidz_reflow_info field also holds the scratch space
 * state (i.e., active or not) which is also required before reading a block
 * during the initial phase of reflowing the data.
 *
 * The top-level RAIDZ VDEV has two new entries in the nvlist:
 *
 * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
 *                            and used after the expansion is complete to
 *                            determine how to read a raidz block
 * 'raidz_expanding' boolean: present during reflow and removed after
 *                            completion; used during a spa import to resume
 *                            an unfinished expansion
 *
 * And finally the VDEV's top-level zap adds the following informational
 * entries:
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
 */
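
/*
 * Illustrative sketch (hypothetical helper, not the driver's actual
 * interface): after expansion, a block's logical stripe width can be
 * derived from its birth txg and the raidz_expand_txgs array described
 * above, along the lines of:
 *
 *	static uint64_t
 *	block_logical_width(uint64_t birth_txg, const uint64_t *expand_txgs,
 *	    uint64_t base_width, uint64_t nexpansions)
 *	{
 *		uint64_t width = base_width;
 *		for (uint64_t i = 0; i < nexpansions; i++) {
 *			if (birth_txg >= expand_txgs[i])
 *				width++;
 *		}
 *		return (width);
 *	}
 *
 * Blocks born at or after the txg recorded for an expansion are read with
 * the wider geometry; the driver tracks these (txg, width) pairs with
 * reflow_node_t entries ordered by vdev_raidz_reflow_compare() below.
 */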
/*
 * For testing only: pause the raidz expansion after reflowing this amount.
 * (accessed by ZTS and ztest)
 */
unsigned long raidz_expand_max_reflow_bytes = 0;

/*
 * For testing only: pause the raidz expansion at a certain point.
 */
uint_t raidz_expand_pause_point = 0;

/*
 * Maximum amount of copy I/O outstanding at once.
 */
static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;

/*
 * Apply raidz map abds aggregation if the number of rows in the map is equal
 * to or greater than the value below.
 */
static unsigned long raidz_io_aggregate_rows = 4;

/*
 * Automatically start a pool scrub when a RAIDZ expansion completes in
 * order to verify the checksums of all blocks which have been copied
 * during the expansion. Automatic scrubbing is enabled by default and
 * is strongly recommended.
 */
static int zfs_scrub_after_expand = 1;
static void
vdev_raidz_row_free(raidz_row_t *rr)
{
	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		if (rc->rc_size != 0)
			abd_free(rc->rc_abd);
		if (rc->rc_orig_data != NULL)
			abd_free(rc->rc_orig_data);
	}

	if (rr->rr_abd_empty != NULL)
		abd_free(rr->rr_abd_empty);

	kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
}
void
vdev_raidz_map_free(raidz_map_t *rm)
{
	for (int i = 0; i < rm->rm_nrows; i++)
		vdev_raidz_row_free(rm->rm_row[i]);

	if (rm->rm_nphys_cols) {
		for (int i = 0; i < rm->rm_nphys_cols; i++) {
			if (rm->rm_phys_col[i].rc_abd != NULL)
				abd_free(rm->rm_phys_col[i].rc_abd);
		}

		kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) *
		    rm->rm_nphys_cols);
	}

	ASSERT3P(rm->rm_lr, ==, NULL);
	kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
}
static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	vdev_raidz_map_free(rm);
}
static int
vdev_raidz_reflow_compare(const void *x1, const void *x2)
{
	const reflow_node_t *l = x1;
	const reflow_node_t *r = x2;

	return (TREE_CMP(l->re_txg, r->re_txg));
}
const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	.vsd_free = vdev_raidz_map_free_vsd,
};
static raidz_row_t *
vdev_raidz_row_alloc(int cols)
{
	raidz_row_t *rr =
	    kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);

	rr->rr_cols = cols;
	rr->rr_scols = cols;

	for (int c = 0; c < cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		rc->rc_shadow_devidx = INT_MAX;
		rc->rc_shadow_offset = UINT64_MAX;
		rc->rc_allow_repair = 1;
	}
	return (rr);
}
static void
vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
{
	int c;
	int nwrapped = 0;
	uint64_t off = 0;
	raidz_row_t *rr = rm->rm_row[0];

	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
	ASSERT3U(rm->rm_nrows, ==, 1);

	/*
	 * Pad any parity columns with additional space to account for skip
	 * sectors.
	 */
	if (rm->rm_skipstart < rr->rr_firstdatacol) {
		ASSERT0(rm->rm_skipstart);
		nwrapped = rm->rm_nskip;
	} else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
		nwrapped =
		    (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
	}

	/*
	 * Optional single skip sectors (rc_size == 0) will be handled in
	 * vdev_raidz_io_start_write().
	 */
	int skipped = rr->rr_scols - rr->rr_cols;

	/* Allocate buffers for the parity columns */
	for (c = 0; c < rr->rr_firstdatacol; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		/*
		 * Parity columns will pad out a linear ABD to account for
		 * the skip sector. A linear ABD is used here because
		 * parity calculations use the ABD buffer directly to calculate
		 * parity. This avoids doing a memcpy back to the ABD after the
		 * parity has been calculated. By issuing the parity column
		 * with the skip sector we can reduce contention on the child
		 * VDEV queue locks (vq_lock).
		 */
		if (c < nwrapped) {
			rc->rc_abd = abd_alloc_linear(
			    rc->rc_size + (1ULL << ashift), B_FALSE);
			abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
			skipped++;
		} else {
			rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
		}
	}

	for (off = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
		    zio->io_abd, off, rc->rc_size);

		/*
		 * Generate I/O for skip sectors to improve aggregation
		 * continuity. We will use gang ABD's to reduce contention
		 * on the child VDEV queue locks (vq_lock) by issuing
		 * a single I/O that contains the data and skip sector.
		 *
		 * It is important to make sure that rc_size is not updated
		 * even though we are adding a skip sector to the ABD. When
		 * calculating the parity in vdev_raidz_generate_parity_row()
		 * the rc_size is used to iterate through the ABD's. We can
		 * not have zero'd out skip sectors used for calculating
		 * parity for raidz, because those same sectors are not used
		 * during reconstruction.
		 */
		if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
			rc->rc_abd = abd_alloc_gang();
			abd_gang_add(rc->rc_abd, abd, B_TRUE);
			abd_gang_add(rc->rc_abd,
			    abd_get_zeros(1ULL << ashift), B_TRUE);
			skipped++;
		} else {
			rc->rc_abd = abd;
		}
		off += rc->rc_size;
	}

	ASSERT3U(off, ==, zio->io_size);
	ASSERT3S(skipped, ==, rm->rm_nskip);
}
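
/*
 * Worked example (illustrative): a 2-sector write to a 5-wide RAIDZ1
 * produces 1 parity + 2 data sectors; padding the allocation to a multiple
 * of (nparity + 1) = 2 sectors requires rm_nskip = 1 skip sector starting
 * at column rm_skipstart = 3, which is exactly what the gang ABD path
 * above attaches to the adjacent data column's I/O.
 */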
static void
vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
{
	int c;
	raidz_row_t *rr = rm->rm_row[0];

	ASSERT3U(rm->rm_nrows, ==, 1);

	/* Allocate buffers for the parity columns */
	for (c = 0; c < rr->rr_firstdatacol; c++)
		rr->rr_col[c].rc_abd =
		    abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);

	for (uint64_t off = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
		    zio->io_abd, off, rc->rc_size);
		off += rc->rc_size;
	}
}
/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * Avoid inlining the function to keep vdev_raidz_io_start(), which
 * is this function's only caller, as small as possible on the stack.
 */
noinline raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_row_t *rr;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = zio->io_offset >> ashift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = zio->io_size >> ashift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << ashift;
	uint64_t acols, scols;

	raidz_map_t *rm =
	    kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
	rm->rm_nrows = 1;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	uint64_t q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	uint64_t r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	uint64_t bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/*
	 * acols: The columns that will be accessed.
	 * scols: The columns that will be accessed or skipped.
	 */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);
	rr = vdev_raidz_row_alloc(scols);
	rm->rm_row[0] = rr;
	rr->rr_cols = acols;
	rr->rr_firstdatacol = nparity;
#ifdef ZFS_DEBUG
	rr->rr_offset = zio->io_offset;
	rr->rr_size = zio->io_size;
#endif

	uint64_t asize = 0;

	for (uint64_t c = 0; c < scols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		uint64_t col = f + c;
		uint64_t coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << ashift;
		}
		rc->rc_devidx = col;
		rc->rc_offset = coff;

		if (c >= acols)
			rc->rc_size = 0;
		else if (c < bc)
			rc->rc_size = (q + 1) << ashift;
		else
			rc->rc_size = q << ashift;

		asize += rc->rc_size;
	}

	ASSERT3U(asize, ==, tot << ashift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	rm->rm_skipstart = bc;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rr->rr_cols >= 2);
	ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);

	if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		uint64_t devidx = rr->rr_col[0].rc_devidx;
		o = rr->rr_col[0].rc_offset;
		rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
		rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
		rr->rr_col[1].rc_devidx = devidx;
		rr->rr_col[1].rc_offset = o;
		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_map_alloc_write(zio, rm, ashift);
	} else {
		vdev_raidz_map_alloc_read(zio, rm);
	}
	/* init RAIDZ parity ops */
	rm->rm_ops = vdev_raidz_math_get_ops();

	return (rm);
}
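
/*
 * Worked example of the geometry above (illustrative numbers): an I/O of
 * s = 10 sectors on a 6-wide RAIDZ2 (dcols = 6, nparity = 2) yields
 * q = 10 / 4 = 2 full stripe rows, r = 2 remainder sectors,
 * bc = r + nparity = 4 "big columns", and tot = 10 + 2 * (2 + 1) = 16
 * sectors in all, so rm_nskip = roundup(16, 3) - 16 = 2 and
 * rm_skipstart = 4.
 */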
/*
 * Everything before reflow_offset_synced should have been moved to the new
 * location (read and write completed). However, this may not yet be reflected
 * in the on-disk format (e.g. raidz_reflow_sync() has been called but the
 * uberblock has not yet been written). If reflow is not in progress,
 * reflow_offset_synced should be UINT64_MAX. For each row, if the row is
 * entirely before reflow_offset_synced, it will come from the new location.
 * Otherwise this row will come from the old location. Therefore, rows that
 * straddle the reflow_offset_synced will come from the old location.
 *
 * For writes, reflow_offset_next is the next offset to copy. If a sector has
 * been copied, but not yet reflected in the on-disk progress
 * (reflow_offset_synced), it will also be written to the new (already copied)
 * location.
 */
noinline raidz_map_t *
vdev_raidz_map_alloc_expanded(zio_t *zio,
    uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
    uint64_t nparity, uint64_t reflow_offset_synced,
    uint64_t reflow_offset_next, boolean_t use_scratch)
{
	abd_t *abd = zio->io_abd;
	uint64_t offset = zio->io_offset;
	uint64_t size = zio->io_size;

	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = size >> ashift;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	uint64_t q = s / (logical_cols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	uint64_t r = s - q * (logical_cols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	uint64_t bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/* How many rows contain data (not skip) */
	uint64_t rows = howmany(tot, logical_cols);
	int cols = MIN(tot, logical_cols);

	raidz_map_t *rm =
	    kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
	    KM_SLEEP);
	rm->rm_nrows = rows;
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	rm->rm_skipstart = bc;
	uint64_t asize = 0;

	for (uint64_t row = 0; row < rows; row++) {
		boolean_t row_use_scratch = B_FALSE;
		raidz_row_t *rr = vdev_raidz_row_alloc(cols);
		rm->rm_row[row] = rr;

		/* The starting RAIDZ (parent) vdev sector of the row. */
		uint64_t b = (offset >> ashift) + row * logical_cols;

		/*
		 * If we are in the middle of a reflow, and the copying has
		 * not yet completed for any part of this row, then use the
		 * old location of this row. Note that reflow_offset_synced
		 * reflects the i/o that's been completed, because it's
		 * updated by a synctask, after zio_wait(spa_txg_zio[]).
		 * This is sufficient for our check, even if that progress
		 * has not yet been recorded to disk (reflected in
		 * spa_ubsync). Also note that we consider the last row to
		 * be "full width" (`cols`-wide rather than `bc`-wide) for
		 * this calculation. This causes a tiny bit of unnecessary
		 * double-writes but is safe and simpler to calculate.
		 */
		int row_phys_cols = physical_cols;
		if (b + cols > reflow_offset_synced >> ashift)
			row_phys_cols--;
		else if (use_scratch)
			row_use_scratch = B_TRUE;

		/* starting child of this row */
		uint64_t child_id = b % row_phys_cols;
		/* The starting byte offset on each child vdev. */
		uint64_t child_offset = (b / row_phys_cols) << ashift;

		/*
		 * Note, rr_cols is the entire width of the block, even
		 * if this row is shorter. This is needed because parity
		 * generation (for Q and R) needs to know the entire width,
		 * because it treats the short row as though it was
		 * full-width (and the "phantom" sectors were zero-filled).
		 *
		 * Another approach to this would be to set cols shorter
		 * (to just the number of columns that we might do i/o to)
		 * and have another mechanism to tell the parity generation
		 * about the "entire width". Reconstruction (at least
		 * vdev_raidz_reconstruct_general()) would also need to
		 * know about the "entire width".
		 */
		rr->rr_firstdatacol = nparity;
#ifdef ZFS_DEBUG
		/*
		 * note: rr_size is PSIZE, not ASIZE
		 */
		rr->rr_offset = b << ashift;
		rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
#endif

		for (int c = 0; c < rr->rr_cols; c++, child_id++) {
			if (child_id >= row_phys_cols) {
				child_id -= row_phys_cols;
				child_offset += 1ULL << ashift;
			}
			raidz_col_t *rc = &rr->rr_col[c];
			rc->rc_devidx = child_id;
			rc->rc_offset = child_offset;

			/*
			 * Get this from the scratch space if appropriate.
			 * This only happens if we crashed in the middle of
			 * raidz_reflow_scratch_sync() (while it's running,
			 * the rangelock prevents us from doing concurrent
			 * io), and even then only during zpool import or
			 * when the pool is imported readonly.
			 */
			if (row_use_scratch)
				rc->rc_offset -= VDEV_BOOT_SIZE;

			uint64_t dc = c - rr->rr_firstdatacol;
			if (c < rr->rr_firstdatacol) {
				rc->rc_size = 1ULL << ashift;

				/*
				 * Parity sectors' rc_abd's are set below
				 * after determining if this is an aggregation.
				 */
			} else if (row == rows - 1 && bc != 0 && c >= bc) {
				/*
				 * Past the end of the block (even including
				 * skip sectors). This sector is part of the
				 * map so that we have full rows for p/q parity
				 * generation.
				 */
				rc->rc_size = 0;
				rc->rc_abd = NULL;
			} else {
				/* "data column" (col excluding parity) */
				uint64_t off;

				if (c < bc || r == 0) {
					off = dc * rows + row;
				} else {
					off = r * rows +
					    (dc - r) * (rows - 1) + row;
				}
				rc->rc_size = 1ULL << ashift;
				rc->rc_abd = abd_get_offset_struct(
				    &rc->rc_abdstruct, abd, off << ashift,
				    rc->rc_size);
			}

			if (rc->rc_size == 0)
				continue;

			/*
			 * If any part of this row is in both old and new
			 * locations, the primary location is the old
			 * location. If this sector was already copied to the
			 * new location, we need to also write to the new,
			 * "shadow" location.
			 *
			 * Note, `row_phys_cols != physical_cols` indicates
			 * that the primary location is the old location.
			 * `b+c < reflow_offset_next` indicates that the copy
			 * to the new location has been initiated. We know
			 * that the copy has completed because we have the
			 * rangelock, which is held exclusively while the
			 * copy is in progress.
			 */
			if (row_use_scratch ||
			    (row_phys_cols != physical_cols &&
			    b + c < reflow_offset_next >> ashift)) {
				rc->rc_shadow_devidx = (b + c) % physical_cols;
				rc->rc_shadow_offset =
				    ((b + c) / physical_cols) << ashift;
				if (row_use_scratch)
					rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
			}

			asize += rc->rc_size;
		}

		/*
		 * See comment in vdev_raidz_map_alloc()
		 */
		if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
		    (offset & (1ULL << 20))) {
			ASSERT(rr->rr_cols >= 2);
			ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);

			int devidx0 = rr->rr_col[0].rc_devidx;
			uint64_t offset0 = rr->rr_col[0].rc_offset;
			int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
			uint64_t shadow_offset0 =
			    rr->rr_col[0].rc_shadow_offset;

			rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
			rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
			rr->rr_col[0].rc_shadow_devidx =
			    rr->rr_col[1].rc_shadow_devidx;
			rr->rr_col[0].rc_shadow_offset =
			    rr->rr_col[1].rc_shadow_offset;

			rr->rr_col[1].rc_devidx = devidx0;
			rr->rr_col[1].rc_offset = offset0;
			rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
			rr->rr_col[1].rc_shadow_offset = shadow_offset0;
		}
	}
	ASSERT3U(asize, ==, tot << ashift);

	/*
	 * Determine if the block is contiguous, in which case we can use
	 * an aggregation.
	 */
	if (rows >= raidz_io_aggregate_rows) {
		rm->rm_nphys_cols = physical_cols;
		rm->rm_phys_col =
		    kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
		    KM_SLEEP);

		/*
		 * Determine the aggregate io's offset and size, and check
		 * that the io is contiguous.
		 */
		for (int i = 0;
		    i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) {
			raidz_row_t *rr = rm->rm_row[i];
			for (int c = 0; c < rr->rr_cols; c++) {
				raidz_col_t *rc = &rr->rr_col[c];
				raidz_col_t *prc =
				    &rm->rm_phys_col[rc->rc_devidx];

				if (rc->rc_size == 0)
					continue;

				if (prc->rc_size == 0) {
					ASSERT0(prc->rc_offset);
					prc->rc_offset = rc->rc_offset;
				} else if (prc->rc_offset + prc->rc_size !=
				    rc->rc_offset) {
					/*
					 * This block is not contiguous and
					 * therefore can't be aggregated.
					 * This is expected to be rare, so
					 * the cost of allocating and then
					 * freeing rm_phys_col is not
					 * significant.
					 */
					kmem_free(rm->rm_phys_col,
					    sizeof (raidz_col_t) *
					    rm->rm_nphys_cols);
					rm->rm_phys_col = NULL;
					rm->rm_nphys_cols = 0;
					break;
				}

				prc->rc_size += rc->rc_size;
			}
		}
	}
	if (rm->rm_phys_col != NULL) {
		/*
		 * Allocate aggregate ABD's.
		 */
		for (int i = 0; i < rm->rm_nphys_cols; i++) {
			raidz_col_t *prc = &rm->rm_phys_col[i];

			prc->rc_devidx = i;

			if (prc->rc_size == 0)
				continue;

			prc->rc_abd =
			    abd_alloc_linear(rm->rm_phys_col[i].rc_size,
			    B_FALSE);
		}

		/*
		 * Point the parity abd's into the aggregate abd's.
		 */
		for (int i = 0; i < rm->rm_nrows; i++) {
			raidz_row_t *rr = rm->rm_row[i];
			for (int c = 0; c < rr->rr_firstdatacol; c++) {
				raidz_col_t *rc = &rr->rr_col[c];
				raidz_col_t *prc =
				    &rm->rm_phys_col[rc->rc_devidx];
				rc->rc_abd =
				    abd_get_offset_struct(&rc->rc_abdstruct,
				    prc->rc_abd,
				    rc->rc_offset - prc->rc_offset,
				    rc->rc_size);
			}
		}
	} else {
		/*
		 * Allocate new abd's for the parity sectors.
		 */
		for (int i = 0; i < rm->rm_nrows; i++) {
			raidz_row_t *rr = rm->rm_row[i];
			for (int c = 0; c < rr->rr_firstdatacol; c++) {
				raidz_col_t *rc = &rr->rr_col[c];
				rc->rc_abd =
				    abd_alloc_linear(rc->rc_size,
				    B_TRUE);
			}
		}
	}
	/* init RAIDZ parity ops */
	rm->rm_ops = vdev_raidz_math_get_ops();

	return (rm);
}
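
/*
 * Worked example of the data-offset mapping above (illustrative numbers):
 * with logical_cols = 5, nparity = 1 and s = 11 data sectors, q = 2, r = 3,
 * bc = 4 and rows = 3. A data column with dc < r contributes a sector to
 * every row, so its data lives at off = dc * rows + row (sectors 0-8 for
 * dc 0-2); the final data column (dc = 3) is absent from the last row, so
 * it is packed at off = r * rows + (dc - r) * (rows - 1) + row (sectors
 * 9-10). Each column's sectors thus stay contiguous in the zio buffer,
 * preserving the column-by-column layout described in "Time Dependent
 * Geometry" above.
 */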
struct pqr_struct {
	uint64_t *p;
	uint64_t *q;
	uint64_t *r;
};

static int
vdev_raidz_p_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	int cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && !pqr->q && !pqr->r);

	for (int i = 0; i < cnt; i++, src++, pqr->p++)
		*pqr->p ^= *src;

	return (0);
}

static int
vdev_raidz_pq_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && !pqr->r);

	for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
	}

	return (0);
}

static int
vdev_raidz_pqr_func(void *buf, size_t size, void *private)
{
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;
	int cnt = size / sizeof (src[0]);

	ASSERT(pqr->p && pqr->q && pqr->r);

	for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
		*pqr->p ^= *src;
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
		*pqr->r ^= *src;
	}

	return (0);
}
static void
vdev_raidz_generate_parity_p(raidz_row_t *rr)
{
	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);

	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		abd_t *src = rr->rr_col[c].rc_abd;

		if (c == rr->rr_firstdatacol) {
			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
		} else {
			struct pqr_struct pqr = { p, NULL, NULL };
			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
			    vdev_raidz_p_func, &pqr);
		}
	}
}
static void
vdev_raidz_generate_parity_pq(raidz_row_t *rr)
{
	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);

	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		abd_t *src = rr->rr_col[c].rc_abd;

		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);

		if (c == rr->rr_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
			(void) memcpy(q, p, rr->rr_col[c].rc_size);

			for (uint64_t i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
			}
		} else {
			struct pqr_struct pqr = { p, q, NULL };

			ASSERT(ccnt <= pcnt);
			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
			    vdev_raidz_pq_func, &pqr);

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			uint64_t mask;
			for (uint64_t i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
			}
		}
	}
}
static void
vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
{
	uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
	uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
	uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
	    rr->rr_col[VDEV_RAIDZ_Q].rc_size);
	ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
	    rr->rr_col[VDEV_RAIDZ_R].rc_size);

	for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		abd_t *src = rr->rr_col[c].rc_abd;

		uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);

		if (c == rr->rr_firstdatacol) {
			ASSERT(ccnt == pcnt || ccnt == 0);
			abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
			(void) memcpy(q, p, rr->rr_col[c].rc_size);
			(void) memcpy(r, p, rr->rr_col[c].rc_size);

			for (uint64_t i = ccnt; i < pcnt; i++) {
				p[i] = 0;
				q[i] = 0;
				r[i] = 0;
			}
		} else {
			struct pqr_struct pqr = { p, q, r };

			ASSERT(ccnt <= pcnt);
			(void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
			    vdev_raidz_pqr_func, &pqr);

			/*
			 * Treat short columns as though they are full of 0s.
			 * Note that there's therefore nothing needed for P.
			 */
			uint64_t mask;
			for (uint64_t i = ccnt; i < pcnt; i++) {
				VDEV_RAIDZ_64MUL_2(q[i], mask);
				VDEV_RAIDZ_64MUL_4(r[i], mask);
			}
		}
	}
}
/*
 * Generate RAID parity in the first virtual columns according to the number
 * of parity columns available.
 */
static void
vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
{
	if (rr->rr_cols == 0) {
		/*
		 * We are handling this block one row at a time (because
		 * this block has a different logical vs physical width,
		 * due to RAIDZ expansion), and this is a pad-only row,
		 * which has no parity.
		 */
		return;
	}

	/* Generate using the new math implementation */
	if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
		return;

	switch (rr->rr_firstdatacol) {
	case 1:
		vdev_raidz_generate_parity_p(rr);
		break;
	case 2:
		vdev_raidz_generate_parity_pq(rr);
		break;
	case 3:
		vdev_raidz_generate_parity_pqr(rr);
		break;
	default:
		cmn_err(CE_PANIC, "invalid RAID-Z configuration");
	}
}
void
vdev_raidz_generate_parity(raidz_map_t *rm)
{
	for (int i = 0; i < rm->rm_nrows; i++) {
		raidz_row_t *rr = rm->rm_row[i];
		vdev_raidz_generate_parity_row(rm, rr);
	}
}
static int
vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
{
	(void) private;
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	int cnt = size / sizeof (src[0]);

	for (int i = 0; i < cnt; i++) {
		dst[i] ^= src[i];
	}

	return (0);
}
static int
vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
    void *private)
{
	(void) private;
	uint64_t *dst = dbuf;
	uint64_t *src = sbuf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, src++) {
		VDEV_RAIDZ_64MUL_2(*dst, mask);
		*dst ^= *src;
	}

	return (0);
}
static int
vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
{
	(void) private;
	uint64_t *dst = buf;
	uint64_t mask;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++) {
		/* same operation as vdev_raidz_reconst_q_pre_func() on dst */
		VDEV_RAIDZ_64MUL_2(*dst, mask);
	}

	return (0);
}
struct reconst_q_struct {
	uint64_t *q;
	int exp;
};

static int
vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
{
	struct reconst_q_struct *rq = private;
	uint64_t *dst = buf;
	int cnt = size / sizeof (dst[0]);

	for (int i = 0; i < cnt; i++, dst++, rq->q++) {
		int j;
		uint8_t *b;

		*dst ^= *rq->q;
		for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
			*b = vdev_raidz_exp2(*b, rq->exp);
		}
	}

	return (0);
}
struct reconst_pq_struct {
	uint8_t *p;
	uint8_t *q;
	uint8_t *pxy;
	uint8_t *qxy;
	int aexp;
	int bexp;
};

static int
vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;
	uint8_t *yd = ybuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
		*yd = *rpq->p ^ *rpq->pxy ^ *xd;
	}

	return (0);
}
static int
vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
{
	struct reconst_pq_struct *rpq = private;
	uint8_t *xd = xbuf;

	for (int i = 0; i < size;
	    i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
		/* same operation as vdev_raidz_reconst_pq_func() on xd */
		*xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
		    vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
	}

	return (0);
}
*rr
, int *tgts
, int ntgts
)
1372 if (zfs_flags
& ZFS_DEBUG_RAIDZ_RECONSTRUCT
)
1373 zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr
, x
);
1375 ASSERT3U(ntgts
, ==, 1);
1376 ASSERT3U(x
, >=, rr
->rr_firstdatacol
);
1377 ASSERT3U(x
, <, rr
->rr_cols
);
1379 ASSERT3U(rr
->rr_col
[x
].rc_size
, <=, rr
->rr_col
[VDEV_RAIDZ_P
].rc_size
);
1381 src
= rr
->rr_col
[VDEV_RAIDZ_P
].rc_abd
;
1382 dst
= rr
->rr_col
[x
].rc_abd
;
1384 abd_copy_from_buf(dst
, abd_to_buf(src
), rr
->rr_col
[x
].rc_size
);
1386 for (int c
= rr
->rr_firstdatacol
; c
< rr
->rr_cols
; c
++) {
1387 uint64_t size
= MIN(rr
->rr_col
[x
].rc_size
,
1388 rr
->rr_col
[c
].rc_size
);
1390 src
= rr
->rr_col
[c
].rc_abd
;
1395 (void) abd_iterate_func2(dst
, src
, 0, 0, size
,
1396 vdev_raidz_reconst_p_func
, NULL
);
static void
vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
{
	int x = tgts[0];
	int c, exp;
	abd_t *dst, *src;

	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
		zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x);

	ASSERT(ntgts == 1);

	ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);

	for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
		    rr->rr_col[c].rc_size);

		src = rr->rr_col[c].rc_abd;
		dst = rr->rr_col[x].rc_abd;

		if (c == rr->rr_firstdatacol) {
			abd_copy(dst, src, size);
			if (rr->rr_col[x].rc_size > size) {
				abd_zero_off(dst, size,
				    rr->rr_col[x].rc_size - size);
			}
		} else {
			ASSERT3U(size, <=, rr->rr_col[x].rc_size);
			(void) abd_iterate_func2(dst, src, 0, 0, size,
			    vdev_raidz_reconst_q_pre_func, NULL);
			(void) abd_iterate_func(dst,
			    size, rr->rr_col[x].rc_size - size,
			    vdev_raidz_reconst_q_pre_tail_func, NULL);
		}
	}

	src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
	dst = rr->rr_col[x].rc_abd;
	exp = 255 - (rr->rr_cols - 1 - x);

	struct reconst_q_struct rq = { abd_to_buf(src), exp };
	(void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
	    vdev_raidz_reconst_q_post_func, &rq);
}
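
/*
 * Note on the exponent above (added for clarity): column x contributes
 * 2^(rr_cols - 1 - x) * D_x to Q, so once the partial Q has been folded in,
 * D_x is recovered by multiplying by the inverse factor
 * 2^(255 - (rr_cols - 1 - x)) -- exactly the exp handed to
 * vdev_raidz_reconst_q_post_func().
 */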
static void
vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
{
	uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
	abd_t *pdata, *qdata;
	uint64_t xsize, ysize;
	int x = tgts[0];
	int y = tgts[1];
	abd_t *xd, *yd;

	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
		zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y);

	ASSERT(ntgts == 2);
	ASSERT(x < y);
	ASSERT(x >= rr->rr_firstdatacol);
	ASSERT(y < rr->rr_cols);

	ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);

	/*
	 * Move the parity data aside -- we're going to compute parity as
	 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
	 * reuse the parity generation mechanism without trashing the actual
	 * parity so we make those columns appear to be full of zeros by
	 * setting their lengths to zero.
	 */
	pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
	qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
	xsize = rr->rr_col[x].rc_size;
	ysize = rr->rr_col[y].rc_size;

	rr->rr_col[VDEV_RAIDZ_P].rc_abd =
	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
	rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
	    abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
	rr->rr_col[x].rc_size = 0;
	rr->rr_col[y].rc_size = 0;

	vdev_raidz_generate_parity_pq(rr);

	rr->rr_col[x].rc_size = xsize;
	rr->rr_col[y].rc_size = ysize;

	p = abd_to_buf(pdata);
	q = abd_to_buf(qdata);
	pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
	xd = rr->rr_col[x].rc_abd;
	yd = rr->rr_col[y].rc_abd;

	/*
	 * Reconstruct x and y as follows:
	 *
	 *	Pxy = P + D_x + D_y
	 *	Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
	 *
	 * We can then solve for D_x:
	 *	D_x = A * (P + Pxy) + B * (Q + Qxy)
	 * where
	 *	A = 2^(x - y) * (2^(x - y) + 1)^-1
	 *	B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
	 *
	 * With D_x in hand, we can easily solve for D_y:
	 *	D_y = P + Pxy + D_x
	 */
	a = vdev_raidz_pow2[255 + x - y];
	b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
	tmp = 255 - vdev_raidz_log2[a ^ 1];

	aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
	bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];

	ASSERT3U(xsize, >=, ysize);
	struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };

	(void) abd_iterate_func2(xd, yd, 0, 0, ysize,
	    vdev_raidz_reconst_pq_func, &rpq);
	(void) abd_iterate_func(xd, ysize, xsize - ysize,
	    vdev_raidz_reconst_pq_tail_func, &rpq);

	abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
	abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);

	/*
	 * Restore the saved parity data.
	 */
	rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
	rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
}
/*
 * In the general case of reconstruction, we must solve the system of linear
 * equations defined by the coefficients used to generate parity as well as
 * the contents of the data and parity disks. This can be expressed with
 * vectors for the original data (D) and the actual data (d) and parity (p)
 * and a matrix composed of the identity matrix (I) and a dispersal matrix
 * (V):
 *
 *	    __   __                    __     __
 *	    |     |                    |  p_0  |
 *	    |  V  |    __     __       |   :   |
 *	    |     |    |  D_0  |       | p_m-1 |
 *	    | --- |  x |   :   |   =   |  d_0  |
 *	    |     |    | D_n-1 |       |   :   |
 *	    |  I  |    ~~     ~~       |   :   |
 *	    |     |                    | d_n-1 |
 *	    ~~   ~~                    ~~     ~~
 *
 * I is simply a square identity matrix of size n, and V is a Vandermonde
 * matrix defined by the coefficients we chose for the various parity columns
 * (1, 2, 4). Note that these values were chosen both for simplicity and for
 * speedy computation, as well as for linear separability.
 *
 *	__                   __
 *	|   1    ..  1  1  1  |    __   __      __     __
 *	| 2^n-1  ..  4  2  1  |    | D_0 |      |  p_0  |
 *	| 4^n-1  .. 16  4  1  |    | D_1 |      |   :   |
 *	|   1    ..  0  0  0  |    | D_2 |      | p_m-1 |
 *	|   0    ..  0  0  0  |  x |  :  |  =   |  d_0  |
 *	|   :    ..  :  :  :  |    |  :  |      |  d_1  |
 *	|   0    ..  1  0  0  |    |  :  |      |  d_2  |
 *	|   0    ..  0  1  0  |    | D_n-1|     |   :   |
 *	|   0    ..  0  0  1  |    ~~   ~~      | d_n-1 |
 *	~~                   ~~                 ~~     ~~
 *
 * Note that I, V, d, and p are known. To compute D, we must invert the
 * matrix and use the known data and parity values to reconstruct the unknown
 * data values. We begin by removing the rows in V|I and d|p that correspond
 * to failed or missing columns; we then make V|I square (n x n) and d|p
 * sized n by removing rows corresponding to unused parity from the bottom up
 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
 * using Gauss-Jordan elimination. In the example below we use m=3 parity
 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
 *
 *	          __                               __
 *	          |  1   1   1   1   1   1   1   1  |
 *	          | 128  64  32  16   8   4   2   1 | <-----+-+-- missing disks
 *	          |  19 205 116  29  64  16   4   1 |      / /
 *	          |  1   0   0   0   0   0   0   0  |     / /
 *	          |  0   1   0   0   0   0   0   0  | <--' /
 *	(V|I)  =  |  0   0   1   0   0   0   0   0  | <---'
 *	          |  0   0   0   1   0   0   0   0  |
 *	          |  0   0   0   0   1   0   0   0  |
 *	          |  0   0   0   0   0   1   0   0  |
 *	          |  0   0   0   0   0   0   1   0  |
 *	          |  0   0   0   0   0   0   0   1  |
 *	          ~~                               ~~
 *	          __                               __
 *	          |  1   1   1   1   1   1   1   1  |
 *	          |  19 205 116  29  64  16   4   1 |
 *	          |  1   0   0   0   0   0   0   0  |
 *	(V|I)' =  |  0   0   0   1   0   0   0   0  |
 *	          |  0   0   0   0   1   0   0   0  |
 *	          |  0   0   0   0   0   1   0   0  |
 *	          |  0   0   0   0   0   0   1   0  |
 *	          |  0   0   0   0   0   0   0   1  |
 *	          ~~                               ~~
 *
 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
 * matrix is not singular.
 *
 *	__                                                      __
 *	|  1   1   1   1   1   1   1   1     1  0  0  0  0  0  0  0 |
 *	|  19 205 116  29  64  16   4   1    0  1  0  0  0  0  0  0 |
 *	|  1   0   0   0   0   0   0   0     0  0  1  0  0  0  0  0 |
 *	|  0   0   0   1   0   0   0   0     0  0  0  1  0  0  0  0 |
 *	|  0   0   0   0   1   0   0   0     0  0  0  0  1  0  0  0 |
 *	|  0   0   0   0   0   1   0   0     0  0  0  0  0  1  0  0 |
 *	|  0   0   0   0   0   0   1   0     0  0  0  0  0  0  1  0 |
 *	|  0   0   0   0   0   0   0   1     0  0  0  0  0  0  0  1 |
 *	~~                                                      ~~
 *	__                                                      __
 *	|  1   0   0   0   0   0   0   0     0  0  1  0  0  0  0  0 |
 *	|  1   1   1   1   1   1   1   1     1  0  0  0  0  0  0  0 |
 *	|  19 205 116  29  64  16   4   1    0  1  0  0  0  0  0  0 |
 *	|  0   0   0   1   0   0   0   0     0  0  0  1  0  0  0  0 |
 *	|  0   0   0   0   1   0   0   0     0  0  0  0  1  0  0  0 |
 *	|  0   0   0   0   0   1   0   0     0  0  0  0  0  1  0  0 |
 *	|  0   0   0   0   0   0   1   0     0  0  0  0  0  0  1  0 |
 *	|  0   0   0   0   0   0   0   1     0  0  0  0  0  0  0  1 |
 *	~~                                                      ~~
 *	__                                                        __
 *	|  1   0   0   0   0   0   0   0     0  0   1  0   0  0  0  0 |
 *	|  0   1   1   0   0   0   0   0     1  0   1  1   1  1  1  1 |
 *	|  0  205 116  0   0   0   0   0     0  1  19 29  64 16  4  1 |
 *	|  0   0   0   1   0   0   0   0     0  0   0  1   0  0  0  0 |
 *	|  0   0   0   0   1   0   0   0     0  0   0  0   1  0  0  0 |
 *	|  0   0   0   0   0   1   0   0     0  0   0  0   0  1  0  0 |
 *	|  0   0   0   0   0   0   1   0     0  0   0  0   0  0  1  0 |
 *	|  0   0   0   0   0   0   0   1     0  0   0  0   0  0  0  1 |
 *	~~                                                        ~~
 *	__                                                            __
 *	|  1   0   0   0   0   0   0   0      0   0   1   0   0   0   0   0 |
 *	|  0   1   1   0   0   0   0   0      1   0   1   1   1   1   1   1 |
 *	|  0   0  185  0   0   0   0   0    205   1 222 208 141 221 201 204 |
 *	|  0   0   0   1   0   0   0   0      0   0   0   1   0   0   0   0 |
 *	|  0   0   0   0   1   0   0   0      0   0   0   0   1   0   0   0 |
 *	|  0   0   0   0   0   1   0   0      0   0   0   0   0   1   0   0 |
 *	|  0   0   0   0   0   0   1   0      0   0   0   0   0   0   1   0 |
 *	|  0   0   0   0   0   0   0   1      0   0   0   0   0   0   0   1 |
 *	~~                                                            ~~
 *	__                                                            __
 *	|  1   0   0   0   0   0   0   0      0   0   1   0   0   0   0   0 |
 *	|  0   1   1   0   0   0   0   0      1   0   1   1   1   1   1   1 |
 *	|  0   0   1   0   0   0   0   0    166 100   4  40 158 168 216 209 |
 *	|  0   0   0   1   0   0   0   0      0   0   0   1   0   0   0   0 |
 *	|  0   0   0   0   1   0   0   0      0   0   0   0   1   0   0   0 |
 *	|  0   0   0   0   0   1   0   0      0   0   0   0   0   1   0   0 |
 *	|  0   0   0   0   0   0   1   0      0   0   0   0   0   0   1   0 |
 *	|  0   0   0   0   0   0   0   1      0   0   0   0   0   0   0   1 |
 *	~~                                                            ~~
 *	__                                                            __
 *	|  1   0   0   0   0   0   0   0      0   0   1   0   0   0   0   0 |
 *	|  0   1   0   0   0   0   0   0    167 100   5  41 159 169 217 208 |
 *	|  0   0   1   0   0   0   0   0    166 100   4  40 158 168 216 209 |
 *	|  0   0   0   1   0   0   0   0      0   0   0   1   0   0   0   0 |
 *	|  0   0   0   0   1   0   0   0      0   0   0   0   1   0   0   0 |
 *	|  0   0   0   0   0   1   0   0      0   0   0   0   0   1   0   0 |
 *	|  0   0   0   0   0   0   1   0      0   0   0   0   0   0   1   0 |
 *	|  0   0   0   0   0   0   0   1      0   0   0   0   0   0   0   1 |
 *	~~                                                            ~~
 *
 *	              __                               __
 *	              |   0   0   1   0   0   0   0   0 |
 *	              | 167 100   5  41 159 169 217 208 |
 *	              | 166 100   4  40 158 168 216 209 |
 *	(V|I)'^-1  =  |   0   0   0   1   0   0   0   0 |
 *	              |   0   0   0   0   1   0   0   0 |
 *	              |   0   0   0   0   0   1   0   0 |
 *	              |   0   0   0   0   0   0   1   0 |
 *	              |   0   0   0   0   0   0   0   1 |
 *	              ~~                               ~~
 *
 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
 * of the missing data.
 *
 * As is apparent from the example above, the only non-trivial rows in the
 * inverse matrix correspond to the data disks that we're trying to
 * reconstruct. Indeed, those are the only rows we need as the others would
 * only be useful for reconstructing data known or assumed to be valid. For
 * that reason, we only build the coefficients in the rows that correspond to
 * targeted columns.
 */
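
/*
 * Example of the row coefficients built below (for illustration): for the
 * Q parity row (map value 1) with n = 4 data columns,
 * vdev_raidz_matrix_init() walks the exponents 3, 2, 1, 0 and produces the
 * row { 8, 4, 2, 1 } -- the same 2^(n-1-j) coefficients used when Q was
 * generated.
 */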
static void
vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
    uint8_t **rows)
{
	int i, j;
	int pow;

	ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);

	/*
	 * Fill in the missing rows of interest.
	 */
	for (i = 0; i < nmap; i++) {
		ASSERT3S(0, <=, map[i]);
		ASSERT3S(map[i], <=, 2);

		pow = map[i] * n;
		if (pow > 255)
			pow -= 255;
		ASSERT(pow <= 255);

		for (j = 0; j < n; j++) {
			pow -= map[i];
			if (pow < 0)
				pow += 255;
			rows[i][j] = vdev_raidz_pow2[pow];
		}
	}
}
static void
vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
    uint8_t **rows, uint8_t **invrows, const uint8_t *used)
{
	int i, j, ii, jj;
	uint8_t log;

	/*
	 * Assert that the first nmissing entries from the array of used
	 * columns correspond to parity columns and that subsequent entries
	 * correspond to data columns.
	 */
	for (i = 0; i < nmissing; i++) {
		ASSERT3S(used[i], <, rr->rr_firstdatacol);
	}
	for (; i < n; i++) {
		ASSERT3S(used[i], >=, rr->rr_firstdatacol);
	}

	/*
	 * First initialize the storage where we'll compute the inverse rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			invrows[i][j] = (i == j) ? 1 : 0;
		}
	}

	/*
	 * Subtract all trivial rows from the rows of consequence.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = nmissing; j < n; j++) {
			ASSERT3U(used[j], >=, rr->rr_firstdatacol);
			jj = used[j] - rr->rr_firstdatacol;
			ASSERT3S(jj, <, n);
			invrows[i][j] = rows[i][jj];
			rows[i][jj] = 0;
		}
	}

	/*
	 * For each of the rows of interest, we must normalize it and subtract
	 * a multiple of it from the other rows.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < missing[i]; j++) {
			ASSERT0(rows[i][j]);
		}
		ASSERT3U(rows[i][missing[i]], !=, 0);

		/*
		 * Compute the inverse of the first element and multiply each
		 * element in the row by that value.
		 */
		log = 255 - vdev_raidz_log2[rows[i][missing[i]]];

		for (j = 0; j < n; j++) {
			rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
			invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
		}

		for (ii = 0; ii < nmissing; ii++) {
			if (i == ii)
				continue;

			ASSERT3U(rows[ii][missing[i]], !=, 0);

			log = vdev_raidz_log2[rows[ii][missing[i]]];

			for (j = 0; j < n; j++) {
				rows[ii][j] ^=
				    vdev_raidz_exp2(rows[i][j], log);
				invrows[ii][j] ^=
				    vdev_raidz_exp2(invrows[i][j], log);
			}
		}
	}

	/*
	 * Verify that the data that is left in the rows are properly part of
	 * an identity matrix.
	 */
	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			if (j == missing[i]) {
				ASSERT3U(rows[i][j], ==, 1);
			} else {
				ASSERT0(rows[i][j]);
			}
		}
	}
}
static void
vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
    int *missing, uint8_t **invrows, const uint8_t *used)
{
	int i, j, x, cc, c;
	uint8_t *src;
	uint64_t ccount;
	uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
	uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
	uint8_t log = 0;
	uint8_t val;
	int ll;
	uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;

	psize = sizeof (invlog[0][0]) * n * nmissing;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing; i++) {
		invlog[i] = pp;
		pp += n;
	}

	for (i = 0; i < nmissing; i++) {
		for (j = 0; j < n; j++) {
			ASSERT3U(invrows[i][j], !=, 0);
			invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
		}
	}

	for (i = 0; i < n; i++) {
		c = used[i];
		ASSERT3U(c, <, rr->rr_cols);

		ccount = rr->rr_col[c].rc_size;
		ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
		if (ccount == 0)
			continue;
		src = abd_to_buf(rr->rr_col[c].rc_abd);
		for (j = 0; j < nmissing; j++) {
			cc = missing[j] + rr->rr_firstdatacol;
			ASSERT3U(cc, >=, rr->rr_firstdatacol);
			ASSERT3U(cc, <, rr->rr_cols);
			ASSERT3U(cc, !=, c);

			dcount[j] = rr->rr_col[cc].rc_size;
			if (dcount[j] != 0)
				dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
		}

		for (x = 0; x < ccount; x++, src++) {
			if (*src != 0)
				log = vdev_raidz_log2[*src];

			for (cc = 0; cc < nmissing; cc++) {
				if (x >= dcount[cc])
					continue;

				if (*src == 0) {
					val = 0;
				} else {
					if ((ll = log + invlog[cc][i]) >= 255)
						ll -= 255;
					val = vdev_raidz_pow2[ll];
				}

				if (i == 0)
					dst[cc][x] = val;
				else
					dst[cc][x] ^= val;
			}
		}
	}

	kmem_free(p, psize);
}
static void
vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
{
	int i, c, t, tt;
	int n;
	int nmissing_rows;
	int missing_rows[VDEV_RAIDZ_MAXPARITY];
	int parity_map[VDEV_RAIDZ_MAXPARITY];
	uint8_t *p, *pp;
	size_t psize;
	uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
	uint8_t *used;

	abd_t **bufs = NULL;

	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
		zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts);
	/*
	 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
	 * temporary linear ABDs if any non-linear ABDs are found.
	 */
	for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
		ASSERT(rr->rr_col[i].rc_abd != NULL);
		if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
			bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
			    KM_PUSHPAGE);

			for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
				raidz_col_t *col = &rr->rr_col[c];

				bufs[c] = col->rc_abd;
				if (bufs[c] != NULL) {
					col->rc_abd = abd_alloc_linear(
					    col->rc_size, B_TRUE);
					abd_copy(col->rc_abd, bufs[c],
					    col->rc_size);
				}
			}

			break;
		}
	}

	n = rr->rr_cols - rr->rr_firstdatacol;

	/*
	 * Figure out which data columns are missing.
	 */
	nmissing_rows = 0;
	for (t = 0; t < ntgts; t++) {
		if (tgts[t] >= rr->rr_firstdatacol) {
			missing_rows[nmissing_rows++] =
			    tgts[t] - rr->rr_firstdatacol;
		}
	}

	/*
	 * Figure out which parity columns to use to help generate the missing
	 * data columns.
	 */
	for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
		ASSERT(tt < ntgts);
		ASSERT(c < rr->rr_firstdatacol);

		/*
		 * Skip any targeted parity columns.
		 */
		if (c == tgts[tt]) {
			tt++;
			continue;
		}

		parity_map[i] = c;
		i++;
	}

	psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
	    nmissing_rows * n + sizeof (used[0]) * n;
	p = kmem_alloc(psize, KM_SLEEP);

	for (pp = p, i = 0; i < nmissing_rows; i++) {
		rows[i] = pp;
		pp += n;
		invrows[i] = pp;
		pp += n;
	}
	used = pp;

	for (i = 0; i < nmissing_rows; i++) {
		used[i] = parity_map[i];
	}

	for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
		if (tt < nmissing_rows &&
		    c == missing_rows[tt] + rr->rr_firstdatacol) {
			tt++;
			continue;
		}

		ASSERT3S(i, <, n);
		used[i] = c;
		i++;
	}

	/*
	 * Initialize the interesting rows of the matrix.
	 */
	vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);

	/*
	 * Invert the matrix.
	 */
	vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
	    invrows, used);

	/*
	 * Reconstruct the missing data using the generated matrix.
	 */
	vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
	    invrows, used);

	kmem_free(p, psize);

	/*
	 * copy back from temporary linear abds and free them
	 */
	if (bufs) {
		for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
			raidz_col_t *col = &rr->rr_col[c];

			if (bufs[c] != NULL) {
				abd_copy(bufs[c], col->rc_abd, col->rc_size);
				abd_free(col->rc_abd);
			}
			col->rc_abd = bufs[c];
		}
		kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
	}
}
static void
vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
    const int *t, int nt)
{
	int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
	int ntgts;
	int i, c, ret;
	int nbadparity, nbaddata;
	int parity_valid[VDEV_RAIDZ_MAXPARITY];

	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
		zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)",
		    rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata,
		    (int)rr->rr_missingparity);
	}

	nbadparity = rr->rr_firstdatacol;
	nbaddata = rr->rr_cols - nbadparity;
	ntgts = 0;
	for (i = 0, c = 0; c < rr->rr_cols; c++) {
		if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
			zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u "
			    "offset=%llx error=%u)",
			    rr, c, (int)rr->rr_col[c].rc_devidx,
			    (long long)rr->rr_col[c].rc_offset,
			    (int)rr->rr_col[c].rc_error);
		}
		if (c < rr->rr_firstdatacol)
			parity_valid[c] = B_FALSE;

		if (i < nt && c == t[i]) {
			tgts[ntgts++] = c;
			i++;
		} else if (rr->rr_col[c].rc_error != 0) {
			tgts[ntgts++] = c;
		} else if (c >= rr->rr_firstdatacol) {
			nbaddata--;
		} else {
			parity_valid[c] = B_TRUE;
			nbadparity--;
		}
	}

	ASSERT(ntgts >= nt);
	ASSERT(nbaddata >= 0);
	ASSERT(nbaddata + nbadparity == ntgts);

	dt = &tgts[nbadparity];

	/* Reconstruct using the new math implementation */
	ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
	if (ret != RAIDZ_ORIGINAL_IMPL)
		return;

	/*
	 * See if we can use any of our optimized reconstruction routines.
	 */
	switch (nbaddata) {
	case 1:
		if (parity_valid[VDEV_RAIDZ_P]) {
			vdev_raidz_reconstruct_p(rr, dt, 1);
			return;
		}

		ASSERT(rr->rr_firstdatacol > 1);

		if (parity_valid[VDEV_RAIDZ_Q]) {
			vdev_raidz_reconstruct_q(rr, dt, 1);
			return;
		}

		ASSERT(rr->rr_firstdatacol > 2);
		break;

	case 2:
		ASSERT(rr->rr_firstdatacol > 1);

		if (parity_valid[VDEV_RAIDZ_P] &&
		    parity_valid[VDEV_RAIDZ_Q]) {
			vdev_raidz_reconstruct_pq(rr, dt, 2);
			return;
		}

		ASSERT(rr->rr_firstdatacol > 2);
		break;
	}

	vdev_raidz_reconstruct_general(rr, tgts, ntgts);
}
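/*
 * Note on the target ordering above: tgts[] is filled in column order, so
 * any targeted parity columns (c < rr_firstdatacol) come first; thus
 * "dt = &tgts[nbadparity]" points at just the data-column targets, which is
 * what vdev_raidz_math_reconstruct() and the optimized P/Q/PQ routines
 * expect.
 */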
static int
vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
    uint64_t *logical_ashift, uint64_t *physical_ashift)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	uint64_t nparity = vdrz->vd_nparity;
	int c;
	int lasterror = 0;
	int numerrors = 0;

	ASSERT(nparity > 0);

	if (nparity > VDEV_RAIDZ_MAXPARITY ||
	    vd->vdev_children < nparity + 1) {
		vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
		return (SET_ERROR(EINVAL));
	}

	vdev_open_children(vd);

	for (c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0) {
			lasterror = cvd->vdev_open_error;
			numerrors++;
			continue;
		}

		*asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
		*max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
		*logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
	}
	for (c = 0; c < vd->vdev_children; c++) {
		vdev_t *cvd = vd->vdev_child[c];

		if (cvd->vdev_open_error != 0)
			continue;
		*physical_ashift = vdev_best_ashift(*logical_ashift,
		    *physical_ashift, cvd->vdev_physical_ashift);
	}
	if (vd->vdev_rz_expanding) {
		*asize *= vd->vdev_children - 1;
		*max_asize *= vd->vdev_children - 1;

		vd->vdev_min_asize = *asize;
	} else {
		*asize *= vd->vdev_children;
		*max_asize *= vd->vdev_children;
	}

	if (numerrors > nparity) {
		vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
		return (lasterror);
	}

	return (0);
}

static void
vdev_raidz_close(vdev_t *vd)
{
	for (int c = 0; c < vd->vdev_children; c++) {
		if (vd->vdev_child[c] != NULL)
			vdev_close(vd->vdev_child[c]);
	}
}
/*
 * Return the logical width to use, given the txg in which the allocation
 * happened.  Note that BP_GET_BIRTH() is usually the txg in which the
 * BP was allocated.  Remapped BP's (that were relocated due to device
 * removal, see remap_blkptr_cb()), will have a more recent physical birth
 * which reflects when the BP was relocated, but we can ignore these because
 * they can't be on RAIDZ (device removal doesn't support RAIDZ).
 */
static uint64_t
vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
{
	reflow_node_t lookup = {
		.re_txg = txg,
	};
	avl_index_t where;

	uint64_t width;
	mutex_enter(&vdrz->vd_expand_lock);
	reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where);
	if (re != NULL) {
		width = re->re_logical_width;
	} else {
		re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE);
		if (re != NULL)
			width = re->re_logical_width;
		else
			width = vdrz->vd_original_width;
	}
	mutex_exit(&vdrz->vd_expand_lock);
	return (width);
}
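/*
 * Example (illustrative): if a vdev was expanded from 4 to 5 children and
 * the new width was recorded with re_txg=1000, a BP born at txg 900 finds
 * no node at or before it, so we fall back to vd_original_width=4; a BP
 * born at txg 1500 finds the node via avl_nearest(AVL_BEFORE) and uses
 * width 5.
 */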
/*
 * Note: If the RAIDZ vdev has been expanded, older BP's may have allocated
 * more space due to the lower data-to-parity ratio.  In this case it's
 * important to pass in the correct txg.  Note that vdev_gang_header_asize()
 * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE,
 * regardless of txg.  This is assured because for a single data sector, we
 * allocate P+1 sectors regardless of width ("cols", which is at least P+1).
 */
static uint64_t
vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	uint64_t asize;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	uint64_t cols = vdrz->vd_original_width;
	uint64_t nparity = vdrz->vd_nparity;

	cols = vdev_raidz_get_logical_width(vdrz, txg);

	asize = ((psize - 1) >> ashift) + 1;
	asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
	asize = roundup(asize, nparity + 1) << ashift;

#ifdef ZFS_DEBUG
	uint64_t asize_new = ((psize - 1) >> ashift) + 1;
	uint64_t ncols_new = vdrz->vd_physical_width;
	asize_new += nparity * ((asize_new + ncols_new - nparity - 1) /
	    (ncols_new - nparity));
	asize_new = roundup(asize_new, nparity + 1) << ashift;
	VERIFY3U(asize_new, <=, asize);
#endif

	return (asize);
}
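/*
 * Worked example for vdev_raidz_asize() above (illustrative): with ashift=9
 * (512-byte sectors), cols=5 and nparity=1 (a 5-wide RAIDZ1), psize=3584
 * bytes is 7 data sectors.  Parity adds ceil(7/4)=2 sectors, for 9 total,
 * which is then rounded up to a multiple of (nparity+1)=2, giving 10
 * sectors = 5120 bytes allocated.
 */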
/*
 * The allocatable space for a raidz vdev is N * sizeof(smallest child)
 * so each child must provide at least 1/Nth of its asize.
 */
static uint64_t
vdev_raidz_min_asize(vdev_t *vd)
{
	return ((vd->vdev_min_asize + vd->vdev_children - 1) /
	    vd->vdev_children);
}
void
vdev_raidz_child_done(zio_t *zio)
{
	raidz_col_t *rc = zio->io_private;

	ASSERT3P(rc->rc_abd, !=, NULL);
	rc->rc_error = zio->io_error;
	rc->rc_tried = 1;
	rc->rc_skipped = 0;
}

static void
vdev_raidz_shadow_child_done(zio_t *zio)
{
	raidz_col_t *rc = zio->io_private;

	rc->rc_shadow_error = zio->io_error;
}
static void
vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
{
	(void) rm;
#ifdef ZFS_DEBUG
	range_seg64_t logical_rs, physical_rs, remain_rs;
	logical_rs.rs_start = rr->rr_offset;
	logical_rs.rs_end = logical_rs.rs_start +
	    vdev_raidz_asize(zio->io_vd, rr->rr_size,
	    BP_GET_BIRTH(zio->io_bp));

	raidz_col_t *rc = &rr->rr_col[col];
	vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];

	vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
	ASSERT(vdev_xlate_is_empty(&remain_rs));
	if (vdev_xlate_is_empty(&physical_rs)) {
		/*
		 * If we are in the middle of expansion, the
		 * physical->logical mapping is changing so vdev_xlate()
		 * can't give us a reliable answer.
		 */
		return;
	}
	ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
	ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
	/*
	 * It would be nice to assert that rs_end is equal
	 * to rc_offset + rc_size but there might be an
	 * optional I/O at the end that is not accounted in
	 * rc_size.
	 */
	if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
		    rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift));
	} else {
		ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
	}
#endif
}
static void
vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr)
{
	vdev_t *vd = zio->io_vd;
	raidz_map_t *rm = zio->io_vsd;

	vdev_raidz_generate_parity_row(rm, rr);

	for (int c = 0; c < rr->rr_scols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];

		/* Verify physical to logical translation */
		vdev_raidz_io_verify(zio, rm, rr, c);

		if (rc->rc_size == 0)
			continue;

		ASSERT3U(rc->rc_offset + rc->rc_size, <,
		    cvd->vdev_psize - VDEV_LABEL_END_SIZE);

		ASSERT3P(rc->rc_abd, !=, NULL);
		zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
		    rc->rc_offset, rc->rc_abd,
		    abd_get_size(rc->rc_abd), zio->io_type,
		    zio->io_priority, 0, vdev_raidz_child_done, rc));

		if (rc->rc_shadow_devidx != INT_MAX) {
			vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx];

			ASSERT3U(
			    rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <,
			    cvd2->vdev_psize - VDEV_LABEL_END_SIZE);

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd2,
			    rc->rc_shadow_offset, rc->rc_abd,
			    abd_get_size(rc->rc_abd),
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_shadow_child_done, rc));
		}
	}
}
/*
 * Generate optional I/Os for skip sectors to improve aggregation contiguity.
 * This only works for vdev_raidz_map_alloc() (not _expanded()).
 */
static void
raidz_start_skip_writes(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	raidz_map_t *rm = zio->io_vsd;
	ASSERT3U(rm->rm_nrows, ==, 1);
	raidz_row_t *rr = rm->rm_row[0];
	for (int c = 0; c < rr->rr_scols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
		if (rc->rc_size != 0)
			continue;
		ASSERT3P(rc->rc_abd, ==, NULL);

		ASSERT3U(rc->rc_offset, <,
		    cvd->vdev_psize - VDEV_LABEL_END_SIZE);

		zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset,
		    NULL, 1ULL << ashift, zio->io_type, zio->io_priority,
		    ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
	}
}
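/*
 * Note: the child i/os above carry ZIO_FLAG_NODATA (NULL abd) and
 * ZIO_FLAG_OPTIONAL, so the vdev queue is free to drop them; they exist
 * only so the aggregator can treat a skip sector as filler between two
 * adjacent data writes.
 */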
static void
vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
{
	vdev_t *vd = zio->io_vd;

	/*
	 * Iterate over the columns in reverse order so that we hit the parity
	 * last -- any errors along the way will force us to read the parity.
	 */
	for (int c = rr->rr_cols - 1; c >= 0; c--) {
		raidz_col_t *rc = &rr->rr_col[c];
		if (rc->rc_size == 0)
			continue;
		vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
		if (!vdev_readable(cvd)) {
			if (c >= rr->rr_firstdatacol)
				rr->rr_missingdata++;
			else
				rr->rr_missingparity++;
			rc->rc_error = SET_ERROR(ENXIO);
			rc->rc_tried = 1;	/* don't even try */
			rc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			if (c >= rr->rr_firstdatacol)
				rr->rr_missingdata++;
			else
				rr->rr_missingparity++;
			rc->rc_error = SET_ERROR(ESTALE);
			rc->rc_skipped = 1;
			continue;
		}
		if (forceparity ||
		    c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
		    (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    zio->io_type, zio->io_priority, 0,
			    vdev_raidz_child_done, rc));
		}
	}
}
static void
vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
{
	vdev_t *vd = zio->io_vd;

	for (int i = 0; i < rm->rm_nphys_cols; i++) {
		raidz_col_t *prc = &rm->rm_phys_col[i];
		if (prc->rc_size == 0)
			continue;

		ASSERT3U(prc->rc_devidx, ==, i);
		vdev_t *cvd = vd->vdev_child[i];
		if (!vdev_readable(cvd)) {
			prc->rc_error = SET_ERROR(ENXIO);
			prc->rc_tried = 1;	/* don't even try */
			prc->rc_skipped = 1;
			continue;
		}
		if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
			prc->rc_error = SET_ERROR(ESTALE);
			prc->rc_skipped = 1;
			continue;
		}
		zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
		    prc->rc_offset, prc->rc_abd, prc->rc_size,
		    zio->io_type, zio->io_priority, 0,
		    vdev_raidz_child_done, prc));
	}
}
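/*
 * Note: rm_phys_col has one entry per child device (hence the
 * ASSERT3U(prc->rc_devidx, ==, i) above), so this path issues at most one
 * large read per child covering all rows; vdev_raidz_io_done() later copies
 * the data and status from these aggregate columns back to the per-row
 * columns.
 */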
static void
vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm)
{
	/*
	 * If there are multiple rows, we will be hitting
	 * all disks, so go ahead and read the parity so
	 * that we are reading in decent size chunks.
	 */
	boolean_t forceparity = rm->rm_nrows > 1;

	if (rm->rm_phys_col) {
		vdev_raidz_io_start_read_phys_cols(zio, rm);
	} else {
		for (int i = 0; i < rm->rm_nrows; i++) {
			raidz_row_t *rr = rm->rm_row[i];
			vdev_raidz_io_start_read_row(zio, rr, forceparity);
		}
	}
}
/*
 * Start an IO operation on a RAIDZ VDev
 *
 * Outline:
 * - For write operations:
 *   1. Generate the parity data
 *   2. Create child zio write operations to each column's vdev, for both
 *      data and parity.
 *   3. If the column skips any sectors for padding, create optional dummy
 *      write zio children for those areas to improve aggregation continuity.
 * - For read operations:
 *   1. Create child zio read operations to each data column's vdev to read
 *      the range of data required for zio.
 *   2. If this is a scrub or resilver operation, or if any of the data
 *      vdevs have had errors, then create zio read operations to the parity
 *      columns' VDevs as well.
 */
static void
vdev_raidz_io_start(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	vdev_t *tvd = vd->vdev_top;
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	raidz_map_t *rm;

	uint64_t logical_width = vdev_raidz_get_logical_width(vdrz,
	    BP_GET_BIRTH(zio->io_bp));
	if (logical_width != vdrz->vd_physical_width) {
		zfs_locked_range_t *lr = NULL;
		uint64_t synced_offset = UINT64_MAX;
		uint64_t next_offset = UINT64_MAX;
		boolean_t use_scratch = B_FALSE;
		/*
		 * Note: when the expansion is completing, we set
		 * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync())
		 * in a later txg than when we last update spa_ubsync's state
		 * (see the end of spa_raidz_expand_thread()).  Therefore we
		 * may see vre_state!=SCANNING before
		 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected
		 * on disk, but the copying progress has been synced to disk
		 * (and reflected in spa_ubsync).  In this case it's fine to
		 * treat the expansion as completed, since if we crash there's
		 * no additional copying to do.
		 */
		if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
			ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==,
			    &vdrz->vn_vre);
			lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock,
			    zio->io_offset, zio->io_size, RL_READER);
			use_scratch =
			    (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) ==
			    RRSS_SCRATCH_VALID);
			synced_offset =
			    RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync);
			next_offset = vdrz->vn_vre.vre_offset;
			/*
			 * If we haven't resumed expanding since importing the
			 * pool, vre_offset won't have been set yet.  In
			 * this case the next offset to be copied is the same
			 * as what was synced.
			 */
			if (next_offset == UINT64_MAX) {
				next_offset = synced_offset;
			}
		}
		if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
			zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced="
			    "%lld next_offset=%lld use_scratch=%u",
			    zio,
			    zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ",
			    (long long)zio->io_offset,
			    (long long)synced_offset,
			    (long long)next_offset,
			    use_scratch);
		}

		rm = vdev_raidz_map_alloc_expanded(zio,
		    tvd->vdev_ashift, vdrz->vd_physical_width,
		    logical_width, vdrz->vd_nparity,
		    synced_offset, next_offset, use_scratch);
		rm->rm_lr = lr;
	} else {
		rm = vdev_raidz_map_alloc(zio,
		    tvd->vdev_ashift, logical_width, vdrz->vd_nparity);
	}
	rm->rm_original_width = vdrz->vd_original_width;

	zio->io_vsd = rm;
	zio->io_vsd_ops = &vdev_raidz_vsd_ops;
	if (zio->io_type == ZIO_TYPE_WRITE) {
		for (int i = 0; i < rm->rm_nrows; i++) {
			vdev_raidz_io_start_write(zio, rm->rm_row[i]);
		}

		if (logical_width == vdrz->vd_physical_width) {
			raidz_start_skip_writes(zio);
		}
	} else {
		ASSERT(zio->io_type == ZIO_TYPE_READ);
		vdev_raidz_io_start_read(zio, rm);
	}

	zio_execute(zio);
}
/*
 * Report a checksum error for a child of a RAID-Z device.
 */
void
vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
{
	vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];

	if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
	    zio->io_priority != ZIO_PRIORITY_REBUILD) {
		zio_bad_cksum_t zbc;
		raidz_map_t *rm = zio->io_vsd;

		zbc.zbc_has_cksum = 0;
		zbc.zbc_injected = rm->rm_ecksuminjected;

		mutex_enter(&vd->vdev_stat_lock);
		vd->vdev_stat.vs_checksum_errors++;
		mutex_exit(&vd->vdev_stat_lock);
		(void) zfs_ereport_post_checksum(zio->io_spa, vd,
		    &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
		    rc->rc_abd, bad_data, &zbc);
	}
}
/*
 * We keep track of whether or not there were any injected errors, so that
 * any ereports we generate can note it.
 */
static int
raidz_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t zbc = {0};
	raidz_map_t *rm = zio->io_vsd;

	int ret = zio_checksum_error(zio, &zbc);
	if (ret != 0 && zbc.zbc_injected != 0)
		rm->rm_ecksuminjected = 1;

	return (ret);
}
/*
 * Generate the parity from the data columns.  If we tried and were able to
 * read the parity without error, verify that the generated parity matches
 * the data we read.  If it doesn't, we fire off a checksum error.  Return
 * the number of such failures.
 */
static int
raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
{
	abd_t *orig[VDEV_RAIDZ_MAXPARITY];
	int c, ret = 0;
	raidz_map_t *rm = zio->io_vsd;
	raidz_col_t *rc;

	blkptr_t *bp = zio->io_bp;
	enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
	    (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));

	if (checksum == ZIO_CHECKSUM_NOPARITY)
		return (ret);

	for (c = 0; c < rr->rr_firstdatacol; c++) {
		rc = &rr->rr_col[c];
		if (!rc->rc_tried || rc->rc_error != 0)
			continue;

		orig[c] = rc->rc_abd;
		ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size);
		rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
	}

	/*
	 * Verify any empty sectors are zero filled to ensure the parity
	 * is calculated correctly even if these non-data sectors are damaged.
	 */
	if (rr->rr_nempty && rr->rr_abd_empty != NULL)
		ret += vdev_draid_map_verify_empty(zio, rr);

	/*
	 * Regenerates parity even for !tried||rc_error!=0 columns.  This
	 * isn't harmful but it does have the side effect of fixing stuff
	 * we didn't realize was necessary (i.e. even if we return 0).
	 */
	vdev_raidz_generate_parity_row(rm, rr);

	for (c = 0; c < rr->rr_firstdatacol; c++) {
		rc = &rr->rr_col[c];

		if (!rc->rc_tried || rc->rc_error != 0)
			continue;

		if (abd_cmp(orig[c], rc->rc_abd) != 0) {
			zfs_dbgmsg("found error on col=%u devidx=%u off %llx",
			    c, (int)rc->rc_devidx,
			    (u_longlong_t)rc->rc_offset);
			vdev_raidz_checksum_error(zio, rc, orig[c]);
			rc->rc_error = SET_ERROR(ECKSUM);
			ret++;
		}
		abd_free(orig[c]);
	}

	return (ret);
}
static int
vdev_raidz_worst_error(raidz_row_t *rr)
{
	int error = 0;

	for (int c = 0; c < rr->rr_cols; c++) {
		error = zio_worst_error(error, rr->rr_col[c].rc_error);
		error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error);
	}

	return (error);
}
static void
vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
{
	int unexpected_errors = 0;
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;

	ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);

	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		if (rc->rc_error != 0) {
			if (c < rr->rr_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			if (!rc->rc_skipped)
				unexpected_errors++;
		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}

		if (rc->rc_force_repair)
			unexpected_errors++;
	}

	/*
	 * If we read more parity disks than were used for
	 * reconstruction, confirm that the other parity disks produced
	 * correct data.
	 *
	 * Note that we also regenerate parity when resilvering so we
	 * can write it out to failed devices later.
	 */
	if (parity_errors + parity_untried <
	    rr->rr_firstdatacol - data_errors ||
	    (zio->io_flags & ZIO_FLAG_RESILVER)) {
		int n = raidz_parity_verify(zio, rr);
		unexpected_errors += n;
	}

	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
		/*
		 * Use the good data we have in hand to repair damaged children.
		 */
		for (int c = 0; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			vdev_t *vd = zio->io_vd;
			vdev_t *cvd = vd->vdev_child[rc->rc_devidx];

			if (!rc->rc_allow_repair) {
				continue;
			} else if (!rc->rc_force_repair &&
			    (rc->rc_error == 0 || rc->rc_size == 0)) {
				continue;
			}

			zfs_dbgmsg("zio=%px repairing c=%u devidx=%u "
			    "offset=%llx",
			    zio, c, rc->rc_devidx, (long long)rc->rc_offset);

			zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_offset, rc->rc_abd, rc->rc_size,
			    ZIO_TYPE_WRITE,
			    zio->io_priority == ZIO_PRIORITY_REBUILD ?
			    ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
			    ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
		}
	}

	/*
	 * Scrub or resilver i/o's: overwrite any shadow locations with the
	 * good data.  This ensures that if we've already copied this sector,
	 * it will be corrected if it was damaged.  This writes more than is
	 * necessary, but since expansion is paused during scrub/resilver, at
	 * most a single row will have a shadow location.
	 */
	if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
	    (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) {
		for (int c = 0; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			vdev_t *vd = zio->io_vd;

			if (rc->rc_shadow_devidx == INT_MAX ||
			    rc->rc_size == 0)
				continue;
			vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx];

			/*
			 * Note: We don't want to update the repair stats
			 * because that would incorrectly indicate that there
			 * was bad data to repair, which we aren't sure about.
			 * By clearing the SCAN_THREAD flag, we prevent this
			 * from happening, despite having the REPAIR flag set.
			 * We need to set SELF_HEAL so that this i/o can't be
			 * bypassed by zio_vdev_io_start().
			 */
			zio_t *cio = zio_vdev_child_io(zio, NULL, cvd,
			    rc->rc_shadow_offset, rc->rc_abd, rc->rc_size,
			    ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
			    ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
			    NULL, NULL);
			cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD;
			zio_nowait(cio);
		}
	}
}
static void
raidz_restore_orig_data(raidz_map_t *rm)
{
	for (int i = 0; i < rm->rm_nrows; i++) {
		raidz_row_t *rr = rm->rm_row[i];
		for (int c = 0; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			if (rc->rc_need_orig_restore) {
				abd_copy(rc->rc_abd,
				    rc->rc_orig_data, rc->rc_size);
				rc->rc_need_orig_restore = B_FALSE;
			}
		}
	}
}
/*
 * During raidz_reconstruct() for an expanded VDEV, we need to give special
 * consideration to failure simulations.  See note in raidz_reconstruct() on
 * simulating failure of a pre-expansion device.
 *
 * Treating logical child i as failed, return TRUE if the given column should
 * be treated as failed.  The idea of logical children allows us to imagine
 * that a disk silently failed before a RAIDZ expansion (reads from this disk
 * succeed but return the wrong data).  Since the expansion doesn't verify
 * checksums, the incorrect data will be moved to new locations spread among
 * the children (going diagonally across them).
 *
 * Higher "logical child failures" (values of `i`) indicate these
 * "pre-expansion failures".  The first physical_width values imagine that a
 * current child failed; the next physical_width-1 values imagine that a
 * child failed before the most recent expansion; the next physical_width-2
 * values imagine a child failed in the expansion before that, etc.
 */
static boolean_t
raidz_simulate_failure(int physical_width, int original_width, int ashift,
    int i, raidz_col_t *rc)
{
	uint64_t sector_id =
	    physical_width * (rc->rc_offset >> ashift) +
	    rc->rc_devidx;

	for (int w = physical_width; w >= original_width; w--) {
		if (i < w) {
			return (sector_id % w == i);
		} else {
			i -= w;
		}
	}
	ASSERT(!"invalid logical child id");
	return (B_FALSE);
}
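/*
 * Example (illustrative): physical_width=6 and original_width=4 give
 * 6+5+4 = 15 logical children.  Logical child i=7 falls in the w=5 group
 * (7 >= 6, so i becomes 7-6=1), simulating the failure of child 1 of the
 * old 5-wide layout: a column is treated as failed iff its diagonal
 * position, sector_id % 5, equals 1.
 */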
/*
 * Returns EINVAL if reconstruction of the block will not be possible.
 * Returns ECKSUM if this specific reconstruction failed.
 * Returns 0 on successful reconstruction.
 */
static int
raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
{
	raidz_map_t *rm = zio->io_vsd;
	int physical_width = zio->io_vd->vdev_children;
	int original_width = (rm->rm_original_width != 0) ?
	    rm->rm_original_width : physical_width;
	int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT;

	if (dbgmsg) {
		zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u "
		    "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts);
	}

	/* Reconstruct each row */
	for (int r = 0; r < rm->rm_nrows; r++) {
		raidz_row_t *rr = rm->rm_row[r];
		int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
		int t = 0;
		int dead = 0;
		int dead_data = 0;

		if (dbgmsg)
			zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r);

		for (int c = 0; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			ASSERT0(rc->rc_need_orig_restore);
			if (rc->rc_error != 0) {
				dead++;
				if (c >= nparity)
					dead_data++;
				continue;
			}
			if (rc->rc_size == 0)
				continue;
			for (int lt = 0; lt < ntgts; lt++) {
				if (raidz_simulate_failure(physical_width,
				    original_width,
				    zio->io_vd->vdev_top->vdev_ashift,
				    ltgts[lt], rc)) {
					if (rc->rc_orig_data == NULL) {
						rc->rc_orig_data =
						    abd_alloc_linear(
						    rc->rc_size, B_TRUE);
						abd_copy(rc->rc_orig_data,
						    rc->rc_abd, rc->rc_size);
					}
					rc->rc_need_orig_restore = B_TRUE;

					dead++;
					if (c >= nparity)
						dead_data++;
					/*
					 * Note: simulating failure of a
					 * pre-expansion device can hit more
					 * than one column, in which case we
					 * might try to simulate more failures
					 * than can be reconstructed, which is
					 * also more than the size of my_tgts.
					 * This check prevents accessing past
					 * the end of my_tgts.  The "dead >
					 * nparity" check below will fail this
					 * reconstruction attempt.
					 */
					if (t < VDEV_RAIDZ_MAXPARITY) {
						my_tgts[t++] = c;
						if (dbgmsg) {
							zfs_dbgmsg("simulating "
							    "failure of col %u "
							    "devidx %u", c,
							    (int)rc->rc_devidx);
						}
					}
					break;
				}
			}
		}
		if (dead > nparity) {
			/* reconstruction not possible */
			if (dbgmsg) {
				zfs_dbgmsg("reconstruction not possible; "
				    "too many failures");
			}
			raidz_restore_orig_data(rm);
			return (EINVAL);
		}
		if (dead_data > 0)
			vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
	}

	/* Check for success */
	if (raidz_checksum_verify(zio) == 0) {

		/* Reconstruction succeeded - report errors */
		for (int i = 0; i < rm->rm_nrows; i++) {
			raidz_row_t *rr = rm->rm_row[i];

			for (int c = 0; c < rr->rr_cols; c++) {
				raidz_col_t *rc = &rr->rr_col[c];
				if (rc->rc_need_orig_restore) {
					/*
					 * Note: if this is a parity column,
					 * we don't really know if it's wrong.
					 * We need to let
					 * vdev_raidz_io_done_verified() check
					 * it, and if we set rc_error, it will
					 * think that it is a "known" error
					 * that doesn't need to be checked
					 * or corrected.
					 */
					if (rc->rc_error == 0 &&
					    c >= rr->rr_firstdatacol) {
						vdev_raidz_checksum_error(zio,
						    rc, rc->rc_orig_data);
						rc->rc_error =
						    SET_ERROR(ECKSUM);
					}
					rc->rc_need_orig_restore = B_FALSE;
				}
			}

			vdev_raidz_io_done_verified(zio, rr);
		}

		zio_checksum_verified(zio);

		if (dbgmsg) {
			zfs_dbgmsg("reconstruction successful "
			    "(checksum verified)");
		}
		return (0);
	}

	/* Reconstruction failed - restore original data */
	raidz_restore_orig_data(rm);
	if (dbgmsg) {
		zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum "
		    "failed", zio);
	}
	return (ECKSUM);
}
/*
 * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
 * Note that the algorithm below is non-optimal because it doesn't take into
 * account how reconstruction is actually performed.  For example, with
 * triple-parity RAID-Z the reconstruction procedure is the same if column 4
 * is targeted as invalid as if columns 1 and 4 are targeted since in both
 * cases we'd only use parity information in column 0.
 *
 * The order that we find the various possible combinations of failed
 * disks is dictated by these rules:
 * - Examine each "slot" (the "i" in tgts[i])
 *   - Try to increment this slot (tgts[i] += 1)
 *   - if we can't increment because it runs into the next slot,
 *     reset our slot to the minimum, and examine the next slot
 *
 * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
 * 3 columns to reconstruct), we will generate the following sequence:
 *
 * STATE        ACTION
 * 0 1 2        special case: skip since these are all parity
 * 0 1 3        first slot: reset to 0; middle slot: increment to 2
 * 0 2 3        first slot: increment to 1
 * 1 2 3        first: reset to 0; middle: reset to 1; last: increment to 4
 * 0 1 4        first: reset to 0; middle: increment to 2
 * 0 2 4        first: increment to 1
 * 1 2 4        first: reset to 0; middle: increment to 3
 * 0 3 4        first: increment to 1
 * 1 3 4        first: increment to 2
 * 2 3 4        first: reset to 0; middle: reset to 1; last: increment to 5
 * 0 1 5        first: reset to 0; middle: increment to 2
 * 0 2 5        first: increment to 1
 * 1 2 5        first: reset to 0; middle: increment to 3
 * 0 3 5        first: increment to 1
 * 1 3 5        first: increment to 2
 * 2 3 5        first: reset to 0; middle: increment to 4
 * 0 4 5        first: increment to 1
 * 1 4 5        first: increment to 2
 * 2 4 5        first: increment to 3
 *
 * This strategy works for dRAID but is less efficient when there are a large
 * number of child vdevs and therefore permutations to check.  Furthermore,
 * since the raidz_map_t rows likely do not overlap, reconstruction would be
 * possible as long as there are no more than nparity data errors per row.
 * These additional permutations are not currently checked but could be as
 * a future improvement.
 *
 * Returns 0 on success, ECKSUM on failure.
 */
static int
vdev_raidz_combrec(zio_t *zio)
{
	int nparity = vdev_get_nparity(zio->io_vd);
	raidz_map_t *rm = zio->io_vsd;
	int physical_width = zio->io_vd->vdev_children;
	int original_width = (rm->rm_original_width != 0) ?
	    rm->rm_original_width : physical_width;

	for (int i = 0; i < rm->rm_nrows; i++) {
		raidz_row_t *rr = rm->rm_row[i];
		int total_errors = 0;

		for (int c = 0; c < rr->rr_cols; c++) {
			if (rr->rr_col[c].rc_error)
				total_errors++;
		}

		if (total_errors > nparity)
			return (vdev_raidz_worst_error(rr));
	}

	for (int num_failures = 1; num_failures <= nparity; num_failures++) {
		int tstore[VDEV_RAIDZ_MAXPARITY + 2];
		int *ltgts = &tstore[1]; /* value is logical child ID */

		/*
		 * Determine number of logical children, n.  See comment
		 * above raidz_simulate_failure().
		 */
		int n = 0;
		for (int w = physical_width;
		    w >= original_width; w--) {
			n += w;
		}

		ASSERT3U(num_failures, <=, nparity);
		ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);

		/* Handle corner cases in combrec logic */
		ltgts[-1] = -1;
		for (int i = 0; i < num_failures; i++) {
			ltgts[i] = i;
		}
		ltgts[num_failures] = n;

		for (;;) {
			int err = raidz_reconstruct(zio, ltgts, num_failures,
			    nparity);
			if (err == EINVAL) {
				/*
				 * Reconstruction not possible with this #
				 * failures; try more failures.
				 */
				break;
			} else if (err == 0)
				return (0);

			/* Compute next targets to try */
			for (int t = 0; ; t++) {
				ASSERT3U(t, <, num_failures);
				ltgts[t]++;
				if (ltgts[t] == n) {
					/* try more failures */
					ASSERT3U(t, ==, num_failures - 1);
					if (zfs_flags &
					    ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
						zfs_dbgmsg("reconstruction "
						    "failed for num_failures="
						    "%u; tried all "
						    "combinations",
						    num_failures);
					}
					break;
				}

				ASSERT3U(ltgts[t], <, n);
				ASSERT3U(ltgts[t], <=, ltgts[t + 1]);

				/*
				 * If that spot is available, we're done here.
				 * Try the next combination.
				 */
				if (ltgts[t] != ltgts[t + 1])
					break; // found next combination

				/*
				 * Otherwise, reset this tgt to the minimum,
				 * and move on to the next tgt.
				 */
				ltgts[t] = ltgts[t - 1] + 1;
				ASSERT3U(ltgts[t], ==, t);
			}

			/* Increase the number of failures and keep trying. */
			if (ltgts[num_failures - 1] == n)
				break;
		}
	}
	if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
		zfs_dbgmsg("reconstruction failed for all num_failures");
	return (ECKSUM);
}
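/*
 * Note: tstore[] above has two slots more than VDEV_RAIDZ_MAXPARITY because
 * the combination stepper reads ltgts[-1] (primed to -1) when resetting the
 * first slot and ltgts[t + 1] (primed to n) when checking the last slot;
 * the sentinels keep those accesses in bounds.
 */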
void
vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
{
	for (uint64_t row = 0; row < rm->rm_nrows; row++) {
		raidz_row_t *rr = rm->rm_row[row];
		vdev_raidz_reconstruct_row(rm, rr, t, nt);
	}
}
/*
 * Complete a write IO operation on a RAIDZ VDev
 *
 * Outline:
 *   1. Check for errors on the child IOs.
 *   2. Return, setting an error code if too few child VDevs were written
 *      to reconstruct the data later.  Note that partial writes are
 *      considered successful if they can be reconstructed at all.
 */
static void
vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
{
	int normal_errors = 0;
	int shadow_errors = 0;

	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);

	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		if (rc->rc_error != 0) {
			ASSERT(rc->rc_error != ECKSUM);	/* child has no bp */
			normal_errors++;
		}
		if (rc->rc_shadow_error != 0) {
			ASSERT(rc->rc_shadow_error != ECKSUM);
			shadow_errors++;
		}
	}

	/*
	 * Treat partial writes as a success.  If we couldn't write enough
	 * columns to reconstruct the data, the I/O failed.  Otherwise, good
	 * enough.  Note that in the case of a shadow write (during raidz
	 * expansion), depending on if we crash, either the normal (old) or
	 * shadow (new) location may become the "real" version of the block,
	 * so both locations must have sufficient redundancy.
	 *
	 * Now that we support write reallocation, it would be better
	 * to treat partial failure as real failure unless there are
	 * no non-degraded top-level vdevs left, and not update DTLs
	 * if we intend to reallocate.
	 */
	if (normal_errors > rr->rr_firstdatacol ||
	    shadow_errors > rr->rr_firstdatacol) {
		zio->io_error = zio_worst_error(zio->io_error,
		    vdev_raidz_worst_error(rr));
	}
}
static void
vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
    raidz_row_t *rr)
{
	int parity_errors = 0;
	int parity_untried = 0;
	int data_errors = 0;
	int total_errors = 0;

	ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
	ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);

	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		/*
		 * If scrubbing and a replacing/sparing child vdev determined
		 * that not all of its children have an identical copy of the
		 * data, then clear the error so the column is treated like
		 * any other read and force a repair to correct the damage.
		 */
		if (rc->rc_error == ECKSUM) {
			ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
			vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
			rc->rc_force_repair = 1;
			rc->rc_error = 0;
		}

		if (rc->rc_error) {
			if (c < rr->rr_firstdatacol)
				parity_errors++;
			else
				data_errors++;

			total_errors++;
		} else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
			parity_untried++;
		}
	}

	/*
	 * If there were data errors and the number of errors we saw was
	 * correctable -- less than or equal to the number of parity disks read
	 * -- reconstruct based on the missing data.
	 */
	if (data_errors != 0 &&
	    total_errors <= rr->rr_firstdatacol - parity_untried) {
		/*
		 * We either attempt to read all the parity columns or
		 * none of them.  If we didn't try to read parity, we
		 * wouldn't be here in the correctable case.  There must
		 * also have been fewer parity errors than parity
		 * columns or, again, we wouldn't be in this code path.
		 */
		ASSERT(parity_untried == 0);
		ASSERT(parity_errors < rr->rr_firstdatacol);

		/*
		 * Identify the data columns that reported an error.
		 */
		int n = 0;
		int tgts[VDEV_RAIDZ_MAXPARITY];
		for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			if (rc->rc_error != 0) {
				ASSERT(n < VDEV_RAIDZ_MAXPARITY);
				tgts[n++] = c;
			}
		}

		ASSERT(rr->rr_firstdatacol >= n);

		vdev_raidz_reconstruct_row(rm, rr, tgts, n);
	}
}
/*
 * Return the number of reads issued.
 */
static int
vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
{
	vdev_t *vd = zio->io_vd;
	int nread = 0;

	rr->rr_missingdata = 0;
	rr->rr_missingparity = 0;

	/*
	 * If this row contains empty sectors which are not required
	 * for a normal read then allocate an ABD for them now so they
	 * may be read, verified, and any needed repairs performed.
	 */
	if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL)
		vdev_draid_map_alloc_empty(zio, rr);

	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		if (rc->rc_tried || rc->rc_size == 0)
			continue;

		zio_nowait(zio_vdev_child_io(zio, NULL,
		    vd->vdev_child[rc->rc_devidx],
		    rc->rc_offset, rc->rc_abd, rc->rc_size,
		    zio->io_type, zio->io_priority, 0,
		    vdev_raidz_child_done, rc));
		nread++;
	}
	return (nread);
}
/*
 * We're here because either there were too many errors to even attempt
 * reconstruction (total_errors == rm_first_datacol), or vdev_*_combrec()
 * failed.  In either case, there is enough bad data to prevent
 * reconstruction.  Start checksum ereports for all children which haven't
 * failed.
 */
static void
vdev_raidz_io_done_unrecoverable(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	for (int i = 0; i < rm->rm_nrows; i++) {
		raidz_row_t *rr = rm->rm_row[i];

		for (int c = 0; c < rr->rr_cols; c++) {
			raidz_col_t *rc = &rr->rr_col[c];
			vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];

			if (rc->rc_error != 0)
				continue;

			zio_bad_cksum_t zbc;
			zbc.zbc_has_cksum = 0;
			zbc.zbc_injected = rm->rm_ecksuminjected;

			mutex_enter(&cvd->vdev_stat_lock);
			cvd->vdev_stat.vs_checksum_errors++;
			mutex_exit(&cvd->vdev_stat_lock);
			(void) zfs_ereport_start_checksum(zio->io_spa,
			    cvd, &zio->io_bookmark, zio, rc->rc_offset,
			    rc->rc_size, &zbc);
		}
	}
}
static void
vdev_raidz_io_done(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	ASSERT(zio->io_bp != NULL);
	if (zio->io_type == ZIO_TYPE_WRITE) {
		for (int i = 0; i < rm->rm_nrows; i++) {
			vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
		}
	} else {
		if (rm->rm_phys_col) {
			/*
			 * This is an aggregated read.  Copy the data and
			 * status from the aggregate abd's to the individual
			 * rows.
			 */
			for (int i = 0; i < rm->rm_nrows; i++) {
				raidz_row_t *rr = rm->rm_row[i];

				for (int c = 0; c < rr->rr_cols; c++) {
					raidz_col_t *rc = &rr->rr_col[c];
					if (rc->rc_tried || rc->rc_size == 0)
						continue;

					raidz_col_t *prc =
					    &rm->rm_phys_col[rc->rc_devidx];
					rc->rc_error = prc->rc_error;
					rc->rc_tried = prc->rc_tried;
					rc->rc_skipped = prc->rc_skipped;
					if (c >= rr->rr_firstdatacol) {
						/*
						 * Note: this is slightly faster
						 * than using abd_copy_off().
						 */
						char *physbuf = abd_to_buf(
						    prc->rc_abd);
						void *physloc = physbuf +
						    rc->rc_offset -
						    prc->rc_offset;

						abd_copy_from_buf(rc->rc_abd,
						    physloc, rc->rc_size);
					}
				}
			}
		}

		for (int i = 0; i < rm->rm_nrows; i++) {
			raidz_row_t *rr = rm->rm_row[i];
			vdev_raidz_io_done_reconstruct_known_missing(zio,
			    rm, rr);
		}

		if (raidz_checksum_verify(zio) == 0) {
			for (int i = 0; i < rm->rm_nrows; i++) {
				raidz_row_t *rr = rm->rm_row[i];
				vdev_raidz_io_done_verified(zio, rr);
			}
			zio_checksum_verified(zio);
		} else {
			/*
			 * A sequential resilver has no checksum which makes
			 * combinatorial reconstruction impossible.  This code
			 * path is unreachable since raidz_checksum_verify()
			 * has no checksum to verify and must succeed.
			 */
			ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);

			/*
			 * This isn't a typical situation -- either we got a
			 * read error or a child silently returned bad data.
			 * Read every block so we can try again with as much
			 * data and parity as we can track down.  If we've
			 * already been through once before, all children will
			 * be marked as tried so we'll proceed to combinatorial
			 * reconstruction.
			 */
			int nread = 0;
			for (int i = 0; i < rm->rm_nrows; i++) {
				nread += vdev_raidz_read_all(zio,
				    rm->rm_row[i]);
			}
			if (nread != 0) {
				/*
				 * Normally our stage is VDEV_IO_DONE, but if
				 * we've already called redone(), it will have
				 * changed to VDEV_IO_START, in which case we
				 * don't want to call redone() again.
				 */
				if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
					zio_vdev_io_redone(zio);
				return;
			}
			/*
			 * It would be too expensive to try every possible
			 * combination of failed sectors in every row, so
			 * instead we try every combination of failed current
			 * or past physical disk.  This means that if the
			 * incorrect sectors were all on Nparity disks at any
			 * point in the past, we will find the correct data.
			 * The only known case where this is less durable than
			 * a non-expanded RAIDZ, is if we have a silent failure
			 * during expansion.  In that case, one block could be
			 * partially in the old format and partially in the
			 * new format, so we'd have lost some sectors from the
			 * old format and some from the new format.
			 *
			 * e.g. logical_width=4 physical_width=6
			 * the 15 (6+5+4) possible failed disks are:
			 * width=6 child=0
			 * width=6 child=1
			 * width=6 child=2
			 * width=6 child=3
			 * width=6 child=4
			 * width=6 child=5
			 * width=5 child=0
			 * width=5 child=1
			 * width=5 child=2
			 * width=5 child=3
			 * width=5 child=4
			 * width=4 child=0
			 * width=4 child=1
			 * width=4 child=2
			 * width=4 child=3
			 * And we will try every combination of Nparity of
			 * these failed disks.
			 *
			 * As a first pass, we can generate every combo,
			 * and try reconstructing, ignoring any known
			 * failures.  If any row has too many known + simulated
			 * failures, then we bail on reconstructing with this
			 * number of simulated failures.  As an improvement,
			 * we could detect the number of whole known failures
			 * (i.e. we have known failures on these disks for
			 * every row; the disks never succeeded), and
			 * subtract that from the max # failures to simulate.
			 * We could go even further like the current
			 * combrec code, but that doesn't seem like it
			 * gains us very much.  If we simulate a failure
			 * that is also a known failure, that's fine.
			 */
			zio->io_error = vdev_raidz_combrec(zio);
			if (zio->io_error == ECKSUM &&
			    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
				vdev_raidz_io_done_unrecoverable(zio);
			}
		}
	}
	if (rm->rm_lr != NULL) {
		zfs_rangelock_exit(rm->rm_lr);
		rm->rm_lr = NULL;
	}
}
static void
vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	if (faulted > vdrz->vd_nparity)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
		    VDEV_AUX_NO_REPLICAS);
	else if (degraded + faulted != 0)
		vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED,
		    VDEV_AUX_NONE);
	else
		vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY,
		    VDEV_AUX_NONE);
}
/*
 * Determine if any portion of the provided block resides on a child vdev
 * with a dirty DTL and therefore needs to be resilvered.  The function
 * assumes that at least one DTL is dirty which implies that full stripe
 * width blocks must be resilvered.
 */
static boolean_t
vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
    uint64_t phys_birth)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;

	/*
	 * If we're in the middle of a RAIDZ expansion, this block may be in
	 * the old and/or new location.  For simplicity, always resilver it.
	 */
	if (vdrz->vn_vre.vre_state == DSS_SCANNING)
		return (B_TRUE);

	uint64_t dcols = vd->vdev_children;
	uint64_t nparity = vdrz->vd_nparity;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = ((psize - 1) >> ashift) + 1;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;

	/* Unreachable by sequential resilver. */
	ASSERT3U(phys_birth, !=, TXG_UNKNOWN);

	if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
		return (B_FALSE);

	if (s + nparity >= dcols)
		return (B_TRUE);

	for (uint64_t c = 0; c < s + nparity; c++) {
		uint64_t devidx = (f + c) % dcols;
		vdev_t *cvd = vd->vdev_child[devidx];

		/*
		 * dsl_scan_need_resilver() already checked vd with
		 * vdev_dtl_contains().  So here just check cvd with
		 * vdev_dtl_empty(), cheaper and a good approximation.
		 */
		if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
			return (B_TRUE);
	}

	return (B_FALSE);
}
static void
vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
    range_seg64_t *physical_rs, range_seg64_t *remain_rs)
{
	vdev_t *raidvd = cvd->vdev_parent;
	ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);

	vdev_raidz_t *vdrz = raidvd->vdev_tsd;

	if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
		/*
		 * We're in the middle of expansion, in which case the
		 * translation is in flux.  Any answer we give may be wrong
		 * by the time we return, so it isn't safe for the caller to
		 * act on it.  Therefore we say that this range isn't present
		 * on any children.  The only consumers of this are "zpool
		 * initialize" and trimming, both of which are "best effort"
		 * anyway.
		 */
		physical_rs->rs_start = physical_rs->rs_end = 0;
		remain_rs->rs_start = remain_rs->rs_end = 0;
		return;
	}

	uint64_t width = vdrz->vd_physical_width;
	uint64_t tgt_col = cvd->vdev_id;
	uint64_t ashift = raidvd->vdev_top->vdev_ashift;

	/* make sure the offsets are block-aligned */
	ASSERT0(logical_rs->rs_start % (1 << ashift));
	ASSERT0(logical_rs->rs_end % (1 << ashift));
	uint64_t b_start = logical_rs->rs_start >> ashift;
	uint64_t b_end = logical_rs->rs_end >> ashift;

	uint64_t start_row = 0;
	if (b_start > tgt_col) /* avoid underflow */
		start_row = ((b_start - tgt_col - 1) / width) + 1;

	uint64_t end_row = 0;
	if (b_end > tgt_col)
		end_row = ((b_end - tgt_col - 1) / width) + 1;

	physical_rs->rs_start = start_row << ashift;
	physical_rs->rs_end = end_row << ashift;

	ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
	ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
	    logical_rs->rs_end - logical_rs->rs_start);
}
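/*
 * Derivation for the row arithmetic above (illustrative): child tgt_col
 * holds one sector of each logical row r, at logical block r*width +
 * tgt_col.  The count of such sectors strictly below logical block b is
 * floor((b - tgt_col - 1) / width) + 1 when b > tgt_col, and 0 otherwise.
 * E.g. width=4, tgt_col=1, b_start=10 gives start_row = ((10-1-1)/4)+1 = 3.
 */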
static void
raidz_reflow_sync(void *arg, dmu_tx_t *tx)
{
	spa_t *spa = arg;
	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;

	/*
	 * Ensure there are no i/os to the range that is being committed.
	 */
	uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock);
	ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset);

	mutex_enter(&vre->vre_lock);
	uint64_t new_offset =
	    MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset);
	/*
	 * We should not have committed anything that failed.
	 */
	VERIFY3U(vre->vre_failed_offset, >=, old_offset);
	mutex_exit(&vre->vre_lock);

	zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
	    old_offset, new_offset - old_offset,
	    RL_WRITER);

	/*
	 * Update the uberblock that will be written when this txg completes.
	 */
	RAIDZ_REFLOW_SET(&spa->spa_uberblock,
	    RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset);
	vre->vre_offset_pertxg[txgoff] = 0;
	zfs_rangelock_exit(lr);

	mutex_enter(&vre->vre_lock);
	vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff];
	vre->vre_bytes_copied_pertxg[txgoff] = 0;
	mutex_exit(&vre->vre_lock);

	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
	VERIFY0(zap_update(spa->spa_meta_objset,
	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
	    sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx));
}
static void
raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
{
	spa_t *spa = arg;
	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
	vdev_raidz_t *vdrz = raidvd->vdev_tsd;

	for (int i = 0; i < TXG_SIZE; i++)
		VERIFY0(vre->vre_offset_pertxg[i]);

	reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
	re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES;
	re->re_logical_width = vdrz->vd_physical_width;
	mutex_enter(&vdrz->vd_expand_lock);
	avl_add(&vdrz->vd_expand_txgs, re);
	mutex_exit(&vdrz->vd_expand_lock);

	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);

	/*
	 * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS
	 * will get written (based on vd_expand_txgs).
	 */
	vdev_config_dirty(vd);

	/*
	 * Before we change vre_state, the on-disk state must reflect that we
	 * have completed all copying, so that vdev_raidz_io_start() can use
	 * vre_state to determine if the reflow is in progress.  See also the
	 * end of spa_raidz_expand_thread().
	 */
	VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
	    raidvd->vdev_ms_count << raidvd->vdev_ms_shift);

	vre->vre_end_time = gethrestime_sec();
	vre->vre_state = DSS_FINISHED;

	uint64_t state = vre->vre_state;
	VERIFY0(zap_update(spa->spa_meta_objset,
	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
	    sizeof (state), 1, &state, tx));

	uint64_t end_time = vre->vre_end_time;
	VERIFY0(zap_update(spa->spa_meta_objset,
	    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
	    sizeof (end_time), 1, &end_time, tx));

	spa->spa_uberblock.ub_raidz_reflow_info = 0;

	spa_history_log_internal(spa, "raidz vdev expansion completed", tx,
	    "%s vdev %llu new width %llu", spa_name(spa),
	    (unsigned long long)vd->vdev_id,
	    (unsigned long long)vd->vdev_children);

	spa->spa_raidz_expand = NULL;
	raidvd->vdev_rz_expanding = B_FALSE;

	spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
	spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
	spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);

	spa_notify_waiters(spa);

	/*
	 * While we're in syncing context take the opportunity to
	 * setup a scrub.  All the data has been successfully copied
	 * but we have not validated any checksums.
	 */
	pool_scan_func_t func = POOL_SCAN_SCRUB;
	if (zfs_scrub_after_expand && dsl_scan_setup_check(&func, tx) == 0)
		dsl_scan_setup_sync(&func, tx);
}
/*
 * Struct for one copy zio.
 */
typedef struct raidz_reflow_arg {
	vdev_raidz_expand_t *rra_vre;
	zfs_locked_range_t *rra_lr;
	uint64_t rra_txg;
} raidz_reflow_arg_t;

/*
 * The write of the new location is done.
 */
static void
raidz_reflow_write_done(zio_t *zio)
{
	raidz_reflow_arg_t *rra = zio->io_private;
	vdev_raidz_expand_t *vre = rra->rra_vre;

	abd_free(zio->io_abd);

	mutex_enter(&vre->vre_lock);
	if (zio->io_error != 0) {
		/* Force a reflow pause on errors */
		vre->vre_failed_offset =
		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
	}
	ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size);
	vre->vre_outstanding_bytes -= zio->io_size;
	if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length <
	    vre->vre_failed_offset) {
		vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] +=
		    zio->io_size;
	}
	cv_signal(&vre->vre_cv);
	mutex_exit(&vre->vre_lock);

	zfs_rangelock_exit(rra->rra_lr);

	kmem_free(rra, sizeof (*rra));
	spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
}
/*
 * The read of the old location is done.  The parent zio is the write to
 * the new location.  Allow it to start.
 */
static void
raidz_reflow_read_done(zio_t *zio)
{
	raidz_reflow_arg_t *rra = zio->io_private;
	vdev_raidz_expand_t *vre = rra->rra_vre;

	/*
	 * If the read failed, or if it was done on a vdev that is not fully
	 * healthy (e.g. a child that has a resilver in progress), we may not
	 * have the correct data.  Note that it's OK if the write proceeds.
	 * It may write garbage but the location is otherwise unused and we
	 * will retry later due to vre_failed_offset.
	 */
	if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) {
		zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu "
		    "err=%u partial_dtl_empty=%u missing_dtl_empty=%u",
		    (long long)rra->rra_lr->lr_offset,
		    (long long)rra->rra_lr->lr_length,
		    (long long)rra->rra_txg,
		    zio->io_error,
		    vdev_dtl_empty(zio->io_vd, DTL_PARTIAL),
		    vdev_dtl_empty(zio->io_vd, DTL_MISSING));
		mutex_enter(&vre->vre_lock);
		/* Force a reflow pause on errors */
		vre->vre_failed_offset =
		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
		mutex_exit(&vre->vre_lock);
	}

	zio_nowait(zio_unique_parent(zio));
}
static void
raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
    dmu_tx_t *tx)
{
	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	if (offset == 0)
		return;

	mutex_enter(&vre->vre_lock);
	ASSERT3U(vre->vre_offset, <=, offset);
	vre->vre_offset = offset;
	mutex_exit(&vre->vre_lock);

	if (vre->vre_offset_pertxg[txgoff] == 0) {
		dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
		    spa, tx);
	}
	vre->vre_offset_pertxg[txgoff] = offset;
}
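/*
 * Note: the dsl_sync_task_nowait() above is only registered when this txg's
 * vre_offset_pertxg slot transitions away from zero, so raidz_reflow_sync()
 * runs once per txg and commits whatever final offset was recorded here.
 */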
static boolean_t
vdev_raidz_expand_child_replacing(vdev_t *raidz_vd)
{
	for (int i = 0; i < raidz_vd->vdev_children; i++) {
		/* Quick check if a child is being replaced */
		if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf)
			return (B_TRUE);
	}
	return (B_FALSE);
}
3896 raidz_reflow_impl(vdev_t
*vd
, vdev_raidz_expand_t
*vre
, range_tree_t
*rt
,
3899 spa_t
*spa
= vd
->vdev_spa
;
3900 int ashift
= vd
->vdev_top
->vdev_ashift
;
3901 uint64_t offset
, size
;
3903 if (!range_tree_find_in(rt
, 0, vd
->vdev_top
->vdev_asize
,
3907 ASSERT(IS_P2ALIGNED(offset
, 1 << ashift
));
3908 ASSERT3U(size
, >=, 1 << ashift
);
3909 uint64_t length
= 1 << ashift
;
3910 int txgoff
= dmu_tx_get_txg(tx
) & TXG_MASK
;
3912 uint64_t blkid
= offset
>> ashift
;
3914 int old_children
= vd
->vdev_children
- 1;
3917 * We can only progress to the point that writes will not overlap
3918 * with blocks whose progress has not yet been recorded on disk.
3919 * Since partially-copied rows are still read from the old location,
3920 * we need to stop one row before the sector-wise overlap, to prevent
3923 * Note that even if we are skipping over a large unallocated region,
3924 * we can't move the on-disk progress to `offset`, because concurrent
3925 * writes/allocations could still use the currently-unallocated
3928 uint64_t ubsync_blkid
=
3929 RRSS_GET_OFFSET(&spa
->spa_ubsync
) >> ashift
;
3930 uint64_t next_overwrite_blkid
= ubsync_blkid
+
3931 ubsync_blkid
/ old_children
- old_children
;
3932 VERIFY3U(next_overwrite_blkid
, >, ubsync_blkid
);
	if (blkid >= next_overwrite_blkid) {
		raidz_reflow_record_progress(vre,
		    next_overwrite_blkid << ashift, tx);
		return (B_TRUE);
	}

	range_tree_remove(rt, offset, length);

	raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra), KM_SLEEP);
	rra->rra_vre = vre;
	rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock,
	    offset, length, RL_WRITER);
	rra->rra_txg = dmu_tx_get_txg(tx);

	raidz_reflow_record_progress(vre, offset + length, tx);

	mutex_enter(&vre->vre_lock);
	vre->vre_outstanding_bytes += length;
	mutex_exit(&vre->vre_lock);

	/*
	 * SCL_STATE will be released when the read and write are done,
	 * by raidz_reflow_write_done().
	 */
	spa_config_enter(spa, SCL_STATE, spa, RW_READER);

	/* check if a replacing vdev was added, if so treat it as an error */
	if (vdev_raidz_expand_child_replacing(vd)) {
		zfs_dbgmsg("replacing vdev encountered, reflow paused at "
		    "offset=%llu txg=%llu",
		    (long long)rra->rra_lr->lr_offset,
		    (long long)rra->rra_txg);

		mutex_enter(&vre->vre_lock);
		vre->vre_failed_offset =
		    MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
		cv_signal(&vre->vre_cv);
		mutex_exit(&vre->vre_lock);

		/* drop everything we acquired */
		zfs_rangelock_exit(rra->rra_lr);
		kmem_free(rra, sizeof (*rra));
		spa_config_exit(spa, SCL_STATE, spa);
		return (B_TRUE);
	}

	zio_t *pio = spa->spa_txg_zio[txgoff];
	abd_t *abd = abd_alloc_for_io(length, B_FALSE);
	zio_t *write_zio = zio_vdev_child_io(pio, NULL,
	    vd->vdev_child[blkid % vd->vdev_children],
	    (blkid / vd->vdev_children) << ashift,
	    abd, length,
	    ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
	    ZIO_FLAG_CANFAIL,
	    raidz_reflow_write_done, rra);
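	/*
	 * Illustrative mapping (added): in the expanded layout a logical
	 * sector blkid lives on child (blkid % new_children) at per-child
	 * row (blkid / new_children); in the old layout the same sector is
	 * on child (blkid % old_children) at row (blkid / old_children).
	 * E.g. blkid 7 on a 4->5-wide expansion moves from child 3, row 1 to
	 * child 2, row 1.
	 */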
	zio_nowait(zio_vdev_child_io(write_zio, NULL,
	    vd->vdev_child[blkid % old_children],
	    (blkid / old_children) << ashift,
	    abd, length,
	    ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
	    ZIO_FLAG_CANFAIL,
	    raidz_reflow_read_done, rra));

	return (B_FALSE);
}
/*
 * For testing (ztest specific)
 */
static void
raidz_expand_pause(uint_t pause_point)
{
	while (raidz_expand_pause_point != 0 &&
	    raidz_expand_pause_point <= pause_point)
		delay(hz);
}
static void
raidz_scratch_child_done(zio_t *zio)
{
	zio_t *pio = zio->io_private;

	mutex_enter(&pio->io_lock);
	pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
	mutex_exit(&pio->io_lock);
}
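/*
 * Added note: zio_worst_error() selects the more severe of two errno values,
 * so after zio_wait() the parent pio carries the worst error seen by any of
 * the per-child scratch i/os aggregated above.
 */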
/*
 * Reflow the beginning portion of the vdev into an intermediate scratch area
 * in memory and on disk.  This operation must be persisted on disk before we
 * proceed to overwrite the beginning portion with the reflowed data.
 *
 * This multi-step task can fail to complete if disk errors are encountered
 * and we can return here after a pause (waiting for disk to become healthy).
 */
static void
raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
{
	vdev_raidz_expand_t *vre = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	zio_t *pio;
	int error;

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
	int ashift = raidvd->vdev_ashift;
	uint64_t write_size = P2ALIGN(VDEV_BOOT_SIZE, 1 << ashift);
	uint64_t logical_size = write_size * raidvd->vdev_children;
	uint64_t read_size =
	    P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)),
	    1 << ashift);
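	/*
	 * Illustrative sizing (added): with 5 children and write_size w per
	 * child, the scratch region covers logical_size = 5w of the new
	 * layout.  That same logical range occupied the first
	 * DIV_ROUND_UP(5w, 4) bytes of each of the 4 original children, so
	 * read_size is that quotient rounded up to a full sector.
	 */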
	/*
	 * The scratch space must be large enough to get us to the point
	 * that one row does not overlap itself when moved.  This is checked
	 * by vdev_raidz_attach_check().
	 */
	VERIFY3U(write_size, >=, raidvd->vdev_children << ashift);
	VERIFY3U(write_size, <=, VDEV_BOOT_SIZE);
	VERIFY3U(write_size, <=, read_size);

	zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
	    0, logical_size, RL_WRITER);

	abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
	    KM_SLEEP);
	for (int i = 0; i < raidvd->vdev_children; i++) {
		abds[i] = abd_alloc_linear(read_size, B_FALSE);
	}

	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1);

	/*
	 * If we have already written the scratch area then we must read from
	 * there, since new writes were redirected there while we were paused
	 * or the original location may have been partially overwritten with
	 * reflowed data.
	 */
	if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) {
		VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size);
		/*
		 * Read from scratch space.
		 */
		pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
		for (int i = 0; i < raidvd->vdev_children; i++) {
			/*
			 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE
			 * to the offset to calculate the physical offset to
			 * write to.  Passing in a negative offset makes us
			 * access the scratch area.
			 */
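			/*
			 * Added arithmetic sketch (constants are assumptions
			 * from the typical label layout): a leaf vdev's data
			 * region starts VDEV_LABEL_START_SIZE into the device
			 * (two 256K front labels plus the 3.5M boot area, 4M
			 * total), and zio_vdev_child_io() adds that base.
			 * Passing VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE
			 * (a negative value) therefore resolves to physical
			 * offset VDEV_BOOT_OFFSET, i.e. the boot area just
			 * after the front labels.
			 */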
			zio_nowait(zio_vdev_child_io(pio, NULL,
			    raidvd->vdev_child[i],
			    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
			    write_size, ZIO_TYPE_READ, ZIO_PRIORITY_ASYNC_READ,
			    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
		}
		error = zio_wait(pio);
		if (error != 0) {
			zfs_dbgmsg("reflow: error %d reading scratch location",
			    error);
			goto io_error_exit;
		}
		goto overwrite;
	}

	/*
	 * Read from original location.
	 */
	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
	for (int i = 0; i < raidvd->vdev_children - 1; i++) {
		ASSERT0(vdev_is_dead(raidvd->vdev_child[i]));
		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
		    0, abds[i], read_size, ZIO_TYPE_READ,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
		    raidz_scratch_child_done, pio));
	}
	error = zio_wait(pio);
	if (error != 0) {
		zfs_dbgmsg("reflow: error %d reading original location", error);
io_error_exit:
		for (int i = 0; i < raidvd->vdev_children; i++)
			abd_free(abds[i]);
		kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
		zfs_rangelock_exit(lr);
		spa_config_exit(spa, SCL_STATE, FTAG);
		zthr_wakeup(spa->spa_raidz_expand_zthr);
		return;
	}

	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2);
	/*
	 * Reflow in memory.
	 */
	uint64_t logical_sectors = logical_size >> ashift;
	for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) {
		int oldchild = i % (raidvd->vdev_children - 1);
		uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift;

		int newchild = i % raidvd->vdev_children;
		uint64_t newoff = (i / raidvd->vdev_children) << ashift;

		/* a single sector should not be copying over itself */
		ASSERT(!(newchild == oldchild && newoff == oldoff));

		abd_copy_off(abds[newchild], abds[oldchild],
		    newoff, oldoff, 1 << ashift);
	}
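	/*
	 * Worked example (added): with 5 children, sectors 0 through 3 map
	 * to child i, offset 0 in both the old (4-wide) and new (5-wide)
	 * layouts, which is why the loop starts at vdev_children - 1.  The
	 * first sector that actually moves is i = 4: old child 4 % 4 = 0 at
	 * offset 1 << ashift; new child 4 % 5 = 4 at offset 0.
	 */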
	/*
	 * Verify that we filled in everything we intended to (write_size on
	 * each child).
	 */
	VERIFY0(logical_sectors % raidvd->vdev_children);
	VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==,
	    write_size);

	/*
	 * Write to scratch location (boot area).
	 */
	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
	for (int i = 0; i < raidvd->vdev_children; i++) {
		/*
		 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
		 * the offset to calculate the physical offset to write to.
		 * Passing in a negative offset lets us access the boot area.
		 */
		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
		    write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
		    ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
	}
	error = zio_wait(pio);
	if (error != 0) {
		zfs_dbgmsg("reflow: error %d writing scratch location", error);
		goto io_error_exit;
	}
	pio = zio_root(spa, NULL, NULL, 0);
	zio_flush(pio, raidvd);
	zio_wait(pio);

	zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area",
	    (long long)logical_size);

	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3);
	/*
	 * Update uberblock to indicate that scratch space is valid.  This is
	 * needed because after this point, the real location may be
	 * overwritten.  If we crash, we need to get the data from the
	 * scratch space, rather than the real location.
	 *
	 * Note: ub_timestamp is bumped so that vdev_uberblock_compare()
	 * will prefer this uberblock.
	 */
	RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size);
	spa->spa_ubsync.ub_timestamp++;
	ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
	if (spa_multihost(spa))
		mmp_update_uberblock(spa, &spa->spa_ubsync);

	zfs_dbgmsg("reflow: uberblock updated "
	    "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)",
	    (long long)spa->spa_ubsync.ub_txg,
	    (long long)logical_size,
	    (long long)spa->spa_ubsync.ub_timestamp);

	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID);
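	/*
	 * State-machine summary (added): the uberblock's reflow scratch
	 * state moves from RRSS_SCRATCH_VALID (set above; the scratch copy
	 * is authoritative) to RRSS_SCRATCH_INVALID_SYNCED once the real
	 * location has been overwritten below, with
	 * RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT and
	 * RRSS_SCRATCH_INVALID_SYNCED_REFLOW used by the crash-recovery and
	 * sync paths respectively.  A crash while SCRATCH_VALID replays the
	 * copy from the scratch area via vdev_raidz_reflow_copy_scratch().
	 */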
	/*
	 * Overwrite with reflow'ed data.
	 */
overwrite:
	pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
	for (int i = 0; i < raidvd->vdev_children; i++) {
		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
		    0, abds[i], write_size, ZIO_TYPE_WRITE,
		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL,
		    raidz_scratch_child_done, pio));
	}
	error = zio_wait(pio);
	if (error != 0) {
		/*
		 * When we exit early here and drop the range lock, new
		 * writes will go into the scratch area so we'll need to
		 * read from there when we return after pausing.
		 */
		zfs_dbgmsg("reflow: error %d writing real location", error);
		/*
		 * Update the uberblock that is written when this txg completes.
		 */
		RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID,
		    logical_size);
		goto io_error_exit;
	}
	pio = zio_root(spa, NULL, NULL, 0);
	zio_flush(pio, raidvd);
	zio_wait(pio);

	zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location",
	    (long long)logical_size);
	for (int i = 0; i < raidvd->vdev_children; i++)
		abd_free(abds[i]);
	kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));

	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED);

	/*
	 * Update uberblock to indicate that the initial part has been
	 * reflow'ed.  This is needed because after this point (when we exit
	 * the rangelock), we allow regular writes to this region, which will
	 * be written to the new location only (because reflow_offset_next ==
	 * reflow_offset_synced).  If we crashed and re-copied from the
	 * scratch space, we would lose the regular writes.
	 */
	RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED,
	    logical_size);
	spa->spa_ubsync.ub_timestamp++;
	ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
	if (spa_multihost(spa))
		mmp_update_uberblock(spa, &spa->spa_ubsync);

	zfs_dbgmsg("reflow: uberblock updated "
	    "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
	    (long long)spa->spa_ubsync.ub_txg,
	    (long long)logical_size,
	    (long long)spa->spa_ubsync.ub_timestamp);

	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1);

	/*
	 * Update progress.
	 */
	vre->vre_offset = logical_size;
	zfs_rangelock_exit(lr);
	spa_config_exit(spa, SCL_STATE, FTAG);

	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
	vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
	/*
	 * Note - raidz_reflow_sync() will update the uberblock state to
	 * RRSS_SCRATCH_INVALID_SYNCED_REFLOW
	 */
	raidz_reflow_sync(spa, tx);

	raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2);
}
/*
 * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work
 * here.  No other i/o can be in progress, so we don't need the vre_rangelock.
 */
void
vdev_raidz_reflow_copy_scratch(spa_t *spa)
{
	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
	uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock);
	ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID);

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
	ASSERT0(logical_size % raidvd->vdev_children);
	uint64_t write_size = logical_size / raidvd->vdev_children;

	zio_t *pio;

	/*
	 * Read from scratch space.
	 */
	abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
	    KM_SLEEP);
	for (int i = 0; i < raidvd->vdev_children; i++) {
		abds[i] = abd_alloc_linear(write_size, B_FALSE);
	}

	pio = zio_root(spa, NULL, NULL, 0);
	for (int i = 0; i < raidvd->vdev_children; i++) {
		/*
		 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
		 * the offset to calculate the physical offset to write to.
		 * Passing in a negative offset lets us access the boot area.
		 */
		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
		    VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
		    write_size, ZIO_TYPE_READ,
		    ZIO_PRIORITY_ASYNC_READ, 0,
		    raidz_scratch_child_done, pio));
	}
	zio_wait(pio);

	/*
	 * Overwrite real location with reflow'ed data.
	 */
	pio = zio_root(spa, NULL, NULL, 0);
	for (int i = 0; i < raidvd->vdev_children; i++) {
		zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
		    0, abds[i], write_size, ZIO_TYPE_WRITE,
		    ZIO_PRIORITY_ASYNC_WRITE, 0,
		    raidz_scratch_child_done, pio));
	}
	zio_wait(pio);
	pio = zio_root(spa, NULL, NULL, 0);
	zio_flush(pio, raidvd);
	zio_wait(pio);

	zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) "
	    "to real location", (long long)logical_size);

	for (int i = 0; i < raidvd->vdev_children; i++)
		abd_free(abds[i]);
	kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));

	/*
	 * Update uberblock.
	 */
	RAIDZ_REFLOW_SET(&spa->spa_ubsync,
	    RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size);
	spa->spa_ubsync.ub_timestamp++;
	VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
	    &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
	if (spa_multihost(spa))
		mmp_update_uberblock(spa, &spa->spa_ubsync);

	zfs_dbgmsg("reflow recovery: uberblock updated "
	    "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
	    (long long)spa->spa_ubsync.ub_txg,
	    (long long)logical_size,
	    (long long)spa->spa_ubsync.ub_timestamp);

	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
	    spa_first_txg(spa));
	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
	vre->vre_offset = logical_size;
	vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
	vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
	/*
	 * Note that raidz_reflow_sync() will update the uberblock once more
	 */
	raidz_reflow_sync(spa, tx);

	dmu_tx_commit(tx);

	spa_config_exit(spa, SCL_STATE, FTAG);
}
static boolean_t
spa_raidz_expand_thread_check(void *arg, zthr_t *zthr)
{
	(void) zthr;
	spa_t *spa = arg;

	return (spa->spa_raidz_expand != NULL &&
	    !spa->spa_raidz_expand->vre_waiting_for_resilver);
}
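/*
 * Added note: the zthr framework invokes this check before each run of
 * spa_raidz_expand_thread(); returning B_FALSE keeps the thread idle, e.g.
 * while vre_waiting_for_resilver is set and raidz_dtl_reassessed() has not
 * yet cleared it.
 */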
/*
 * RAIDZ expansion background thread
 *
 * Can be called multiple times if the reflow is paused
 */
static void
spa_raidz_expand_thread(void *arg, zthr_t *zthr)
{
	spa_t *spa = arg;
	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;

	if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID)
		vre->vre_offset = 0;
	else
		vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync);

	/* Reflow the beginning portion using the scratch area */
	if (vre->vre_offset == 0) {
		VERIFY0(dsl_sync_task(spa_name(spa),
		    NULL, raidz_reflow_scratch_sync,
		    vre, 0, ZFS_SPACE_CHECK_NONE));

		/* if we encountered errors then pause */
		if (vre->vre_offset == 0) {
			mutex_enter(&vre->vre_lock);
			vre->vre_waiting_for_resilver = B_TRUE;
			mutex_exit(&vre->vre_lock);
			return;
		}
	}

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);

	uint64_t guid = raidvd->vdev_guid;

	/* Iterate over all the remaining metaslabs */
	for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift;
	    i < raidvd->vdev_ms_count &&
	    !zthr_iscancelled(zthr) &&
	    vre->vre_failed_offset == UINT64_MAX; i++) {
		metaslab_t *msp = raidvd->vdev_ms[i];

		metaslab_disable(msp);
		mutex_enter(&msp->ms_lock);

		/*
		 * The metaslab may be newly created (for the expanded
		 * space), in which case its trees won't exist yet,
		 * so we need to bail out early.
		 */
		if (msp->ms_new) {
			mutex_exit(&msp->ms_lock);
			metaslab_enable(msp, B_FALSE, B_FALSE);
			continue;
		}

		VERIFY0(metaslab_load(msp));

		/*
		 * We want to copy everything except the free (allocatable)
		 * space.  Note that there may be a little bit more free
		 * space (e.g. in ms_defer), and it's fine to copy that too.
		 */
		range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64,
		    NULL, 0, 0);
		range_tree_add(rt, msp->ms_start, msp->ms_size);
		range_tree_walk(msp->ms_allocatable, range_tree_remove, rt);
		mutex_exit(&msp->ms_lock);

		/*
		 * Force the last sector of each metaslab to be copied.  This
		 * ensures that we advance the on-disk progress to the end of
		 * this metaslab while the metaslab is disabled.  Otherwise, we
		 * could move past this metaslab without advancing the on-disk
		 * progress, and then an allocation to this metaslab would not
		 * be copied.
		 */
		int sectorsz = 1 << raidvd->vdev_ashift;
		uint64_t ms_last_offset = msp->ms_start +
		    msp->ms_size - sectorsz;
		if (!range_tree_contains(rt, ms_last_offset, sectorsz)) {
			range_tree_add(rt, ms_last_offset, sectorsz);
		}

		/*
		 * When we are resuming from a paused expansion (i.e.
		 * when importing a pool with an expansion in progress),
		 * discard any state that we have already processed.
		 */
		range_tree_clear(rt, 0, vre->vre_offset);

		while (!zthr_iscancelled(zthr) &&
		    !range_tree_is_empty(rt) &&
		    vre->vre_failed_offset == UINT64_MAX) {

			/*
			 * We need to periodically drop the config lock so that
			 * writers can get in.  Additionally, we can't wait
			 * for a txg to sync while holding a config lock
			 * (since a waiting writer could cause a 3-way deadlock
			 * with the sync thread, which also gets a config
			 * lock for reader).  So we can't hold the config lock
			 * while calling dmu_tx_assign().
			 */
			spa_config_exit(spa, SCL_CONFIG, FTAG);

			/*
			 * If requested, pause the reflow when the amount
			 * specified by raidz_expand_max_reflow_bytes is reached
			 *
			 * This pause is only used during testing or debugging.
			 */
			while (raidz_expand_max_reflow_bytes != 0 &&
			    raidz_expand_max_reflow_bytes <=
			    vre->vre_bytes_copied && !zthr_iscancelled(zthr)) {
				delay(hz);
			}

			mutex_enter(&vre->vre_lock);
			while (vre->vre_outstanding_bytes >
			    raidz_expand_max_copy_bytes) {
				cv_wait(&vre->vre_cv, &vre->vre_lock);
			}
			mutex_exit(&vre->vre_lock);

			dmu_tx_t *tx =
			    dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);

			VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
			uint64_t txg = dmu_tx_get_txg(tx);

			/*
			 * Reacquire the vdev_config lock.  Theoretically, the
			 * vdev_t that we're expanding may have changed.
			 */
			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
			raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);

			boolean_t needsync =
			    raidz_reflow_impl(raidvd, vre, rt, tx);

			dmu_tx_commit(tx);

			if (needsync) {
				spa_config_exit(spa, SCL_CONFIG, FTAG);
				txg_wait_synced(spa->spa_dsl_pool, txg);
				spa_config_enter(spa, SCL_CONFIG, FTAG,
				    RW_READER);
			}
		}

		spa_config_exit(spa, SCL_CONFIG, FTAG);

		metaslab_enable(msp, B_FALSE, B_FALSE);
		range_tree_vacate(rt, NULL, NULL);
		range_tree_destroy(rt);

		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
	}

	spa_config_exit(spa, SCL_CONFIG, FTAG);

	/*
	 * The txg_wait_synced() here ensures that all reflow zio's have
	 * completed, and vre_failed_offset has been set if necessary.  It
	 * also ensures that the progress of the last raidz_reflow_sync() is
	 * written to disk before raidz_reflow_complete_sync() changes the
	 * in-memory vre_state.  vdev_raidz_io_start() uses vre_state to
	 * determine if a reflow is in progress, in which case we may need to
	 * write to both old and new locations.  Therefore we can only change
	 * vre_state once this is not necessary, which is once the on-disk
	 * progress (in spa_ubsync) has been set past any possible writes (to
	 * the end of the last metaslab).
	 */
	txg_wait_synced(spa->spa_dsl_pool, 0);

	if (!zthr_iscancelled(zthr) &&
	    vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) {
		/*
		 * We are not being canceled or paused, so the reflow must be
		 * complete.  In that case also mark it as completed on disk.
		 */
		ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX);
		VERIFY0(dsl_sync_task(spa_name(spa), NULL,
		    raidz_reflow_complete_sync, spa,
		    0, ZFS_SPACE_CHECK_NONE));
		(void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
	} else {
		/*
		 * Wait for all copy zio's to complete and for all the
		 * raidz_reflow_sync() synctasks to be run.
		 */
		spa_history_log_internal(spa, "reflow pause",
		    NULL, "offset=%llu failed_offset=%lld",
		    (long long)vre->vre_offset,
		    (long long)vre->vre_failed_offset);
		mutex_enter(&vre->vre_lock);
		if (vre->vre_failed_offset != UINT64_MAX) {
			/*
			 * Reset progress so that we will retry everything
			 * after the point that something failed.
			 */
			vre->vre_offset = vre->vre_failed_offset;
			vre->vre_failed_offset = UINT64_MAX;
			vre->vre_waiting_for_resilver = B_TRUE;
		}
		mutex_exit(&vre->vre_lock);
	}
}
void
spa_start_raidz_expansion_thread(spa_t *spa)
{
	ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL);
	spa->spa_raidz_expand_zthr = zthr_create("raidz_expand",
	    spa_raidz_expand_thread_check, spa_raidz_expand_thread,
	    spa, defclsyspri);
}
void
raidz_dtl_reassessed(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	if (spa->spa_raidz_expand != NULL) {
		vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
		/*
		 * we get called often from vdev_dtl_reassess() so make
		 * sure it's our vdev and any replacing is complete
		 */
		if (vd->vdev_top->vdev_id == vre->vre_vdev_id &&
		    !vdev_raidz_expand_child_replacing(vd->vdev_top)) {
			mutex_enter(&vre->vre_lock);
			if (vre->vre_waiting_for_resilver) {
				vdev_dbgmsg(vd, "DTL reassessed, "
				    "continuing raidz expansion");
				vre->vre_waiting_for_resilver = B_FALSE;
				zthr_wakeup(spa->spa_raidz_expand_zthr);
			}
			mutex_exit(&vre->vre_lock);
		}
	}
}
int
vdev_raidz_attach_check(vdev_t *new_child)
{
	vdev_t *raidvd = new_child->vdev_parent;
	uint64_t new_children = raidvd->vdev_children;

	/*
	 * We use the "boot" space as scratch space to handle overwriting the
	 * initial part of the vdev.  If it is too small, then this expansion
	 * is not allowed.  This would be very unusual (e.g. ashift > 13 and
	 * >200 children).
	 */
	if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) {
		return (EINVAL);
	}
	return (0);
}
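/*
 * Added worked example (VDEV_BOOT_SIZE assumed to be 3.5 MiB): with
 * ashift = 14 (16 KiB sectors), new_children << 14 exceeds 3.5 MiB once the
 * vdev passes 224 children, matching the ">200 children" case above.  At
 * ashift = 12 the limit is 896 children, so in practice the check almost
 * never fires.
 */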
void
vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
{
	vdev_t *new_child = arg;
	spa_t *spa = new_child->vdev_spa;
	vdev_t *raidvd = new_child->vdev_parent;
	vdev_raidz_t *vdrz = raidvd->vdev_tsd;
	ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops);
	ASSERT3P(raidvd->vdev_top, ==, raidvd);
	ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width);
	ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1);
	ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==,
	    new_child);

	spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx);

	vdrz->vd_physical_width++;

	VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info);
	vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id;
	vdrz->vn_vre.vre_offset = 0;
	vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
	spa->spa_raidz_expand = &vdrz->vn_vre;
	zthr_wakeup(spa->spa_raidz_expand_zthr);

	/*
	 * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get
	 * written to the config.
	 */
	vdev_config_dirty(raidvd);

	vdrz->vn_vre.vre_start_time = gethrestime_sec();
	vdrz->vn_vre.vre_end_time = 0;
	vdrz->vn_vre.vre_state = DSS_SCANNING;
	vdrz->vn_vre.vre_bytes_copied = 0;

	uint64_t state = vdrz->vn_vre.vre_state;
	VERIFY0(zap_update(spa->spa_meta_objset,
	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
	    sizeof (state), 1, &state, tx));

	uint64_t start_time = vdrz->vn_vre.vre_start_time;
	VERIFY0(zap_update(spa->spa_meta_objset,
	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
	    sizeof (start_time), 1, &start_time, tx));

	(void) zap_remove(spa->spa_meta_objset,
	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx);
	(void) zap_remove(spa->spa_meta_objset,
	    raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx);

	spa_history_log_internal(spa, "raidz vdev expansion started", tx,
	    "%s vdev %llu new width %llu", spa_name(spa),
	    (unsigned long long)raidvd->vdev_id,
	    (unsigned long long)raidvd->vdev_children);
}
int
vdev_raidz_load(vdev_t *vd)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	int err;

	uint64_t state = DSS_NONE;
	uint64_t start_time = 0;
	uint64_t end_time = 0;
	uint64_t bytes_copied = 0;

	if (vd->vdev_top_zap != 0) {
		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
		    sizeof (state), 1, &state);
		if (err != 0 && err != ENOENT)
			return (err);

		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
		    sizeof (start_time), 1, &start_time);
		if (err != 0 && err != ENOENT)
			return (err);

		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
		    sizeof (end_time), 1, &end_time);
		if (err != 0 && err != ENOENT)
			return (err);

		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
		    vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
		    sizeof (bytes_copied), 1, &bytes_copied);
		if (err != 0 && err != ENOENT)
			return (err);
	}

	/*
	 * If we are in the middle of expansion, vre_state should have
	 * already been set by vdev_raidz_init().
	 */
	EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING);
	vdrz->vn_vre.vre_state = (dsl_scan_state_t)state;
	vdrz->vn_vre.vre_start_time = start_time;
	vdrz->vn_vre.vre_end_time = end_time;
	vdrz->vn_vre.vre_bytes_copied = bytes_copied;

	return (0);
}
int
spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres)
{
	vdev_raidz_expand_t *vre = spa->spa_raidz_expand;

	if (vre == NULL) {
		/* no removal in progress; find most recent completed */
		for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
			vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
			if (vd->vdev_ops == &vdev_raidz_ops) {
				vdev_raidz_t *vdrz = vd->vdev_tsd;

				if (vdrz->vn_vre.vre_end_time != 0 &&
				    (vre == NULL ||
				    vdrz->vn_vre.vre_end_time >
				    vre->vre_end_time)) {
					vre = &vdrz->vn_vre;
				}
			}
		}
	}

	if (vre == NULL) {
		return (SET_ERROR(ENOENT));
	}

	pres->pres_state = vre->vre_state;
	pres->pres_expanding_vdev = vre->vre_vdev_id;

	vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
	pres->pres_to_reflow = vd->vdev_stat.vs_alloc;

	mutex_enter(&vre->vre_lock);
	pres->pres_reflowed = vre->vre_bytes_copied;
	for (int i = 0; i < TXG_SIZE; i++)
		pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i];
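	/*
	 * Added note: vre_bytes_copied only counts txgs that have fully
	 * synced; the pertxg slots summed above hold bytes copied in
	 * still-open or syncing txgs, so the total reflects all progress to
	 * date.
	 */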
	mutex_exit(&vre->vre_lock);

	pres->pres_start_time = vre->vre_start_time;
	pres->pres_end_time = vre->vre_end_time;
	pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver;

	return (0);
}
/*
 * Initialize private RAIDZ specific fields from the nvlist.
 */
static int
vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
{
	uint_t children;
	nvlist_t **child;
	int error = nvlist_lookup_nvlist_array(nv,
	    ZPOOL_CONFIG_CHILDREN, &child, &children);
	if (error != 0)
		return (SET_ERROR(EINVAL));

	uint64_t nparity;
	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
		if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
			return (SET_ERROR(EINVAL));

		/*
		 * Previous versions could only support 1 or 2 parity
		 * devices.
		 */
		if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
			return (SET_ERROR(EINVAL));
		else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
			return (SET_ERROR(EINVAL));
	} else {
		/*
		 * We require the parity to be specified for SPAs that
		 * support multiple parity levels.
		 */
		if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
			return (SET_ERROR(EINVAL));

		/*
		 * Otherwise, we default to 1 parity device for RAID-Z.
		 */
		nparity = 1;
	}

	vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
	vdrz->vn_vre.vre_vdev_id = -1;
	vdrz->vn_vre.vre_offset = UINT64_MAX;
	vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
	mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL);
	zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL);
	mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL);
	avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare,
	    sizeof (reflow_node_t), offsetof(reflow_node_t, re_link));

	vdrz->vd_physical_width = children;
	vdrz->vd_nparity = nparity;

	/* note, the ID does not exist when creating a pool */
	(void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
	    &vdrz->vn_vre.vre_vdev_id);

	boolean_t reflow_in_progress =
	    nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
	if (reflow_in_progress) {
		spa->spa_raidz_expand = &vdrz->vn_vre;
		vdrz->vn_vre.vre_state = DSS_SCANNING;
	}

	vdrz->vd_original_width = children;
	uint64_t *txgs = NULL;
	unsigned int txgs_size = 0;
	error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
	    &txgs, &txgs_size);
	if (error == 0) {
		for (int i = 0; i < txgs_size; i++) {
			reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
			re->re_txg = txgs[txgs_size - i - 1];
			re->re_logical_width = vdrz->vd_physical_width - i;

			if (reflow_in_progress)
				re->re_logical_width--;

			avl_add(&vdrz->vd_expand_txgs, re);
		}

		vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size;
	}
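	/*
	 * Illustrative decoding (added): ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS lists
	 * completion txgs oldest-first.  For a 6-wide vdev with two prior
	 * expansions, the loop builds nodes (newest txg, width 6) and (older
	 * txg, width 5), and vd_original_width becomes 6 - 2 = 4; an
	 * expansion still in progress shifts each width down by one more.
	 */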
	if (reflow_in_progress) {
		vdrz->vd_original_width--;
		zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions",
		    children, txgs_size);
	}

	*tsd = vdrz;

	return (0);
}
static void
vdev_raidz_fini(vdev_t *vd)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre)
		vd->vdev_spa->spa_raidz_expand = NULL;
	reflow_node_t *re;
	void *cookie = NULL;
	avl_tree_t *tree = &vdrz->vd_expand_txgs;
	while ((re = avl_destroy_nodes(tree, &cookie)) != NULL)
		kmem_free(re, sizeof (*re));
	avl_destroy(&vdrz->vd_expand_txgs);
	mutex_destroy(&vdrz->vd_expand_lock);
	mutex_destroy(&vdrz->vn_vre.vre_lock);
	cv_destroy(&vdrz->vn_vre.vre_cv);
	zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock);
	kmem_free(vdrz, sizeof (*vdrz));
}
/*
 * Add RAIDZ specific fields to the config nvlist.
 */
static void
vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
{
	ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
	vdev_raidz_t *vdrz = vd->vdev_tsd;

	/*
	 * Make sure someone hasn't managed to sneak a fancy new vdev
	 * into a crufty old storage pool.
	 */
	ASSERT(vdrz->vd_nparity == 1 ||
	    (vdrz->vd_nparity <= 2 &&
	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
	    (vdrz->vd_nparity <= 3 &&
	    spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));

	/*
	 * Note that we'll add these even on storage pools where they
	 * aren't strictly required -- older software will just ignore
	 * it.
	 */
	fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);

	if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
		fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
	}

	mutex_enter(&vdrz->vd_expand_lock);
	if (!avl_is_empty(&vdrz->vd_expand_txgs)) {
		uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs);
		uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count,
		    KM_SLEEP);
		uint64_t i = 0;

		for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs);
		    re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) {
			txgs[i++] = re->re_txg;
		}

		fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
		    txgs, count);

		kmem_free(txgs, sizeof (uint64_t) * count);
	}
	mutex_exit(&vdrz->vd_expand_lock);
}
static uint64_t
vdev_raidz_nparity(vdev_t *vd)
{
	vdev_raidz_t *vdrz = vd->vdev_tsd;
	return (vdrz->vd_nparity);
}

static uint64_t
vdev_raidz_ndisks(vdev_t *vd)
{
	return (vd->vdev_children);
}
vdev_ops_t vdev_raidz_ops = {
	.vdev_op_init = vdev_raidz_init,
	.vdev_op_fini = vdev_raidz_fini,
	.vdev_op_open = vdev_raidz_open,
	.vdev_op_close = vdev_raidz_close,
	.vdev_op_asize = vdev_raidz_asize,
	.vdev_op_min_asize = vdev_raidz_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_raidz_io_start,
	.vdev_op_io_done = vdev_raidz_io_done,
	.vdev_op_state_change = vdev_raidz_state_change,
	.vdev_op_need_resilver = vdev_raidz_need_resilver,
	.vdev_op_hold = NULL,
	.vdev_op_rele = NULL,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_raidz_xlate,
	.vdev_op_rebuild_asize = NULL,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = vdev_raidz_config_generate,
	.vdev_op_nparity = vdev_raidz_nparity,
	.vdev_op_ndisks = vdev_raidz_ndisks,
	.vdev_op_type = VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	.vdev_op_leaf = B_FALSE			/* not a leaf vdev */
};
ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW,
	"For testing, pause RAIDZ expansion after reflowing this many bytes");
ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW,
	"Max amount of concurrent i/o for RAIDZ expansion");
ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
	"For expanded RAIDZ, aggregate reads that have more rows than this");
ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
	"For expanded RAIDZ, automatically start a pool scrub when expansion "
	"is complete");