/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
 * Copyright (c) 2016 Gvozden Nešković. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa.h>
#include <sys/spa_impl.h>
#include <sys/zap.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab_impl.h>
#include <sys/zio.h>
#include <sys/zio_checksum.h>
#include <sys/dmu_tx.h>
#include <sys/abd.h>
#include <sys/zfs_rlock.h>
#include <sys/fs/zfs.h>
#include <sys/fm/fs/zfs.h>
#include <sys/vdev_raidz.h>
#include <sys/vdev_raidz_impl.h>
#include <sys/vdev_draid.h>
#include <sys/uberblock_impl.h>
#include <sys/dsl_scan.h>

#ifdef ZFS_DEBUG
#include <sys/vdev.h>	/* For vdev_xlate() in vdev_raidz_io_verify() */
#endif

/*
 * Virtual device vector for RAID-Z.
 *
 * This vdev supports single, double, and triple parity. For single parity,
 * we use a simple XOR of all the data columns. For double or triple parity,
 * we use a special case of Reed-Solomon coding. This extends the
 * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
 * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
 * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
 * former is also based. The latter is designed to provide higher performance
 * for writes.
 *
 * Note that the Plank paper claimed to support arbitrary N+M, but was then
 * amended six years later identifying a critical flaw that invalidates its
 * claims. Nevertheless, the technique can be adapted to work for up to
 * triple parity. For additional parity, the amendment "Note: Correction to
 * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
 * is viable, but the additional complexity means that write performance will
 * suffer.
 *
 * All of the methods above operate on a Galois field, defined over the
 * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all
 * elements can be expressed with a single byte. Briefly, the operations on
 * the field are defined as follows:
 *
 *   o addition (+) is represented by a bitwise XOR
 *   o subtraction (-) is therefore identical to addition: A + B = A - B
 *   o multiplication of A by 2 is defined by the following bitwise expression:
 *
 *	(A * 2)_7 = A_6
 *	(A * 2)_6 = A_5
 *	(A * 2)_5 = A_4
 *	(A * 2)_4 = A_3 + A_7
 *	(A * 2)_3 = A_2 + A_7
 *	(A * 2)_2 = A_1 + A_7
 *	(A * 2)_1 = A_0
 *	(A * 2)_0 = A_7
 *
 * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)).
 * As an aside, this multiplication is derived from the error correcting
 * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1.
 *
 * Observe that any number in the field (except for 0) can be expressed as a
 * power of 2 -- a generator for the field. We store a table of the powers of
 * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can
 * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather
 * than field addition). The inverse of a field element A (A^-1) is therefore
 * A ^ (255 - 1) = A^254.
 *
 * The up-to-three parity columns, P, Q, R over several data columns,
 * D_0, ... D_n-1, can be expressed by field operations:
 *
 *	P = D_0 + D_1 + ... + D_n-2 + D_n-1
 *	Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1
 *	  = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1
 *	R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1
 *	  = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1
 *
 * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial
 * XOR operation, and 2 and 4 can be computed quickly and generate linearly-
 * independent coefficients. (There are no additional coefficients that have
 * this property which is why the uncorrected Plank method breaks down.)
 *
 * See the reconstruction code below for how P, Q and R can be used
 * individually or in concert to recover missing data columns.
 */
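
/*
 * Illustrative sketch (not part of this module; compiled out below): a
 * minimal GF(2^8) multiply and inverse built from the power/log tables
 * described above. The names here (gf_pow2, gf_log2, gf_mul, gf_inv) are
 * hypothetical stand-ins for the vdev_raidz_pow2/vdev_raidz_log2 tables
 * that the reconstruction code in this file uses.
 */
#if 0
#include <stdint.h>

static uint8_t gf_pow2[256];	/* gf_pow2[i] = 2^i in GF(2^8) */
static uint8_t gf_log2[256];	/* gf_log2[2^i] = i */

static void
gf_tables_init(void)
{
	uint8_t x = 1;
	for (int i = 0; i < 255; i++) {
		gf_pow2[i] = x;
		gf_log2[x] = i;
		/* multiply by the generator 2, reducing by the polynomial */
		x = (x << 1) ^ ((x & 0x80) ? 0x1d : 0);
	}
	gf_pow2[255] = gf_pow2[0];
}

/* A * B = 2^(log_2(A) + log_2(B)) for nonzero A and B; 0 otherwise. */
static uint8_t
gf_mul(uint8_t a, uint8_t b)
{
	if (a == 0 || b == 0)
		return (0);
	return (gf_pow2[(gf_log2[a] + gf_log2[b]) % 255]);
}

/* A^-1 = A^254, i.e. 2^(255 - log_2(A)), for nonzero A. */
static uint8_t
gf_inv(uint8_t a)
{
	return (gf_pow2[(255 - gf_log2[a]) % 255]);
}
#endif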

#define	VDEV_RAIDZ_P		0
#define	VDEV_RAIDZ_Q		1
#define	VDEV_RAIDZ_R		2

#define	VDEV_RAIDZ_MUL_2(x)	(((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0))
#define	VDEV_RAIDZ_MUL_4(x)	(VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x)))

/*
 * We provide a mechanism to perform the field multiplication operation on a
 * 64-bit value all at once rather than a byte at a time. This works by
 * creating a mask from the top bit in each byte and using that to
 * conditionally apply the XOR of 0x1d.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}

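
/*
 * Illustrative sketch (not part of this module; compiled out below): checks
 * that the 64-bit-at-a-time multiply-by-2 above agrees with applying
 * VDEV_RAIDZ_MUL_2 to each byte independently. The helper name is
 * hypothetical.
 */
#if 0
static int
raidz_64mul_2_matches_bytewise(uint64_t x)
{
	uint64_t x64 = x, mask;

	VDEV_RAIDZ_64MUL_2(x64, mask);

	for (int i = 0; i < 8; i++) {
		uint8_t b = (x >> (8 * i)) & 0xff;
		uint8_t expected = VDEV_RAIDZ_MUL_2(b) & 0xff;

		if (((x64 >> (8 * i)) & 0xff) != expected)
			return (0);
	}
	return (1);
}
#endif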
/*
 * Big Theory Statement for how a RAIDZ VDEV is expanded
 *
 * An existing RAIDZ VDEV can be expanded by attaching a new disk. Expansion
 * works with all three RAIDZ parity choices (RAIDZ1, RAIDZ2, or RAIDZ3).
 * VDEVs that have been previously expanded can be expanded again.
 *
 * The RAIDZ VDEV must be healthy (must be able to write to all the drives in
 * the VDEV) when an expansion starts. The expansion will pause if any disk
 * in the VDEV fails, and resume once the VDEV is healthy again. All other
 * operations on the pool can continue while an expansion is in progress
 * (e.g. read/write, snapshot, zpool add, etc.), except for zpool checkpoint,
 * zpool trim, and zpool initialize, which can't be run during an expansion.
 * Following a reboot or export/import, the expansion resumes where it left
 * off.
 *
 * == Reflowing the Data ==
 *
 * The expansion involves reflowing (copying) the data from the current set
 * of disks to spread it across the new set which now has one more disk. This
 * reflow operation is similar to reflowing text when the column width of a
 * text editor window is expanded. The text doesn't change but the location of
 * the text changes to accommodate the new width. An example reflow result for
 * a 4-wide RAIDZ1 to a 5-wide is shown below.
 *
 *                             Reflow End State
 *           Each letter indicates a parity group (logical stripe)
 *
 *        Before expansion                          After Expansion
 *    D1     D2     D3     D4                D1     D2     D3     D4     D5
 * +------+------+------+------+        +------+------+------+------+------+
 * |      |      |      |      |        |      |      |      |      |      |
 * |  A   |  A   |  A   |  A   |        |  A   |  A   |  A   |  A   |  B   |
 * |     1|     2|     3|     4|        |     1|     2|     3|     4|     5|
 * +------+------+------+------+        +------+------+------+------+------+
 * |      |      |      |      |        |      |      |      |      |      |
 * |  B   |  B   |  C   |  C   |        |  B   |  C   |  C   |  C   |  C   |
 * |     5|     6|     7|     8|        |     6|     7|     8|     9|    10|
 * +------+------+------+------+        +------+------+------+------+------+
 * |      |      |      |      |        |      |      |      |      |      |
 * |  C   |  C   |  D   |  D   |        |  D   |  D   |  E   |  E   |  E   |
 * |     9|    10|    11|    12|        |    11|    12|    13|    14|    15|
 * +------+------+------+------+        +------+------+------+------+------+
 * |      |      |      |      |        |      |      |      |      |      |
 * |  E   |  E   |  E   |  E   |  -->   |  E   |  F   |  F   |  G   |  G   |
 * |    13|    14|    15|    16|        |    16|    17|    18|  p 19|    20|
 * +------+------+------+------+        +------+------+------+------+------+
 * |      |      |      |      |        |      |      |      |      |      |
 * |  F   |  F   |  G   |  G   |        |  G   |  G   |  H   |  H   |  H   |
 * |    17|    18|    19|    20|        |    21|    22|    23|    24|    25|
 * +------+------+------+------+        +------+------+------+------+------+
 * |      |      |      |      |        |      |      |      |      |      |
 * |  G   |  G   |  H   |  H   |        |  H   |  I   |  I   |  J   |  J   |
 * |    21|    22|    23|    24|        |    26|    27|    28|    29|    30|
 * +------+------+------+------+        +------+------+------+------+------+
 * |      |      |      |      |        |      |      |      |      |      |
 * |  H   |  H   |  I   |  I   |        |  J   |  J   |      |      |  K   |
 * |    25|    26|    27|    28|        |    31|    32|    33|    34|    35|
 * +------+------+------+------+        +------+------+------+------+------+
 *
 * This reflow approach has several advantages. There is no need to read or
 * modify the block pointers or recompute any block checksums. The reflow
 * doesn't need to know where the parity sectors reside. We can read and write
 * data sequentially and the copy can occur in a background thread in open
 * context. The design also allows for fast discovery of what data to copy.
 *
 * The VDEV metaslabs are processed, one at a time, to copy the block data to
 * have it flow across all the disks. The metaslab is disabled for allocations
 * during the copy. As an optimization, we only copy the allocated data which
 * can be determined by looking at the metaslab range tree. During the copy we
 * must maintain the redundancy guarantees of the RAIDZ VDEV (i.e., we still
 * need to be able to survive losing parity count disks). This means we
 * cannot overwrite data during the reflow that would be needed if a disk is
 * lost.
 *
 * After the reflow completes, all newly-written blocks will have the new
 * layout, i.e., they will have the parity to data ratio implied by the new
 * number of disks in the RAIDZ group. Even though the reflow copies all of
 * the allocated space (data and parity), it is only rearranged, not changed.
 *
 * This act of reflowing the data has a few implications about blocks
 * that were written before the reflow completes:
 *
 * - Old blocks will still use the same amount of space (i.e., they will have
 *   the parity to data ratio implied by the old number of disks in the RAIDZ
 *   group).
 * - Reading old blocks will be slightly slower than before the reflow, for
 *   two reasons. First, we will have to read from all disks in the RAIDZ
 *   VDEV, rather than being able to skip the children that contain only
 *   parity of this block (because the data of a single block is now spread
 *   out across all the disks). Second, in most cases there will be an extra
 *   bcopy, needed to rearrange the data back to its original layout in memory.
 *
 * == Scratch Area ==
 *
 * As we copy the block data, we can only progress to the point that writes
 * will not overlap with blocks whose progress has not yet been recorded on
 * disk. Since partially-copied rows are always read from the old location,
 * we need to stop one row before the sector-wise overlap, to prevent any
 * row-wise overlap. For example, in the diagram above, when we reflow sector
 * B6 it will overwrite the original location of B5.
 *
 * To get around this, a scratch space is used so that we can start copying
 * without risking data loss by overlapping the row. As an added benefit, it
 * improves performance at the beginning of the reflow, but that small perf
 * boost wouldn't be worth the complexity on its own.
 *
 * Ideally we want to copy at least 2 * (new_width)^2 so that we have a
 * separation of 2*(new_width+1) and a chunk size of new_width+2. With the max
 * RAIDZ width of 255 and 4K sectors this would be 2MB per disk. In practice
 * the widths will likely be single digits so we can get a substantial chunk
 * size using only a few MB of scratch per disk.
 *
 * The scratch area is persisted to disk which holds a large amount of reflowed
 * state. We can always read the partially written stripes when a disk fails or
 * the copy is interrupted (crash) during the initial copying phase and also
 * get past a small chunk size restriction. At a minimum, the scratch space
 * must be large enough to get us to the point that one row does not overlap
 * itself when moved (i.e. new_width^2). But going larger is even better. We
 * use the 3.5 MiB reserved "boot" space that resides after the ZFS disk labels
 * as our scratch space to handle overwriting the initial part of the VDEV.
 *
 *	0     256K   512K                    4M
 *	+------+------+-----------------------+-------------------------------
 *	| VDEV | VDEV |   Boot Block (3.5M)   |  Allocatable space ...
 *	|  L0  |  L1  |       Reserved        |      (Metaslabs)
 *	+------+------+-----------------------+-------------------------------
 *	               Scratch Area
 *
 * == Reflow Progress Updates ==
 * After the initial scratch-based reflow, the expansion process works
 * similarly to device removal. We create a new open context thread which
 * reflows the data, and periodically kicks off sync tasks to update logical
 * state. In this case, state is the committed progress (offset of next data
 * to copy). We need to persist the completed offset on disk, so that if we
 * crash we know which format each VDEV offset is in.
 *
 * == Time Dependent Geometry ==
 *
 * In non-expanded RAIDZ, blocks are read from disk in a column by column
 * fashion. For a multi-row block, the second sector is in the first column
 * not in the second column. This allows us to issue full reads for each
 * column directly into the request buffer. The block data is thus laid out
 * sequentially in a column-by-column fashion.
 *
 * For example, in the before expansion diagram above, one logical block might
 * be sectors G19-H26. The parity is in G19,H23; and the data is in
 * G20,H24,G21,H25,G22,H26.
 *
 * After a block is reflowed, the sectors that were all in the original column
 * data can now reside in different columns. When reading from an expanded
 * VDEV, we need to know the logical stripe width for each block so we can
 * reconstitute the block's data after the reads are completed. Likewise,
 * when we perform the combinatorial reconstruction we need to know the
 * original width so we can retry combinations from the past layouts.
 *
 * Time dependent geometry is what we call having blocks with different layouts
 * (stripe widths) in the same VDEV. This time-dependent geometry uses the
 * block's birth time (+ the time expansion ended) to establish the correct
 * width for a given block. After an expansion completes, we record the time
 * for blocks written with a particular width (geometry).
 *
 * == On Disk Format Changes ==
 *
 * New pool feature flag, 'raidz_expansion', whose reference count is the
 * number of RAIDZ VDEVs that have been expanded.
 *
 * The blocks on expanded RAIDZ VDEV can have different logical stripe widths.
 *
 * Since the uberblock can point to arbitrary blocks, which might be on the
 * expanding RAIDZ, and might or might not have been expanded, we need to know
 * which way a block is laid out before reading it. This info is the next
 * offset that needs to be reflowed and we persist that in the uberblock, in
 * the new ub_raidz_reflow_info field, as opposed to the MOS or the vdev label.
 * After the expansion is complete, we then use the raidz_expand_txgs array
 * (see below) to determine how to read a block and the ub_raidz_reflow_info
 * field is no longer required.
 *
 * The uberblock's ub_raidz_reflow_info field also holds the scratch space
 * state (i.e., active or not) which is also required before reading a block
 * during the initial phase of reflowing the data.
 *
 * The top-level RAIDZ VDEV has two new entries in the nvlist:
 *
 * 'raidz_expand_txgs' array: logical stripe widths by txg are recorded here
 *                            and used after the expansion is complete to
 *                            determine how to read a raidz block
 * 'raidz_expanding' boolean: present during reflow and removed after
 *                            completion; used during a spa import to resume
 *                            an unfinished expansion
 *
 * And finally the VDEV's top zap adds the following informational entries:
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME
 *   VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED
 */
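
/*
 * Illustrative sketch (not part of this module; compiled out below): two of
 * the decisions described above, reduced to arithmetic. The first mirrors
 * the row-placement test used when mapping an expanded VDEV (a row is read
 * from the old, narrower layout unless every one of its sectors lies below
 * the synced reflow offset; reflow_offset_synced is UINT64_MAX when no
 * expansion is in progress). The second is the minimum scratch size noted
 * above (new_width^2 sectors, with twice that preferred). Names are
 * hypothetical.
 */
#if 0
#include <stdint.h>
#include <stdbool.h>

static bool
raidz_row_uses_old_width(uint64_t row_start_sector, uint64_t row_cols,
    uint64_t reflow_offset_synced, uint64_t ashift)
{
	/* reflow_offset_synced is a byte offset; compare in sectors */
	return (row_start_sector + row_cols >
	    (reflow_offset_synced >> ashift));
}

static uint64_t
raidz_scratch_min_sectors(uint64_t new_width)
{
	/* one row must not overlap itself when moved */
	return (new_width * new_width);
}
#endif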

/*
 * For testing only: pause the raidz expansion after reflowing this amount.
 * (accessed by ZTS and ztest)
 */
#ifdef _KERNEL
static
#endif	/* _KERNEL */
unsigned long raidz_expand_max_reflow_bytes = 0;

/*
 * For testing only: pause the raidz expansion at a certain point.
 */
uint_t raidz_expand_pause_point = 0;

/*
 * Maximum amount of copy I/O outstanding at once.
 */
static unsigned long raidz_expand_max_copy_bytes = 10 * SPA_MAXBLOCKSIZE;

/*
 * Apply raidz map abds aggregation if the number of rows in the map is equal
 * to or greater than the value below.
 */
static unsigned long raidz_io_aggregate_rows = 4;

/*
 * Automatically start a pool scrub when a RAIDZ expansion completes in
 * order to verify the checksums of all blocks which have been copied
 * during the expansion. Automatic scrubbing is enabled by default and
 * is strongly recommended.
 */
static int zfs_scrub_after_expand = 1;

static void
vdev_raidz_row_free(raidz_row_t *rr)
{
	for (int c = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		if (rc->rc_size != 0)
			abd_free(rc->rc_abd);
		if (rc->rc_orig_data != NULL)
			abd_free(rc->rc_orig_data);
	}

	if (rr->rr_abd_empty != NULL)
		abd_free(rr->rr_abd_empty);

	kmem_free(rr, offsetof(raidz_row_t, rr_col[rr->rr_scols]));
}

void
vdev_raidz_map_free(raidz_map_t *rm)
{
	for (int i = 0; i < rm->rm_nrows; i++)
		vdev_raidz_row_free(rm->rm_row[i]);

	if (rm->rm_nphys_cols) {
		for (int i = 0; i < rm->rm_nphys_cols; i++) {
			if (rm->rm_phys_col[i].rc_abd != NULL)
				abd_free(rm->rm_phys_col[i].rc_abd);
		}

		kmem_free(rm->rm_phys_col, sizeof (raidz_col_t) *
		    rm->rm_nphys_cols);
	}

	ASSERT3P(rm->rm_lr, ==, NULL);
	kmem_free(rm, offsetof(raidz_map_t, rm_row[rm->rm_nrows]));
}

static void
vdev_raidz_map_free_vsd(zio_t *zio)
{
	raidz_map_t *rm = zio->io_vsd;

	vdev_raidz_map_free(rm);
}

static int
vdev_raidz_reflow_compare(const void *x1, const void *x2)
{
	const reflow_node_t *l = x1;
	const reflow_node_t *r = x2;

	return (TREE_CMP(l->re_txg, r->re_txg));
}

const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	.vsd_free = vdev_raidz_map_free_vsd,
};

raidz_row_t *
vdev_raidz_row_alloc(int cols)
{
	raidz_row_t *rr =
	    kmem_zalloc(offsetof(raidz_row_t, rr_col[cols]), KM_SLEEP);

	rr->rr_cols = cols;
	rr->rr_scols = cols;

	for (int c = 0; c < cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		rc->rc_shadow_devidx = INT_MAX;
		rc->rc_shadow_offset = UINT64_MAX;
		rc->rc_allow_repair = 1;
	}
	return (rr);
}

static void
vdev_raidz_map_alloc_write(zio_t *zio, raidz_map_t *rm, uint64_t ashift)
{
	int c;
	int nwrapped = 0;
	uint64_t off = 0;
	raidz_row_t *rr = rm->rm_row[0];

	ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
	ASSERT3U(rm->rm_nrows, ==, 1);

	/*
	 * Pad any parity columns with additional space to account for skip
	 * sectors.
	 */
	if (rm->rm_skipstart < rr->rr_firstdatacol) {
		ASSERT0(rm->rm_skipstart);
		nwrapped = rm->rm_nskip;
	} else if (rr->rr_scols < (rm->rm_skipstart + rm->rm_nskip)) {
		nwrapped =
		    (rm->rm_skipstart + rm->rm_nskip) % rr->rr_scols;
	}

	/*
	 * Optional single skip sectors (rc_size == 0) will be handled in
	 * vdev_raidz_io_start_write().
	 */
	int skipped = rr->rr_scols - rr->rr_cols;

	/* Allocate buffers for the parity columns */
	for (c = 0; c < rr->rr_firstdatacol; c++) {
		raidz_col_t *rc = &rr->rr_col[c];

		/*
		 * Parity columns will pad out a linear ABD to account for
		 * the skip sector. A linear ABD is used here because
		 * parity calculations use the ABD buffer directly to calculate
		 * parity. This avoids doing a memcpy back to the ABD after the
		 * parity has been calculated. By issuing the parity column
		 * with the skip sector we can reduce contention on the child
		 * VDEV queue locks (vq_lock).
		 */
		if (c < nwrapped) {
			rc->rc_abd = abd_alloc_linear(
			    rc->rc_size + (1ULL << ashift), B_FALSE);
			abd_zero_off(rc->rc_abd, rc->rc_size, 1ULL << ashift);
			skipped++;
		} else {
			rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
		}
	}

	for (off = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		abd_t *abd = abd_get_offset_struct(&rc->rc_abdstruct,
		    zio->io_abd, off, rc->rc_size);

		/*
		 * Generate I/O for skip sectors to improve aggregation
		 * continuity. We will use gang ABD's to reduce contention
		 * on the child VDEV queue locks (vq_lock) by issuing
		 * a single I/O that contains the data and skip sector.
		 *
		 * It is important to make sure that rc_size is not updated
		 * even though we are adding a skip sector to the ABD. When
		 * calculating the parity in vdev_raidz_generate_parity_row()
		 * the rc_size is used to iterate through the ABD's. We can
		 * not have zero'd out skip sectors used for calculating
		 * parity for raidz, because those same sectors are not used
		 * during reconstruction.
		 */
		if (c >= rm->rm_skipstart && skipped < rm->rm_nskip) {
			rc->rc_abd = abd_alloc_gang();
			abd_gang_add(rc->rc_abd, abd, B_TRUE);
			abd_gang_add(rc->rc_abd,
			    abd_get_zeros(1ULL << ashift), B_TRUE);
			skipped++;
		} else {
			rc->rc_abd = abd;
		}
		off += rc->rc_size;
	}

	ASSERT3U(off, ==, zio->io_size);
	ASSERT3S(skipped, ==, rm->rm_nskip);
}

static void
vdev_raidz_map_alloc_read(zio_t *zio, raidz_map_t *rm)
{
	int c;
	raidz_row_t *rr = rm->rm_row[0];

	ASSERT3U(rm->rm_nrows, ==, 1);

	/* Allocate buffers for the parity columns */
	for (c = 0; c < rr->rr_firstdatacol; c++)
		rr->rr_col[c].rc_abd =
		    abd_alloc_linear(rr->rr_col[c].rc_size, B_FALSE);

	for (uint64_t off = 0; c < rr->rr_cols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		rc->rc_abd = abd_get_offset_struct(&rc->rc_abdstruct,
		    zio->io_abd, off, rc->rc_size);
		off += rc->rc_size;
	}
}

/*
 * Divides the IO evenly across all child vdevs; usually, dcols is
 * the number of children in the target vdev.
 *
 * Avoid inlining the function to keep vdev_raidz_io_start(), which
 * is this function's only caller, as small as possible on the stack.
 */
noinline raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols,
    uint64_t nparity)
{
	raidz_row_t *rr;
	/* The starting RAIDZ (parent) vdev sector of the block. */
	uint64_t b = zio->io_offset >> ashift;
	/* The zio's size in units of the vdev's minimum sector size. */
	uint64_t s = zio->io_size >> ashift;
	/* The first column for this stripe. */
	uint64_t f = b % dcols;
	/* The starting byte offset on each child vdev. */
	uint64_t o = (b / dcols) << ashift;
	uint64_t acols, scols;

	raidz_map_t *rm =
	    kmem_zalloc(offsetof(raidz_map_t, rm_row[1]), KM_SLEEP);
	rm->rm_nrows = 1;

	/*
	 * "Quotient": The number of data sectors for this stripe on all but
	 * the "big column" child vdevs that also contain "remainder" data.
	 */
	uint64_t q = s / (dcols - nparity);

	/*
	 * "Remainder": The number of partial stripe data sectors in this I/O.
	 * This will add a sector to some, but not all, child vdevs.
	 */
	uint64_t r = s - q * (dcols - nparity);

	/* The number of "big columns" - those which contain remainder data. */
	uint64_t bc = (r == 0 ? 0 : r + nparity);

	/*
	 * The total number of data and parity sectors associated with
	 * this I/O.
	 */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));

	/*
	 * acols: The columns that will be accessed.
	 * scols: The columns that will be accessed or skipped.
	 */
	if (q == 0) {
		/* Our I/O request doesn't span all child vdevs. */
		acols = bc;
		scols = MIN(dcols, roundup(bc, nparity + 1));
	} else {
		acols = dcols;
		scols = dcols;
	}

	ASSERT3U(acols, <=, scols);
	rr = vdev_raidz_row_alloc(scols);
	rm->rm_row[0] = rr;
	rr->rr_cols = acols;
	rr->rr_bigcols = bc;
	rr->rr_firstdatacol = nparity;
#ifdef ZFS_DEBUG
	rr->rr_offset = zio->io_offset;
	rr->rr_size = zio->io_size;
#endif

	uint64_t asize = 0;

	for (uint64_t c = 0; c < scols; c++) {
		raidz_col_t *rc = &rr->rr_col[c];
		uint64_t col = f + c;
		uint64_t coff = o;
		if (col >= dcols) {
			col -= dcols;
			coff += 1ULL << ashift;
		}
		rc->rc_devidx = col;
		rc->rc_offset = coff;

		if (c >= acols)
			rc->rc_size = 0;
		else if (c < bc)
			rc->rc_size = (q + 1) << ashift;
		else
			rc->rc_size = q << ashift;

		asize += rc->rc_size;
	}

	ASSERT3U(asize, ==, tot << ashift);
	rm->rm_nskip = roundup(tot, nparity + 1) - tot;
	rm->rm_skipstart = bc;

	/*
	 * If all data stored spans all columns, there's a danger that parity
	 * will always be on the same device and, since parity isn't read
	 * during normal operation, that device's I/O bandwidth won't be
	 * used effectively. We therefore switch the parity every 1MB.
	 *
	 * ... at least that was, ostensibly, the theory. As a practical
	 * matter unless we juggle the parity between all devices evenly, we
	 * won't see any benefit. Further, occasional writes that aren't a
	 * multiple of the LCM of the number of children and the minimum
	 * stripe width are sufficient to avoid pessimal behavior.
	 * Unfortunately, this decision created an implicit on-disk format
	 * requirement that we need to support for all eternity, but only
	 * for single-parity RAID-Z.
	 *
	 * If we intend to skip a sector in the zeroth column for padding
	 * we must make sure to note this swap. We will never intend to
	 * skip the first column since at least one data and one parity
	 * column must appear in each row.
	 */
	ASSERT(rr->rr_cols >= 2);
	ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);

	if (rr->rr_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) {
		uint64_t devidx = rr->rr_col[0].rc_devidx;
		o = rr->rr_col[0].rc_offset;
		rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
		rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
		rr->rr_col[1].rc_devidx = devidx;
		rr->rr_col[1].rc_offset = o;
		if (rm->rm_skipstart == 0)
			rm->rm_skipstart = 1;
	}

	if (zio->io_type == ZIO_TYPE_WRITE) {
		vdev_raidz_map_alloc_write(zio, rm, ashift);
	} else {
		vdev_raidz_map_alloc_read(zio, rm);
	}
	/* init RAIDZ parity ops */
	rm->rm_ops = vdev_raidz_math_get_ops();

	return (rm);
}

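/*
 * Illustrative sketch (not part of this module; compiled out below): the
 * stripe geometry that vdev_raidz_map_alloc() derives above, worked for a
 * small example (a 24K write to a 6-wide RAIDZ2 with 4K sectors). Only
 * standard C is used; the variable names mirror the ones in the function.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t io_size = 24ULL * 1024;	/* 24K write */
	uint64_t ashift = 12;			/* 4K sectors */
	uint64_t dcols = 6, nparity = 2;	/* 6-wide RAIDZ2 */

	uint64_t s = io_size >> ashift;			/* 6 data sectors */
	uint64_t q = s / (dcols - nparity);		/* 1 full stripe */
	uint64_t r = s - q * (dcols - nparity);		/* 2 remainder */
	uint64_t bc = (r == 0 ? 0 : r + nparity);	/* 4 big columns */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));	/* 10 */
	/* roundup(tot, nparity + 1) - tot, as in the function above */
	uint64_t nskip =
	    ((tot + nparity) / (nparity + 1)) * (nparity + 1) - tot;	/* 2 */

	printf("q=%llu r=%llu bc=%llu tot=%llu nskip=%llu\n",
	    (unsigned long long)q, (unsigned long long)r,
	    (unsigned long long)bc, (unsigned long long)tot,
	    (unsigned long long)nskip);
	return (0);
}
#endif
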
704/*
705 * Everything before reflow_offset_synced should have been moved to the new
706 * location (read and write completed). However, this may not yet be reflected
707 * in the on-disk format (e.g. raidz_reflow_sync() has been called but the
708 * uberblock has not yet been written). If reflow is not in progress,
709 * reflow_offset_synced should be UINT64_MAX. For each row, if the row is
710 * entirely before reflow_offset_synced, it will come from the new location.
711 * Otherwise this row will come from the old location. Therefore, rows that
712 * straddle the reflow_offset_synced will come from the old location.
713 *
714 * For writes, reflow_offset_next is the next offset to copy. If a sector has
715 * been copied, but not yet reflected in the on-disk progress
716 * (reflow_offset_synced), it will also be written to the new (already copied)
717 * offset.
718 */
719noinline raidz_map_t *
720vdev_raidz_map_alloc_expanded(zio_t *zio,
721 uint64_t ashift, uint64_t physical_cols, uint64_t logical_cols,
722 uint64_t nparity, uint64_t reflow_offset_synced,
723 uint64_t reflow_offset_next, boolean_t use_scratch)
724{
725 abd_t *abd = zio->io_abd;
726 uint64_t offset = zio->io_offset;
727 uint64_t size = zio->io_size;
728
729 /* The zio's size in units of the vdev's minimum sector size. */
730 uint64_t s = size >> ashift;
731
732 /*
733 * "Quotient": The number of data sectors for this stripe on all but
734 * the "big column" child vdevs that also contain "remainder" data.
735 * AKA "full rows"
736 */
737 uint64_t q = s / (logical_cols - nparity);
738
739 /*
740 * "Remainder": The number of partial stripe data sectors in this I/O.
741 * This will add a sector to some, but not all, child vdevs.
742 */
743 uint64_t r = s - q * (logical_cols - nparity);
744
745 /* The number of "big columns" - those which contain remainder data. */
746 uint64_t bc = (r == 0 ? 0 : r + nparity);
747
748 /*
749 * The total number of data and parity sectors associated with
750 * this I/O.
751 */
752 uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));
753
754 /* How many rows contain data (not skip) */
755 uint64_t rows = howmany(tot, logical_cols);
756 int cols = MIN(tot, logical_cols);
757
758 raidz_map_t *rm =
759 kmem_zalloc(offsetof(raidz_map_t, rm_row[rows]),
760 KM_SLEEP);
761 rm->rm_nrows = rows;
762 rm->rm_nskip = roundup(tot, nparity + 1) - tot;
763 rm->rm_skipstart = bc;
764 uint64_t asize = 0;
765
766 for (uint64_t row = 0; row < rows; row++) {
767 boolean_t row_use_scratch = B_FALSE;
768 raidz_row_t *rr = vdev_raidz_row_alloc(cols);
769 rm->rm_row[row] = rr;
770
771 /* The starting RAIDZ (parent) vdev sector of the row. */
772 uint64_t b = (offset >> ashift) + row * logical_cols;
773
774 /*
775 * If we are in the middle of a reflow, and the copying has
776 * not yet completed for any part of this row, then use the
777 * old location of this row. Note that reflow_offset_synced
778 * reflects the i/o that's been completed, because it's
779 * updated by a synctask, after zio_wait(spa_txg_zio[]).
780 * This is sufficient for our check, even if that progress
781 * has not yet been recorded to disk (reflected in
782 * spa_ubsync). Also note that we consider the last row to
783 * be "full width" (`cols`-wide rather than `bc`-wide) for
784 * this calculation. This causes a tiny bit of unnecessary
785 * double-writes but is safe and simpler to calculate.
786 */
787 int row_phys_cols = physical_cols;
788 if (b + cols > reflow_offset_synced >> ashift)
789 row_phys_cols--;
790 else if (use_scratch)
791 row_use_scratch = B_TRUE;
792
793 /* starting child of this row */
794 uint64_t child_id = b % row_phys_cols;
795 /* The starting byte offset on each child vdev. */
796 uint64_t child_offset = (b / row_phys_cols) << ashift;
797
798 /*
799 * Note, rr_cols is the entire width of the block, even
800 * if this row is shorter. This is needed because parity
801 * generation (for Q and R) needs to know the entire width,
802 * because it treats the short row as though it was
803 * full-width (and the "phantom" sectors were zero-filled).
804 *
805 * Another approach to this would be to set cols shorter
806 * (to just the number of columns that we might do i/o to)
807 * and have another mechanism to tell the parity generation
808 * about the "entire width". Reconstruction (at least
809 * vdev_raidz_reconstruct_general()) would also need to
810 * know about the "entire width".
811 */
812 rr->rr_firstdatacol = nparity;
813#ifdef ZFS_DEBUG
814 /*
815 * note: rr_size is PSIZE, not ASIZE
816 */
817 rr->rr_offset = b << ashift;
818 rr->rr_size = (rr->rr_cols - rr->rr_firstdatacol) << ashift;
819#endif
820
821 for (int c = 0; c < rr->rr_cols; c++, child_id++) {
822 if (child_id >= row_phys_cols) {
823 child_id -= row_phys_cols;
824 child_offset += 1ULL << ashift;
825 }
826 raidz_col_t *rc = &rr->rr_col[c];
827 rc->rc_devidx = child_id;
828 rc->rc_offset = child_offset;
829
830 /*
831 * Get this from the scratch space if appropriate.
832 * This only happens if we crashed in the middle of
833 * raidz_reflow_scratch_sync() (while it's running,
834 * the rangelock prevents us from doing concurrent
835 * io), and even then only during zpool import or
836 * when the pool is imported readonly.
837 */
838 if (row_use_scratch)
839 rc->rc_offset -= VDEV_BOOT_SIZE;
840
841 uint64_t dc = c - rr->rr_firstdatacol;
842 if (c < rr->rr_firstdatacol) {
843 rc->rc_size = 1ULL << ashift;
844
845 /*
846 * Parity sectors' rc_abd's are set below
847 * after determining if this is an aggregation.
848 */
849 } else if (row == rows - 1 && bc != 0 && c >= bc) {
850 /*
851 * Past the end of the block (even including
852 * skip sectors). This sector is part of the
853 * map so that we have full rows for p/q parity
854 * generation.
855 */
856 rc->rc_size = 0;
857 rc->rc_abd = NULL;
858 } else {
859 /* "data column" (col excluding parity) */
860 uint64_t off;
861
862 if (c < bc || r == 0) {
863 off = dc * rows + row;
864 } else {
865 off = r * rows +
866 (dc - r) * (rows - 1) + row;
867 }
868 rc->rc_size = 1ULL << ashift;
869 rc->rc_abd = abd_get_offset_struct(
870 &rc->rc_abdstruct, abd, off << ashift,
871 rc->rc_size);
872 }
873
874 if (rc->rc_size == 0)
875 continue;
876
877 /*
878 * If any part of this row is in both old and new
879 * locations, the primary location is the old
880 * location. If this sector was already copied to the
881 * new location, we need to also write to the new,
882 * "shadow" location.
883 *
884 * Note, `row_phys_cols != physical_cols` indicates
885 * that the primary location is the old location.
886 * `b+c < reflow_offset_next` indicates that the copy
887 * to the new location has been initiated. We know
888 * that the copy has completed because we have the
889 * rangelock, which is held exclusively while the
890 * copy is in progress.
891 */
892 if (row_use_scratch ||
893 (row_phys_cols != physical_cols &&
894 b + c < reflow_offset_next >> ashift)) {
895 rc->rc_shadow_devidx = (b + c) % physical_cols;
896 rc->rc_shadow_offset =
897 ((b + c) / physical_cols) << ashift;
898 if (row_use_scratch)
899 rc->rc_shadow_offset -= VDEV_BOOT_SIZE;
900 }
901
902 asize += rc->rc_size;
903 }
904
905 /*
906 * See comment in vdev_raidz_map_alloc()
907 */
908 if (rr->rr_firstdatacol == 1 && rr->rr_cols > 1 &&
909 (offset & (1ULL << 20))) {
910 ASSERT(rr->rr_cols >= 2);
911 ASSERT(rr->rr_col[0].rc_size == rr->rr_col[1].rc_size);
912
913 int devidx0 = rr->rr_col[0].rc_devidx;
914 uint64_t offset0 = rr->rr_col[0].rc_offset;
915 int shadow_devidx0 = rr->rr_col[0].rc_shadow_devidx;
916 uint64_t shadow_offset0 =
917 rr->rr_col[0].rc_shadow_offset;
918
919 rr->rr_col[0].rc_devidx = rr->rr_col[1].rc_devidx;
920 rr->rr_col[0].rc_offset = rr->rr_col[1].rc_offset;
921 rr->rr_col[0].rc_shadow_devidx =
922 rr->rr_col[1].rc_shadow_devidx;
923 rr->rr_col[0].rc_shadow_offset =
924 rr->rr_col[1].rc_shadow_offset;
925
926 rr->rr_col[1].rc_devidx = devidx0;
927 rr->rr_col[1].rc_offset = offset0;
928 rr->rr_col[1].rc_shadow_devidx = shadow_devidx0;
929 rr->rr_col[1].rc_shadow_offset = shadow_offset0;
930 }
931 }
932 ASSERT3U(asize, ==, tot << ashift);
933
934 /*
935 * Determine if the block is contiguous, in which case we can use
936 * an aggregation.
937 */
938 if (rows >= raidz_io_aggregate_rows) {
939 rm->rm_nphys_cols = physical_cols;
940 rm->rm_phys_col =
941 kmem_zalloc(sizeof (raidz_col_t) * rm->rm_nphys_cols,
942 KM_SLEEP);
943
944 /*
945 * Determine the aggregate io's offset and size, and check
946 * that the io is contiguous.
947 */
948 for (int i = 0;
949 i < rm->rm_nrows && rm->rm_phys_col != NULL; i++) {
950 raidz_row_t *rr = rm->rm_row[i];
951 for (int c = 0; c < rr->rr_cols; c++) {
952 raidz_col_t *rc = &rr->rr_col[c];
953 raidz_col_t *prc =
954 &rm->rm_phys_col[rc->rc_devidx];
955
956 if (rc->rc_size == 0)
957 continue;
958
959 if (prc->rc_size == 0) {
960 ASSERT0(prc->rc_offset);
961 prc->rc_offset = rc->rc_offset;
962 } else if (prc->rc_offset + prc->rc_size !=
963 rc->rc_offset) {
964 /*
965 * This block is not contiguous and
966 * therefore can't be aggregated.
967 * This is expected to be rare, so
968 * the cost of allocating and then
969 * freeing rm_phys_col is not
970 * significant.
971 */
972 kmem_free(rm->rm_phys_col,
973 sizeof (raidz_col_t) *
974 rm->rm_nphys_cols);
975 rm->rm_phys_col = NULL;
976 rm->rm_nphys_cols = 0;
977 break;
978 }
979 prc->rc_size += rc->rc_size;
980 }
981 }
982 }
983 if (rm->rm_phys_col != NULL) {
984 /*
985 * Allocate aggregate ABD's.
986 */
987 for (int i = 0; i < rm->rm_nphys_cols; i++) {
988 raidz_col_t *prc = &rm->rm_phys_col[i];
989
990 prc->rc_devidx = i;
345196be 991
5caeef02
DB
992 if (prc->rc_size == 0)
993 continue;
994
995 prc->rc_abd =
996 abd_alloc_linear(rm->rm_phys_col[i].rc_size,
997 B_FALSE);
998 }
999
1000 /*
1001 * Point the parity abd's into the aggregate abd's.
1002 */
1003 for (int i = 0; i < rm->rm_nrows; i++) {
1004 raidz_row_t *rr = rm->rm_row[i];
1005 for (int c = 0; c < rr->rr_firstdatacol; c++) {
1006 raidz_col_t *rc = &rr->rr_col[c];
1007 raidz_col_t *prc =
1008 &rm->rm_phys_col[rc->rc_devidx];
1009 rc->rc_abd =
1010 abd_get_offset_struct(&rc->rc_abdstruct,
1011 prc->rc_abd,
1012 rc->rc_offset - prc->rc_offset,
1013 rc->rc_size);
1014 }
1015 }
1016 } else {
1017 /*
1018 * Allocate new abd's for the parity sectors.
1019 */
1020 for (int i = 0; i < rm->rm_nrows; i++) {
1021 raidz_row_t *rr = rm->rm_row[i];
1022 for (int c = 0; c < rr->rr_firstdatacol; c++) {
1023 raidz_col_t *rc = &rr->rr_col[c];
1024 rc->rc_abd =
1025 abd_alloc_linear(rc->rc_size,
1026 B_TRUE);
1027 }
1028 }
1029 }
c9187d86
GN
1030 /* init RAIDZ parity ops */
1031 rm->rm_ops = vdev_raidz_math_get_ops();
ab9f4b0b 1032
34dc7c2f
BB
1033 return (rm);
1034}
1035
a6255b7f
DQ
1036struct pqr_struct {
1037 uint64_t *p;
1038 uint64_t *q;
1039 uint64_t *r;
1040};
1041
1042static int
1043vdev_raidz_p_func(void *buf, size_t size, void *private)
1044{
1045 struct pqr_struct *pqr = private;
1046 const uint64_t *src = buf;
5caeef02 1047 int cnt = size / sizeof (src[0]);
a6255b7f
DQ
1048
1049 ASSERT(pqr->p && !pqr->q && !pqr->r);
1050
5caeef02 1051 for (int i = 0; i < cnt; i++, src++, pqr->p++)
a6255b7f
DQ
1052 *pqr->p ^= *src;
1053
1054 return (0);
1055}
1056
1057static int
1058vdev_raidz_pq_func(void *buf, size_t size, void *private)
1059{
1060 struct pqr_struct *pqr = private;
1061 const uint64_t *src = buf;
1062 uint64_t mask;
5caeef02 1063 int cnt = size / sizeof (src[0]);
a6255b7f
DQ
1064
1065 ASSERT(pqr->p && pqr->q && !pqr->r);
1066
5caeef02 1067 for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) {
a6255b7f
DQ
1068 *pqr->p ^= *src;
1069 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1070 *pqr->q ^= *src;
1071 }
1072
1073 return (0);
1074}
1075
1076static int
1077vdev_raidz_pqr_func(void *buf, size_t size, void *private)
1078{
1079 struct pqr_struct *pqr = private;
1080 const uint64_t *src = buf;
1081 uint64_t mask;
5caeef02 1082 int cnt = size / sizeof (src[0]);
a6255b7f
DQ
1083
1084 ASSERT(pqr->p && pqr->q && pqr->r);
1085
5caeef02 1086 for (int i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
a6255b7f
DQ
1087 *pqr->p ^= *src;
1088 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
1089 *pqr->q ^= *src;
1090 VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
1091 *pqr->r ^= *src;
1092 }
1093
1094 return (0);
1095}
1096
34dc7c2f 1097static void
b2255edc 1098vdev_raidz_generate_parity_p(raidz_row_t *rr)
34dc7c2f 1099{
b2255edc 1100 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
34dc7c2f 1101
b2255edc
BB
1102 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1103 abd_t *src = rr->rr_col[c].rc_abd;
34dc7c2f 1104
b2255edc
BB
1105 if (c == rr->rr_firstdatacol) {
1106 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
34dc7c2f 1107 } else {
a6255b7f 1108 struct pqr_struct pqr = { p, NULL, NULL };
b2255edc 1109 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
a6255b7f 1110 vdev_raidz_p_func, &pqr);
34dc7c2f
BB
1111 }
1112 }
1113}
1114
1115static void
b2255edc 1116vdev_raidz_generate_parity_pq(raidz_row_t *rr)
34dc7c2f 1117{
b2255edc
BB
1118 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1119 uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1120 uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
1121 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1122 rr->rr_col[VDEV_RAIDZ_Q].rc_size);
34dc7c2f 1123
b2255edc
BB
1124 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1125 abd_t *src = rr->rr_col[c].rc_abd;
45d1cae3 1126
b2255edc 1127 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
34dc7c2f 1128
b2255edc 1129 if (c == rr->rr_firstdatacol) {
f7e76821 1130 ASSERT(ccnt == pcnt || ccnt == 0);
b2255edc
BB
1131 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1132 (void) memcpy(q, p, rr->rr_col[c].rc_size);
45d1cae3 1133
b2255edc 1134 for (uint64_t i = ccnt; i < pcnt; i++) {
a6255b7f
DQ
1135 p[i] = 0;
1136 q[i] = 0;
45d1cae3 1137 }
a6255b7f 1138 } else {
f7e76821
IH
1139 struct pqr_struct pqr = { p, q, NULL };
1140
1141 ASSERT(ccnt <= pcnt);
b2255edc 1142 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
f7e76821 1143 vdev_raidz_pq_func, &pqr);
45d1cae3
BB
1144
1145 /*
1146 * Treat short columns as though they are full of 0s.
1147 * Note that there's therefore nothing needed for P.
1148 */
b2255edc
BB
1149 uint64_t mask;
1150 for (uint64_t i = ccnt; i < pcnt; i++) {
a6255b7f 1151 VDEV_RAIDZ_64MUL_2(q[i], mask);
45d1cae3
BB
1152 }
1153 }
1154 }
1155}
1156
1157static void
b2255edc 1158vdev_raidz_generate_parity_pqr(raidz_row_t *rr)
45d1cae3 1159{
b2255edc
BB
1160 uint64_t *p = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1161 uint64_t *q = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1162 uint64_t *r = abd_to_buf(rr->rr_col[VDEV_RAIDZ_R].rc_abd);
1163 uint64_t pcnt = rr->rr_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]);
1164 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1165 rr->rr_col[VDEV_RAIDZ_Q].rc_size);
1166 ASSERT(rr->rr_col[VDEV_RAIDZ_P].rc_size ==
1167 rr->rr_col[VDEV_RAIDZ_R].rc_size);
45d1cae3 1168
b2255edc
BB
1169 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1170 abd_t *src = rr->rr_col[c].rc_abd;
45d1cae3 1171
b2255edc 1172 uint64_t ccnt = rr->rr_col[c].rc_size / sizeof (p[0]);
45d1cae3 1173
b2255edc 1174 if (c == rr->rr_firstdatacol) {
f7e76821 1175 ASSERT(ccnt == pcnt || ccnt == 0);
b2255edc
BB
1176 abd_copy_to_buf(p, src, rr->rr_col[c].rc_size);
1177 (void) memcpy(q, p, rr->rr_col[c].rc_size);
1178 (void) memcpy(r, p, rr->rr_col[c].rc_size);
45d1cae3 1179
b2255edc 1180 for (uint64_t i = ccnt; i < pcnt; i++) {
a6255b7f
DQ
1181 p[i] = 0;
1182 q[i] = 0;
1183 r[i] = 0;
34dc7c2f 1184 }
a6255b7f 1185 } else {
f7e76821
IH
1186 struct pqr_struct pqr = { p, q, r };
1187
1188 ASSERT(ccnt <= pcnt);
b2255edc 1189 (void) abd_iterate_func(src, 0, rr->rr_col[c].rc_size,
f7e76821
IH
1190 vdev_raidz_pqr_func, &pqr);
1191
34dc7c2f
BB
1192 /*
1193 * Treat short columns as though they are full of 0s.
45d1cae3 1194 * Note that there's therefore nothing needed for P.
34dc7c2f 1195 */
b2255edc
BB
1196 uint64_t mask;
1197 for (uint64_t i = ccnt; i < pcnt; i++) {
a6255b7f
DQ
1198 VDEV_RAIDZ_64MUL_2(q[i], mask);
1199 VDEV_RAIDZ_64MUL_4(r[i], mask);
34dc7c2f
BB
1200 }
1201 }
1202 }
1203}
1204
45d1cae3
BB
1205/*
1206 * Generate RAID parity in the first virtual columns according to the number of
1207 * parity columns available.
1208 */
ab9f4b0b 1209void
b2255edc 1210vdev_raidz_generate_parity_row(raidz_map_t *rm, raidz_row_t *rr)
45d1cae3 1211{
5caeef02
DB
1212 if (rr->rr_cols == 0) {
1213 /*
1214 * We are handling this block one row at a time (because
1215 * this block has a different logical vs physical width,
1216 * due to RAIDZ expansion), and this is a pad-only row,
1217 * which has no parity.
1218 */
1219 return;
1220 }
b2255edc 1221
c9187d86 1222 /* Generate using the new math implementation */
b2255edc 1223 if (vdev_raidz_math_generate(rm, rr) != RAIDZ_ORIGINAL_IMPL)
ab9f4b0b 1224 return;
ab9f4b0b 1225
b2255edc 1226 switch (rr->rr_firstdatacol) {
45d1cae3 1227 case 1:
b2255edc 1228 vdev_raidz_generate_parity_p(rr);
45d1cae3
BB
1229 break;
1230 case 2:
b2255edc 1231 vdev_raidz_generate_parity_pq(rr);
45d1cae3
BB
1232 break;
1233 case 3:
b2255edc 1234 vdev_raidz_generate_parity_pqr(rr);
45d1cae3
BB
1235 break;
1236 default:
1237 cmn_err(CE_PANIC, "invalid RAID-Z configuration");
1238 }
1239}
1240
b2255edc
BB
1241void
1242vdev_raidz_generate_parity(raidz_map_t *rm)
1243{
1244 for (int i = 0; i < rm->rm_nrows; i++) {
1245 raidz_row_t *rr = rm->rm_row[i];
1246 vdev_raidz_generate_parity_row(rm, rr);
1247 }
1248}
1249
a6255b7f
DQ
1250static int
1251vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private)
1252{
14e4e3cb 1253 (void) private;
a6255b7f
DQ
1254 uint64_t *dst = dbuf;
1255 uint64_t *src = sbuf;
1256 int cnt = size / sizeof (src[0]);
a6255b7f 1257
1c27024e 1258 for (int i = 0; i < cnt; i++) {
a6255b7f
DQ
1259 dst[i] ^= src[i];
1260 }
1261
1262 return (0);
1263}
1264
a6255b7f
DQ
1265static int
1266vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size,
1267 void *private)
1268{
14e4e3cb 1269 (void) private;
a6255b7f
DQ
1270 uint64_t *dst = dbuf;
1271 uint64_t *src = sbuf;
1272 uint64_t mask;
1273 int cnt = size / sizeof (dst[0]);
a6255b7f 1274
1c27024e 1275 for (int i = 0; i < cnt; i++, dst++, src++) {
a6255b7f
DQ
1276 VDEV_RAIDZ_64MUL_2(*dst, mask);
1277 *dst ^= *src;
1278 }
1279
1280 return (0);
1281}
1282
a6255b7f
DQ
1283static int
1284vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private)
1285{
14e4e3cb 1286 (void) private;
a6255b7f
DQ
1287 uint64_t *dst = buf;
1288 uint64_t mask;
1289 int cnt = size / sizeof (dst[0]);
a6255b7f 1290
1c27024e 1291 for (int i = 0; i < cnt; i++, dst++) {
a6255b7f
DQ
1292 /* same operation as vdev_raidz_reconst_q_pre_func() on dst */
1293 VDEV_RAIDZ_64MUL_2(*dst, mask);
1294 }
1295
1296 return (0);
1297}
1298
1299struct reconst_q_struct {
1300 uint64_t *q;
1301 int exp;
1302};
1303
1304static int
1305vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private)
1306{
1307 struct reconst_q_struct *rq = private;
1308 uint64_t *dst = buf;
1309 int cnt = size / sizeof (dst[0]);
a6255b7f 1310
1c27024e 1311 for (int i = 0; i < cnt; i++, dst++, rq->q++) {
a6255b7f
DQ
1312 int j;
1313 uint8_t *b;
1314
1315 *dst ^= *rq->q;
1316 for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) {
1317 *b = vdev_raidz_exp2(*b, rq->exp);
1318 }
1319 }
1320
1321 return (0);
1322}
1323
1324struct reconst_pq_struct {
1325 uint8_t *p;
1326 uint8_t *q;
1327 uint8_t *pxy;
1328 uint8_t *qxy;
1329 int aexp;
1330 int bexp;
1331};
1332
1333static int
1334vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private)
1335{
1336 struct reconst_pq_struct *rpq = private;
1337 uint8_t *xd = xbuf;
1338 uint8_t *yd = ybuf;
a6255b7f 1339
1c27024e 1340 for (int i = 0; i < size;
a6255b7f
DQ
1341 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) {
1342 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1343 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1344 *yd = *rpq->p ^ *rpq->pxy ^ *xd;
1345 }
1346
1347 return (0);
1348}
1349
1350static int
1351vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private)
1352{
1353 struct reconst_pq_struct *rpq = private;
1354 uint8_t *xd = xbuf;
a6255b7f 1355
1c27024e 1356 for (int i = 0; i < size;
a6255b7f
DQ
1357 i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) {
1358 /* same operation as vdev_raidz_reconst_pq_func() on xd */
1359 *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^
1360 vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp);
1361 }
1362
1363 return (0);
1364}
1365
46df6e98 1366static void
b2255edc 1367vdev_raidz_reconstruct_p(raidz_row_t *rr, int *tgts, int ntgts)
34dc7c2f 1368{
45d1cae3 1369 int x = tgts[0];
a6255b7f 1370 abd_t *dst, *src;
34dc7c2f 1371
5caeef02
DB
1372 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1373 zfs_dbgmsg("reconstruct_p(rm=%px x=%u)", rr, x);
1374
b2255edc
BB
1375 ASSERT3U(ntgts, ==, 1);
1376 ASSERT3U(x, >=, rr->rr_firstdatacol);
1377 ASSERT3U(x, <, rr->rr_cols);
45d1cae3 1378
b2255edc 1379 ASSERT3U(rr->rr_col[x].rc_size, <=, rr->rr_col[VDEV_RAIDZ_P].rc_size);
34dc7c2f 1380
b2255edc
BB
1381 src = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
1382 dst = rr->rr_col[x].rc_abd;
a6255b7f 1383
b2255edc 1384 abd_copy_from_buf(dst, abd_to_buf(src), rr->rr_col[x].rc_size);
34dc7c2f 1385
b2255edc
BB
1386 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1387 uint64_t size = MIN(rr->rr_col[x].rc_size,
1388 rr->rr_col[c].rc_size);
a6255b7f 1389
b2255edc 1390 src = rr->rr_col[c].rc_abd;
34dc7c2f
BB
1391
1392 if (c == x)
1393 continue;
1394
a6255b7f
DQ
1395 (void) abd_iterate_func2(dst, src, 0, 0, size,
1396 vdev_raidz_reconst_p_func, NULL);
34dc7c2f
BB
1397 }
1398}
1399
46df6e98 1400static void
b2255edc 1401vdev_raidz_reconstruct_q(raidz_row_t *rr, int *tgts, int ntgts)
34dc7c2f 1402{
45d1cae3 1403 int x = tgts[0];
a6255b7f
DQ
1404 int c, exp;
1405 abd_t *dst, *src;
34dc7c2f 1406
5caeef02
DB
1407 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1408 zfs_dbgmsg("reconstruct_q(rm=%px x=%u)", rr, x);
1409
45d1cae3
BB
1410 ASSERT(ntgts == 1);
1411
b2255edc 1412 ASSERT(rr->rr_col[x].rc_size <= rr->rr_col[VDEV_RAIDZ_Q].rc_size);
34dc7c2f 1413
b2255edc
BB
1414 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1415 uint64_t size = (c == x) ? 0 : MIN(rr->rr_col[x].rc_size,
1416 rr->rr_col[c].rc_size);
34dc7c2f 1417
b2255edc
BB
1418 src = rr->rr_col[c].rc_abd;
1419 dst = rr->rr_col[x].rc_abd;
34dc7c2f 1420
b2255edc 1421 if (c == rr->rr_firstdatacol) {
a6255b7f 1422 abd_copy(dst, src, size);
b2255edc 1423 if (rr->rr_col[x].rc_size > size) {
a6255b7f 1424 abd_zero_off(dst, size,
b2255edc
BB
1425 rr->rr_col[x].rc_size - size);
1426 }
34dc7c2f 1427 } else {
b2255edc 1428 ASSERT3U(size, <=, rr->rr_col[x].rc_size);
a6255b7f
DQ
1429 (void) abd_iterate_func2(dst, src, 0, 0, size,
1430 vdev_raidz_reconst_q_pre_func, NULL);
1431 (void) abd_iterate_func(dst,
b2255edc 1432 size, rr->rr_col[x].rc_size - size,
a6255b7f 1433 vdev_raidz_reconst_q_pre_tail_func, NULL);
34dc7c2f
BB
1434 }
1435 }
1436
b2255edc
BB
1437 src = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
1438 dst = rr->rr_col[x].rc_abd;
1439 exp = 255 - (rr->rr_cols - 1 - x);
34dc7c2f 1440
1c27024e 1441 struct reconst_q_struct rq = { abd_to_buf(src), exp };
b2255edc 1442 (void) abd_iterate_func(dst, 0, rr->rr_col[x].rc_size,
a6255b7f 1443 vdev_raidz_reconst_q_post_func, &rq);
34dc7c2f
BB
1444}
1445
46df6e98 1446static void
b2255edc 1447vdev_raidz_reconstruct_pq(raidz_row_t *rr, int *tgts, int ntgts)
34dc7c2f 1448{
a6255b7f
DQ
1449 uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp;
1450 abd_t *pdata, *qdata;
1451 uint64_t xsize, ysize;
45d1cae3
BB
1452 int x = tgts[0];
1453 int y = tgts[1];
a6255b7f 1454 abd_t *xd, *yd;
34dc7c2f 1455
5caeef02
DB
1456 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1457 zfs_dbgmsg("reconstruct_pq(rm=%px x=%u y=%u)", rr, x, y);
1458
45d1cae3 1459 ASSERT(ntgts == 2);
34dc7c2f 1460 ASSERT(x < y);
b2255edc
BB
1461 ASSERT(x >= rr->rr_firstdatacol);
1462 ASSERT(y < rr->rr_cols);
34dc7c2f 1463
b2255edc 1464 ASSERT(rr->rr_col[x].rc_size >= rr->rr_col[y].rc_size);
34dc7c2f
BB
1465
1466 /*
1467 * Move the parity data aside -- we're going to compute parity as
1468 * though columns x and y were full of zeros -- Pxy and Qxy. We want to
1469 * reuse the parity generation mechanism without trashing the actual
1470 * parity so we make those columns appear to be full of zeros by
1471 * setting their lengths to zero.
1472 */
b2255edc
BB
1473 pdata = rr->rr_col[VDEV_RAIDZ_P].rc_abd;
1474 qdata = rr->rr_col[VDEV_RAIDZ_Q].rc_abd;
1475 xsize = rr->rr_col[x].rc_size;
1476 ysize = rr->rr_col[y].rc_size;
34dc7c2f 1477
b2255edc
BB
1478 rr->rr_col[VDEV_RAIDZ_P].rc_abd =
1479 abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_P].rc_size, B_TRUE);
1480 rr->rr_col[VDEV_RAIDZ_Q].rc_abd =
1481 abd_alloc_linear(rr->rr_col[VDEV_RAIDZ_Q].rc_size, B_TRUE);
1482 rr->rr_col[x].rc_size = 0;
1483 rr->rr_col[y].rc_size = 0;
34dc7c2f 1484
b2255edc 1485 vdev_raidz_generate_parity_pq(rr);
34dc7c2f 1486
b2255edc
BB
1487 rr->rr_col[x].rc_size = xsize;
1488 rr->rr_col[y].rc_size = ysize;
34dc7c2f 1489
a6255b7f
DQ
1490 p = abd_to_buf(pdata);
1491 q = abd_to_buf(qdata);
b2255edc
BB
1492 pxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1493 qxy = abd_to_buf(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
1494 xd = rr->rr_col[x].rc_abd;
1495 yd = rr->rr_col[y].rc_abd;
34dc7c2f
BB
1496
1497 /*
1498 * We now have:
1499 * Pxy = P + D_x + D_y
1500 * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y
1501 *
1502 * We can then solve for D_x:
1503 * D_x = A * (P + Pxy) + B * (Q + Qxy)
1504 * where
1505 * A = 2^(x - y) * (2^(x - y) + 1)^-1
1506 * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1
1507 *
1508 * With D_x in hand, we can easily solve for D_y:
1509 * D_y = P + Pxy + D_x
1510 */
1511
1512 a = vdev_raidz_pow2[255 + x - y];
b2255edc 1513 b = vdev_raidz_pow2[255 - (rr->rr_cols - 1 - x)];
34dc7c2f
BB
1514 tmp = 255 - vdev_raidz_log2[a ^ 1];
1515
1516 aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)];
1517 bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)];
1518
a6255b7f 1519 ASSERT3U(xsize, >=, ysize);
1c27024e 1520 struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp };
34dc7c2f 1521
a6255b7f
DQ
1522 (void) abd_iterate_func2(xd, yd, 0, 0, ysize,
1523 vdev_raidz_reconst_pq_func, &rpq);
1524 (void) abd_iterate_func(xd, ysize, xsize - ysize,
1525 vdev_raidz_reconst_pq_tail_func, &rpq);
34dc7c2f 1526
b2255edc
BB
1527 abd_free(rr->rr_col[VDEV_RAIDZ_P].rc_abd);
1528 abd_free(rr->rr_col[VDEV_RAIDZ_Q].rc_abd);
34dc7c2f
BB
1529
1530 /*
1531 * Restore the saved parity data.
1532 */
b2255edc
BB
1533 rr->rr_col[VDEV_RAIDZ_P].rc_abd = pdata;
1534 rr->rr_col[VDEV_RAIDZ_Q].rc_abd = qdata;
45d1cae3
BB
1535}
1536
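/*
 * Illustrative sketch (not part of this module; compiled out below): the P+Q
 * double-failure algebra used by vdev_raidz_reconstruct_pq() above, worked
 * on a toy 4-column data stripe with a standalone GF(2^8) multiply. It uses
 * an equivalent rearrangement of the D_x/D_y formulas from the comment in
 * that function; all helper names are hypothetical.
 */
#if 0
#include <stdint.h>
#include <assert.h>

static uint8_t
gf_mul2(uint8_t a)
{
	return ((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
}

static uint8_t
gf_mul(uint8_t a, uint8_t b)
{
	uint8_t p = 0;
	for (int i = 0; i < 8; i++) {
		if (b & 1)
			p ^= a;
		a = gf_mul2(a);
		b >>= 1;
	}
	return (p);
}

static uint8_t
gf_pow(uint8_t a, int e)
{
	uint8_t p = 1;
	while (e-- > 0)
		p = gf_mul(p, a);
	return (p);
}

/* A^-1 = A^254 in GF(2^8). */
static uint8_t
gf_inv(uint8_t a)
{
	return (gf_pow(a, 254));
}

int
main(void)
{
	uint8_t d[4] = { 0x11, 0xde, 0xad, 0x42 };	/* data columns */
	int n = 4, x = 1, y = 2;			/* two lost columns */

	/* P and Q as defined in the theory comment near the top of the file */
	uint8_t p = 0, q = 0;
	for (int i = 0; i < n; i++) {
		p ^= d[i];
		q ^= gf_mul(gf_pow(2, n - 1 - i), d[i]);
	}

	/* Pxy and Qxy: parity recomputed as though columns x and y were 0 */
	uint8_t pxy = 0, qxy = 0;
	for (int i = 0; i < n; i++) {
		if (i == x || i == y)
			continue;
		pxy ^= d[i];
		qxy ^= gf_mul(gf_pow(2, n - 1 - i), d[i]);
	}

	/* Solve the 2x2 system for D_x, then recover D_y from P + Pxy. */
	uint8_t cx = gf_pow(2, n - 1 - x);
	uint8_t cy = gf_pow(2, n - 1 - y);
	uint8_t dx = gf_mul(gf_inv(cx ^ cy),
	    gf_mul(cy, p ^ pxy) ^ (q ^ qxy));
	uint8_t dy = (p ^ pxy) ^ dx;

	assert(dx == d[x] && dy == d[y]);
	return (0);
}
#endif
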
45d1cae3
BB
1537/*
1538 * In the general case of reconstruction, we must solve the system of linear
dd4bc569 1539 * equations defined by the coefficients used to generate parity as well as
45d1cae3
BB
1540 * the contents of the data and parity disks. This can be expressed with
1541 * vectors for the original data (D) and the actual data (d) and parity (p)
1542 * and a matrix composed of the identity matrix (I) and a dispersal matrix (V):
1543 *
1544 * __ __ __ __
1545 * | | __ __ | p_0 |
1546 * | V | | D_0 | | p_m-1 |
1547 * | | x | : | = | d_0 |
1548 * | I | | D_n-1 | | : |
1549 * | | ~~ ~~ | d_n-1 |
1550 * ~~ ~~ ~~ ~~
1551 *
1552 * I is simply a square identity matrix of size n, and V is a vandermonde
dd4bc569 1553 * matrix defined by the coefficients we chose for the various parity columns
45d1cae3
BB
1554 * (1, 2, 4). Note that these values were chosen for simplicity and speed of
1555 * computation, as well as for linear separability.
1556 *
1557 * __ __ __ __
1558 * | 1 .. 1 1 1 | | p_0 |
1559 * | 2^n-1 .. 4 2 1 | __ __ | : |
1560 * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 |
1561 * | 1 .. 0 0 0 | | D_1 | | d_0 |
1562 * | 0 .. 0 0 0 | x | D_2 | = | d_1 |
1563 * | : : : : | | : | | d_2 |
1564 * | 0 .. 1 0 0 | | D_n-1 | | : |
1565 * | 0 .. 0 1 0 | ~~ ~~ | : |
1566 * | 0 .. 0 0 1 | | d_n-1 |
1567 * ~~ ~~ ~~ ~~
1568 *
1569 * Note that I, V, d, and p are known. To compute D, we must invert the
1570 * matrix and use the known data and parity values to reconstruct the unknown
1571 * data values. We begin by removing the rows in V|I and d|p that correspond
1572 * to failed or missing columns; we then make V|I square (n x n) and d|p
1573 * sized n by removing rows corresponding to unused parity from the bottom up
1574 * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)'
1575 * using Gauss-Jordan elimination. In the example below we use m=3 parity
1576 * columns, n=8 data columns, with errors in d_1, d_2, and p_1:
1577 * __ __
1578 * | 1 1 1 1 1 1 1 1 |
1579 * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks
1580 * | 19 205 116 29 64 16 4 1 | / /
1581 * | 1 0 0 0 0 0 0 0 | / /
1582 * | 0 1 0 0 0 0 0 0 | <--' /
1583 * (V|I) = | 0 0 1 0 0 0 0 0 | <---'
1584 * | 0 0 0 1 0 0 0 0 |
1585 * | 0 0 0 0 1 0 0 0 |
1586 * | 0 0 0 0 0 1 0 0 |
1587 * | 0 0 0 0 0 0 1 0 |
1588 * | 0 0 0 0 0 0 0 1 |
1589 * ~~ ~~
1590 * __ __
1591 * | 1 1 1 1 1 1 1 1 |
1592 * | 128 64 32 16 8 4 2 1 |
1593 * | 19 205 116 29 64 16 4 1 |
1594 * | 1 0 0 0 0 0 0 0 |
1595 * | 0 1 0 0 0 0 0 0 |
1596 * (V|I)' = | 0 0 1 0 0 0 0 0 |
1597 * | 0 0 0 1 0 0 0 0 |
1598 * | 0 0 0 0 1 0 0 0 |
1599 * | 0 0 0 0 0 1 0 0 |
1600 * | 0 0 0 0 0 0 1 0 |
1601 * | 0 0 0 0 0 0 0 1 |
1602 * ~~ ~~
1603 *
1604 * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We
1605 * have carefully chosen the seed values 1, 2, and 4 to ensure that this
1606 * matrix is not singular.
1607 * __ __
1608 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1609 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1610 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1611 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1612 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1613 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1614 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1615 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1616 * ~~ ~~
1617 * __ __
1618 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1619 * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 |
1620 * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 |
1621 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1622 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1623 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1624 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1625 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1626 * ~~ ~~
1627 * __ __
1628 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1629 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1630 * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 |
1631 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1632 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1633 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1634 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1635 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1636 * ~~ ~~
1637 * __ __
1638 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1639 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1640 * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 |
1641 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1642 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1643 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1644 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1645 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1646 * ~~ ~~
1647 * __ __
1648 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1649 * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 |
1650 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1651 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1652 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1653 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1654 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1655 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1656 * ~~ ~~
1657 * __ __
1658 * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 |
1659 * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 |
1660 * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 |
1661 * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 |
1662 * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 |
1663 * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 |
1664 * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 |
1665 * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 |
1666 * ~~ ~~
1667 * __ __
1668 * | 0 0 1 0 0 0 0 0 |
1669 * | 167 100 5 41 159 169 217 208 |
1670 * | 166 100 4 40 158 168 216 209 |
1671 * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 |
1672 * | 0 0 0 0 1 0 0 0 |
1673 * | 0 0 0 0 0 1 0 0 |
1674 * | 0 0 0 0 0 0 1 0 |
1675 * | 0 0 0 0 0 0 0 1 |
1676 * ~~ ~~
1677 *
1678 * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values
1679 * of the missing data.
1680 *
1681 * As is apparent from the example above, the only non-trivial rows in the
1682 * inverse matrix correspond to the data disks that we're trying to
1683 * reconstruct. Indeed, those are the only rows we need as the others would
1684 * only be useful for reconstructing data known or assumed to be valid. For
1685 * that reason, we only build the coefficients in the rows that correspond to
1686 * targeted columns.
1687 */
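/*
 * For reference, multiplication in GF(2^8) is carried out throughout this
 * file with the log/exp tables: for nonzero a and b,
 *
 *	a * b == vdev_raidz_pow2[(vdev_raidz_log2[a] +
 *	    vdev_raidz_log2[b]) % 255]
 *
 * which is essentially what vdev_raidz_exp2(a, vdev_raidz_log2[b]) computes.
 * This is why the routines below keep coefficients in "log space" (e.g.
 * invlog) and only convert back through vdev_raidz_pow2[] at the end.
 */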
45d1cae3
BB
1688
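/*
 * Fill in the rows of the dispersal matrix V for the parity columns listed
 * in map[]. For example, with n = 8 data columns, map[i] = 1 (the Q row)
 * starts pow at 8 and steps it down by 1 per column, generating
 * { 2^7, ..., 2^0 } = { 128, 64, 32, 16, 8, 4, 2, 1 }, while map[i] = 0
 * (the P row) generates all 1s -- matching the V rows in the comment above.
 */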
1689static void
b2255edc 1690vdev_raidz_matrix_init(raidz_row_t *rr, int n, int nmap, int *map,
45d1cae3
BB
1691 uint8_t **rows)
1692{
1693 int i, j;
1694 int pow;
1695
b2255edc 1696 ASSERT(n == rr->rr_cols - rr->rr_firstdatacol);
45d1cae3
BB
1697
1698 /*
1699 * Fill in the missing rows of interest.
1700 */
1701 for (i = 0; i < nmap; i++) {
1702 ASSERT3S(0, <=, map[i]);
1703 ASSERT3S(map[i], <=, 2);
1704
1705 pow = map[i] * n;
1706 if (pow > 255)
1707 pow -= 255;
1708 ASSERT(pow <= 255);
1709
1710 for (j = 0; j < n; j++) {
1711 pow -= map[i];
1712 if (pow < 0)
1713 pow += 255;
1714 rows[i][j] = vdev_raidz_pow2[pow];
1715 }
1716 }
1717}
1718
1719static void
b2255edc 1720vdev_raidz_matrix_invert(raidz_row_t *rr, int n, int nmissing, int *missing,
45d1cae3
BB
1721 uint8_t **rows, uint8_t **invrows, const uint8_t *used)
1722{
1723 int i, j, ii, jj;
1724 uint8_t log;
1725
1726 /*
1727 * Assert that the first nmissing entries from the array of used
1728 * columns correspond to parity columns and that subsequent entries
1729 * correspond to data columns.
1730 */
1731 for (i = 0; i < nmissing; i++) {
b2255edc 1732 ASSERT3S(used[i], <, rr->rr_firstdatacol);
45d1cae3
BB
1733 }
1734 for (; i < n; i++) {
b2255edc 1735 ASSERT3S(used[i], >=, rr->rr_firstdatacol);
45d1cae3
BB
1736 }
1737
1738 /*
1739 * First initialize the storage where we'll compute the inverse rows.
1740 */
1741 for (i = 0; i < nmissing; i++) {
1742 for (j = 0; j < n; j++) {
1743 invrows[i][j] = (i == j) ? 1 : 0;
1744 }
1745 }
1746
1747 /*
1748 * Subtract all trivial rows from the rows of consequence.
1749 */
1750 for (i = 0; i < nmissing; i++) {
1751 for (j = nmissing; j < n; j++) {
b2255edc
BB
1752 ASSERT3U(used[j], >=, rr->rr_firstdatacol);
1753 jj = used[j] - rr->rr_firstdatacol;
45d1cae3
BB
1754 ASSERT3S(jj, <, n);
1755 invrows[i][j] = rows[i][jj];
1756 rows[i][jj] = 0;
1757 }
1758 }
1759
1760 /*
1761 * For each of the rows of interest, we must normalize it and subtract
1762 * a multiple of it from the other rows.
1763 */
1764 for (i = 0; i < nmissing; i++) {
1765 for (j = 0; j < missing[i]; j++) {
c99c9001 1766 ASSERT0(rows[i][j]);
45d1cae3
BB
1767 }
1768 ASSERT3U(rows[i][missing[i]], !=, 0);
1769
1770 /*
1771 * Compute the inverse of the first element and multiply each
1772 * element in the row by that value.
1773 */
1774 log = 255 - vdev_raidz_log2[rows[i][missing[i]]];
1775
1776 for (j = 0; j < n; j++) {
1777 rows[i][j] = vdev_raidz_exp2(rows[i][j], log);
1778 invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log);
1779 }
1780
1781 for (ii = 0; ii < nmissing; ii++) {
1782 if (i == ii)
1783 continue;
1784
1785 ASSERT3U(rows[ii][missing[i]], !=, 0);
1786
1787 log = vdev_raidz_log2[rows[ii][missing[i]]];
1788
1789 for (j = 0; j < n; j++) {
1790 rows[ii][j] ^=
1791 vdev_raidz_exp2(rows[i][j], log);
1792 invrows[ii][j] ^=
1793 vdev_raidz_exp2(invrows[i][j], log);
1794 }
1795 }
1796 }
1797
1798 /*
1799 * Verify that the data left in the rows is properly part of
1800 * an identity matrix.
1801 */
1802 for (i = 0; i < nmissing; i++) {
1803 for (j = 0; j < n; j++) {
1804 if (j == missing[i]) {
1805 ASSERT3U(rows[i][j], ==, 1);
1806 } else {
c99c9001 1807 ASSERT0(rows[i][j]);
45d1cae3
BB
1808 }
1809 }
1810 }
1811}
1812
1813static void
b2255edc 1814vdev_raidz_matrix_reconstruct(raidz_row_t *rr, int n, int nmissing,
45d1cae3
BB
1815 int *missing, uint8_t **invrows, const uint8_t *used)
1816{
1817 int i, j, x, cc, c;
1818 uint8_t *src;
1819 uint64_t ccount;
689f093e
GN
1820 uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL };
1821 uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 };
a117a6d6
GW
1822 uint8_t log = 0;
1823 uint8_t val;
45d1cae3
BB
1824 int ll;
1825 uint8_t *invlog[VDEV_RAIDZ_MAXPARITY];
1826 uint8_t *p, *pp;
1827 size_t psize;
1828
1829 psize = sizeof (invlog[0][0]) * n * nmissing;
79c76d5b 1830 p = kmem_alloc(psize, KM_SLEEP);
45d1cae3
BB
1831
1832 for (pp = p, i = 0; i < nmissing; i++) {
1833 invlog[i] = pp;
1834 pp += n;
1835 }
1836
1837 for (i = 0; i < nmissing; i++) {
1838 for (j = 0; j < n; j++) {
1839 ASSERT3U(invrows[i][j], !=, 0);
1840 invlog[i][j] = vdev_raidz_log2[invrows[i][j]];
1841 }
1842 }
1843
1844 for (i = 0; i < n; i++) {
1845 c = used[i];
b2255edc 1846 ASSERT3U(c, <, rr->rr_cols);
45d1cae3 1847
b2255edc
BB
1848 ccount = rr->rr_col[c].rc_size;
1849 ASSERT(ccount >= rr->rr_col[missing[0]].rc_size || i > 0);
1850 if (ccount == 0)
1851 continue;
1852 src = abd_to_buf(rr->rr_col[c].rc_abd);
45d1cae3 1853 for (j = 0; j < nmissing; j++) {
b2255edc
BB
1854 cc = missing[j] + rr->rr_firstdatacol;
1855 ASSERT3U(cc, >=, rr->rr_firstdatacol);
1856 ASSERT3U(cc, <, rr->rr_cols);
45d1cae3
BB
1857 ASSERT3U(cc, !=, c);
1858
b2255edc
BB
1859 dcount[j] = rr->rr_col[cc].rc_size;
1860 if (dcount[j] != 0)
1861 dst[j] = abd_to_buf(rr->rr_col[cc].rc_abd);
45d1cae3
BB
1862 }
1863
45d1cae3
BB
1864 for (x = 0; x < ccount; x++, src++) {
1865 if (*src != 0)
1866 log = vdev_raidz_log2[*src];
1867
1868 for (cc = 0; cc < nmissing; cc++) {
1869 if (x >= dcount[cc])
1870 continue;
1871
1872 if (*src == 0) {
1873 val = 0;
1874 } else {
1875 if ((ll = log + invlog[cc][i]) >= 255)
1876 ll -= 255;
1877 val = vdev_raidz_pow2[ll];
1878 }
1879
1880 if (i == 0)
1881 dst[cc][x] = val;
1882 else
1883 dst[cc][x] ^= val;
1884 }
1885 }
1886 }
1887
1888 kmem_free(p, psize);
1889}
1890
46df6e98 1891static void
b2255edc 1892vdev_raidz_reconstruct_general(raidz_row_t *rr, int *tgts, int ntgts)
45d1cae3
BB
1893{
1894 int n, i, c, t, tt;
1895 int nmissing_rows;
1896 int missing_rows[VDEV_RAIDZ_MAXPARITY];
1897 int parity_map[VDEV_RAIDZ_MAXPARITY];
45d1cae3
BB
1898 uint8_t *p, *pp;
1899 size_t psize;
45d1cae3
BB
1900 uint8_t *rows[VDEV_RAIDZ_MAXPARITY];
1901 uint8_t *invrows[VDEV_RAIDZ_MAXPARITY];
1902 uint8_t *used;
1903
a6255b7f
DQ
1904 abd_t **bufs = NULL;
1905
5caeef02
DB
1906 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
1907 zfs_dbgmsg("reconstruct_general(rm=%px ntgts=%u)", rr, ntgts);
a6255b7f
DQ
1908 /*
1909 * Matrix reconstruction can't use scatter ABDs yet, so we allocate
b2255edc 1910 * temporary linear ABDs if any non-linear ABDs are found.
a6255b7f 1911 */
b2255edc 1912 for (i = rr->rr_firstdatacol; i < rr->rr_cols; i++) {
5caeef02 1913 ASSERT(rr->rr_col[i].rc_abd != NULL);
b2255edc
BB
1914 if (!abd_is_linear(rr->rr_col[i].rc_abd)) {
1915 bufs = kmem_alloc(rr->rr_cols * sizeof (abd_t *),
1916 KM_PUSHPAGE);
1917
1918 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
1919 raidz_col_t *col = &rr->rr_col[c];
1920
1921 bufs[c] = col->rc_abd;
1922 if (bufs[c] != NULL) {
1923 col->rc_abd = abd_alloc_linear(
1924 col->rc_size, B_TRUE);
1925 abd_copy(col->rc_abd, bufs[c],
1926 col->rc_size);
1927 }
1928 }
a6255b7f 1929
b2255edc 1930 break;
a6255b7f
DQ
1931 }
1932 }
45d1cae3 1933
b2255edc 1934 n = rr->rr_cols - rr->rr_firstdatacol;
45d1cae3
BB
1935
1936 /*
1937 * Figure out which data columns are missing.
1938 */
1939 nmissing_rows = 0;
1940 for (t = 0; t < ntgts; t++) {
b2255edc 1941 if (tgts[t] >= rr->rr_firstdatacol) {
45d1cae3 1942 missing_rows[nmissing_rows++] =
b2255edc 1943 tgts[t] - rr->rr_firstdatacol;
45d1cae3
BB
1944 }
1945 }
1946
1947 /*
1948 * Figure out which parity columns to use to help generate the missing
1949 * data columns.
1950 */
1951 for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) {
1952 ASSERT(tt < ntgts);
b2255edc 1953 ASSERT(c < rr->rr_firstdatacol);
45d1cae3
BB
1954
1955 /*
1956 * Skip any targeted parity columns.
1957 */
1958 if (c == tgts[tt]) {
1959 tt++;
1960 continue;
1961 }
1962
45d1cae3
BB
1963 parity_map[i] = c;
1964 i++;
1965 }
1966
45d1cae3
BB
1967 psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) *
1968 nmissing_rows * n + sizeof (used[0]) * n;
79c76d5b 1969 p = kmem_alloc(psize, KM_SLEEP);
45d1cae3
BB
1970
1971 for (pp = p, i = 0; i < nmissing_rows; i++) {
1972 rows[i] = pp;
1973 pp += n;
1974 invrows[i] = pp;
1975 pp += n;
1976 }
1977 used = pp;
1978
1979 for (i = 0; i < nmissing_rows; i++) {
1980 used[i] = parity_map[i];
1981 }
1982
b2255edc 1983 for (tt = 0, c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
45d1cae3 1984 if (tt < nmissing_rows &&
b2255edc 1985 c == missing_rows[tt] + rr->rr_firstdatacol) {
45d1cae3
BB
1986 tt++;
1987 continue;
1988 }
1989
1990 ASSERT3S(i, <, n);
1991 used[i] = c;
1992 i++;
1993 }
1994
1995 /*
1996 * Initialize the interesting rows of the matrix.
1997 */
b2255edc 1998 vdev_raidz_matrix_init(rr, n, nmissing_rows, parity_map, rows);
45d1cae3
BB
1999
2000 /*
2001 * Invert the matrix.
2002 */
b2255edc 2003 vdev_raidz_matrix_invert(rr, n, nmissing_rows, missing_rows, rows,
45d1cae3
BB
2004 invrows, used);
2005
2006 /*
2007 * Reconstruct the missing data using the generated matrix.
2008 */
b2255edc 2009 vdev_raidz_matrix_reconstruct(rr, n, nmissing_rows, missing_rows,
45d1cae3
BB
2010 invrows, used);
2011
2012 kmem_free(p, psize);
2013
a6255b7f
DQ
2014 /*
2015 * Copy back from the temporary linear ABDs and free them.
2016 */
2017 if (bufs) {
b2255edc
BB
2018 for (c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
2019 raidz_col_t *col = &rr->rr_col[c];
a6255b7f 2020
b2255edc
BB
2021 if (bufs[c] != NULL) {
2022 abd_copy(bufs[c], col->rc_abd, col->rc_size);
2023 abd_free(col->rc_abd);
2024 }
a6255b7f
DQ
2025 col->rc_abd = bufs[c];
2026 }
b2255edc 2027 kmem_free(bufs, rr->rr_cols * sizeof (abd_t *));
a6255b7f 2028 }
34dc7c2f
BB
2029}
2030
46df6e98 2031static void
b2255edc
BB
2032vdev_raidz_reconstruct_row(raidz_map_t *rm, raidz_row_t *rr,
2033 const int *t, int nt)
45d1cae3
BB
2034{
2035 int tgts[VDEV_RAIDZ_MAXPARITY], *dt;
2036 int ntgts;
c9187d86 2037 int i, c, ret;
45d1cae3
BB
2038 int nbadparity, nbaddata;
2039 int parity_valid[VDEV_RAIDZ_MAXPARITY];
2040
5caeef02
DB
2041 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2042 zfs_dbgmsg("reconstruct(rm=%px nt=%u cols=%u md=%u mp=%u)",
2043 rr, nt, (int)rr->rr_cols, (int)rr->rr_missingdata,
2044 (int)rr->rr_missingparity);
2045 }
2046
b2255edc
BB
2047 nbadparity = rr->rr_firstdatacol;
2048 nbaddata = rr->rr_cols - nbadparity;
45d1cae3 2049 ntgts = 0;
b2255edc 2050 for (i = 0, c = 0; c < rr->rr_cols; c++) {
5caeef02
DB
2051 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
2052 zfs_dbgmsg("reconstruct(rm=%px col=%u devid=%u "
2053 "offset=%llx error=%u)",
2054 rr, c, (int)rr->rr_col[c].rc_devidx,
2055 (long long)rr->rr_col[c].rc_offset,
2056 (int)rr->rr_col[c].rc_error);
2057 }
b2255edc 2058 if (c < rr->rr_firstdatacol)
45d1cae3
BB
2059 parity_valid[c] = B_FALSE;
2060
2061 if (i < nt && c == t[i]) {
2062 tgts[ntgts++] = c;
2063 i++;
b2255edc 2064 } else if (rr->rr_col[c].rc_error != 0) {
45d1cae3 2065 tgts[ntgts++] = c;
b2255edc 2066 } else if (c >= rr->rr_firstdatacol) {
45d1cae3
BB
2067 nbaddata--;
2068 } else {
2069 parity_valid[c] = B_TRUE;
2070 nbadparity--;
2071 }
2072 }
2073
2074 ASSERT(ntgts >= nt);
2075 ASSERT(nbaddata >= 0);
2076 ASSERT(nbaddata + nbadparity == ntgts);
2077
2078 dt = &tgts[nbadparity];
2079
c9187d86 2080 /* Reconstruct using the new math implementation */
b2255edc 2081 ret = vdev_raidz_math_reconstruct(rm, rr, parity_valid, dt, nbaddata);
c9187d86 2082 if (ret != RAIDZ_ORIGINAL_IMPL)
46df6e98 2083 return;
ab9f4b0b 2084
45d1cae3
BB
2085 /*
2086 * See if we can use any of our optimized reconstruction routines.
2087 */
ab9f4b0b
GN
2088 switch (nbaddata) {
2089 case 1:
46df6e98
MA
2090 if (parity_valid[VDEV_RAIDZ_P]) {
2091 vdev_raidz_reconstruct_p(rr, dt, 1);
2092 return;
2093 }
45d1cae3 2094
b2255edc 2095 ASSERT(rr->rr_firstdatacol > 1);
45d1cae3 2096
46df6e98
MA
2097 if (parity_valid[VDEV_RAIDZ_Q]) {
2098 vdev_raidz_reconstruct_q(rr, dt, 1);
2099 return;
2100 }
45d1cae3 2101
b2255edc 2102 ASSERT(rr->rr_firstdatacol > 2);
ab9f4b0b 2103 break;
45d1cae3 2104
ab9f4b0b 2105 case 2:
b2255edc 2106 ASSERT(rr->rr_firstdatacol > 1);
45d1cae3 2107
ab9f4b0b 2108 if (parity_valid[VDEV_RAIDZ_P] &&
46df6e98
MA
2109 parity_valid[VDEV_RAIDZ_Q]) {
2110 vdev_raidz_reconstruct_pq(rr, dt, 2);
2111 return;
2112 }
45d1cae3 2113
b2255edc 2114 ASSERT(rr->rr_firstdatacol > 2);
45d1cae3 2115
ab9f4b0b 2116 break;
45d1cae3
BB
2117 }
2118
46df6e98 2119 vdev_raidz_reconstruct_general(rr, tgts, ntgts);
45d1cae3 2120}
34dc7c2f
BB
2121
2122static int
1bd201e7 2123vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize,
6fe3498c 2124 uint64_t *logical_ashift, uint64_t *physical_ashift)
34dc7c2f 2125{
b2255edc
BB
2126 vdev_raidz_t *vdrz = vd->vdev_tsd;
2127 uint64_t nparity = vdrz->vd_nparity;
45d1cae3 2128 int c;
34dc7c2f
BB
2129 int lasterror = 0;
2130 int numerrors = 0;
2131
2132 ASSERT(nparity > 0);
2133
2134 if (nparity > VDEV_RAIDZ_MAXPARITY ||
2135 vd->vdev_children < nparity + 1) {
2136 vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL;
2e528b49 2137 return (SET_ERROR(EINVAL));
34dc7c2f
BB
2138 }
2139
45d1cae3
BB
2140 vdev_open_children(vd);
2141
34dc7c2f 2142 for (c = 0; c < vd->vdev_children; c++) {
b2255edc 2143 vdev_t *cvd = vd->vdev_child[c];
34dc7c2f 2144
45d1cae3
BB
2145 if (cvd->vdev_open_error != 0) {
2146 lasterror = cvd->vdev_open_error;
34dc7c2f
BB
2147 numerrors++;
2148 continue;
2149 }
2150
2151 *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1;
1bd201e7 2152 *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1;
6fe3498c 2153 *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift);
37f6845c
AM
2154 }
2155 for (c = 0; c < vd->vdev_children; c++) {
2156 vdev_t *cvd = vd->vdev_child[c];
2157
2158 if (cvd->vdev_open_error != 0)
2159 continue;
2160 *physical_ashift = vdev_best_ashift(*logical_ashift,
2161 *physical_ashift, cvd->vdev_physical_ashift);
34dc7c2f
BB
2162 }
2163
5caeef02
DB
2164 if (vd->vdev_rz_expanding) {
2165 *asize *= vd->vdev_children - 1;
2166 *max_asize *= vd->vdev_children - 1;
2167
2168 vd->vdev_min_asize = *asize;
2169 } else {
2170 *asize *= vd->vdev_children;
2171 *max_asize *= vd->vdev_children;
2172 }
34dc7c2f
BB
2173
2174 if (numerrors > nparity) {
2175 vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS;
2176 return (lasterror);
2177 }
2178
2179 return (0);
2180}
2181
2182static void
2183vdev_raidz_close(vdev_t *vd)
2184{
b2255edc
BB
2185 for (int c = 0; c < vd->vdev_children; c++) {
2186 if (vd->vdev_child[c] != NULL)
2187 vdev_close(vd->vdev_child[c]);
2188 }
34dc7c2f
BB
2189}
2190
5caeef02
DB
2191/*
2192 * Return the logical width to use, given the txg in which the allocation
493fcce9 2193 * happened. Note that BP_GET_BIRTH() is usually the txg in which the
5caeef02 2194 * BP was allocated. Remapped BP's (that were relocated due to device
493fcce9
GW
2195 * removal, see remap_blkptr_cb()) will have a more recent physical birth
2196 * which reflects when the BP was relocated, but we can ignore these because
2197 * they can't be on RAIDZ (device removal doesn't support RAIDZ).
5caeef02
DB
2198 */
2199static uint64_t
2200vdev_raidz_get_logical_width(vdev_raidz_t *vdrz, uint64_t txg)
2201{
2202 reflow_node_t lookup = {
2203 .re_txg = txg,
2204 };
2205 avl_index_t where;
2206
2207 uint64_t width;
2208 mutex_enter(&vdrz->vd_expand_lock);
2209 reflow_node_t *re = avl_find(&vdrz->vd_expand_txgs, &lookup, &where);
2210 if (re != NULL) {
2211 width = re->re_logical_width;
2212 } else {
2213 re = avl_nearest(&vdrz->vd_expand_txgs, where, AVL_BEFORE);
2214 if (re != NULL)
2215 width = re->re_logical_width;
2216 else
2217 width = vdrz->vd_original_width;
2218 }
2219 mutex_exit(&vdrz->vd_expand_lock);
2220 return (width);
2221}
2222
2223/*
2224 * Note: If the RAIDZ vdev has been expanded, older BPs may have been allocated
2225 * more space due to the lower data-to-parity ratio. In this case it's
2226 * important to pass in the correct txg. Note that vdev_gang_header_asize()
2227 * relies on a constant asize for psize=SPA_GANGBLOCKSIZE=SPA_MINBLOCKSIZE,
2228 * regardless of txg. This is assured because for a single data sector, we
2229 * allocate P+1 sectors regardless of width ("cols", which is at least P+1).
2230 */
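/*
 * For illustration, with ashift = 12 (4 KiB sectors), a 5-wide raidz1
 * (cols = 5, nparity = 1), and psize = 16 KiB, the computation below gives:
 *
 *	asize = ((16384 - 1) >> 12) + 1 = 4		data sectors
 *	asize += 1 * ((4 + 5 - 1 - 1) / (5 - 1)) -> 5	one parity sector
 *	asize = roundup(5, 2) << 12 = 24576		one skip sector
 *
 * i.e. a 16 KiB block consumes 24 KiB of allocated space on this layout.
 */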
34dc7c2f 2231static uint64_t
5caeef02 2232vdev_raidz_asize(vdev_t *vd, uint64_t psize, uint64_t txg)
34dc7c2f 2233{
b2255edc 2234 vdev_raidz_t *vdrz = vd->vdev_tsd;
34dc7c2f
BB
2235 uint64_t asize;
2236 uint64_t ashift = vd->vdev_top->vdev_ashift;
5caeef02 2237 uint64_t cols = vdrz->vd_original_width;
b2255edc 2238 uint64_t nparity = vdrz->vd_nparity;
34dc7c2f 2239
5caeef02
DB
2240 cols = vdev_raidz_get_logical_width(vdrz, txg);
2241
34dc7c2f
BB
2242 asize = ((psize - 1) >> ashift) + 1;
2243 asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
2244 asize = roundup(asize, nparity + 1) << ashift;
2245
5caeef02
DB
2246#ifdef ZFS_DEBUG
2247 uint64_t asize_new = ((psize - 1) >> ashift) + 1;
2248 uint64_t ncols_new = vdrz->vd_physical_width;
2249 asize_new += nparity * ((asize_new + ncols_new - nparity - 1) /
2250 (ncols_new - nparity));
2251 asize_new = roundup(asize_new, nparity + 1) << ashift;
2252 VERIFY3U(asize_new, <=, asize);
2253#endif
2254
34dc7c2f
BB
2255 return (asize);
2256}
2257
b2255edc
BB
2258/*
2259 * The allocatable space for a raidz vdev is N * sizeof(smallest child)
2260 * so each child must provide at least 1/Nth of its asize.
2261 */
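/*
 * For example, with 4 children and vd->vdev_min_asize = 1000, each child
 * must provide at least (1000 + 3) / 4 = 250; the "+ children - 1" simply
 * rounds the integer division up.
 */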
2262static uint64_t
2263vdev_raidz_min_asize(vdev_t *vd)
2264{
2265 return ((vd->vdev_min_asize + vd->vdev_children - 1) /
2266 vd->vdev_children);
2267}
2268
2269void
34dc7c2f
BB
2270vdev_raidz_child_done(zio_t *zio)
2271{
2272 raidz_col_t *rc = zio->io_private;
2273
345196be 2274 ASSERT3P(rc->rc_abd, !=, NULL);
34dc7c2f
BB
2275 rc->rc_error = zio->io_error;
2276 rc->rc_tried = 1;
2277 rc->rc_skipped = 0;
2278}
2279
619f0976 2280static void
5caeef02 2281vdev_raidz_shadow_child_done(zio_t *zio)
619f0976 2282{
5caeef02
DB
2283 raidz_col_t *rc = zio->io_private;
2284
2285 rc->rc_shadow_error = zio->io_error;
2286}
619f0976 2287
5caeef02
DB
2288static void
2289vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, raidz_row_t *rr, int col)
2290{
2291 (void) rm;
2292#ifdef ZFS_DEBUG
b2255edc
BB
2293 range_seg64_t logical_rs, physical_rs, remain_rs;
2294 logical_rs.rs_start = rr->rr_offset;
619f0976 2295 logical_rs.rs_end = logical_rs.rs_start +
5caeef02 2296 vdev_raidz_asize(zio->io_vd, rr->rr_size,
493fcce9 2297 BP_GET_BIRTH(zio->io_bp));
619f0976 2298
b2255edc 2299 raidz_col_t *rc = &rr->rr_col[col];
5caeef02 2300 vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
619f0976 2301
b2255edc
BB
2302 vdev_xlate(cvd, &logical_rs, &physical_rs, &remain_rs);
2303 ASSERT(vdev_xlate_is_empty(&remain_rs));
5caeef02
DB
2304 if (vdev_xlate_is_empty(&physical_rs)) {
2305 /*
2306 * If we are in the middle of expansion, the
2307 * physical->logical mapping is changing so vdev_xlate()
2308 * can't give us a reliable answer.
2309 */
2310 return;
2311 }
619f0976
GW
2312 ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start);
2313 ASSERT3U(rc->rc_offset, <, physical_rs.rs_end);
2314 /*
2315 * It would be nice to assert that rs_end is equal
2316 * to rc_offset + rc_size but there might be an
2317 * optional I/O at the end that is not accounted for in
2318 * rc_size.
2319 */
2320 if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) {
2321 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset +
5caeef02 2322 rc->rc_size + (1 << zio->io_vd->vdev_top->vdev_ashift));
619f0976
GW
2323 } else {
2324 ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size);
2325 }
2326#endif
2327}
2328
98b25418 2329static void
5caeef02 2330vdev_raidz_io_start_write(zio_t *zio, raidz_row_t *rr)
34dc7c2f
BB
2331{
2332 vdev_t *vd = zio->io_vd;
b2255edc 2333 raidz_map_t *rm = zio->io_vsd;
34dc7c2f 2334
b2255edc 2335 vdev_raidz_generate_parity_row(rm, rr);
34dc7c2f 2336
345196be 2337 for (int c = 0; c < rr->rr_scols; c++) {
b2255edc 2338 raidz_col_t *rc = &rr->rr_col[c];
345196be 2339 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
619f0976 2340
b2255edc 2341 /* Verify physical to logical translation */
5caeef02 2342 vdev_raidz_io_verify(zio, rm, rr, c);
34dc7c2f 2343
5caeef02
DB
2344 if (rc->rc_size == 0)
2345 continue;
2346
2347 ASSERT3U(rc->rc_offset + rc->rc_size, <,
2348 cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2349
2350 ASSERT3P(rc->rc_abd, !=, NULL);
2351 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2352 rc->rc_offset, rc->rc_abd,
2353 abd_get_size(rc->rc_abd), zio->io_type,
2354 zio->io_priority, 0, vdev_raidz_child_done, rc));
2355
2356 if (rc->rc_shadow_devidx != INT_MAX) {
2357 vdev_t *cvd2 = vd->vdev_child[rc->rc_shadow_devidx];
2358
2359 ASSERT3U(
2360 rc->rc_shadow_offset + abd_get_size(rc->rc_abd), <,
2361 cvd2->vdev_psize - VDEV_LABEL_END_SIZE);
2362
2363 zio_nowait(zio_vdev_child_io(zio, NULL, cvd2,
2364 rc->rc_shadow_offset, rc->rc_abd,
2365 abd_get_size(rc->rc_abd),
2366 zio->io_type, zio->io_priority, 0,
2367 vdev_raidz_shadow_child_done, rc));
345196be 2368 }
34dc7c2f 2369 }
b2255edc 2370}
34dc7c2f 2371
5caeef02
DB
2372/*
2373 * Generate optional I/Os for skip sectors to improve aggregation contiguity.
2374 * This only works for vdev_raidz_map_alloc() (not _expanded()).
2375 */
b2255edc 2376static void
5caeef02 2377raidz_start_skip_writes(zio_t *zio)
b2255edc
BB
2378{
2379 vdev_t *vd = zio->io_vd;
5caeef02
DB
2380 uint64_t ashift = vd->vdev_top->vdev_ashift;
2381 raidz_map_t *rm = zio->io_vsd;
2382 ASSERT3U(rm->rm_nrows, ==, 1);
2383 raidz_row_t *rr = rm->rm_row[0];
2384 for (int c = 0; c < rr->rr_scols; c++) {
2385 raidz_col_t *rc = &rr->rr_col[c];
2386 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2387 if (rc->rc_size != 0)
2388 continue;
2389 ASSERT3P(rc->rc_abd, ==, NULL);
2390
2391 ASSERT3U(rc->rc_offset, <,
2392 cvd->vdev_psize - VDEV_LABEL_END_SIZE);
2393
2394 zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset,
2395 NULL, 1ULL << ashift, zio->io_type, zio->io_priority,
2396 ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL));
2397 }
2398}
2399
2400static void
2401vdev_raidz_io_start_read_row(zio_t *zio, raidz_row_t *rr, boolean_t forceparity)
2402{
2403 vdev_t *vd = zio->io_vd;
2404
2405 /*
34dc7c2f 2406 * Iterate over the columns in reverse order so that we hit the parity
45d1cae3 2407 * last -- any errors along the way will force us to read the parity.
34dc7c2f 2408 */
b2255edc
BB
2409 for (int c = rr->rr_cols - 1; c >= 0; c--) {
2410 raidz_col_t *rc = &rr->rr_col[c];
2411 if (rc->rc_size == 0)
2412 continue;
2413 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
34dc7c2f 2414 if (!vdev_readable(cvd)) {
b2255edc
BB
2415 if (c >= rr->rr_firstdatacol)
2416 rr->rr_missingdata++;
34dc7c2f 2417 else
b2255edc 2418 rr->rr_missingparity++;
2e528b49 2419 rc->rc_error = SET_ERROR(ENXIO);
34dc7c2f
BB
2420 rc->rc_tried = 1; /* don't even try */
2421 rc->rc_skipped = 1;
2422 continue;
2423 }
428870ff 2424 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
b2255edc
BB
2425 if (c >= rr->rr_firstdatacol)
2426 rr->rr_missingdata++;
34dc7c2f 2427 else
b2255edc 2428 rr->rr_missingparity++;
2e528b49 2429 rc->rc_error = SET_ERROR(ESTALE);
34dc7c2f
BB
2430 rc->rc_skipped = 1;
2431 continue;
2432 }
5caeef02
DB
2433 if (forceparity ||
2434 c >= rr->rr_firstdatacol || rr->rr_missingdata > 0 ||
9babb374 2435 (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
34dc7c2f 2436 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
a6255b7f 2437 rc->rc_offset, rc->rc_abd, rc->rc_size,
b128c09f 2438 zio->io_type, zio->io_priority, 0,
34dc7c2f
BB
2439 vdev_raidz_child_done, rc));
2440 }
2441 }
b2255edc
BB
2442}
2443
5caeef02
DB
2444static void
2445vdev_raidz_io_start_read_phys_cols(zio_t *zio, raidz_map_t *rm)
2446{
2447 vdev_t *vd = zio->io_vd;
2448
2449 for (int i = 0; i < rm->rm_nphys_cols; i++) {
2450 raidz_col_t *prc = &rm->rm_phys_col[i];
2451 if (prc->rc_size == 0)
2452 continue;
2453
2454 ASSERT3U(prc->rc_devidx, ==, i);
2455 vdev_t *cvd = vd->vdev_child[i];
2456 if (!vdev_readable(cvd)) {
2457 prc->rc_error = SET_ERROR(ENXIO);
2458 prc->rc_tried = 1; /* don't even try */
2459 prc->rc_skipped = 1;
2460 continue;
2461 }
2462 if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) {
2463 prc->rc_error = SET_ERROR(ESTALE);
2464 prc->rc_skipped = 1;
2465 continue;
2466 }
2467 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2468 prc->rc_offset, prc->rc_abd, prc->rc_size,
2469 zio->io_type, zio->io_priority, 0,
2470 vdev_raidz_child_done, prc));
2471 }
2472}
2473
2474static void
2475vdev_raidz_io_start_read(zio_t *zio, raidz_map_t *rm)
2476{
2477 /*
2478 * If there are multiple rows, we will be hitting
2479 * all disks, so go ahead and read the parity so
2480 * that we are reading in decent size chunks.
2481 */
2482 boolean_t forceparity = rm->rm_nrows > 1;
2483
2484 if (rm->rm_phys_col) {
2485 vdev_raidz_io_start_read_phys_cols(zio, rm);
2486 } else {
2487 for (int i = 0; i < rm->rm_nrows; i++) {
2488 raidz_row_t *rr = rm->rm_row[i];
2489 vdev_raidz_io_start_read_row(zio, rr, forceparity);
2490 }
2491 }
2492}
2493
b2255edc
BB
2494/*
2495 * Start an IO operation on a RAIDZ VDev
2496 *
2497 * Outline:
2498 * - For write operations:
2499 * 1. Generate the parity data
2500 * 2. Create child zio write operations to each column's vdev, for both
2501 * data and parity.
2502 * 3. If the column skips any sectors for padding, create optional dummy
2503 * write zio children for those areas to improve aggregation contiguity.
2504 * - For read operations:
2505 * 1. Create child zio read operations to each data column's vdev to read
2506 * the range of data required for zio.
2507 * 2. If this is a scrub or resilver operation, or if any of the data
2508 * vdevs have had errors, then create zio read operations to the parity
2509 * columns' VDevs as well.
2510 */
2511static void
2512vdev_raidz_io_start(zio_t *zio)
2513{
2514 vdev_t *vd = zio->io_vd;
2515 vdev_t *tvd = vd->vdev_top;
2516 vdev_raidz_t *vdrz = vd->vdev_tsd;
5caeef02
DB
2517 raidz_map_t *rm;
2518
2519 uint64_t logical_width = vdev_raidz_get_logical_width(vdrz,
493fcce9 2520 BP_GET_BIRTH(zio->io_bp));
5caeef02
DB
2521 if (logical_width != vdrz->vd_physical_width) {
2522 zfs_locked_range_t *lr = NULL;
2523 uint64_t synced_offset = UINT64_MAX;
2524 uint64_t next_offset = UINT64_MAX;
2525 boolean_t use_scratch = B_FALSE;
2526 /*
2527 * Note: when the expansion is completing, we set
2528 * vre_state=DSS_FINISHED (in raidz_reflow_complete_sync())
2529 * in a later txg than when we last update spa_ubsync's state
2530 * (see the end of spa_raidz_expand_thread()). Therefore we
2531 * may see vre_state!=SCANNING before
2532 * VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE=DSS_FINISHED is reflected
2533 * on disk, but the copying progress has been synced to disk
2534 * (and reflected in spa_ubsync). In this case it's fine to
2535 * treat the expansion as completed, since if we crash there's
2536 * no additional copying to do.
2537 */
2538 if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
2539 ASSERT3P(vd->vdev_spa->spa_raidz_expand, ==,
2540 &vdrz->vn_vre);
2541 lr = zfs_rangelock_enter(&vdrz->vn_vre.vre_rangelock,
2542 zio->io_offset, zio->io_size, RL_READER);
2543 use_scratch =
2544 (RRSS_GET_STATE(&vd->vdev_spa->spa_ubsync) ==
2545 RRSS_SCRATCH_VALID);
2546 synced_offset =
2547 RRSS_GET_OFFSET(&vd->vdev_spa->spa_ubsync);
2548 next_offset = vdrz->vn_vre.vre_offset;
2549 /*
2550 * If we haven't resumed expanding since importing the
2551 * pool, vre_offset won't have been set yet. In
2552 * this case the next offset to be copied is the same
2553 * as what was synced.
2554 */
2555 if (next_offset == UINT64_MAX) {
2556 next_offset = synced_offset;
2557 }
2558 }
2559 if (use_scratch) {
2560 zfs_dbgmsg("zio=%px %s io_offset=%llu offset_synced="
2561 "%lld next_offset=%lld use_scratch=%u",
2562 zio,
2563 zio->io_type == ZIO_TYPE_WRITE ? "WRITE" : "READ",
2564 (long long)zio->io_offset,
2565 (long long)synced_offset,
2566 (long long)next_offset,
2567 use_scratch);
2568 }
2569
2570 rm = vdev_raidz_map_alloc_expanded(zio,
2571 tvd->vdev_ashift, vdrz->vd_physical_width,
2572 logical_width, vdrz->vd_nparity,
2573 synced_offset, next_offset, use_scratch);
2574 rm->rm_lr = lr;
2575 } else {
2576 rm = vdev_raidz_map_alloc(zio,
2577 tvd->vdev_ashift, logical_width, vdrz->vd_nparity);
2578 }
2579 rm->rm_original_width = vdrz->vd_original_width;
b2255edc 2580
330c6c05
MA
2581 zio->io_vsd = rm;
2582 zio->io_vsd_ops = &vdev_raidz_vsd_ops;
b2255edc 2583 if (zio->io_type == ZIO_TYPE_WRITE) {
5caeef02
DB
2584 for (int i = 0; i < rm->rm_nrows; i++) {
2585 vdev_raidz_io_start_write(zio, rm->rm_row[i]);
2586 }
2587
2588 if (logical_width == vdrz->vd_physical_width) {
2589 raidz_start_skip_writes(zio);
2590 }
b2255edc
BB
2591 } else {
2592 ASSERT(zio->io_type == ZIO_TYPE_READ);
5caeef02 2593 vdev_raidz_io_start_read(zio, rm);
b2255edc 2594 }
34dc7c2f 2595
98b25418 2596 zio_execute(zio);
34dc7c2f
BB
2597}
2598
2599/*
2600 * Report a checksum error for a child of a RAID-Z device.
2601 */
3c80e074
BB
2602void
2603vdev_raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data)
34dc7c2f
BB
2604{
2605 vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
34dc7c2f 2606
b2255edc
BB
2607 if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE) &&
2608 zio->io_priority != ZIO_PRIORITY_REBUILD) {
428870ff
BB
2609 zio_bad_cksum_t zbc;
2610 raidz_map_t *rm = zio->io_vsd;
2611
428870ff
BB
2612 zbc.zbc_has_cksum = 0;
2613 zbc.zbc_injected = rm->rm_ecksuminjected;
2614
03e02e5b
DB
2615 mutex_enter(&vd->vdev_stat_lock);
2616 vd->vdev_stat.vs_checksum_errors++;
2617 mutex_exit(&vd->vdev_stat_lock);
7a75f74c
RW
2618 (void) zfs_ereport_post_checksum(zio->io_spa, vd,
2619 &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size,
2620 rc->rc_abd, bad_data, &zbc);
34dc7c2f 2621 }
428870ff
BB
2622}
2623
2624/*
2625 * We keep track of whether or not there were any injected errors, so that
2626 * any ereports we generate can note it.
2627 */
2628static int
2629raidz_checksum_verify(zio_t *zio)
2630{
6fd87e1d 2631 zio_bad_cksum_t zbc = {0};
428870ff
BB
2632 raidz_map_t *rm = zio->io_vsd;
2633
1c27024e 2634 int ret = zio_checksum_error(zio, &zbc);
428870ff
BB
2635 if (ret != 0 && zbc.zbc_injected != 0)
2636 rm->rm_ecksuminjected = 1;
34dc7c2f 2637
428870ff 2638 return (ret);
34dc7c2f
BB
2639}
2640
2641/*
2642 * Generate the parity from the data columns. If we tried and were able to
2643 * read the parity without error, verify that the generated parity matches the
2644 * data we read. If it doesn't, we fire off a checksum error. Return the
b2255edc 2645 * number of such failures.
34dc7c2f
BB
2646 */
2647static int
b2255edc 2648raidz_parity_verify(zio_t *zio, raidz_row_t *rr)
34dc7c2f 2649{
84c07ada 2650 abd_t *orig[VDEV_RAIDZ_MAXPARITY];
34dc7c2f 2651 int c, ret = 0;
b2255edc 2652 raidz_map_t *rm = zio->io_vsd;
34dc7c2f
BB
2653 raidz_col_t *rc;
2654
3c67d83a
TH
2655 blkptr_t *bp = zio->io_bp;
2656 enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum :
2657 (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp)));
2658
2659 if (checksum == ZIO_CHECKSUM_NOPARITY)
2660 return (ret);
2661
b2255edc
BB
2662 for (c = 0; c < rr->rr_firstdatacol; c++) {
2663 rc = &rr->rr_col[c];
34dc7c2f
BB
2664 if (!rc->rc_tried || rc->rc_error != 0)
2665 continue;
84c07ada 2666
74230a5b
AM
2667 orig[c] = rc->rc_abd;
2668 ASSERT3U(abd_get_size(rc->rc_abd), ==, rc->rc_size);
2669 rc->rc_abd = abd_alloc_linear(rc->rc_size, B_FALSE);
34dc7c2f
BB
2670 }
2671
3c80e074
BB
2672 /*
2673 * Verify any empty sectors are zero filled to ensure the parity
2674 * is calculated correctly even if these non-data sectors are damaged.
2675 */
2676 if (rr->rr_nempty && rr->rr_abd_empty != NULL)
2677 ret += vdev_draid_map_verify_empty(zio, rr);
2678
b2255edc
BB
2679 /*
2680 * This regenerates parity even for !tried || rc_error != 0 columns. That
2681 * isn't harmful, but it has the side effect of repairing damage we didn't
2682 * realize was there (i.e. even if we return 0).
2683 */
2684 vdev_raidz_generate_parity_row(rm, rr);
2685
2686 for (c = 0; c < rr->rr_firstdatacol; c++) {
2687 rc = &rr->rr_col[c];
34dc7c2f 2688
34dc7c2f
BB
2689 if (!rc->rc_tried || rc->rc_error != 0)
2690 continue;
b2255edc 2691
84c07ada 2692 if (abd_cmp(orig[c], rc->rc_abd) != 0) {
5caeef02
DB
2693 zfs_dbgmsg("found error on col=%u devidx=%u off %llx",
2694 c, (int)rc->rc_devidx, (u_longlong_t)rc->rc_offset);
3c80e074 2695 vdev_raidz_checksum_error(zio, rc, orig[c]);
2e528b49 2696 rc->rc_error = SET_ERROR(ECKSUM);
34dc7c2f
BB
2697 ret++;
2698 }
84c07ada 2699 abd_free(orig[c]);
34dc7c2f
BB
2700 }
2701
2702 return (ret);
2703}
2704
34dc7c2f 2705static int
b2255edc 2706vdev_raidz_worst_error(raidz_row_t *rr)
b128c09f 2707{
1c27024e 2708 int error = 0;
b128c09f 2709
5caeef02 2710 for (int c = 0; c < rr->rr_cols; c++) {
b2255edc 2711 error = zio_worst_error(error, rr->rr_col[c].rc_error);
5caeef02
DB
2712 error = zio_worst_error(error, rr->rr_col[c].rc_shadow_error);
2713 }
b128c09f
BB
2714
2715 return (error);
2716}
2717
b2255edc
BB
2718static void
2719vdev_raidz_io_done_verified(zio_t *zio, raidz_row_t *rr)
45d1cae3 2720{
b2255edc
BB
2721 int unexpected_errors = 0;
2722 int parity_errors = 0;
2723 int parity_untried = 0;
2724 int data_errors = 0;
45d1cae3 2725
b2255edc
BB
2726 ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ);
2727
2728 for (int c = 0; c < rr->rr_cols; c++) {
2729 raidz_col_t *rc = &rr->rr_col[c];
2730
2731 if (rc->rc_error) {
2732 if (c < rr->rr_firstdatacol)
2733 parity_errors++;
2734 else
2735 data_errors++;
2736
2737 if (!rc->rc_skipped)
2738 unexpected_errors++;
2739 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
2740 parity_untried++;
2741 }
ad8b9f94
BB
2742
2743 if (rc->rc_force_repair)
2744 unexpected_errors++;
b2255edc 2745 }
45d1cae3
BB
2746
2747 /*
b2255edc
BB
2748 * If we read more parity disks than were used for
2749 * reconstruction, confirm that the other parity disks produced
2750 * correct data.
2751 *
2752 * Note that we also regenerate parity when resilvering so we
2753 * can write it out to failed devices later.
45d1cae3 2754 */
b2255edc
BB
2755 if (parity_errors + parity_untried <
2756 rr->rr_firstdatacol - data_errors ||
2757 (zio->io_flags & ZIO_FLAG_RESILVER)) {
2758 int n = raidz_parity_verify(zio, rr);
2759 unexpected_errors += n;
b2255edc 2760 }
45d1cae3 2761
b2255edc
BB
2762 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2763 (unexpected_errors > 0 || (zio->io_flags & ZIO_FLAG_RESILVER))) {
45d1cae3 2764 /*
b2255edc 2765 * Use the good data we have in hand to repair damaged children.
45d1cae3 2766 */
b2255edc
BB
2767 for (int c = 0; c < rr->rr_cols; c++) {
2768 raidz_col_t *rc = &rr->rr_col[c];
2769 vdev_t *vd = zio->io_vd;
2770 vdev_t *cvd = vd->vdev_child[rc->rc_devidx];
2771
8fb577ae
BB
2772 if (!rc->rc_allow_repair) {
2773 continue;
2774 } else if (!rc->rc_force_repair &&
2775 (rc->rc_error == 0 || rc->rc_size == 0)) {
b2255edc 2776 continue;
45d1cae3
BB
2777 }
2778
5caeef02
DB
2779 zfs_dbgmsg("zio=%px repairing c=%u devidx=%u "
2780 "offset=%llx",
2781 zio, c, rc->rc_devidx, (long long)rc->rc_offset);
2782
b2255edc
BB
2783 zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2784 rc->rc_offset, rc->rc_abd, rc->rc_size,
2785 ZIO_TYPE_WRITE,
2786 zio->io_priority == ZIO_PRIORITY_REBUILD ?
2787 ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE,
2788 ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2789 ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2790 }
2791 }
5caeef02
DB
2792
2793 /*
2794 * Scrub or resilver i/o's: overwrite any shadow locations with the
2795 * good data. This ensures that if we've already copied this sector,
2796 * it will be corrected if it was damaged. This writes more than is
2797 * necessary, but since expansion is paused during scrub/resilver, at
2798 * most a single row will have a shadow location.
2799 */
2800 if (zio->io_error == 0 && spa_writeable(zio->io_spa) &&
2801 (zio->io_flags & (ZIO_FLAG_RESILVER | ZIO_FLAG_SCRUB))) {
2802 for (int c = 0; c < rr->rr_cols; c++) {
2803 raidz_col_t *rc = &rr->rr_col[c];
2804 vdev_t *vd = zio->io_vd;
2805
2806 if (rc->rc_shadow_devidx == INT_MAX || rc->rc_size == 0)
2807 continue;
2808 vdev_t *cvd = vd->vdev_child[rc->rc_shadow_devidx];
2809
2810 /*
2811 * Note: We don't want to update the repair stats
2812 * because that would incorrectly indicate that there
2813 * was bad data to repair, which we aren't sure about.
2814 * By clearing the SCAN_THREAD flag, we prevent this
2815 * from happening, despite having the REPAIR flag set.
2816 * We need to set SELF_HEAL so that this i/o can't be
2817 * bypassed by zio_vdev_io_start().
2818 */
2819 zio_t *cio = zio_vdev_child_io(zio, NULL, cvd,
2820 rc->rc_shadow_offset, rc->rc_abd, rc->rc_size,
2821 ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2822 ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL,
2823 NULL, NULL);
2824 cio->io_flags &= ~ZIO_FLAG_SCAN_THREAD;
2825 zio_nowait(cio);
2826 }
2827 }
b2255edc
BB
2828}
2829
2830static void
2831raidz_restore_orig_data(raidz_map_t *rm)
2832{
2833 for (int i = 0; i < rm->rm_nrows; i++) {
2834 raidz_row_t *rr = rm->rm_row[i];
2835 for (int c = 0; c < rr->rr_cols; c++) {
2836 raidz_col_t *rc = &rr->rr_col[c];
2837 if (rc->rc_need_orig_restore) {
330c6c05 2838 abd_copy(rc->rc_abd,
b2255edc
BB
2839 rc->rc_orig_data, rc->rc_size);
2840 rc->rc_need_orig_restore = B_FALSE;
45d1cae3 2841 }
b2255edc
BB
2842 }
2843 }
2844}
2845
5caeef02
DB
2846/*
2847 * During raidz_reconstruct() for an expanded vdev, failure simulation needs
2848 * special consideration. See the note in raidz_reconstruct() on simulating
2849 * the failure of a pre-expansion device.
2850 *
2851 * Treating logical child i as failed, return TRUE if the given column should
2852 * be treated as failed. The idea of logical children allows us to imagine
2853 * that a disk silently failed before a RAIDZ expansion (reads from this disk
2854 * succeed but return the wrong data). Since the expansion doesn't verify
2855 * checksums, the incorrect data will be moved to new locations spread among
2856 * the children (going diagonally across them).
2857 *
2858 * Higher "logical child failures" (values of `i`) indicate these
2859 * "pre-expansion failures". The first physical_width values imagine that a
2860 * current child failed; the next physical_width-1 values imagine that a
2861 * child failed before the most recent expansion; the next physical_width-2
2862 * values imagine a child failed in the expansion before that, etc.
2863 */
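/*
 * For example, with physical_width = 5 and original_width = 4 there are
 * 5 + 4 = 9 logical children: values i = 0..4 simulate the failure of
 * current child i directly (sector_id % 5 == i), while i = 5..8 simulate
 * the failure of child (i - 5) of the 4-wide pre-expansion layout
 * (sector_id % 4 == i - 5), which corresponds to a diagonal pattern of
 * sectors spread across today's 5 children.
 */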
2864static boolean_t
2865raidz_simulate_failure(int physical_width, int original_width, int ashift,
2866 int i, raidz_col_t *rc)
2867{
2868 uint64_t sector_id =
2869 physical_width * (rc->rc_offset >> ashift) +
2870 rc->rc_devidx;
2871
2872 for (int w = physical_width; w >= original_width; w--) {
2873 if (i < w) {
2874 return (sector_id % w == i);
2875 } else {
2876 i -= w;
2877 }
2878 }
2879 ASSERT(!"invalid logical child id");
2880 return (B_FALSE);
2881}
2882
b2255edc
BB
2883/*
2884 * returns EINVAL if reconstruction of the block will not be possible
2885 * returns ECKSUM if this specific reconstruction failed
2886 * returns 0 on successful reconstruction
2887 */
2888static int
2889raidz_reconstruct(zio_t *zio, int *ltgts, int ntgts, int nparity)
2890{
2891 raidz_map_t *rm = zio->io_vsd;
5caeef02
DB
2892 int physical_width = zio->io_vd->vdev_children;
2893 int original_width = (rm->rm_original_width != 0) ?
2894 rm->rm_original_width : physical_width;
2895 int dbgmsg = zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT;
2896
2897 if (dbgmsg) {
2898 zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px ltgts=%u,%u,%u "
2899 "ntgts=%u", zio, ltgts[0], ltgts[1], ltgts[2], ntgts);
2900 }
45d1cae3 2901
b2255edc
BB
2902 /* Reconstruct each row */
2903 for (int r = 0; r < rm->rm_nrows; r++) {
2904 raidz_row_t *rr = rm->rm_row[r];
2905 int my_tgts[VDEV_RAIDZ_MAXPARITY]; /* value is child id */
2906 int t = 0;
2907 int dead = 0;
2908 int dead_data = 0;
2909
5caeef02
DB
2910 if (dbgmsg)
2911 zfs_dbgmsg("raidz_reconstruct_expanded(row=%u)", r);
2912
b2255edc
BB
2913 for (int c = 0; c < rr->rr_cols; c++) {
2914 raidz_col_t *rc = &rr->rr_col[c];
2915 ASSERT0(rc->rc_need_orig_restore);
2916 if (rc->rc_error != 0) {
2917 dead++;
2918 if (c >= nparity)
2919 dead_data++;
2920 continue;
2921 }
2922 if (rc->rc_size == 0)
2923 continue;
2924 for (int lt = 0; lt < ntgts; lt++) {
5caeef02
DB
2925 if (raidz_simulate_failure(physical_width,
2926 original_width,
2927 zio->io_vd->vdev_top->vdev_ashift,
2928 ltgts[lt], rc)) {
b2255edc
BB
2929 if (rc->rc_orig_data == NULL) {
2930 rc->rc_orig_data =
330c6c05
MA
2931 abd_alloc_linear(
2932 rc->rc_size, B_TRUE);
2933 abd_copy(rc->rc_orig_data,
b2255edc
BB
2934 rc->rc_abd, rc->rc_size);
2935 }
2936 rc->rc_need_orig_restore = B_TRUE;
2937
2938 dead++;
2939 if (c >= nparity)
2940 dead_data++;
5caeef02
DB
2941 /*
2942 * Note: simulating failure of a
2943 * pre-expansion device can hit more
2944 * than one column, in which case we
2945 * might try to simulate more failures
2946 * than can be reconstructed, which is
2947 * also more than the size of my_tgts.
2948 * This check prevents accessing past
2949 * the end of my_tgts. The "dead >
2950 * nparity" check below will fail this
2951 * reconstruction attempt.
2952 */
2953 if (t < VDEV_RAIDZ_MAXPARITY) {
2954 my_tgts[t++] = c;
2955 if (dbgmsg) {
2956 zfs_dbgmsg("simulating "
2957 "failure of col %u "
2958 "devidx %u", c,
2959 (int)rc->rc_devidx);
2960 }
2961 }
b2255edc
BB
2962 break;
2963 }
2964 }
2965 }
2966 if (dead > nparity) {
2967 /* reconstruction not possible */
5caeef02
DB
2968 if (dbgmsg) {
2969 zfs_dbgmsg("reconstruction not possible; "
2970 "too many failures");
2971 }
b2255edc
BB
2972 raidz_restore_orig_data(rm);
2973 return (EINVAL);
45d1cae3 2974 }
b2255edc 2975 if (dead_data > 0)
46df6e98 2976 vdev_raidz_reconstruct_row(rm, rr, my_tgts, t);
b2255edc 2977 }
45d1cae3 2978
b2255edc
BB
2979 /* Check for success */
2980 if (raidz_checksum_verify(zio) == 0) {
2981
2982 /* Reconstruction succeeded - report errors */
2983 for (int i = 0; i < rm->rm_nrows; i++) {
2984 raidz_row_t *rr = rm->rm_row[i];
2985
2986 for (int c = 0; c < rr->rr_cols; c++) {
2987 raidz_col_t *rc = &rr->rr_col[c];
2988 if (rc->rc_need_orig_restore) {
2989 /*
2990 * Note: if this is a parity column,
2991 * we don't really know if it's wrong.
2992 * We need to let
2993 * vdev_raidz_io_done_verified() check
2994 * it, and if we set rc_error, it will
2995 * think that it is a "known" error
2996 * that doesn't need to be checked
2997 * or corrected.
2998 */
2999 if (rc->rc_error == 0 &&
3000 c >= rr->rr_firstdatacol) {
3c80e074 3001 vdev_raidz_checksum_error(zio,
330c6c05 3002 rc, rc->rc_orig_data);
b2255edc
BB
3003 rc->rc_error =
3004 SET_ERROR(ECKSUM);
3005 }
3006 rc->rc_need_orig_restore = B_FALSE;
3007 }
3008 }
45d1cae3 3009
b2255edc 3010 vdev_raidz_io_done_verified(zio, rr);
45d1cae3
BB
3011 }
3012
b2255edc 3013 zio_checksum_verified(zio);
45d1cae3 3014
5caeef02
DB
3015 if (dbgmsg) {
3016 zfs_dbgmsg("reconstruction successful "
3017 "(checksum verified)");
3018 }
b2255edc
BB
3019 return (0);
3020 }
45d1cae3 3021
b2255edc
BB
3022 /* Reconstruction failed - restore original data */
3023 raidz_restore_orig_data(rm);
5caeef02
DB
3024 if (dbgmsg) {
3025 zfs_dbgmsg("raidz_reconstruct_expanded(zio=%px) checksum "
3026 "failed", zio);
3027 }
b2255edc
BB
3028 return (ECKSUM);
3029}
45d1cae3 3030
b2255edc
BB
3031/*
3032 * Iterate over all combinations of N bad vdevs and attempt a reconstruction.
3033 * Note that the algorithm below is non-optimal because it doesn't take into
3034 * account how reconstruction is actually performed. For example, with
3035 * triple-parity RAID-Z the reconstruction procedure is the same whether
3036 * column 4 alone is targeted as invalid or columns 1 and 4 are both
3037 * targeted, since in both cases we'd only use the parity in column 0.
3038 *
3039 * The order that we find the various possible combinations of failed
3040 * disks is dictated by these rules:
3041 * - Examine each "slot" (the "i" in tgts[i])
5caeef02 3042 * - Try to increment this slot (tgts[i] += 1)
b2255edc
BB
3043 * - if we can't increment because it runs into the next slot,
3044 * reset our slot to the minimum, and examine the next slot
3045 *
3046 * For example, with a 6-wide RAIDZ3, and no known errors (so we have to choose
3047 * 3 columns to reconstruct), we will generate the following sequence:
3048 *
3049 * STATE ACTION
3050 * 0 1 2 special case: skip since these are all parity
3051 * 0 1 3 first slot: reset to 0; middle slot: increment to 2
3052 * 0 2 3 first slot: increment to 1
3053 * 1 2 3 first: reset to 0; middle: reset to 1; last: increment to 4
3054 * 0 1 4 first: reset to 0; middle: increment to 2
3055 * 0 2 4 first: increment to 1
3056 * 1 2 4 first: reset to 0; middle: increment to 3
3057 * 0 3 4 first: increment to 1
3058 * 1 3 4 first: increment to 2
3059 * 2 3 4 first: reset to 0; middle: reset to 1; last: increment to 5
3060 * 0 1 5 first: reset to 0; middle: increment to 2
3061 * 0 2 5 first: increment to 1
3062 * 1 2 5 first: reset to 0; middle: increment to 3
3063 * 0 3 5 first: increment to 1
3064 * 1 3 5 first: increment to 2
3065 * 2 3 5 first: reset to 0; middle: increment to 4
3066 * 0 4 5 first: increment to 1
3067 * 1 4 5 first: increment to 2
3068 * 2 4 5 first: increment to 3
3069 * 3 4 5 done
3070 *
bf169e9f 3071 * This strategy works for dRAID but is less efficient when there are a large
b2255edc 3072 * number of child vdevs and therefore permutations to check. Furthermore,
5caeef02 3073 * since the raidz_map_t rows likely do not overlap, reconstruction would be
b2255edc
BB
3074 * possible as long as there are no more than nparity data errors per row.
3075 * These additional permutations are not currently checked but could be as
3076 * a future improvement.
5caeef02
DB
3077 *
3078 * Returns 0 on success, ECKSUM on failure.
b2255edc
BB
3079 */
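/*
 * For the 6-wide RAIDZ3 example above, the num_failures = 3 pass enumerates
 * C(6, 3) = 20 combinations (the 20 STATE rows listed). In the worst case
 * the loops below try at most C(n, 1) + ... + C(n, nparity) combinations,
 * where n is the number of logical children.
 */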
3080static int
3081vdev_raidz_combrec(zio_t *zio)
3082{
3083 int nparity = vdev_get_nparity(zio->io_vd);
3084 raidz_map_t *rm = zio->io_vsd;
5caeef02
DB
3085 int physical_width = zio->io_vd->vdev_children;
3086 int original_width = (rm->rm_original_width != 0) ?
3087 rm->rm_original_width : physical_width;
45d1cae3 3088
b2255edc
BB
3089 for (int i = 0; i < rm->rm_nrows; i++) {
3090 raidz_row_t *rr = rm->rm_row[i];
3091 int total_errors = 0;
45d1cae3 3092
b2255edc
BB
3093 for (int c = 0; c < rr->rr_cols; c++) {
3094 if (rr->rr_col[c].rc_error)
3095 total_errors++;
3096 }
45d1cae3 3097
b2255edc
BB
3098 if (total_errors > nparity)
3099 return (vdev_raidz_worst_error(rr));
3100 }
45d1cae3 3101
b2255edc
BB
3102 for (int num_failures = 1; num_failures <= nparity; num_failures++) {
3103 int tstore[VDEV_RAIDZ_MAXPARITY + 2];
3104 int *ltgts = &tstore[1]; /* value is logical child ID */
3105
5caeef02
DB
3106
3107 /*
3108 * Determine number of logical children, n. See comment
3109 * above raidz_simulate_failure().
3110 */
3111 int n = 0;
3112 for (int w = physical_width;
3113 w >= original_width; w--) {
3114 n += w;
3115 }
b2255edc
BB
3116
3117 ASSERT3U(num_failures, <=, nparity);
3118 ASSERT3U(num_failures, <=, VDEV_RAIDZ_MAXPARITY);
3119
3120 /* Handle corner cases in combrec logic */
3121 ltgts[-1] = -1;
3122 for (int i = 0; i < num_failures; i++) {
3123 ltgts[i] = i;
3124 }
3125 ltgts[num_failures] = n;
3126
3127 for (;;) {
3128 int err = raidz_reconstruct(zio, ltgts, num_failures,
3129 nparity);
3130 if (err == EINVAL) {
45d1cae3 3131 /*
b2255edc
BB
3132 * Reconstruction not possible with this #
3133 * failures; try more failures.
45d1cae3 3134 */
b2255edc
BB
3135 break;
3136 } else if (err == 0)
3137 return (0);
3138
3139 /* Compute next targets to try */
3140 for (int t = 0; ; t++) {
3141 ASSERT3U(t, <, num_failures);
3142 ltgts[t]++;
3143 if (ltgts[t] == n) {
3144 /* try more failures */
3145 ASSERT3U(t, ==, num_failures - 1);
5caeef02
DB
3146 if (zfs_flags &
3147 ZFS_DEBUG_RAIDZ_RECONSTRUCT) {
3148 zfs_dbgmsg("reconstruction "
3149 "failed for num_failures="
3150 "%u; tried all "
3151 "combinations",
3152 num_failures);
3153 }
b2255edc
BB
3154 break;
3155 }
45d1cae3 3156
b2255edc
BB
3157 ASSERT3U(ltgts[t], <, n);
3158 ASSERT3U(ltgts[t], <=, ltgts[t + 1]);
45d1cae3
BB
3159
3160 /*
3161 * If that spot is available, we're done here.
b2255edc 3162 * Try the next combination.
45d1cae3 3163 */
b2255edc 3164 if (ltgts[t] != ltgts[t + 1])
5caeef02 3165 break; // found next combination
45d1cae3
BB
3166
3167 /*
b2255edc
BB
3168 * Otherwise, reset this tgt to the minimum,
3169 * and move on to the next tgt.
45d1cae3 3170 */
b2255edc
BB
3171 ltgts[t] = ltgts[t - 1] + 1;
3172 ASSERT3U(ltgts[t], ==, t);
3173 }
45d1cae3 3174
b2255edc
BB
3175 /* Increase the number of failures and keep trying. */
3176 if (ltgts[num_failures - 1] == n)
3177 break;
45d1cae3
BB
3178 }
3179 }
5caeef02
DB
3180 if (zfs_flags & ZFS_DEBUG_RAIDZ_RECONSTRUCT)
3181 zfs_dbgmsg("reconstruction failed for all num_failures");
b2255edc
BB
3182 return (ECKSUM);
3183}
3184
3185void
3186vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt)
3187{
3188 for (uint64_t row = 0; row < rm->rm_nrows; row++) {
3189 raidz_row_t *rr = rm->rm_row[row];
3190 vdev_raidz_reconstruct_row(rm, rr, t, nt);
3191 }
45d1cae3
BB
3192}
3193
e49f1e20 3194/*
b2255edc 3195 * Complete a write IO operation on a RAIDZ VDev
e49f1e20
WA
3196 *
3197 * Outline:
e49f1e20
WA
3198 * 1. Check for errors on the child IOs.
3199 * 2. Return, setting an error code if too few child VDevs were written
3200 * to reconstruct the data later. Note that partial writes are
3201 * considered successful if they can be reconstructed at all.
e49f1e20 3202 */
b128c09f 3203static void
b2255edc
BB
3204vdev_raidz_io_done_write_impl(zio_t *zio, raidz_row_t *rr)
3205{
5caeef02
DB
3206 int normal_errors = 0;
3207 int shadow_errors = 0;
b2255edc
BB
3208
3209 ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
3210 ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
3211 ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE);
3212
3213 for (int c = 0; c < rr->rr_cols; c++) {
3214 raidz_col_t *rc = &rr->rr_col[c];
3215
5caeef02 3216 if (rc->rc_error != 0) {
b2255edc 3217 ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
5caeef02
DB
3218 normal_errors++;
3219 }
3220 if (rc->rc_shadow_error != 0) {
3221 ASSERT(rc->rc_shadow_error != ECKSUM);
3222 shadow_errors++;
b2255edc
BB
3223 }
3224 }
3225
3226 /*
3227 * Treat partial writes as a success. If we couldn't write enough
5caeef02
DB
3228 * columns to reconstruct the data, the I/O failed. Otherwise, good
3229 * enough. Note that in the case of a shadow write (during raidz
3230 * expansion), depending on if we crash, either the normal (old) or
3231 * shadow (new) location may become the "real" version of the block,
3232 * so both locations must have sufficient redundancy.
b2255edc
BB
3233 *
3234 * Now that we support write reallocation, it would be better
3235 * to treat partial failure as real failure unless there are
3236 * no non-degraded top-level vdevs left, and not update DTLs
3237 * if we intend to reallocate.
3238 */
5caeef02
DB
3239 if (normal_errors > rr->rr_firstdatacol ||
3240 shadow_errors > rr->rr_firstdatacol) {
b2255edc
BB
3241 zio->io_error = zio_worst_error(zio->io_error,
3242 vdev_raidz_worst_error(rr));
3243 }
3244}
3245
46df6e98 3246static void
b2255edc
BB
3247vdev_raidz_io_done_reconstruct_known_missing(zio_t *zio, raidz_map_t *rm,
3248 raidz_row_t *rr)
34dc7c2f 3249{
34dc7c2f
BB
3250 int parity_errors = 0;
3251 int parity_untried = 0;
3252 int data_errors = 0;
b128c09f 3253 int total_errors = 0;
34dc7c2f 3254
b2255edc
BB
3255 ASSERT3U(rr->rr_missingparity, <=, rr->rr_firstdatacol);
3256 ASSERT3U(rr->rr_missingdata, <=, rr->rr_cols - rr->rr_firstdatacol);
34dc7c2f 3257
b2255edc
BB
3258 for (int c = 0; c < rr->rr_cols; c++) {
3259 raidz_col_t *rc = &rr->rr_col[c];
34dc7c2f 3260
ad8b9f94
BB
3261 /*
3262 * If scrubbing and a replacing/sparing child vdev determined
3263 * that not all of its children have an identical copy of the
3264 * data, then clear the error so the column is treated like
3265 * any other read and force a repair to correct the damage.
3266 */
3267 if (rc->rc_error == ECKSUM) {
3268 ASSERT(zio->io_flags & ZIO_FLAG_SCRUB);
3269 vdev_raidz_checksum_error(zio, rc, rc->rc_abd);
3270 rc->rc_force_repair = 1;
3271 rc->rc_error = 0;
3272 }
34dc7c2f 3273
ad8b9f94 3274 if (rc->rc_error) {
b2255edc 3275 if (c < rr->rr_firstdatacol)
34dc7c2f
BB
3276 parity_errors++;
3277 else
3278 data_errors++;
3279
b128c09f 3280 total_errors++;
b2255edc 3281 } else if (c < rr->rr_firstdatacol && !rc->rc_tried) {
34dc7c2f
BB
3282 parity_untried++;
3283 }
3284 }
3285
34dc7c2f 3286 /*
b2255edc
BB
3287 * If there were data errors and the number of errors we saw was
3288 * correctable -- less than or equal to the number of parity disks read
3289 * -- reconstruct based on the missing data.
34dc7c2f 3290 */
b2255edc
BB
3291 if (data_errors != 0 &&
3292 total_errors <= rr->rr_firstdatacol - parity_untried) {
3293 /*
3294 * We either attempt to read all the parity columns or
3295 * none of them. If we didn't try to read parity, we
3296 * wouldn't be here in the correctable case. There must
3297 * also have been fewer parity errors than parity
3298 * columns or, again, we wouldn't be in this code path.
3299 */
3300 ASSERT(parity_untried == 0);
3301 ASSERT(parity_errors < rr->rr_firstdatacol);
34dc7c2f 3302
b2255edc
BB
3303 /*
3304 * Identify the data columns that reported an error.
3305 */
3306 int n = 0;
3307 int tgts[VDEV_RAIDZ_MAXPARITY];
3308 for (int c = rr->rr_firstdatacol; c < rr->rr_cols; c++) {
3309 raidz_col_t *rc = &rr->rr_col[c];
3310 if (rc->rc_error != 0) {
3311 ASSERT(n < VDEV_RAIDZ_MAXPARITY);
3312 tgts[n++] = c;
34dc7c2f 3313 }
b2255edc 3314 }
34dc7c2f 3315
b2255edc 3316 ASSERT(rr->rr_firstdatacol >= n);
34dc7c2f 3317
46df6e98 3318 vdev_raidz_reconstruct_row(rm, rr, tgts, n);
b2255edc 3319 }
b2255edc 3320}
34dc7c2f 3321
b2255edc
BB
3322/*
3323 * Return the number of reads issued.
3324 */
3325static int
3326vdev_raidz_read_all(zio_t *zio, raidz_row_t *rr)
3327{
3328 vdev_t *vd = zio->io_vd;
3329 int nread = 0;
34dc7c2f 3330
b2255edc
BB
3331 rr->rr_missingdata = 0;
3332 rr->rr_missingparity = 0;
34dc7c2f
BB
3333
3334 /*
b2255edc
BB
 3335	 * If this row contains empty sectors which are not required
3336 * for a normal read then allocate an ABD for them now so they
3337 * may be read, verified, and any needed repairs performed.
34dc7c2f 3338 */
5caeef02 3339 if (rr->rr_nempty != 0 && rr->rr_abd_empty == NULL)
b2255edc 3340 vdev_draid_map_alloc_empty(zio, rr);
34dc7c2f 3341
b2255edc
BB
3342 for (int c = 0; c < rr->rr_cols; c++) {
3343 raidz_col_t *rc = &rr->rr_col[c];
3344 if (rc->rc_tried || rc->rc_size == 0)
34dc7c2f
BB
3345 continue;
3346
b2255edc
BB
3347 zio_nowait(zio_vdev_child_io(zio, NULL,
3348 vd->vdev_child[rc->rc_devidx],
3349 rc->rc_offset, rc->rc_abd, rc->rc_size,
3350 zio->io_type, zio->io_priority, 0,
3351 vdev_raidz_child_done, rc));
3352 nread++;
34dc7c2f 3353 }
b2255edc
BB
3354 return (nread);
3355}
34dc7c2f 3356
b2255edc
BB
3357/*
3358 * We're here because either there were too many errors to even attempt
 3359 * reconstruction (total_errors == rr_firstdatacol), or vdev_*_combrec()
3360 * failed. In either case, there is enough bad data to prevent reconstruction.
3361 * Start checksum ereports for all children which haven't failed.
3362 */
3363static void
3364vdev_raidz_io_done_unrecoverable(zio_t *zio)
3365{
3366 raidz_map_t *rm = zio->io_vsd;
34dc7c2f 3367
b2255edc
BB
3368 for (int i = 0; i < rm->rm_nrows; i++) {
3369 raidz_row_t *rr = rm->rm_row[i];
428870ff 3370
b2255edc
BB
3371 for (int c = 0; c < rr->rr_cols; c++) {
3372 raidz_col_t *rc = &rr->rr_col[c];
3373 vdev_t *cvd = zio->io_vd->vdev_child[rc->rc_devidx];
3374
3375 if (rc->rc_error != 0)
3376 continue;
3377
3378 zio_bad_cksum_t zbc;
3379 zbc.zbc_has_cksum = 0;
3380 zbc.zbc_injected = rm->rm_ecksuminjected;
3381
03e02e5b
DB
3382 mutex_enter(&cvd->vdev_stat_lock);
3383 cvd->vdev_stat.vs_checksum_errors++;
3384 mutex_exit(&cvd->vdev_stat_lock);
7a75f74c
RW
3385 (void) zfs_ereport_start_checksum(zio->io_spa,
3386 cvd, &zio->io_bookmark, zio, rc->rc_offset,
3387 rc->rc_size, &zbc);
34dc7c2f
BB
3388 }
3389 }
b2255edc 3390}
34dc7c2f 3391
b2255edc
BB
3392void
3393vdev_raidz_io_done(zio_t *zio)
3394{
3395 raidz_map_t *rm = zio->io_vsd;
34dc7c2f 3396
5caeef02 3397 ASSERT(zio->io_bp != NULL);
b2255edc
BB
3398 if (zio->io_type == ZIO_TYPE_WRITE) {
3399 for (int i = 0; i < rm->rm_nrows; i++) {
3400 vdev_raidz_io_done_write_impl(zio, rm->rm_row[i]);
3401 }
3402 } else {
5caeef02
DB
3403 if (rm->rm_phys_col) {
3404 /*
3405 * This is an aggregated read. Copy the data and status
3406 * from the aggregate abd's to the individual rows.
3407 */
3408 for (int i = 0; i < rm->rm_nrows; i++) {
3409 raidz_row_t *rr = rm->rm_row[i];
3410
3411 for (int c = 0; c < rr->rr_cols; c++) {
3412 raidz_col_t *rc = &rr->rr_col[c];
3413 if (rc->rc_tried || rc->rc_size == 0)
3414 continue;
3415
3416 raidz_col_t *prc =
3417 &rm->rm_phys_col[rc->rc_devidx];
3418 rc->rc_error = prc->rc_error;
3419 rc->rc_tried = prc->rc_tried;
3420 rc->rc_skipped = prc->rc_skipped;
3421 if (c >= rr->rr_firstdatacol) {
3422 /*
3423 * Note: this is slightly faster
3424 * than using abd_copy_off().
3425 */
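					/*
					 * rc_offset and prc_offset are
					 * offsets on the same child vdev, so
					 * their difference locates this
					 * column's data within the
					 * aggregated per-device buffer.
					 */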
3426 char *physbuf = abd_to_buf(
3427 prc->rc_abd);
3428 void *physloc = physbuf +
3429 rc->rc_offset -
3430 prc->rc_offset;
3431
3432 abd_copy_from_buf(rc->rc_abd,
3433 physloc, rc->rc_size);
3434 }
3435 }
3436 }
3437 }
3438
b2255edc
BB
3439 for (int i = 0; i < rm->rm_nrows; i++) {
3440 raidz_row_t *rr = rm->rm_row[i];
46df6e98 3441 vdev_raidz_io_done_reconstruct_known_missing(zio,
b2255edc
BB
3442 rm, rr);
3443 }
34dc7c2f 3444
b2255edc
BB
3445 if (raidz_checksum_verify(zio) == 0) {
3446 for (int i = 0; i < rm->rm_nrows; i++) {
3447 raidz_row_t *rr = rm->rm_row[i];
3448 vdev_raidz_io_done_verified(zio, rr);
3449 }
3450 zio_checksum_verified(zio);
3451 } else {
3452 /*
3453 * A sequential resilver has no checksum which makes
 3454			 * combinatorial reconstruction impossible. This code
3455 * path is unreachable since raidz_checksum_verify()
3456 * has no checksum to verify and must succeed.
3457 */
3458 ASSERT3U(zio->io_priority, !=, ZIO_PRIORITY_REBUILD);
34dc7c2f 3459
b2255edc
BB
3460 /*
3461 * This isn't a typical situation -- either we got a
3462 * read error or a child silently returned bad data.
3463 * Read every block so we can try again with as much
3464 * data and parity as we can track down. If we've
3465 * already been through once before, all children will
3466 * be marked as tried so we'll proceed to combinatorial
3467 * reconstruction.
3468 */
3469 int nread = 0;
3470 for (int i = 0; i < rm->rm_nrows; i++) {
3471 nread += vdev_raidz_read_all(zio,
3472 rm->rm_row[i]);
3473 }
3474 if (nread != 0) {
3475 /*
3476 * Normally our stage is VDEV_IO_DONE, but if
3477 * we've already called redone(), it will have
3478 * changed to VDEV_IO_START, in which case we
3479 * don't want to call redone() again.
3480 */
3481 if (zio->io_stage != ZIO_STAGE_VDEV_IO_START)
3482 zio_vdev_io_redone(zio);
3483 return;
3484 }
5caeef02
DB
3485 /*
3486 * It would be too expensive to try every possible
3487 * combination of failed sectors in every row, so
3488 * instead we try every combination of failed current or
3489 * past physical disk. This means that if the incorrect
3490 * sectors were all on Nparity disks at any point in the
3491 * past, we will find the correct data. The only known
3492 * case where this is less durable than a non-expanded
 3493			 * RAIDZ is if we have a silent failure during
3494 * expansion. In that case, one block could be
3495 * partially in the old format and partially in the
 3496			 * new format, so we'd lose some sectors from the old
3497 * format and some from the new format.
3498 *
3499 * e.g. logical_width=4 physical_width=6
3500 * the 15 (6+5+4) possible failed disks are:
3501 * width=6 child=0
3502 * width=6 child=1
3503 * width=6 child=2
3504 * width=6 child=3
3505 * width=6 child=4
3506 * width=6 child=5
3507 * width=5 child=0
3508 * width=5 child=1
3509 * width=5 child=2
3510 * width=5 child=3
3511 * width=5 child=4
3512 * width=4 child=0
3513 * width=4 child=1
3514 * width=4 child=2
3515 * width=4 child=3
3516 * And we will try every combination of Nparity of these
3517 * failing.
3518 *
3519 * As a first pass, we can generate every combo,
3520 * and try reconstructing, ignoring any known
3521 * failures. If any row has too many known + simulated
3522 * failures, then we bail on reconstructing with this
3523 * number of simulated failures. As an improvement,
3524 * we could detect the number of whole known failures
3525 * (i.e. we have known failures on these disks for
3526 * every row; the disks never succeeded), and
3527 * subtract that from the max # failures to simulate.
3528 * We could go even further like the current
3529 * combrec code, but that doesn't seem like it
3530 * gains us very much. If we simulate a failure
3531 * that is also a known failure, that's fine.
3532 */
b2255edc
BB
3533 zio->io_error = vdev_raidz_combrec(zio);
3534 if (zio->io_error == ECKSUM &&
3535 !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
3536 vdev_raidz_io_done_unrecoverable(zio);
3537 }
34dc7c2f 3538 }
34dc7c2f 3539 }
5caeef02
DB
3540 if (rm->rm_lr != NULL) {
3541 zfs_rangelock_exit(rm->rm_lr);
3542 rm->rm_lr = NULL;
3543 }
34dc7c2f
BB
3544}
3545
3546static void
3547vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
3548{
b2255edc
BB
3549 vdev_raidz_t *vdrz = vd->vdev_tsd;
3550 if (faulted > vdrz->vd_nparity)
34dc7c2f
BB
3551 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
3552 VDEV_AUX_NO_REPLICAS);
3553 else if (degraded + faulted != 0)
3554 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
3555 else
3556 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
3557}
3558
3d6da72d
IH
3559/*
3560 * Determine if any portion of the provided block resides on a child vdev
3561 * with a dirty DTL and therefore needs to be resilvered. The function
e1cfd73f 3562 * assumes that at least one DTL is dirty which implies that full stripe
3d6da72d
IH
3563 * width blocks must be resilvered.
3564 */
3565static boolean_t
b2255edc
BB
3566vdev_raidz_need_resilver(vdev_t *vd, const dva_t *dva, size_t psize,
3567 uint64_t phys_birth)
3d6da72d 3568{
b2255edc 3569 vdev_raidz_t *vdrz = vd->vdev_tsd;
5caeef02
DB
3570
3571 /*
3572 * If we're in the middle of a RAIDZ expansion, this block may be in
3573 * the old and/or new location. For simplicity, always resilver it.
3574 */
3575 if (vdrz->vn_vre.vre_state == DSS_SCANNING)
3576 return (B_TRUE);
3577
3d6da72d 3578 uint64_t dcols = vd->vdev_children;
b2255edc 3579 uint64_t nparity = vdrz->vd_nparity;
3d6da72d
IH
3580 uint64_t ashift = vd->vdev_top->vdev_ashift;
3581 /* The starting RAIDZ (parent) vdev sector of the block. */
b2255edc 3582 uint64_t b = DVA_GET_OFFSET(dva) >> ashift;
3d6da72d
IH
3583 /* The zio's size in units of the vdev's minimum sector size. */
3584 uint64_t s = ((psize - 1) >> ashift) + 1;
3585 /* The first column for this stripe. */
3586 uint64_t f = b % dcols;
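	/*
	 * Worked example (values chosen for illustration): with dcols = 6,
	 * nparity = 1, ashift = 12, psize = 16K and a DVA offset of 0x20000,
	 * we get b = 32, s = 4 and f = 2, so the loop below checks child
	 * vdevs 2, 3, 4, 5 and 0 for dirty DTLs.
	 */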
3587
b2255edc
BB
3588 /* Unreachable by sequential resilver. */
3589 ASSERT3U(phys_birth, !=, TXG_UNKNOWN);
3590
3591 if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1))
3592 return (B_FALSE);
3593
3d6da72d
IH
3594 if (s + nparity >= dcols)
3595 return (B_TRUE);
3596
3597 for (uint64_t c = 0; c < s + nparity; c++) {
3598 uint64_t devidx = (f + c) % dcols;
3599 vdev_t *cvd = vd->vdev_child[devidx];
3600
3601 /*
3602 * dsl_scan_need_resilver() already checked vd with
3603 * vdev_dtl_contains(). So here just check cvd with
3604 * vdev_dtl_empty(), cheaper and a good approximation.
3605 */
3606 if (!vdev_dtl_empty(cvd, DTL_PARTIAL))
3607 return (B_TRUE);
3608 }
3609
3610 return (B_FALSE);
3611}
3612
619f0976 3613static void
b2255edc
BB
3614vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *logical_rs,
3615 range_seg64_t *physical_rs, range_seg64_t *remain_rs)
619f0976 3616{
14e4e3cb
AZ
3617 (void) remain_rs;
3618
619f0976
GW
3619 vdev_t *raidvd = cvd->vdev_parent;
3620 ASSERT(raidvd->vdev_ops == &vdev_raidz_ops);
3621
5caeef02
DB
3622 vdev_raidz_t *vdrz = raidvd->vdev_tsd;
3623
3624 if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
3625 /*
3626 * We're in the middle of expansion, in which case the
3627 * translation is in flux. Any answer we give may be wrong
3628 * by the time we return, so it isn't safe for the caller to
3629 * act on it. Therefore we say that this range isn't present
3630 * on any children. The only consumers of this are "zpool
3631 * initialize" and trimming, both of which are "best effort"
3632 * anyway.
3633 */
3634 physical_rs->rs_start = physical_rs->rs_end = 0;
3635 remain_rs->rs_start = remain_rs->rs_end = 0;
3636 return;
3637 }
3638
3639 uint64_t width = vdrz->vd_physical_width;
619f0976
GW
3640 uint64_t tgt_col = cvd->vdev_id;
3641 uint64_t ashift = raidvd->vdev_top->vdev_ashift;
3642
3643 /* make sure the offsets are block-aligned */
b2255edc
BB
3644 ASSERT0(logical_rs->rs_start % (1 << ashift));
3645 ASSERT0(logical_rs->rs_end % (1 << ashift));
3646 uint64_t b_start = logical_rs->rs_start >> ashift;
3647 uint64_t b_end = logical_rs->rs_end >> ashift;
619f0976
GW
3648
3649 uint64_t start_row = 0;
3650 if (b_start > tgt_col) /* avoid underflow */
3651 start_row = ((b_start - tgt_col - 1) / width) + 1;
3652
3653 uint64_t end_row = 0;
3654 if (b_end > tgt_col)
3655 end_row = ((b_end - tgt_col - 1) / width) + 1;
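	/*
	 * For instance (illustrative values): width = 5, tgt_col = 2,
	 * b_start = 7 and b_end = 17 yield start_row = 1 and end_row = 3,
	 * i.e. the logical range spans two sectors on this child.
	 */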
3656
b2255edc
BB
3657 physical_rs->rs_start = start_row << ashift;
3658 physical_rs->rs_end = end_row << ashift;
619f0976 3659
b2255edc
BB
3660 ASSERT3U(physical_rs->rs_start, <=, logical_rs->rs_start);
3661 ASSERT3U(physical_rs->rs_end - physical_rs->rs_start, <=,
3662 logical_rs->rs_end - logical_rs->rs_start);
3663}
3664
5caeef02
DB
3665static void
3666raidz_reflow_sync(void *arg, dmu_tx_t *tx)
b2255edc 3667{
5caeef02
DB
3668 spa_t *spa = arg;
3669 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3670 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
b2255edc 3671
5caeef02
DB
3672 /*
3673 * Ensure there are no i/os to the range that is being committed.
3674 */
3675 uint64_t old_offset = RRSS_GET_OFFSET(&spa->spa_uberblock);
3676 ASSERT3U(vre->vre_offset_pertxg[txgoff], >=, old_offset);
b2255edc 3677
5caeef02
DB
3678 mutex_enter(&vre->vre_lock);
3679 uint64_t new_offset =
3680 MIN(vre->vre_offset_pertxg[txgoff], vre->vre_failed_offset);
3681 /*
3682 * We should not have committed anything that failed.
3683 */
3684 VERIFY3U(vre->vre_failed_offset, >=, old_offset);
3685 mutex_exit(&vre->vre_lock);
b2255edc 3686
5caeef02
DB
3687 zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
3688 old_offset, new_offset - old_offset,
3689 RL_WRITER);
b2255edc 3690
5caeef02
DB
3691 /*
3692 * Update the uberblock that will be written when this txg completes.
3693 */
3694 RAIDZ_REFLOW_SET(&spa->spa_uberblock,
3695 RRSS_SCRATCH_INVALID_SYNCED_REFLOW, new_offset);
3696 vre->vre_offset_pertxg[txgoff] = 0;
3697 zfs_rangelock_exit(lr);
3698
3699 mutex_enter(&vre->vre_lock);
3700 vre->vre_bytes_copied += vre->vre_bytes_copied_pertxg[txgoff];
3701 vre->vre_bytes_copied_pertxg[txgoff] = 0;
3702 mutex_exit(&vre->vre_lock);
3703
3704 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
3705 VERIFY0(zap_update(spa->spa_meta_objset,
3706 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
3707 sizeof (vre->vre_bytes_copied), 1, &vre->vre_bytes_copied, tx));
3708}
b2255edc 3709
5caeef02
DB
3710static void
3711raidz_reflow_complete_sync(void *arg, dmu_tx_t *tx)
3712{
3713 spa_t *spa = arg;
3714 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
3715 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
3716 vdev_raidz_t *vdrz = raidvd->vdev_tsd;
b2255edc 3717
5caeef02
DB
3718 for (int i = 0; i < TXG_SIZE; i++)
3719 VERIFY0(vre->vre_offset_pertxg[i]);
b2255edc 3720
5caeef02
DB
3721 reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
3722 re->re_txg = tx->tx_txg + TXG_CONCURRENT_STATES;
3723 re->re_logical_width = vdrz->vd_physical_width;
3724 mutex_enter(&vdrz->vd_expand_lock);
3725 avl_add(&vdrz->vd_expand_txgs, re);
3726 mutex_exit(&vdrz->vd_expand_lock);
3727
3728 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
3729
3730 /*
3731 * Dirty the config so that the updated ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS
3732 * will get written (based on vd_expand_txgs).
3733 */
3734 vdev_config_dirty(vd);
3735
3736 /*
3737 * Before we change vre_state, the on-disk state must reflect that we
3738 * have completed all copying, so that vdev_raidz_io_start() can use
3739 * vre_state to determine if the reflow is in progress. See also the
3740 * end of spa_raidz_expand_thread().
3741 */
3742 VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==,
3743 raidvd->vdev_ms_count << raidvd->vdev_ms_shift);
3744
3745 vre->vre_end_time = gethrestime_sec();
3746 vre->vre_state = DSS_FINISHED;
3747
3748 uint64_t state = vre->vre_state;
3749 VERIFY0(zap_update(spa->spa_meta_objset,
3750 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
3751 sizeof (state), 1, &state, tx));
3752
3753 uint64_t end_time = vre->vre_end_time;
3754 VERIFY0(zap_update(spa->spa_meta_objset,
3755 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
3756 sizeof (end_time), 1, &end_time, tx));
3757
3758 spa->spa_uberblock.ub_raidz_reflow_info = 0;
3759
3760 spa_history_log_internal(spa, "raidz vdev expansion completed", tx,
3761 "%s vdev %llu new width %llu", spa_name(spa),
3762 (unsigned long long)vd->vdev_id,
3763 (unsigned long long)vd->vdev_children);
3764
3765 spa->spa_raidz_expand = NULL;
3766 raidvd->vdev_rz_expanding = B_FALSE;
3767
3768 spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
3769 spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
3770 spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
3771
3772 spa_notify_waiters(spa);
3773
3774 /*
3775 * While we're in syncing context take the opportunity to
 3776	 * set up a scrub. All the data has been successfully copied
3777 * but we have not validated any checksums.
3778 */
3779 pool_scan_func_t func = POOL_SCAN_SCRUB;
3780 if (zfs_scrub_after_expand && dsl_scan_setup_check(&func, tx) == 0)
3781 dsl_scan_setup_sync(&func, tx);
b2255edc
BB
3782}
3783
5caeef02
DB
3784/*
3785 * Struct for one copy zio.
3786 */
3787typedef struct raidz_reflow_arg {
3788 vdev_raidz_expand_t *rra_vre;
3789 zfs_locked_range_t *rra_lr;
3790 uint64_t rra_txg;
3791} raidz_reflow_arg_t;
3792
3793/*
3794 * The write of the new location is done.
3795 */
b2255edc 3796static void
5caeef02 3797raidz_reflow_write_done(zio_t *zio)
b2255edc 3798{
5caeef02
DB
3799 raidz_reflow_arg_t *rra = zio->io_private;
3800 vdev_raidz_expand_t *vre = rra->rra_vre;
3801
3802 abd_free(zio->io_abd);
3803
3804 mutex_enter(&vre->vre_lock);
3805 if (zio->io_error != 0) {
3806 /* Force a reflow pause on errors */
3807 vre->vre_failed_offset =
3808 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3809 }
3810 ASSERT3U(vre->vre_outstanding_bytes, >=, zio->io_size);
3811 vre->vre_outstanding_bytes -= zio->io_size;
3812 if (rra->rra_lr->lr_offset + rra->rra_lr->lr_length <
3813 vre->vre_failed_offset) {
3814 vre->vre_bytes_copied_pertxg[rra->rra_txg & TXG_MASK] +=
3815 zio->io_size;
3816 }
3817 cv_signal(&vre->vre_cv);
3818 mutex_exit(&vre->vre_lock);
3819
3820 zfs_rangelock_exit(rra->rra_lr);
3821
3822 kmem_free(rra, sizeof (*rra));
3823 spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa);
b2255edc
BB
3824}
3825
3826/*
5caeef02
DB
3827 * The read of the old location is done. The parent zio is the write to
3828 * the new location. Allow it to start.
b2255edc
BB
3829 */
3830static void
5caeef02 3831raidz_reflow_read_done(zio_t *zio)
b2255edc 3832{
5caeef02
DB
3833 raidz_reflow_arg_t *rra = zio->io_private;
3834 vdev_raidz_expand_t *vre = rra->rra_vre;
b2255edc
BB
3835
3836 /*
5caeef02
DB
3837 * If the read failed, or if it was done on a vdev that is not fully
3838 * healthy (e.g. a child that has a resilver in progress), we may not
3839 * have the correct data. Note that it's OK if the write proceeds.
3840 * It may write garbage but the location is otherwise unused and we
3841 * will retry later due to vre_failed_offset.
b2255edc 3842 */
5caeef02
DB
3843 if (zio->io_error != 0 || !vdev_dtl_empty(zio->io_vd, DTL_MISSING)) {
3844 zfs_dbgmsg("reflow read failed off=%llu size=%llu txg=%llu "
3845 "err=%u partial_dtl_empty=%u missing_dtl_empty=%u",
3846 (long long)rra->rra_lr->lr_offset,
3847 (long long)rra->rra_lr->lr_length,
3848 (long long)rra->rra_txg,
3849 zio->io_error,
3850 vdev_dtl_empty(zio->io_vd, DTL_PARTIAL),
3851 vdev_dtl_empty(zio->io_vd, DTL_MISSING));
3852 mutex_enter(&vre->vre_lock);
3853 /* Force a reflow pause on errors */
3854 vre->vre_failed_offset =
3855 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3856 mutex_exit(&vre->vre_lock);
3857 }
b2255edc 3858
5caeef02 3859 zio_nowait(zio_unique_parent(zio));
b2255edc
BB
3860}
3861
5caeef02
DB
3862static void
3863raidz_reflow_record_progress(vdev_raidz_expand_t *vre, uint64_t offset,
3864 dmu_tx_t *tx)
b2255edc 3865{
5caeef02
DB
3866 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3867 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
b2255edc 3868
5caeef02
DB
3869 if (offset == 0)
3870 return;
3871
3872 mutex_enter(&vre->vre_lock);
3873 ASSERT3U(vre->vre_offset, <=, offset);
3874 vre->vre_offset = offset;
3875 mutex_exit(&vre->vre_lock);
3876
3877 if (vre->vre_offset_pertxg[txgoff] == 0) {
3878 dsl_sync_task_nowait(dmu_tx_pool(tx), raidz_reflow_sync,
3879 spa, tx);
3880 }
3881 vre->vre_offset_pertxg[txgoff] = offset;
619f0976
GW
3882}
3883
5caeef02
DB
3884static boolean_t
3885vdev_raidz_expand_child_replacing(vdev_t *raidz_vd)
3886{
3887 for (int i = 0; i < raidz_vd->vdev_children; i++) {
3888 /* Quick check if a child is being replaced */
3889 if (!raidz_vd->vdev_child[i]->vdev_ops->vdev_op_leaf)
3890 return (B_TRUE);
3891 }
3892 return (B_FALSE);
3893}
3894
3895static boolean_t
3896raidz_reflow_impl(vdev_t *vd, vdev_raidz_expand_t *vre, range_tree_t *rt,
3897 dmu_tx_t *tx)
3898{
3899 spa_t *spa = vd->vdev_spa;
3900 int ashift = vd->vdev_top->vdev_ashift;
3901 uint64_t offset, size;
3902
3903 if (!range_tree_find_in(rt, 0, vd->vdev_top->vdev_asize,
3904 &offset, &size)) {
3905 return (B_FALSE);
3906 }
3907 ASSERT(IS_P2ALIGNED(offset, 1 << ashift));
3908 ASSERT3U(size, >=, 1 << ashift);
3909 uint64_t length = 1 << ashift;
3910 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
3911
3912 uint64_t blkid = offset >> ashift;
3913
3914 int old_children = vd->vdev_children - 1;
3915
3916 /*
3917 * We can only progress to the point that writes will not overlap
3918 * with blocks whose progress has not yet been recorded on disk.
3919 * Since partially-copied rows are still read from the old location,
3920 * we need to stop one row before the sector-wise overlap, to prevent
3921 * row-wise overlap.
3922 *
3923 * Note that even if we are skipping over a large unallocated region,
3924 * we can't move the on-disk progress to `offset`, because concurrent
3925 * writes/allocations could still use the currently-unallocated
3926 * region.
3927 */
3928 uint64_t ubsync_blkid =
3929 RRSS_GET_OFFSET(&spa->spa_ubsync) >> ashift;
3930 uint64_t next_overwrite_blkid = ubsync_blkid +
3931 ubsync_blkid / old_children - old_children;
3932 VERIFY3U(next_overwrite_blkid, >, ubsync_blkid);
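	/*
	 * Numeric illustration: with old_children = 4 and
	 * ubsync_blkid = 1000, copying may proceed up to (but not including)
	 * blkid 1000 + 250 - 4 = 1246. Writes in the new 5-wide layout then
	 * stay at least one row below old-layout row 250, the first row
	 * whose copy has not yet been committed to disk.
	 */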
3933
3934 if (blkid >= next_overwrite_blkid) {
3935 raidz_reflow_record_progress(vre,
3936 next_overwrite_blkid << ashift, tx);
3937 return (B_TRUE);
3938 }
3939
3940 range_tree_remove(rt, offset, length);
3941
3942 raidz_reflow_arg_t *rra = kmem_zalloc(sizeof (*rra), KM_SLEEP);
3943 rra->rra_vre = vre;
3944 rra->rra_lr = zfs_rangelock_enter(&vre->vre_rangelock,
3945 offset, length, RL_WRITER);
3946 rra->rra_txg = dmu_tx_get_txg(tx);
3947
3948 raidz_reflow_record_progress(vre, offset + length, tx);
3949
3950 mutex_enter(&vre->vre_lock);
3951 vre->vre_outstanding_bytes += length;
3952 mutex_exit(&vre->vre_lock);
3953
3954 /*
3955 * SCL_STATE will be released when the read and write are done,
3956 * by raidz_reflow_write_done().
3957 */
3958 spa_config_enter(spa, SCL_STATE, spa, RW_READER);
3959
 3960	/* Check if a replacing vdev was added; if so, treat it as an error. */
3961 if (vdev_raidz_expand_child_replacing(vd)) {
3962 zfs_dbgmsg("replacing vdev encountered, reflow paused at "
3963 "offset=%llu txg=%llu",
3964 (long long)rra->rra_lr->lr_offset,
3965 (long long)rra->rra_txg);
3966
3967 mutex_enter(&vre->vre_lock);
3968 vre->vre_failed_offset =
3969 MIN(vre->vre_failed_offset, rra->rra_lr->lr_offset);
3970 cv_signal(&vre->vre_cv);
3971 mutex_exit(&vre->vre_lock);
3972
3973 /* drop everything we acquired */
3974 zfs_rangelock_exit(rra->rra_lr);
3975 kmem_free(rra, sizeof (*rra));
3976 spa_config_exit(spa, SCL_STATE, spa);
3977 return (B_TRUE);
3978 }
3979
3980 zio_t *pio = spa->spa_txg_zio[txgoff];
3981 abd_t *abd = abd_alloc_for_io(length, B_FALSE);
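	/*
	 * Copy one sector: the read below sources the old-layout location
	 * (child blkid % old_children, row blkid / old_children), and the
	 * write, created first as the read's parent, rewrites the same abd
	 * at the new-layout location (child blkid % vdev_children, row
	 * blkid / vdev_children). raidz_reflow_read_done() releases the
	 * write once the read has completed.
	 */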
3982 zio_t *write_zio = zio_vdev_child_io(pio, NULL,
3983 vd->vdev_child[blkid % vd->vdev_children],
3984 (blkid / vd->vdev_children) << ashift,
3985 abd, length,
3986 ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL,
3987 ZIO_FLAG_CANFAIL,
3988 raidz_reflow_write_done, rra);
3989
3990 zio_nowait(zio_vdev_child_io(write_zio, NULL,
3991 vd->vdev_child[blkid % old_children],
3992 (blkid / old_children) << ashift,
3993 abd, length,
3994 ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL,
3995 ZIO_FLAG_CANFAIL,
3996 raidz_reflow_read_done, rra));
3997
3998 return (B_FALSE);
3999}
4000
4001/*
4002 * For testing (ztest specific)
4003 */
4004static void
4005raidz_expand_pause(uint_t pause_point)
4006{
4007 while (raidz_expand_pause_point != 0 &&
4008 raidz_expand_pause_point <= pause_point)
4009 delay(hz);
4010}
4011
4012static void
4013raidz_scratch_child_done(zio_t *zio)
4014{
4015 zio_t *pio = zio->io_private;
4016
4017 mutex_enter(&pio->io_lock);
4018 pio->io_error = zio_worst_error(pio->io_error, zio->io_error);
4019 mutex_exit(&pio->io_lock);
4020}
4021
4022/*
4023 * Reflow the beginning portion of the vdev into an intermediate scratch area
4024 * in memory and on disk. This operation must be persisted on disk before we
4025 * proceed to overwrite the beginning portion with the reflowed data.
4026 *
4027 * This multi-step task can fail to complete if disk errors are encountered
 4028 * and, if so, we may return here after a pause (waiting for the disk to
 4029 * become healthy).
4029 */
4030static void
4031raidz_reflow_scratch_sync(void *arg, dmu_tx_t *tx)
4032{
4033 vdev_raidz_expand_t *vre = arg;
4034 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
4035 zio_t *pio;
4036 int error;
4037
4038 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4039 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4040 int ashift = raidvd->vdev_ashift;
4041 uint64_t write_size = P2ALIGN(VDEV_BOOT_SIZE, 1 << ashift);
4042 uint64_t logical_size = write_size * raidvd->vdev_children;
4043 uint64_t read_size =
4044 P2ROUNDUP(DIV_ROUND_UP(logical_size, (raidvd->vdev_children - 1)),
4045 1 << ashift);
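	/*
	 * The scratch region holds write_size bytes on each of the
	 * vdev_children disks (logical_size bytes in total), but that data
	 * currently lives on the vdev_children - 1 original disks, so each
	 * original disk contributes about logical_size / (children - 1)
	 * bytes, rounded up to a full sector; hence read_size >= write_size.
	 * E.g. with 5 children, read_size is roughly 1.25 * write_size.
	 */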
4046
4047 /*
4048 * The scratch space must be large enough to get us to the point
4049 * that one row does not overlap itself when moved. This is checked
4050 * by vdev_raidz_attach_check().
4051 */
4052 VERIFY3U(write_size, >=, raidvd->vdev_children << ashift);
4053 VERIFY3U(write_size, <=, VDEV_BOOT_SIZE);
4054 VERIFY3U(write_size, <=, read_size);
4055
4056 zfs_locked_range_t *lr = zfs_rangelock_enter(&vre->vre_rangelock,
4057 0, logical_size, RL_WRITER);
4058
4059 abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
4060 KM_SLEEP);
4061 for (int i = 0; i < raidvd->vdev_children; i++) {
4062 abds[i] = abd_alloc_linear(read_size, B_FALSE);
4063 }
4064
4065 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_1);
4066
4067 /*
4068 * If we have already written the scratch area then we must read from
4069 * there, since new writes were redirected there while we were paused
4070 * or the original location may have been partially overwritten with
4071 * reflowed data.
4072 */
4073 if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID) {
4074 VERIFY3U(RRSS_GET_OFFSET(&spa->spa_ubsync), ==, logical_size);
4075 /*
4076 * Read from scratch space.
4077 */
4078 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4079 for (int i = 0; i < raidvd->vdev_children; i++) {
4080 /*
4081 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE
 4082			 * to the offset to calculate the physical offset to
 4083			 * read from. Passing in a negative offset makes us
4084 * access the scratch area.
4085 */
4086 zio_nowait(zio_vdev_child_io(pio, NULL,
4087 raidvd->vdev_child[i],
4088 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4089 write_size, ZIO_TYPE_READ, ZIO_PRIORITY_ASYNC_READ,
4090 ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4091 }
4092 error = zio_wait(pio);
4093 if (error != 0) {
4094 zfs_dbgmsg("reflow: error %d reading scratch location",
4095 error);
4096 goto io_error_exit;
4097 }
4098 goto overwrite;
4099 }
4100
4101 /*
4102 * Read from original location.
4103 */
4104 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4105 for (int i = 0; i < raidvd->vdev_children - 1; i++) {
4106 ASSERT0(vdev_is_dead(raidvd->vdev_child[i]));
4107 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4108 0, abds[i], read_size, ZIO_TYPE_READ,
4109 ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
4110 raidz_scratch_child_done, pio));
4111 }
4112 error = zio_wait(pio);
4113 if (error != 0) {
4114 zfs_dbgmsg("reflow: error %d reading original location", error);
4115io_error_exit:
4116 for (int i = 0; i < raidvd->vdev_children; i++)
4117 abd_free(abds[i]);
4118 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4119 zfs_rangelock_exit(lr);
4120 spa_config_exit(spa, SCL_STATE, FTAG);
4121 return;
4122 }
4123
4124 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_2);
4125
4126 /*
4127 * Reflow in memory.
4128 */
4129 uint64_t logical_sectors = logical_size >> ashift;
4130 for (int i = raidvd->vdev_children - 1; i < logical_sectors; i++) {
4131 int oldchild = i % (raidvd->vdev_children - 1);
4132 uint64_t oldoff = (i / (raidvd->vdev_children - 1)) << ashift;
4133
4134 int newchild = i % raidvd->vdev_children;
4135 uint64_t newoff = (i / raidvd->vdev_children) << ashift;
4136
4137 /* a single sector should not be copying over itself */
4138 ASSERT(!(newchild == oldchild && newoff == oldoff));
4139
4140 abd_copy_off(abds[newchild], abds[oldchild],
4141 newoff, oldoff, 1 << ashift);
4142 }
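	/*
	 * Sector i lives at (i % old_width, i / old_width) in the old
	 * layout and moves to (i % new_width, i / new_width) in the new
	 * one. The first new_width - 1 sectors occupy the same place in
	 * both layouts, which is why the loop starts at vdev_children - 1.
	 * E.g. with 5 children (4 before expansion), sector 7 moves from
	 * child 3, row 1 to child 2, row 1.
	 */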
4143
4144 /*
4145 * Verify that we filled in everything we intended to (write_size on
4146 * each child).
4147 */
4148 VERIFY0(logical_sectors % raidvd->vdev_children);
4149 VERIFY3U((logical_sectors / raidvd->vdev_children) << ashift, ==,
4150 write_size);
4151
4152 /*
4153 * Write to scratch location (boot area).
4154 */
4155 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4156 for (int i = 0; i < raidvd->vdev_children; i++) {
4157 /*
4158 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
4159 * the offset to calculate the physical offset to write to.
4160 * Passing in a negative offset lets us access the boot area.
4161 */
4162 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4163 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4164 write_size, ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
4165 ZIO_FLAG_CANFAIL, raidz_scratch_child_done, pio));
4166 }
4167 error = zio_wait(pio);
4168 if (error != 0) {
4169 zfs_dbgmsg("reflow: error %d writing scratch location", error);
4170 goto io_error_exit;
4171 }
4172 pio = zio_root(spa, NULL, NULL, 0);
4173 zio_flush(pio, raidvd);
4174 zio_wait(pio);
4175
4176 zfs_dbgmsg("reflow: wrote %llu bytes (logical) to scratch area",
4177 (long long)logical_size);
4178
4179 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_PRE_SCRATCH_3);
4180
4181 /*
4182 * Update uberblock to indicate that scratch space is valid. This is
4183 * needed because after this point, the real location may be
4184 * overwritten. If we crash, we need to get the data from the
4185 * scratch space, rather than the real location.
4186 *
4187 * Note: ub_timestamp is bumped so that vdev_uberblock_compare()
4188 * will prefer this uberblock.
4189 */
4190 RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_VALID, logical_size);
4191 spa->spa_ubsync.ub_timestamp++;
4192 ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4193 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4194 if (spa_multihost(spa))
4195 mmp_update_uberblock(spa, &spa->spa_ubsync);
4196
4197 zfs_dbgmsg("reflow: uberblock updated "
4198 "(txg %llu, SCRATCH_VALID, size %llu, ts %llu)",
4199 (long long)spa->spa_ubsync.ub_txg,
4200 (long long)logical_size,
4201 (long long)spa->spa_ubsync.ub_timestamp);
4202
4203 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_VALID);
4204
4205 /*
4206 * Overwrite with reflow'ed data.
4207 */
4208overwrite:
4209 pio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
4210 for (int i = 0; i < raidvd->vdev_children; i++) {
4211 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4212 0, abds[i], write_size, ZIO_TYPE_WRITE,
4213 ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL,
4214 raidz_scratch_child_done, pio));
4215 }
4216 error = zio_wait(pio);
4217 if (error != 0) {
4218 /*
4219 * When we exit early here and drop the range lock, new
4220 * writes will go into the scratch area so we'll need to
4221 * read from there when we return after pausing.
4222 */
4223 zfs_dbgmsg("reflow: error %d writing real location", error);
4224 /*
4225 * Update the uberblock that is written when this txg completes.
4226 */
4227 RAIDZ_REFLOW_SET(&spa->spa_uberblock, RRSS_SCRATCH_VALID,
4228 logical_size);
4229 goto io_error_exit;
4230 }
4231 pio = zio_root(spa, NULL, NULL, 0);
4232 zio_flush(pio, raidvd);
4233 zio_wait(pio);
4234
4235 zfs_dbgmsg("reflow: overwrote %llu bytes (logical) to real location",
4236 (long long)logical_size);
4237 for (int i = 0; i < raidvd->vdev_children; i++)
4238 abd_free(abds[i]);
4239 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4240
4241 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_REFLOWED);
4242
4243 /*
4244 * Update uberblock to indicate that the initial part has been
4245 * reflow'ed. This is needed because after this point (when we exit
4246 * the rangelock), we allow regular writes to this region, which will
4247 * be written to the new location only (because reflow_offset_next ==
4248 * reflow_offset_synced). If we crashed and re-copied from the
4249 * scratch space, we would lose the regular writes.
4250 */
4251 RAIDZ_REFLOW_SET(&spa->spa_ubsync, RRSS_SCRATCH_INVALID_SYNCED,
4252 logical_size);
4253 spa->spa_ubsync.ub_timestamp++;
4254 ASSERT0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4255 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4256 if (spa_multihost(spa))
4257 mmp_update_uberblock(spa, &spa->spa_ubsync);
4258
4259 zfs_dbgmsg("reflow: uberblock updated "
4260 "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4261 (long long)spa->spa_ubsync.ub_txg,
4262 (long long)logical_size,
4263 (long long)spa->spa_ubsync.ub_timestamp);
4264
4265 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_1);
4266
4267 /*
4268 * Update progress.
4269 */
4270 vre->vre_offset = logical_size;
4271 zfs_rangelock_exit(lr);
4272 spa_config_exit(spa, SCL_STATE, FTAG);
4273
4274 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4275 vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4276 vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4277 /*
4278 * Note - raidz_reflow_sync() will update the uberblock state to
4279 * RRSS_SCRATCH_INVALID_SYNCED_REFLOW
4280 */
4281 raidz_reflow_sync(spa, tx);
4282
4283 raidz_expand_pause(RAIDZ_EXPAND_PAUSE_SCRATCH_POST_REFLOW_2);
4284}
4285
4286/*
4287 * We crashed in the middle of raidz_reflow_scratch_sync(); complete its work
4288 * here. No other i/o can be in progress, so we don't need the vre_rangelock.
4289 */
4290void
4291vdev_raidz_reflow_copy_scratch(spa_t *spa)
4292{
4293 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4294 uint64_t logical_size = RRSS_GET_OFFSET(&spa->spa_uberblock);
4295 ASSERT3U(RRSS_GET_STATE(&spa->spa_uberblock), ==, RRSS_SCRATCH_VALID);
4296
4297 spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
4298 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4299 ASSERT0(logical_size % raidvd->vdev_children);
4300 uint64_t write_size = logical_size / raidvd->vdev_children;
4301
4302 zio_t *pio;
4303
4304 /*
4305 * Read from scratch space.
4306 */
4307 abd_t **abds = kmem_alloc(raidvd->vdev_children * sizeof (abd_t *),
4308 KM_SLEEP);
4309 for (int i = 0; i < raidvd->vdev_children; i++) {
4310 abds[i] = abd_alloc_linear(write_size, B_FALSE);
4311 }
4312
4313 pio = zio_root(spa, NULL, NULL, 0);
4314 for (int i = 0; i < raidvd->vdev_children; i++) {
4315 /*
4316 * Note: zio_vdev_child_io() adds VDEV_LABEL_START_SIZE to
 4317		 * the offset to calculate the physical offset to read from.
4318 * Passing in a negative offset lets us access the boot area.
4319 */
4320 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4321 VDEV_BOOT_OFFSET - VDEV_LABEL_START_SIZE, abds[i],
4322 write_size, ZIO_TYPE_READ,
4323 ZIO_PRIORITY_ASYNC_READ, 0,
4324 raidz_scratch_child_done, pio));
4325 }
4326 zio_wait(pio);
4327
4328 /*
4329 * Overwrite real location with reflow'ed data.
4330 */
4331 pio = zio_root(spa, NULL, NULL, 0);
4332 for (int i = 0; i < raidvd->vdev_children; i++) {
4333 zio_nowait(zio_vdev_child_io(pio, NULL, raidvd->vdev_child[i],
4334 0, abds[i], write_size, ZIO_TYPE_WRITE,
4335 ZIO_PRIORITY_ASYNC_WRITE, 0,
4336 raidz_scratch_child_done, pio));
4337 }
4338 zio_wait(pio);
4339 pio = zio_root(spa, NULL, NULL, 0);
4340 zio_flush(pio, raidvd);
4341 zio_wait(pio);
4342
4343 zfs_dbgmsg("reflow recovery: overwrote %llu bytes (logical) "
4344 "to real location", (long long)logical_size);
4345
4346 for (int i = 0; i < raidvd->vdev_children; i++)
4347 abd_free(abds[i]);
4348 kmem_free(abds, raidvd->vdev_children * sizeof (abd_t *));
4349
4350 /*
4351 * Update uberblock.
4352 */
4353 RAIDZ_REFLOW_SET(&spa->spa_ubsync,
4354 RRSS_SCRATCH_INVALID_SYNCED_ON_IMPORT, logical_size);
4355 spa->spa_ubsync.ub_timestamp++;
4356 VERIFY0(vdev_uberblock_sync_list(&spa->spa_root_vdev, 1,
4357 &spa->spa_ubsync, ZIO_FLAG_CONFIG_WRITER));
4358 if (spa_multihost(spa))
4359 mmp_update_uberblock(spa, &spa->spa_ubsync);
4360
4361 zfs_dbgmsg("reflow recovery: uberblock updated "
4362 "(txg %llu, SCRATCH_NOT_IN_USE, size %llu, ts %llu)",
4363 (long long)spa->spa_ubsync.ub_txg,
4364 (long long)logical_size,
4365 (long long)spa->spa_ubsync.ub_timestamp);
4366
4367 dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
4368 spa_first_txg(spa));
4369 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
4370 vre->vre_offset = logical_size;
4371 vre->vre_offset_pertxg[txgoff] = vre->vre_offset;
4372 vre->vre_bytes_copied_pertxg[txgoff] = vre->vre_bytes_copied;
4373 /*
4374 * Note that raidz_reflow_sync() will update the uberblock once more
4375 */
4376 raidz_reflow_sync(spa, tx);
4377
4378 dmu_tx_commit(tx);
4379
4380 spa_config_exit(spa, SCL_STATE, FTAG);
4381}
4382
4383static boolean_t
4384spa_raidz_expand_thread_check(void *arg, zthr_t *zthr)
4385{
4386 (void) zthr;
4387 spa_t *spa = arg;
4388
4389 return (spa->spa_raidz_expand != NULL &&
4390 !spa->spa_raidz_expand->vre_waiting_for_resilver);
4391}
4392
4393/*
4394 * RAIDZ expansion background thread
4395 *
4396 * Can be called multiple times if the reflow is paused
4397 */
4398static void
4399spa_raidz_expand_thread(void *arg, zthr_t *zthr)
4400{
4401 spa_t *spa = arg;
4402 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4403
4404 if (RRSS_GET_STATE(&spa->spa_ubsync) == RRSS_SCRATCH_VALID)
4405 vre->vre_offset = 0;
4406 else
4407 vre->vre_offset = RRSS_GET_OFFSET(&spa->spa_ubsync);
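	/*
	 * If the synced uberblock still says the scratch copy is valid, the
	 * initial region has not yet been rewritten in place, so restart
	 * the scratch phase from offset 0; otherwise resume from the last
	 * reflow offset that made it to disk.
	 */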
4408
 4409	/* Reflow the beginning portion using the scratch area */
4410 if (vre->vre_offset == 0) {
4411 VERIFY0(dsl_sync_task(spa_name(spa),
4412 NULL, raidz_reflow_scratch_sync,
4413 vre, 0, ZFS_SPACE_CHECK_NONE));
4414
4415 /* if we encountered errors then pause */
4416 if (vre->vre_offset == 0) {
4417 mutex_enter(&vre->vre_lock);
4418 vre->vre_waiting_for_resilver = B_TRUE;
4419 mutex_exit(&vre->vre_lock);
4420 return;
4421 }
4422 }
4423
4424 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4425 vdev_t *raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4426
4427 uint64_t guid = raidvd->vdev_guid;
4428
4429 /* Iterate over all the remaining metaslabs */
4430 for (uint64_t i = vre->vre_offset >> raidvd->vdev_ms_shift;
4431 i < raidvd->vdev_ms_count &&
4432 !zthr_iscancelled(zthr) &&
4433 vre->vre_failed_offset == UINT64_MAX; i++) {
4434 metaslab_t *msp = raidvd->vdev_ms[i];
4435
4436 metaslab_disable(msp);
4437 mutex_enter(&msp->ms_lock);
4438
4439 /*
4440 * The metaslab may be newly created (for the expanded
4441 * space), in which case its trees won't exist yet,
4442 * so we need to bail out early.
4443 */
4444 if (msp->ms_new) {
4445 mutex_exit(&msp->ms_lock);
4446 metaslab_enable(msp, B_FALSE, B_FALSE);
4447 continue;
4448 }
4449
4450 VERIFY0(metaslab_load(msp));
4451
4452 /*
4453 * We want to copy everything except the free (allocatable)
4454 * space. Note that there may be a little bit more free
4455 * space (e.g. in ms_defer), and it's fine to copy that too.
4456 */
4457 range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64,
4458 NULL, 0, 0);
4459 range_tree_add(rt, msp->ms_start, msp->ms_size);
4460 range_tree_walk(msp->ms_allocatable, range_tree_remove, rt);
4461 mutex_exit(&msp->ms_lock);
4462
4463 /*
4464 * Force the last sector of each metaslab to be copied. This
4465 * ensures that we advance the on-disk progress to the end of
4466 * this metaslab while the metaslab is disabled. Otherwise, we
4467 * could move past this metaslab without advancing the on-disk
4468 * progress, and then an allocation to this metaslab would not
4469 * be copied.
4470 */
4471 int sectorsz = 1 << raidvd->vdev_ashift;
4472 uint64_t ms_last_offset = msp->ms_start +
4473 msp->ms_size - sectorsz;
4474 if (!range_tree_contains(rt, ms_last_offset, sectorsz)) {
4475 range_tree_add(rt, ms_last_offset, sectorsz);
4476 }
4477
4478 /*
4479 * When we are resuming from a paused expansion (i.e.
 4480		 * when importing a pool with an expansion in progress),
4481 * discard any state that we have already processed.
4482 */
4483 range_tree_clear(rt, 0, vre->vre_offset);
4484
4485 while (!zthr_iscancelled(zthr) &&
4486 !range_tree_is_empty(rt) &&
4487 vre->vre_failed_offset == UINT64_MAX) {
4488
4489 /*
4490 * We need to periodically drop the config lock so that
4491 * writers can get in. Additionally, we can't wait
4492 * for a txg to sync while holding a config lock
4493 * (since a waiting writer could cause a 3-way deadlock
4494 * with the sync thread, which also gets a config
4495 * lock for reader). So we can't hold the config lock
4496 * while calling dmu_tx_assign().
4497 */
4498 spa_config_exit(spa, SCL_CONFIG, FTAG);
4499
4500 /*
4501 * If requested, pause the reflow when the amount
4502 * specified by raidz_expand_max_reflow_bytes is reached
4503 *
4504 * This pause is only used during testing or debugging.
4505 */
4506 while (raidz_expand_max_reflow_bytes != 0 &&
4507 raidz_expand_max_reflow_bytes <=
4508 vre->vre_bytes_copied && !zthr_iscancelled(zthr)) {
4509 delay(hz);
4510 }
4511
4512 mutex_enter(&vre->vre_lock);
4513 while (vre->vre_outstanding_bytes >
4514 raidz_expand_max_copy_bytes) {
4515 cv_wait(&vre->vre_cv, &vre->vre_lock);
4516 }
4517 mutex_exit(&vre->vre_lock);
4518
4519 dmu_tx_t *tx =
4520 dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
4521
4522 VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
4523 uint64_t txg = dmu_tx_get_txg(tx);
4524
4525 /*
4526 * Reacquire the vdev_config lock. Theoretically, the
4527 * vdev_t that we're expanding may have changed.
4528 */
4529 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4530 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4531
4532 boolean_t needsync =
4533 raidz_reflow_impl(raidvd, vre, rt, tx);
4534
4535 dmu_tx_commit(tx);
4536
4537 if (needsync) {
4538 spa_config_exit(spa, SCL_CONFIG, FTAG);
4539 txg_wait_synced(spa->spa_dsl_pool, txg);
4540 spa_config_enter(spa, SCL_CONFIG, FTAG,
4541 RW_READER);
4542 }
4543 }
4544
4545 spa_config_exit(spa, SCL_CONFIG, FTAG);
4546
4547 metaslab_enable(msp, B_FALSE, B_FALSE);
4548 range_tree_vacate(rt, NULL, NULL);
4549 range_tree_destroy(rt);
4550
4551 spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4552 raidvd = vdev_lookup_top(spa, vre->vre_vdev_id);
4553 }
4554
4555 spa_config_exit(spa, SCL_CONFIG, FTAG);
4556
4557 /*
4558 * The txg_wait_synced() here ensures that all reflow zio's have
4559 * completed, and vre_failed_offset has been set if necessary. It
4560 * also ensures that the progress of the last raidz_reflow_sync() is
4561 * written to disk before raidz_reflow_complete_sync() changes the
4562 * in-memory vre_state. vdev_raidz_io_start() uses vre_state to
4563 * determine if a reflow is in progress, in which case we may need to
4564 * write to both old and new locations. Therefore we can only change
4565 * vre_state once this is not necessary, which is once the on-disk
4566 * progress (in spa_ubsync) has been set past any possible writes (to
4567 * the end of the last metaslab).
4568 */
4569 txg_wait_synced(spa->spa_dsl_pool, 0);
4570
4571 if (!zthr_iscancelled(zthr) &&
4572 vre->vre_offset == raidvd->vdev_ms_count << raidvd->vdev_ms_shift) {
4573 /*
4574 * We are not being canceled or paused, so the reflow must be
4575 * complete. In that case also mark it as completed on disk.
4576 */
4577 ASSERT3U(vre->vre_failed_offset, ==, UINT64_MAX);
4578 VERIFY0(dsl_sync_task(spa_name(spa), NULL,
4579 raidz_reflow_complete_sync, spa,
4580 0, ZFS_SPACE_CHECK_NONE));
4581 (void) vdev_online(spa, guid, ZFS_ONLINE_EXPAND, NULL);
4582 } else {
4583 /*
4584 * Wait for all copy zio's to complete and for all the
4585 * raidz_reflow_sync() synctasks to be run.
4586 */
4587 spa_history_log_internal(spa, "reflow pause",
4588 NULL, "offset=%llu failed_offset=%lld",
4589 (long long)vre->vre_offset,
4590 (long long)vre->vre_failed_offset);
4591 mutex_enter(&vre->vre_lock);
4592 if (vre->vre_failed_offset != UINT64_MAX) {
4593 /*
4594 * Reset progress so that we will retry everything
4595 * after the point that something failed.
4596 */
4597 vre->vre_offset = vre->vre_failed_offset;
4598 vre->vre_failed_offset = UINT64_MAX;
4599 vre->vre_waiting_for_resilver = B_TRUE;
4600 }
4601 mutex_exit(&vre->vre_lock);
4602 }
4603}
4604
4605void
4606spa_start_raidz_expansion_thread(spa_t *spa)
4607{
4608 ASSERT3P(spa->spa_raidz_expand_zthr, ==, NULL);
4609 spa->spa_raidz_expand_zthr = zthr_create("raidz_expand",
4610 spa_raidz_expand_thread_check, spa_raidz_expand_thread,
4611 spa, defclsyspri);
4612}
4613
4614void
4615raidz_dtl_reassessed(vdev_t *vd)
4616{
4617 spa_t *spa = vd->vdev_spa;
4618 if (spa->spa_raidz_expand != NULL) {
4619 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4620 /*
 4621		 * We get called often from vdev_dtl_reassess(), so make
 4622		 * sure it's our vdev and that any replacing vdev is complete.
4623 */
4624 if (vd->vdev_top->vdev_id == vre->vre_vdev_id &&
4625 !vdev_raidz_expand_child_replacing(vd->vdev_top)) {
4626 mutex_enter(&vre->vre_lock);
4627 if (vre->vre_waiting_for_resilver) {
4628 vdev_dbgmsg(vd, "DTL reassessed, "
4629 "continuing raidz expansion");
4630 vre->vre_waiting_for_resilver = B_FALSE;
4631 zthr_wakeup(spa->spa_raidz_expand_zthr);
4632 }
4633 mutex_exit(&vre->vre_lock);
4634 }
4635 }
4636}
4637
4638int
4639vdev_raidz_attach_check(vdev_t *new_child)
4640{
4641 vdev_t *raidvd = new_child->vdev_parent;
4642 uint64_t new_children = raidvd->vdev_children;
4643
4644 /*
4645 * We use the "boot" space as scratch space to handle overwriting the
4646 * initial part of the vdev. If it is too small, then this expansion
4647 * is not allowed. This would be very unusual (e.g. ashift > 13 and
4648 * >200 children).
4649 */
4650 if (new_children << raidvd->vdev_ashift > VDEV_BOOT_SIZE) {
4651 return (EINVAL);
4652 }
4653 return (0);
4654}
4655
4656void
4657vdev_raidz_attach_sync(void *arg, dmu_tx_t *tx)
4658{
4659 vdev_t *new_child = arg;
4660 spa_t *spa = new_child->vdev_spa;
4661 vdev_t *raidvd = new_child->vdev_parent;
4662 vdev_raidz_t *vdrz = raidvd->vdev_tsd;
4663 ASSERT3P(raidvd->vdev_ops, ==, &vdev_raidz_ops);
4664 ASSERT3P(raidvd->vdev_top, ==, raidvd);
4665 ASSERT3U(raidvd->vdev_children, >, vdrz->vd_original_width);
4666 ASSERT3U(raidvd->vdev_children, ==, vdrz->vd_physical_width + 1);
4667 ASSERT3P(raidvd->vdev_child[raidvd->vdev_children - 1], ==,
4668 new_child);
4669
4670 spa_feature_incr(spa, SPA_FEATURE_RAIDZ_EXPANSION, tx);
4671
4672 vdrz->vd_physical_width++;
4673
4674 VERIFY0(spa->spa_uberblock.ub_raidz_reflow_info);
4675 vdrz->vn_vre.vre_vdev_id = raidvd->vdev_id;
4676 vdrz->vn_vre.vre_offset = 0;
4677 vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
4678 spa->spa_raidz_expand = &vdrz->vn_vre;
4679 zthr_wakeup(spa->spa_raidz_expand_zthr);
4680
4681 /*
4682 * Dirty the config so that ZPOOL_CONFIG_RAIDZ_EXPANDING will get
4683 * written to the config.
4684 */
4685 vdev_config_dirty(raidvd);
4686
4687 vdrz->vn_vre.vre_start_time = gethrestime_sec();
4688 vdrz->vn_vre.vre_end_time = 0;
4689 vdrz->vn_vre.vre_state = DSS_SCANNING;
4690 vdrz->vn_vre.vre_bytes_copied = 0;
4691
4692 uint64_t state = vdrz->vn_vre.vre_state;
4693 VERIFY0(zap_update(spa->spa_meta_objset,
4694 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
4695 sizeof (state), 1, &state, tx));
4696
4697 uint64_t start_time = vdrz->vn_vre.vre_start_time;
4698 VERIFY0(zap_update(spa->spa_meta_objset,
4699 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
4700 sizeof (start_time), 1, &start_time, tx));
4701
4702 (void) zap_remove(spa->spa_meta_objset,
4703 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME, tx);
4704 (void) zap_remove(spa->spa_meta_objset,
4705 raidvd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED, tx);
4706
4707 spa_history_log_internal(spa, "raidz vdev expansion started", tx,
4708 "%s vdev %llu new width %llu", spa_name(spa),
4709 (unsigned long long)raidvd->vdev_id,
4710 (unsigned long long)raidvd->vdev_children);
4711}
4712
4713int
4714vdev_raidz_load(vdev_t *vd)
4715{
4716 vdev_raidz_t *vdrz = vd->vdev_tsd;
4717 int err;
4718
4719 uint64_t state = DSS_NONE;
4720 uint64_t start_time = 0;
4721 uint64_t end_time = 0;
4722 uint64_t bytes_copied = 0;
4723
4724 if (vd->vdev_top_zap != 0) {
4725 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4726 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_STATE,
4727 sizeof (state), 1, &state);
4728 if (err != 0 && err != ENOENT)
4729 return (err);
4730
4731 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4732 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_START_TIME,
4733 sizeof (start_time), 1, &start_time);
4734 if (err != 0 && err != ENOENT)
4735 return (err);
4736
4737 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4738 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_END_TIME,
4739 sizeof (end_time), 1, &end_time);
4740 if (err != 0 && err != ENOENT)
4741 return (err);
4742
4743 err = zap_lookup(vd->vdev_spa->spa_meta_objset,
4744 vd->vdev_top_zap, VDEV_TOP_ZAP_RAIDZ_EXPAND_BYTES_COPIED,
4745 sizeof (bytes_copied), 1, &bytes_copied);
4746 if (err != 0 && err != ENOENT)
4747 return (err);
4748 }
4749
4750 /*
4751 * If we are in the middle of expansion, vre_state should have
4752 * already been set by vdev_raidz_init().
4753 */
4754 EQUIV(vdrz->vn_vre.vre_state == DSS_SCANNING, state == DSS_SCANNING);
4755 vdrz->vn_vre.vre_state = (dsl_scan_state_t)state;
4756 vdrz->vn_vre.vre_start_time = start_time;
4757 vdrz->vn_vre.vre_end_time = end_time;
4758 vdrz->vn_vre.vre_bytes_copied = bytes_copied;
4759
4760 return (0);
4761}
4762
4763int
4764spa_raidz_expand_get_stats(spa_t *spa, pool_raidz_expand_stat_t *pres)
4765{
4766 vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
4767
4768 if (vre == NULL) {
 4769		/* no expansion in progress; find most recent completed */
4770 for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) {
4771 vdev_t *vd = spa->spa_root_vdev->vdev_child[c];
4772 if (vd->vdev_ops == &vdev_raidz_ops) {
4773 vdev_raidz_t *vdrz = vd->vdev_tsd;
4774
4775 if (vdrz->vn_vre.vre_end_time != 0 &&
4776 (vre == NULL ||
4777 vdrz->vn_vre.vre_end_time >
4778 vre->vre_end_time)) {
4779 vre = &vdrz->vn_vre;
4780 }
4781 }
4782 }
4783 }
4784
4785 if (vre == NULL) {
4786 return (SET_ERROR(ENOENT));
4787 }
4788
4789 pres->pres_state = vre->vre_state;
4790 pres->pres_expanding_vdev = vre->vre_vdev_id;
4791
4792 vdev_t *vd = vdev_lookup_top(spa, vre->vre_vdev_id);
4793 pres->pres_to_reflow = vd->vdev_stat.vs_alloc;
4794
4795 mutex_enter(&vre->vre_lock);
4796 pres->pres_reflowed = vre->vre_bytes_copied;
4797 for (int i = 0; i < TXG_SIZE; i++)
4798 pres->pres_reflowed += vre->vre_bytes_copied_pertxg[i];
4799 mutex_exit(&vre->vre_lock);
4800
4801 pres->pres_start_time = vre->vre_start_time;
4802 pres->pres_end_time = vre->vre_end_time;
4803 pres->pres_waiting_for_resilver = vre->vre_waiting_for_resilver;
4804
4805 return (0);
4806}
4807
4808/*
4809 * Initialize private RAIDZ specific fields from the nvlist.
4810 */
4811static int
4812vdev_raidz_init(spa_t *spa, nvlist_t *nv, void **tsd)
4813{
4814 uint_t children;
4815 nvlist_t **child;
4816 int error = nvlist_lookup_nvlist_array(nv,
4817 ZPOOL_CONFIG_CHILDREN, &child, &children);
4818 if (error != 0)
4819 return (SET_ERROR(EINVAL));
4820
4821 uint64_t nparity;
4822 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) {
4823 if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY)
4824 return (SET_ERROR(EINVAL));
4825
4826 /*
4827 * Previous versions could only support 1 or 2 parity
 4828		 * devices.
4829 */
4830 if (nparity > 1 && spa_version(spa) < SPA_VERSION_RAIDZ2)
4831 return (SET_ERROR(EINVAL));
4832 else if (nparity > 2 && spa_version(spa) < SPA_VERSION_RAIDZ3)
4833 return (SET_ERROR(EINVAL));
4834 } else {
4835 /*
4836 * We require the parity to be specified for SPAs that
4837 * support multiple parity levels.
4838 */
4839 if (spa_version(spa) >= SPA_VERSION_RAIDZ2)
4840 return (SET_ERROR(EINVAL));
4841
4842 /*
4843 * Otherwise, we default to 1 parity device for RAID-Z.
4844 */
4845 nparity = 1;
4846 }
4847
4848 vdev_raidz_t *vdrz = kmem_zalloc(sizeof (*vdrz), KM_SLEEP);
4849 vdrz->vn_vre.vre_vdev_id = -1;
4850 vdrz->vn_vre.vre_offset = UINT64_MAX;
4851 vdrz->vn_vre.vre_failed_offset = UINT64_MAX;
4852 mutex_init(&vdrz->vn_vre.vre_lock, NULL, MUTEX_DEFAULT, NULL);
4853 cv_init(&vdrz->vn_vre.vre_cv, NULL, CV_DEFAULT, NULL);
4854 zfs_rangelock_init(&vdrz->vn_vre.vre_rangelock, NULL, NULL);
4855 mutex_init(&vdrz->vd_expand_lock, NULL, MUTEX_DEFAULT, NULL);
4856 avl_create(&vdrz->vd_expand_txgs, vdev_raidz_reflow_compare,
4857 sizeof (reflow_node_t), offsetof(reflow_node_t, re_link));
4858
4859 vdrz->vd_physical_width = children;
4860 vdrz->vd_nparity = nparity;
4861
4862 /* note, the ID does not exist when creating a pool */
4863 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
4864 &vdrz->vn_vre.vre_vdev_id);
4865
4866 boolean_t reflow_in_progress =
4867 nvlist_exists(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
4868 if (reflow_in_progress) {
4869 spa->spa_raidz_expand = &vdrz->vn_vre;
4870 vdrz->vn_vre.vre_state = DSS_SCANNING;
4871 }
4872
4873 vdrz->vd_original_width = children;
4874 uint64_t *txgs;
4875 unsigned int txgs_size = 0;
4876 error = nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
4877 &txgs, &txgs_size);
4878 if (error == 0) {
4879 for (int i = 0; i < txgs_size; i++) {
4880 reflow_node_t *re = kmem_zalloc(sizeof (*re), KM_SLEEP);
4881 re->re_txg = txgs[txgs_size - i - 1];
4882 re->re_logical_width = vdrz->vd_physical_width - i;
4883
4884 if (reflow_in_progress)
4885 re->re_logical_width--;
4886
4887 avl_add(&vdrz->vd_expand_txgs, re);
4888 }
4889
4890 vdrz->vd_original_width = vdrz->vd_physical_width - txgs_size;
4891 }
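	/*
	 * For example, a raidz vdev that is currently 6 wide with two
	 * completed expansions on record (txgs_size == 2) gets
	 * re_logical_width entries of 6 and 5 and vd_original_width = 4;
	 * if an expansion is still in flight, each of those values is one
	 * less.
	 */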
4892 if (reflow_in_progress) {
4893 vdrz->vd_original_width--;
4894 zfs_dbgmsg("reflow_in_progress, %u wide, %d prior expansions",
4895 children, txgs_size);
4896 }
4897
4898 *tsd = vdrz;
4899
4900 return (0);
4901}
4902
4903static void
4904vdev_raidz_fini(vdev_t *vd)
4905{
4906 vdev_raidz_t *vdrz = vd->vdev_tsd;
4907 if (vd->vdev_spa->spa_raidz_expand == &vdrz->vn_vre)
4908 vd->vdev_spa->spa_raidz_expand = NULL;
4909 reflow_node_t *re;
4910 void *cookie = NULL;
4911 avl_tree_t *tree = &vdrz->vd_expand_txgs;
4912 while ((re = avl_destroy_nodes(tree, &cookie)) != NULL)
4913 kmem_free(re, sizeof (*re));
4914 avl_destroy(&vdrz->vd_expand_txgs);
4915 mutex_destroy(&vdrz->vd_expand_lock);
4916 mutex_destroy(&vdrz->vn_vre.vre_lock);
4917 cv_destroy(&vdrz->vn_vre.vre_cv);
4918 zfs_rangelock_fini(&vdrz->vn_vre.vre_rangelock);
4919 kmem_free(vdrz, sizeof (*vdrz));
4920}
4921
4922/*
4923 * Add RAIDZ specific fields to the config nvlist.
4924 */
4925static void
4926vdev_raidz_config_generate(vdev_t *vd, nvlist_t *nv)
4927{
4928 ASSERT3P(vd->vdev_ops, ==, &vdev_raidz_ops);
4929 vdev_raidz_t *vdrz = vd->vdev_tsd;
4930
4931 /*
4932 * Make sure someone hasn't managed to sneak a fancy new vdev
4933 * into a crufty old storage pool.
4934 */
4935 ASSERT(vdrz->vd_nparity == 1 ||
4936 (vdrz->vd_nparity <= 2 &&
4937 spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ2) ||
4938 (vdrz->vd_nparity <= 3 &&
4939 spa_version(vd->vdev_spa) >= SPA_VERSION_RAIDZ3));
4940
4941 /*
4942 * Note that we'll add these even on storage pools where they
4943 * aren't strictly required -- older software will just ignore
4944 * it.
4945 */
4946 fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vdrz->vd_nparity);
4947
4948 if (vdrz->vn_vre.vre_state == DSS_SCANNING) {
4949 fnvlist_add_boolean(nv, ZPOOL_CONFIG_RAIDZ_EXPANDING);
4950 }
4951
4952 mutex_enter(&vdrz->vd_expand_lock);
4953 if (!avl_is_empty(&vdrz->vd_expand_txgs)) {
4954 uint64_t count = avl_numnodes(&vdrz->vd_expand_txgs);
4955 uint64_t *txgs = kmem_alloc(sizeof (uint64_t) * count,
4956 KM_SLEEP);
4957 uint64_t i = 0;
4958
4959 for (reflow_node_t *re = avl_first(&vdrz->vd_expand_txgs);
4960 re != NULL; re = AVL_NEXT(&vdrz->vd_expand_txgs, re)) {
4961 txgs[i++] = re->re_txg;
4962 }
4963
4964 fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_RAIDZ_EXPAND_TXGS,
4965 txgs, count);
4966
4967 kmem_free(txgs, sizeof (uint64_t) * count);
4968 }
4969 mutex_exit(&vdrz->vd_expand_lock);
4970}
4971
4972static uint64_t
4973vdev_raidz_nparity(vdev_t *vd)
4974{
4975 vdev_raidz_t *vdrz = vd->vdev_tsd;
4976 return (vdrz->vd_nparity);
4977}
4978
4979static uint64_t
4980vdev_raidz_ndisks(vdev_t *vd)
4981{
4982 return (vd->vdev_children);
4983}
4984
4985vdev_ops_t vdev_raidz_ops = {
b2255edc
BB
4986 .vdev_op_init = vdev_raidz_init,
4987 .vdev_op_fini = vdev_raidz_fini,
a64f8276
I
4988 .vdev_op_open = vdev_raidz_open,
4989 .vdev_op_close = vdev_raidz_close,
4990 .vdev_op_asize = vdev_raidz_asize,
b2255edc
BB
4991 .vdev_op_min_asize = vdev_raidz_min_asize,
4992 .vdev_op_min_alloc = NULL,
a64f8276
I
4993 .vdev_op_io_start = vdev_raidz_io_start,
4994 .vdev_op_io_done = vdev_raidz_io_done,
4995 .vdev_op_state_change = vdev_raidz_state_change,
4996 .vdev_op_need_resilver = vdev_raidz_need_resilver,
4997 .vdev_op_hold = NULL,
4998 .vdev_op_rele = NULL,
4999 .vdev_op_remap = NULL,
5000 .vdev_op_xlate = vdev_raidz_xlate,
b2255edc
BB
5001 .vdev_op_rebuild_asize = NULL,
5002 .vdev_op_metaslab_init = NULL,
5003 .vdev_op_config_generate = vdev_raidz_config_generate,
5004 .vdev_op_nparity = vdev_raidz_nparity,
5005 .vdev_op_ndisks = vdev_raidz_ndisks,
a64f8276
I
5006 .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */
5007 .vdev_op_leaf = B_FALSE /* not a leaf vdev */
34dc7c2f 5008};
5caeef02
DB
5009
5010/* BEGIN CSTYLED */
5011ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_reflow_bytes, ULONG, ZMOD_RW,
5012 "For testing, pause RAIDZ expansion after reflowing this many bytes");
5013ZFS_MODULE_PARAM(zfs_vdev, raidz_, expand_max_copy_bytes, ULONG, ZMOD_RW,
5014 "Max amount of concurrent i/o for RAIDZ expansion");
5015ZFS_MODULE_PARAM(zfs_vdev, raidz_, io_aggregate_rows, ULONG, ZMOD_RW,
5016 "For expanded RAIDZ, aggregate reads that have more rows than this");
5017ZFS_MODULE_PARAM(zfs, zfs_, scrub_after_expand, INT, ZMOD_RW,
5018 "For expanded RAIDZ, automatically start a pool scrub when expansion "
5019 "completes");
5020/* END CSTYLED */