]>
Commit | Line | Data |
---|---|---|
9a49d3f3 BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
22 | * | |
23 | * Copyright (c) 2018, Intel Corporation. | |
24 | * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. | |
25 | */ | |
26 | ||
27 | #include <sys/vdev_impl.h> | |
b2255edc | 28 | #include <sys/vdev_draid.h> |
9a49d3f3 BB |
29 | #include <sys/dsl_scan.h> |
30 | #include <sys/spa_impl.h> | |
31 | #include <sys/metaslab_impl.h> | |
32 | #include <sys/vdev_rebuild.h> | |
33 | #include <sys/zio.h> | |
34 | #include <sys/dmu_tx.h> | |
35 | #include <sys/arc.h> | |
36 | #include <sys/zap.h> | |
37 | ||
38 | /* | |
39 | * This file contains the sequential reconstruction implementation for | |
40 | * resilvering. This form of resilvering is internally referred to as device | |
41 | * rebuild to avoid conflating it with the traditional healing reconstruction | |
42 | * performed by the dsl scan code. | |
43 | * | |
44 | * When replacing a device, or scrubbing the pool, ZFS has historically used | |
45 | * a process called resilvering which is a form of healing reconstruction. | |
46 | * This approach has the advantage that as blocks are read from disk their | |
47 | * checksums can be immediately verified and the data repaired. Unfortunately, | |
48 | * it also results in a random IO pattern to the disk even when extra care | |
49 | * is taken to sequentialize the IO as much as possible. This substantially | |
50 | * increases the time required to resilver the pool and restore redundancy. | |
51 | * | |
52 | * For mirrored devices it's possible to implement an alternate sequential | |
53 | * reconstruction strategy when resilvering. Sequential reconstruction | |
54 | * behaves like a traditional RAID rebuild and reconstructs a device in LBA | |
55 | * order without verifying the checksum. After this phase completes a second | |
56 | * scrub phase is started to verify all of the checksums. This two phase | |
57 | * process will take longer than the healing reconstruction described above. | |
58 | * However, it has the advantage that after the first reconstruction phase | |
59 | * completes redundancy has been restored. At this point the pool can incur | |
60 | * another device failure without risking data loss. | |
61 | * | |
62 | * There are a few noteworthy limitations and other advantages of resilvering | |
63 | * using sequential reconstruction vs healing reconstruction. | |
64 | * | |
65 | * Limitations: | |
66 | * | |
b2255edc BB |
67 | * - Sequential reconstruction is not possible on RAIDZ due to its |
68 | * variable stripe width. Note dRAID uses a fixed stripe width which | |
69 | * avoids this issue, but comes at the expense of some usable capacity. | |
9a49d3f3 | 70 | * |
b2255edc | 71 | * - Block checksums are not verified during sequential reconstruction. |
9a49d3f3 BB |
72 | * Similar to traditional RAID the parity/mirror data is reconstructed |
73 | * but cannot be immediately double checked. For this reason when the | |
b2255edc BB |
74 | * last active resilver completes the pool is automatically scrubbed |
75 | * by default. | |
9a49d3f3 BB |
76 | * |
77 | * - Deferred resilvers using sequential reconstruction are not currently | |
78 | * supported. When adding another vdev to an active top-level resilver | |
79 | * it must be restarted. | |
80 | * | |
81 | * Advantages: | |
82 | * | |
b2255edc BB |
83 | * - Sequential reconstruction is performed in LBA order which may be faster |
84 | * than healing reconstruction particularly when using HDDs (or |
9a49d3f3 BB |
85 | * especially with SMR devices). Only allocated capacity is resilvered. |
86 | * | |
87 | * - Sequential reconstruction is not constrained by ZFS block boundaries. | |
88 | * This allows it to issue larger IOs to disk which span multiple blocks | |
89 | * allowing all of these logical blocks to be repaired with a single IO. | |
90 | * | |
91 | * - Unlike a healing resilver or scrub which are pool wide operations, | |
b2255edc BB |
92 | * sequential reconstruction is handled by the top-level vdevs. This |
93 | * allows for it to be started or canceled on a top-level vdev without | |
94 | * impacting any other top-level vdevs in the pool. | |
9a49d3f3 BB |
95 | * |
96 | * - Data only referenced by a pool checkpoint will be repaired because | |
97 | * that space is reflected in the space maps. This differs from a | |
98 | * healing resilver or scrub which will not repair that data. | |
99 | */ | |
100 | ||
101 | ||
/*
 * Target size of each rebuild read.  The default is 1MiB per data disk
 * and the effective value is capped at SPA_MAXBLOCKSIZE.
 */
unsigned long zfs_rebuild_max_segment = 1024UL * 1024UL;

/*
 * Upper bound, per leaf vdev, on the number of bytes a sequential resilver
 * may have in flight at once.  The goal is to keep the vdev queues busy at
 * all times without overfilling them, since overly deep queues add latency
 * and in turn lengthen txg sync times.
 *
 * The default can safely be large because the default target segment size
 * is large as well (zfs_rebuild_max_segment=1M), which keeps the queue
 * depth short.
 *
 * 32MB was chosen after testing with a large 90-drive dRAID HDD
 * configuration (draid2:8d:90c:2s).  Smaller values could not saturate all
 * of the drives; at 32MB the sequential resilver write rate was measured
 * at a sustained 800MB/s while rebuilding to a distributed spare.
 */
unsigned long zfs_rebuild_vdev_limit = 32UL << 20;

/*
 * When non-zero, a pool scrub is started automatically once the last
 * active sequential resilver completes so that the checksums of all
 * resilvered blocks get verified.  Enabled by default; leaving it enabled
 * is strongly recommended.
 */
int zfs_rebuild_scrub_enabled = 1;
9a49d3f3 BB |
132 | |
133 | /* | |
134 | * For vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync(). | |
135 | */ | |
136 | static void vdev_rebuild_thread(void *arg); | |
137 | ||
138 | /* | |
139 | * Clear the per-vdev rebuild bytes value for a vdev tree. | |
140 | */ | |
141 | static void | |
142 | clear_rebuild_bytes(vdev_t *vd) | |
143 | { | |
144 | vdev_stat_t *vs = &vd->vdev_stat; | |
145 | ||
146 | for (uint64_t i = 0; i < vd->vdev_children; i++) | |
147 | clear_rebuild_bytes(vd->vdev_child[i]); | |
148 | ||
149 | mutex_enter(&vd->vdev_stat_lock); | |
150 | vs->vs_rebuild_processed = 0; | |
151 | mutex_exit(&vd->vdev_stat_lock); | |
152 | } | |
153 | ||
154 | /* | |
155 | * Determines whether a vdev_rebuild_thread() should be stopped. | |
156 | */ | |
157 | static boolean_t | |
158 | vdev_rebuild_should_stop(vdev_t *vd) | |
159 | { | |
160 | return (!vdev_writeable(vd) || vd->vdev_removing || | |
161 | vd->vdev_rebuild_exit_wanted || | |
162 | vd->vdev_rebuild_cancel_wanted || | |
163 | vd->vdev_rebuild_reset_wanted); | |
164 | } | |
165 | ||
166 | /* | |
167 | * Determine if the rebuild should be canceled. This may happen when all | |
168 | * vdevs with MISSING DTLs are detached. | |
169 | */ | |
170 | static boolean_t | |
171 | vdev_rebuild_should_cancel(vdev_t *vd) | |
172 | { | |
173 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
174 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
175 | ||
176 | if (!vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)) | |
177 | return (B_TRUE); | |
178 | ||
179 | return (B_FALSE); | |
180 | } | |
181 | ||
182 | /* | |
183 | * The sync task for updating the on-disk state of a rebuild. This is | |
184 | * scheduled by vdev_rebuild_range(). | |
185 | */ | |
186 | static void | |
187 | vdev_rebuild_update_sync(void *arg, dmu_tx_t *tx) | |
188 | { | |
189 | int vdev_id = (uintptr_t)arg; | |
190 | spa_t *spa = dmu_tx_pool(tx)->dp_spa; | |
191 | vdev_t *vd = vdev_lookup_top(spa, vdev_id); | |
192 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
193 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
194 | uint64_t txg = dmu_tx_get_txg(tx); | |
195 | ||
196 | mutex_enter(&vd->vdev_rebuild_lock); | |
197 | ||
198 | if (vr->vr_scan_offset[txg & TXG_MASK] > 0) { | |
199 | vrp->vrp_last_offset = vr->vr_scan_offset[txg & TXG_MASK]; | |
200 | vr->vr_scan_offset[txg & TXG_MASK] = 0; | |
201 | } | |
202 | ||
203 | vrp->vrp_scan_time_ms = vr->vr_prev_scan_time_ms + | |
204 | NSEC2MSEC(gethrtime() - vr->vr_pass_start_time); | |
205 | ||
206 | VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, | |
207 | VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), | |
208 | REBUILD_PHYS_ENTRIES, vrp, tx)); | |
209 | ||
210 | mutex_exit(&vd->vdev_rebuild_lock); | |
211 | } | |
212 | ||
213 | /* | |
214 | * Initialize the on-disk state for a new rebuild, start the rebuild thread. | |
215 | */ | |
216 | static void | |
217 | vdev_rebuild_initiate_sync(void *arg, dmu_tx_t *tx) | |
218 | { | |
219 | int vdev_id = (uintptr_t)arg; | |
220 | spa_t *spa = dmu_tx_pool(tx)->dp_spa; | |
221 | vdev_t *vd = vdev_lookup_top(spa, vdev_id); | |
222 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
223 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
224 | ||
225 | ASSERT(vd->vdev_rebuilding); | |
226 | ||
227 | spa_feature_incr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); | |
228 | ||
229 | mutex_enter(&vd->vdev_rebuild_lock); | |
230 | bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); | |
231 | vrp->vrp_rebuild_state = VDEV_REBUILD_ACTIVE; | |
232 | vrp->vrp_min_txg = 0; | |
233 | vrp->vrp_max_txg = dmu_tx_get_txg(tx); | |
234 | vrp->vrp_start_time = gethrestime_sec(); | |
235 | vrp->vrp_scan_time_ms = 0; | |
236 | vr->vr_prev_scan_time_ms = 0; | |
237 | ||
238 | /* | |
239 | * Rebuilds are currently only used when replacing a device, in which | |
240 | * case there must be DTL_MISSING entries. In the future, we could | |
241 | * allow rebuilds to be used in a way similar to a scrub. This would | |
242 | * be useful because it would allow us to rebuild the space used by | |
243 | * pool checkpoints. | |
244 | */ | |
245 | VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)); | |
246 | ||
247 | VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, | |
248 | VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), | |
249 | REBUILD_PHYS_ENTRIES, vrp, tx)); | |
250 | ||
251 | spa_history_log_internal(spa, "rebuild", tx, | |
252 | "vdev_id=%llu vdev_guid=%llu started", | |
253 | (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); | |
254 | ||
255 | ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); | |
256 | vd->vdev_rebuild_thread = thread_create(NULL, 0, | |
257 | vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri); | |
258 | ||
259 | mutex_exit(&vd->vdev_rebuild_lock); | |
260 | } | |
261 | ||
262 | static void | |
263 | vdev_rebuild_log_notify(spa_t *spa, vdev_t *vd, char *name) | |
264 | { | |
265 | nvlist_t *aux = fnvlist_alloc(); | |
266 | ||
267 | fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, "sequential"); | |
268 | spa_event_notify(spa, vd, aux, name); | |
269 | nvlist_free(aux); | |
270 | } | |
271 | ||
272 | /* | |
273 | * Called to request that a new rebuild be started. The feature will remain | |
274 | * active for the duration of the rebuild, then revert to the enabled state. | |
275 | */ | |
276 | static void | |
277 | vdev_rebuild_initiate(vdev_t *vd) | |
278 | { | |
279 | spa_t *spa = vd->vdev_spa; | |
280 | ||
281 | ASSERT(vd->vdev_top == vd); | |
282 | ASSERT(MUTEX_HELD(&vd->vdev_rebuild_lock)); | |
283 | ASSERT(!vd->vdev_rebuilding); | |
284 | ||
285 | dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); | |
286 | VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); | |
287 | ||
288 | vd->vdev_rebuilding = B_TRUE; | |
289 | ||
290 | dsl_sync_task_nowait(spa_get_dsl(spa), vdev_rebuild_initiate_sync, | |
38080324 | 291 | (void *)(uintptr_t)vd->vdev_id, tx); |
9a49d3f3 BB |
292 | dmu_tx_commit(tx); |
293 | ||
294 | vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_START); | |
295 | } | |
296 | ||
297 | /* | |
298 | * Update the on-disk state to completed when a rebuild finishes. | |
299 | */ | |
300 | static void | |
301 | vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx) | |
302 | { | |
303 | int vdev_id = (uintptr_t)arg; | |
304 | spa_t *spa = dmu_tx_pool(tx)->dp_spa; | |
305 | vdev_t *vd = vdev_lookup_top(spa, vdev_id); | |
306 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
307 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
308 | ||
309 | mutex_enter(&vd->vdev_rebuild_lock); | |
310 | vrp->vrp_rebuild_state = VDEV_REBUILD_COMPLETE; | |
311 | vrp->vrp_end_time = gethrestime_sec(); | |
312 | ||
313 | VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, | |
314 | VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), | |
315 | REBUILD_PHYS_ENTRIES, vrp, tx)); | |
316 | ||
b2255edc | 317 | vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE); |
9a49d3f3 BB |
318 | spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); |
319 | ||
320 | spa_history_log_internal(spa, "rebuild", tx, | |
321 | "vdev_id=%llu vdev_guid=%llu complete", | |
322 | (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); | |
323 | vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH); | |
324 | ||
325 | /* Handles detaching of spares */ | |
326 | spa_async_request(spa, SPA_ASYNC_REBUILD_DONE); | |
327 | vd->vdev_rebuilding = B_FALSE; | |
328 | mutex_exit(&vd->vdev_rebuild_lock); | |
329 | ||
b2255edc BB |
330 | /* |
331 | * While we're in syncing context take the opportunity to | |
332 | * setup the scrub when there are no more active rebuilds. | |
333 | */ | |
334 | if (!vdev_rebuild_active(spa->spa_root_vdev) && | |
335 | zfs_rebuild_scrub_enabled) { | |
336 | pool_scan_func_t func = POOL_SCAN_SCRUB; | |
337 | dsl_scan_setup_sync(&func, tx); | |
338 | } | |
339 | ||
9a49d3f3 BB |
340 | cv_broadcast(&vd->vdev_rebuild_cv); |
341 | } | |
342 | ||
343 | /* | |
344 | * Update the on-disk state to canceled when a rebuild finishes. | |
345 | */ | |
346 | static void | |
347 | vdev_rebuild_cancel_sync(void *arg, dmu_tx_t *tx) | |
348 | { | |
349 | int vdev_id = (uintptr_t)arg; | |
350 | spa_t *spa = dmu_tx_pool(tx)->dp_spa; | |
351 | vdev_t *vd = vdev_lookup_top(spa, vdev_id); | |
352 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
353 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
354 | ||
355 | mutex_enter(&vd->vdev_rebuild_lock); | |
356 | vrp->vrp_rebuild_state = VDEV_REBUILD_CANCELED; | |
357 | vrp->vrp_end_time = gethrestime_sec(); | |
358 | ||
359 | VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, | |
360 | VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), | |
361 | REBUILD_PHYS_ENTRIES, vrp, tx)); | |
362 | ||
363 | spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); | |
364 | ||
365 | spa_history_log_internal(spa, "rebuild", tx, | |
366 | "vdev_id=%llu vdev_guid=%llu canceled", | |
367 | (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); | |
368 | vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH); | |
369 | ||
370 | vd->vdev_rebuild_cancel_wanted = B_FALSE; | |
371 | vd->vdev_rebuilding = B_FALSE; | |
372 | mutex_exit(&vd->vdev_rebuild_lock); | |
373 | ||
374 | spa_notify_waiters(spa); | |
375 | cv_broadcast(&vd->vdev_rebuild_cv); | |
376 | } | |
377 | ||
378 | /* | |
379 | * Resets the progress of a running rebuild. This will occur when a new | |
380 | * vdev is added to rebuild. | |
381 | */ | |
382 | static void | |
383 | vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx) | |
384 | { | |
385 | int vdev_id = (uintptr_t)arg; | |
386 | spa_t *spa = dmu_tx_pool(tx)->dp_spa; | |
387 | vdev_t *vd = vdev_lookup_top(spa, vdev_id); | |
388 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
389 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
390 | ||
391 | mutex_enter(&vd->vdev_rebuild_lock); | |
392 | ||
393 | ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); | |
394 | ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); | |
395 | ||
396 | vrp->vrp_last_offset = 0; | |
397 | vrp->vrp_min_txg = 0; | |
398 | vrp->vrp_max_txg = dmu_tx_get_txg(tx); | |
399 | vrp->vrp_bytes_scanned = 0; | |
400 | vrp->vrp_bytes_issued = 0; | |
401 | vrp->vrp_bytes_rebuilt = 0; | |
402 | vrp->vrp_bytes_est = 0; | |
403 | vrp->vrp_scan_time_ms = 0; | |
404 | vr->vr_prev_scan_time_ms = 0; | |
405 | ||
406 | /* See vdev_rebuild_initiate_sync comment */ | |
407 | VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)); | |
408 | ||
409 | VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, | |
410 | VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), | |
411 | REBUILD_PHYS_ENTRIES, vrp, tx)); | |
412 | ||
413 | spa_history_log_internal(spa, "rebuild", tx, | |
414 | "vdev_id=%llu vdev_guid=%llu reset", | |
415 | (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); | |
416 | ||
417 | vd->vdev_rebuild_reset_wanted = B_FALSE; | |
418 | ASSERT(vd->vdev_rebuilding); | |
419 | ||
420 | vd->vdev_rebuild_thread = thread_create(NULL, 0, | |
421 | vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri); | |
422 | ||
423 | mutex_exit(&vd->vdev_rebuild_lock); | |
424 | } | |
425 | ||
426 | /* | |
427 | * Clear the last rebuild status. | |
428 | */ | |
429 | void | |
430 | vdev_rebuild_clear_sync(void *arg, dmu_tx_t *tx) | |
431 | { | |
432 | int vdev_id = (uintptr_t)arg; | |
433 | spa_t *spa = dmu_tx_pool(tx)->dp_spa; | |
434 | vdev_t *vd = vdev_lookup_top(spa, vdev_id); | |
435 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
436 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
437 | objset_t *mos = spa_meta_objset(spa); | |
438 | ||
439 | mutex_enter(&vd->vdev_rebuild_lock); | |
440 | ||
441 | if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD) || | |
442 | vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE) { | |
443 | mutex_exit(&vd->vdev_rebuild_lock); | |
444 | return; | |
445 | } | |
446 | ||
447 | clear_rebuild_bytes(vd); | |
448 | bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); | |
449 | ||
450 | if (vd->vdev_top_zap != 0 && zap_contains(mos, vd->vdev_top_zap, | |
451 | VDEV_TOP_ZAP_VDEV_REBUILD_PHYS) == 0) { | |
452 | VERIFY0(zap_update(mos, vd->vdev_top_zap, | |
453 | VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), | |
454 | REBUILD_PHYS_ENTRIES, vrp, tx)); | |
455 | } | |
456 | ||
457 | mutex_exit(&vd->vdev_rebuild_lock); | |
458 | } | |
459 | ||
460 | /* | |
461 | * The zio_done_func_t callback for each rebuild I/O issued. It's responsible | |
462 | * for updating the rebuild stats and limiting the number of in flight I/Os. | |
463 | */ | |
464 | static void | |
465 | vdev_rebuild_cb(zio_t *zio) | |
466 | { | |
467 | vdev_rebuild_t *vr = zio->io_private; | |
468 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
469 | vdev_t *vd = vr->vr_top_vdev; | |
470 | ||
b2255edc | 471 | mutex_enter(&vr->vr_io_lock); |
9a49d3f3 BB |
472 | if (zio->io_error == ENXIO && !vdev_writeable(vd)) { |
473 | /* | |
474 | * The I/O failed because the top-level vdev was unavailable. | |
475 | * Attempt to roll back to the last completed offset, in order | |
476 | * resume from the correct location if the pool is resumed. | |
477 | * (This works because spa_sync waits on spa_txg_zio before | |
478 | * it runs sync tasks.) | |
479 | */ | |
480 | uint64_t *off = &vr->vr_scan_offset[zio->io_txg & TXG_MASK]; | |
481 | *off = MIN(*off, zio->io_offset); | |
482 | } else if (zio->io_error) { | |
483 | vrp->vrp_errors++; | |
484 | } | |
485 | ||
486 | abd_free(zio->io_abd); | |
487 | ||
b2255edc BB |
488 | ASSERT3U(vr->vr_bytes_inflight, >, 0); |
489 | vr->vr_bytes_inflight -= zio->io_size; | |
490 | cv_broadcast(&vr->vr_io_cv); | |
491 | mutex_exit(&vr->vr_io_lock); | |
9a49d3f3 BB |
492 | |
493 | spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); | |
494 | } | |
495 | ||
496 | /* | |
b2255edc BB |
497 | * Initialize a block pointer that can be used to read the given segment |
498 | * for sequential rebuild. | |
9a49d3f3 BB |
499 | */ |
500 | static void | |
b2255edc BB |
501 | vdev_rebuild_blkptr_init(blkptr_t *bp, vdev_t *vd, uint64_t start, |
502 | uint64_t asize) | |
9a49d3f3 | 503 | { |
b2255edc BB |
504 | ASSERT(vd->vdev_ops == &vdev_draid_ops || |
505 | vd->vdev_ops == &vdev_mirror_ops || | |
9a49d3f3 BB |
506 | vd->vdev_ops == &vdev_replacing_ops || |
507 | vd->vdev_ops == &vdev_spare_ops); | |
508 | ||
b2255edc BB |
509 | uint64_t psize = vd->vdev_ops == &vdev_draid_ops ? |
510 | vdev_draid_asize_to_psize(vd, asize) : asize; | |
511 | ||
9a49d3f3 BB |
512 | BP_ZERO(bp); |
513 | ||
514 | DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); | |
515 | DVA_SET_OFFSET(&bp->blk_dva[0], start); | |
516 | DVA_SET_GANG(&bp->blk_dva[0], 0); | |
517 | DVA_SET_ASIZE(&bp->blk_dva[0], asize); | |
518 | ||
519 | BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); | |
520 | BP_SET_LSIZE(bp, psize); | |
521 | BP_SET_PSIZE(bp, psize); | |
522 | BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); | |
523 | BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); | |
524 | BP_SET_TYPE(bp, DMU_OT_NONE); | |
525 | BP_SET_LEVEL(bp, 0); | |
526 | BP_SET_DEDUP(bp, 0); | |
527 | BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); | |
9a49d3f3 BB |
528 | } |
529 | ||
530 | /* | |
531 | * Issues a rebuild I/O and takes care of rate limiting the number of queued | |
532 | * rebuild I/Os. The provided start and size must be properly aligned for the | |
533 | * top-level vdev type being rebuilt. | |
534 | */ | |
535 | static int | |
536 | vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) | |
537 | { | |
538 | uint64_t ms_id __maybe_unused = vr->vr_scan_msp->ms_id; | |
539 | vdev_t *vd = vr->vr_top_vdev; | |
540 | spa_t *spa = vd->vdev_spa; | |
b2255edc | 541 | blkptr_t blk; |
9a49d3f3 BB |
542 | |
543 | ASSERT3U(ms_id, ==, start >> vd->vdev_ms_shift); | |
544 | ASSERT3U(ms_id, ==, (start + size - 1) >> vd->vdev_ms_shift); | |
545 | ||
546 | vr->vr_pass_bytes_scanned += size; | |
547 | vr->vr_rebuild_phys.vrp_bytes_scanned += size; | |
548 | ||
b2255edc BB |
549 | /* |
550 | * Rebuild the data in this range by constructing a special block | |
551 | * pointer. It has no relation to any existing blocks in the pool. | |
552 | * However, by disabling checksum verification and issuing a scrub IO | |
553 | * we can reconstruct and repair any children with missing data. | |
554 | */ | |
555 | vdev_rebuild_blkptr_init(&blk, vd, start, size); | |
556 | uint64_t psize = BP_GET_PSIZE(&blk); | |
557 | ||
558 | if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN)) | |
559 | return (0); | |
560 | ||
561 | mutex_enter(&vr->vr_io_lock); | |
9a49d3f3 BB |
562 | |
563 | /* Limit in flight rebuild I/Os */ | |
b2255edc BB |
564 | while (vr->vr_bytes_inflight >= vr->vr_bytes_inflight_max) |
565 | cv_wait(&vr->vr_io_cv, &vr->vr_io_lock); | |
9a49d3f3 | 566 | |
b2255edc BB |
567 | vr->vr_bytes_inflight += psize; |
568 | mutex_exit(&vr->vr_io_lock); | |
9a49d3f3 BB |
569 | |
570 | dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); | |
571 | VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); | |
572 | uint64_t txg = dmu_tx_get_txg(tx); | |
573 | ||
574 | spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); | |
575 | mutex_enter(&vd->vdev_rebuild_lock); | |
576 | ||
577 | /* This is the first I/O for this txg. */ | |
578 | if (vr->vr_scan_offset[txg & TXG_MASK] == 0) { | |
579 | vr->vr_scan_offset[txg & TXG_MASK] = start; | |
580 | dsl_sync_task_nowait(spa_get_dsl(spa), | |
581 | vdev_rebuild_update_sync, | |
38080324 | 582 | (void *)(uintptr_t)vd->vdev_id, tx); |
9a49d3f3 BB |
583 | } |
584 | ||
585 | /* When exiting write out our progress. */ | |
586 | if (vdev_rebuild_should_stop(vd)) { | |
b2255edc BB |
587 | mutex_enter(&vr->vr_io_lock); |
588 | vr->vr_bytes_inflight -= psize; | |
589 | mutex_exit(&vr->vr_io_lock); | |
9a49d3f3 BB |
590 | spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); |
591 | mutex_exit(&vd->vdev_rebuild_lock); | |
592 | dmu_tx_commit(tx); | |
593 | return (SET_ERROR(EINTR)); | |
594 | } | |
595 | mutex_exit(&vd->vdev_rebuild_lock); | |
b2255edc | 596 | dmu_tx_commit(tx); |
9a49d3f3 BB |
597 | |
598 | vr->vr_scan_offset[txg & TXG_MASK] = start + size; | |
b2255edc BB |
599 | vr->vr_pass_bytes_issued += size; |
600 | vr->vr_rebuild_phys.vrp_bytes_issued += size; | |
9a49d3f3 | 601 | |
b2255edc BB |
602 | zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, &blk, |
603 | abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr, | |
604 | ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL | | |
605 | ZIO_FLAG_RESILVER, NULL)); | |
9a49d3f3 BB |
606 | |
607 | return (0); | |
608 | } | |
609 | ||
9a49d3f3 BB |
610 | /* |
611 | * Issues rebuild I/Os for all ranges in the provided vr->vr_tree range tree. | |
612 | */ | |
613 | static int | |
614 | vdev_rebuild_ranges(vdev_rebuild_t *vr) | |
615 | { | |
616 | vdev_t *vd = vr->vr_top_vdev; | |
617 | zfs_btree_t *t = &vr->vr_scan_tree->rt_root; | |
618 | zfs_btree_index_t idx; | |
619 | int error; | |
620 | ||
621 | for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL; | |
622 | rs = zfs_btree_next(t, &idx, &idx)) { | |
623 | uint64_t start = rs_get_start(rs, vr->vr_scan_tree); | |
624 | uint64_t size = rs_get_end(rs, vr->vr_scan_tree) - start; | |
625 | ||
626 | /* | |
627 | * zfs_scan_suspend_progress can be set to disable rebuild | |
628 | * progress for testing. See comment in dsl_scan_sync(). | |
629 | */ | |
630 | while (zfs_scan_suspend_progress && | |
631 | !vdev_rebuild_should_stop(vd)) { | |
632 | delay(hz); | |
633 | } | |
634 | ||
635 | while (size > 0) { | |
636 | uint64_t chunk_size; | |
637 | ||
b2255edc BB |
638 | /* |
639 | * Split range into legally-sized logical chunks | |
640 | * given the constraints of the top-level vdev | |
641 | * being rebuilt (dRAID or mirror). | |
642 | */ | |
643 | ASSERT3P(vd->vdev_ops, !=, NULL); | |
644 | chunk_size = vd->vdev_ops->vdev_op_rebuild_asize(vd, | |
645 | start, size, zfs_rebuild_max_segment); | |
9a49d3f3 BB |
646 | |
647 | error = vdev_rebuild_range(vr, start, chunk_size); | |
648 | if (error != 0) | |
649 | return (error); | |
650 | ||
651 | size -= chunk_size; | |
652 | start += chunk_size; | |
653 | } | |
654 | } | |
655 | ||
656 | return (0); | |
657 | } | |
658 | ||
659 | /* | |
660 | * Calculates the estimated capacity which remains to be scanned. Since | |
661 | * we traverse the pool in metaslab order only allocated capacity beyond | |
662 | * the vrp_last_offset need be considered. All lower offsets must have | |
663 | * already been rebuilt and are thus already included in vrp_bytes_scanned. | |
664 | */ | |
665 | static void | |
666 | vdev_rebuild_update_bytes_est(vdev_t *vd, uint64_t ms_id) | |
667 | { | |
668 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
669 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
670 | uint64_t bytes_est = vrp->vrp_bytes_scanned; | |
671 | ||
672 | if (vrp->vrp_last_offset < vd->vdev_ms[ms_id]->ms_start) | |
673 | return; | |
674 | ||
675 | for (uint64_t i = ms_id; i < vd->vdev_ms_count; i++) { | |
676 | metaslab_t *msp = vd->vdev_ms[i]; | |
677 | ||
678 | mutex_enter(&msp->ms_lock); | |
679 | bytes_est += metaslab_allocated_space(msp); | |
680 | mutex_exit(&msp->ms_lock); | |
681 | } | |
682 | ||
683 | vrp->vrp_bytes_est = bytes_est; | |
684 | } | |
685 | ||
686 | /* | |
687 | * Load from disk the top-level vdev's rebuild information. | |
688 | */ | |
689 | int | |
690 | vdev_rebuild_load(vdev_t *vd) | |
691 | { | |
692 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
693 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
694 | spa_t *spa = vd->vdev_spa; | |
695 | int err = 0; | |
696 | ||
697 | mutex_enter(&vd->vdev_rebuild_lock); | |
698 | vd->vdev_rebuilding = B_FALSE; | |
699 | ||
700 | if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) { | |
701 | bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); | |
702 | mutex_exit(&vd->vdev_rebuild_lock); | |
703 | return (SET_ERROR(ENOTSUP)); | |
704 | } | |
705 | ||
706 | ASSERT(vd->vdev_top == vd); | |
707 | ||
708 | err = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, | |
709 | VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), | |
710 | REBUILD_PHYS_ENTRIES, vrp); | |
711 | ||
712 | /* | |
713 | * A missing or damaged VDEV_TOP_ZAP_VDEV_REBUILD_PHYS should | |
714 | * not prevent a pool from being imported. Clear the rebuild | |
715 | * status allowing a new resilver/rebuild to be started. | |
716 | */ | |
717 | if (err == ENOENT || err == EOVERFLOW || err == ECKSUM) { | |
718 | bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); | |
719 | } else if (err) { | |
720 | mutex_exit(&vd->vdev_rebuild_lock); | |
721 | return (err); | |
722 | } | |
723 | ||
724 | vr->vr_prev_scan_time_ms = vrp->vrp_scan_time_ms; | |
725 | vr->vr_top_vdev = vd; | |
726 | ||
727 | mutex_exit(&vd->vdev_rebuild_lock); | |
728 | ||
729 | return (0); | |
730 | } | |
731 | ||
732 | /* | |
733 | * Each scan thread is responsible for rebuilding a top-level vdev. The | |
734 | * rebuild progress is tracked on-disk in VDEV_TOP_ZAP_VDEV_REBUILD_PHYS. | |
735 | */ | |
736 | static void | |
737 | vdev_rebuild_thread(void *arg) | |
738 | { | |
739 | vdev_t *vd = arg; | |
740 | spa_t *spa = vd->vdev_spa; | |
741 | int error = 0; | |
742 | ||
743 | /* | |
744 | * If there's a scrub in process request that it be stopped. This | |
745 | * is not required for a correct rebuild, but we do want rebuilds to | |
746 | * emulate the resilver behavior as much as possible. | |
747 | */ | |
748 | dsl_pool_t *dsl = spa_get_dsl(spa); | |
749 | if (dsl_scan_scrubbing(dsl)) | |
750 | dsl_scan_cancel(dsl); | |
751 | ||
752 | spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); | |
753 | mutex_enter(&vd->vdev_rebuild_lock); | |
754 | ||
755 | ASSERT3P(vd->vdev_top, ==, vd); | |
756 | ASSERT3P(vd->vdev_rebuild_thread, !=, NULL); | |
757 | ASSERT(vd->vdev_rebuilding); | |
758 | ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REBUILD)); | |
759 | ASSERT3B(vd->vdev_rebuild_cancel_wanted, ==, B_FALSE); | |
760 | ASSERT3B(vd->vdev_rebuild_reset_wanted, ==, B_FALSE); | |
761 | ||
762 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
763 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
764 | vr->vr_top_vdev = vd; | |
765 | vr->vr_scan_msp = NULL; | |
766 | vr->vr_scan_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); | |
b2255edc BB |
767 | mutex_init(&vr->vr_io_lock, NULL, MUTEX_DEFAULT, NULL); |
768 | cv_init(&vr->vr_io_cv, NULL, CV_DEFAULT, NULL); | |
769 | ||
9a49d3f3 BB |
770 | vr->vr_pass_start_time = gethrtime(); |
771 | vr->vr_pass_bytes_scanned = 0; | |
772 | vr->vr_pass_bytes_issued = 0; | |
773 | ||
b2255edc BB |
774 | vr->vr_bytes_inflight_max = MAX(1ULL << 20, |
775 | zfs_rebuild_vdev_limit * vd->vdev_children); | |
776 | ||
9a49d3f3 BB |
777 | uint64_t update_est_time = gethrtime(); |
778 | vdev_rebuild_update_bytes_est(vd, 0); | |
779 | ||
780 | clear_rebuild_bytes(vr->vr_top_vdev); | |
781 | ||
782 | mutex_exit(&vd->vdev_rebuild_lock); | |
783 | ||
784 | /* | |
785 | * Systematically walk the metaslabs and issue rebuild I/Os for | |
786 | * all ranges in the allocated space map. | |
787 | */ | |
788 | for (uint64_t i = 0; i < vd->vdev_ms_count; i++) { | |
789 | metaslab_t *msp = vd->vdev_ms[i]; | |
790 | vr->vr_scan_msp = msp; | |
791 | ||
792 | /* | |
793 | * Removal of vdevs from the vdev tree may eliminate the need | |
794 | * for the rebuild, in which case it should be canceled. The | |
795 | * vdev_rebuild_cancel_wanted flag is set until the sync task | |
796 | * completes. This may be after the rebuild thread exits. | |
797 | */ | |
798 | if (vdev_rebuild_should_cancel(vd)) { | |
799 | vd->vdev_rebuild_cancel_wanted = B_TRUE; | |
800 | error = EINTR; | |
801 | break; | |
802 | } | |
803 | ||
804 | ASSERT0(range_tree_space(vr->vr_scan_tree)); | |
805 | ||
b2255edc | 806 | /* Disable any new allocations to this metaslab */ |
9a49d3f3 BB |
807 | metaslab_disable(msp); |
808 | spa_config_exit(spa, SCL_CONFIG, FTAG); | |
9a49d3f3 BB |
809 | |
810 | mutex_enter(&msp->ms_sync_lock); | |
811 | mutex_enter(&msp->ms_lock); | |
812 | ||
b2255edc BB |
813 | /* |
814 | * If there are outstanding allocations wait for them to be | |
815 | * synced. This is needed to ensure all allocated ranges are | |
816 | * on disk and therefore will be rebuilt. | |
817 | */ | |
818 | for (int j = 0; j < TXG_SIZE; j++) { | |
819 | if (range_tree_space(msp->ms_allocating[j])) { | |
820 | mutex_exit(&msp->ms_lock); | |
821 | mutex_exit(&msp->ms_sync_lock); | |
822 | txg_wait_synced(dsl, 0); | |
823 | mutex_enter(&msp->ms_sync_lock); | |
824 | mutex_enter(&msp->ms_lock); | |
825 | break; | |
826 | } | |
827 | } | |
828 | ||
9a49d3f3 BB |
829 | /* |
830 | * When a metaslab has been allocated from read its allocated | |
b2255edc | 831 | * ranges from the space map object into the vr_scan_tree. |
9a49d3f3 BB |
832 | * Then add inflight / unflushed ranges and remove inflight / |
833 | * unflushed frees. This is the minimum range to be rebuilt. | |
834 | */ | |
835 | if (msp->ms_sm != NULL) { | |
836 | VERIFY0(space_map_load(msp->ms_sm, | |
837 | vr->vr_scan_tree, SM_ALLOC)); | |
838 | ||
839 | for (int i = 0; i < TXG_SIZE; i++) { | |
840 | ASSERT0(range_tree_space( | |
841 | msp->ms_allocating[i])); | |
842 | } | |
843 | ||
844 | range_tree_walk(msp->ms_unflushed_allocs, | |
845 | range_tree_add, vr->vr_scan_tree); | |
846 | range_tree_walk(msp->ms_unflushed_frees, | |
847 | range_tree_remove, vr->vr_scan_tree); | |
848 | ||
849 | /* | |
850 | * Remove ranges which have already been rebuilt based | |
851 | * on the last offset. This can happen when restarting | |
852 | * a scan after exporting and re-importing the pool. | |
853 | */ | |
854 | range_tree_clear(vr->vr_scan_tree, 0, | |
855 | vrp->vrp_last_offset); | |
856 | } | |
857 | ||
858 | mutex_exit(&msp->ms_lock); | |
859 | mutex_exit(&msp->ms_sync_lock); | |
860 | ||
861 | /* | |
862 | * To provide an accurate estimate re-calculate the estimated | |
863 | * size every 5 minutes to account for recent allocations and | |
b2255edc | 864 | * frees made to space maps which have not yet been rebuilt. |
9a49d3f3 BB |
865 | */ |
866 | if (gethrtime() > update_est_time + SEC2NSEC(300)) { | |
867 | update_est_time = gethrtime(); | |
868 | vdev_rebuild_update_bytes_est(vd, i); | |
869 | } | |
870 | ||
871 | /* | |
872 | * Walk the allocated space map and issue the rebuild I/O. | |
873 | */ | |
874 | error = vdev_rebuild_ranges(vr); | |
875 | range_tree_vacate(vr->vr_scan_tree, NULL, NULL); | |
876 | ||
877 | spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); | |
878 | metaslab_enable(msp, B_FALSE, B_FALSE); | |
879 | ||
880 | if (error != 0) | |
881 | break; | |
882 | } | |
883 | ||
884 | range_tree_destroy(vr->vr_scan_tree); | |
885 | spa_config_exit(spa, SCL_CONFIG, FTAG); | |
886 | ||
887 | /* Wait for any remaining rebuild I/O to complete */ | |
b2255edc BB |
888 | mutex_enter(&vr->vr_io_lock); |
889 | while (vr->vr_bytes_inflight > 0) | |
890 | cv_wait(&vr->vr_io_cv, &vr->vr_io_lock); | |
9a49d3f3 | 891 | |
b2255edc BB |
892 | mutex_exit(&vr->vr_io_lock); |
893 | ||
894 | mutex_destroy(&vr->vr_io_lock); | |
895 | cv_destroy(&vr->vr_io_cv); | |
9a49d3f3 BB |
896 | |
897 | spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); | |
898 | ||
899 | dsl_pool_t *dp = spa_get_dsl(spa); | |
900 | dmu_tx_t *tx = dmu_tx_create_dd(dp->dp_mos_dir); | |
901 | VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); | |
902 | ||
903 | mutex_enter(&vd->vdev_rebuild_lock); | |
904 | if (error == 0) { | |
905 | /* | |
906 | * After a successful rebuild clear the DTLs of all ranges | |
907 | * which were missing when the rebuild was started. These | |
908 | * ranges must have been rebuilt as a consequence of rebuilding | |
909 | * all allocated space. Note that unlike a scrub or resilver | |
910 | * the rebuild operation will reconstruct data only referenced | |
911 | * by a pool checkpoint. See the dsl_scan_done() comments. | |
912 | */ | |
913 | dsl_sync_task_nowait(dp, vdev_rebuild_complete_sync, | |
38080324 | 914 | (void *)(uintptr_t)vd->vdev_id, tx); |
9a49d3f3 BB |
915 | } else if (vd->vdev_rebuild_cancel_wanted) { |
916 | /* | |
917 | * The rebuild operation was canceled. This will occur when | |
918 | * a device participating in the rebuild is detached. | |
919 | */ | |
920 | dsl_sync_task_nowait(dp, vdev_rebuild_cancel_sync, | |
38080324 | 921 | (void *)(uintptr_t)vd->vdev_id, tx); |
9a49d3f3 BB |
922 | } else if (vd->vdev_rebuild_reset_wanted) { |
923 | /* | |
924 | * Reset the running rebuild without canceling and restarting | |
925 | * it. This will occur when a new device is attached and must | |
926 | * participate in the rebuild. | |
927 | */ | |
928 | dsl_sync_task_nowait(dp, vdev_rebuild_reset_sync, | |
38080324 | 929 | (void *)(uintptr_t)vd->vdev_id, tx); |
9a49d3f3 BB |
930 | } else { |
931 | /* | |
932 | * The rebuild operation should be suspended. This may occur | |
933 | * when detaching a child vdev or when exporting the pool. The | |
934 | * rebuild is left in the active state so it will be resumed. | |
935 | */ | |
936 | ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); | |
937 | vd->vdev_rebuilding = B_FALSE; | |
938 | } | |
939 | ||
940 | dmu_tx_commit(tx); | |
941 | ||
942 | vd->vdev_rebuild_thread = NULL; | |
943 | mutex_exit(&vd->vdev_rebuild_lock); | |
944 | spa_config_exit(spa, SCL_CONFIG, FTAG); | |
945 | ||
946 | cv_broadcast(&vd->vdev_rebuild_cv); | |
22dcf891 MM |
947 | |
948 | thread_exit(); | |
9a49d3f3 BB |
949 | } |
950 | ||
951 | /* | |
952 | * Returns B_TRUE if any top-level vdev are rebuilding. | |
953 | */ | |
954 | boolean_t | |
955 | vdev_rebuild_active(vdev_t *vd) | |
956 | { | |
957 | spa_t *spa = vd->vdev_spa; | |
958 | boolean_t ret = B_FALSE; | |
959 | ||
960 | if (vd == spa->spa_root_vdev) { | |
961 | for (uint64_t i = 0; i < vd->vdev_children; i++) { | |
962 | ret = vdev_rebuild_active(vd->vdev_child[i]); | |
963 | if (ret) | |
964 | return (ret); | |
965 | } | |
966 | } else if (vd->vdev_top_zap != 0) { | |
967 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
968 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
969 | ||
970 | mutex_enter(&vd->vdev_rebuild_lock); | |
971 | ret = (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); | |
972 | mutex_exit(&vd->vdev_rebuild_lock); | |
973 | } | |
974 | ||
975 | return (ret); | |
976 | } | |
977 | ||
978 | /* | |
979 | * Start a rebuild operation. The rebuild may be restarted when the | |
980 | * top-level vdev is currently actively rebuilding. | |
981 | */ | |
982 | void | |
983 | vdev_rebuild(vdev_t *vd) | |
984 | { | |
985 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
986 | vdev_rebuild_phys_t *vrp __maybe_unused = &vr->vr_rebuild_phys; | |
987 | ||
988 | ASSERT(vd->vdev_top == vd); | |
989 | ASSERT(vdev_is_concrete(vd)); | |
990 | ASSERT(!vd->vdev_removing); | |
991 | ASSERT(spa_feature_is_enabled(vd->vdev_spa, | |
992 | SPA_FEATURE_DEVICE_REBUILD)); | |
993 | ||
994 | mutex_enter(&vd->vdev_rebuild_lock); | |
995 | if (vd->vdev_rebuilding) { | |
996 | ASSERT3U(vrp->vrp_rebuild_state, ==, VDEV_REBUILD_ACTIVE); | |
997 | ||
998 | /* | |
999 | * Signal a running rebuild operation that it should restart | |
1000 | * from the beginning because a new device was attached. The | |
1001 | * vdev_rebuild_reset_wanted flag is set until the sync task | |
1002 | * completes. This may be after the rebuild thread exits. | |
1003 | */ | |
1004 | if (!vd->vdev_rebuild_reset_wanted) | |
1005 | vd->vdev_rebuild_reset_wanted = B_TRUE; | |
1006 | } else { | |
1007 | vdev_rebuild_initiate(vd); | |
1008 | } | |
1009 | mutex_exit(&vd->vdev_rebuild_lock); | |
1010 | } | |
1011 | ||
1012 | static void | |
1013 | vdev_rebuild_restart_impl(vdev_t *vd) | |
1014 | { | |
1015 | spa_t *spa = vd->vdev_spa; | |
1016 | ||
1017 | if (vd == spa->spa_root_vdev) { | |
1018 | for (uint64_t i = 0; i < vd->vdev_children; i++) | |
1019 | vdev_rebuild_restart_impl(vd->vdev_child[i]); | |
1020 | ||
1021 | } else if (vd->vdev_top_zap != 0) { | |
1022 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
1023 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
1024 | ||
1025 | mutex_enter(&vd->vdev_rebuild_lock); | |
1026 | if (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE && | |
1027 | vdev_writeable(vd) && !vd->vdev_rebuilding) { | |
1028 | ASSERT(spa_feature_is_active(spa, | |
1029 | SPA_FEATURE_DEVICE_REBUILD)); | |
1030 | vd->vdev_rebuilding = B_TRUE; | |
1031 | vd->vdev_rebuild_thread = thread_create(NULL, 0, | |
1032 | vdev_rebuild_thread, vd, 0, &p0, TS_RUN, | |
1033 | maxclsyspri); | |
1034 | } | |
1035 | mutex_exit(&vd->vdev_rebuild_lock); | |
1036 | } | |
1037 | } | |
1038 | ||
1039 | /* | |
1040 | * Conditionally restart all of the vdev_rebuild_thread's for a pool. The | |
1041 | * feature flag must be active and the rebuild in the active state. This | |
1042 | * cannot be used to start a new rebuild. | |
1043 | */ | |
1044 | void | |
1045 | vdev_rebuild_restart(spa_t *spa) | |
1046 | { | |
1047 | ASSERT(MUTEX_HELD(&spa_namespace_lock)); | |
1048 | ||
1049 | vdev_rebuild_restart_impl(spa->spa_root_vdev); | |
1050 | } | |
1051 | ||
1052 | /* | |
1053 | * Stop and wait for all of the vdev_rebuild_thread's associated with the | |
1054 | * vdev tree provide to be terminated (canceled or stopped). | |
1055 | */ | |
1056 | void | |
1057 | vdev_rebuild_stop_wait(vdev_t *vd) | |
1058 | { | |
1059 | spa_t *spa = vd->vdev_spa; | |
1060 | ||
1061 | ASSERT(MUTEX_HELD(&spa_namespace_lock)); | |
1062 | ||
1063 | if (vd == spa->spa_root_vdev) { | |
1064 | for (uint64_t i = 0; i < vd->vdev_children; i++) | |
1065 | vdev_rebuild_stop_wait(vd->vdev_child[i]); | |
1066 | ||
1067 | } else if (vd->vdev_top_zap != 0) { | |
1068 | ASSERT(vd == vd->vdev_top); | |
1069 | ||
1070 | mutex_enter(&vd->vdev_rebuild_lock); | |
1071 | if (vd->vdev_rebuild_thread != NULL) { | |
1072 | vd->vdev_rebuild_exit_wanted = B_TRUE; | |
1073 | while (vd->vdev_rebuilding) { | |
1074 | cv_wait(&vd->vdev_rebuild_cv, | |
1075 | &vd->vdev_rebuild_lock); | |
1076 | } | |
1077 | vd->vdev_rebuild_exit_wanted = B_FALSE; | |
1078 | } | |
1079 | mutex_exit(&vd->vdev_rebuild_lock); | |
1080 | } | |
1081 | } | |
1082 | ||
1083 | /* | |
1084 | * Stop all rebuild operations but leave them in the active state so they | |
1085 | * will be resumed when importing the pool. | |
1086 | */ | |
1087 | void | |
1088 | vdev_rebuild_stop_all(spa_t *spa) | |
1089 | { | |
1090 | vdev_rebuild_stop_wait(spa->spa_root_vdev); | |
1091 | } | |
1092 | ||
1093 | /* | |
1094 | * Rebuild statistics reported per top-level vdev. | |
1095 | */ | |
1096 | int | |
1097 | vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs) | |
1098 | { | |
1099 | spa_t *spa = tvd->vdev_spa; | |
1100 | ||
1101 | if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) | |
1102 | return (SET_ERROR(ENOTSUP)); | |
1103 | ||
1104 | if (tvd != tvd->vdev_top || tvd->vdev_top_zap == 0) | |
1105 | return (SET_ERROR(EINVAL)); | |
1106 | ||
1107 | int error = zap_contains(spa_meta_objset(spa), | |
1108 | tvd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS); | |
1109 | ||
1110 | if (error == ENOENT) { | |
1111 | bzero(vrs, sizeof (vdev_rebuild_stat_t)); | |
1112 | vrs->vrs_state = VDEV_REBUILD_NONE; | |
1113 | error = 0; | |
1114 | } else if (error == 0) { | |
1115 | vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; | |
1116 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
1117 | ||
1118 | mutex_enter(&tvd->vdev_rebuild_lock); | |
1119 | vrs->vrs_state = vrp->vrp_rebuild_state; | |
1120 | vrs->vrs_start_time = vrp->vrp_start_time; | |
1121 | vrs->vrs_end_time = vrp->vrp_end_time; | |
1122 | vrs->vrs_scan_time_ms = vrp->vrp_scan_time_ms; | |
1123 | vrs->vrs_bytes_scanned = vrp->vrp_bytes_scanned; | |
1124 | vrs->vrs_bytes_issued = vrp->vrp_bytes_issued; | |
1125 | vrs->vrs_bytes_rebuilt = vrp->vrp_bytes_rebuilt; | |
1126 | vrs->vrs_bytes_est = vrp->vrp_bytes_est; | |
1127 | vrs->vrs_errors = vrp->vrp_errors; | |
1128 | vrs->vrs_pass_time_ms = NSEC2MSEC(gethrtime() - | |
1129 | vr->vr_pass_start_time); | |
1130 | vrs->vrs_pass_bytes_scanned = vr->vr_pass_bytes_scanned; | |
1131 | vrs->vrs_pass_bytes_issued = vr->vr_pass_bytes_issued; | |
1132 | mutex_exit(&tvd->vdev_rebuild_lock); | |
1133 | } | |
1134 | ||
1135 | return (error); | |
1136 | } | |
1137 | ||
1138 | /* BEGIN CSTYLED */ | |
1139 | ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, ULONG, ZMOD_RW, | |
b2255edc BB |
1140 | "Max segment size in bytes of rebuild reads"); |
1141 | ||
1142 | ZFS_MODULE_PARAM(zfs, zfs_, rebuild_vdev_limit, ULONG, ZMOD_RW, | |
1143 | "Max bytes in flight per leaf vdev for sequential resilvers"); | |
1144 | ||
1145 | ZFS_MODULE_PARAM(zfs, zfs_, rebuild_scrub_enabled, INT, ZMOD_RW, | |
1146 | "Automatically scrub after sequential resilver completes"); | |
9a49d3f3 | 1147 | /* END CSTYLED */ |