]>
Commit | Line | Data |
---|---|---|
9a49d3f3 BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
22 | * | |
23 | * Copyright (c) 2018, Intel Corporation. | |
24 | * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. | |
25 | */ | |
26 | ||
27 | #include <sys/vdev_impl.h> | |
b2255edc | 28 | #include <sys/vdev_draid.h> |
9a49d3f3 BB |
29 | #include <sys/dsl_scan.h> |
30 | #include <sys/spa_impl.h> | |
31 | #include <sys/metaslab_impl.h> | |
32 | #include <sys/vdev_rebuild.h> | |
33 | #include <sys/zio.h> | |
34 | #include <sys/dmu_tx.h> | |
35 | #include <sys/arc.h> | |
36 | #include <sys/zap.h> | |
37 | ||
38 | /* | |
39 | * This file contains the sequential reconstruction implementation for | |
40 | * resilvering. This form of resilvering is internally referred to as device | |
41 | * rebuild to avoid conflating it with the traditional healing reconstruction | |
42 | * performed by the dsl scan code. | |
43 | * | |
44 | * When replacing a device, or scrubbing the pool, ZFS has historically used | |
45 | * a process called resilvering which is a form of healing reconstruction. | |
46 | * This approach has the advantage that as blocks are read from disk their | |
47 | * checksums can be immediately verified and the data repaired. Unfortunately, | |
48 | * it also results in a random IO pattern to the disk even when extra care | |
49 | * is taken to sequentialize the IO as much as possible. This substantially | |
50 | * increases the time required to resilver the pool and restore redundancy. | |
51 | * | |
52 | * For mirrored devices it's possible to implement an alternate sequential | |
53 | * reconstruction strategy when resilvering. Sequential reconstruction | |
54 | * behaves like a traditional RAID rebuild and reconstructs a device in LBA | |
55 | * order without verifying the checksum. After this phase completes a second | |
56 | * scrub phase is started to verify all of the checksums. This two phase | |
57 | * process will take longer than the healing reconstruction described above. | |
58 | * However, it has that advantage that after the reconstruction first phase | |
59 | * completes redundancy has been restored. At this point the pool can incur | |
60 | * another device failure without risking data loss. | |
61 | * | |
62 | * There are a few noteworthy limitations and other advantages of resilvering | |
63 | * using sequential reconstruction vs healing reconstruction. | |
64 | * | |
65 | * Limitations: | |
66 | * | |
b2255edc BB |
67 | * - Sequential reconstruction is not possible on RAIDZ due to its |
68 | * variable stripe width. Note dRAID uses a fixed stripe width which | |
69 | * avoids this issue, but comes at the expense of some usable capacity. | |
9a49d3f3 | 70 | * |
b2255edc | 71 | * - Block checksums are not verified during sequential reconstruction. |
9a49d3f3 BB |
72 | * Similar to traditional RAID the parity/mirror data is reconstructed |
73 | * but cannot be immediately double checked. For this reason when the | |
b2255edc BB |
74 | * last active resilver completes the pool is automatically scrubbed |
75 | * by default. | |
9a49d3f3 BB |
76 | * |
77 | * - Deferred resilvers using sequential reconstruction are not currently | |
78 | * supported. When adding another vdev to an active top-level resilver | |
79 | * it must be restarted. | |
80 | * | |
81 | * Advantages: | |
82 | * | |
b2255edc | 83 | * - Sequential reconstruction is performed in LBA order which may be faster |
bf169e9f | 84 | * than healing reconstruction particularly when using HDDs (or |
9a49d3f3 BB |
85 | * especially with SMR devices). Only allocated capacity is resilvered. |
86 | * | |
87 | * - Sequential reconstruction is not constrained by ZFS block boundaries. | |
88 | * This allows it to issue larger IOs to disk which span multiple blocks | |
89 | * allowing all of these logical blocks to be repaired with a single IO. | |
90 | * | |
91 | * - Unlike a healing resilver or scrub which are pool wide operations, | |
b2255edc BB |
92 | * sequential reconstruction is handled by the top-level vdevs. This |
93 | * allows for it to be started or canceled on a top-level vdev without | |
94 | * impacting any other top-level vdevs in the pool. | |
9a49d3f3 BB |
95 | * |
96 | * - Data only referenced by a pool checkpoint will be repaired because | |
97 | * that space is reflected in the space maps. This differs for a | |
98 | * healing resilver or scrub which will not repair that data. | |
99 | */ | |
100 | ||
101 | ||
/*
 * Size of rebuild reads; defaults to 1MiB per data disk and is capped at
 * SPA_MAXBLOCKSIZE.
 */
static unsigned long zfs_rebuild_max_segment = 1024 * 1024;

/*
 * Maximum number of parallelly executed bytes per leaf vdev caused by a
 * sequential resilver.  We attempt to strike a balance here between keeping
 * the vdev queues full of I/Os at all times and not overflowing the queues
 * to cause long latency, which would cause long txg sync times.
 *
 * A large default value can be safely used here because the default target
 * segment size is also large (zfs_rebuild_max_segment=1M).  This helps keep
 * the queue depth short.
 *
 * 32MB was selected as the default value to achieve good performance with
 * a large 90-drive dRAID HDD configuration (draid2:8d:90c:2s).  A sequential
 * rebuild was unable to saturate all of the drives using smaller values.
 * With a value of 32MB the sequential resilver write rate was measured at
 * 800MB/s sustained while rebuilding to a distributed spare.
 */
static unsigned long zfs_rebuild_vdev_limit = 32 << 20;

/*
 * Automatically start a pool scrub when the last active sequential resilver
 * completes in order to verify the checksums of all blocks which have been
 * resilvered.  This option is enabled by default and is strongly recommended.
 */
static int zfs_rebuild_scrub_enabled = 1;

/*
 * Forward declaration of the rebuild thread entry point; it is started by
 * vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync().
 */
static __attribute__((noreturn)) void vdev_rebuild_thread(void *arg);
9a49d3f3 BB |
137 | |
138 | /* | |
139 | * Clear the per-vdev rebuild bytes value for a vdev tree. | |
140 | */ | |
141 | static void | |
142 | clear_rebuild_bytes(vdev_t *vd) | |
143 | { | |
144 | vdev_stat_t *vs = &vd->vdev_stat; | |
145 | ||
146 | for (uint64_t i = 0; i < vd->vdev_children; i++) | |
147 | clear_rebuild_bytes(vd->vdev_child[i]); | |
148 | ||
149 | mutex_enter(&vd->vdev_stat_lock); | |
150 | vs->vs_rebuild_processed = 0; | |
151 | mutex_exit(&vd->vdev_stat_lock); | |
152 | } | |
153 | ||
154 | /* | |
155 | * Determines whether a vdev_rebuild_thread() should be stopped. | |
156 | */ | |
157 | static boolean_t | |
158 | vdev_rebuild_should_stop(vdev_t *vd) | |
159 | { | |
160 | return (!vdev_writeable(vd) || vd->vdev_removing || | |
161 | vd->vdev_rebuild_exit_wanted || | |
162 | vd->vdev_rebuild_cancel_wanted || | |
163 | vd->vdev_rebuild_reset_wanted); | |
164 | } | |
165 | ||
166 | /* | |
167 | * Determine if the rebuild should be canceled. This may happen when all | |
168 | * vdevs with MISSING DTLs are detached. | |
169 | */ | |
170 | static boolean_t | |
171 | vdev_rebuild_should_cancel(vdev_t *vd) | |
172 | { | |
173 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
174 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
175 | ||
176 | if (!vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)) | |
177 | return (B_TRUE); | |
178 | ||
179 | return (B_FALSE); | |
180 | } | |
181 | ||
/*
 * The sync task for updating the on-disk state of a rebuild.  This is
 * scheduled by vdev_rebuild_range().
 */
static void
vdev_rebuild_update_sync(void *arg, dmu_tx_t *tx)
{
	int vdev_id = (uintptr_t)arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
	uint64_t txg = dmu_tx_get_txg(tx);

	mutex_enter(&vd->vdev_rebuild_lock);

	/*
	 * Persist the per-txg scan offset as the last completed offset and
	 * clear the slot so the next I/O in a later txg reinitializes it.
	 */
	if (vr->vr_scan_offset[txg & TXG_MASK] > 0) {
		vrp->vrp_last_offset = vr->vr_scan_offset[txg & TXG_MASK];
		vr->vr_scan_offset[txg & TXG_MASK] = 0;
	}

	/* Accumulated scan time: prior passes plus the current pass. */
	vrp->vrp_scan_time_ms = vr->vr_prev_scan_time_ms +
	    NSEC2MSEC(gethrtime() - vr->vr_pass_start_time);

	VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
	    REBUILD_PHYS_ENTRIES, vrp, tx));

	mutex_exit(&vd->vdev_rebuild_lock);
}
212 | ||
/*
 * Initialize the on-disk state for a new rebuild, start the rebuild thread.
 */
static void
vdev_rebuild_initiate_sync(void *arg, dmu_tx_t *tx)
{
	int vdev_id = (uintptr_t)arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;

	ASSERT(vd->vdev_rebuilding);

	/* Feature refcount is held for the duration of the rebuild. */
	spa_feature_incr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx);

	mutex_enter(&vd->vdev_rebuild_lock);
	memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
	vrp->vrp_rebuild_state = VDEV_REBUILD_ACTIVE;
	vrp->vrp_min_txg = 0;
	vrp->vrp_max_txg = dmu_tx_get_txg(tx);
	vrp->vrp_start_time = gethrestime_sec();
	vrp->vrp_scan_time_ms = 0;
	vr->vr_prev_scan_time_ms = 0;

	/*
	 * Rebuilds are currently only used when replacing a device, in which
	 * case there must be DTL_MISSING entries. In the future, we could
	 * allow rebuilds to be used in a way similar to a scrub. This would
	 * be useful because it would allow us to rebuild the space used by
	 * pool checkpoints.
	 */
	VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg));

	VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
	    REBUILD_PHYS_ENTRIES, vrp, tx));

	spa_history_log_internal(spa, "rebuild", tx,
	    "vdev_id=%llu vdev_guid=%llu started",
	    (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid);

	/* Kick off the scan thread; exactly one per top-level vdev. */
	ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);
	vd->vdev_rebuild_thread = thread_create(NULL, 0,
	    vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri);

	mutex_exit(&vd->vdev_rebuild_lock);
}
261 | ||
262 | static void | |
263 | vdev_rebuild_log_notify(spa_t *spa, vdev_t *vd, char *name) | |
264 | { | |
265 | nvlist_t *aux = fnvlist_alloc(); | |
266 | ||
267 | fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, "sequential"); | |
268 | spa_event_notify(spa, vd, aux, name); | |
269 | nvlist_free(aux); | |
270 | } | |
271 | ||
/*
 * Called to request that a new rebuild be started.  The feature will remain
 * active for the duration of the rebuild, then revert to the enabled state.
 */
static void
vdev_rebuild_initiate(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(vd->vdev_top == vd);
	ASSERT(MUTEX_HELD(&vd->vdev_rebuild_lock));
	ASSERT(!vd->vdev_rebuilding);

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));

	/*
	 * Mark the rebuild active before scheduling the sync task which
	 * will persist the state and start the rebuild thread.
	 */
	vd->vdev_rebuilding = B_TRUE;

	dsl_sync_task_nowait(spa_get_dsl(spa), vdev_rebuild_initiate_sync,
	    (void *)(uintptr_t)vd->vdev_id, tx);
	dmu_tx_commit(tx);

	vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_START);
}
296 | ||
/*
 * Update the on-disk state to completed when a rebuild finishes.
 */
static void
vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx)
{
	int vdev_id = (uintptr_t)arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;

	mutex_enter(&vd->vdev_rebuild_lock);
	vrp->vrp_rebuild_state = VDEV_REBUILD_COMPLETE;
	vrp->vrp_end_time = gethrestime_sec();

	VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
	    REBUILD_PHYS_ENTRIES, vrp, tx));

	/* Reassess the DTLs now that the rebuilt ranges are in place. */
	vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE);
	spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx);

	spa_history_log_internal(spa, "rebuild", tx,
	    "vdev_id=%llu vdev_guid=%llu complete",
	    (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid);
	vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH);

	/* Handles detaching of spares */
	spa_async_request(spa, SPA_ASYNC_REBUILD_DONE);
	vd->vdev_rebuilding = B_FALSE;
	mutex_exit(&vd->vdev_rebuild_lock);

	/*
	 * While we're in syncing context take the opportunity to
	 * setup the scrub when there are no more active rebuilds.
	 */
	pool_scan_func_t func = POOL_SCAN_SCRUB;
	if (dsl_scan_setup_check(&func, tx) == 0 &&
	    zfs_rebuild_scrub_enabled) {
		dsl_scan_setup_sync(&func, tx);
	}

	cv_broadcast(&vd->vdev_rebuild_cv);

	/* Clear recent error events (i.e. duplicate events tracking) */
	zfs_ereport_clear(spa, NULL);
}
345 | ||
/*
 * Update the on-disk state to canceled when a rebuild finishes.
 */
static void
vdev_rebuild_cancel_sync(void *arg, dmu_tx_t *tx)
{
	int vdev_id = (uintptr_t)arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;

	mutex_enter(&vd->vdev_rebuild_lock);
	vrp->vrp_rebuild_state = VDEV_REBUILD_CANCELED;
	vrp->vrp_end_time = gethrestime_sec();

	VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
	    REBUILD_PHYS_ENTRIES, vrp, tx));

	/* The feature reverts to enabled once the rebuild is no longer active. */
	spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx);

	spa_history_log_internal(spa, "rebuild", tx,
	    "vdev_id=%llu vdev_guid=%llu canceled",
	    (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid);
	vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH);

	/* The cancel request is only cleared here, after it has synced out. */
	vd->vdev_rebuild_cancel_wanted = B_FALSE;
	vd->vdev_rebuilding = B_FALSE;
	mutex_exit(&vd->vdev_rebuild_lock);

	spa_notify_waiters(spa);
	cv_broadcast(&vd->vdev_rebuild_cv);
}
380 | ||
/*
 * Resets the progress of a running rebuild.  This will occur when a new
 * vdev is added to rebuild.
 */
static void
vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx)
{
	int vdev_id = (uintptr_t)arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;

	mutex_enter(&vd->vdev_rebuild_lock);

	/* A reset may only occur while a rebuild is active but its thread
	 * has already exited (it stops before the reset is synced). */
	ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE);
	ASSERT3P(vd->vdev_rebuild_thread, ==, NULL);

	vrp->vrp_last_offset = 0;
	vrp->vrp_min_txg = 0;
	vrp->vrp_max_txg = dmu_tx_get_txg(tx);
	vrp->vrp_bytes_scanned = 0;
	vrp->vrp_bytes_issued = 0;
	vrp->vrp_bytes_rebuilt = 0;
	vrp->vrp_bytes_est = 0;
	vrp->vrp_scan_time_ms = 0;
	vr->vr_prev_scan_time_ms = 0;

	/* See vdev_rebuild_initiate_sync comment */
	VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg));

	VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
	    REBUILD_PHYS_ENTRIES, vrp, tx));

	spa_history_log_internal(spa, "rebuild", tx,
	    "vdev_id=%llu vdev_guid=%llu reset",
	    (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid);

	vd->vdev_rebuild_reset_wanted = B_FALSE;
	ASSERT(vd->vdev_rebuilding);

	/* Restart the rebuild thread from the beginning. */
	vd->vdev_rebuild_thread = thread_create(NULL, 0,
	    vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri);

	mutex_exit(&vd->vdev_rebuild_lock);
}
428 | ||
/*
 * Clear the last rebuild status.  Nothing is cleared when the feature is
 * disabled or while a rebuild is still active.
 */
void
vdev_rebuild_clear_sync(void *arg, dmu_tx_t *tx)
{
	int vdev_id = (uintptr_t)arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *vd = vdev_lookup_top(spa, vdev_id);
	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
	objset_t *mos = spa_meta_objset(spa);

	mutex_enter(&vd->vdev_rebuild_lock);

	if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD) ||
	    vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE) {
		mutex_exit(&vd->vdev_rebuild_lock);
		return;
	}

	clear_rebuild_bytes(vd);
	memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);

	/* Only rewrite the ZAP entry when one already exists on disk. */
	if (vd->vdev_top_zap != 0 && zap_contains(mos, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_VDEV_REBUILD_PHYS) == 0) {
		VERIFY0(zap_update(mos, vd->vdev_top_zap,
		    VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
		    REBUILD_PHYS_ENTRIES, vrp, tx));
	}

	mutex_exit(&vd->vdev_rebuild_lock);
}
462 | ||
/*
 * The zio_done_func_t callback for each rebuild I/O issued.  It's responsible
 * for updating the rebuild stats and limiting the number of in flight I/Os.
 */
static void
vdev_rebuild_cb(zio_t *zio)
{
	vdev_rebuild_t *vr = zio->io_private;
	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
	vdev_t *vd = vr->vr_top_vdev;

	mutex_enter(&vr->vr_io_lock);
	if (zio->io_error == ENXIO && !vdev_writeable(vd)) {
		/*
		 * The I/O failed because the top-level vdev was unavailable.
		 * Attempt to roll back to the last completed offset, in order
		 * resume from the correct location if the pool is resumed.
		 * (This works because spa_sync waits on spa_txg_zio before
		 * it runs sync tasks.)
		 */
		uint64_t *off = &vr->vr_scan_offset[zio->io_txg & TXG_MASK];
		*off = MIN(*off, zio->io_offset);
	} else if (zio->io_error) {
		vrp->vrp_errors++;
	}

	abd_free(zio->io_abd);

	/* Release this I/O's bytes and wake any waiter in vdev_rebuild_range(). */
	ASSERT3U(vr->vr_bytes_inflight, >, 0);
	vr->vr_bytes_inflight -= zio->io_size;
	cv_broadcast(&vr->vr_io_cv);
	mutex_exit(&vr->vr_io_lock);

	/* Drop the SCL_STATE_ALL hold taken when the I/O was issued. */
	spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
}
498 | ||
/*
 * Initialize a block pointer that can be used to read the given segment
 * for sequential rebuild.  The block pointer has no relation to any
 * existing block in the pool; it simply addresses an asize-sized range
 * on the top-level vdev with checksum and compression disabled.
 */
static void
vdev_rebuild_blkptr_init(blkptr_t *bp, vdev_t *vd, uint64_t start,
    uint64_t asize)
{
	ASSERT(vd->vdev_ops == &vdev_draid_ops ||
	    vd->vdev_ops == &vdev_mirror_ops ||
	    vd->vdev_ops == &vdev_replacing_ops ||
	    vd->vdev_ops == &vdev_spare_ops);

	/* dRAID asize includes parity/padding; convert it back to psize. */
	uint64_t psize = vd->vdev_ops == &vdev_draid_ops ?
	    vdev_draid_asize_to_psize(vd, asize) : asize;

	BP_ZERO(bp);

	DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id);
	DVA_SET_OFFSET(&bp->blk_dva[0], start);
	DVA_SET_GANG(&bp->blk_dva[0], 0);
	DVA_SET_ASIZE(&bp->blk_dva[0], asize);

	BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);
	BP_SET_LSIZE(bp, psize);
	BP_SET_PSIZE(bp, psize);
	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
	BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
	BP_SET_TYPE(bp, DMU_OT_NONE);
	BP_SET_LEVEL(bp, 0);
	BP_SET_DEDUP(bp, 0);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
}
532 | ||
/*
 * Issues a rebuild I/O and takes care of rate limiting the number of queued
 * rebuild I/Os.  The provided start and size must be properly aligned for the
 * top-level vdev type being rebuilt.  Returns 0, or EINTR when the rebuild
 * thread has been asked to stop.
 */
static int
vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size)
{
	uint64_t ms_id __maybe_unused = vr->vr_scan_msp->ms_id;
	vdev_t *vd = vr->vr_top_vdev;
	spa_t *spa = vd->vdev_spa;
	blkptr_t blk;

	/* The range must not cross a metaslab boundary. */
	ASSERT3U(ms_id, ==, start >> vd->vdev_ms_shift);
	ASSERT3U(ms_id, ==, (start + size - 1) >> vd->vdev_ms_shift);

	vr->vr_pass_bytes_scanned += size;
	vr->vr_rebuild_phys.vrp_bytes_scanned += size;

	/*
	 * Rebuild the data in this range by constructing a special block
	 * pointer.  It has no relation to any existing blocks in the pool.
	 * However, by disabling checksum verification and issuing a scrub IO
	 * we can reconstruct and repair any children with missing data.
	 */
	vdev_rebuild_blkptr_init(&blk, vd, start, size);
	uint64_t psize = BP_GET_PSIZE(&blk);

	if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN))
		return (0);

	mutex_enter(&vr->vr_io_lock);

	/* Limit in flight rebuild I/Os */
	while (vr->vr_bytes_inflight >= vr->vr_bytes_inflight_max)
		cv_wait(&vr->vr_io_cv, &vr->vr_io_lock);

	vr->vr_bytes_inflight += psize;
	mutex_exit(&vr->vr_io_lock);

	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
	uint64_t txg = dmu_tx_get_txg(tx);

	spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER);
	mutex_enter(&vd->vdev_rebuild_lock);

	/* This is the first I/O for this txg. */
	if (vr->vr_scan_offset[txg & TXG_MASK] == 0) {
		vr->vr_scan_offset[txg & TXG_MASK] = start;
		dsl_sync_task_nowait(spa_get_dsl(spa),
		    vdev_rebuild_update_sync,
		    (void *)(uintptr_t)vd->vdev_id, tx);
	}

	/* When exiting write out our progress. */
	if (vdev_rebuild_should_stop(vd)) {
		/* Undo the inflight accounting; the I/O was never issued. */
		mutex_enter(&vr->vr_io_lock);
		vr->vr_bytes_inflight -= psize;
		mutex_exit(&vr->vr_io_lock);
		spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
		mutex_exit(&vd->vdev_rebuild_lock);
		dmu_tx_commit(tx);
		return (SET_ERROR(EINTR));
	}
	mutex_exit(&vd->vdev_rebuild_lock);
	dmu_tx_commit(tx);

	vr->vr_scan_offset[txg & TXG_MASK] = start + size;
	vr->vr_pass_bytes_issued += size;
	vr->vr_rebuild_phys.vrp_bytes_issued += size;

	/* SCL_STATE_ALL is dropped by vdev_rebuild_cb() on completion. */
	zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, &blk,
	    abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr,
	    ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL |
	    ZIO_FLAG_RESILVER, NULL));

	return (0);
}
612 | ||
9a49d3f3 BB |
/*
 * Issues rebuild I/Os for all ranges in the provided vr->vr_tree range tree.
 * Returns 0, or the first non-zero error from vdev_rebuild_range().
 */
static int
vdev_rebuild_ranges(vdev_rebuild_t *vr)
{
	vdev_t *vd = vr->vr_top_vdev;
	zfs_btree_t *t = &vr->vr_scan_tree->rt_root;
	zfs_btree_index_t idx;
	int error;

	for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL;
	    rs = zfs_btree_next(t, &idx, &idx)) {
		uint64_t start = rs_get_start(rs, vr->vr_scan_tree);
		uint64_t size = rs_get_end(rs, vr->vr_scan_tree) - start;

		/*
		 * zfs_scan_suspend_progress can be set to disable rebuild
		 * progress for testing.  See comment in dsl_scan_sync().
		 */
		while (zfs_scan_suspend_progress &&
		    !vdev_rebuild_should_stop(vd)) {
			delay(hz);
		}

		while (size > 0) {
			uint64_t chunk_size;

			/*
			 * Split range into legally-sized logical chunks
			 * given the constraints of the top-level vdev
			 * being rebuilt (dRAID or mirror).
			 */
			ASSERT3P(vd->vdev_ops, !=, NULL);
			chunk_size = vd->vdev_ops->vdev_op_rebuild_asize(vd,
			    start, size, zfs_rebuild_max_segment);

			error = vdev_rebuild_range(vr, start, chunk_size);
			if (error != 0)
				return (error);

			size -= chunk_size;
			start += chunk_size;
		}
	}

	return (0);
}
661 | ||
662 | /* | |
663 | * Calculates the estimated capacity which remains to be scanned. Since | |
664 | * we traverse the pool in metaslab order only allocated capacity beyond | |
665 | * the vrp_last_offset need be considered. All lower offsets must have | |
666 | * already been rebuilt and are thus already included in vrp_bytes_scanned. | |
667 | */ | |
668 | static void | |
669 | vdev_rebuild_update_bytes_est(vdev_t *vd, uint64_t ms_id) | |
670 | { | |
671 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
672 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
673 | uint64_t bytes_est = vrp->vrp_bytes_scanned; | |
674 | ||
675 | if (vrp->vrp_last_offset < vd->vdev_ms[ms_id]->ms_start) | |
676 | return; | |
677 | ||
678 | for (uint64_t i = ms_id; i < vd->vdev_ms_count; i++) { | |
679 | metaslab_t *msp = vd->vdev_ms[i]; | |
680 | ||
681 | mutex_enter(&msp->ms_lock); | |
682 | bytes_est += metaslab_allocated_space(msp); | |
683 | mutex_exit(&msp->ms_lock); | |
684 | } | |
685 | ||
686 | vrp->vrp_bytes_est = bytes_est; | |
687 | } | |
688 | ||
/*
 * Load from disk the top-level vdev's rebuild information.  Returns 0 on
 * success, ENOTSUP when the feature is disabled, or the zap_lookup() error
 * for unexpected failures.
 */
int
vdev_rebuild_load(vdev_t *vd)
{
	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
	spa_t *spa = vd->vdev_spa;
	int err = 0;

	mutex_enter(&vd->vdev_rebuild_lock);
	vd->vdev_rebuilding = B_FALSE;

	if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) {
		memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
		mutex_exit(&vd->vdev_rebuild_lock);
		return (SET_ERROR(ENOTSUP));
	}

	ASSERT(vd->vdev_top == vd);

	err = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
	    VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
	    REBUILD_PHYS_ENTRIES, vrp);

	/*
	 * A missing or damaged VDEV_TOP_ZAP_VDEV_REBUILD_PHYS should
	 * not prevent a pool from being imported.  Clear the rebuild
	 * status allowing a new resilver/rebuild to be started.
	 */
	if (err == ENOENT || err == EOVERFLOW || err == ECKSUM) {
		memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES);
	} else if (err) {
		mutex_exit(&vd->vdev_rebuild_lock);
		return (err);
	}

	vr->vr_prev_scan_time_ms = vrp->vrp_scan_time_ms;
	vr->vr_top_vdev = vd;

	mutex_exit(&vd->vdev_rebuild_lock);

	return (0);
}
734 | ||
/*
 * Each scan thread is responsible for rebuilding a top-level vdev.  The
 * rebuild progress in tracked on-disk in VDEV_TOP_ZAP_VDEV_REBUILD_PHYS.
 *
 * The thread walks every metaslab of the top-level vdev, loads its
 * allocated space into vr_scan_tree, and issues the rebuild I/O via
 * vdev_rebuild_ranges().  On exit it dispatches exactly one sync task
 * (complete, cancel, or reset) or leaves the rebuild suspended-but-active
 * so it resumes on the next import.
 */
static __attribute__((noreturn)) void
vdev_rebuild_thread(void *arg)
{
	vdev_t *vd = arg;
	spa_t *spa = vd->vdev_spa;
	int error = 0;

	/*
	 * If there's a scrub in process request that it be stopped.  This
	 * is not required for a correct rebuild, but we do want rebuilds to
	 * emulate the resilver behavior as much as possible.
	 */
	dsl_pool_t *dsl = spa_get_dsl(spa);
	if (dsl_scan_scrubbing(dsl))
		dsl_scan_cancel(dsl);

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
	mutex_enter(&vd->vdev_rebuild_lock);

	/* Only a top-level vdev with an initiated, uncontested rebuild. */
	ASSERT3P(vd->vdev_top, ==, vd);
	ASSERT3P(vd->vdev_rebuild_thread, !=, NULL);
	ASSERT(vd->vdev_rebuilding);
	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REBUILD));
	ASSERT3B(vd->vdev_rebuild_cancel_wanted, ==, B_FALSE);
	ASSERT3B(vd->vdev_rebuild_reset_wanted, ==, B_FALSE);

	/* Per-pass state is reset here; vrp carries the on-disk progress. */
	vdev_rebuild_t *vr = &vd->vdev_rebuild_config;
	vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
	vr->vr_top_vdev = vd;
	vr->vr_scan_msp = NULL;
	vr->vr_scan_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
	mutex_init(&vr->vr_io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vr->vr_io_cv, NULL, CV_DEFAULT, NULL);

	vr->vr_pass_start_time = gethrtime();
	vr->vr_pass_bytes_scanned = 0;
	vr->vr_pass_bytes_issued = 0;

	/* Cap in-flight bytes; scaled per child, but never below 1 MiB. */
	vr->vr_bytes_inflight_max = MAX(1ULL << 20,
	    zfs_rebuild_vdev_limit * vd->vdev_children);

	uint64_t update_est_time = gethrtime();
	vdev_rebuild_update_bytes_est(vd, 0);

	clear_rebuild_bytes(vr->vr_top_vdev);

	mutex_exit(&vd->vdev_rebuild_lock);

	/*
	 * Systematically walk the metaslabs and issue rebuild I/Os for
	 * all ranges in the allocated space map.
	 */
	for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
		metaslab_t *msp = vd->vdev_ms[i];
		vr->vr_scan_msp = msp;

		/*
		 * Removal of vdevs from the vdev tree may eliminate the need
		 * for the rebuild, in which case it should be canceled.  The
		 * vdev_rebuild_cancel_wanted flag is set until the sync task
		 * completes.  This may be after the rebuild thread exits.
		 */
		if (vdev_rebuild_should_cancel(vd)) {
			vd->vdev_rebuild_cancel_wanted = B_TRUE;
			error = EINTR;
			break;
		}

		ASSERT0(range_tree_space(vr->vr_scan_tree));

		/* Disable any new allocations to this metaslab */
		spa_config_exit(spa, SCL_CONFIG, FTAG);
		metaslab_disable(msp);

		mutex_enter(&msp->ms_sync_lock);
		mutex_enter(&msp->ms_lock);

		/*
		 * If there are outstanding allocations wait for them to be
		 * synced.  This is needed to ensure all allocated ranges are
		 * on disk and therefore will be rebuilt.
		 *
		 * NOTE: both locks must be dropped around txg_wait_synced()
		 * and re-taken afterwards; the TXG_SIZE scan below the wait
		 * is restarted implicitly by breaking out of this loop.
		 */
		for (int j = 0; j < TXG_SIZE; j++) {
			if (range_tree_space(msp->ms_allocating[j])) {
				mutex_exit(&msp->ms_lock);
				mutex_exit(&msp->ms_sync_lock);
				txg_wait_synced(dsl, 0);
				mutex_enter(&msp->ms_sync_lock);
				mutex_enter(&msp->ms_lock);
				break;
			}
		}

		/*
		 * When a metaslab has been allocated from read its allocated
		 * ranges from the space map object into the vr_scan_tree.
		 * Then add inflight / unflushed ranges and remove inflight /
		 * unflushed frees.  This is the minimum range to be rebuilt.
		 */
		if (msp->ms_sm != NULL) {
			VERIFY0(space_map_load(msp->ms_sm,
			    vr->vr_scan_tree, SM_ALLOC));

			for (int i = 0; i < TXG_SIZE; i++) {
				ASSERT0(range_tree_space(
				    msp->ms_allocating[i]));
			}

			range_tree_walk(msp->ms_unflushed_allocs,
			    range_tree_add, vr->vr_scan_tree);
			range_tree_walk(msp->ms_unflushed_frees,
			    range_tree_remove, vr->vr_scan_tree);

			/*
			 * Remove ranges which have already been rebuilt based
			 * on the last offset.  This can happen when restarting
			 * a scan after exporting and re-importing the pool.
			 */
			range_tree_clear(vr->vr_scan_tree, 0,
			    vrp->vrp_last_offset);
		}

		mutex_exit(&msp->ms_lock);
		mutex_exit(&msp->ms_sync_lock);

		/*
		 * To provide an accurate estimate re-calculate the estimated
		 * size every 5 minutes to account for recent allocations and
		 * frees made to space maps which have not yet been rebuilt.
		 */
		if (gethrtime() > update_est_time + SEC2NSEC(300)) {
			update_est_time = gethrtime();
			vdev_rebuild_update_bytes_est(vd, i);
		}

		/*
		 * Walk the allocated space map and issue the rebuild I/O.
		 */
		error = vdev_rebuild_ranges(vr);
		range_tree_vacate(vr->vr_scan_tree, NULL, NULL);

		/* Re-take the config lock dropped before metaslab_disable() */
		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
		metaslab_enable(msp, B_FALSE, B_FALSE);

		if (error != 0)
			break;
	}

	range_tree_destroy(vr->vr_scan_tree);
	spa_config_exit(spa, SCL_CONFIG, FTAG);

	/* Wait for any remaining rebuild I/O to complete */
	mutex_enter(&vr->vr_io_lock);
	while (vr->vr_bytes_inflight > 0)
		cv_wait(&vr->vr_io_cv, &vr->vr_io_lock);

	mutex_exit(&vr->vr_io_lock);

	mutex_destroy(&vr->vr_io_lock);
	cv_destroy(&vr->vr_io_cv);

	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);

	/* All exit paths update the on-disk state via a sync task. */
	dsl_pool_t *dp = spa_get_dsl(spa);
	dmu_tx_t *tx = dmu_tx_create_dd(dp->dp_mos_dir);
	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));

	mutex_enter(&vd->vdev_rebuild_lock);
	if (error == 0) {
		/*
		 * After a successful rebuild clear the DTLs of all ranges
		 * which were missing when the rebuild was started.  These
		 * ranges must have been rebuilt as a consequence of rebuilding
		 * all allocated space.  Note that unlike a scrub or resilver
		 * the rebuild operation will reconstruct data only referenced
		 * by a pool checkpoint.  See the dsl_scan_done() comments.
		 */
		dsl_sync_task_nowait(dp, vdev_rebuild_complete_sync,
		    (void *)(uintptr_t)vd->vdev_id, tx);
	} else if (vd->vdev_rebuild_cancel_wanted) {
		/*
		 * The rebuild operation was canceled.  This will occur when
		 * a device participating in the rebuild is detached.
		 */
		dsl_sync_task_nowait(dp, vdev_rebuild_cancel_sync,
		    (void *)(uintptr_t)vd->vdev_id, tx);
	} else if (vd->vdev_rebuild_reset_wanted) {
		/*
		 * Reset the running rebuild without canceling and restarting
		 * it.  This will occur when a new device is attached and must
		 * participate in the rebuild.
		 */
		dsl_sync_task_nowait(dp, vdev_rebuild_reset_sync,
		    (void *)(uintptr_t)vd->vdev_id, tx);
	} else {
		/*
		 * The rebuild operation should be suspended.  This may occur
		 * when detaching a child vdev or when exporting the pool.  The
		 * rebuild is left in the active state so it will be resumed.
		 */
		ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE);
		vd->vdev_rebuilding = B_FALSE;
	}

	dmu_tx_commit(tx);

	/* Wake anyone blocked in vdev_rebuild_stop_wait(). */
	vd->vdev_rebuild_thread = NULL;
	mutex_exit(&vd->vdev_rebuild_lock);
	spa_config_exit(spa, SCL_CONFIG, FTAG);

	cv_broadcast(&vd->vdev_rebuild_cv);

	thread_exit();
}
953 | ||
954 | /* | |
955 | * Returns B_TRUE if any top-level vdev are rebuilding. | |
956 | */ | |
957 | boolean_t | |
958 | vdev_rebuild_active(vdev_t *vd) | |
959 | { | |
960 | spa_t *spa = vd->vdev_spa; | |
961 | boolean_t ret = B_FALSE; | |
962 | ||
963 | if (vd == spa->spa_root_vdev) { | |
964 | for (uint64_t i = 0; i < vd->vdev_children; i++) { | |
965 | ret = vdev_rebuild_active(vd->vdev_child[i]); | |
966 | if (ret) | |
967 | return (ret); | |
968 | } | |
969 | } else if (vd->vdev_top_zap != 0) { | |
970 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
971 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
972 | ||
973 | mutex_enter(&vd->vdev_rebuild_lock); | |
974 | ret = (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); | |
975 | mutex_exit(&vd->vdev_rebuild_lock); | |
976 | } | |
977 | ||
978 | return (ret); | |
979 | } | |
980 | ||
981 | /* | |
982 | * Start a rebuild operation. The rebuild may be restarted when the | |
983 | * top-level vdev is currently actively rebuilding. | |
984 | */ | |
985 | void | |
986 | vdev_rebuild(vdev_t *vd) | |
987 | { | |
988 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
989 | vdev_rebuild_phys_t *vrp __maybe_unused = &vr->vr_rebuild_phys; | |
990 | ||
991 | ASSERT(vd->vdev_top == vd); | |
992 | ASSERT(vdev_is_concrete(vd)); | |
993 | ASSERT(!vd->vdev_removing); | |
994 | ASSERT(spa_feature_is_enabled(vd->vdev_spa, | |
995 | SPA_FEATURE_DEVICE_REBUILD)); | |
996 | ||
997 | mutex_enter(&vd->vdev_rebuild_lock); | |
998 | if (vd->vdev_rebuilding) { | |
999 | ASSERT3U(vrp->vrp_rebuild_state, ==, VDEV_REBUILD_ACTIVE); | |
1000 | ||
1001 | /* | |
1002 | * Signal a running rebuild operation that it should restart | |
1003 | * from the beginning because a new device was attached. The | |
1004 | * vdev_rebuild_reset_wanted flag is set until the sync task | |
1005 | * completes. This may be after the rebuild thread exits. | |
1006 | */ | |
1007 | if (!vd->vdev_rebuild_reset_wanted) | |
1008 | vd->vdev_rebuild_reset_wanted = B_TRUE; | |
1009 | } else { | |
1010 | vdev_rebuild_initiate(vd); | |
1011 | } | |
1012 | mutex_exit(&vd->vdev_rebuild_lock); | |
1013 | } | |
1014 | ||
1015 | static void | |
1016 | vdev_rebuild_restart_impl(vdev_t *vd) | |
1017 | { | |
1018 | spa_t *spa = vd->vdev_spa; | |
1019 | ||
1020 | if (vd == spa->spa_root_vdev) { | |
1021 | for (uint64_t i = 0; i < vd->vdev_children; i++) | |
1022 | vdev_rebuild_restart_impl(vd->vdev_child[i]); | |
1023 | ||
1024 | } else if (vd->vdev_top_zap != 0) { | |
1025 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
1026 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
1027 | ||
1028 | mutex_enter(&vd->vdev_rebuild_lock); | |
1029 | if (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE && | |
1030 | vdev_writeable(vd) && !vd->vdev_rebuilding) { | |
1031 | ASSERT(spa_feature_is_active(spa, | |
1032 | SPA_FEATURE_DEVICE_REBUILD)); | |
1033 | vd->vdev_rebuilding = B_TRUE; | |
1034 | vd->vdev_rebuild_thread = thread_create(NULL, 0, | |
1035 | vdev_rebuild_thread, vd, 0, &p0, TS_RUN, | |
1036 | maxclsyspri); | |
1037 | } | |
1038 | mutex_exit(&vd->vdev_rebuild_lock); | |
1039 | } | |
1040 | } | |
1041 | ||
/*
 * Conditionally restart all of the vdev_rebuild_thread's for a pool.  The
 * feature flag must be active and the rebuild in the active state.   This
 * cannot be used to start a new rebuild.
 */
void
vdev_rebuild_restart(spa_t *spa)
{
	/* Caller must serialize pool-wide operations. */
	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	vdev_rebuild_restart_impl(spa->spa_root_vdev);
}
1054 | ||
/*
 * Stop and wait for all of the vdev_rebuild_thread's associated with the
 * vdev tree provided to be terminated (canceled or stopped).
 */
void
vdev_rebuild_stop_wait(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	ASSERT(MUTEX_HELD(&spa_namespace_lock));

	if (vd == spa->spa_root_vdev) {
		/* Recurse into every top-level vdev. */
		for (uint64_t i = 0; i < vd->vdev_children; i++)
			vdev_rebuild_stop_wait(vd->vdev_child[i]);

	} else if (vd->vdev_top_zap != 0) {
		ASSERT(vd == vd->vdev_top);

		mutex_enter(&vd->vdev_rebuild_lock);
		if (vd->vdev_rebuild_thread != NULL) {
			/*
			 * Ask the thread to exit and block until it clears
			 * vd->vdev_rebuilding; the thread signals
			 * vdev_rebuild_cv when it is done.  The exit flag
			 * is cleared afterwards so a later resume works.
			 */
			vd->vdev_rebuild_exit_wanted = B_TRUE;
			while (vd->vdev_rebuilding) {
				cv_wait(&vd->vdev_rebuild_cv,
				    &vd->vdev_rebuild_lock);
			}
			vd->vdev_rebuild_exit_wanted = B_FALSE;
		}
		mutex_exit(&vd->vdev_rebuild_lock);
	}
}
1085 | ||
/*
 * Stop all rebuild operations but leave them in the active state so they
 * will be resumed when importing the pool.
 */
void
vdev_rebuild_stop_all(spa_t *spa)
{
	/* Walks the whole tree from the root vdev. */
	vdev_rebuild_stop_wait(spa->spa_root_vdev);
}
1095 | ||
1096 | /* | |
1097 | * Rebuild statistics reported per top-level vdev. | |
1098 | */ | |
1099 | int | |
1100 | vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs) | |
1101 | { | |
1102 | spa_t *spa = tvd->vdev_spa; | |
1103 | ||
1104 | if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) | |
1105 | return (SET_ERROR(ENOTSUP)); | |
1106 | ||
1107 | if (tvd != tvd->vdev_top || tvd->vdev_top_zap == 0) | |
1108 | return (SET_ERROR(EINVAL)); | |
1109 | ||
1110 | int error = zap_contains(spa_meta_objset(spa), | |
1111 | tvd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS); | |
1112 | ||
1113 | if (error == ENOENT) { | |
861166b0 | 1114 | memset(vrs, 0, sizeof (vdev_rebuild_stat_t)); |
9a49d3f3 BB |
1115 | vrs->vrs_state = VDEV_REBUILD_NONE; |
1116 | error = 0; | |
1117 | } else if (error == 0) { | |
1118 | vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; | |
1119 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
1120 | ||
1121 | mutex_enter(&tvd->vdev_rebuild_lock); | |
1122 | vrs->vrs_state = vrp->vrp_rebuild_state; | |
1123 | vrs->vrs_start_time = vrp->vrp_start_time; | |
1124 | vrs->vrs_end_time = vrp->vrp_end_time; | |
1125 | vrs->vrs_scan_time_ms = vrp->vrp_scan_time_ms; | |
1126 | vrs->vrs_bytes_scanned = vrp->vrp_bytes_scanned; | |
1127 | vrs->vrs_bytes_issued = vrp->vrp_bytes_issued; | |
1128 | vrs->vrs_bytes_rebuilt = vrp->vrp_bytes_rebuilt; | |
1129 | vrs->vrs_bytes_est = vrp->vrp_bytes_est; | |
1130 | vrs->vrs_errors = vrp->vrp_errors; | |
1131 | vrs->vrs_pass_time_ms = NSEC2MSEC(gethrtime() - | |
1132 | vr->vr_pass_start_time); | |
1133 | vrs->vrs_pass_bytes_scanned = vr->vr_pass_bytes_scanned; | |
1134 | vrs->vrs_pass_bytes_issued = vr->vr_pass_bytes_issued; | |
1135 | mutex_exit(&tvd->vdev_rebuild_lock); | |
1136 | } | |
1137 | ||
1138 | return (error); | |
1139 | } | |
1140 | ||
/* Module tunables exposed under /sys/module/zfs/parameters. */
ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, ULONG, ZMOD_RW,
	"Max segment size in bytes of rebuild reads");

ZFS_MODULE_PARAM(zfs, zfs_, rebuild_vdev_limit, ULONG, ZMOD_RW,
	"Max bytes in flight per leaf vdev for sequential resilvers");

ZFS_MODULE_PARAM(zfs, zfs_, rebuild_scrub_enabled, INT, ZMOD_RW,
	"Automatically scrub after sequential resilver completes");