]>
Commit | Line | Data |
---|---|---|
9a49d3f3 BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
1d3ba0bf | 9 | * or https://opensource.org/licenses/CDDL-1.0. |
9a49d3f3 BB |
10 | * See the License for the specific language governing permissions |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
22 | * | |
23 | * Copyright (c) 2018, Intel Corporation. | |
24 | * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. | |
2be0a124 | 25 | * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. |
9a49d3f3 BB |
26 | */ |
27 | ||
28 | #include <sys/vdev_impl.h> | |
b2255edc | 29 | #include <sys/vdev_draid.h> |
9a49d3f3 BB |
30 | #include <sys/dsl_scan.h> |
31 | #include <sys/spa_impl.h> | |
32 | #include <sys/metaslab_impl.h> | |
33 | #include <sys/vdev_rebuild.h> | |
34 | #include <sys/zio.h> | |
35 | #include <sys/dmu_tx.h> | |
36 | #include <sys/arc.h> | |
973934b9 | 37 | #include <sys/arc_impl.h> |
9a49d3f3 BB |
38 | #include <sys/zap.h> |
39 | ||
40 | /* | |
41 | * This file contains the sequential reconstruction implementation for | |
42 | * resilvering. This form of resilvering is internally referred to as device | |
43 | * rebuild to avoid conflating it with the traditional healing reconstruction | |
44 | * performed by the dsl scan code. | |
45 | * | |
46 | * When replacing a device, or scrubbing the pool, ZFS has historically used | |
47 | * a process called resilvering which is a form of healing reconstruction. | |
48 | * This approach has the advantage that as blocks are read from disk their | |
49 | * checksums can be immediately verified and the data repaired. Unfortunately, | |
50 | * it also results in a random IO pattern to the disk even when extra care | |
51 | * is taken to sequentialize the IO as much as possible. This substantially | |
52 | * increases the time required to resilver the pool and restore redundancy. | |
53 | * | |
54 | * For mirrored devices it's possible to implement an alternate sequential | |
55 | * reconstruction strategy when resilvering. Sequential reconstruction | |
56 | * behaves like a traditional RAID rebuild and reconstructs a device in LBA | |
57 | * order without verifying the checksum. After this phase completes a second | |
58 | * scrub phase is started to verify all of the checksums. This two phase | |
59 | * process will take longer than the healing reconstruction described above. | |
60 | * However, it has the advantage that after the first reconstruction phase | |
61 | * completes redundancy has been restored. At this point the pool can incur | |
62 | * another device failure without risking data loss. | |
63 | * | |
64 | * There are a few noteworthy limitations and other advantages of resilvering | |
65 | * using sequential reconstruction vs healing reconstruction. | |
66 | * | |
67 | * Limitations: | |
68 | * | |
b2255edc BB |
69 | * - Sequential reconstruction is not possible on RAIDZ due to its |
70 | * variable stripe width. Note dRAID uses a fixed stripe width which | |
71 | * avoids this issue, but comes at the expense of some usable capacity. | |
9a49d3f3 | 72 | * |
b2255edc | 73 | * - Block checksums are not verified during sequential reconstruction. |
9a49d3f3 BB |
74 | * Similar to traditional RAID the parity/mirror data is reconstructed |
75 | * but cannot be immediately double checked. For this reason when the | |
b2255edc BB |
76 | * last active resilver completes the pool is automatically scrubbed |
77 | * by default. | |
9a49d3f3 BB |
78 | * |
79 | * - Deferred resilvers using sequential reconstruction are not currently | |
80 | * supported. When adding another vdev to an active top-level resilver | |
81 | * it must be restarted. | |
82 | * | |
83 | * Advantages: | |
84 | * | |
b2255edc | 85 | * - Sequential reconstruction is performed in LBA order which may be faster |
bf169e9f | 86 | * than healing reconstruction particularly when using HDDs (or |
9a49d3f3 BB |
87 | * especially with SMR devices). Only allocated capacity is resilvered. |
88 | * | |
89 | * - Sequential reconstruction is not constrained by ZFS block boundaries. | |
90 | * This allows it to issue larger IOs to disk which span multiple blocks | |
91 | * allowing all of these logical blocks to be repaired with a single IO. | |
92 | * | |
93 | * - Unlike a healing resilver or scrub which are pool wide operations, | |
b2255edc BB |
94 | * sequential reconstruction is handled by the top-level vdevs. This |
95 | * allows for it to be started or canceled on a top-level vdev without | |
96 | * impacting any other top-level vdevs in the pool. | |
9a49d3f3 BB |
97 | * |
98 | * - Data only referenced by a pool checkpoint will be repaired because | |
99 | * that space is reflected in the space maps. This differs for a | |
100 | * healing resilver or scrub which will not repair that data. | |
101 | */ | |
102 | ||
103 | ||
104 | /* | |
b2255edc BB |
105 | * Size of rebuild reads; defaults to 1MiB per data disk and is capped at |
106 | * SPA_MAXBLOCKSIZE. | |
9a49d3f3 | 107 | */ |
ab8d9c17 | 108 | static uint64_t zfs_rebuild_max_segment = 1024 * 1024; |
9a49d3f3 BB |
109 | |
110 | /* | |
b2255edc BB |
111 | * Maximum number of parallelly executed bytes per leaf vdev caused by a |
112 | * sequential resilver. We attempt to strike a balance here between keeping | |
113 | * the vdev queues full of I/Os at all times and not overflowing the queues | |
114 | * to cause long latency, which would cause long txg sync times. | |
115 | * | |
116 | * A large default value can be safely used here because the default target | |
117 | * segment size is also large (zfs_rebuild_max_segment=1M). This helps keep | |
118 | * the queue depth short. | |
119 | * | |
973934b9 BB |
120 | * 64MB was observed to deliver the best performance and set as the default. |
121 | * Testing was performed with a 106-drive dRAID HDD pool (draid2:11d:106c) | |
122 | * and a rebuild rate of 1.2GB/s was measured to the distribute spare. | |
123 | * Smaller values were unable to fully saturate the available pool I/O. | |
9a49d3f3 | 124 | */ |
973934b9 | 125 | static uint64_t zfs_rebuild_vdev_limit = 64 << 20; |
b2255edc BB |
126 | |
127 | /* | |
128 | * Automatically start a pool scrub when the last active sequential resilver | |
129 | * completes in order to verify the checksums of all blocks which have been | |
130 | * resilvered. This option is enabled by default and is strongly recommended. | |
131 | */ | |
18168da7 | 132 | static int zfs_rebuild_scrub_enabled = 1; |
9a49d3f3 BB |
133 | |
134 | /* | |
135 | * For vdev_rebuild_initiate_sync() and vdev_rebuild_reset_sync(). | |
136 | */ | |
460748d4 | 137 | static __attribute__((noreturn)) void vdev_rebuild_thread(void *arg); |
2be0a124 | 138 | static void vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx); |
9a49d3f3 BB |
139 | |
140 | /* | |
141 | * Clear the per-vdev rebuild bytes value for a vdev tree. | |
142 | */ | |
143 | static void | |
144 | clear_rebuild_bytes(vdev_t *vd) | |
145 | { | |
146 | vdev_stat_t *vs = &vd->vdev_stat; | |
147 | ||
148 | for (uint64_t i = 0; i < vd->vdev_children; i++) | |
149 | clear_rebuild_bytes(vd->vdev_child[i]); | |
150 | ||
151 | mutex_enter(&vd->vdev_stat_lock); | |
152 | vs->vs_rebuild_processed = 0; | |
153 | mutex_exit(&vd->vdev_stat_lock); | |
154 | } | |
155 | ||
156 | /* | |
157 | * Determines whether a vdev_rebuild_thread() should be stopped. | |
158 | */ | |
159 | static boolean_t | |
160 | vdev_rebuild_should_stop(vdev_t *vd) | |
161 | { | |
162 | return (!vdev_writeable(vd) || vd->vdev_removing || | |
163 | vd->vdev_rebuild_exit_wanted || | |
164 | vd->vdev_rebuild_cancel_wanted || | |
165 | vd->vdev_rebuild_reset_wanted); | |
166 | } | |
167 | ||
168 | /* | |
169 | * Determine if the rebuild should be canceled. This may happen when all | |
170 | * vdevs with MISSING DTLs are detached. | |
171 | */ | |
172 | static boolean_t | |
173 | vdev_rebuild_should_cancel(vdev_t *vd) | |
174 | { | |
175 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
176 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
177 | ||
178 | if (!vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)) | |
179 | return (B_TRUE); | |
180 | ||
181 | return (B_FALSE); | |
182 | } | |
183 | ||
184 | /* | |
185 | * The sync task for updating the on-disk state of a rebuild. This is | |
186 | * scheduled by vdev_rebuild_range(). | |
187 | */ | |
188 | static void | |
189 | vdev_rebuild_update_sync(void *arg, dmu_tx_t *tx) | |
190 | { | |
191 | int vdev_id = (uintptr_t)arg; | |
192 | spa_t *spa = dmu_tx_pool(tx)->dp_spa; | |
193 | vdev_t *vd = vdev_lookup_top(spa, vdev_id); | |
194 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
195 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
196 | uint64_t txg = dmu_tx_get_txg(tx); | |
197 | ||
198 | mutex_enter(&vd->vdev_rebuild_lock); | |
199 | ||
200 | if (vr->vr_scan_offset[txg & TXG_MASK] > 0) { | |
201 | vrp->vrp_last_offset = vr->vr_scan_offset[txg & TXG_MASK]; | |
202 | vr->vr_scan_offset[txg & TXG_MASK] = 0; | |
203 | } | |
204 | ||
205 | vrp->vrp_scan_time_ms = vr->vr_prev_scan_time_ms + | |
206 | NSEC2MSEC(gethrtime() - vr->vr_pass_start_time); | |
207 | ||
208 | VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, | |
209 | VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), | |
210 | REBUILD_PHYS_ENTRIES, vrp, tx)); | |
211 | ||
212 | mutex_exit(&vd->vdev_rebuild_lock); | |
213 | } | |
214 | ||
215 | /* | |
216 | * Initialize the on-disk state for a new rebuild, start the rebuild thread. | |
217 | */ | |
218 | static void | |
219 | vdev_rebuild_initiate_sync(void *arg, dmu_tx_t *tx) | |
220 | { | |
221 | int vdev_id = (uintptr_t)arg; | |
222 | spa_t *spa = dmu_tx_pool(tx)->dp_spa; | |
223 | vdev_t *vd = vdev_lookup_top(spa, vdev_id); | |
224 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
225 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
226 | ||
227 | ASSERT(vd->vdev_rebuilding); | |
228 | ||
229 | spa_feature_incr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); | |
230 | ||
231 | mutex_enter(&vd->vdev_rebuild_lock); | |
861166b0 | 232 | memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); |
9a49d3f3 BB |
233 | vrp->vrp_rebuild_state = VDEV_REBUILD_ACTIVE; |
234 | vrp->vrp_min_txg = 0; | |
235 | vrp->vrp_max_txg = dmu_tx_get_txg(tx); | |
236 | vrp->vrp_start_time = gethrestime_sec(); | |
237 | vrp->vrp_scan_time_ms = 0; | |
238 | vr->vr_prev_scan_time_ms = 0; | |
239 | ||
240 | /* | |
241 | * Rebuilds are currently only used when replacing a device, in which | |
242 | * case there must be DTL_MISSING entries. In the future, we could | |
243 | * allow rebuilds to be used in a way similar to a scrub. This would | |
244 | * be useful because it would allow us to rebuild the space used by | |
245 | * pool checkpoints. | |
246 | */ | |
247 | VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)); | |
248 | ||
249 | VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, | |
250 | VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), | |
251 | REBUILD_PHYS_ENTRIES, vrp, tx)); | |
252 | ||
253 | spa_history_log_internal(spa, "rebuild", tx, | |
254 | "vdev_id=%llu vdev_guid=%llu started", | |
255 | (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); | |
256 | ||
257 | ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); | |
258 | vd->vdev_rebuild_thread = thread_create(NULL, 0, | |
259 | vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri); | |
260 | ||
261 | mutex_exit(&vd->vdev_rebuild_lock); | |
262 | } | |
263 | ||
264 | static void | |
a926aab9 | 265 | vdev_rebuild_log_notify(spa_t *spa, vdev_t *vd, const char *name) |
9a49d3f3 BB |
266 | { |
267 | nvlist_t *aux = fnvlist_alloc(); | |
268 | ||
269 | fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, "sequential"); | |
270 | spa_event_notify(spa, vd, aux, name); | |
271 | nvlist_free(aux); | |
272 | } | |
273 | ||
274 | /* | |
275 | * Called to request that a new rebuild be started. The feature will remain | |
276 | * active for the duration of the rebuild, then revert to the enabled state. | |
277 | */ | |
278 | static void | |
279 | vdev_rebuild_initiate(vdev_t *vd) | |
280 | { | |
281 | spa_t *spa = vd->vdev_spa; | |
282 | ||
283 | ASSERT(vd->vdev_top == vd); | |
284 | ASSERT(MUTEX_HELD(&vd->vdev_rebuild_lock)); | |
285 | ASSERT(!vd->vdev_rebuilding); | |
286 | ||
287 | dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); | |
288 | VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); | |
289 | ||
290 | vd->vdev_rebuilding = B_TRUE; | |
291 | ||
292 | dsl_sync_task_nowait(spa_get_dsl(spa), vdev_rebuild_initiate_sync, | |
38080324 | 293 | (void *)(uintptr_t)vd->vdev_id, tx); |
9a49d3f3 BB |
294 | dmu_tx_commit(tx); |
295 | ||
296 | vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_START); | |
297 | } | |
298 | ||
299 | /* | |
300 | * Update the on-disk state to completed when a rebuild finishes. | |
301 | */ | |
302 | static void | |
303 | vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx) | |
304 | { | |
305 | int vdev_id = (uintptr_t)arg; | |
306 | spa_t *spa = dmu_tx_pool(tx)->dp_spa; | |
307 | vdev_t *vd = vdev_lookup_top(spa, vdev_id); | |
308 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
309 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
310 | ||
311 | mutex_enter(&vd->vdev_rebuild_lock); | |
2be0a124 SW |
312 | |
313 | /* | |
314 | * Handle a second device failure if it occurs after all rebuild I/O | |
315 | * has completed but before this sync task has been executed. | |
316 | */ | |
317 | if (vd->vdev_rebuild_reset_wanted) { | |
318 | mutex_exit(&vd->vdev_rebuild_lock); | |
319 | vdev_rebuild_reset_sync(arg, tx); | |
320 | return; | |
321 | } | |
322 | ||
9a49d3f3 BB |
323 | vrp->vrp_rebuild_state = VDEV_REBUILD_COMPLETE; |
324 | vrp->vrp_end_time = gethrestime_sec(); | |
325 | ||
326 | VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, | |
327 | VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), | |
328 | REBUILD_PHYS_ENTRIES, vrp, tx)); | |
329 | ||
b2255edc | 330 | vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE); |
9a49d3f3 BB |
331 | spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); |
332 | ||
333 | spa_history_log_internal(spa, "rebuild", tx, | |
334 | "vdev_id=%llu vdev_guid=%llu complete", | |
335 | (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); | |
336 | vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH); | |
337 | ||
338 | /* Handles detaching of spares */ | |
339 | spa_async_request(spa, SPA_ASYNC_REBUILD_DONE); | |
340 | vd->vdev_rebuilding = B_FALSE; | |
341 | mutex_exit(&vd->vdev_rebuild_lock); | |
342 | ||
b2255edc BB |
343 | /* |
344 | * While we're in syncing context take the opportunity to | |
345 | * setup the scrub when there are no more active rebuilds. | |
346 | */ | |
600a1dc5 BB |
347 | pool_scan_func_t func = POOL_SCAN_SCRUB; |
348 | if (dsl_scan_setup_check(&func, tx) == 0 && | |
b2255edc | 349 | zfs_rebuild_scrub_enabled) { |
b2255edc BB |
350 | dsl_scan_setup_sync(&func, tx); |
351 | } | |
352 | ||
9a49d3f3 | 353 | cv_broadcast(&vd->vdev_rebuild_cv); |
03e02e5b DB |
354 | |
355 | /* Clear recent error events (i.e. duplicate events tracking) */ | |
356 | zfs_ereport_clear(spa, NULL); | |
9a49d3f3 BB |
357 | } |
358 | ||
359 | /* | |
360 | * Update the on-disk state to canceled when a rebuild finishes. | |
361 | */ | |
362 | static void | |
363 | vdev_rebuild_cancel_sync(void *arg, dmu_tx_t *tx) | |
364 | { | |
365 | int vdev_id = (uintptr_t)arg; | |
366 | spa_t *spa = dmu_tx_pool(tx)->dp_spa; | |
367 | vdev_t *vd = vdev_lookup_top(spa, vdev_id); | |
368 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
369 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
370 | ||
371 | mutex_enter(&vd->vdev_rebuild_lock); | |
372 | vrp->vrp_rebuild_state = VDEV_REBUILD_CANCELED; | |
373 | vrp->vrp_end_time = gethrestime_sec(); | |
374 | ||
375 | VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, | |
376 | VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), | |
377 | REBUILD_PHYS_ENTRIES, vrp, tx)); | |
378 | ||
379 | spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); | |
380 | ||
381 | spa_history_log_internal(spa, "rebuild", tx, | |
382 | "vdev_id=%llu vdev_guid=%llu canceled", | |
383 | (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); | |
384 | vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH); | |
385 | ||
386 | vd->vdev_rebuild_cancel_wanted = B_FALSE; | |
387 | vd->vdev_rebuilding = B_FALSE; | |
388 | mutex_exit(&vd->vdev_rebuild_lock); | |
389 | ||
390 | spa_notify_waiters(spa); | |
391 | cv_broadcast(&vd->vdev_rebuild_cv); | |
392 | } | |
393 | ||
394 | /* | |
395 | * Resets the progress of a running rebuild. This will occur when a new | |
396 | * vdev is added to rebuild. | |
397 | */ | |
398 | static void | |
399 | vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx) | |
400 | { | |
401 | int vdev_id = (uintptr_t)arg; | |
402 | spa_t *spa = dmu_tx_pool(tx)->dp_spa; | |
403 | vdev_t *vd = vdev_lookup_top(spa, vdev_id); | |
404 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
405 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
406 | ||
407 | mutex_enter(&vd->vdev_rebuild_lock); | |
408 | ||
409 | ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); | |
410 | ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); | |
411 | ||
412 | vrp->vrp_last_offset = 0; | |
413 | vrp->vrp_min_txg = 0; | |
414 | vrp->vrp_max_txg = dmu_tx_get_txg(tx); | |
415 | vrp->vrp_bytes_scanned = 0; | |
416 | vrp->vrp_bytes_issued = 0; | |
417 | vrp->vrp_bytes_rebuilt = 0; | |
418 | vrp->vrp_bytes_est = 0; | |
419 | vrp->vrp_scan_time_ms = 0; | |
420 | vr->vr_prev_scan_time_ms = 0; | |
421 | ||
422 | /* See vdev_rebuild_initiate_sync comment */ | |
423 | VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)); | |
424 | ||
425 | VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, | |
426 | VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), | |
427 | REBUILD_PHYS_ENTRIES, vrp, tx)); | |
428 | ||
429 | spa_history_log_internal(spa, "rebuild", tx, | |
430 | "vdev_id=%llu vdev_guid=%llu reset", | |
431 | (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); | |
432 | ||
433 | vd->vdev_rebuild_reset_wanted = B_FALSE; | |
434 | ASSERT(vd->vdev_rebuilding); | |
435 | ||
436 | vd->vdev_rebuild_thread = thread_create(NULL, 0, | |
437 | vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri); | |
438 | ||
439 | mutex_exit(&vd->vdev_rebuild_lock); | |
440 | } | |
441 | ||
442 | /* | |
443 | * Clear the last rebuild status. | |
444 | */ | |
445 | void | |
446 | vdev_rebuild_clear_sync(void *arg, dmu_tx_t *tx) | |
447 | { | |
448 | int vdev_id = (uintptr_t)arg; | |
449 | spa_t *spa = dmu_tx_pool(tx)->dp_spa; | |
450 | vdev_t *vd = vdev_lookup_top(spa, vdev_id); | |
451 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
452 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
453 | objset_t *mos = spa_meta_objset(spa); | |
454 | ||
455 | mutex_enter(&vd->vdev_rebuild_lock); | |
456 | ||
457 | if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD) || | |
458 | vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE) { | |
459 | mutex_exit(&vd->vdev_rebuild_lock); | |
460 | return; | |
461 | } | |
462 | ||
463 | clear_rebuild_bytes(vd); | |
861166b0 | 464 | memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); |
9a49d3f3 BB |
465 | |
466 | if (vd->vdev_top_zap != 0 && zap_contains(mos, vd->vdev_top_zap, | |
467 | VDEV_TOP_ZAP_VDEV_REBUILD_PHYS) == 0) { | |
468 | VERIFY0(zap_update(mos, vd->vdev_top_zap, | |
469 | VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), | |
470 | REBUILD_PHYS_ENTRIES, vrp, tx)); | |
471 | } | |
472 | ||
473 | mutex_exit(&vd->vdev_rebuild_lock); | |
474 | } | |
475 | ||
476 | /* | |
477 | * The zio_done_func_t callback for each rebuild I/O issued. It's responsible | |
478 | * for updating the rebuild stats and limiting the number of in flight I/Os. | |
479 | */ | |
480 | static void | |
481 | vdev_rebuild_cb(zio_t *zio) | |
482 | { | |
483 | vdev_rebuild_t *vr = zio->io_private; | |
484 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
485 | vdev_t *vd = vr->vr_top_vdev; | |
486 | ||
b2255edc | 487 | mutex_enter(&vr->vr_io_lock); |
9a49d3f3 BB |
488 | if (zio->io_error == ENXIO && !vdev_writeable(vd)) { |
489 | /* | |
490 | * The I/O failed because the top-level vdev was unavailable. | |
491 | * Attempt to roll back to the last completed offset, in order | |
492 | * resume from the correct location if the pool is resumed. | |
493 | * (This works because spa_sync waits on spa_txg_zio before | |
494 | * it runs sync tasks.) | |
495 | */ | |
496 | uint64_t *off = &vr->vr_scan_offset[zio->io_txg & TXG_MASK]; | |
497 | *off = MIN(*off, zio->io_offset); | |
498 | } else if (zio->io_error) { | |
499 | vrp->vrp_errors++; | |
500 | } | |
501 | ||
502 | abd_free(zio->io_abd); | |
503 | ||
b2255edc BB |
504 | ASSERT3U(vr->vr_bytes_inflight, >, 0); |
505 | vr->vr_bytes_inflight -= zio->io_size; | |
506 | cv_broadcast(&vr->vr_io_cv); | |
507 | mutex_exit(&vr->vr_io_lock); | |
9a49d3f3 BB |
508 | |
509 | spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); | |
510 | } | |
511 | ||
512 | /* | |
b2255edc BB |
513 | * Initialize a block pointer that can be used to read the given segment |
514 | * for sequential rebuild. | |
9a49d3f3 BB |
515 | */ |
516 | static void | |
b2255edc BB |
517 | vdev_rebuild_blkptr_init(blkptr_t *bp, vdev_t *vd, uint64_t start, |
518 | uint64_t asize) | |
9a49d3f3 | 519 | { |
b2255edc BB |
520 | ASSERT(vd->vdev_ops == &vdev_draid_ops || |
521 | vd->vdev_ops == &vdev_mirror_ops || | |
9a49d3f3 BB |
522 | vd->vdev_ops == &vdev_replacing_ops || |
523 | vd->vdev_ops == &vdev_spare_ops); | |
524 | ||
b2255edc BB |
525 | uint64_t psize = vd->vdev_ops == &vdev_draid_ops ? |
526 | vdev_draid_asize_to_psize(vd, asize) : asize; | |
527 | ||
9a49d3f3 BB |
528 | BP_ZERO(bp); |
529 | ||
530 | DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); | |
531 | DVA_SET_OFFSET(&bp->blk_dva[0], start); | |
532 | DVA_SET_GANG(&bp->blk_dva[0], 0); | |
533 | DVA_SET_ASIZE(&bp->blk_dva[0], asize); | |
534 | ||
535 | BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); | |
536 | BP_SET_LSIZE(bp, psize); | |
537 | BP_SET_PSIZE(bp, psize); | |
538 | BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); | |
539 | BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); | |
540 | BP_SET_TYPE(bp, DMU_OT_NONE); | |
541 | BP_SET_LEVEL(bp, 0); | |
542 | BP_SET_DEDUP(bp, 0); | |
543 | BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); | |
9a49d3f3 BB |
544 | } |
545 | ||
546 | /* | |
547 | * Issues a rebuild I/O and takes care of rate limiting the number of queued | |
548 | * rebuild I/Os. The provided start and size must be properly aligned for the | |
549 | * top-level vdev type being rebuilt. | |
550 | */ | |
551 | static int | |
552 | vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) | |
553 | { | |
554 | uint64_t ms_id __maybe_unused = vr->vr_scan_msp->ms_id; | |
555 | vdev_t *vd = vr->vr_top_vdev; | |
556 | spa_t *spa = vd->vdev_spa; | |
b2255edc | 557 | blkptr_t blk; |
9a49d3f3 BB |
558 | |
559 | ASSERT3U(ms_id, ==, start >> vd->vdev_ms_shift); | |
560 | ASSERT3U(ms_id, ==, (start + size - 1) >> vd->vdev_ms_shift); | |
561 | ||
562 | vr->vr_pass_bytes_scanned += size; | |
563 | vr->vr_rebuild_phys.vrp_bytes_scanned += size; | |
564 | ||
b2255edc BB |
565 | /* |
566 | * Rebuild the data in this range by constructing a special block | |
567 | * pointer. It has no relation to any existing blocks in the pool. | |
568 | * However, by disabling checksum verification and issuing a scrub IO | |
569 | * we can reconstruct and repair any children with missing data. | |
570 | */ | |
571 | vdev_rebuild_blkptr_init(&blk, vd, start, size); | |
572 | uint64_t psize = BP_GET_PSIZE(&blk); | |
573 | ||
fa7b2390 AM |
574 | if (!vdev_dtl_need_resilver(vd, &blk.blk_dva[0], psize, TXG_UNKNOWN)) { |
575 | vr->vr_pass_bytes_skipped += size; | |
b2255edc | 576 | return (0); |
fa7b2390 | 577 | } |
b2255edc BB |
578 | |
579 | mutex_enter(&vr->vr_io_lock); | |
9a49d3f3 BB |
580 | |
581 | /* Limit in flight rebuild I/Os */ | |
b2255edc BB |
582 | while (vr->vr_bytes_inflight >= vr->vr_bytes_inflight_max) |
583 | cv_wait(&vr->vr_io_cv, &vr->vr_io_lock); | |
9a49d3f3 | 584 | |
b2255edc BB |
585 | vr->vr_bytes_inflight += psize; |
586 | mutex_exit(&vr->vr_io_lock); | |
9a49d3f3 BB |
587 | |
588 | dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); | |
589 | VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); | |
590 | uint64_t txg = dmu_tx_get_txg(tx); | |
591 | ||
592 | spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); | |
593 | mutex_enter(&vd->vdev_rebuild_lock); | |
594 | ||
595 | /* This is the first I/O for this txg. */ | |
596 | if (vr->vr_scan_offset[txg & TXG_MASK] == 0) { | |
597 | vr->vr_scan_offset[txg & TXG_MASK] = start; | |
598 | dsl_sync_task_nowait(spa_get_dsl(spa), | |
599 | vdev_rebuild_update_sync, | |
38080324 | 600 | (void *)(uintptr_t)vd->vdev_id, tx); |
9a49d3f3 BB |
601 | } |
602 | ||
603 | /* When exiting write out our progress. */ | |
604 | if (vdev_rebuild_should_stop(vd)) { | |
b2255edc BB |
605 | mutex_enter(&vr->vr_io_lock); |
606 | vr->vr_bytes_inflight -= psize; | |
607 | mutex_exit(&vr->vr_io_lock); | |
9a49d3f3 BB |
608 | spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); |
609 | mutex_exit(&vd->vdev_rebuild_lock); | |
610 | dmu_tx_commit(tx); | |
611 | return (SET_ERROR(EINTR)); | |
612 | } | |
613 | mutex_exit(&vd->vdev_rebuild_lock); | |
b2255edc | 614 | dmu_tx_commit(tx); |
9a49d3f3 BB |
615 | |
616 | vr->vr_scan_offset[txg & TXG_MASK] = start + size; | |
b2255edc BB |
617 | vr->vr_pass_bytes_issued += size; |
618 | vr->vr_rebuild_phys.vrp_bytes_issued += size; | |
9a49d3f3 | 619 | |
b2255edc BB |
620 | zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, &blk, |
621 | abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr, | |
622 | ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL | | |
623 | ZIO_FLAG_RESILVER, NULL)); | |
9a49d3f3 BB |
624 | |
625 | return (0); | |
626 | } | |
627 | ||
9a49d3f3 BB |
628 | /* |
629 | * Issues rebuild I/Os for all ranges in the provided vr->vr_tree range tree. | |
630 | */ | |
631 | static int | |
632 | vdev_rebuild_ranges(vdev_rebuild_t *vr) | |
633 | { | |
634 | vdev_t *vd = vr->vr_top_vdev; | |
635 | zfs_btree_t *t = &vr->vr_scan_tree->rt_root; | |
636 | zfs_btree_index_t idx; | |
637 | int error; | |
638 | ||
639 | for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL; | |
640 | rs = zfs_btree_next(t, &idx, &idx)) { | |
641 | uint64_t start = rs_get_start(rs, vr->vr_scan_tree); | |
642 | uint64_t size = rs_get_end(rs, vr->vr_scan_tree) - start; | |
643 | ||
644 | /* | |
645 | * zfs_scan_suspend_progress can be set to disable rebuild | |
646 | * progress for testing. See comment in dsl_scan_sync(). | |
647 | */ | |
648 | while (zfs_scan_suspend_progress && | |
649 | !vdev_rebuild_should_stop(vd)) { | |
650 | delay(hz); | |
651 | } | |
652 | ||
653 | while (size > 0) { | |
654 | uint64_t chunk_size; | |
655 | ||
b2255edc BB |
656 | /* |
657 | * Split range into legally-sized logical chunks | |
658 | * given the constraints of the top-level vdev | |
659 | * being rebuilt (dRAID or mirror). | |
660 | */ | |
661 | ASSERT3P(vd->vdev_ops, !=, NULL); | |
662 | chunk_size = vd->vdev_ops->vdev_op_rebuild_asize(vd, | |
663 | start, size, zfs_rebuild_max_segment); | |
9a49d3f3 BB |
664 | |
665 | error = vdev_rebuild_range(vr, start, chunk_size); | |
666 | if (error != 0) | |
667 | return (error); | |
668 | ||
669 | size -= chunk_size; | |
670 | start += chunk_size; | |
671 | } | |
672 | } | |
673 | ||
674 | return (0); | |
675 | } | |
676 | ||
677 | /* | |
678 | * Calculates the estimated capacity which remains to be scanned. Since | |
679 | * we traverse the pool in metaslab order only allocated capacity beyond | |
680 | * the vrp_last_offset need be considered. All lower offsets must have | |
681 | * already been rebuilt and are thus already included in vrp_bytes_scanned. | |
682 | */ | |
683 | static void | |
684 | vdev_rebuild_update_bytes_est(vdev_t *vd, uint64_t ms_id) | |
685 | { | |
686 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
687 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
688 | uint64_t bytes_est = vrp->vrp_bytes_scanned; | |
689 | ||
690 | if (vrp->vrp_last_offset < vd->vdev_ms[ms_id]->ms_start) | |
691 | return; | |
692 | ||
693 | for (uint64_t i = ms_id; i < vd->vdev_ms_count; i++) { | |
694 | metaslab_t *msp = vd->vdev_ms[i]; | |
695 | ||
696 | mutex_enter(&msp->ms_lock); | |
697 | bytes_est += metaslab_allocated_space(msp); | |
698 | mutex_exit(&msp->ms_lock); | |
699 | } | |
700 | ||
701 | vrp->vrp_bytes_est = bytes_est; | |
702 | } | |
703 | ||
704 | /* | |
705 | * Load from disk the top-level vdev's rebuild information. | |
706 | */ | |
707 | int | |
708 | vdev_rebuild_load(vdev_t *vd) | |
709 | { | |
710 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
711 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
712 | spa_t *spa = vd->vdev_spa; | |
713 | int err = 0; | |
714 | ||
715 | mutex_enter(&vd->vdev_rebuild_lock); | |
716 | vd->vdev_rebuilding = B_FALSE; | |
717 | ||
718 | if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) { | |
861166b0 | 719 | memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); |
9a49d3f3 BB |
720 | mutex_exit(&vd->vdev_rebuild_lock); |
721 | return (SET_ERROR(ENOTSUP)); | |
722 | } | |
723 | ||
724 | ASSERT(vd->vdev_top == vd); | |
725 | ||
726 | err = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, | |
727 | VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), | |
728 | REBUILD_PHYS_ENTRIES, vrp); | |
729 | ||
730 | /* | |
731 | * A missing or damaged VDEV_TOP_ZAP_VDEV_REBUILD_PHYS should | |
732 | * not prevent a pool from being imported. Clear the rebuild | |
733 | * status allowing a new resilver/rebuild to be started. | |
734 | */ | |
735 | if (err == ENOENT || err == EOVERFLOW || err == ECKSUM) { | |
861166b0 | 736 | memset(vrp, 0, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); |
9a49d3f3 BB |
737 | } else if (err) { |
738 | mutex_exit(&vd->vdev_rebuild_lock); | |
739 | return (err); | |
740 | } | |
741 | ||
742 | vr->vr_prev_scan_time_ms = vrp->vrp_scan_time_ms; | |
743 | vr->vr_top_vdev = vd; | |
744 | ||
745 | mutex_exit(&vd->vdev_rebuild_lock); | |
746 | ||
747 | return (0); | |
748 | } | |
749 | ||
750 | /* | |
751 | * Each scan thread is responsible for rebuilding a top-level vdev. The | |
752 | * rebuild progress is tracked on-disk in VDEV_TOP_ZAP_VDEV_REBUILD_PHYS. | |
753 | */ | |
460748d4 | 754 | static __attribute__((noreturn)) void |
9a49d3f3 BB |
755 | vdev_rebuild_thread(void *arg) |
756 | { | |
757 | vdev_t *vd = arg; | |
758 | spa_t *spa = vd->vdev_spa; | |
973934b9 | 759 | vdev_t *rvd = spa->spa_root_vdev; |
9a49d3f3 BB |
760 | int error = 0; |
761 | ||
762 | /* | |
763 | * If there's a scrub in process request that it be stopped. This | |
764 | * is not required for a correct rebuild, but we do want rebuilds to | |
765 | * emulate the resilver behavior as much as possible. | |
766 | */ | |
767 | dsl_pool_t *dsl = spa_get_dsl(spa); | |
768 | if (dsl_scan_scrubbing(dsl)) | |
769 | dsl_scan_cancel(dsl); | |
770 | ||
771 | spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); | |
772 | mutex_enter(&vd->vdev_rebuild_lock); | |
773 | ||
774 | ASSERT3P(vd->vdev_top, ==, vd); | |
775 | ASSERT3P(vd->vdev_rebuild_thread, !=, NULL); | |
776 | ASSERT(vd->vdev_rebuilding); | |
777 | ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REBUILD)); | |
778 | ASSERT3B(vd->vdev_rebuild_cancel_wanted, ==, B_FALSE); | |
9a49d3f3 BB |
779 | |
780 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
781 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
782 | vr->vr_top_vdev = vd; | |
783 | vr->vr_scan_msp = NULL; | |
784 | vr->vr_scan_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); | |
b2255edc BB |
785 | mutex_init(&vr->vr_io_lock, NULL, MUTEX_DEFAULT, NULL); |
786 | cv_init(&vr->vr_io_cv, NULL, CV_DEFAULT, NULL); | |
787 | ||
9a49d3f3 BB |
788 | vr->vr_pass_start_time = gethrtime(); |
789 | vr->vr_pass_bytes_scanned = 0; | |
790 | vr->vr_pass_bytes_issued = 0; | |
fa7b2390 | 791 | vr->vr_pass_bytes_skipped = 0; |
9a49d3f3 BB |
792 | |
793 | uint64_t update_est_time = gethrtime(); | |
794 | vdev_rebuild_update_bytes_est(vd, 0); | |
795 | ||
796 | clear_rebuild_bytes(vr->vr_top_vdev); | |
797 | ||
798 | mutex_exit(&vd->vdev_rebuild_lock); | |
799 | ||
800 | /* | |
801 | * Systematically walk the metaslabs and issue rebuild I/Os for | |
802 | * all ranges in the allocated space map. | |
803 | */ | |
804 | for (uint64_t i = 0; i < vd->vdev_ms_count; i++) { | |
805 | metaslab_t *msp = vd->vdev_ms[i]; | |
806 | vr->vr_scan_msp = msp; | |
807 | ||
973934b9 BB |
808 | /* |
809 | * Calculate the max number of in-flight bytes for top-level | |
f9c39dc8 | 810 | * vdev scanning operations (minimum 1MB, maximum 1/2 of |
973934b9 BB |
811 | * arc_c_max shared by all top-level vdevs). Limits for the |
812 | * issuing phase are done per top-level vdev and are handled | |
813 | * separately. | |
814 | */ | |
f9c39dc8 | 815 | uint64_t limit = (arc_c_max / 2) / MAX(rvd->vdev_children, 1); |
973934b9 BB |
816 | vr->vr_bytes_inflight_max = MIN(limit, MAX(1ULL << 20, |
817 | zfs_rebuild_vdev_limit * vd->vdev_children)); | |
818 | ||
9a49d3f3 BB |
819 | /* |
820 | * Removal of vdevs from the vdev tree may eliminate the need | |
821 | * for the rebuild, in which case it should be canceled. The | |
822 | * vdev_rebuild_cancel_wanted flag is set until the sync task | |
823 | * completes. This may be after the rebuild thread exits. | |
824 | */ | |
825 | if (vdev_rebuild_should_cancel(vd)) { | |
826 | vd->vdev_rebuild_cancel_wanted = B_TRUE; | |
827 | error = EINTR; | |
828 | break; | |
829 | } | |
830 | ||
831 | ASSERT0(range_tree_space(vr->vr_scan_tree)); | |
832 | ||
b2255edc | 833 | /* Disable any new allocations to this metaslab */ |
9a49d3f3 | 834 | spa_config_exit(spa, SCL_CONFIG, FTAG); |
8e43fa12 | 835 | metaslab_disable(msp); |
9a49d3f3 BB |
836 | |
837 | mutex_enter(&msp->ms_sync_lock); | |
838 | mutex_enter(&msp->ms_lock); | |
839 | ||
b2255edc BB |
840 | /* |
841 | * If there are outstanding allocations wait for them to be | |
842 | * synced. This is needed to ensure all allocated ranges are | |
843 | * on disk and therefore will be rebuilt. | |
844 | */ | |
845 | for (int j = 0; j < TXG_SIZE; j++) { | |
846 | if (range_tree_space(msp->ms_allocating[j])) { | |
847 | mutex_exit(&msp->ms_lock); | |
848 | mutex_exit(&msp->ms_sync_lock); | |
849 | txg_wait_synced(dsl, 0); | |
850 | mutex_enter(&msp->ms_sync_lock); | |
851 | mutex_enter(&msp->ms_lock); | |
852 | break; | |
853 | } | |
854 | } | |
855 | ||
9a49d3f3 BB |
856 | /* |
857 | * When a metaslab has been allocated from read its allocated | |
b2255edc | 858 | * ranges from the space map object into the vr_scan_tree. |
9a49d3f3 BB |
859 | * Then add inflight / unflushed ranges and remove inflight / |
860 | * unflushed frees. This is the minimum range to be rebuilt. | |
861 | */ | |
862 | if (msp->ms_sm != NULL) { | |
863 | VERIFY0(space_map_load(msp->ms_sm, | |
864 | vr->vr_scan_tree, SM_ALLOC)); | |
865 | ||
866 | for (int i = 0; i < TXG_SIZE; i++) { | |
867 | ASSERT0(range_tree_space( | |
868 | msp->ms_allocating[i])); | |
869 | } | |
870 | ||
871 | range_tree_walk(msp->ms_unflushed_allocs, | |
872 | range_tree_add, vr->vr_scan_tree); | |
873 | range_tree_walk(msp->ms_unflushed_frees, | |
874 | range_tree_remove, vr->vr_scan_tree); | |
875 | ||
876 | /* | |
877 | * Remove ranges which have already been rebuilt based | |
878 | * on the last offset. This can happen when restarting | |
879 | * a scan after exporting and re-importing the pool. | |
880 | */ | |
881 | range_tree_clear(vr->vr_scan_tree, 0, | |
882 | vrp->vrp_last_offset); | |
883 | } | |
884 | ||
885 | mutex_exit(&msp->ms_lock); | |
886 | mutex_exit(&msp->ms_sync_lock); | |
887 | ||
888 | /* | |
889 | * To provide an accurate estimate re-calculate the estimated | |
890 | * size every 5 minutes to account for recent allocations and | |
b2255edc | 891 | * frees made to space maps which have not yet been rebuilt. |
9a49d3f3 BB |
892 | */ |
893 | if (gethrtime() > update_est_time + SEC2NSEC(300)) { | |
894 | update_est_time = gethrtime(); | |
895 | vdev_rebuild_update_bytes_est(vd, i); | |
896 | } | |
897 | ||
898 | /* | |
899 | * Walk the allocated space map and issue the rebuild I/O. | |
900 | */ | |
901 | error = vdev_rebuild_ranges(vr); | |
902 | range_tree_vacate(vr->vr_scan_tree, NULL, NULL); | |
903 | ||
904 | spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); | |
905 | metaslab_enable(msp, B_FALSE, B_FALSE); | |
906 | ||
907 | if (error != 0) | |
908 | break; | |
909 | } | |
910 | ||
911 | range_tree_destroy(vr->vr_scan_tree); | |
912 | spa_config_exit(spa, SCL_CONFIG, FTAG); | |
913 | ||
914 | /* Wait for any remaining rebuild I/O to complete */ | |
b2255edc BB |
915 | mutex_enter(&vr->vr_io_lock); |
916 | while (vr->vr_bytes_inflight > 0) | |
917 | cv_wait(&vr->vr_io_cv, &vr->vr_io_lock); | |
9a49d3f3 | 918 | |
b2255edc BB |
919 | mutex_exit(&vr->vr_io_lock); |
920 | ||
921 | mutex_destroy(&vr->vr_io_lock); | |
922 | cv_destroy(&vr->vr_io_cv); | |
9a49d3f3 BB |
923 | |
924 | spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); | |
925 | ||
926 | dsl_pool_t *dp = spa_get_dsl(spa); | |
927 | dmu_tx_t *tx = dmu_tx_create_dd(dp->dp_mos_dir); | |
928 | VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); | |
929 | ||
930 | mutex_enter(&vd->vdev_rebuild_lock); | |
931 | if (error == 0) { | |
932 | /* | |
933 | * After a successful rebuild clear the DTLs of all ranges | |
934 | * which were missing when the rebuild was started. These | |
935 | * ranges must have been rebuilt as a consequence of rebuilding | |
936 | * all allocated space. Note that unlike a scrub or resilver | |
937 | * the rebuild operation will reconstruct data only referenced | |
938 | * by a pool checkpoint. See the dsl_scan_done() comments. | |
939 | */ | |
940 | dsl_sync_task_nowait(dp, vdev_rebuild_complete_sync, | |
38080324 | 941 | (void *)(uintptr_t)vd->vdev_id, tx); |
9a49d3f3 BB |
942 | } else if (vd->vdev_rebuild_cancel_wanted) { |
943 | /* | |
944 | * The rebuild operation was canceled. This will occur when | |
945 | * a device participating in the rebuild is detached. | |
946 | */ | |
947 | dsl_sync_task_nowait(dp, vdev_rebuild_cancel_sync, | |
38080324 | 948 | (void *)(uintptr_t)vd->vdev_id, tx); |
9a49d3f3 BB |
949 | } else if (vd->vdev_rebuild_reset_wanted) { |
950 | /* | |
951 | * Reset the running rebuild without canceling and restarting | |
952 | * it. This will occur when a new device is attached and must | |
953 | * participate in the rebuild. | |
954 | */ | |
955 | dsl_sync_task_nowait(dp, vdev_rebuild_reset_sync, | |
38080324 | 956 | (void *)(uintptr_t)vd->vdev_id, tx); |
9a49d3f3 BB |
957 | } else { |
958 | /* | |
959 | * The rebuild operation should be suspended. This may occur | |
960 | * when detaching a child vdev or when exporting the pool. The | |
961 | * rebuild is left in the active state so it will be resumed. | |
962 | */ | |
963 | ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); | |
964 | vd->vdev_rebuilding = B_FALSE; | |
965 | } | |
966 | ||
967 | dmu_tx_commit(tx); | |
968 | ||
969 | vd->vdev_rebuild_thread = NULL; | |
970 | mutex_exit(&vd->vdev_rebuild_lock); | |
971 | spa_config_exit(spa, SCL_CONFIG, FTAG); | |
972 | ||
973 | cv_broadcast(&vd->vdev_rebuild_cv); | |
22dcf891 MM |
974 | |
975 | thread_exit(); | |
9a49d3f3 BB |
976 | } |
977 | ||
978 | /* | |
979 | * Returns B_TRUE if any top-level vdev are rebuilding. | |
980 | */ | |
981 | boolean_t | |
982 | vdev_rebuild_active(vdev_t *vd) | |
983 | { | |
984 | spa_t *spa = vd->vdev_spa; | |
985 | boolean_t ret = B_FALSE; | |
986 | ||
987 | if (vd == spa->spa_root_vdev) { | |
988 | for (uint64_t i = 0; i < vd->vdev_children; i++) { | |
989 | ret = vdev_rebuild_active(vd->vdev_child[i]); | |
990 | if (ret) | |
991 | return (ret); | |
992 | } | |
993 | } else if (vd->vdev_top_zap != 0) { | |
994 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
995 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
996 | ||
997 | mutex_enter(&vd->vdev_rebuild_lock); | |
998 | ret = (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); | |
999 | mutex_exit(&vd->vdev_rebuild_lock); | |
1000 | } | |
1001 | ||
1002 | return (ret); | |
1003 | } | |
1004 | ||
1005 | /* | |
1006 | * Start a rebuild operation. The rebuild may be restarted when the | |
1007 | * top-level vdev is currently actively rebuilding. | |
1008 | */ | |
1009 | void | |
1010 | vdev_rebuild(vdev_t *vd) | |
1011 | { | |
1012 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
1013 | vdev_rebuild_phys_t *vrp __maybe_unused = &vr->vr_rebuild_phys; | |
1014 | ||
1015 | ASSERT(vd->vdev_top == vd); | |
1016 | ASSERT(vdev_is_concrete(vd)); | |
1017 | ASSERT(!vd->vdev_removing); | |
1018 | ASSERT(spa_feature_is_enabled(vd->vdev_spa, | |
1019 | SPA_FEATURE_DEVICE_REBUILD)); | |
1020 | ||
1021 | mutex_enter(&vd->vdev_rebuild_lock); | |
1022 | if (vd->vdev_rebuilding) { | |
1023 | ASSERT3U(vrp->vrp_rebuild_state, ==, VDEV_REBUILD_ACTIVE); | |
1024 | ||
1025 | /* | |
1026 | * Signal a running rebuild operation that it should restart | |
1027 | * from the beginning because a new device was attached. The | |
1028 | * vdev_rebuild_reset_wanted flag is set until the sync task | |
1029 | * completes. This may be after the rebuild thread exits. | |
1030 | */ | |
1031 | if (!vd->vdev_rebuild_reset_wanted) | |
1032 | vd->vdev_rebuild_reset_wanted = B_TRUE; | |
1033 | } else { | |
1034 | vdev_rebuild_initiate(vd); | |
1035 | } | |
1036 | mutex_exit(&vd->vdev_rebuild_lock); | |
1037 | } | |
1038 | ||
1039 | static void | |
1040 | vdev_rebuild_restart_impl(vdev_t *vd) | |
1041 | { | |
1042 | spa_t *spa = vd->vdev_spa; | |
1043 | ||
1044 | if (vd == spa->spa_root_vdev) { | |
1045 | for (uint64_t i = 0; i < vd->vdev_children; i++) | |
1046 | vdev_rebuild_restart_impl(vd->vdev_child[i]); | |
1047 | ||
1048 | } else if (vd->vdev_top_zap != 0) { | |
1049 | vdev_rebuild_t *vr = &vd->vdev_rebuild_config; | |
1050 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
1051 | ||
1052 | mutex_enter(&vd->vdev_rebuild_lock); | |
1053 | if (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE && | |
1054 | vdev_writeable(vd) && !vd->vdev_rebuilding) { | |
1055 | ASSERT(spa_feature_is_active(spa, | |
1056 | SPA_FEATURE_DEVICE_REBUILD)); | |
1057 | vd->vdev_rebuilding = B_TRUE; | |
1058 | vd->vdev_rebuild_thread = thread_create(NULL, 0, | |
1059 | vdev_rebuild_thread, vd, 0, &p0, TS_RUN, | |
1060 | maxclsyspri); | |
1061 | } | |
1062 | mutex_exit(&vd->vdev_rebuild_lock); | |
1063 | } | |
1064 | } | |
1065 | ||
1066 | /* | |
1067 | * Conditionally restart all of the vdev_rebuild_thread's for a pool. The | |
1068 | * feature flag must be active and the rebuild in the active state. This | |
1069 | * cannot be used to start a new rebuild. | |
1070 | */ | |
1071 | void | |
1072 | vdev_rebuild_restart(spa_t *spa) | |
1073 | { | |
1074 | ASSERT(MUTEX_HELD(&spa_namespace_lock)); | |
1075 | ||
1076 | vdev_rebuild_restart_impl(spa->spa_root_vdev); | |
1077 | } | |
1078 | ||
1079 | /* | |
1080 | * Stop and wait for all of the vdev_rebuild_thread's associated with the | |
1081 | * vdev tree provide to be terminated (canceled or stopped). | |
1082 | */ | |
1083 | void | |
1084 | vdev_rebuild_stop_wait(vdev_t *vd) | |
1085 | { | |
1086 | spa_t *spa = vd->vdev_spa; | |
1087 | ||
1088 | ASSERT(MUTEX_HELD(&spa_namespace_lock)); | |
1089 | ||
1090 | if (vd == spa->spa_root_vdev) { | |
1091 | for (uint64_t i = 0; i < vd->vdev_children; i++) | |
1092 | vdev_rebuild_stop_wait(vd->vdev_child[i]); | |
1093 | ||
1094 | } else if (vd->vdev_top_zap != 0) { | |
1095 | ASSERT(vd == vd->vdev_top); | |
1096 | ||
1097 | mutex_enter(&vd->vdev_rebuild_lock); | |
1098 | if (vd->vdev_rebuild_thread != NULL) { | |
1099 | vd->vdev_rebuild_exit_wanted = B_TRUE; | |
1100 | while (vd->vdev_rebuilding) { | |
1101 | cv_wait(&vd->vdev_rebuild_cv, | |
1102 | &vd->vdev_rebuild_lock); | |
1103 | } | |
1104 | vd->vdev_rebuild_exit_wanted = B_FALSE; | |
1105 | } | |
1106 | mutex_exit(&vd->vdev_rebuild_lock); | |
1107 | } | |
1108 | } | |
1109 | ||
1110 | /* | |
1111 | * Stop all rebuild operations but leave them in the active state so they | |
1112 | * will be resumed when importing the pool. | |
1113 | */ | |
1114 | void | |
1115 | vdev_rebuild_stop_all(spa_t *spa) | |
1116 | { | |
1117 | vdev_rebuild_stop_wait(spa->spa_root_vdev); | |
1118 | } | |
1119 | ||
1120 | /* | |
1121 | * Rebuild statistics reported per top-level vdev. | |
1122 | */ | |
1123 | int | |
1124 | vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs) | |
1125 | { | |
1126 | spa_t *spa = tvd->vdev_spa; | |
1127 | ||
1128 | if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) | |
1129 | return (SET_ERROR(ENOTSUP)); | |
1130 | ||
1131 | if (tvd != tvd->vdev_top || tvd->vdev_top_zap == 0) | |
1132 | return (SET_ERROR(EINVAL)); | |
1133 | ||
1134 | int error = zap_contains(spa_meta_objset(spa), | |
1135 | tvd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS); | |
1136 | ||
1137 | if (error == ENOENT) { | |
861166b0 | 1138 | memset(vrs, 0, sizeof (vdev_rebuild_stat_t)); |
9a49d3f3 BB |
1139 | vrs->vrs_state = VDEV_REBUILD_NONE; |
1140 | error = 0; | |
1141 | } else if (error == 0) { | |
1142 | vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; | |
1143 | vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; | |
1144 | ||
1145 | mutex_enter(&tvd->vdev_rebuild_lock); | |
1146 | vrs->vrs_state = vrp->vrp_rebuild_state; | |
1147 | vrs->vrs_start_time = vrp->vrp_start_time; | |
1148 | vrs->vrs_end_time = vrp->vrp_end_time; | |
1149 | vrs->vrs_scan_time_ms = vrp->vrp_scan_time_ms; | |
1150 | vrs->vrs_bytes_scanned = vrp->vrp_bytes_scanned; | |
1151 | vrs->vrs_bytes_issued = vrp->vrp_bytes_issued; | |
1152 | vrs->vrs_bytes_rebuilt = vrp->vrp_bytes_rebuilt; | |
1153 | vrs->vrs_bytes_est = vrp->vrp_bytes_est; | |
1154 | vrs->vrs_errors = vrp->vrp_errors; | |
1155 | vrs->vrs_pass_time_ms = NSEC2MSEC(gethrtime() - | |
1156 | vr->vr_pass_start_time); | |
1157 | vrs->vrs_pass_bytes_scanned = vr->vr_pass_bytes_scanned; | |
1158 | vrs->vrs_pass_bytes_issued = vr->vr_pass_bytes_issued; | |
fa7b2390 | 1159 | vrs->vrs_pass_bytes_skipped = vr->vr_pass_bytes_skipped; |
9a49d3f3 BB |
1160 | mutex_exit(&tvd->vdev_rebuild_lock); |
1161 | } | |
1162 | ||
1163 | return (error); | |
1164 | } | |
1165 | ||
/*
 * Rebuild tunables registered through ZFS_MODULE_PARAM. All three are
 * declared read-write (ZMOD_RW); the first two are 64-bit unsigned values
 * and the last is an int.
 */

/* Upper bound, in bytes, on the size of a single rebuild read. */
ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, U64, ZMOD_RW,
	"Max segment size in bytes of rebuild reads");

/*
 * Per-leaf in-flight byte budget; vdev_rebuild_thread() scales this by the
 * top-level vdev's child count when computing vr_bytes_inflight_max.
 */
ZFS_MODULE_PARAM(zfs, zfs_, rebuild_vdev_limit, U64, ZMOD_RW,
	"Max bytes in flight per leaf vdev for sequential resilvers");

/* NOTE(review): presumably consumed by the completion path - confirm. */
ZFS_MODULE_PARAM(zfs, zfs_, rebuild_scrub_enabled, INT, ZMOD_RW,
	"Automatically scrub after sequential resilver completes");