/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/zap.h>
#include <sys/vdev_impl.h>
#include <sys/metaslab.h>
#include <sys/metaslab_impl.h>
#include <sys/uberblock_impl.h>
#include <sys/txg.h>
#include <sys/avl.h>
#include <sys/bpobj.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/dsl_dir.h>
#include <sys/arc.h>
#include <sys/zfeature.h>
#include <sys/vdev_indirect_births.h>
#include <sys/vdev_indirect_mapping.h>
#include <sys/abd.h>
#include <sys/trace_vdev.h>

/*
 * This file contains the necessary logic to remove vdevs from a
 * storage pool. Currently, the only devices that can be removed
 * are log, cache, and spare devices; and top level vdevs from a pool
 * w/o raidz or mirrors. (Note that members of a mirror can be removed
 * by the detach operation.)
 *
 * Log vdevs are removed by evacuating them and then turning the vdev
 * into a hole vdev while holding spa config locks.
 *
 * Top level vdevs are removed and converted into an indirect vdev via
 * a multi-step process:
 *
 * - Disable allocations from this device (spa_vdev_remove_top).
 *
 * - From a new thread (spa_vdev_remove_thread), copy data from
 *   the removing vdev to a different vdev. The copy happens in open
 *   context (spa_vdev_copy_impl) and issues a sync task
 *   (vdev_mapping_sync) so the sync thread can update the partial
 *   indirect mappings in core and on disk.
 *
 * - If a free happens during a removal, it is freed from the
 *   removing vdev, and if it has already been copied, from the new
 *   location as well (free_from_removing_vdev).
 *
 * - After the removal is completed, the copy thread converts the vdev
 *   into an indirect vdev (vdev_remove_complete) before instructing
 *   the sync thread to destroy the space maps and finish the removal
 *   (spa_finish_removal).
 */
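
/*
 * As an illustrative (hypothetical) example of the resulting mapping:
 * if the removing vdev has one allocated segment at offset 0x10000 of
 * size 0x4000, the copy thread allocates 0x4000 bytes on another vdev
 * (say vdev 3, offset 0x80000) and records the entry
 * <src 0x10000, size 0x4000> -> <vdev 3, offset 0x80000>. Once the
 * vdev is indirect, an access to <this vdev, offset 0x11000> is
 * remapped to <vdev 3, offset 0x81000>.
 */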

typedef struct vdev_copy_arg {
	metaslab_t	*vca_msp;
	uint64_t	vca_outstanding_bytes;
	kcondvar_t	vca_cv;
	kmutex_t	vca_lock;
} vdev_copy_arg_t;

typedef struct vdev_copy_seg_arg {
	vdev_copy_arg_t	*vcsa_copy_arg;
	uint64_t	vcsa_txg;
	dva_t		*vcsa_dest_dva;
	blkptr_t	*vcsa_dest_bp;
} vdev_copy_seg_arg_t;
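
/*
 * vdev_copy_arg_t tracks copy progress for one metaslab:
 * vca_outstanding_bytes counts copy I/O that has been issued but not
 * yet completed, and (with vca_cv) throttles the removal thread
 * against zfs_remove_max_copy_bytes. vdev_copy_seg_arg_t is the zio
 * private data for a single segment's read/write pair.
 */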

/*
 * The maximum amount of data we're allowed to copy from a device
 * at a time when removing it.
 */
int zfs_remove_max_copy_bytes = 8 * 1024 * 1024;

/*
 * The largest contiguous segment that we will attempt to allocate when
 * removing a device. This can be no larger than SPA_MAXBLOCKSIZE. If
 * there is a performance problem with attempting to allocate large blocks,
 * consider decreasing this.
 */
int zfs_remove_max_segment = SPA_MAXBLOCKSIZE;

#define	VDEV_REMOVAL_ZAP_OBJS	"lzap"

static void spa_vdev_remove_thread(void *arg);

static void
spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx)
{
	VERIFY0(zap_update(spa->spa_dsl_pool->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_REMOVING, sizeof (uint64_t),
	    sizeof (spa->spa_removing_phys) / sizeof (uint64_t),
	    &spa->spa_removing_phys, tx));
}

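/*
 * Return the nvlist from the given array whose ZPOOL_CONFIG_GUID
 * matches target_guid, or NULL if no such device exists.
 */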
static nvlist_t *
spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
{
	for (int i = 0; i < count; i++) {
		uint64_t guid =
		    fnvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID);

		if (guid == target_guid)
			return (nvpp[i]);
	}

	return (NULL);
}

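/*
 * Replace the named nvlist array in the pool config (e.g. the spare or
 * l2cache list) with a copy that omits dev_to_remove.
 */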
static void
spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
    nvlist_t *dev_to_remove)
{
	nvlist_t **newdev = NULL;

	if (count > 1)
		newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);

	for (int i = 0, j = 0; i < count; i++) {
		if (dev[i] == dev_to_remove)
			continue;
		VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
	}

	VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
	VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);

	for (int i = 0; i < count - 1; i++)
		nvlist_free(newdev[i]);

	if (count > 1)
		kmem_free(newdev, (count - 1) * sizeof (void *));
}

static spa_vdev_removal_t *
spa_vdev_removal_create(vdev_t *vd)
{
	spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP);
	mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL);
	svr->svr_allocd_segs = range_tree_create(NULL, NULL);
	svr->svr_vdev = vd;

	for (int i = 0; i < TXG_SIZE; i++) {
		svr->svr_frees[i] = range_tree_create(NULL, NULL);
		list_create(&svr->svr_new_segments[i],
		    sizeof (vdev_indirect_mapping_entry_t),
		    offsetof(vdev_indirect_mapping_entry_t, vime_node));
	}

	return (svr);
}

void
spa_vdev_removal_destroy(spa_vdev_removal_t *svr)
{
	for (int i = 0; i < TXG_SIZE; i++) {
		ASSERT0(svr->svr_bytes_done[i]);
		ASSERT0(svr->svr_max_offset_to_sync[i]);
		range_tree_destroy(svr->svr_frees[i]);
		list_destroy(&svr->svr_new_segments[i]);
	}

	range_tree_destroy(svr->svr_allocd_segs);
	mutex_destroy(&svr->svr_lock);
	cv_destroy(&svr->svr_cv);
	kmem_free(svr, sizeof (*svr));
}

/*
 * This is called as a synctask in the txg in which we will mark this vdev
 * as removing (in the config stored in the MOS).
 *
 * It begins the evacuation of a toplevel vdev by:
 * - initializing the spa_removing_phys which tracks this removal
 * - computing the amount of space to remove for accounting purposes
 * - dirtying all dbufs in the spa_config_object
 * - creating the spa_vdev_removal
 * - starting the spa_vdev_remove_thread
 */
static void
vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx)
{
	vdev_t *vd = arg;
	vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
	spa_t *spa = vd->vdev_spa;
	objset_t *mos = spa->spa_dsl_pool->dp_meta_objset;
	spa_vdev_removal_t *svr = NULL;
	ASSERTV(uint64_t txg = dmu_tx_get_txg(tx));

	ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops);
	svr = spa_vdev_removal_create(vd);

	ASSERT(vd->vdev_removing);
	ASSERT3P(vd->vdev_indirect_mapping, ==, NULL);

	spa_feature_incr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx);
	if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
		/*
		 * By activating the OBSOLETE_COUNTS feature, we prevent
		 * the pool from being downgraded and ensure that the
		 * refcounts are precise.
		 */
		spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
		uint64_t one = 1;
		VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap,
		    VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1,
		    &one, tx));
		ASSERT3U(vdev_obsolete_counts_are_precise(vd), !=, 0);
	}

	vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx);
	vd->vdev_indirect_mapping =
	    vdev_indirect_mapping_open(mos, vic->vic_mapping_object);
	vic->vic_births_object = vdev_indirect_births_alloc(mos, tx);
	vd->vdev_indirect_births =
	    vdev_indirect_births_open(mos, vic->vic_births_object);
	spa->spa_removing_phys.sr_removing_vdev = vd->vdev_id;
	spa->spa_removing_phys.sr_start_time = gethrestime_sec();
	spa->spa_removing_phys.sr_end_time = 0;
	spa->spa_removing_phys.sr_state = DSS_SCANNING;
	spa->spa_removing_phys.sr_to_copy = 0;
	spa->spa_removing_phys.sr_copied = 0;

	/*
	 * Note: We can't use vdev_stat's vs_alloc for sr_to_copy, because
	 * there may be space in the defer tree, which is free, but still
	 * counted in vs_alloc.
	 */
	for (uint64_t i = 0; i < vd->vdev_ms_count; i++) {
		metaslab_t *ms = vd->vdev_ms[i];
		if (ms->ms_sm == NULL)
			continue;

		/*
		 * Sync tasks happen before metaslab_sync(), therefore
		 * smp_alloc and sm_alloc must be the same.
		 */
		ASSERT3U(space_map_allocated(ms->ms_sm), ==,
		    ms->ms_sm->sm_phys->smp_alloc);

		spa->spa_removing_phys.sr_to_copy +=
		    space_map_allocated(ms->ms_sm);

		/*
		 * Space which we are freeing this txg does not need to
		 * be copied.
		 */
		spa->spa_removing_phys.sr_to_copy -=
		    range_tree_space(ms->ms_freeingtree);

		ASSERT0(range_tree_space(ms->ms_freedtree));
		for (int t = 0; t < TXG_SIZE; t++)
			ASSERT0(range_tree_space(ms->ms_alloctree[t]));
	}

	/*
	 * Sync tasks are called before metaslab_sync(), so there should
	 * be no already-synced metaslabs in the TXG_CLEAN list.
	 */
	ASSERT3P(txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)), ==, NULL);

	spa_sync_removing_state(spa, tx);

	/*
	 * All blocks that we need to read the most recent mapping must be
	 * stored on concrete vdevs. Therefore, we must dirty anything that
	 * is read before spa_remove_init(). Specifically, the
	 * spa_config_object. (Note that although we already modified the
	 * spa_config_object in spa_sync_removing_state, that may not have
	 * modified all blocks of the object.)
	 */
	dmu_object_info_t doi;
	VERIFY0(dmu_object_info(mos, DMU_POOL_DIRECTORY_OBJECT, &doi));
	for (uint64_t offset = 0; offset < doi.doi_max_offset; ) {
		dmu_buf_t *dbuf;
		VERIFY0(dmu_buf_hold(mos, DMU_POOL_DIRECTORY_OBJECT,
		    offset, FTAG, &dbuf, 0));
		dmu_buf_will_dirty(dbuf, tx);
		offset += dbuf->db_size;
		dmu_buf_rele(dbuf, FTAG);
	}

	/*
	 * Now that we've allocated the im_object, dirty the vdev to ensure
	 * that the object gets written to the config on disk.
	 */
	vdev_config_dirty(vd);

	zfs_dbgmsg("starting removal thread for vdev %llu (%p) in txg %llu "
	    "im_obj=%llu", vd->vdev_id, vd, dmu_tx_get_txg(tx),
	    vic->vic_mapping_object);

	spa_history_log_internal(spa, "vdev remove started", tx,
	    "%s vdev %llu %s", spa_name(spa), vd->vdev_id,
	    (vd->vdev_path != NULL) ? vd->vdev_path : "-");
	/*
	 * Setting spa_vdev_removal causes subsequent frees to call
	 * free_from_removing_vdev(). Note that we don't need any locking
	 * because we are the sync thread, and metaslab_free_impl() is only
	 * called from syncing context (potentially from a zio taskq thread,
	 * but in any case only when there are outstanding free i/os, which
	 * there are not).
	 */
	ASSERT3P(spa->spa_vdev_removal, ==, NULL);
	spa->spa_vdev_removal = svr;
	svr->svr_thread = thread_create(NULL, 0,
	    spa_vdev_remove_thread, vd, 0, &p0, TS_RUN, minclsyspri);
}

/*
 * When we are opening a pool, we must read the mapping for each
 * indirect vdev in order from most recently removed to least
 * recently removed. We do this because the blocks for the mapping
 * of older indirect vdevs may be stored on more recently removed vdevs.
 * In order to read each indirect mapping object, we must have
 * initialized all more recently removed vdevs.
 */
int
spa_remove_init(spa_t *spa)
{
	int error;

	error = zap_lookup(spa->spa_dsl_pool->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_REMOVING, sizeof (uint64_t),
	    sizeof (spa->spa_removing_phys) / sizeof (uint64_t),
	    &spa->spa_removing_phys);

	if (error == ENOENT) {
		spa->spa_removing_phys.sr_state = DSS_NONE;
		spa->spa_removing_phys.sr_removing_vdev = -1;
		spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
		return (0);
	} else if (error != 0) {
		return (error);
	}

	if (spa->spa_removing_phys.sr_state == DSS_SCANNING) {
		/*
		 * We are currently removing a vdev. Create and
		 * initialize a spa_vdev_removal_t from the bonus
		 * buffer of the removing vdev's vdev_im_object, and
		 * initialize its partial mapping.
		 */
		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
		vdev_t *vd = vdev_lookup_top(spa,
		    spa->spa_removing_phys.sr_removing_vdev);
		spa_config_exit(spa, SCL_STATE, FTAG);

		if (vd == NULL)
			return (EINVAL);

		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;

		ASSERT(vdev_is_concrete(vd));
		spa_vdev_removal_t *svr = spa_vdev_removal_create(vd);
		ASSERT(svr->svr_vdev->vdev_removing);

		vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
		    spa->spa_meta_objset, vic->vic_mapping_object);
		vd->vdev_indirect_births = vdev_indirect_births_open(
		    spa->spa_meta_objset, vic->vic_births_object);

		spa->spa_vdev_removal = svr;
	}

	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
	uint64_t indirect_vdev_id =
	    spa->spa_removing_phys.sr_prev_indirect_vdev;
	while (indirect_vdev_id != UINT64_MAX) {
		vdev_t *vd = vdev_lookup_top(spa, indirect_vdev_id);
		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;

		ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
		vd->vdev_indirect_mapping = vdev_indirect_mapping_open(
		    spa->spa_meta_objset, vic->vic_mapping_object);
		vd->vdev_indirect_births = vdev_indirect_births_open(
		    spa->spa_meta_objset, vic->vic_births_object);

		indirect_vdev_id = vic->vic_prev_indirect_vdev;
	}
	spa_config_exit(spa, SCL_STATE, FTAG);

	/*
	 * Now that we've loaded all the indirect mappings, we can allow
	 * reads from other blocks (e.g. via predictive prefetch).
	 */
	spa->spa_indirect_vdevs_loaded = B_TRUE;
	return (0);
}

void
spa_restart_removal(spa_t *spa)
{
	spa_vdev_removal_t *svr = spa->spa_vdev_removal;

	if (svr == NULL)
		return;

	/*
	 * In general when this function is called there is no
	 * removal thread running. The only scenario where this
	 * is not true is during spa_import() where this function
	 * is called twice [once from spa_import_impl() and
	 * spa_async_resume()]. Thus, in the scenario where we
	 * import a pool that has an ongoing removal we don't
	 * want to spawn a second thread.
	 */
	if (svr->svr_thread != NULL)
		return;

	if (!spa_writeable(spa))
		return;

	vdev_t *vd = svr->svr_vdev;
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;

	ASSERT3P(vd, !=, NULL);
	ASSERT(vd->vdev_removing);

	zfs_dbgmsg("restarting removal of %llu at count=%llu",
	    vd->vdev_id, vdev_indirect_mapping_num_entries(vim));
	svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, vd,
	    0, &p0, TS_RUN, minclsyspri);
}

/*
 * Process freeing from a device which is in the middle of being removed.
 * We must handle this carefully so that we do not attempt to copy
 * freed data, and so that we correctly free already-copied data.
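 *
 * Three cases are handled below, in order:
 *  - the range's mapping is already synced: free from the new
 *    location (after dropping svr_lock);
 *  - a copy of the range is in flight: defer the free until the
 *    mapping is synced, via svr_frees[] and vdev_mapping_sync();
 *  - the range has not been visited by the copy thread: clear it
 *    from svr_allocd_segs so it is never copied.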
 */
void
free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size,
    uint64_t txg)
{
	spa_t *spa = vd->vdev_spa;
	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	uint64_t max_offset_yet = 0;

	ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
	ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==,
	    vdev_indirect_mapping_object(vim));
	ASSERT3P(vd, ==, svr->svr_vdev);
	ASSERT3U(spa_syncing_txg(spa), ==, txg);

	mutex_enter(&svr->svr_lock);

	/*
	 * Remove the segment from the removing vdev's spacemap. This
	 * ensures that we will not attempt to copy this space (if the
	 * removal thread has not yet visited it), and also ensures
	 * that we know what is actually allocated on the new vdevs
	 * (needed if we cancel the removal).
	 *
	 * Note: we must do the metaslab_free_concrete() with the svr_lock
	 * held, so that the remove_thread can not load this metaslab and then
	 * visit this offset between the time that we metaslab_free_concrete()
	 * and when we check to see if it has been visited.
	 */
	metaslab_free_concrete(vd, offset, size, txg);

	uint64_t synced_size = 0;
	uint64_t synced_offset = 0;
	uint64_t max_offset_synced = vdev_indirect_mapping_max_offset(vim);
	if (offset < max_offset_synced) {
		/*
		 * The mapping for this offset is already on disk.
		 * Free from the new location.
		 *
		 * Note that we use max_offset_synced (from the on-disk
		 * mapping) because it is updated atomically with respect
		 * to the in-core mapping; ranges whose mapping entries
		 * are still in flight are handled separately below.
		 *
		 * This block may be split between a synced entry and an
		 * in-flight or unvisited entry. Only process the synced
		 * portion of it here.
		 */
		synced_size = MIN(size, max_offset_synced - offset);
		synced_offset = offset;

		ASSERT3U(max_offset_yet, <=, max_offset_synced);
		max_offset_yet = max_offset_synced;

		DTRACE_PROBE3(remove__free__synced,
		    spa_t *, spa,
		    uint64_t, offset,
		    uint64_t, synced_size);

		size -= synced_size;
		offset += synced_size;
	}

	/*
	 * Look at all in-flight txgs starting from the currently syncing one
	 * and see if a section of this free is being copied. By starting from
	 * this txg and iterating forward, we might find that this region
	 * was copied in two different txgs and handle it appropriately.
	 */
	for (int i = 0; i < TXG_CONCURRENT_STATES; i++) {
		int txgoff = (txg + i) & TXG_MASK;
		if (size > 0 && offset < svr->svr_max_offset_to_sync[txgoff]) {
			/*
			 * The mapping for this offset is in flight, and
			 * will be synced in txg+i.
			 */
			uint64_t inflight_size = MIN(size,
			    svr->svr_max_offset_to_sync[txgoff] - offset);

			DTRACE_PROBE4(remove__free__inflight,
			    spa_t *, spa,
			    uint64_t, offset,
			    uint64_t, inflight_size,
			    uint64_t, txg + i);

			/*
			 * We copy data in order of increasing offset.
			 * Therefore the max_offset_to_sync[] must increase
			 * (or be zero, indicating that nothing is being
			 * copied in that txg).
			 */
			if (svr->svr_max_offset_to_sync[txgoff] != 0) {
				ASSERT3U(svr->svr_max_offset_to_sync[txgoff],
				    >=, max_offset_yet);
				max_offset_yet =
				    svr->svr_max_offset_to_sync[txgoff];
			}

			/*
			 * We've already committed to copying this segment:
			 * we have allocated space elsewhere in the pool for
			 * it and have an IO outstanding to copy the data. We
			 * cannot free the space before the copy has
			 * completed, or else the copy IO might overwrite any
			 * new data. To free that space, we record the
			 * segment in the appropriate svr_frees tree and free
			 * the mapped space later, in the txg where we have
			 * completed the copy and synced the mapping (see
			 * vdev_mapping_sync).
			 */
			range_tree_add(svr->svr_frees[txgoff],
			    offset, inflight_size);
			size -= inflight_size;
			offset += inflight_size;

			/*
			 * This space is already accounted for as being
			 * done, because it is being copied in txg+i.
			 * However, if i!=0, then it is being copied in
			 * a future txg. If we crash after this txg
			 * syncs but before txg+i syncs, then the space
			 * will be free. Therefore we must account
			 * for the space being done in *this* txg
			 * (when it is freed) rather than the future txg
			 * (when it will be copied).
			 */
			ASSERT3U(svr->svr_bytes_done[txgoff], >=,
			    inflight_size);
			svr->svr_bytes_done[txgoff] -= inflight_size;
			svr->svr_bytes_done[txg & TXG_MASK] += inflight_size;
		}
	}
	ASSERT0(svr->svr_max_offset_to_sync[TXG_CLEAN(txg) & TXG_MASK]);

	if (size > 0) {
		/*
		 * The copy thread has not yet visited this offset. Ensure
		 * that it doesn't.
		 */

		DTRACE_PROBE3(remove__free__unvisited,
		    spa_t *, spa,
		    uint64_t, offset,
		    uint64_t, size);

		if (svr->svr_allocd_segs != NULL)
			range_tree_clear(svr->svr_allocd_segs, offset, size);

		/*
		 * Since we now do not need to copy this data, for
		 * accounting purposes we have done our job and can count
		 * it as completed.
		 */
		svr->svr_bytes_done[txg & TXG_MASK] += size;
	}
	mutex_exit(&svr->svr_lock);

	/*
	 * Now that we have dropped svr_lock, process the synced portion
	 * of this free.
	 */
	if (synced_size > 0) {
		vdev_indirect_mark_obsolete(vd, synced_offset, synced_size,
		    txg);
		/*
		 * Note: this can only be called from syncing context,
		 * and the vdev_indirect_mapping is only changed from the
		 * sync thread, so we don't need svr_lock while doing
		 * metaslab_free_impl_cb.
		 */
		vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size,
		    metaslab_free_impl_cb, &txg);
	}
}

/*
 * Stop an active removal and update the spa_removing_phys.
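 * If the removal finished successfully, this also links the vdev into
 * the chain of indirect vdevs (sr_prev_indirect_vdev) that
 * spa_remove_init() walks at pool open.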
 */
static void
spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx)
{
	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
	ASSERT3U(dmu_tx_get_txg(tx), ==, spa_syncing_txg(spa));

	/* Ensure the removal thread has completed before we free the svr. */
	spa_vdev_remove_suspend(spa);

	ASSERT(state == DSS_FINISHED || state == DSS_CANCELED);

	if (state == DSS_FINISHED) {
		spa_removing_phys_t *srp = &spa->spa_removing_phys;
		vdev_t *vd = svr->svr_vdev;
		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;

		if (srp->sr_prev_indirect_vdev != UINT64_MAX) {
			vdev_t *pvd;
			pvd = vdev_lookup_top(spa,
			    srp->sr_prev_indirect_vdev);
			ASSERT3P(pvd->vdev_ops, ==, &vdev_indirect_ops);
		}

		vic->vic_prev_indirect_vdev = srp->sr_prev_indirect_vdev;
		srp->sr_prev_indirect_vdev = vd->vdev_id;
	}
	spa->spa_removing_phys.sr_state = state;
	spa->spa_removing_phys.sr_end_time = gethrestime_sec();

	spa->spa_vdev_removal = NULL;
	spa_vdev_removal_destroy(svr);

	spa_sync_removing_state(spa, tx);

	vdev_config_dirty(spa->spa_root_vdev);
}

static void
free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size)
{
	vdev_t *vd = arg;
	vdev_indirect_mark_obsolete(vd, offset, size,
	    vd->vdev_spa->spa_syncing_txg);
	vdev_indirect_ops.vdev_op_remap(vd, offset, size,
	    metaslab_free_impl_cb, &vd->vdev_spa->spa_syncing_txg);
}

/*
 * On behalf of the removal thread, syncs an incremental bit more of
 * the indirect mapping to disk and updates the in-memory mapping.
 * Called as a sync task in every txg that the removal thread makes progress.
 */
static void
vdev_mapping_sync(void *arg, dmu_tx_t *tx)
{
	spa_vdev_removal_t *svr = arg;
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	vdev_t *vd = svr->svr_vdev;
	ASSERTV(vdev_indirect_config_t *vic = &vd->vdev_indirect_config);
	uint64_t txg = dmu_tx_get_txg(tx);
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;

	ASSERT(vic->vic_mapping_object != 0);
	ASSERT3U(txg, ==, spa_syncing_txg(spa));

	vdev_indirect_mapping_add_entries(vim,
	    &svr->svr_new_segments[txg & TXG_MASK], tx);
	vdev_indirect_births_add_entry(vd->vdev_indirect_births,
	    vdev_indirect_mapping_max_offset(vim), dmu_tx_get_txg(tx), tx);

	/*
	 * Free the copied data for anything that was freed while the
	 * mapping entries were in flight.
	 */
	mutex_enter(&svr->svr_lock);
	range_tree_vacate(svr->svr_frees[txg & TXG_MASK],
	    free_mapped_segment_cb, vd);
	ASSERT3U(svr->svr_max_offset_to_sync[txg & TXG_MASK], >=,
	    vdev_indirect_mapping_max_offset(vim));
	svr->svr_max_offset_to_sync[txg & TXG_MASK] = 0;
	mutex_exit(&svr->svr_lock);

	spa_sync_removing_state(spa, tx);
}

static void
spa_vdev_copy_segment_write_done(zio_t *zio)
{
	vdev_copy_seg_arg_t *vcsa = zio->io_private;
	vdev_copy_arg_t *vca = vcsa->vcsa_copy_arg;
	spa_config_exit(zio->io_spa, SCL_STATE, FTAG);
	abd_free(zio->io_abd);

	mutex_enter(&vca->vca_lock);
	vca->vca_outstanding_bytes -= zio->io_size;
	cv_signal(&vca->vca_cv);
	mutex_exit(&vca->vca_lock);

	ASSERT0(zio->io_error);
	kmem_free(vcsa->vcsa_dest_bp, sizeof (blkptr_t));
	kmem_free(vcsa, sizeof (vdev_copy_seg_arg_t));
}

static void
spa_vdev_copy_segment_read_done(zio_t *zio)
{
	vdev_copy_seg_arg_t *vcsa = zio->io_private;
	dva_t *dest_dva = vcsa->vcsa_dest_dva;
	uint64_t txg = vcsa->vcsa_txg;
	spa_t *spa = zio->io_spa;
	ASSERTV(vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(dest_dva)));
	blkptr_t *bp = NULL;
	dva_t *dva = NULL;
	uint64_t size = zio->io_size;

	ASSERT3P(dest_vd, !=, NULL);
	ASSERT0(zio->io_error);

	vcsa->vcsa_dest_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
	bp = vcsa->vcsa_dest_bp;
	dva = bp->blk_dva;

	BP_ZERO(bp);

	/* initialize with dest_dva */
	bcopy(dest_dva, dva, sizeof (dva_t));
	BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);

	BP_SET_LSIZE(bp, size);
	BP_SET_PSIZE(bp, size);
	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
	BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
	BP_SET_TYPE(bp, DMU_OT_NONE);
	BP_SET_LEVEL(bp, 0);
	BP_SET_DEDUP(bp, 0);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);

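	/*
	 * Write the copied data to its new location. zio_rewrite()
	 * directs the write at the DVA filled in above, and making it a
	 * child of this txg's spa_txg_zio lets the sync thread wait for
	 * all outstanding copy writes (see the spa_vdev_remove_thread()
	 * comment).
	 */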
	zio_nowait(zio_rewrite(spa->spa_txg_zio[txg & TXG_MASK], spa,
	    txg, bp, zio->io_abd, size,
	    spa_vdev_copy_segment_write_done, vcsa,
	    ZIO_PRIORITY_REMOVAL, 0, NULL));
}

static int
spa_vdev_copy_segment(vdev_t *vd, uint64_t start, uint64_t size, uint64_t txg,
    vdev_copy_arg_t *vca, zio_alloc_list_t *zal)
{
	metaslab_group_t *mg = vd->vdev_mg;
	spa_t *spa = vd->vdev_spa;
	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
	vdev_indirect_mapping_entry_t *entry;
	vdev_copy_seg_arg_t *private;
	dva_t dst = {{ 0 }};
	blkptr_t blk, *bp = &blk;
	dva_t *dva = bp->blk_dva;

	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);

	int error = metaslab_alloc_dva(spa, mg->mg_class, size,
	    &dst, 0, NULL, txg, 0, zal);
	if (error != 0)
		return (error);

	/*
	 * We can't have any padding of the allocated size, otherwise we will
	 * misunderstand what's allocated, and the size of the mapping.
	 * The caller ensures this will be true by passing in a size that is
	 * aligned to the worst (highest) ashift in the pool.
	 */
	ASSERT3U(DVA_GET_ASIZE(&dst), ==, size);

	mutex_enter(&vca->vca_lock);
	vca->vca_outstanding_bytes += size;
	mutex_exit(&vca->vca_lock);

	entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP);
	DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start);
	entry->vime_mapping.vimep_dst = dst;

	private = kmem_alloc(sizeof (vdev_copy_seg_arg_t), KM_SLEEP);
	private->vcsa_dest_dva = &entry->vime_mapping.vimep_dst;
	private->vcsa_txg = txg;
	private->vcsa_copy_arg = vca;

	/*
	 * This lock is eventually released by the donefunc for the
	 * zio_write_phys that finishes copying the data.
	 */
	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);

	/*
	 * Do logical I/O, letting the redundancy vdevs (like mirror)
	 * handle their own I/O instead of duplicating that code here.
	 */
	BP_ZERO(bp);

	DVA_SET_VDEV(&dva[0], vd->vdev_id);
	DVA_SET_OFFSET(&dva[0], start);
	DVA_SET_GANG(&dva[0], 0);
	DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, size));

	BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL);

	BP_SET_LSIZE(bp, size);
	BP_SET_PSIZE(bp, size);
	BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF);
	BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF);
	BP_SET_TYPE(bp, DMU_OT_NONE);
	BP_SET_LEVEL(bp, 0);
	BP_SET_DEDUP(bp, 0);
	BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);

	zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa,
	    bp, abd_alloc_for_io(size, B_FALSE), size,
	    spa_vdev_copy_segment_read_done, private,
	    ZIO_PRIORITY_REMOVAL, 0, NULL));

	list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry);
	ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift);
	vdev_dirty(vd, 0, NULL, txg);

	return (0);
}

/*
 * Complete the removal of a toplevel vdev. This is called as a
 * synctask in the same txg that we will sync out the new config (to the
 * MOS object) which indicates that this vdev is indirect.
 */
static void
vdev_remove_complete_sync(void *arg, dmu_tx_t *tx)
{
	spa_vdev_removal_t *svr = arg;
	vdev_t *vd = svr->svr_vdev;
	spa_t *spa = vd->vdev_spa;

	ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);

	for (int i = 0; i < TXG_SIZE; i++) {
		ASSERT0(svr->svr_bytes_done[i]);
	}

	ASSERT3U(spa->spa_removing_phys.sr_copied, ==,
	    spa->spa_removing_phys.sr_to_copy);

	vdev_destroy_spacemaps(vd, tx);

	/* destroy leaf zaps, if any */
	ASSERT3P(svr->svr_zaplist, !=, NULL);
	for (nvpair_t *pair = nvlist_next_nvpair(svr->svr_zaplist, NULL);
	    pair != NULL;
	    pair = nvlist_next_nvpair(svr->svr_zaplist, pair)) {
		vdev_destroy_unlink_zap(vd, fnvpair_value_uint64(pair), tx);
	}
	fnvlist_free(svr->svr_zaplist);

	spa_finish_removal(dmu_tx_pool(tx)->dp_spa, DSS_FINISHED, tx);
	/* vd->vdev_path is not available here */
	spa_history_log_internal(spa, "vdev remove completed", tx,
	    "%s vdev %llu", spa_name(spa), vd->vdev_id);
}

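/*
 * Transfer the indirect mapping, births, and obsolete-spacemap state
 * from the removed vdev (vd) to its new indirect parent (ivd).
 */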
static void
vdev_indirect_state_transfer(vdev_t *ivd, vdev_t *vd)
{
	ivd->vdev_indirect_config = vd->vdev_indirect_config;

	ASSERT3P(ivd->vdev_indirect_mapping, ==, NULL);
	ASSERT(vd->vdev_indirect_mapping != NULL);
	ivd->vdev_indirect_mapping = vd->vdev_indirect_mapping;
	vd->vdev_indirect_mapping = NULL;

	ASSERT3P(ivd->vdev_indirect_births, ==, NULL);
	ASSERT(vd->vdev_indirect_births != NULL);
	ivd->vdev_indirect_births = vd->vdev_indirect_births;
	vd->vdev_indirect_births = NULL;

	ASSERT0(range_tree_space(vd->vdev_obsolete_segments));
	ASSERT0(range_tree_space(ivd->vdev_obsolete_segments));

	if (vd->vdev_obsolete_sm != NULL) {
		ASSERT3U(ivd->vdev_asize, ==, vd->vdev_asize);

		/*
		 * We cannot use space_map_{open,close} because we hold all
		 * the config locks as writer.
		 */
		ASSERT3P(ivd->vdev_obsolete_sm, ==, NULL);
		ivd->vdev_obsolete_sm = vd->vdev_obsolete_sm;
		vd->vdev_obsolete_sm = NULL;
	}
}

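/*
 * Recursively record the leaf ZAP objects of vd and its children in
 * zlist, so that vdev_remove_complete_sync() can destroy them once the
 * removal has finished.
 */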
static void
vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist)
{
	ASSERT3P(zlist, !=, NULL);
	ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops);

	if (vd->vdev_leaf_zap != 0) {
		char zkey[32];
		(void) snprintf(zkey, sizeof (zkey), "%s-%llu",
		    VDEV_REMOVAL_ZAP_OBJS, (u_longlong_t)vd->vdev_leaf_zap);
		fnvlist_add_uint64(zlist, zkey, vd->vdev_leaf_zap);
	}

	for (uint64_t id = 0; id < vd->vdev_children; id++) {
		vdev_remove_enlist_zaps(vd->vdev_child[id], zlist);
	}
}

static void
vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg)
{
	vdev_t *ivd;
	dmu_tx_t *tx;
	spa_t *spa = vd->vdev_spa;
	spa_vdev_removal_t *svr = spa->spa_vdev_removal;

	/*
	 * First, build a list of leaf zaps to be destroyed.
	 * This is passed to the sync context thread,
	 * which does the actual unlinking.
	 */
	svr->svr_zaplist = fnvlist_alloc();
	vdev_remove_enlist_zaps(vd, svr->svr_zaplist);

	ivd = vdev_add_parent(vd, &vdev_indirect_ops);

	vd->vdev_leaf_zap = 0;

	vdev_remove_child(ivd, vd);
	vdev_compact_children(ivd);

	vdev_indirect_state_transfer(ivd, vd);

	svr->svr_vdev = ivd;

	ASSERT(!ivd->vdev_removing);
	ASSERT(!list_link_active(&vd->vdev_state_dirty_node));

	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
	dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr,
	    0, ZFS_SPACE_CHECK_NONE, tx);
	dmu_tx_commit(tx);

	/*
	 * Indicate that this thread has exited.
	 * After this, we can not use svr.
	 */
	mutex_enter(&svr->svr_lock);
	svr->svr_thread = NULL;
	cv_broadcast(&svr->svr_cv);
	mutex_exit(&svr->svr_lock);
}

/*
 * Complete the removal of a toplevel vdev. This is called in open
 * context by the removal thread after we have copied all vdev's data.
 */
static void
vdev_remove_complete(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;
	uint64_t txg;

	/*
	 * Wait for any deferred frees to be synced before we call
	 * vdev_metaslab_fini()
	 */
	txg_wait_synced(spa->spa_dsl_pool, 0);

	txg = spa_vdev_enter(spa);
	zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu",
	    vd->vdev_id, txg);

	/*
	 * Discard allocation state.
	 */
	if (vd->vdev_mg != NULL) {
		vdev_metaslab_fini(vd);
		metaslab_group_destroy(vd->vdev_mg);
		vd->vdev_mg = NULL;
	}
	ASSERT0(vd->vdev_stat.vs_space);
	ASSERT0(vd->vdev_stat.vs_dspace);

	vdev_remove_replace_with_indirect(vd, txg);

	/*
	 * We now release the locks, allowing spa_sync to run and finish the
	 * removal via vdev_remove_complete_sync in syncing context.
	 */
	(void) spa_vdev_exit(spa, NULL, txg, 0);

	/*
	 * Top ZAP should have been transferred to the indirect vdev in
	 * vdev_remove_replace_with_indirect.
	 */
	ASSERT0(vd->vdev_top_zap);

	/*
	 * Leaf ZAP should have been moved in vdev_remove_replace_with_indirect.
	 */
	ASSERT0(vd->vdev_leaf_zap);

	txg = spa_vdev_enter(spa);
	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
	/*
	 * Request to update the config and the config cachefile.
	 */
	vdev_config_dirty(spa->spa_root_vdev);
	(void) spa_vdev_exit(spa, vd, txg, 0);
}

/*
 * Evacuates a segment of size at most max_alloc from the vdev
 * via repeated calls to spa_vdev_copy_segment. If an allocation
 * fails, the pool is probably too fragmented to handle such a
 * large size, so decrease max_alloc so that the caller will not try
 * this size again this txg.
 */
static void
spa_vdev_copy_impl(spa_vdev_removal_t *svr, vdev_copy_arg_t *vca,
    uint64_t *max_alloc, dmu_tx_t *tx)
{
	uint64_t txg = dmu_tx_get_txg(tx);
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;

	mutex_enter(&svr->svr_lock);

	range_seg_t *rs = avl_first(&svr->svr_allocd_segs->rt_root);
	if (rs == NULL) {
		mutex_exit(&svr->svr_lock);
		return;
	}
	uint64_t offset = rs->rs_start;
	uint64_t length = MIN(rs->rs_end - rs->rs_start, *max_alloc);

	range_tree_remove(svr->svr_allocd_segs, offset, length);

	if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) {
		dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync,
		    svr, 0, ZFS_SPACE_CHECK_NONE, tx);
	}

	svr->svr_max_offset_to_sync[txg & TXG_MASK] = offset + length;

	/*
	 * Note: this is the amount of *allocated* space
	 * that we are taking care of each txg.
	 */
	svr->svr_bytes_done[txg & TXG_MASK] += length;

	mutex_exit(&svr->svr_lock);

	zio_alloc_list_t zal;
	metaslab_trace_init(&zal);
	uint64_t thismax = *max_alloc;
	while (length > 0) {
		uint64_t mylen = MIN(length, thismax);

		int error = spa_vdev_copy_segment(svr->svr_vdev,
		    offset, mylen, txg, vca, &zal);

		if (error == ENOSPC) {
			/*
			 * Cut our segment in half, and don't try this
			 * segment size again this txg. Note that the
			 * allocation size must be aligned to the highest
			 * ashift in the pool, so that the allocation will
			 * not be padded out to a multiple of the ashift,
			 * which could cause us to think that this mapping
			 * is larger than we intended.
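			 *
			 * For example, with ashift=12 and mylen=1MB, the
			 * next attempt (thismax) will be 512K, and
			 * max_alloc becomes mylen - 4K so that callers do
			 * not retry the failed size again this txg.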
			 */
			ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT);
			ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift);
			thismax = P2ROUNDUP(mylen / 2,
			    1 << spa->spa_max_ashift);
			ASSERT3U(thismax, <, mylen);
			/*
			 * The minimum-size allocation can not fail.
			 */
			ASSERT3U(mylen, >, 1 << spa->spa_max_ashift);
			*max_alloc = mylen - (1 << spa->spa_max_ashift);
		} else {
			ASSERT0(error);
			length -= mylen;
			offset += mylen;

			/*
			 * We've performed an allocation, so reset the
			 * alloc trace list.
			 */
			metaslab_trace_fini(&zal);
			metaslab_trace_init(&zal);
		}
	}
	metaslab_trace_fini(&zal);
}

/*
 * The removal thread operates in open context. It iterates over all
 * allocated space in the vdev, by loading each metaslab's spacemap.
 * For each contiguous segment of allocated space (capping the segment
 * size at SPA_MAXBLOCKSIZE), we:
 * - Allocate space for it on another vdev.
 * - Create a new mapping from the old location to the new location
 *   (as a record in svr_new_segments).
 * - Initiate a physical read zio to get the data off the removing disk.
 * - In the read zio's done callback, initiate a physical write zio to
 *   write it to the new vdev.
 * Note that all of this will take effect when a particular TXG syncs.
 * The sync thread ensures that all the phys reads and writes for the syncing
 * TXG have completed (see spa_txg_zio) and writes the new mappings to disk
 * (see vdev_mapping_sync()).
 */
static void
spa_vdev_remove_thread(void *arg)
{
	vdev_t *vd = arg;
	spa_t *spa = vd->vdev_spa;
	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
	vdev_copy_arg_t vca;
	uint64_t max_alloc = zfs_remove_max_segment;
	uint64_t last_txg = 0;
	vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
	uint64_t start_offset = vdev_indirect_mapping_max_offset(vim);

	ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops);
	ASSERT(vdev_is_concrete(vd));
	ASSERT(vd->vdev_removing);
	ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0);
	ASSERT3P(svr->svr_vdev, ==, vd);
	ASSERT(vim != NULL);

	mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL);
	vca.vca_outstanding_bytes = 0;

	mutex_enter(&svr->svr_lock);

	/*
	 * Start from vim_max_offset so we pick up where we left off
	 * if we are restarting the removal after opening the pool.
	 */
	uint64_t msi;
	for (msi = start_offset >> vd->vdev_ms_shift;
	    msi < vd->vdev_ms_count && !svr->svr_thread_exit; msi++) {
		metaslab_t *msp = vd->vdev_ms[msi];
		ASSERT3U(msi, <=, vd->vdev_ms_count);

		ASSERT0(range_tree_space(svr->svr_allocd_segs));

		mutex_enter(&msp->ms_sync_lock);
		mutex_enter(&msp->ms_lock);

		/*
		 * Assert nothing in flight -- ms_*tree is empty.
		 */
		for (int i = 0; i < TXG_SIZE; i++) {
			ASSERT0(range_tree_space(msp->ms_alloctree[i]));
		}

		/*
		 * If the metaslab has ever been allocated from (ms_sm!=NULL),
		 * read the allocated segments from the space map object
		 * into svr_allocd_segs. Since we do this while holding
		 * svr_lock and ms_sync_lock, concurrent frees (which
		 * would have modified the space map) will wait for us
		 * to finish loading the spacemap, and then take the
		 * appropriate action (see free_from_removing_vdev()).
		 */
		if (msp->ms_sm != NULL) {
			space_map_t *sm = NULL;

			/*
			 * We have to open a new space map here, because
			 * ms_sm's sm_length and sm_alloc may not reflect
			 * what's in the object contents, if we are in between
			 * metaslab_sync() and metaslab_sync_done().
			 */
			VERIFY0(space_map_open(&sm,
			    spa->spa_dsl_pool->dp_meta_objset,
			    msp->ms_sm->sm_object, msp->ms_sm->sm_start,
			    msp->ms_sm->sm_size, msp->ms_sm->sm_shift));
			space_map_update(sm);
			VERIFY0(space_map_load(sm, svr->svr_allocd_segs,
			    SM_ALLOC));
			space_map_close(sm);

			range_tree_walk(msp->ms_freeingtree,
			    range_tree_remove, svr->svr_allocd_segs);

			/*
			 * When we are resuming from a paused removal (i.e.
			 * when importing a pool with a removal in progress),
			 * discard any state that we have already processed.
			 */
			range_tree_clear(svr->svr_allocd_segs, 0, start_offset);
		}
		mutex_exit(&msp->ms_lock);
		mutex_exit(&msp->ms_sync_lock);

		vca.vca_msp = msp;
		zfs_dbgmsg("copying %llu segments for metaslab %llu",
		    avl_numnodes(&svr->svr_allocd_segs->rt_root),
		    msp->ms_id);

		while (!svr->svr_thread_exit &&
		    range_tree_space(svr->svr_allocd_segs) != 0) {

			mutex_exit(&svr->svr_lock);

			mutex_enter(&vca.vca_lock);
			while (vca.vca_outstanding_bytes >
			    zfs_remove_max_copy_bytes) {
				cv_wait(&vca.vca_cv, &vca.vca_lock);
			}
			mutex_exit(&vca.vca_lock);
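
			/*
			 * Copy a chunk in its own open-context txg.
			 * dmu_tx_hold_space() reserves space for the
			 * data we are about to copy before we pick the
			 * next segment under svr_lock.
			 */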
1256 | ||
1257 | dmu_tx_t *tx = | |
1258 | dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); | |
1259 | dmu_tx_hold_space(tx, SPA_MAXBLOCKSIZE); | |
1260 | VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); | |
1261 | uint64_t txg = dmu_tx_get_txg(tx); | |
1262 | ||
1263 | if (txg != last_txg) | |
1264 | max_alloc = zfs_remove_max_segment; | |
1265 | last_txg = txg; | |
1266 | ||
1267 | spa_vdev_copy_impl(svr, &vca, &max_alloc, tx); | |
1268 | ||
1269 | dmu_tx_commit(tx); | |
1270 | mutex_enter(&svr->svr_lock); | |
1271 | } | |
1272 | } | |
1273 | ||
1274 | mutex_exit(&svr->svr_lock); | |
1275 | /* | |
1276 | * Wait for all copies to finish before cleaning up the vca. | |
1277 | */ | |
1278 | txg_wait_synced(spa->spa_dsl_pool, 0); | |
1279 | ASSERT0(vca.vca_outstanding_bytes); | |
1280 | ||
1281 | mutex_destroy(&vca.vca_lock); | |
1282 | cv_destroy(&vca.vca_cv); | |
1283 | ||
1284 | if (svr->svr_thread_exit) { | |
1285 | mutex_enter(&svr->svr_lock); | |
1286 | range_tree_vacate(svr->svr_allocd_segs, NULL, NULL); | |
1287 | svr->svr_thread = NULL; | |
1288 | cv_broadcast(&svr->svr_cv); | |
1289 | mutex_exit(&svr->svr_lock); | |
1290 | } else { | |
1291 | ASSERT0(range_tree_space(svr->svr_allocd_segs)); | |
1292 | vdev_remove_complete(vd); | |
1293 | } | |
1294 | } | |
1295 | ||
1296 | void | |
1297 | spa_vdev_remove_suspend(spa_t *spa) | |
1298 | { | |
1299 | spa_vdev_removal_t *svr = spa->spa_vdev_removal; | |
1300 | ||
1301 | if (svr == NULL) | |
1302 | return; | |
1303 | ||
1304 | mutex_enter(&svr->svr_lock); | |
1305 | svr->svr_thread_exit = B_TRUE; | |
1306 | while (svr->svr_thread != NULL) | |
1307 | cv_wait(&svr->svr_cv, &svr->svr_lock); | |
1308 | svr->svr_thread_exit = B_FALSE; | |
1309 | mutex_exit(&svr->svr_lock); | |
1310 | } | |
1311 | ||
1312 | /* ARGSUSED */ | |
1313 | static int | |
1314 | spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx) | |
1315 | { | |
1316 | spa_t *spa = dmu_tx_pool(tx)->dp_spa; | |
1317 | ||
1318 | if (spa->spa_vdev_removal == NULL) | |
1319 | return (ENOTACTIVE); | |
1320 | return (0); | |
1321 | } | |
1322 | ||
1323 | /* | |
1324 | * Cancel a removal by freeing all entries from the partial mapping | |
1325 | * and marking the vdev as no longer being removing. | |
1326 | */ | |
1327 | /* ARGSUSED */ | |
1328 | static void | |
1329 | spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) | |
1330 | { | |
1331 | spa_t *spa = dmu_tx_pool(tx)->dp_spa; | |
1332 | spa_vdev_removal_t *svr = spa->spa_vdev_removal; | |
1333 | vdev_t *vd = svr->svr_vdev; | |
1334 | vdev_indirect_config_t *vic = &vd->vdev_indirect_config; | |
1335 | vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; | |
1336 | objset_t *mos = spa->spa_meta_objset; | |
1337 | ||
1338 | ASSERT3P(svr->svr_thread, ==, NULL); | |
1339 | ||
1340 | spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); | |
1341 | if (vdev_obsolete_counts_are_precise(vd)) { | |
1342 | spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); | |
1343 | VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, | |
1344 | VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx)); | |
1345 | } | |
1346 | ||
1347 | if (vdev_obsolete_sm_object(vd) != 0) { | |
1348 | ASSERT(vd->vdev_obsolete_sm != NULL); | |
1349 | ASSERT3U(vdev_obsolete_sm_object(vd), ==, | |
1350 | space_map_object(vd->vdev_obsolete_sm)); | |
1351 | ||
1352 | space_map_free(vd->vdev_obsolete_sm, tx); | |
1353 | VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, | |
1354 | VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx)); | |
1355 | space_map_close(vd->vdev_obsolete_sm); | |
1356 | vd->vdev_obsolete_sm = NULL; | |
1357 | spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); | |
1358 | } | |
1359 | for (int i = 0; i < TXG_SIZE; i++) { | |
1360 | ASSERT(list_is_empty(&svr->svr_new_segments[i])); | |
1361 | ASSERT3U(svr->svr_max_offset_to_sync[i], <=, | |
1362 | vdev_indirect_mapping_max_offset(vim)); | |
1363 | } | |
1364 | ||
1365 | for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { | |
1366 | metaslab_t *msp = vd->vdev_ms[msi]; | |
1367 | ||
1368 | if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim)) | |
1369 | break; | |
1370 | ||
1371 | ASSERT0(range_tree_space(svr->svr_allocd_segs)); | |
1372 | ||
1373 | mutex_enter(&msp->ms_lock); | |
1374 | ||
1375 | /* | |
1376 | * Assert nothing in flight -- ms_*tree is empty. | |
1377 | */ | |
1378 | for (int i = 0; i < TXG_SIZE; i++) | |
1379 | ASSERT0(range_tree_space(msp->ms_alloctree[i])); | |
1380 | for (int i = 0; i < TXG_DEFER_SIZE; i++) | |
1381 | ASSERT0(range_tree_space(msp->ms_defertree[i])); | |
1382 | ASSERT0(range_tree_space(msp->ms_freedtree)); | |
1383 | ||
1384 | if (msp->ms_sm != NULL) { | |
1385 | /* | |
1386 | * Assert that the in-core spacemap has the same | |
1387 | * length as the on-disk one, so we can use the | |
1388 | * existing in-core spacemap to load it from disk. | |
1389 | */ | |
1390 | ASSERT3U(msp->ms_sm->sm_alloc, ==, | |
1391 | msp->ms_sm->sm_phys->smp_alloc); | |
1392 | ASSERT3U(msp->ms_sm->sm_length, ==, | |
1393 | msp->ms_sm->sm_phys->smp_objsize); | |
1394 | ||
1395 | mutex_enter(&svr->svr_lock); | |
1396 | VERIFY0(space_map_load(msp->ms_sm, | |
1397 | svr->svr_allocd_segs, SM_ALLOC)); | |
1398 | range_tree_walk(msp->ms_freeingtree, | |
1399 | range_tree_remove, svr->svr_allocd_segs); | |
1400 | ||
1401 | /* | |
1402 | * Clear everything past what has been synced, | |
1403 | * because we have not allocated mappings for it yet. | |
1404 | */ | |
1405 | uint64_t syncd = vdev_indirect_mapping_max_offset(vim); | |
1406 | range_tree_clear(svr->svr_allocd_segs, syncd, | |
1407 | msp->ms_sm->sm_start + msp->ms_sm->sm_size - syncd); | |
1408 | ||
1409 | mutex_exit(&svr->svr_lock); | |
1410 | } | |
1411 | mutex_exit(&msp->ms_lock); | |
1412 | ||
1413 | mutex_enter(&svr->svr_lock); | |
1414 | range_tree_vacate(svr->svr_allocd_segs, | |
1415 | free_mapped_segment_cb, vd); | |
1416 | mutex_exit(&svr->svr_lock); | |
1417 | } | |
1418 | ||
1419 | /* | |
1420 | * Note: this must happen after we invoke free_mapped_segment_cb, | |
1421 | * because it adds to the obsolete_segments. | |
1422 | */ | |
1423 | range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); | |
1424 | ||
1425 | ASSERT3U(vic->vic_mapping_object, ==, | |
1426 | vdev_indirect_mapping_object(vd->vdev_indirect_mapping)); | |
1427 | vdev_indirect_mapping_close(vd->vdev_indirect_mapping); | |
1428 | vd->vdev_indirect_mapping = NULL; | |
1429 | vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); | |
1430 | vic->vic_mapping_object = 0; | |
1431 | ||
1432 | ASSERT3U(vic->vic_births_object, ==, | |
1433 | vdev_indirect_births_object(vd->vdev_indirect_births)); | |
1434 | vdev_indirect_births_close(vd->vdev_indirect_births); | |
1435 | vd->vdev_indirect_births = NULL; | |
1436 | vdev_indirect_births_free(mos, vic->vic_births_object, tx); | |
1437 | vic->vic_births_object = 0; | |
1438 | ||
1439 | /* | |
1440 | * We may have processed some frees from the removing vdev in this | |
1441 | * txg, thus increasing svr_bytes_done; discard that here to | |
1442 | * satisfy the assertions in spa_vdev_removal_destroy(). | |
1443 | * Note that future txg's can not have any bytes_done, because | |
1444 | * future TXG's are only modified from open context, and we have | |
1445 | * already shut down the copying thread. | |
1446 | */ | |
1447 | svr->svr_bytes_done[dmu_tx_get_txg(tx) & TXG_MASK] = 0; | |
1448 | spa_finish_removal(spa, DSS_CANCELED, tx); | |
1449 | ||
1450 | vd->vdev_removing = B_FALSE; | |
1451 | vdev_config_dirty(vd); | |
1452 | ||
1453 | zfs_dbgmsg("canceled device removal for vdev %llu in %llu", | |
1454 | vd->vdev_id, dmu_tx_get_txg(tx)); | |
1455 | spa_history_log_internal(spa, "vdev remove canceled", tx, | |
1456 | "%s vdev %llu %s", spa_name(spa), | |
1457 | vd->vdev_id, (vd->vdev_path != NULL) ? vd->vdev_path : "-"); | |
1458 | } | |

int
spa_vdev_remove_cancel(spa_t *spa)
{
	spa_vdev_remove_suspend(spa);

	if (spa->spa_vdev_removal == NULL)
		return (ENOTACTIVE);

	uint64_t vdid = spa->spa_vdev_removal->svr_vdev->vdev_id;

	int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check,
	    spa_vdev_remove_cancel_sync, NULL, 0, ZFS_SPACE_CHECK_NONE);

	if (error == 0) {
		spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER);
		vdev_t *vd = vdev_lookup_top(spa, vdid);
		metaslab_group_activate(vd->vdev_mg);
		spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG);
	}

	return (error);
}
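
/*
 * Illustrative sketch (not part of this file): a hypothetical caller
 * canceling an in-progress removal would invoke the function above and
 * treat ENOTACTIVE as "no removal in progress":
 *
 *	int err = spa_vdev_remove_cancel(spa);
 *	if (err == ENOTACTIVE)
 *		zfs_dbgmsg("no device removal in progress");
 */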

/*
 * Called every sync pass of every txg if there's an svr.
 */
void
svr_sync(spa_t *spa, dmu_tx_t *tx)
{
	spa_vdev_removal_t *svr = spa->spa_vdev_removal;
	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;

	/*
	 * This check is necessary so that we do not dirty the
	 * DIRECTORY_OBJECT via spa_sync_removing_state() when there
	 * is nothing to do.  Dirtying it every time would prevent us
	 * from syncing-to-convergence.
	 */
	if (svr->svr_bytes_done[txgoff] == 0)
		return;

	/*
	 * Update progress accounting.
	 */
	spa->spa_removing_phys.sr_copied += svr->svr_bytes_done[txgoff];
	svr->svr_bytes_done[txgoff] = 0;

	spa_sync_removing_state(spa, tx);
}
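
/*
 * Worked example (illustrative): with TXG_SIZE of 4, transaction group
 * 37 maps to txgoff = 37 & TXG_MASK = 1, so bytes processed during that
 * txg accumulate in svr_bytes_done[1] until this function folds them
 * into sr_copied and zeroes the slot.
 */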

static void
vdev_remove_make_hole_and_free(vdev_t *vd)
{
	uint64_t id = vd->vdev_id;
	spa_t *spa = vd->vdev_spa;
	vdev_t *rvd = spa->spa_root_vdev;
	boolean_t last_vdev = (id == (rvd->vdev_children - 1));

	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	vdev_free(vd);

	if (last_vdev) {
		vdev_compact_children(rvd);
	} else {
		vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
		vdev_add_child(rvd, vd);
	}
	vdev_config_dirty(rvd);

	/*
	 * Reassess the health of our root vdev.
	 */
	vdev_reopen(rvd);
}
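
/*
 * Example (illustrative): removing vdev 1 from a root vdev with
 * children 0..3 leaves children 0, hole, 2, 3, so the surviving
 * top-level vdevs keep their ids; only when the last child is removed
 * can the child array be compacted instead.
 */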

/*
 * Remove a log device.  The config lock is held for the specified TXG.
 */
static int
spa_vdev_remove_log(vdev_t *vd, uint64_t *txg)
{
	metaslab_group_t *mg = vd->vdev_mg;
	spa_t *spa = vd->vdev_spa;
	int error = 0;

	ASSERT(vd->vdev_islog);
	ASSERT(vd == vd->vdev_top);

	/*
	 * Stop allocating from this vdev.
	 */
	metaslab_group_passivate(mg);

	/*
	 * Wait for the youngest allocations and frees to sync,
	 * and then wait for the deferral of those frees to finish.
	 */
	spa_vdev_config_exit(spa, NULL,
	    *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
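
	/*
	 * Worked example (illustrative): with TXG_CONCURRENT_STATES of 3
	 * and TXG_DEFER_SIZE of 2, this waits until txg + 5 has synced,
	 * covering every txg that could still hold allocations or frees
	 * for this vdev, plus the deferral of those frees.
	 */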

	/*
	 * Evacuate the device.  We don't hold the config lock as
	 * writer since we need to do I/O, but we do keep the
	 * spa_namespace_lock held.  Once this completes, the device
	 * should no longer have any blocks allocated on it.
	 */
	if (vd->vdev_islog) {
		if (vd->vdev_stat.vs_alloc != 0)
			error = spa_reset_logs(spa);
	}

	*txg = spa_vdev_config_enter(spa);

	if (error != 0) {
		metaslab_group_activate(mg);
		return (error);
	}
	ASSERT0(vd->vdev_stat.vs_alloc);

	/*
	 * The evacuation succeeded.  Remove any remaining MOS metadata
	 * associated with this vdev, and wait for these changes to sync.
	 */
	vd->vdev_removing = B_TRUE;

	vdev_dirty_leaves(vd, VDD_DTL, *txg);
	vdev_config_dirty(vd);

	spa_history_log_internal(spa, "vdev remove", NULL,
	    "%s vdev %llu (log) %s", spa_name(spa), vd->vdev_id,
	    (vd->vdev_path != NULL) ? vd->vdev_path : "-");

	spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG);

	*txg = spa_vdev_config_enter(spa);

	sysevent_t *ev = spa_event_create(spa, vd, NULL,
	    ESC_ZFS_VDEV_REMOVE_DEV);
	ASSERT(MUTEX_HELD(&spa_namespace_lock));
	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);

	/* The top ZAP should have been destroyed by vdev_remove_empty. */
	ASSERT0(vd->vdev_top_zap);
	/* The leaf ZAP should have been destroyed by vdev_dtl_sync. */
	ASSERT0(vd->vdev_leaf_zap);

	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);

	if (list_link_active(&vd->vdev_state_dirty_node))
		vdev_state_clean(vd);
	if (list_link_active(&vd->vdev_config_dirty_node))
		vdev_config_clean(vd);

	/*
	 * Clean up the vdev namespace.
	 */
	vdev_remove_make_hole_and_free(vd);

	if (ev != NULL)
		spa_event_post(ev);

	return (0);
}

static int
spa_vdev_remove_top_check(vdev_t *vd)
{
	spa_t *spa = vd->vdev_spa;

	if (vd != vd->vdev_top)
		return (SET_ERROR(ENOTSUP));

	if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL))
		return (SET_ERROR(ENOTSUP));

	/*
	 * There has to be enough free space to remove the
	 * device and leave double the "slop" space (i.e. we
	 * must leave at least 3% of the pool free, in addition to
	 * the normal slop space).
	 */
	if (dsl_dir_space_available(spa->spa_dsl_pool->dp_root_dir,
	    NULL, 0, B_TRUE) <
	    vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) {
		return (SET_ERROR(ENOSPC));
	}
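
	/*
	 * Worked example (illustrative): spa_get_slop_space() is roughly
	 * 1/32 (~3%) of the pool, so removing a 1 TB vdev from a 32 TB
	 * pool requires about 1 TB + 1 TB of available space elsewhere;
	 * the exact amount depends on spa_slop_shift.
	 */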

	/*
	 * There cannot already be a removal in progress.
	 */
	if (spa->spa_removing_phys.sr_state == DSS_SCANNING)
		return (SET_ERROR(EBUSY));

	/*
	 * The device must have all its data.
	 */
	if (!vdev_dtl_empty(vd, DTL_MISSING) ||
	    !vdev_dtl_empty(vd, DTL_OUTAGE))
		return (SET_ERROR(EBUSY));

	/*
	 * The device must be healthy.
	 */
	if (!vdev_readable(vd))
		return (SET_ERROR(EIO));

	/*
	 * All vdevs in normal class must have the same ashift.
	 */
	if (spa->spa_max_ashift != spa->spa_min_ashift) {
		return (SET_ERROR(EINVAL));
	}

	/*
	 * In addition to the ashift requirement checked above, no vdev
	 * in the normal class may be raidz, and any mirror must consist
	 * of leaf vdevs only.
	 */
	vdev_t *rvd = spa->spa_root_vdev;
	int num_indirect = 0;
	for (uint64_t id = 0; id < rvd->vdev_children; id++) {
		vdev_t *cvd = rvd->vdev_child[id];
		if (cvd->vdev_ashift != 0 && !cvd->vdev_islog)
			ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift);
		if (cvd->vdev_ops == &vdev_indirect_ops)
			num_indirect++;
		if (!vdev_is_concrete(cvd))
			continue;
		if (cvd->vdev_ops == &vdev_raidz_ops)
			return (SET_ERROR(EINVAL));
		/*
		 * A mirror must be a mirror of leaf vdevs only.
		 */
		if (cvd->vdev_ops == &vdev_mirror_ops) {
			for (uint64_t cid = 0;
			    cid < cvd->vdev_children; cid++) {
				if (!cvd->vdev_child[cid]->vdev_ops->
				    vdev_op_leaf)
					return (SET_ERROR(EINVAL));
			}
		}
	}

	return (0);
}
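
/*
 * Example (illustrative): a pool made of plain disks and two-way
 * mirrors with a uniform ashift passes the checks above; a pool
 * containing any raidz top-level vdev, or a mirror with non-leaf
 * children, fails with EINVAL.
 */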

/*
 * Initiate removal of a top-level vdev, reducing the total space in the pool.
 * The config lock is held for the specified TXG.  Once initiated,
 * evacuation of all allocated space (copying it to other vdevs) happens
 * in the background (see spa_vdev_remove_thread()), and can be canceled
 * (see spa_vdev_remove_cancel()).  If successful, the vdev will
 * be transformed to an indirect vdev (see spa_vdev_remove_complete()).
 */
static int
spa_vdev_remove_top(vdev_t *vd, uint64_t *txg)
{
	spa_t *spa = vd->vdev_spa;
	int error;

	/*
	 * Check for errors up-front, so that we don't waste time
	 * passivating the metaslab group and clearing the ZIL if there
	 * are errors.
	 */
	error = spa_vdev_remove_top_check(vd);
	if (error != 0)
		return (error);

	/*
	 * Stop allocating from this vdev.  Note that we must check
	 * that this is not the only device in the pool before
	 * passivating, otherwise we will not be able to make
	 * progress because we can't allocate from any vdevs.
	 * The above check for sufficient free space serves this
	 * purpose.
	 */
	metaslab_group_t *mg = vd->vdev_mg;
	metaslab_group_passivate(mg);

	/*
	 * Wait for the youngest allocations and frees to sync,
	 * and then wait for the deferral of those frees to finish.
	 */
	spa_vdev_config_exit(spa, NULL,
	    *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);

	/*
	 * We must ensure that no "stubby" log blocks are allocated
	 * on the device to be removed.  These blocks could be
	 * written at any time, including while we are in the middle
	 * of copying them.
	 */
	error = spa_reset_logs(spa);

	*txg = spa_vdev_config_enter(spa);

	/*
	 * Things might have changed while the config lock was dropped
	 * (e.g. space usage).  Check for errors again.
	 */
	if (error == 0)
		error = spa_vdev_remove_top_check(vd);

	if (error != 0) {
		metaslab_group_activate(mg);
		return (error);
	}

	vd->vdev_removing = B_TRUE;

	vdev_dirty_leaves(vd, VDD_DTL, *txg);
	vdev_config_dirty(vd);
	dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg);
	dsl_sync_task_nowait(spa->spa_dsl_pool,
	    vdev_remove_initiate_sync,
	    vd, 0, ZFS_SPACE_CHECK_NONE, tx);
	dmu_tx_commit(tx);

	return (0);
}
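
/*
 * Illustrative note: the dmu_tx_create_assigned() + dsl_sync_task_nowait()
 * pairing above is how open context schedules work for syncing context:
 * vdev_remove_initiate_sync() will run from the sync thread in the same
 * txg, where it can safely update the on-disk removal state.
 */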

/*
 * Remove a device from the pool.
 *
 * Removing a device from the vdev namespace requires several steps
 * and can take a significant amount of time.  As a result we use
 * the spa_vdev_config_[enter/exit] functions which allow us to
 * grab and release the spa_config_lock while still holding the namespace
 * lock.  During each step the configuration is synced out.
 */
int
spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
{
	vdev_t *vd;
	nvlist_t **spares, **l2cache, *nv;
	uint64_t txg = 0;
	uint_t nspares, nl2cache;
	int error = 0;
	boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
	sysevent_t *ev = NULL;

	ASSERT(spa_writeable(spa));

	if (!locked)
		txg = spa_vdev_enter(spa);

	vd = spa_lookup_by_guid(spa, guid, B_FALSE);

	if (spa->spa_spares.sav_vdevs != NULL &&
	    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
	    (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
		/*
		 * Only remove the hot spare if it's not currently in use
		 * in this pool.
		 */
		if (vd == NULL || unspare) {
			if (vd == NULL)
				vd = spa_lookup_by_guid(spa, guid, B_TRUE);
			ev = spa_event_create(spa, vd, NULL,
			    ESC_ZFS_VDEV_REMOVE_AUX);

			char *nvstr = fnvlist_lookup_string(nv,
			    ZPOOL_CONFIG_PATH);
			spa_history_log_internal(spa, "vdev remove", NULL,
			    "%s vdev (%s) %s", spa_name(spa),
			    VDEV_TYPE_SPARE, nvstr);
			spa_vdev_remove_aux(spa->spa_spares.sav_config,
			    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
			spa_load_spares(spa);
			spa->spa_spares.sav_sync = B_TRUE;
		} else {
			error = SET_ERROR(EBUSY);
		}
	} else if (spa->spa_l2cache.sav_vdevs != NULL &&
	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
	    (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
		char *nvstr = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH);
		spa_history_log_internal(spa, "vdev remove", NULL,
		    "%s vdev (%s) %s", spa_name(spa), VDEV_TYPE_L2CACHE, nvstr);
		/*
		 * Cache devices can always be removed.
		 */
		vd = spa_lookup_by_guid(spa, guid, B_TRUE);
		ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX);
		spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
		spa_load_l2cache(spa);
		spa->spa_l2cache.sav_sync = B_TRUE;
	} else if (vd != NULL && vd->vdev_islog) {
		ASSERT(!locked);
		error = spa_vdev_remove_log(vd, &txg);
	} else if (vd != NULL) {
		ASSERT(!locked);
		error = spa_vdev_remove_top(vd, &txg);
	} else {
		/*
		 * There is no vdev of any kind with the specified guid.
		 */
		error = SET_ERROR(ENOENT);
	}

	if (!locked)
		error = spa_vdev_exit(spa, NULL, txg, error);

	if (ev != NULL)
		spa_event_post(ev);

	return (error);
}
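
/*
 * Illustrative sketch (hypothetical caller): given a device's guid, a
 * caller that does not already hold the namespace lock can initiate
 * removal directly; locking and config syncing are handled internally:
 *
 *	int err = spa_vdev_remove(spa, guid, B_FALSE);
 */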

int
spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs)
{
	prs->prs_state = spa->spa_removing_phys.sr_state;

	if (prs->prs_state == DSS_NONE)
		return (SET_ERROR(ENOENT));

	prs->prs_removing_vdev = spa->spa_removing_phys.sr_removing_vdev;
	prs->prs_start_time = spa->spa_removing_phys.sr_start_time;
	prs->prs_end_time = spa->spa_removing_phys.sr_end_time;
	prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy;
	prs->prs_copied = spa->spa_removing_phys.sr_copied;

	if (spa->spa_vdev_removal != NULL) {
		for (int i = 0; i < TXG_SIZE; i++) {
			prs->prs_copied +=
			    spa->spa_vdev_removal->svr_bytes_done[i];
		}
	}

	prs->prs_mapping_memory = 0;
	uint64_t indirect_vdev_id =
	    spa->spa_removing_phys.sr_prev_indirect_vdev;
	while (indirect_vdev_id != -1) {
		vdev_t *vd = spa->spa_root_vdev->vdev_child[indirect_vdev_id];
		vdev_indirect_config_t *vic = &vd->vdev_indirect_config;
		vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;

		ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops);
		prs->prs_mapping_memory += vdev_indirect_mapping_size(vim);
		indirect_vdev_id = vic->vic_prev_indirect_vdev;
	}

	return (0);
}
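
/*
 * Illustrative sketch (hypothetical consumer): status reporting could
 * turn these stats into a percent-complete figure:
 *
 *	pool_removal_stat_t prs;
 *	if (spa_removal_get_stats(spa, &prs) == 0 && prs.prs_to_copy != 0) {
 *		uint64_t pct = prs.prs_copied * 100 / prs.prs_to_copy;
 *		zfs_dbgmsg("removal %llu%% complete", (u_longlong_t)pct);
 *	}
 */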

#if defined(_KERNEL) && defined(HAVE_SPL)
module_param(zfs_remove_max_segment, int, 0644);
MODULE_PARM_DESC(zfs_remove_max_segment,
	"Largest contiguous segment to allocate when removing device");

EXPORT_SYMBOL(free_from_removing_vdev);
EXPORT_SYMBOL(spa_removal_get_stats);
EXPORT_SYMBOL(spa_remove_init);
EXPORT_SYMBOL(spa_restart_removal);
EXPORT_SYMBOL(spa_vdev_removal_destroy);
EXPORT_SYMBOL(spa_vdev_remove);
EXPORT_SYMBOL(spa_vdev_remove_cancel);
EXPORT_SYMBOL(spa_vdev_remove_suspend);
EXPORT_SYMBOL(svr_sync);
#endif