]>
Commit | Line | Data |
---|---|---|
d2734cce SD |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
1d3ba0bf | 9 | * or https://opensource.org/licenses/CDDL-1.0. |
d2734cce SD |
10 | * See the License for the specific language governing permissions |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | ||
22 | /* | |
23 | * Copyright (c) 2017 by Delphix. All rights reserved. | |
24 | */ | |
25 | ||
26 | /* | |
27 | * Storage Pool Checkpoint | |
28 | * | |
29 | * A storage pool checkpoint can be thought of as a pool-wide snapshot or | |
30 | * a stable version of extreme rewind that guarantees no blocks from the | |
31 | * checkpointed state will have been overwritten. It remembers the entire | |
32 | * state of the storage pool (e.g. snapshots, dataset names, etc..) from the | |
33 | * point that it was taken and the user can rewind back to that point even if | |
34 | * they applied destructive operations on their datasets or even enabled new | |
35 | * zpool on-disk features. If a pool has a checkpoint that is no longer | |
36 | * needed, the user can discard it. | |
37 | * | |
38 | * == On disk data structures used == | |
39 | * | |
40 | * - The pool has a new feature flag and a new entry in the MOS. The feature | |
41 | * flag is set to active when we create the checkpoint and remains active | |
42 | * until the checkpoint is fully discarded. The entry in the MOS config | |
43 | * (DMU_POOL_ZPOOL_CHECKPOINT) is populated with the uberblock that | |
44 | * references the state of the pool when we take the checkpoint. The entry | |
45 | * remains populated until we start discarding the checkpoint or we rewind | |
46 | * back to it. | |
47 | * | |
48 | * - Each vdev contains a vdev-wide space map while the pool has a checkpoint, | |
49 | * which persists until the checkpoint is fully discarded. The space map | |
50 | * contains entries that have been freed in the current state of the pool | |
51 | * but we want to keep around in case we decide to rewind to the checkpoint. | |
52 | * [see vdev_checkpoint_sm] | |
53 | * | |
54 | * - Each metaslab's ms_sm space map behaves the same as without the | |
55 | * checkpoint, with the only exception being the scenario when we free | |
56 | * blocks that belong to the checkpoint. In this case, these blocks remain | |
57 | * ALLOCATED in the metaslab's space map and they are added as FREE in the | |
58 | * vdev's checkpoint space map. | |
59 | * | |
60 | * - Each uberblock has a field (ub_checkpoint_txg) which holds the txg that | |
61 | * the uberblock was checkpointed. For normal uberblocks this field is 0. | |
62 | * | |
63 | * == Overview of operations == | |
64 | * | |
65 | * - To create a checkpoint, we first wait for the current TXG to be synced, | |
66 | * so we can use the most recently synced uberblock (spa_ubsync) as the | |
67 | * checkpointed uberblock. Then we use an early synctask to place that | |
68 | * uberblock in MOS config, increment the feature flag for the checkpoint | |
69 | * (marking it active), and set spa_checkpoint_txg (see its use below) | |
70 | * to the TXG of the checkpointed uberblock. We use an early synctask for | |
71 | * the aforementioned operations to ensure that no blocks were dirtied | |
72 | * between the current TXG and the TXG of the checkpointed uberblock | |
73 | * (i.e. the previous txg). | |
74 | * | |
75 | * - When a checkpoint exists, we need to ensure that the blocks that | |
76 | * belong to the checkpoint are freed but never reused. This means that | |
77 | * these blocks should never end up in the ms_allocatable or the ms_freeing | |
78 | * trees of a metaslab. Therefore, whenever there is a checkpoint the new | |
79 | * ms_checkpointing tree is used in addition to the aforementioned ones. | |
80 | * | |
81 | * Whenever a block is freed and we find out that it is referenced by the | |
82 | * checkpoint (we find out by comparing its birth to spa_checkpoint_txg), | |
83 | * we place it in the ms_checkpointing tree instead of the ms_freeing tree. | |
84 | * This way, we divide the blocks that are being freed into checkpointed | |
85 | * and not-checkpointed blocks. | |
86 | * | |
87 | * In order to persist these frees, we write the extents from the | |
88 | * ms_freeing tree to the ms_sm as usual, and the extents from the | |
89 | * ms_checkpointing tree to the vdev_checkpoint_sm. This way, these | |
90 | * checkpointed extents will remain allocated in the metaslab's ms_sm space | |
91 | * map, and therefore won't be reused [see metaslab_sync()]. In addition, | |
92 | * when we discard the checkpoint, we can find the entries that have | |
93 | * actually been freed in vdev_checkpoint_sm. | |
94 | * [see spa_checkpoint_discard_thread_sync()] | |
95 | * | |
96 | * - To discard the checkpoint we use an early synctask to delete the | |
97 | * checkpointed uberblock from the MOS config, set spa_checkpoint_txg to 0, | |
98 | * and wakeup the discarding zthr thread (an open-context async thread). | |
99 | * We use an early synctask to ensure that the operation happens before any | |
100 | * new data end up in the checkpoint's data structures. | |
101 | * | |
102 | * Once the synctask is done and the discarding zthr is awake, we discard | |
103 | * the checkpointed data over multiple TXGs by having the zthr prefetching | |
104 | * entries from vdev_checkpoint_sm and then starting a synctask that places | |
e1cfd73f | 105 | * them as free blocks into their respective ms_allocatable and ms_sm |
d2734cce SD |
106 | * structures. |
107 | * [see spa_checkpoint_discard_thread()] | |
108 | * | |
109 | * When there are no entries left in the vdev_checkpoint_sm of all | |
110 | * top-level vdevs, a final synctask runs that decrements the feature flag. | |
111 | * | |
112 | * - To rewind to the checkpoint, we first use the current uberblock and | |
113 | * open the MOS so we can access the checkpointed uberblock from the MOS | |
114 | * config. After we retrieve the checkpointed uberblock, we use it as the | |
115 | * current uberblock for the pool by writing it to disk with an updated | |
116 | * TXG, opening its version of the MOS, and moving on as usual from there. | |
117 | * [see spa_ld_checkpoint_rewind()] | |
118 | * | |
119 | * An important note on rewinding to the checkpoint has to do with how we | |
120 | * handle ZIL blocks. In the scenario of a rewind, we clear out any ZIL | |
121 | * blocks that have not been claimed by the time we took the checkpoint | |
122 | * as they should no longer be valid. | |
123 | * [see comment in zil_claim()] | |
124 | * | |
125 | * == Miscellaneous information == | |
126 | * | |
127 | * - In the hypothetical event that we take a checkpoint, remove a vdev, | |
128 | * and attempt to rewind, the rewind would fail as the checkpointed | |
129 | * uberblock would reference data in the removed device. For this reason | |
130 | * and others of similar nature, we disallow the following operations that | |
131 | * can change the config: | |
132 | * vdev removal and attach/detach, mirror splitting, and pool reguid. | |
133 | * | |
134 | * - As most of the checkpoint logic is implemented in the SPA and doesn't | |
135 | * distinguish datasets when it comes to space accounting, having a | |
136 | * checkpoint can potentially break the boundaries set by dataset | |
137 | * reservations. | |
138 | */ | |
139 | ||
140 | #include <sys/dmu_tx.h> | |
141 | #include <sys/dsl_dir.h> | |
142 | #include <sys/dsl_synctask.h> | |
143 | #include <sys/metaslab_impl.h> | |
144 | #include <sys/spa.h> | |
145 | #include <sys/spa_impl.h> | |
146 | #include <sys/spa_checkpoint.h> | |
147 | #include <sys/vdev_impl.h> | |
148 | #include <sys/zap.h> | |
149 | #include <sys/zfeature.h> | |
150 | ||
151 | /* | |
152 | * The following parameter limits the amount of memory to be used for the | |
153 | * prefetching of the checkpoint space map done on each vdev while | |
154 | * discarding the checkpoint. | |
155 | * | |
156 | * The reason it exists is because top-level vdevs with long checkpoint | |
157 | * space maps can potentially take up a lot of memory depending on the | |
158 | * amount of checkpointed data that has been freed within them while | |
159 | * the pool had a checkpoint. | |
160 | */ | |
/* Defaults to 16 MiB; registered as the discard_memory_limit module param. */
static uint64_t zfs_spa_discard_memory_limit = 16ULL << 20;
d2734cce SD |
162 | |
163 | int | |
164 | spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs) | |
165 | { | |
166 | if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) | |
167 | return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT)); | |
168 | ||
861166b0 | 169 | memset(pcs, 0, sizeof (pool_checkpoint_stat_t)); |
d2734cce SD |
170 | |
171 | int error = zap_contains(spa_meta_objset(spa), | |
172 | DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT); | |
173 | ASSERT(error == 0 || error == ENOENT); | |
174 | ||
175 | if (error == ENOENT) | |
176 | pcs->pcs_state = CS_CHECKPOINT_DISCARDING; | |
177 | else | |
178 | pcs->pcs_state = CS_CHECKPOINT_EXISTS; | |
179 | ||
180 | pcs->pcs_space = spa->spa_checkpoint_info.sci_dspace; | |
181 | pcs->pcs_start_time = spa->spa_checkpoint_info.sci_timestamp; | |
182 | ||
183 | return (0); | |
184 | } | |
185 | ||
186 | static void | |
187 | spa_checkpoint_discard_complete_sync(void *arg, dmu_tx_t *tx) | |
188 | { | |
189 | spa_t *spa = arg; | |
190 | ||
191 | spa->spa_checkpoint_info.sci_timestamp = 0; | |
192 | ||
193 | spa_feature_decr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx); | |
e60e158e | 194 | spa_notify_waiters(spa); |
d2734cce SD |
195 | |
196 | spa_history_log_internal(spa, "spa discard checkpoint", tx, | |
197 | "finished discarding checkpointed state from the pool"); | |
198 | } | |
199 | ||
/*
 * State handed to spa_checkpoint_discard_sync_callback() for each entry of
 * a vdev's checkpoint space map.
 */
typedef struct spa_checkpoint_discard_sync_callback_arg {
	/* vdev whose metaslabs receive the discarded segments */
	vdev_t *sdc_vd;
	/* txg passed to vdev_dirty() when a metaslab is first dirtied */
	uint64_t sdc_txg;
	/* entries still allowed; the callback returns EINTR once it hits 0 */
	uint64_t sdc_entry_limit;
} spa_checkpoint_discard_sync_callback_arg_t;
205 | ||
206 | static int | |
4d044c4c | 207 | spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg) |
d2734cce SD |
208 | { |
209 | spa_checkpoint_discard_sync_callback_arg_t *sdc = arg; | |
210 | vdev_t *vd = sdc->sdc_vd; | |
4d044c4c SD |
211 | metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; |
212 | uint64_t end = sme->sme_offset + sme->sme_run; | |
d2734cce SD |
213 | |
214 | if (sdc->sdc_entry_limit == 0) | |
28caa74b | 215 | return (SET_ERROR(EINTR)); |
d2734cce SD |
216 | |
217 | /* | |
218 | * Since the space map is not condensed, we know that | |
219 | * none of its entries is crossing the boundaries of | |
220 | * its respective metaslab. | |
221 | * | |
222 | * That said, there is no fundamental requirement that | |
223 | * the checkpoint's space map entries should not cross | |
224 | * metaslab boundaries. So if needed we could add code | |
225 | * that handles metaslab-crossing segments in the future. | |
226 | */ | |
4d044c4c SD |
227 | VERIFY3U(sme->sme_type, ==, SM_FREE); |
228 | VERIFY3U(sme->sme_offset, >=, ms->ms_start); | |
d2734cce SD |
229 | VERIFY3U(end, <=, ms->ms_start + ms->ms_size); |
230 | ||
231 | /* | |
232 | * At this point we should not be processing any | |
233 | * other frees concurrently, so the lock is technically | |
234 | * unnecessary. We use the lock anyway though to | |
235 | * potentially save ourselves from future headaches. | |
236 | */ | |
237 | mutex_enter(&ms->ms_lock); | |
238 | if (range_tree_is_empty(ms->ms_freeing)) | |
239 | vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg); | |
4d044c4c | 240 | range_tree_add(ms->ms_freeing, sme->sme_offset, sme->sme_run); |
d2734cce SD |
241 | mutex_exit(&ms->ms_lock); |
242 | ||
4d044c4c SD |
243 | ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=, |
244 | sme->sme_run); | |
245 | ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, sme->sme_run); | |
d2734cce | 246 | |
4d044c4c SD |
247 | vd->vdev_spa->spa_checkpoint_info.sci_dspace -= sme->sme_run; |
248 | vd->vdev_stat.vs_checkpoint_space -= sme->sme_run; | |
d2734cce SD |
249 | sdc->sdc_entry_limit--; |
250 | ||
251 | return (0); | |
252 | } | |
253 | ||
#ifdef ZFS_DEBUG
/*
 * Debug-only consistency check of the checkpoint space accounting.
 *
 * For every child of the root vdev that has a checkpoint space map, the
 * running sum of the negated space_map_allocated() values must match the
 * running sum of vs_checkpoint_space; children without one must report no
 * checkpoint space. The grand total must equal the pool-wide sci_dspace.
 */
static void
spa_checkpoint_accounting_verify(spa_t *spa)
{
	vdev_t *root = spa->spa_root_vdev;
	uint64_t sm_total = 0;
	uint64_t stat_total = 0;

	for (uint64_t i = 0; i < root->vdev_children; i++) {
		vdev_t *tvd = root->vdev_child[i];

		if (tvd->vdev_checkpoint_sm == NULL) {
			ASSERT0(tvd->vdev_stat.vs_checkpoint_space);
			continue;
		}

		sm_total += -space_map_allocated(tvd->vdev_checkpoint_sm);
		stat_total += tvd->vdev_stat.vs_checkpoint_space;
		ASSERT3U(sm_total, ==, stat_total);
	}
	ASSERT3U(spa->spa_checkpoint_info.sci_dspace, ==, sm_total);
}
#endif
279 | ||
280 | static void | |
281 | spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) | |
282 | { | |
283 | vdev_t *vd = arg; | |
284 | int error; | |
285 | ||
286 | /* | |
287 | * The space map callback is applied only to non-debug entries. | |
288 | * Because the number of debug entries is less or equal to the | |
289 | * number of non-debug entries, we want to ensure that we only | |
290 | * read what we prefetched from open-context. | |
291 | * | |
292 | * Thus, we set the maximum entries that the space map callback | |
293 | * will be applied to be half the entries that could fit in the | |
294 | * imposed memory limit. | |
4d044c4c SD |
295 | * |
296 | * Note that since this is a conservative estimate we also | |
297 | * assume the worst case scenario in our computation where each | |
298 | * entry is two-word. | |
d2734cce SD |
299 | */ |
300 | uint64_t max_entry_limit = | |
4d044c4c | 301 | (zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1; |
d2734cce SD |
302 | |
303 | /* | |
304 | * Iterate from the end of the space map towards the beginning, | |
305 | * placing its entries on ms_freeing and removing them from the | |
306 | * space map. The iteration stops if one of the following | |
307 | * conditions is true: | |
308 | * | |
309 | * 1] We reached the beginning of the space map. At this point | |
310 | * the space map should be completely empty and | |
311 | * space_map_incremental_destroy should have returned 0. | |
312 | * The next step would be to free and close the space map | |
313 | * and remove its entry from its vdev's top zap. This allows | |
314 | * spa_checkpoint_discard_thread() to move on to the next vdev. | |
315 | * | |
316 | * 2] We reached the memory limit (amount of memory used to hold | |
317 | * space map entries in memory) and space_map_incremental_destroy | |
318 | * returned EINTR. This means that there are entries remaining | |
319 | * in the space map that will be cleared in a future invocation | |
320 | * of this function by spa_checkpoint_discard_thread(). | |
321 | */ | |
322 | spa_checkpoint_discard_sync_callback_arg_t sdc; | |
323 | sdc.sdc_vd = vd; | |
324 | sdc.sdc_txg = tx->tx_txg; | |
4d044c4c | 325 | sdc.sdc_entry_limit = max_entry_limit; |
d2734cce | 326 | |
4d044c4c SD |
327 | uint64_t words_before = |
328 | space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); | |
d2734cce SD |
329 | |
330 | error = space_map_incremental_destroy(vd->vdev_checkpoint_sm, | |
331 | spa_checkpoint_discard_sync_callback, &sdc, tx); | |
332 | ||
4d044c4c | 333 | uint64_t words_after = |
d2734cce SD |
334 | space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); |
335 | ||
336 | #ifdef ZFS_DEBUG | |
337 | spa_checkpoint_accounting_verify(vd->vdev_spa); | |
338 | #endif | |
339 | ||
8e739b2c | 340 | zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %lld, " |
4d044c4c | 341 | "deleted %llu words - %llu words are left", |
8e739b2c RE |
342 | (u_longlong_t)tx->tx_txg, (longlong_t)vd->vdev_id, |
343 | (u_longlong_t)(words_before - words_after), | |
344 | (u_longlong_t)words_after); | |
d2734cce SD |
345 | |
346 | if (error != EINTR) { | |
347 | if (error != 0) { | |
8e739b2c | 348 | zfs_panic_recover("zfs: error %lld was returned " |
d2734cce | 349 | "while incrementally destroying the checkpoint " |
f272960d | 350 | "space map of vdev %llu\n", |
8e739b2c | 351 | (longlong_t)error, vd->vdev_id); |
d2734cce | 352 | } |
4d044c4c | 353 | ASSERT0(words_after); |
425d3237 | 354 | ASSERT0(space_map_allocated(vd->vdev_checkpoint_sm)); |
4d044c4c | 355 | ASSERT0(space_map_length(vd->vdev_checkpoint_sm)); |
d2734cce SD |
356 | |
357 | space_map_free(vd->vdev_checkpoint_sm, tx); | |
358 | space_map_close(vd->vdev_checkpoint_sm); | |
359 | vd->vdev_checkpoint_sm = NULL; | |
360 | ||
4d044c4c | 361 | VERIFY0(zap_remove(spa_meta_objset(vd->vdev_spa), |
d2734cce SD |
362 | vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx)); |
363 | } | |
364 | } | |
365 | ||
366 | static boolean_t | |
367 | spa_checkpoint_discard_is_done(spa_t *spa) | |
368 | { | |
369 | vdev_t *rvd = spa->spa_root_vdev; | |
370 | ||
371 | ASSERT(!spa_has_checkpoint(spa)); | |
372 | ASSERT(spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)); | |
373 | ||
374 | for (uint64_t c = 0; c < rvd->vdev_children; c++) { | |
375 | if (rvd->vdev_child[c]->vdev_checkpoint_sm != NULL) | |
376 | return (B_FALSE); | |
377 | ASSERT0(rvd->vdev_child[c]->vdev_stat.vs_checkpoint_space); | |
378 | } | |
379 | ||
380 | return (B_TRUE); | |
381 | } | |
382 | ||
d2734cce SD |
383 | boolean_t |
384 | spa_checkpoint_discard_thread_check(void *arg, zthr_t *zthr) | |
385 | { | |
14e4e3cb | 386 | (void) zthr; |
d2734cce SD |
387 | spa_t *spa = arg; |
388 | ||
389 | if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) | |
390 | return (B_FALSE); | |
391 | ||
392 | if (spa_has_checkpoint(spa)) | |
393 | return (B_FALSE); | |
394 | ||
395 | return (B_TRUE); | |
396 | } | |
397 | ||
61c3391a | 398 | void |
d2734cce SD |
399 | spa_checkpoint_discard_thread(void *arg, zthr_t *zthr) |
400 | { | |
401 | spa_t *spa = arg; | |
402 | vdev_t *rvd = spa->spa_root_vdev; | |
403 | ||
404 | for (uint64_t c = 0; c < rvd->vdev_children; c++) { | |
405 | vdev_t *vd = rvd->vdev_child[c]; | |
406 | ||
407 | while (vd->vdev_checkpoint_sm != NULL) { | |
408 | space_map_t *checkpoint_sm = vd->vdev_checkpoint_sm; | |
409 | int numbufs; | |
410 | dmu_buf_t **dbp; | |
411 | ||
412 | if (zthr_iscancelled(zthr)) | |
61c3391a | 413 | return; |
d2734cce SD |
414 | |
415 | ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops); | |
416 | ||
417 | uint64_t size = MIN(space_map_length(checkpoint_sm), | |
418 | zfs_spa_discard_memory_limit); | |
419 | uint64_t offset = | |
420 | space_map_length(checkpoint_sm) - size; | |
421 | ||
422 | /* | |
423 | * Ensure that the part of the space map that will | |
424 | * be destroyed by the synctask, is prefetched in | |
425 | * memory before the synctask runs. | |
426 | */ | |
427 | int error = dmu_buf_hold_array_by_bonus( | |
428 | checkpoint_sm->sm_dbuf, offset, size, | |
429 | B_TRUE, FTAG, &numbufs, &dbp); | |
430 | if (error != 0) { | |
431 | zfs_panic_recover("zfs: error %d was returned " | |
432 | "while prefetching checkpoint space map " | |
433 | "entries of vdev %llu\n", | |
434 | error, vd->vdev_id); | |
435 | } | |
436 | ||
437 | VERIFY0(dsl_sync_task(spa->spa_name, NULL, | |
438 | spa_checkpoint_discard_thread_sync, vd, | |
439 | 0, ZFS_SPACE_CHECK_NONE)); | |
440 | ||
441 | dmu_buf_rele_array(dbp, numbufs, FTAG); | |
442 | } | |
443 | } | |
444 | ||
445 | VERIFY(spa_checkpoint_discard_is_done(spa)); | |
446 | VERIFY0(spa->spa_checkpoint_info.sci_dspace); | |
447 | VERIFY0(dsl_sync_task(spa->spa_name, NULL, | |
448 | spa_checkpoint_discard_complete_sync, spa, | |
449 | 0, ZFS_SPACE_CHECK_NONE)); | |
d2734cce SD |
450 | } |
451 | ||
452 | ||
d2734cce SD |
453 | static int |
454 | spa_checkpoint_check(void *arg, dmu_tx_t *tx) | |
455 | { | |
14e4e3cb | 456 | (void) arg; |
d2734cce SD |
457 | spa_t *spa = dmu_tx_pool(tx)->dp_spa; |
458 | ||
459 | if (!spa_feature_is_enabled(spa, SPA_FEATURE_POOL_CHECKPOINT)) | |
460 | return (SET_ERROR(ENOTSUP)); | |
461 | ||
462 | if (!spa_top_vdevs_spacemap_addressable(spa)) | |
463 | return (SET_ERROR(ZFS_ERR_VDEV_TOO_BIG)); | |
464 | ||
c40a1124 | 465 | if (spa->spa_removing_phys.sr_state == DSS_SCANNING) |
d2734cce SD |
466 | return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS)); |
467 | ||
5caeef02 DB |
468 | if (spa->spa_raidz_expand != NULL) |
469 | return (SET_ERROR(ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS)); | |
470 | ||
d2734cce SD |
471 | if (spa->spa_checkpoint_txg != 0) |
472 | return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS)); | |
473 | ||
474 | if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) | |
475 | return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT)); | |
476 | ||
477 | return (0); | |
478 | } | |
479 | ||
d2734cce SD |
480 | static void |
481 | spa_checkpoint_sync(void *arg, dmu_tx_t *tx) | |
482 | { | |
14e4e3cb | 483 | (void) arg; |
d2734cce SD |
484 | dsl_pool_t *dp = dmu_tx_pool(tx); |
485 | spa_t *spa = dp->dp_spa; | |
486 | uberblock_t checkpoint = spa->spa_ubsync; | |
487 | ||
488 | /* | |
489 | * At this point, there should not be a checkpoint in the MOS. | |
490 | */ | |
491 | ASSERT3U(zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, | |
492 | DMU_POOL_ZPOOL_CHECKPOINT), ==, ENOENT); | |
493 | ||
494 | ASSERT0(spa->spa_checkpoint_info.sci_timestamp); | |
495 | ASSERT0(spa->spa_checkpoint_info.sci_dspace); | |
496 | ||
497 | /* | |
498 | * Since the checkpointed uberblock is the one that just got synced | |
499 | * (we use spa_ubsync), its txg must be equal to the txg number of | |
500 | * the txg we are syncing, minus 1. | |
501 | */ | |
502 | ASSERT3U(checkpoint.ub_txg, ==, spa->spa_syncing_txg - 1); | |
503 | ||
504 | /* | |
505 | * Once the checkpoint is in place, we need to ensure that none of | |
506 | * its blocks will be marked for reuse after it has been freed. | |
507 | * When there is a checkpoint and a block is freed, we compare its | |
508 | * birth txg to the txg of the checkpointed uberblock to see if the | |
509 | * block is part of the checkpoint or not. Therefore, we have to set | |
510 | * spa_checkpoint_txg before any frees happen in this txg (which is | |
511 | * why this is done as an early_synctask as explained in the comment | |
512 | * in spa_checkpoint()). | |
513 | */ | |
514 | spa->spa_checkpoint_txg = checkpoint.ub_txg; | |
515 | spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; | |
516 | ||
517 | checkpoint.ub_checkpoint_txg = checkpoint.ub_txg; | |
518 | VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset, | |
519 | DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, | |
520 | sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), | |
521 | &checkpoint, tx)); | |
522 | ||
523 | /* | |
524 | * Increment the feature refcount and thus activate the feature. | |
525 | * Note that the feature will be deactivated when we've | |
526 | * completely discarded all checkpointed state (both vdev | |
527 | * space maps and uberblock). | |
528 | */ | |
529 | spa_feature_incr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx); | |
530 | ||
531 | spa_history_log_internal(spa, "spa checkpoint", tx, | |
74756182 | 532 | "checkpointed uberblock txg=%llu", (u_longlong_t)checkpoint.ub_txg); |
d2734cce SD |
533 | } |
534 | ||
535 | /* | |
536 | * Create a checkpoint for the pool. | |
537 | */ | |
538 | int | |
539 | spa_checkpoint(const char *pool) | |
540 | { | |
541 | int error; | |
542 | spa_t *spa; | |
543 | ||
544 | error = spa_open(pool, &spa, FTAG); | |
545 | if (error != 0) | |
546 | return (error); | |
547 | ||
548 | mutex_enter(&spa->spa_vdev_top_lock); | |
549 | ||
550 | /* | |
551 | * Wait for current syncing txg to finish so the latest synced | |
552 | * uberblock (spa_ubsync) has all the changes that we expect | |
553 | * to see if we were to revert later to the checkpoint. In other | |
554 | * words we want the checkpointed uberblock to include/reference | |
555 | * all the changes that were pending at the time that we issued | |
556 | * the checkpoint command. | |
557 | */ | |
558 | txg_wait_synced(spa_get_dsl(spa), 0); | |
559 | ||
560 | /* | |
561 | * As the checkpointed uberblock references blocks from the previous | |
562 | * txg (spa_ubsync) we want to ensure that are not freeing any of | |
563 | * these blocks in the same txg that the following synctask will | |
564 | * run. Thus, we run it as an early synctask, so the dirty changes | |
565 | * that are synced to disk afterwards during zios and other synctasks | |
566 | * do not reuse checkpointed blocks. | |
567 | */ | |
568 | error = dsl_early_sync_task(pool, spa_checkpoint_check, | |
569 | spa_checkpoint_sync, NULL, 0, ZFS_SPACE_CHECK_NORMAL); | |
570 | ||
571 | mutex_exit(&spa->spa_vdev_top_lock); | |
572 | ||
573 | spa_close(spa, FTAG); | |
574 | return (error); | |
575 | } | |
576 | ||
d2734cce SD |
577 | static int |
578 | spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx) | |
579 | { | |
14e4e3cb | 580 | (void) arg; |
d2734cce SD |
581 | spa_t *spa = dmu_tx_pool(tx)->dp_spa; |
582 | ||
583 | if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) | |
584 | return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT)); | |
585 | ||
586 | if (spa->spa_checkpoint_txg == 0) | |
587 | return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT)); | |
588 | ||
589 | VERIFY0(zap_contains(spa_meta_objset(spa), | |
590 | DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT)); | |
591 | ||
592 | return (0); | |
593 | } | |
594 | ||
d2734cce SD |
595 | static void |
596 | spa_checkpoint_discard_sync(void *arg, dmu_tx_t *tx) | |
597 | { | |
14e4e3cb | 598 | (void) arg; |
d2734cce SD |
599 | spa_t *spa = dmu_tx_pool(tx)->dp_spa; |
600 | ||
601 | VERIFY0(zap_remove(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, | |
602 | DMU_POOL_ZPOOL_CHECKPOINT, tx)); | |
603 | ||
604 | spa->spa_checkpoint_txg = 0; | |
605 | ||
606 | zthr_wakeup(spa->spa_checkpoint_discard_zthr); | |
607 | ||
608 | spa_history_log_internal(spa, "spa discard checkpoint", tx, | |
609 | "started discarding checkpointed state from the pool"); | |
610 | } | |
611 | ||
612 | /* | |
613 | * Discard the checkpoint from a pool. | |
614 | */ | |
615 | int | |
616 | spa_checkpoint_discard(const char *pool) | |
617 | { | |
618 | /* | |
619 | * Similarly to spa_checkpoint(), we want our synctask to run | |
620 | * before any pending dirty data are written to disk so they | |
621 | * won't end up in the checkpoint's data structures (e.g. | |
622 | * ms_checkpointing and vdev_checkpoint_sm) and re-create any | |
623 | * space maps that the discarding open-context thread has | |
624 | * deleted. | |
625 | * [see spa_discard_checkpoint_sync and spa_discard_checkpoint_thread] | |
626 | */ | |
627 | return (dsl_early_sync_task(pool, spa_checkpoint_discard_check, | |
628 | spa_checkpoint_discard_sync, NULL, 0, | |
629 | ZFS_SPACE_CHECK_DISCARD_CHECKPOINT)); | |
630 | } | |
631 | ||
d2734cce SD |
/* Entry points exported for use outside this file. */
EXPORT_SYMBOL(spa_checkpoint_get_stats);
EXPORT_SYMBOL(spa_checkpoint_discard_thread);
EXPORT_SYMBOL(spa_checkpoint_discard_thread_check);

/* Registers zfs_spa_discard_memory_limit as a module parameter. */
/* BEGIN CSTYLED */
ZFS_MODULE_PARAM(zfs_spa, zfs_spa_, discard_memory_limit, U64, ZMOD_RW,
	"Limit for memory used in prefetching the checkpoint space map done "
	"on each vdev while discarding the checkpoint");
/* END CSTYLED */