]> git.proxmox.com Git - mirror_zfs.git/blame - module/zfs/spa_checkpoint.c
Remove bcopy(), bzero(), bcmp()
[mirror_zfs.git] / module / zfs / spa_checkpoint.c
CommitLineData
d2734cce
SD
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22/*
23 * Copyright (c) 2017 by Delphix. All rights reserved.
24 */
25
26/*
27 * Storage Pool Checkpoint
28 *
29 * A storage pool checkpoint can be thought of as a pool-wide snapshot or
30 * a stable version of extreme rewind that guarantees no blocks from the
31 * checkpointed state will have been overwritten. It remembers the entire
32 * state of the storage pool (e.g. snapshots, dataset names, etc..) from the
33 * point that it was taken and the user can rewind back to that point even if
34 * they applied destructive operations on their datasets or even enabled new
35 * zpool on-disk features. If a pool has a checkpoint that is no longer
36 * needed, the user can discard it.
37 *
38 * == On disk data structures used ==
39 *
40 * - The pool has a new feature flag and a new entry in the MOS. The feature
41 * flag is set to active when we create the checkpoint and remains active
42 * until the checkpoint is fully discarded. The entry in the MOS config
43 * (DMU_POOL_ZPOOL_CHECKPOINT) is populated with the uberblock that
44 * references the state of the pool when we take the checkpoint. The entry
45 * remains populated until we start discarding the checkpoint or we rewind
46 * back to it.
47 *
48 * - Each vdev contains a vdev-wide space map while the pool has a checkpoint,
49 * which persists until the checkpoint is fully discarded. The space map
50 * contains entries that have been freed in the current state of the pool
51 * but we want to keep around in case we decide to rewind to the checkpoint.
52 * [see vdev_checkpoint_sm]
53 *
54 * - Each metaslab's ms_sm space map behaves the same as without the
55 * checkpoint, with the only exception being the scenario when we free
56 * blocks that belong to the checkpoint. In this case, these blocks remain
57 * ALLOCATED in the metaslab's space map and they are added as FREE in the
58 * vdev's checkpoint space map.
59 *
60 * - Each uberblock has a field (ub_checkpoint_txg) which holds the txg that
61 * the uberblock was checkpointed. For normal uberblocks this field is 0.
62 *
63 * == Overview of operations ==
64 *
65 * - To create a checkpoint, we first wait for the current TXG to be synced,
66 * so we can use the most recently synced uberblock (spa_ubsync) as the
67 * checkpointed uberblock. Then we use an early synctask to place that
68 * uberblock in MOS config, increment the feature flag for the checkpoint
69 * (marking it active), and set spa_checkpoint_txg (see its use below)
70 * to the TXG of the checkpointed uberblock. We use an early synctask for
71 * the aforementioned operations to ensure that no blocks were dirtied
72 * between the current TXG and the TXG of the checkpointed uberblock
73 * (i.e. the previous txg).
74 *
75 * - When a checkpoint exists, we need to ensure that the blocks that
76 * belong to the checkpoint are freed but never reused. This means that
77 * these blocks should never end up in the ms_allocatable or the ms_freeing
78 * trees of a metaslab. Therefore, whenever there is a checkpoint the new
79 * ms_checkpointing tree is used in addition to the aforementioned ones.
80 *
81 * Whenever a block is freed and we find out that it is referenced by the
82 * checkpoint (we find out by comparing its birth to spa_checkpoint_txg),
83 * we place it in the ms_checkpointing tree instead of the ms_freeing tree.
84 * This way, we divide the blocks that are being freed into checkpointed
85 * and not-checkpointed blocks.
86 *
87 * In order to persist these frees, we write the extents from the
88 * ms_freeing tree to the ms_sm as usual, and the extents from the
89 * ms_checkpointing tree to the vdev_checkpoint_sm. This way, these
90 * checkpointed extents will remain allocated in the metaslab's ms_sm space
91 * map, and therefore won't be reused [see metaslab_sync()]. In addition,
92 * when we discard the checkpoint, we can find the entries that have
93 * actually been freed in vdev_checkpoint_sm.
94 * [see spa_checkpoint_discard_thread_sync()]
95 *
96 * - To discard the checkpoint we use an early synctask to delete the
97 * checkpointed uberblock from the MOS config, set spa_checkpoint_txg to 0,
98 * and wakeup the discarding zthr thread (an open-context async thread).
99 * We use an early synctask to ensure that the operation happens before any
100 * new data end up in the checkpoint's data structures.
101 *
102 * Once the synctask is done and the discarding zthr is awake, we discard
103 * the checkpointed data over multiple TXGs by having the zthr prefetching
104 * entries from vdev_checkpoint_sm and then starting a synctask that places
e1cfd73f 105 * them as free blocks into their respective ms_allocatable and ms_sm
d2734cce
SD
106 * structures.
107 * [see spa_checkpoint_discard_thread()]
108 *
109 * When there are no entries left in the vdev_checkpoint_sm of all
110 * top-level vdevs, a final synctask runs that decrements the feature flag.
111 *
112 * - To rewind to the checkpoint, we first use the current uberblock and
113 * open the MOS so we can access the checkpointed uberblock from the MOS
114 * config. After we retrieve the checkpointed uberblock, we use it as the
115 * current uberblock for the pool by writing it to disk with an updated
116 * TXG, opening its version of the MOS, and moving on as usual from there.
117 * [see spa_ld_checkpoint_rewind()]
118 *
119 * An important note on rewinding to the checkpoint has to do with how we
120 * handle ZIL blocks. In the scenario of a rewind, we clear out any ZIL
121 * blocks that have not been claimed by the time we took the checkpoint
122 * as they should no longer be valid.
123 * [see comment in zil_claim()]
124 *
125 * == Miscellaneous information ==
126 *
127 * - In the hypothetical event that we take a checkpoint, remove a vdev,
128 * and attempt to rewind, the rewind would fail as the checkpointed
129 * uberblock would reference data in the removed device. For this reason
130 * and others of similar nature, we disallow the following operations that
131 * can change the config:
132 * vdev removal and attach/detach, mirror splitting, and pool reguid.
133 *
134 * - As most of the checkpoint logic is implemented in the SPA and doesn't
135 * distinguish datasets when it comes to space accounting, having a
136 * checkpoint can potentially break the boundaries set by dataset
137 * reservations.
138 */
139
140#include <sys/dmu_tx.h>
141#include <sys/dsl_dir.h>
142#include <sys/dsl_synctask.h>
143#include <sys/metaslab_impl.h>
144#include <sys/spa.h>
145#include <sys/spa_impl.h>
146#include <sys/spa_checkpoint.h>
147#include <sys/vdev_impl.h>
148#include <sys/zap.h>
149#include <sys/zfeature.h>
150
/*
 * Upper bound, in bytes, on how much of a vdev's checkpoint space map is
 * prefetched into memory for each pass of the checkpoint discard.
 *
 * Without such a bound, a top-level vdev with a long checkpoint space map
 * (i.e. one on which a lot of checkpointed data was freed while the pool
 * had a checkpoint) could take up a lot of memory while being discarded.
 */
static unsigned long zfs_spa_discard_memory_limit = 16UL * 1024 * 1024;
d2734cce
SD
162
163int
164spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs)
165{
166 if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
167 return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT));
168
861166b0 169 memset(pcs, 0, sizeof (pool_checkpoint_stat_t));
d2734cce
SD
170
171 int error = zap_contains(spa_meta_objset(spa),
172 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT);
173 ASSERT(error == 0 || error == ENOENT);
174
175 if (error == ENOENT)
176 pcs->pcs_state = CS_CHECKPOINT_DISCARDING;
177 else
178 pcs->pcs_state = CS_CHECKPOINT_EXISTS;
179
180 pcs->pcs_space = spa->spa_checkpoint_info.sci_dspace;
181 pcs->pcs_start_time = spa->spa_checkpoint_info.sci_timestamp;
182
183 return (0);
184}
185
186static void
187spa_checkpoint_discard_complete_sync(void *arg, dmu_tx_t *tx)
188{
189 spa_t *spa = arg;
190
191 spa->spa_checkpoint_info.sci_timestamp = 0;
192
193 spa_feature_decr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx);
e60e158e 194 spa_notify_waiters(spa);
d2734cce
SD
195
196 spa_history_log_internal(spa, "spa discard checkpoint", tx,
197 "finished discarding checkpointed state from the pool");
198}
199
200typedef struct spa_checkpoint_discard_sync_callback_arg {
201 vdev_t *sdc_vd;
202 uint64_t sdc_txg;
203 uint64_t sdc_entry_limit;
204} spa_checkpoint_discard_sync_callback_arg_t;
205
206static int
4d044c4c 207spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg)
d2734cce
SD
208{
209 spa_checkpoint_discard_sync_callback_arg_t *sdc = arg;
210 vdev_t *vd = sdc->sdc_vd;
4d044c4c
SD
211 metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
212 uint64_t end = sme->sme_offset + sme->sme_run;
d2734cce
SD
213
214 if (sdc->sdc_entry_limit == 0)
28caa74b 215 return (SET_ERROR(EINTR));
d2734cce
SD
216
217 /*
218 * Since the space map is not condensed, we know that
219 * none of its entries is crossing the boundaries of
220 * its respective metaslab.
221 *
222 * That said, there is no fundamental requirement that
223 * the checkpoint's space map entries should not cross
224 * metaslab boundaries. So if needed we could add code
225 * that handles metaslab-crossing segments in the future.
226 */
4d044c4c
SD
227 VERIFY3U(sme->sme_type, ==, SM_FREE);
228 VERIFY3U(sme->sme_offset, >=, ms->ms_start);
d2734cce
SD
229 VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
230
231 /*
232 * At this point we should not be processing any
233 * other frees concurrently, so the lock is technically
234 * unnecessary. We use the lock anyway though to
235 * potentially save ourselves from future headaches.
236 */
237 mutex_enter(&ms->ms_lock);
238 if (range_tree_is_empty(ms->ms_freeing))
239 vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg);
4d044c4c 240 range_tree_add(ms->ms_freeing, sme->sme_offset, sme->sme_run);
d2734cce
SD
241 mutex_exit(&ms->ms_lock);
242
4d044c4c
SD
243 ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=,
244 sme->sme_run);
245 ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, sme->sme_run);
d2734cce 246
4d044c4c
SD
247 vd->vdev_spa->spa_checkpoint_info.sci_dspace -= sme->sme_run;
248 vd->vdev_stat.vs_checkpoint_space -= sme->sme_run;
d2734cce
SD
249 sdc->sdc_entry_limit--;
250
251 return (0);
252}
253
#ifdef ZFS_DEBUG
/*
 * Debug-only consistency check of the checkpoint space accounting.
 *
 * Asserts that, over the top-level vdevs, the running sum of the space
 * recorded in the checkpoint space maps matches the running sum of
 * vs_checkpoint_space, that vdevs without a checkpoint space map account
 * for no checkpoint space, and that the pool-wide sci_dspace equals the
 * total.
 */
static void
spa_checkpoint_accounting_verify(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	uint64_t sm_total = 0;
	uint64_t stat_total = 0;

	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
		vdev_t *tvd = rvd->vdev_child[i];

		if (tvd->vdev_checkpoint_sm == NULL) {
			ASSERT0(tvd->vdev_stat.vs_checkpoint_space);
			continue;
		}

		/* The allocated count is negated before being summed. */
		sm_total += -space_map_allocated(tvd->vdev_checkpoint_sm);
		stat_total += tvd->vdev_stat.vs_checkpoint_space;
		ASSERT3U(sm_total, ==, stat_total);
	}
	ASSERT3U(spa->spa_checkpoint_info.sci_dspace, ==, sm_total);
}
#endif
279
280static void
281spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx)
282{
283 vdev_t *vd = arg;
284 int error;
285
286 /*
287 * The space map callback is applied only to non-debug entries.
288 * Because the number of debug entries is less or equal to the
289 * number of non-debug entries, we want to ensure that we only
290 * read what we prefetched from open-context.
291 *
292 * Thus, we set the maximum entries that the space map callback
293 * will be applied to be half the entries that could fit in the
294 * imposed memory limit.
4d044c4c
SD
295 *
296 * Note that since this is a conservative estimate we also
297 * assume the worst case scenario in our computation where each
298 * entry is two-word.
d2734cce
SD
299 */
300 uint64_t max_entry_limit =
4d044c4c 301 (zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1;
d2734cce
SD
302
303 /*
304 * Iterate from the end of the space map towards the beginning,
305 * placing its entries on ms_freeing and removing them from the
306 * space map. The iteration stops if one of the following
307 * conditions is true:
308 *
309 * 1] We reached the beginning of the space map. At this point
310 * the space map should be completely empty and
311 * space_map_incremental_destroy should have returned 0.
312 * The next step would be to free and close the space map
313 * and remove its entry from its vdev's top zap. This allows
314 * spa_checkpoint_discard_thread() to move on to the next vdev.
315 *
316 * 2] We reached the memory limit (amount of memory used to hold
317 * space map entries in memory) and space_map_incremental_destroy
318 * returned EINTR. This means that there are entries remaining
319 * in the space map that will be cleared in a future invocation
320 * of this function by spa_checkpoint_discard_thread().
321 */
322 spa_checkpoint_discard_sync_callback_arg_t sdc;
323 sdc.sdc_vd = vd;
324 sdc.sdc_txg = tx->tx_txg;
4d044c4c 325 sdc.sdc_entry_limit = max_entry_limit;
d2734cce 326
4d044c4c
SD
327 uint64_t words_before =
328 space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
d2734cce
SD
329
330 error = space_map_incremental_destroy(vd->vdev_checkpoint_sm,
331 spa_checkpoint_discard_sync_callback, &sdc, tx);
332
4d044c4c 333 uint64_t words_after =
d2734cce
SD
334 space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t);
335
336#ifdef ZFS_DEBUG
337 spa_checkpoint_accounting_verify(vd->vdev_spa);
338#endif
339
8e739b2c 340 zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %lld, "
4d044c4c 341 "deleted %llu words - %llu words are left",
8e739b2c
RE
342 (u_longlong_t)tx->tx_txg, (longlong_t)vd->vdev_id,
343 (u_longlong_t)(words_before - words_after),
344 (u_longlong_t)words_after);
d2734cce
SD
345
346 if (error != EINTR) {
347 if (error != 0) {
8e739b2c 348 zfs_panic_recover("zfs: error %lld was returned "
d2734cce 349 "while incrementally destroying the checkpoint "
8e739b2c
RE
350 "space map of vdev %u\n",
351 (longlong_t)error, vd->vdev_id);
d2734cce 352 }
4d044c4c 353 ASSERT0(words_after);
425d3237 354 ASSERT0(space_map_allocated(vd->vdev_checkpoint_sm));
4d044c4c 355 ASSERT0(space_map_length(vd->vdev_checkpoint_sm));
d2734cce
SD
356
357 space_map_free(vd->vdev_checkpoint_sm, tx);
358 space_map_close(vd->vdev_checkpoint_sm);
359 vd->vdev_checkpoint_sm = NULL;
360
4d044c4c 361 VERIFY0(zap_remove(spa_meta_objset(vd->vdev_spa),
d2734cce
SD
362 vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx));
363 }
364}
365
366static boolean_t
367spa_checkpoint_discard_is_done(spa_t *spa)
368{
369 vdev_t *rvd = spa->spa_root_vdev;
370
371 ASSERT(!spa_has_checkpoint(spa));
372 ASSERT(spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT));
373
374 for (uint64_t c = 0; c < rvd->vdev_children; c++) {
375 if (rvd->vdev_child[c]->vdev_checkpoint_sm != NULL)
376 return (B_FALSE);
377 ASSERT0(rvd->vdev_child[c]->vdev_stat.vs_checkpoint_space);
378 }
379
380 return (B_TRUE);
381}
382
d2734cce
SD
383boolean_t
384spa_checkpoint_discard_thread_check(void *arg, zthr_t *zthr)
385{
14e4e3cb 386 (void) zthr;
d2734cce
SD
387 spa_t *spa = arg;
388
389 if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
390 return (B_FALSE);
391
392 if (spa_has_checkpoint(spa))
393 return (B_FALSE);
394
395 return (B_TRUE);
396}
397
61c3391a 398void
d2734cce
SD
399spa_checkpoint_discard_thread(void *arg, zthr_t *zthr)
400{
401 spa_t *spa = arg;
402 vdev_t *rvd = spa->spa_root_vdev;
403
404 for (uint64_t c = 0; c < rvd->vdev_children; c++) {
405 vdev_t *vd = rvd->vdev_child[c];
406
407 while (vd->vdev_checkpoint_sm != NULL) {
408 space_map_t *checkpoint_sm = vd->vdev_checkpoint_sm;
409 int numbufs;
410 dmu_buf_t **dbp;
411
412 if (zthr_iscancelled(zthr))
61c3391a 413 return;
d2734cce
SD
414
415 ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops);
416
417 uint64_t size = MIN(space_map_length(checkpoint_sm),
418 zfs_spa_discard_memory_limit);
419 uint64_t offset =
420 space_map_length(checkpoint_sm) - size;
421
422 /*
423 * Ensure that the part of the space map that will
424 * be destroyed by the synctask, is prefetched in
425 * memory before the synctask runs.
426 */
427 int error = dmu_buf_hold_array_by_bonus(
428 checkpoint_sm->sm_dbuf, offset, size,
429 B_TRUE, FTAG, &numbufs, &dbp);
430 if (error != 0) {
431 zfs_panic_recover("zfs: error %d was returned "
432 "while prefetching checkpoint space map "
433 "entries of vdev %llu\n",
434 error, vd->vdev_id);
435 }
436
437 VERIFY0(dsl_sync_task(spa->spa_name, NULL,
438 spa_checkpoint_discard_thread_sync, vd,
439 0, ZFS_SPACE_CHECK_NONE));
440
441 dmu_buf_rele_array(dbp, numbufs, FTAG);
442 }
443 }
444
445 VERIFY(spa_checkpoint_discard_is_done(spa));
446 VERIFY0(spa->spa_checkpoint_info.sci_dspace);
447 VERIFY0(dsl_sync_task(spa->spa_name, NULL,
448 spa_checkpoint_discard_complete_sync, spa,
449 0, ZFS_SPACE_CHECK_NONE));
d2734cce
SD
450}
451
452
d2734cce
SD
453static int
454spa_checkpoint_check(void *arg, dmu_tx_t *tx)
455{
14e4e3cb 456 (void) arg;
d2734cce
SD
457 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
458
459 if (!spa_feature_is_enabled(spa, SPA_FEATURE_POOL_CHECKPOINT))
460 return (SET_ERROR(ENOTSUP));
461
462 if (!spa_top_vdevs_spacemap_addressable(spa))
463 return (SET_ERROR(ZFS_ERR_VDEV_TOO_BIG));
464
c40a1124 465 if (spa->spa_removing_phys.sr_state == DSS_SCANNING)
d2734cce
SD
466 return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS));
467
468 if (spa->spa_checkpoint_txg != 0)
469 return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS));
470
471 if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
472 return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT));
473
474 return (0);
475}
476
d2734cce
SD
477static void
478spa_checkpoint_sync(void *arg, dmu_tx_t *tx)
479{
14e4e3cb 480 (void) arg;
d2734cce
SD
481 dsl_pool_t *dp = dmu_tx_pool(tx);
482 spa_t *spa = dp->dp_spa;
483 uberblock_t checkpoint = spa->spa_ubsync;
484
485 /*
486 * At this point, there should not be a checkpoint in the MOS.
487 */
488 ASSERT3U(zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
489 DMU_POOL_ZPOOL_CHECKPOINT), ==, ENOENT);
490
491 ASSERT0(spa->spa_checkpoint_info.sci_timestamp);
492 ASSERT0(spa->spa_checkpoint_info.sci_dspace);
493
494 /*
495 * Since the checkpointed uberblock is the one that just got synced
496 * (we use spa_ubsync), its txg must be equal to the txg number of
497 * the txg we are syncing, minus 1.
498 */
499 ASSERT3U(checkpoint.ub_txg, ==, spa->spa_syncing_txg - 1);
500
501 /*
502 * Once the checkpoint is in place, we need to ensure that none of
503 * its blocks will be marked for reuse after it has been freed.
504 * When there is a checkpoint and a block is freed, we compare its
505 * birth txg to the txg of the checkpointed uberblock to see if the
506 * block is part of the checkpoint or not. Therefore, we have to set
507 * spa_checkpoint_txg before any frees happen in this txg (which is
508 * why this is done as an early_synctask as explained in the comment
509 * in spa_checkpoint()).
510 */
511 spa->spa_checkpoint_txg = checkpoint.ub_txg;
512 spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
513
514 checkpoint.ub_checkpoint_txg = checkpoint.ub_txg;
515 VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset,
516 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT,
517 sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t),
518 &checkpoint, tx));
519
520 /*
521 * Increment the feature refcount and thus activate the feature.
522 * Note that the feature will be deactivated when we've
523 * completely discarded all checkpointed state (both vdev
524 * space maps and uberblock).
525 */
526 spa_feature_incr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx);
527
528 spa_history_log_internal(spa, "spa checkpoint", tx,
74756182 529 "checkpointed uberblock txg=%llu", (u_longlong_t)checkpoint.ub_txg);
d2734cce
SD
530}
531
532/*
533 * Create a checkpoint for the pool.
534 */
535int
536spa_checkpoint(const char *pool)
537{
538 int error;
539 spa_t *spa;
540
541 error = spa_open(pool, &spa, FTAG);
542 if (error != 0)
543 return (error);
544
545 mutex_enter(&spa->spa_vdev_top_lock);
546
547 /*
548 * Wait for current syncing txg to finish so the latest synced
549 * uberblock (spa_ubsync) has all the changes that we expect
550 * to see if we were to revert later to the checkpoint. In other
551 * words we want the checkpointed uberblock to include/reference
552 * all the changes that were pending at the time that we issued
553 * the checkpoint command.
554 */
555 txg_wait_synced(spa_get_dsl(spa), 0);
556
557 /*
558 * As the checkpointed uberblock references blocks from the previous
559 * txg (spa_ubsync) we want to ensure that are not freeing any of
560 * these blocks in the same txg that the following synctask will
561 * run. Thus, we run it as an early synctask, so the dirty changes
562 * that are synced to disk afterwards during zios and other synctasks
563 * do not reuse checkpointed blocks.
564 */
565 error = dsl_early_sync_task(pool, spa_checkpoint_check,
566 spa_checkpoint_sync, NULL, 0, ZFS_SPACE_CHECK_NORMAL);
567
568 mutex_exit(&spa->spa_vdev_top_lock);
569
570 spa_close(spa, FTAG);
571 return (error);
572}
573
d2734cce
SD
574static int
575spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx)
576{
14e4e3cb 577 (void) arg;
d2734cce
SD
578 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
579
580 if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
581 return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT));
582
583 if (spa->spa_checkpoint_txg == 0)
584 return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT));
585
586 VERIFY0(zap_contains(spa_meta_objset(spa),
587 DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT));
588
589 return (0);
590}
591
d2734cce
SD
592static void
593spa_checkpoint_discard_sync(void *arg, dmu_tx_t *tx)
594{
14e4e3cb 595 (void) arg;
d2734cce
SD
596 spa_t *spa = dmu_tx_pool(tx)->dp_spa;
597
598 VERIFY0(zap_remove(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT,
599 DMU_POOL_ZPOOL_CHECKPOINT, tx));
600
601 spa->spa_checkpoint_txg = 0;
602
603 zthr_wakeup(spa->spa_checkpoint_discard_zthr);
604
605 spa_history_log_internal(spa, "spa discard checkpoint", tx,
606 "started discarding checkpointed state from the pool");
607}
608
609/*
610 * Discard the checkpoint from a pool.
611 */
612int
613spa_checkpoint_discard(const char *pool)
614{
615 /*
616 * Similarly to spa_checkpoint(), we want our synctask to run
617 * before any pending dirty data are written to disk so they
618 * won't end up in the checkpoint's data structures (e.g.
619 * ms_checkpointing and vdev_checkpoint_sm) and re-create any
620 * space maps that the discarding open-context thread has
621 * deleted.
622 * [see spa_discard_checkpoint_sync and spa_discard_checkpoint_thread]
623 */
624 return (dsl_early_sync_task(pool, spa_checkpoint_discard_check,
625 spa_checkpoint_discard_sync, NULL, 0,
626 ZFS_SPACE_CHECK_DISCARD_CHECKPOINT));
627}
628
d2734cce
SD
629EXPORT_SYMBOL(spa_checkpoint_get_stats);
630EXPORT_SYMBOL(spa_checkpoint_discard_thread);
631EXPORT_SYMBOL(spa_checkpoint_discard_thread_check);
632
633/* BEGIN CSTYLED */
03fdcb9a
MM
634ZFS_MODULE_PARAM(zfs_spa, zfs_spa_, discard_memory_limit, ULONG, ZMOD_RW,
635 "Limit for memory used in prefetching the checkpoint space map done "
636 "on each vdev while discarding the checkpoint");
d2734cce 637/* END CSTYLED */