]>
Commit | Line | Data |
---|---|---|
9ae529ec CS |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | ||
22 | /* | |
78e2739d | 23 | * Copyright (c) 2013 by Delphix. All rights reserved. |
9ae529ec CS |
24 | */ |
25 | ||
26 | #include <sys/arc.h> | |
27 | #include <sys/bptree.h> | |
28 | #include <sys/dmu.h> | |
29 | #include <sys/dmu_objset.h> | |
30 | #include <sys/dmu_tx.h> | |
31 | #include <sys/dmu_traverse.h> | |
32 | #include <sys/dsl_dataset.h> | |
33 | #include <sys/dsl_dir.h> | |
34 | #include <sys/dsl_pool.h> | |
35 | #include <sys/dnode.h> | |
36 | #include <sys/refcount.h> | |
37 | #include <sys/spa.h> | |
38 | ||
39 | /* | |
40 | * A bptree is a queue of root block pointers from destroyed datasets. When a | |
41 | * dataset is destroyed its root block pointer is put on the end of the pool's | |
42 | * bptree queue so the dataset's blocks can be freed asynchronously by | |
43 | * dsl_scan_sync. This allows the delete operation to finish without traversing | |
44 | * all the dataset's blocks. | |
45 | * | |
d3cc8b15 | 46 | * Note that while bt_begin and bt_end are only ever incremented in this code, |
9ae529ec CS |
47 | * they are effectively reset to 0 every time the entire bptree is freed because |
48 | * the bptree's object is destroyed and re-created. | |
49 | */ | |
50 | ||
51 | struct bptree_args { | |
52 | bptree_phys_t *ba_phys; /* data in bonus buffer, dirtied if freeing */ | |
53 | boolean_t ba_free; /* true if freeing during traversal */ | |
54 | ||
55 | bptree_itor_t *ba_func; /* function to call for each blockpointer */ | |
56 | void *ba_arg; /* caller supplied argument to ba_func */ | |
57 | dmu_tx_t *ba_tx; /* caller supplied tx, NULL if not freeing */ | |
58 | } bptree_args_t; | |
59 | ||
60 | uint64_t | |
61 | bptree_alloc(objset_t *os, dmu_tx_t *tx) | |
62 | { | |
63 | uint64_t obj; | |
64 | dmu_buf_t *db; | |
65 | bptree_phys_t *bt; | |
66 | ||
67 | obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA, | |
68 | SPA_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA, | |
69 | sizeof (bptree_phys_t), tx); | |
70 | ||
71 | /* | |
72 | * Bonus buffer contents are already initialized to 0, but for | |
73 | * readability we make it explicit. | |
74 | */ | |
75 | VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); | |
76 | dmu_buf_will_dirty(db, tx); | |
77 | bt = db->db_data; | |
78 | bt->bt_begin = 0; | |
79 | bt->bt_end = 0; | |
80 | bt->bt_bytes = 0; | |
81 | bt->bt_comp = 0; | |
82 | bt->bt_uncomp = 0; | |
83 | dmu_buf_rele(db, FTAG); | |
84 | ||
85 | return (obj); | |
86 | } | |
87 | ||
88 | int | |
89 | bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) | |
90 | { | |
91 | dmu_buf_t *db; | |
92 | bptree_phys_t *bt; | |
93 | ||
94 | VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); | |
95 | bt = db->db_data; | |
96 | ASSERT3U(bt->bt_begin, ==, bt->bt_end); | |
c99c9001 MS |
97 | ASSERT0(bt->bt_bytes); |
98 | ASSERT0(bt->bt_comp); | |
99 | ASSERT0(bt->bt_uncomp); | |
9ae529ec CS |
100 | dmu_buf_rele(db, FTAG); |
101 | ||
102 | return (dmu_object_free(os, obj, tx)); | |
103 | } | |
104 | ||
105 | void | |
106 | bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, | |
107 | uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx) | |
108 | { | |
109 | dmu_buf_t *db; | |
110 | bptree_phys_t *bt; | |
111 | bptree_entry_phys_t bte; | |
112 | ||
113 | /* | |
114 | * bptree objects are in the pool mos, therefore they can only be | |
115 | * modified in syncing context. Furthermore, this is only modified | |
116 | * by the sync thread, so no locking is necessary. | |
117 | */ | |
118 | ASSERT(dmu_tx_is_syncing(tx)); | |
119 | ||
120 | VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); | |
121 | bt = db->db_data; | |
122 | ||
123 | bte.be_birth_txg = birth_txg; | |
124 | bte.be_bp = *bp; | |
125 | bzero(&bte.be_zb, sizeof (bte.be_zb)); | |
126 | dmu_write(os, obj, bt->bt_end * sizeof (bte), sizeof (bte), &bte, tx); | |
127 | ||
128 | dmu_buf_will_dirty(db, tx); | |
129 | bt->bt_end++; | |
130 | bt->bt_bytes += bytes; | |
131 | bt->bt_comp += comp; | |
132 | bt->bt_uncomp += uncomp; | |
133 | dmu_buf_rele(db, FTAG); | |
134 | } | |
135 | ||
136 | /* ARGSUSED */ | |
137 | static int | |
294f6806 | 138 | bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, |
9ae529ec CS |
139 | const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) |
140 | { | |
141 | int err; | |
142 | struct bptree_args *ba = arg; | |
143 | ||
144 | if (bp == NULL) | |
145 | return (0); | |
146 | ||
147 | err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx); | |
148 | if (err == 0 && ba->ba_free) { | |
149 | ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp); | |
150 | ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp); | |
151 | ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp); | |
152 | } | |
153 | return (err); | |
154 | } | |
155 | ||
156 | int | |
157 | bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, | |
158 | void *arg, dmu_tx_t *tx) | |
159 | { | |
160 | int err; | |
161 | uint64_t i; | |
162 | dmu_buf_t *db; | |
163 | struct bptree_args ba; | |
164 | ||
165 | ASSERT(!free || dmu_tx_is_syncing(tx)); | |
166 | ||
167 | err = dmu_bonus_hold(os, obj, FTAG, &db); | |
168 | if (err != 0) | |
169 | return (err); | |
170 | ||
171 | if (free) | |
172 | dmu_buf_will_dirty(db, tx); | |
173 | ||
174 | ba.ba_phys = db->db_data; | |
175 | ba.ba_free = free; | |
176 | ba.ba_func = func; | |
177 | ba.ba_arg = arg; | |
178 | ba.ba_tx = tx; | |
179 | ||
180 | err = 0; | |
181 | for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) { | |
182 | bptree_entry_phys_t bte; | |
78e2739d | 183 | int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST; |
9ae529ec CS |
184 | |
185 | ASSERT(!free || i == ba.ba_phys->bt_begin); | |
186 | ||
187 | err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte), | |
188 | &bte, DMU_READ_NO_PREFETCH); | |
189 | if (err != 0) | |
190 | break; | |
191 | ||
78e2739d MA |
192 | if (zfs_recover) |
193 | flags |= TRAVERSE_HARD; | |
9ae529ec | 194 | err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp, |
78e2739d | 195 | bte.be_birth_txg, &bte.be_zb, flags, |
9ae529ec CS |
196 | bptree_visit_cb, &ba); |
197 | if (free) { | |
78e2739d | 198 | if (err == ERESTART) { |
9ae529ec CS |
199 | /* save bookmark for future resume */ |
200 | ASSERT3U(bte.be_zb.zb_objset, ==, | |
201 | ZB_DESTROYED_OBJSET); | |
c99c9001 | 202 | ASSERT0(bte.be_zb.zb_level); |
9ae529ec CS |
203 | dmu_write(os, obj, i * sizeof (bte), |
204 | sizeof (bte), &bte, tx); | |
205 | break; | |
9ae529ec | 206 | } |
78e2739d MA |
207 | if (err != 0) { |
208 | /* | |
209 | * We can not properly handle an i/o | |
210 | * error, because the traversal code | |
211 | * does not know how to resume from an | |
212 | * arbitrary bookmark. | |
213 | */ | |
214 | zfs_panic_recover("error %u from " | |
215 | "traverse_dataset_destroyed()", err); | |
216 | } | |
217 | ||
218 | ba.ba_phys->bt_begin++; | |
219 | (void) dmu_free_range(os, obj, | |
220 | i * sizeof (bte), sizeof (bte), tx); | |
9ae529ec CS |
221 | } |
222 | } | |
223 | ||
224 | ASSERT(!free || err != 0 || ba.ba_phys->bt_begin == ba.ba_phys->bt_end); | |
225 | ||
226 | /* if all blocks are free there should be no used space */ | |
227 | if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) { | |
c99c9001 MS |
228 | ASSERT0(ba.ba_phys->bt_bytes); |
229 | ASSERT0(ba.ba_phys->bt_comp); | |
230 | ASSERT0(ba.ba_phys->bt_uncomp); | |
9ae529ec CS |
231 | } |
232 | ||
233 | dmu_buf_rele(db, FTAG); | |
234 | ||
235 | return (err); | |
236 | } |