]>
Commit | Line | Data |
---|---|---|
9ae529ec CS |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | ||
22 | /* | |
78e2739d | 23 | * Copyright (c) 2013 by Delphix. All rights reserved. |
9ae529ec CS |
24 | */ |
25 | ||
26 | #include <sys/arc.h> | |
27 | #include <sys/bptree.h> | |
28 | #include <sys/dmu.h> | |
29 | #include <sys/dmu_objset.h> | |
30 | #include <sys/dmu_tx.h> | |
31 | #include <sys/dmu_traverse.h> | |
32 | #include <sys/dsl_dataset.h> | |
33 | #include <sys/dsl_dir.h> | |
34 | #include <sys/dsl_pool.h> | |
35 | #include <sys/dnode.h> | |
36 | #include <sys/refcount.h> | |
37 | #include <sys/spa.h> | |
38 | ||
39 | /* | |
40 | * A bptree is a queue of root block pointers from destroyed datasets. When a | |
41 | * dataset is destroyed its root block pointer is put on the end of the pool's | |
42 | * bptree queue so the dataset's blocks can be freed asynchronously by | |
43 | * dsl_scan_sync. This allows the delete operation to finish without traversing | |
44 | * all the dataset's blocks. | |
45 | * | |
d3cc8b15 | 46 | * Note that while bt_begin and bt_end are only ever incremented in this code, |
9ae529ec CS |
47 | * they are effectively reset to 0 every time the entire bptree is freed because |
48 | * the bptree's object is destroyed and re-created. | |
49 | */ | |
50 | ||
51 | struct bptree_args { | |
52 | bptree_phys_t *ba_phys; /* data in bonus buffer, dirtied if freeing */ | |
53 | boolean_t ba_free; /* true if freeing during traversal */ | |
54 | ||
55 | bptree_itor_t *ba_func; /* function to call for each blockpointer */ | |
56 | void *ba_arg; /* caller supplied argument to ba_func */ | |
57 | dmu_tx_t *ba_tx; /* caller supplied tx, NULL if not freeing */ | |
58 | } bptree_args_t; | |
59 | ||
60 | uint64_t | |
61 | bptree_alloc(objset_t *os, dmu_tx_t *tx) | |
62 | { | |
63 | uint64_t obj; | |
64 | dmu_buf_t *db; | |
65 | bptree_phys_t *bt; | |
66 | ||
67 | obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA, | |
68 | SPA_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA, | |
69 | sizeof (bptree_phys_t), tx); | |
70 | ||
71 | /* | |
72 | * Bonus buffer contents are already initialized to 0, but for | |
73 | * readability we make it explicit. | |
74 | */ | |
75 | VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); | |
76 | dmu_buf_will_dirty(db, tx); | |
77 | bt = db->db_data; | |
78 | bt->bt_begin = 0; | |
79 | bt->bt_end = 0; | |
80 | bt->bt_bytes = 0; | |
81 | bt->bt_comp = 0; | |
82 | bt->bt_uncomp = 0; | |
83 | dmu_buf_rele(db, FTAG); | |
84 | ||
85 | return (obj); | |
86 | } | |
87 | ||
88 | int | |
89 | bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) | |
90 | { | |
91 | dmu_buf_t *db; | |
92 | bptree_phys_t *bt; | |
93 | ||
94 | VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); | |
95 | bt = db->db_data; | |
96 | ASSERT3U(bt->bt_begin, ==, bt->bt_end); | |
c99c9001 MS |
97 | ASSERT0(bt->bt_bytes); |
98 | ASSERT0(bt->bt_comp); | |
99 | ASSERT0(bt->bt_uncomp); | |
9ae529ec CS |
100 | dmu_buf_rele(db, FTAG); |
101 | ||
102 | return (dmu_object_free(os, obj, tx)); | |
103 | } | |
104 | ||
fbeddd60 MA |
105 | boolean_t |
106 | bptree_is_empty(objset_t *os, uint64_t obj) | |
107 | { | |
108 | dmu_buf_t *db; | |
109 | bptree_phys_t *bt; | |
110 | boolean_t rv; | |
111 | ||
112 | VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db)); | |
113 | bt = db->db_data; | |
114 | rv = (bt->bt_begin == bt->bt_end); | |
115 | dmu_buf_rele(db, FTAG); | |
116 | return (rv); | |
117 | } | |
118 | ||
9ae529ec CS |
119 | void |
120 | bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, | |
121 | uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx) | |
122 | { | |
123 | dmu_buf_t *db; | |
124 | bptree_phys_t *bt; | |
fbeddd60 | 125 | bptree_entry_phys_t *bte; |
9ae529ec CS |
126 | |
127 | /* | |
128 | * bptree objects are in the pool mos, therefore they can only be | |
129 | * modified in syncing context. Furthermore, this is only modified | |
130 | * by the sync thread, so no locking is necessary. | |
131 | */ | |
132 | ASSERT(dmu_tx_is_syncing(tx)); | |
133 | ||
134 | VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); | |
135 | bt = db->db_data; | |
136 | ||
fbeddd60 MA |
137 | bte = kmem_zalloc(sizeof (*bte), KM_PUSHPAGE); |
138 | bte->be_birth_txg = birth_txg; | |
139 | bte->be_bp = *bp; | |
140 | dmu_write(os, obj, bt->bt_end * sizeof (*bte), sizeof (*bte), bte, tx); | |
141 | kmem_free(bte, sizeof (*bte)); | |
9ae529ec CS |
142 | |
143 | dmu_buf_will_dirty(db, tx); | |
144 | bt->bt_end++; | |
145 | bt->bt_bytes += bytes; | |
146 | bt->bt_comp += comp; | |
147 | bt->bt_uncomp += uncomp; | |
148 | dmu_buf_rele(db, FTAG); | |
149 | } | |
150 | ||
151 | /* ARGSUSED */ | |
152 | static int | |
294f6806 | 153 | bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, |
9ae529ec CS |
154 | const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) |
155 | { | |
156 | int err; | |
157 | struct bptree_args *ba = arg; | |
158 | ||
b0bc7a84 | 159 | if (BP_IS_HOLE(bp)) |
9ae529ec CS |
160 | return (0); |
161 | ||
162 | err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx); | |
163 | if (err == 0 && ba->ba_free) { | |
164 | ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp); | |
165 | ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp); | |
166 | ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp); | |
167 | } | |
168 | return (err); | |
169 | } | |
170 | ||
fbeddd60 MA |
171 | /* |
172 | * If "free" is set: | |
173 | * - It is assumed that "func" will be freeing the block pointers. | |
174 | * - If "func" returns nonzero, the bookmark will be remembered and | |
175 | * iteration will be restarted from this point on next invocation. | |
176 | * - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM), | |
177 | * bptree_iterate will remember the bookmark, continue traversing | |
178 | * any additional entries, and return 0. | |
179 | * | |
180 | * If "free" is not set, traversal will stop and return an error if | |
181 | * an i/o error is encountered. | |
182 | * | |
183 | * In either case, if zfs_free_leak_on_eio is set, i/o errors will be | |
184 | * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to | |
185 | * traverse_dataset_destroyed()). | |
186 | */ | |
9ae529ec CS |
int
bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func,
    void *arg, dmu_tx_t *tx)
{
	boolean_t ioerr = B_FALSE;	/* saw a skipped-over i/o error */
	int err;
	uint64_t i;
	dmu_buf_t *db;
	struct bptree_args ba;

	/* A tx is only required (and only used) when freeing. */
	ASSERT(!free || dmu_tx_is_syncing(tx));

	err = dmu_bonus_hold(os, obj, FTAG, &db);
	if (err != 0)
		return (err);

	if (free)
		dmu_buf_will_dirty(db, tx);

	/* Package up the traversal state for bptree_visit_cb(). */
	ba.ba_phys = db->db_data;
	ba.ba_free = free;
	ba.ba_func = func;
	ba.ba_arg = arg;
	ba.ba_tx = tx;

	err = 0;
	/* Walk every pending entry, oldest (bt_begin) first. */
	for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) {
		bptree_entry_phys_t bte;
		int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST;

		err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte),
		    &bte, DMU_READ_NO_PREFETCH);
		if (err != 0)
			break;

		/* With leak-on-eio, push past i/o errors inside the tree. */
		if (zfs_free_leak_on_eio)
			flags |= TRAVERSE_HARD;
		zfs_dbgmsg("bptree index %d: traversing from min_txg=%lld "
		    "bookmark %lld/%lld/%lld/%lld",
		    i, (longlong_t)bte.be_birth_txg,
		    (longlong_t)bte.be_zb.zb_objset,
		    (longlong_t)bte.be_zb.zb_object,
		    (longlong_t)bte.be_zb.zb_level,
		    (longlong_t)bte.be_zb.zb_blkid);
		err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp,
		    bte.be_birth_txg, &bte.be_zb, flags,
		    bptree_visit_cb, &ba);
		if (free) {
			/*
			 * The callback has freed the visited block pointers.
			 * Record our traversal progress on disk, either by
			 * updating this record's bookmark, or by logically
			 * removing this record by advancing bt_begin.
			 */
			if (err != 0) {
				/* save bookmark for future resume */
				ASSERT3U(bte.be_zb.zb_objset, ==,
				    ZB_DESTROYED_OBJSET);
				ASSERT0(bte.be_zb.zb_level);
				dmu_write(os, obj, i * sizeof (bte),
				    sizeof (bte), &bte, tx);
				if (err == EIO || err == ECKSUM ||
				    err == ENXIO) {
					/*
					 * Skip the rest of this tree and
					 * continue on to the next entry.
					 */
					err = 0;
					ioerr = B_TRUE;
				} else {
					break;
				}
			} else if (ioerr) {
				/*
				 * This entry is finished, but there were
				 * i/o errors on previous entries, so we
				 * can't adjust bt_begin.  Set this entry's
				 * be_birth_txg such that it will be
				 * treated as a no-op in future traversals.
				 */
				bte.be_birth_txg = UINT64_MAX;
				dmu_write(os, obj, i * sizeof (bte),
				    sizeof (bte), &bte, tx);
			}

			/*
			 * No i/o error yet in this invocation: the entry is
			 * fully freed, so pop it off the front of the queue
			 * and punch out its on-disk record.
			 */
			if (!ioerr) {
				ba.ba_phys->bt_begin++;
				(void) dmu_free_range(os, obj,
				    i * sizeof (bte), sizeof (bte), tx);
			}
		} else if (err != 0) {
			/* Not freeing: stop on the first error. */
			break;
		}
	}

	/* When freeing succeeds with no skipped errors, the queue is empty. */
	ASSERT(!free || err != 0 || ioerr ||
	    ba.ba_phys->bt_begin == ba.ba_phys->bt_end);

	/* if all blocks are free there should be no used space */
	if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) {
		/*
		 * With zfs_free_leak_on_eio, some blocks may have been
		 * leaked rather than freed, so force the accounting to
		 * zero before asserting it.
		 */
		if (zfs_free_leak_on_eio) {
			ba.ba_phys->bt_bytes = 0;
			ba.ba_phys->bt_comp = 0;
			ba.ba_phys->bt_uncomp = 0;
		}

		ASSERT0(ba.ba_phys->bt_bytes);
		ASSERT0(ba.ba_phys->bt_comp);
		ASSERT0(ba.ba_phys->bt_uncomp);
	}

	dmu_buf_rele(db, FTAG);

	return (err);
}