]>
Commit | Line | Data |
---|---|---|
428870ff BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
1d3ba0bf | 9 | * or https://opensource.org/licenses/CDDL-1.0. |
428870ff BB |
10 | * See the License for the specific language governing permissions |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
22 | * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. | |
37f03da8 | 23 | * Copyright (c) 2012, 2019 by Delphix. All rights reserved. |
0c66c32d | 24 | * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. |
428870ff BB |
25 | */ |
26 | ||
428870ff | 27 | #include <sys/dmu.h> |
428870ff BB |
28 | #include <sys/zap.h> |
29 | #include <sys/zfs_context.h> | |
30 | #include <sys/dsl_pool.h> | |
37f03da8 | 31 | #include <sys/dsl_dataset.h> |
428870ff | 32 | |
330d06f9 MA |
33 | /* |
34 | * Deadlist concurrency: | |
35 | * | |
36 | * Deadlists can only be modified from the syncing thread. | |
37 | * | |
38 | * Except for dsl_deadlist_insert(), it can only be modified with the | |
39 | * dp_config_rwlock held with RW_WRITER. | |
40 | * | |
41 | * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can | |
42 | * be called concurrently, from open context, with the dl_config_rwlock held | |
43 | * with RW_READER. | |
44 | * | |
45 | * Therefore, we only need to provide locking between dsl_deadlist_insert() and | |
46 | * the accessors, protecting: | |
47 | * dl_phys->dl_used,comp,uncomp | |
48 | * and protecting the dl_tree from being loaded. | |
49 | * The locking is provided by dl_lock. Note that locking on the bpobj_t | |
50 | * provides its own locking, and dl_oldfmt is immutable. | |
51 | */ | |
52 | ||
37f03da8 SH |
53 | /* |
54 | * Livelist Overview | |
55 | * ================ | |
56 | * | |
57 | * Livelists use the same 'deadlist_t' struct as deadlists and are also used | |
58 | * to track blkptrs over the lifetime of a dataset. Livelists however, belong | |
59 | * to clones and track the blkptrs that are clone-specific (were born after | |
60 | * the clone's creation). The exception is embedded block pointers which are | |
61 | * not included in livelists because they do not need to be freed. | |
62 | * | |
63 | * When it comes time to delete the clone, the livelist provides a quick | |
64 | * reference as to what needs to be freed. For this reason, livelists also track | |
65 | * when clone-specific blkptrs are freed before deletion to prevent double | |
66 | * frees. Each blkptr in a livelist is marked as a FREE or an ALLOC and the | |
67 | * deletion algorithm iterates backwards over the livelist, matching | |
68 | * FREE/ALLOC pairs and then freeing those ALLOCs which remain. livelists | |
69 | * are also updated in the case when blkptrs are remapped: the old version | |
70 | * of the blkptr is cancelled out with a FREE and the new version is tracked | |
71 | * with an ALLOC. | |
72 | * | |
73 | * To bound the amount of memory required for deletion, livelists over a | |
74 | * certain size are spread over multiple entries. Entries are grouped by | |
75 | * birth txg so we can be sure the ALLOC/FREE pair for a given blkptr will | |
76 | * be in the same entry. This allows us to delete livelists incrementally | |
77 | * over multiple syncs, one entry at a time. | |
78 | * | |
79 | * During the lifetime of the clone, livelists can get extremely large. | |
80 | * Their size is managed by periodic condensing (preemptively cancelling out | |
81 | * FREE/ALLOC pairs). Livelists are disabled when a clone is promoted or when | |
82 | * the shared space between the clone and its origin is so small that it | |
83 | * doesn't make sense to use livelists anymore. | |
84 | */ | |
85 | ||
86 | /* | |
87 | * The threshold sublist size at which we create a new sub-livelist for the | |
88 | * next txg. However, since blkptrs of the same transaction group must be in | |
89 | * the same sub-list, the actual sublist size may exceed this. When picking the | |
90 | * size we had to balance the fact that larger sublists mean fewer sublists | |
91 | * (decreasing the cost of insertion) against the consideration that sublists | |
92 | * will be loaded into memory and shouldn't take up an inordinate amount of | |
93 | * space. We settled on ~500000 entries, corresponding to roughly 128M. | |
94 | */ | |
ab8d9c17 | 95 | uint64_t zfs_livelist_max_entries = 500000; |
37f03da8 SH |
96 | |
97 | /* | |
98 | * We can approximate how much of a performance gain a livelist will give us | |
99 | * based on the percentage of blocks shared between the clone and its origin. | |
100 | * 0 percent shared means that the clone has completely diverged and that the | |
101 | * old method is maximally effective: every read from the block tree will | |
102 | * result in lots of frees. Livelists give us gains when they track blocks | |
103 | * scattered across the tree, when one read in the old method might only | |
104 | * result in a few frees. Once the clone has been overwritten enough, | |
105 | * writes are no longer sparse and we'll no longer get much of a benefit from | |
106 | * tracking them with a livelist. We chose a lower limit of 75 percent shared | |
107 | * (25 percent overwritten). This means that 1/4 of all block pointers will be | |
108 | * freed (e.g. each read frees 256, out of a max of 1024) so we expect livelists | |
109 | * to make deletion 4x faster. Once the amount of shared space drops below this | |
110 | * threshold, the clone will revert to the old deletion method. | |
111 | */ | |
112 | int zfs_livelist_min_percent_shared = 75; | |
113 | ||
428870ff BB |
114 | static int |
115 | dsl_deadlist_compare(const void *arg1, const void *arg2) | |
116 | { | |
325d288c MA |
117 | const dsl_deadlist_entry_t *dle1 = arg1; |
118 | const dsl_deadlist_entry_t *dle2 = arg2; | |
428870ff | 119 | |
ca577779 | 120 | return (TREE_CMP(dle1->dle_mintxg, dle2->dle_mintxg)); |
428870ff BB |
121 | } |
122 | ||
325d288c MA |
123 | static int |
124 | dsl_deadlist_cache_compare(const void *arg1, const void *arg2) | |
125 | { | |
126 | const dsl_deadlist_cache_entry_t *dlce1 = arg1; | |
127 | const dsl_deadlist_cache_entry_t *dlce2 = arg2; | |
128 | ||
ca577779 | 129 | return (TREE_CMP(dlce1->dlce_mintxg, dlce2->dlce_mintxg)); |
325d288c MA |
130 | } |
131 | ||
428870ff BB |
132 | static void |
133 | dsl_deadlist_load_tree(dsl_deadlist_t *dl) | |
134 | { | |
135 | zap_cursor_t zc; | |
136 | zap_attribute_t za; | |
d87676a9 | 137 | int error; |
428870ff | 138 | |
df7eeccc MA |
139 | ASSERT(MUTEX_HELD(&dl->dl_lock)); |
140 | ||
428870ff | 141 | ASSERT(!dl->dl_oldfmt); |
325d288c MA |
142 | if (dl->dl_havecache) { |
143 | /* | |
144 | * After loading the tree, the caller may modify the tree, | |
145 | * e.g. to add or remove nodes, or to make a node no longer | |
146 | * refer to the empty_bpobj. These changes would make the | |
147 | * dl_cache incorrect. Therefore we discard the cache here, | |
148 | * so that it can't become incorrect. | |
149 | */ | |
150 | dsl_deadlist_cache_entry_t *dlce; | |
151 | void *cookie = NULL; | |
152 | while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie)) | |
153 | != NULL) { | |
154 | kmem_free(dlce, sizeof (*dlce)); | |
155 | } | |
156 | avl_destroy(&dl->dl_cache); | |
157 | dl->dl_havecache = B_FALSE; | |
158 | } | |
428870ff BB |
159 | if (dl->dl_havetree) |
160 | return; | |
161 | ||
162 | avl_create(&dl->dl_tree, dsl_deadlist_compare, | |
163 | sizeof (dsl_deadlist_entry_t), | |
164 | offsetof(dsl_deadlist_entry_t, dle_node)); | |
165 | for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); | |
d87676a9 | 166 | (error = zap_cursor_retrieve(&zc, &za)) == 0; |
428870ff | 167 | zap_cursor_advance(&zc)) { |
e19572e4 YP |
168 | dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP); |
169 | dle->dle_mintxg = zfs_strtonum(za.za_name, NULL); | |
325d288c MA |
170 | |
171 | /* | |
172 | * Prefetch all the bpobj's so that we do that i/o | |
173 | * in parallel. Then open them all in a second pass. | |
174 | */ | |
175 | dle->dle_bpobj.bpo_object = za.za_first_integer; | |
6c94e649 AM |
176 | dmu_prefetch_dnode(dl->dl_os, dle->dle_bpobj.bpo_object, |
177 | ZIO_PRIORITY_SYNC_READ); | |
325d288c | 178 | |
428870ff BB |
179 | avl_add(&dl->dl_tree, dle); |
180 | } | |
d87676a9 | 181 | VERIFY3U(error, ==, ENOENT); |
428870ff | 182 | zap_cursor_fini(&zc); |
325d288c MA |
183 | |
184 | for (dsl_deadlist_entry_t *dle = avl_first(&dl->dl_tree); | |
185 | dle != NULL; dle = AVL_NEXT(&dl->dl_tree, dle)) { | |
186 | VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, | |
187 | dle->dle_bpobj.bpo_object)); | |
188 | } | |
428870ff BB |
189 | dl->dl_havetree = B_TRUE; |
190 | } | |
191 | ||
325d288c MA |
192 | /* |
193 | * Load only the non-empty bpobj's into the dl_cache. The cache is an analog | |
194 | * of the dl_tree, but contains only non-empty_bpobj nodes from the ZAP. It | |
195 | * is used only for gathering space statistics. The dl_cache has two | |
196 | * advantages over the dl_tree: | |
197 | * | |
198 | * 1. Loading the dl_cache is ~5x faster than loading the dl_tree (if it's | |
199 | * mostly empty_bpobj's), due to less CPU overhead to open the empty_bpobj | |
200 | * many times and to inquire about its (zero) space stats many times. | |
201 | * | |
202 | * 2. The dl_cache uses less memory than the dl_tree. We only need to load | |
203 | * the dl_tree of snapshots when deleting a snapshot, after which we free the | |
204 | * dl_tree with dsl_deadlist_discard_tree | |
205 | */ | |
206 | static void | |
207 | dsl_deadlist_load_cache(dsl_deadlist_t *dl) | |
208 | { | |
209 | zap_cursor_t zc; | |
210 | zap_attribute_t za; | |
d87676a9 | 211 | int error; |
325d288c MA |
212 | |
213 | ASSERT(MUTEX_HELD(&dl->dl_lock)); | |
214 | ||
215 | ASSERT(!dl->dl_oldfmt); | |
216 | if (dl->dl_havecache) | |
217 | return; | |
218 | ||
219 | uint64_t empty_bpobj = dmu_objset_pool(dl->dl_os)->dp_empty_bpobj; | |
220 | ||
221 | avl_create(&dl->dl_cache, dsl_deadlist_cache_compare, | |
222 | sizeof (dsl_deadlist_cache_entry_t), | |
223 | offsetof(dsl_deadlist_cache_entry_t, dlce_node)); | |
224 | for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); | |
d87676a9 | 225 | (error = zap_cursor_retrieve(&zc, &za)) == 0; |
325d288c MA |
226 | zap_cursor_advance(&zc)) { |
227 | if (za.za_first_integer == empty_bpobj) | |
228 | continue; | |
229 | dsl_deadlist_cache_entry_t *dlce = | |
230 | kmem_zalloc(sizeof (*dlce), KM_SLEEP); | |
231 | dlce->dlce_mintxg = zfs_strtonum(za.za_name, NULL); | |
232 | ||
233 | /* | |
234 | * Prefetch all the bpobj's so that we do that i/o | |
235 | * in parallel. Then open them all in a second pass. | |
236 | */ | |
237 | dlce->dlce_bpobj = za.za_first_integer; | |
6c94e649 AM |
238 | dmu_prefetch_dnode(dl->dl_os, dlce->dlce_bpobj, |
239 | ZIO_PRIORITY_SYNC_READ); | |
325d288c MA |
240 | avl_add(&dl->dl_cache, dlce); |
241 | } | |
d87676a9 | 242 | VERIFY3U(error, ==, ENOENT); |
325d288c MA |
243 | zap_cursor_fini(&zc); |
244 | ||
245 | for (dsl_deadlist_cache_entry_t *dlce = avl_first(&dl->dl_cache); | |
246 | dlce != NULL; dlce = AVL_NEXT(&dl->dl_cache, dlce)) { | |
247 | bpobj_t bpo; | |
248 | VERIFY0(bpobj_open(&bpo, dl->dl_os, dlce->dlce_bpobj)); | |
249 | ||
250 | VERIFY0(bpobj_space(&bpo, | |
251 | &dlce->dlce_bytes, &dlce->dlce_comp, &dlce->dlce_uncomp)); | |
252 | bpobj_close(&bpo); | |
253 | } | |
254 | dl->dl_havecache = B_TRUE; | |
255 | } | |
256 | ||
257 | /* | |
258 | * Discard the tree to save memory. | |
259 | */ | |
260 | void | |
261 | dsl_deadlist_discard_tree(dsl_deadlist_t *dl) | |
262 | { | |
263 | mutex_enter(&dl->dl_lock); | |
264 | ||
265 | if (!dl->dl_havetree) { | |
266 | mutex_exit(&dl->dl_lock); | |
267 | return; | |
268 | } | |
269 | dsl_deadlist_entry_t *dle; | |
270 | void *cookie = NULL; | |
271 | while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) != NULL) { | |
272 | bpobj_close(&dle->dle_bpobj); | |
273 | kmem_free(dle, sizeof (*dle)); | |
274 | } | |
275 | avl_destroy(&dl->dl_tree); | |
276 | ||
277 | dl->dl_havetree = B_FALSE; | |
278 | mutex_exit(&dl->dl_lock); | |
279 | } | |
280 | ||
37f03da8 SH |
281 | void |
282 | dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *args) | |
283 | { | |
284 | dsl_deadlist_entry_t *dle; | |
285 | ||
286 | ASSERT(dsl_deadlist_is_open(dl)); | |
287 | ||
288 | mutex_enter(&dl->dl_lock); | |
289 | dsl_deadlist_load_tree(dl); | |
290 | mutex_exit(&dl->dl_lock); | |
291 | for (dle = avl_first(&dl->dl_tree); dle != NULL; | |
292 | dle = AVL_NEXT(&dl->dl_tree, dle)) { | |
293 | if (func(args, dle) != 0) | |
294 | break; | |
295 | } | |
296 | } | |
297 | ||
428870ff BB |
298 | void |
299 | dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) | |
300 | { | |
301 | dmu_object_info_t doi; | |
302 | ||
a1d477c2 MA |
303 | ASSERT(!dsl_deadlist_is_open(dl)); |
304 | ||
428870ff BB |
305 | mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL); |
306 | dl->dl_os = os; | |
307 | dl->dl_object = object; | |
30af21b0 | 308 | VERIFY0(dmu_bonus_hold(os, object, dl, &dl->dl_dbuf)); |
428870ff BB |
309 | dmu_object_info_from_db(dl->dl_dbuf, &doi); |
310 | if (doi.doi_type == DMU_OT_BPOBJ) { | |
311 | dmu_buf_rele(dl->dl_dbuf, dl); | |
312 | dl->dl_dbuf = NULL; | |
313 | dl->dl_oldfmt = B_TRUE; | |
30af21b0 | 314 | VERIFY0(bpobj_open(&dl->dl_bpobj, os, object)); |
428870ff BB |
315 | return; |
316 | } | |
317 | ||
318 | dl->dl_oldfmt = B_FALSE; | |
319 | dl->dl_phys = dl->dl_dbuf->db_data; | |
320 | dl->dl_havetree = B_FALSE; | |
325d288c | 321 | dl->dl_havecache = B_FALSE; |
428870ff BB |
322 | } |
323 | ||
a1d477c2 MA |
324 | boolean_t |
325 | dsl_deadlist_is_open(dsl_deadlist_t *dl) | |
326 | { | |
327 | return (dl->dl_os != NULL); | |
328 | } | |
329 | ||
428870ff BB |
330 | void |
331 | dsl_deadlist_close(dsl_deadlist_t *dl) | |
332 | { | |
a1d477c2 | 333 | ASSERT(dsl_deadlist_is_open(dl)); |
c17486b2 | 334 | mutex_destroy(&dl->dl_lock); |
0c66c32d | 335 | |
428870ff BB |
336 | if (dl->dl_oldfmt) { |
337 | dl->dl_oldfmt = B_FALSE; | |
338 | bpobj_close(&dl->dl_bpobj); | |
a1d477c2 MA |
339 | dl->dl_os = NULL; |
340 | dl->dl_object = 0; | |
428870ff BB |
341 | return; |
342 | } | |
343 | ||
344 | if (dl->dl_havetree) { | |
325d288c MA |
345 | dsl_deadlist_entry_t *dle; |
346 | void *cookie = NULL; | |
428870ff BB |
347 | while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) |
348 | != NULL) { | |
349 | bpobj_close(&dle->dle_bpobj); | |
350 | kmem_free(dle, sizeof (*dle)); | |
351 | } | |
352 | avl_destroy(&dl->dl_tree); | |
353 | } | |
325d288c MA |
354 | if (dl->dl_havecache) { |
355 | dsl_deadlist_cache_entry_t *dlce; | |
356 | void *cookie = NULL; | |
357 | while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie)) | |
358 | != NULL) { | |
359 | kmem_free(dlce, sizeof (*dlce)); | |
360 | } | |
361 | avl_destroy(&dl->dl_cache); | |
362 | } | |
428870ff | 363 | dmu_buf_rele(dl->dl_dbuf, dl); |
428870ff BB |
364 | dl->dl_dbuf = NULL; |
365 | dl->dl_phys = NULL; | |
a1d477c2 MA |
366 | dl->dl_os = NULL; |
367 | dl->dl_object = 0; | |
428870ff BB |
368 | } |
369 | ||
370 | uint64_t | |
371 | dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx) | |
372 | { | |
373 | if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) | |
f1512ee6 | 374 | return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx)); |
428870ff BB |
375 | return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR, |
376 | sizeof (dsl_deadlist_phys_t), tx)); | |
377 | } | |
378 | ||
379 | void | |
380 | dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) | |
381 | { | |
382 | dmu_object_info_t doi; | |
383 | zap_cursor_t zc; | |
384 | zap_attribute_t za; | |
d87676a9 | 385 | int error; |
428870ff | 386 | |
30af21b0 | 387 | VERIFY0(dmu_object_info(os, dlobj, &doi)); |
428870ff BB |
388 | if (doi.doi_type == DMU_OT_BPOBJ) { |
389 | bpobj_free(os, dlobj, tx); | |
390 | return; | |
391 | } | |
392 | ||
393 | for (zap_cursor_init(&zc, os, dlobj); | |
d87676a9 | 394 | (error = zap_cursor_retrieve(&zc, &za)) == 0; |
753c3839 MA |
395 | zap_cursor_advance(&zc)) { |
396 | uint64_t obj = za.za_first_integer; | |
397 | if (obj == dmu_objset_pool(os)->dp_empty_bpobj) | |
398 | bpobj_decr_empty(os, tx); | |
399 | else | |
400 | bpobj_free(os, obj, tx); | |
401 | } | |
d87676a9 | 402 | VERIFY3U(error, ==, ENOENT); |
428870ff | 403 | zap_cursor_fini(&zc); |
30af21b0 | 404 | VERIFY0(dmu_object_free(os, dlobj, tx)); |
428870ff BB |
405 | } |
406 | ||
753c3839 MA |
407 | static void |
408 | dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, | |
37f03da8 | 409 | const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) |
753c3839 | 410 | { |
df7eeccc | 411 | ASSERT(MUTEX_HELD(&dl->dl_lock)); |
753c3839 MA |
412 | if (dle->dle_bpobj.bpo_object == |
413 | dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { | |
f1512ee6 | 414 | uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); |
753c3839 MA |
415 | bpobj_close(&dle->dle_bpobj); |
416 | bpobj_decr_empty(dl->dl_os, tx); | |
30af21b0 PD |
417 | VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); |
418 | VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object, | |
753c3839 MA |
419 | dle->dle_mintxg, obj, tx)); |
420 | } | |
37f03da8 | 421 | bpobj_enqueue(&dle->dle_bpobj, bp, bp_freed, tx); |
753c3839 MA |
422 | } |
423 | ||
424 | static void | |
425 | dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, | |
426 | uint64_t obj, dmu_tx_t *tx) | |
427 | { | |
df7eeccc | 428 | ASSERT(MUTEX_HELD(&dl->dl_lock)); |
753c3839 MA |
429 | if (dle->dle_bpobj.bpo_object != |
430 | dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { | |
431 | bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx); | |
432 | } else { | |
433 | bpobj_close(&dle->dle_bpobj); | |
434 | bpobj_decr_empty(dl->dl_os, tx); | |
30af21b0 PD |
435 | VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); |
436 | VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object, | |
753c3839 MA |
437 | dle->dle_mintxg, obj, tx)); |
438 | } | |
439 | } | |
440 | ||
dc5c8006 AM |
441 | /* |
442 | * Prefetch metadata required for dle_enqueue_subobj(). | |
443 | */ | |
444 | static void | |
445 | dle_prefetch_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, | |
446 | uint64_t obj) | |
447 | { | |
448 | if (dle->dle_bpobj.bpo_object != | |
449 | dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) | |
450 | bpobj_prefetch_subobj(&dle->dle_bpobj, obj); | |
451 | } | |
452 | ||
428870ff | 453 | void |
37f03da8 SH |
454 | dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed, |
455 | dmu_tx_t *tx) | |
428870ff BB |
456 | { |
457 | dsl_deadlist_entry_t dle_tofind; | |
458 | dsl_deadlist_entry_t *dle; | |
459 | avl_index_t where; | |
460 | ||
461 | if (dl->dl_oldfmt) { | |
37f03da8 | 462 | bpobj_enqueue(&dl->dl_bpobj, bp, bp_freed, tx); |
428870ff BB |
463 | return; |
464 | } | |
465 | ||
df7eeccc | 466 | mutex_enter(&dl->dl_lock); |
428870ff BB |
467 | dsl_deadlist_load_tree(dl); |
468 | ||
469 | dmu_buf_will_dirty(dl->dl_dbuf, tx); | |
37f03da8 SH |
470 | |
471 | int sign = bp_freed ? -1 : +1; | |
428870ff | 472 | dl->dl_phys->dl_used += |
37f03da8 SH |
473 | sign * bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); |
474 | dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp); | |
475 | dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp); | |
428870ff | 476 | |
493fcce9 | 477 | dle_tofind.dle_mintxg = BP_GET_LOGICAL_BIRTH(bp); |
428870ff BB |
478 | dle = avl_find(&dl->dl_tree, &dle_tofind, &where); |
479 | if (dle == NULL) | |
480 | dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); | |
481 | else | |
482 | dle = AVL_PREV(&dl->dl_tree, dle); | |
eba9e745 BB |
483 | |
484 | if (dle == NULL) { | |
485 | zfs_panic_recover("blkptr at %p has invalid BLK_BIRTH %llu", | |
493fcce9 | 486 | bp, (longlong_t)BP_GET_LOGICAL_BIRTH(bp)); |
eba9e745 BB |
487 | dle = avl_first(&dl->dl_tree); |
488 | } | |
489 | ||
490 | ASSERT3P(dle, !=, NULL); | |
37f03da8 | 491 | dle_enqueue(dl, dle, bp, bp_freed, tx); |
df7eeccc | 492 | mutex_exit(&dl->dl_lock); |
428870ff BB |
493 | } |
494 | ||
37f03da8 SH |
495 | int |
496 | dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) | |
497 | { | |
498 | dsl_deadlist_t *dl = arg; | |
499 | dsl_deadlist_insert(dl, bp, B_FALSE, tx); | |
500 | return (0); | |
501 | } | |
502 | ||
503 | int | |
504 | dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) | |
505 | { | |
506 | dsl_deadlist_t *dl = arg; | |
507 | dsl_deadlist_insert(dl, bp, B_TRUE, tx); | |
508 | return (0); | |
509 | } | |
510 | ||
428870ff BB |
511 | /* |
512 | * Insert new key in deadlist, which must be > all current entries. | |
513 | * mintxg is not inclusive. | |
514 | */ | |
515 | void | |
516 | dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) | |
517 | { | |
518 | uint64_t obj; | |
519 | dsl_deadlist_entry_t *dle; | |
520 | ||
521 | if (dl->dl_oldfmt) | |
522 | return; | |
523 | ||
79c76d5b | 524 | dle = kmem_alloc(sizeof (*dle), KM_SLEEP); |
428870ff | 525 | dle->dle_mintxg = mintxg; |
df7eeccc MA |
526 | |
527 | mutex_enter(&dl->dl_lock); | |
528 | dsl_deadlist_load_tree(dl); | |
529 | ||
f1512ee6 | 530 | obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); |
30af21b0 | 531 | VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); |
428870ff BB |
532 | avl_add(&dl->dl_tree, dle); |
533 | ||
30af21b0 | 534 | VERIFY0(zap_add_int_key(dl->dl_os, dl->dl_object, |
428870ff | 535 | mintxg, obj, tx)); |
df7eeccc | 536 | mutex_exit(&dl->dl_lock); |
428870ff BB |
537 | } |
538 | ||
539 | /* | |
540 | * Remove this key, merging its entries into the previous key. | |
541 | */ | |
542 | void | |
543 | dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) | |
544 | { | |
545 | dsl_deadlist_entry_t dle_tofind; | |
546 | dsl_deadlist_entry_t *dle, *dle_prev; | |
547 | ||
548 | if (dl->dl_oldfmt) | |
549 | return; | |
df7eeccc | 550 | mutex_enter(&dl->dl_lock); |
428870ff BB |
551 | dsl_deadlist_load_tree(dl); |
552 | ||
553 | dle_tofind.dle_mintxg = mintxg; | |
554 | dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); | |
30af21b0 | 555 | ASSERT3P(dle, !=, NULL); |
428870ff | 556 | dle_prev = AVL_PREV(&dl->dl_tree, dle); |
a6ccb36b | 557 | ASSERT3P(dle_prev, !=, NULL); |
428870ff | 558 | |
753c3839 | 559 | dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx); |
428870ff BB |
560 | |
561 | avl_remove(&dl->dl_tree, dle); | |
562 | bpobj_close(&dle->dle_bpobj); | |
563 | kmem_free(dle, sizeof (*dle)); | |
564 | ||
30af21b0 | 565 | VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx)); |
df7eeccc | 566 | mutex_exit(&dl->dl_lock); |
428870ff BB |
567 | } |
568 | ||
37f03da8 SH |
569 | /* |
570 | * Remove a deadlist entry and all of its contents by removing the entry from | |
571 | * the deadlist's avl tree, freeing the entry's bpobj and adjusting the | |
572 | * deadlist's space accounting accordingly. | |
573 | */ | |
574 | void | |
575 | dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) | |
576 | { | |
577 | uint64_t used, comp, uncomp; | |
578 | dsl_deadlist_entry_t dle_tofind; | |
579 | dsl_deadlist_entry_t *dle; | |
580 | objset_t *os = dl->dl_os; | |
581 | ||
582 | if (dl->dl_oldfmt) | |
583 | return; | |
584 | ||
585 | mutex_enter(&dl->dl_lock); | |
586 | dsl_deadlist_load_tree(dl); | |
587 | ||
588 | dle_tofind.dle_mintxg = mintxg; | |
589 | dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); | |
590 | VERIFY3P(dle, !=, NULL); | |
591 | ||
592 | avl_remove(&dl->dl_tree, dle); | |
593 | VERIFY0(zap_remove_int(os, dl->dl_object, mintxg, tx)); | |
594 | VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); | |
325d288c | 595 | dmu_buf_will_dirty(dl->dl_dbuf, tx); |
37f03da8 SH |
596 | dl->dl_phys->dl_used -= used; |
597 | dl->dl_phys->dl_comp -= comp; | |
598 | dl->dl_phys->dl_uncomp -= uncomp; | |
599 | if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) { | |
600 | bpobj_decr_empty(os, tx); | |
601 | } else { | |
602 | bpobj_free(os, dle->dle_bpobj.bpo_object, tx); | |
603 | } | |
604 | bpobj_close(&dle->dle_bpobj); | |
605 | kmem_free(dle, sizeof (*dle)); | |
606 | mutex_exit(&dl->dl_lock); | |
607 | } | |
608 | ||
609 | /* | |
610 | * Clear out the contents of a deadlist_entry by freeing its bpobj, | |
611 | * replacing it with an empty bpobj and adjusting the deadlist's | |
612 | * space accounting | |
613 | */ | |
614 | void | |
615 | dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl, | |
616 | dmu_tx_t *tx) | |
617 | { | |
618 | uint64_t new_obj, used, comp, uncomp; | |
619 | objset_t *os = dl->dl_os; | |
620 | ||
621 | mutex_enter(&dl->dl_lock); | |
622 | VERIFY0(zap_remove_int(os, dl->dl_object, dle->dle_mintxg, tx)); | |
623 | VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); | |
325d288c | 624 | dmu_buf_will_dirty(dl->dl_dbuf, tx); |
37f03da8 SH |
625 | dl->dl_phys->dl_used -= used; |
626 | dl->dl_phys->dl_comp -= comp; | |
627 | dl->dl_phys->dl_uncomp -= uncomp; | |
628 | if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) | |
629 | bpobj_decr_empty(os, tx); | |
630 | else | |
631 | bpobj_free(os, dle->dle_bpobj.bpo_object, tx); | |
632 | bpobj_close(&dle->dle_bpobj); | |
633 | new_obj = bpobj_alloc_empty(os, SPA_OLD_MAXBLOCKSIZE, tx); | |
634 | VERIFY0(bpobj_open(&dle->dle_bpobj, os, new_obj)); | |
635 | VERIFY0(zap_add_int_key(os, dl->dl_object, dle->dle_mintxg, | |
636 | new_obj, tx)); | |
637 | ASSERT(bpobj_is_empty(&dle->dle_bpobj)); | |
638 | mutex_exit(&dl->dl_lock); | |
639 | } | |
640 | ||
641 | /* | |
642 | * Return the first entry in deadlist's avl tree | |
643 | */ | |
644 | dsl_deadlist_entry_t * | |
645 | dsl_deadlist_first(dsl_deadlist_t *dl) | |
646 | { | |
647 | dsl_deadlist_entry_t *dle; | |
648 | ||
649 | mutex_enter(&dl->dl_lock); | |
650 | dsl_deadlist_load_tree(dl); | |
651 | dle = avl_first(&dl->dl_tree); | |
652 | mutex_exit(&dl->dl_lock); | |
653 | ||
654 | return (dle); | |
655 | } | |
656 | ||
657 | /* | |
658 | * Return the last entry in deadlist's avl tree | |
659 | */ | |
660 | dsl_deadlist_entry_t * | |
661 | dsl_deadlist_last(dsl_deadlist_t *dl) | |
662 | { | |
663 | dsl_deadlist_entry_t *dle; | |
664 | ||
665 | mutex_enter(&dl->dl_lock); | |
666 | dsl_deadlist_load_tree(dl); | |
667 | dle = avl_last(&dl->dl_tree); | |
668 | mutex_exit(&dl->dl_lock); | |
669 | ||
670 | return (dle); | |
671 | } | |
672 | ||
428870ff BB |
673 | /* |
674 | * Walk ds's snapshots to regenerate generate ZAP & AVL. | |
675 | */ | |
676 | static void | |
677 | dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj, | |
678 | uint64_t mrs_obj, dmu_tx_t *tx) | |
679 | { | |
a1d477c2 | 680 | dsl_deadlist_t dl = { 0 }; |
428870ff BB |
681 | dsl_pool_t *dp = dmu_objset_pool(os); |
682 | ||
683 | dsl_deadlist_open(&dl, os, dlobj); | |
684 | if (dl.dl_oldfmt) { | |
685 | dsl_deadlist_close(&dl); | |
686 | return; | |
687 | } | |
688 | ||
689 | while (mrs_obj != 0) { | |
690 | dsl_dataset_t *ds; | |
30af21b0 | 691 | VERIFY0(dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds)); |
d683ddbb JG |
692 | dsl_deadlist_add_key(&dl, |
693 | dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); | |
694 | mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; | |
428870ff BB |
695 | dsl_dataset_rele(ds, FTAG); |
696 | } | |
697 | dsl_deadlist_close(&dl); | |
698 | } | |
699 | ||
700 | uint64_t | |
701 | dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, | |
702 | uint64_t mrs_obj, dmu_tx_t *tx) | |
703 | { | |
704 | dsl_deadlist_entry_t *dle; | |
705 | uint64_t newobj; | |
706 | ||
707 | newobj = dsl_deadlist_alloc(dl->dl_os, tx); | |
708 | ||
709 | if (dl->dl_oldfmt) { | |
710 | dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx); | |
711 | return (newobj); | |
712 | } | |
713 | ||
df7eeccc | 714 | mutex_enter(&dl->dl_lock); |
428870ff BB |
715 | dsl_deadlist_load_tree(dl); |
716 | ||
717 | for (dle = avl_first(&dl->dl_tree); dle; | |
718 | dle = AVL_NEXT(&dl->dl_tree, dle)) { | |
719 | uint64_t obj; | |
720 | ||
721 | if (dle->dle_mintxg >= maxtxg) | |
722 | break; | |
723 | ||
f1512ee6 | 724 | obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); |
30af21b0 | 725 | VERIFY0(zap_add_int_key(dl->dl_os, newobj, |
428870ff BB |
726 | dle->dle_mintxg, obj, tx)); |
727 | } | |
df7eeccc | 728 | mutex_exit(&dl->dl_lock); |
428870ff BB |
729 | return (newobj); |
730 | } | |
731 | ||
732 | void | |
733 | dsl_deadlist_space(dsl_deadlist_t *dl, | |
734 | uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) | |
735 | { | |
a1d477c2 | 736 | ASSERT(dsl_deadlist_is_open(dl)); |
428870ff | 737 | if (dl->dl_oldfmt) { |
30af21b0 | 738 | VERIFY0(bpobj_space(&dl->dl_bpobj, |
428870ff BB |
739 | usedp, compp, uncompp)); |
740 | return; | |
741 | } | |
742 | ||
743 | mutex_enter(&dl->dl_lock); | |
744 | *usedp = dl->dl_phys->dl_used; | |
745 | *compp = dl->dl_phys->dl_comp; | |
746 | *uncompp = dl->dl_phys->dl_uncomp; | |
747 | mutex_exit(&dl->dl_lock); | |
748 | } | |
749 | ||
750 | /* | |
751 | * return space used in the range (mintxg, maxtxg]. | |
752 | * Includes maxtxg, does not include mintxg. | |
753 | * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is | |
30af21b0 | 754 | * UINT64_MAX). |
428870ff BB |
755 | */ |
756 | void | |
757 | dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, | |
758 | uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) | |
759 | { | |
325d288c MA |
760 | dsl_deadlist_cache_entry_t *dlce; |
761 | dsl_deadlist_cache_entry_t dlce_tofind; | |
428870ff BB |
762 | avl_index_t where; |
763 | ||
764 | if (dl->dl_oldfmt) { | |
30af21b0 | 765 | VERIFY0(bpobj_space_range(&dl->dl_bpobj, |
428870ff BB |
766 | mintxg, maxtxg, usedp, compp, uncompp)); |
767 | return; | |
768 | } | |
769 | ||
428870ff BB |
770 | *usedp = *compp = *uncompp = 0; |
771 | ||
330d06f9 | 772 | mutex_enter(&dl->dl_lock); |
325d288c MA |
773 | dsl_deadlist_load_cache(dl); |
774 | dlce_tofind.dlce_mintxg = mintxg; | |
775 | dlce = avl_find(&dl->dl_cache, &dlce_tofind, &where); | |
776 | ||
428870ff | 777 | /* |
325d288c MA |
778 | * If this mintxg doesn't exist, it may be an empty_bpobj which |
779 | * is omitted from the sparse tree. Start at the next non-empty | |
780 | * entry. | |
428870ff | 781 | */ |
325d288c MA |
782 | if (dlce == NULL) |
783 | dlce = avl_nearest(&dl->dl_cache, where, AVL_AFTER); | |
784 | ||
785 | for (; dlce && dlce->dlce_mintxg < maxtxg; | |
786 | dlce = AVL_NEXT(&dl->dl_tree, dlce)) { | |
787 | *usedp += dlce->dlce_bytes; | |
788 | *compp += dlce->dlce_comp; | |
789 | *uncompp += dlce->dlce_uncomp; | |
428870ff | 790 | } |
30af21b0 | 791 | |
330d06f9 | 792 | mutex_exit(&dl->dl_lock); |
428870ff BB |
793 | } |
794 | ||
795 | static void | |
796 | dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, | |
797 | dmu_tx_t *tx) | |
798 | { | |
799 | dsl_deadlist_entry_t dle_tofind; | |
800 | dsl_deadlist_entry_t *dle; | |
801 | avl_index_t where; | |
802 | uint64_t used, comp, uncomp; | |
803 | bpobj_t bpo; | |
804 | ||
df7eeccc MA |
805 | ASSERT(MUTEX_HELD(&dl->dl_lock)); |
806 | ||
30af21b0 PD |
807 | VERIFY0(bpobj_open(&bpo, dl->dl_os, obj)); |
808 | VERIFY0(bpobj_space(&bpo, &used, &comp, &uncomp)); | |
428870ff BB |
809 | bpobj_close(&bpo); |
810 | ||
811 | dsl_deadlist_load_tree(dl); | |
812 | ||
813 | dmu_buf_will_dirty(dl->dl_dbuf, tx); | |
428870ff BB |
814 | dl->dl_phys->dl_used += used; |
815 | dl->dl_phys->dl_comp += comp; | |
816 | dl->dl_phys->dl_uncomp += uncomp; | |
428870ff BB |
817 | |
818 | dle_tofind.dle_mintxg = birth; | |
819 | dle = avl_find(&dl->dl_tree, &dle_tofind, &where); | |
820 | if (dle == NULL) | |
821 | dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); | |
753c3839 | 822 | dle_enqueue_subobj(dl, dle, obj, tx); |
428870ff BB |
823 | } |
824 | ||
dc5c8006 AM |
825 | /* |
826 | * Prefetch metadata required for dsl_deadlist_insert_bpobj(). | |
827 | */ | |
828 | static void | |
829 | dsl_deadlist_prefetch_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth) | |
830 | { | |
831 | dsl_deadlist_entry_t dle_tofind; | |
832 | dsl_deadlist_entry_t *dle; | |
833 | avl_index_t where; | |
834 | ||
835 | ASSERT(MUTEX_HELD(&dl->dl_lock)); | |
836 | ||
837 | dsl_deadlist_load_tree(dl); | |
838 | ||
839 | dle_tofind.dle_mintxg = birth; | |
840 | dle = avl_find(&dl->dl_tree, &dle_tofind, &where); | |
841 | if (dle == NULL) | |
842 | dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); | |
843 | dle_prefetch_subobj(dl, dle, obj); | |
844 | } | |
845 | ||
428870ff | 846 | static int |
37f03da8 SH |
847 | dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, |
848 | dmu_tx_t *tx) | |
428870ff BB |
849 | { |
850 | dsl_deadlist_t *dl = arg; | |
37f03da8 | 851 | dsl_deadlist_insert(dl, bp, bp_freed, tx); |
428870ff BB |
852 | return (0); |
853 | } | |
854 | ||
855 | /* | |
856 | * Merge the deadlist pointed to by 'obj' into dl. obj will be left as | |
857 | * an empty deadlist. | |
858 | */ | |
859 | void | |
860 | dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) | |
861 | { | |
dc5c8006 | 862 | zap_cursor_t zc, pzc; |
3b9309aa | 863 | zap_attribute_t *za, *pza; |
428870ff BB |
864 | dmu_buf_t *bonus; |
865 | dsl_deadlist_phys_t *dlp; | |
866 | dmu_object_info_t doi; | |
dc5c8006 | 867 | int error, perror, i; |
428870ff | 868 | |
30af21b0 | 869 | VERIFY0(dmu_object_info(dl->dl_os, obj, &doi)); |
428870ff BB |
870 | if (doi.doi_type == DMU_OT_BPOBJ) { |
871 | bpobj_t bpo; | |
30af21b0 PD |
872 | VERIFY0(bpobj_open(&bpo, dl->dl_os, obj)); |
873 | VERIFY0(bpobj_iterate(&bpo, dsl_deadlist_insert_cb, dl, tx)); | |
428870ff BB |
874 | bpobj_close(&bpo); |
875 | return; | |
876 | } | |
877 | ||
3b9309aa MZ |
878 | za = kmem_alloc(sizeof (*za), KM_SLEEP); |
879 | pza = kmem_alloc(sizeof (*pza), KM_SLEEP); | |
880 | ||
df7eeccc | 881 | mutex_enter(&dl->dl_lock); |
dc5c8006 AM |
882 | /* |
883 | * Prefetch up to 128 deadlists first and then more as we progress. | |
884 | * The limit is a balance between ARC use and diminishing returns. | |
885 | */ | |
886 | for (zap_cursor_init(&pzc, dl->dl_os, obj), i = 0; | |
3b9309aa | 887 | (perror = zap_cursor_retrieve(&pzc, pza)) == 0 && i < 128; |
dc5c8006 | 888 | zap_cursor_advance(&pzc), i++) { |
3b9309aa MZ |
889 | dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer, |
890 | zfs_strtonum(pza->za_name, NULL)); | |
dc5c8006 | 891 | } |
428870ff | 892 | for (zap_cursor_init(&zc, dl->dl_os, obj); |
3b9309aa | 893 | (error = zap_cursor_retrieve(&zc, za)) == 0; |
428870ff | 894 | zap_cursor_advance(&zc)) { |
fdba8cbb AM |
895 | dsl_deadlist_insert_bpobj(dl, za->za_first_integer, |
896 | zfs_strtonum(za->za_name, NULL), tx); | |
897 | VERIFY0(zap_remove(dl->dl_os, obj, za->za_name, tx)); | |
dc5c8006 | 898 | if (perror == 0) { |
3b9309aa MZ |
899 | dsl_deadlist_prefetch_bpobj(dl, pza->za_first_integer, |
900 | zfs_strtonum(pza->za_name, NULL)); | |
dc5c8006 | 901 | zap_cursor_advance(&pzc); |
3b9309aa | 902 | perror = zap_cursor_retrieve(&pzc, pza); |
dc5c8006 | 903 | } |
428870ff | 904 | } |
d87676a9 | 905 | VERIFY3U(error, ==, ENOENT); |
428870ff | 906 | zap_cursor_fini(&zc); |
dc5c8006 | 907 | zap_cursor_fini(&pzc); |
428870ff | 908 | |
30af21b0 | 909 | VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); |
428870ff BB |
910 | dlp = bonus->db_data; |
911 | dmu_buf_will_dirty(bonus, tx); | |
861166b0 | 912 | memset(dlp, 0, sizeof (*dlp)); |
428870ff | 913 | dmu_buf_rele(bonus, FTAG); |
df7eeccc | 914 | mutex_exit(&dl->dl_lock); |
3b9309aa MZ |
915 | |
916 | kmem_free(za, sizeof (*za)); | |
917 | kmem_free(pza, sizeof (*pza)); | |
428870ff BB |
918 | } |
919 | ||
920 | /* | |
30af21b0 | 921 | * Remove entries on dl that are born > mintxg, and put them on the bpobj. |
428870ff BB |
922 | */ |
923 | void | |
924 | dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, | |
925 | dmu_tx_t *tx) | |
926 | { | |
927 | dsl_deadlist_entry_t dle_tofind; | |
dc5c8006 | 928 | dsl_deadlist_entry_t *dle, *pdle; |
428870ff | 929 | avl_index_t where; |
dc5c8006 | 930 | int i; |
428870ff BB |
931 | |
932 | ASSERT(!dl->dl_oldfmt); | |
df7eeccc MA |
933 | |
934 | mutex_enter(&dl->dl_lock); | |
428870ff BB |
935 | dmu_buf_will_dirty(dl->dl_dbuf, tx); |
936 | dsl_deadlist_load_tree(dl); | |
937 | ||
938 | dle_tofind.dle_mintxg = mintxg; | |
939 | dle = avl_find(&dl->dl_tree, &dle_tofind, &where); | |
940 | if (dle == NULL) | |
941 | dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER); | |
dc5c8006 AM |
942 | /* |
943 | * Prefetch up to 128 deadlists first and then more as we progress. | |
944 | * The limit is a balance between ARC use and diminishing returns. | |
945 | */ | |
b79e7114 | 946 | for (pdle = dle, i = 0; pdle && i < 128; i++) { |
dc5c8006 AM |
947 | bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object); |
948 | pdle = AVL_NEXT(&dl->dl_tree, pdle); | |
949 | } | |
428870ff BB |
950 | while (dle) { |
951 | uint64_t used, comp, uncomp; | |
952 | dsl_deadlist_entry_t *dle_next; | |
953 | ||
954 | bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx); | |
dc5c8006 AM |
955 | if (pdle) { |
956 | bpobj_prefetch_subobj(bpo, pdle->dle_bpobj.bpo_object); | |
957 | pdle = AVL_NEXT(&dl->dl_tree, pdle); | |
958 | } | |
428870ff | 959 | |
30af21b0 | 960 | VERIFY0(bpobj_space(&dle->dle_bpobj, |
428870ff | 961 | &used, &comp, &uncomp)); |
428870ff BB |
962 | ASSERT3U(dl->dl_phys->dl_used, >=, used); |
963 | ASSERT3U(dl->dl_phys->dl_comp, >=, comp); | |
964 | ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp); | |
965 | dl->dl_phys->dl_used -= used; | |
966 | dl->dl_phys->dl_comp -= comp; | |
967 | dl->dl_phys->dl_uncomp -= uncomp; | |
428870ff | 968 | |
30af21b0 | 969 | VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object, |
428870ff BB |
970 | dle->dle_mintxg, tx)); |
971 | ||
972 | dle_next = AVL_NEXT(&dl->dl_tree, dle); | |
973 | avl_remove(&dl->dl_tree, dle); | |
974 | bpobj_close(&dle->dle_bpobj); | |
975 | kmem_free(dle, sizeof (*dle)); | |
976 | dle = dle_next; | |
977 | } | |
df7eeccc | 978 | mutex_exit(&dl->dl_lock); |
428870ff | 979 | } |
37f03da8 SH |
980 | |
981 | typedef struct livelist_entry { | |
86b5f4c1 SD |
982 | blkptr_t le_bp; |
983 | uint32_t le_refcnt; | |
37f03da8 SH |
984 | avl_node_t le_node; |
985 | } livelist_entry_t; | |
986 | ||
987 | static int | |
988 | livelist_compare(const void *larg, const void *rarg) | |
989 | { | |
86b5f4c1 SD |
990 | const blkptr_t *l = &((livelist_entry_t *)larg)->le_bp; |
991 | const blkptr_t *r = &((livelist_entry_t *)rarg)->le_bp; | |
37f03da8 SH |
992 | |
993 | /* Sort them according to dva[0] */ | |
994 | uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]); | |
995 | uint64_t r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]); | |
996 | ||
997 | if (l_dva0_vdev != r_dva0_vdev) | |
ca577779 | 998 | return (TREE_CMP(l_dva0_vdev, r_dva0_vdev)); |
37f03da8 SH |
999 | |
1000 | /* if vdevs are equal, sort by offsets. */ | |
1001 | uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]); | |
1002 | uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]); | |
ca577779 | 1003 | return (TREE_CMP(l_dva0_offset, r_dva0_offset)); |
37f03da8 SH |
1004 | } |
1005 | ||
1006 | struct livelist_iter_arg { | |
1007 | avl_tree_t *avl; | |
1008 | bplist_t *to_free; | |
1009 | zthr_t *t; | |
1010 | }; | |
1011 | ||
1012 | /* | |
1013 | * Expects an AVL tree which is incrementally filled will FREE blkptrs | |
1014 | * and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a | |
1015 | * corresponding FREE are stored in the supplied bplist. | |
86b5f4c1 | 1016 | * |
e78aca3b AM |
1017 | * Note that multiple FREE and ALLOC entries for the same blkptr may be |
1018 | * encountered when dedup or block cloning is involved. For this reason we | |
1019 | * keep a refcount for all the FREE entries of each blkptr and ensure that | |
86b5f4c1 | 1020 | * each of those FREE entries has a corresponding ALLOC preceding it. |
37f03da8 SH |
1021 | */ |
1022 | static int | |
1023 | dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, | |
1024 | dmu_tx_t *tx) | |
1025 | { | |
1026 | struct livelist_iter_arg *lia = arg; | |
1027 | avl_tree_t *avl = lia->avl; | |
1028 | bplist_t *to_free = lia->to_free; | |
1029 | zthr_t *t = lia->t; | |
1030 | ASSERT(tx == NULL); | |
1031 | ||
1032 | if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t))) | |
1033 | return (SET_ERROR(EINTR)); | |
86b5f4c1 SD |
1034 | |
1035 | livelist_entry_t node; | |
1036 | node.le_bp = *bp; | |
1037 | livelist_entry_t *found = avl_find(avl, &node, NULL); | |
e78aca3b AM |
1038 | if (found) { |
1039 | ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(&found->le_bp)); | |
1040 | ASSERT3U(BP_GET_CHECKSUM(bp), ==, | |
1041 | BP_GET_CHECKSUM(&found->le_bp)); | |
493fcce9 | 1042 | ASSERT3U(BP_GET_BIRTH(bp), ==, BP_GET_BIRTH(&found->le_bp)); |
e78aca3b | 1043 | } |
37f03da8 | 1044 | if (bp_freed) { |
86b5f4c1 SD |
1045 | if (found == NULL) { |
1046 | /* first free entry for this blkptr */ | |
1047 | livelist_entry_t *e = | |
1048 | kmem_alloc(sizeof (livelist_entry_t), KM_SLEEP); | |
1049 | e->le_bp = *bp; | |
1050 | e->le_refcnt = 1; | |
1051 | avl_add(avl, e); | |
37f03da8 | 1052 | } else { |
e78aca3b AM |
1053 | /* |
1054 | * Deduped or cloned block free. We could assert D bit | |
1055 | * for dedup, but there is no such one for cloning. | |
1056 | */ | |
86b5f4c1 SD |
1057 | ASSERT3U(found->le_refcnt + 1, >, found->le_refcnt); |
1058 | found->le_refcnt++; | |
1059 | } | |
1060 | } else { | |
1061 | if (found == NULL) { | |
1062 | /* block is currently marked as allocated */ | |
37f03da8 | 1063 | bplist_append(to_free, bp); |
86b5f4c1 SD |
1064 | } else { |
1065 | /* alloc matches a free entry */ | |
1066 | ASSERT3U(found->le_refcnt, !=, 0); | |
1067 | found->le_refcnt--; | |
1068 | if (found->le_refcnt == 0) { | |
1069 | /* all tracked free pairs have been matched */ | |
1070 | avl_remove(avl, found); | |
1071 | kmem_free(found, sizeof (livelist_entry_t)); | |
86b5f4c1 | 1072 | } |
37f03da8 SH |
1073 | } |
1074 | } | |
1075 | return (0); | |
1076 | } | |
1077 | ||
1078 | /* | |
1079 | * Accepts a bpobj and a bplist. Will insert into the bplist the blkptrs | |
1080 | * which have an ALLOC entry but no matching FREE | |
1081 | */ | |
1082 | int | |
1083 | dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t, | |
1084 | uint64_t *size) | |
1085 | { | |
1086 | avl_tree_t avl; | |
1087 | avl_create(&avl, livelist_compare, sizeof (livelist_entry_t), | |
1088 | offsetof(livelist_entry_t, le_node)); | |
1089 | ||
1090 | /* process the sublist */ | |
1091 | struct livelist_iter_arg arg = { | |
1092 | .avl = &avl, | |
1093 | .to_free = to_free, | |
1094 | .t = t | |
1095 | }; | |
1096 | int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size); | |
4acc36ed | 1097 | VERIFY(err != 0 || avl_numnodes(&avl) == 0); |
37f03da8 | 1098 | |
4acc36ed SD |
1099 | void *cookie = NULL; |
1100 | livelist_entry_t *le = NULL; | |
1101 | while ((le = avl_destroy_nodes(&avl, &cookie)) != NULL) { | |
1102 | kmem_free(le, sizeof (livelist_entry_t)); | |
1103 | } | |
37f03da8 SH |
1104 | avl_destroy(&avl); |
1105 | return (err); | |
1106 | } | |
1107 | ||
ab8d9c17 | 1108 | ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, max_entries, U64, ZMOD_RW, |
37f03da8 SH |
1109 | "Size to start the next sub-livelist in a livelist"); |
1110 | ||
03fdcb9a | 1111 | ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, min_percent_shared, INT, ZMOD_RW, |
37f03da8 | 1112 | "Threshold at which livelist is disabled"); |