]>
Commit | Line | Data |
---|---|---|
428870ff BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
1d3ba0bf | 9 | * or https://opensource.org/licenses/CDDL-1.0. |
428870ff BB |
10 | * See the License for the specific language governing permissions |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
22 | * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. | |
37f03da8 | 23 | * Copyright (c) 2012, 2019 by Delphix. All rights reserved. |
0c66c32d | 24 | * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. |
428870ff BB |
25 | */ |
26 | ||
428870ff | 27 | #include <sys/dmu.h> |
428870ff BB |
28 | #include <sys/zap.h> |
29 | #include <sys/zfs_context.h> | |
30 | #include <sys/dsl_pool.h> | |
37f03da8 | 31 | #include <sys/dsl_dataset.h> |
428870ff | 32 | |
330d06f9 MA |
33 | /* |
34 | * Deadlist concurrency: | |
35 | * | |
36 | * Deadlists can only be modified from the syncing thread. | |
37 | * | |
38 | * Except for dsl_deadlist_insert(), it can only be modified with the | |
39 | * dp_config_rwlock held with RW_WRITER. | |
40 | * | |
41 | * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can | |
42 | * be called concurrently, from open context, with the dl_config_rwlock held | |
43 | * with RW_READER. | |
44 | * | |
45 | * Therefore, we only need to provide locking between dsl_deadlist_insert() and | |
46 | * the accessors, protecting: | |
47 | * dl_phys->dl_used,comp,uncomp | |
48 | * and protecting the dl_tree from being loaded. | |
49 | * The locking is provided by dl_lock. Note that locking on the bpobj_t | |
50 | * provides its own locking, and dl_oldfmt is immutable. | |
51 | */ | |
52 | ||
37f03da8 SH |
53 | /* |
54 | * Livelist Overview | |
55 | * ================ | |
56 | * | |
57 | * Livelists use the same 'deadlist_t' struct as deadlists and are also used | |
58 | * to track blkptrs over the lifetime of a dataset. Livelists however, belong | |
59 | * to clones and track the blkptrs that are clone-specific (were born after | |
60 | * the clone's creation). The exception is embedded block pointers which are | |
61 | * not included in livelists because they do not need to be freed. | |
62 | * | |
63 | * When it comes time to delete the clone, the livelist provides a quick | |
64 | * reference as to what needs to be freed. For this reason, livelists also track | |
65 | * when clone-specific blkptrs are freed before deletion to prevent double | |
66 | * frees. Each blkptr in a livelist is marked as a FREE or an ALLOC and the | |
67 | * deletion algorithm iterates backwards over the livelist, matching | |
68 | * FREE/ALLOC pairs and then freeing those ALLOCs which remain. livelists | |
69 | * are also updated in the case when blkptrs are remapped: the old version | |
70 | * of the blkptr is cancelled out with a FREE and the new version is tracked | |
71 | * with an ALLOC. | |
72 | * | |
73 | * To bound the amount of memory required for deletion, livelists over a | |
74 | * certain size are spread over multiple entries. Entries are grouped by | |
75 | * birth txg so we can be sure the ALLOC/FREE pair for a given blkptr will | |
76 | * be in the same entry. This allows us to delete livelists incrementally | |
77 | * over multiple syncs, one entry at a time. | |
78 | * | |
79 | * During the lifetime of the clone, livelists can get extremely large. | |
80 | * Their size is managed by periodic condensing (preemptively cancelling out | |
81 | * FREE/ALLOC pairs). Livelists are disabled when a clone is promoted or when | |
82 | * the shared space between the clone and its origin is so small that it | |
83 | * doesn't make sense to use livelists anymore. | |
84 | */ | |
85 | ||
86 | /* | |
87 | * The threshold sublist size at which we create a new sub-livelist for the | |
88 | * next txg. However, since blkptrs of the same transaction group must be in | |
89 | * the same sub-list, the actual sublist size may exceed this. When picking the | |
90 | * size we had to balance the fact that larger sublists mean fewer sublists | |
91 | * (decreasing the cost of insertion) against the consideration that sublists | |
92 | * will be loaded into memory and shouldn't take up an inordinate amount of | |
93 | * space. We settled on ~500000 entries, corresponding to roughly 128M. | |
94 | */ | |
95 | unsigned long zfs_livelist_max_entries = 500000; | |
96 | ||
97 | /* | |
98 | * We can approximate how much of a performance gain a livelist will give us | |
99 | * based on the percentage of blocks shared between the clone and its origin. | |
100 | * 0 percent shared means that the clone has completely diverged and that the | |
101 | * old method is maximally effective: every read from the block tree will | |
102 | * result in lots of frees. Livelists give us gains when they track blocks | |
103 | * scattered across the tree, when one read in the old method might only | |
104 | * result in a few frees. Once the clone has been overwritten enough, | |
105 | * writes are no longer sparse and we'll no longer get much of a benefit from | |
106 | * tracking them with a livelist. We chose a lower limit of 75 percent shared | |
107 | * (25 percent overwritten). This means that 1/4 of all block pointers will be | |
108 | * freed (e.g. each read frees 256, out of a max of 1024) so we expect livelists | |
109 | * to make deletion 4x faster. Once the amount of shared space drops below this | |
110 | * threshold, the clone will revert to the old deletion method. | |
111 | */ | |
112 | int zfs_livelist_min_percent_shared = 75; | |
113 | ||
428870ff BB |
114 | static int |
115 | dsl_deadlist_compare(const void *arg1, const void *arg2) | |
116 | { | |
325d288c MA |
117 | const dsl_deadlist_entry_t *dle1 = arg1; |
118 | const dsl_deadlist_entry_t *dle2 = arg2; | |
428870ff | 119 | |
ca577779 | 120 | return (TREE_CMP(dle1->dle_mintxg, dle2->dle_mintxg)); |
428870ff BB |
121 | } |
122 | ||
325d288c MA |
123 | static int |
124 | dsl_deadlist_cache_compare(const void *arg1, const void *arg2) | |
125 | { | |
126 | const dsl_deadlist_cache_entry_t *dlce1 = arg1; | |
127 | const dsl_deadlist_cache_entry_t *dlce2 = arg2; | |
128 | ||
ca577779 | 129 | return (TREE_CMP(dlce1->dlce_mintxg, dlce2->dlce_mintxg)); |
325d288c MA |
130 | } |
131 | ||
428870ff BB |
132 | static void |
133 | dsl_deadlist_load_tree(dsl_deadlist_t *dl) | |
134 | { | |
135 | zap_cursor_t zc; | |
136 | zap_attribute_t za; | |
d87676a9 | 137 | int error; |
428870ff | 138 | |
df7eeccc MA |
139 | ASSERT(MUTEX_HELD(&dl->dl_lock)); |
140 | ||
428870ff | 141 | ASSERT(!dl->dl_oldfmt); |
325d288c MA |
142 | if (dl->dl_havecache) { |
143 | /* | |
144 | * After loading the tree, the caller may modify the tree, | |
145 | * e.g. to add or remove nodes, or to make a node no longer | |
146 | * refer to the empty_bpobj. These changes would make the | |
147 | * dl_cache incorrect. Therefore we discard the cache here, | |
148 | * so that it can't become incorrect. | |
149 | */ | |
150 | dsl_deadlist_cache_entry_t *dlce; | |
151 | void *cookie = NULL; | |
152 | while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie)) | |
153 | != NULL) { | |
154 | kmem_free(dlce, sizeof (*dlce)); | |
155 | } | |
156 | avl_destroy(&dl->dl_cache); | |
157 | dl->dl_havecache = B_FALSE; | |
158 | } | |
428870ff BB |
159 | if (dl->dl_havetree) |
160 | return; | |
161 | ||
162 | avl_create(&dl->dl_tree, dsl_deadlist_compare, | |
163 | sizeof (dsl_deadlist_entry_t), | |
164 | offsetof(dsl_deadlist_entry_t, dle_node)); | |
165 | for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); | |
d87676a9 | 166 | (error = zap_cursor_retrieve(&zc, &za)) == 0; |
428870ff | 167 | zap_cursor_advance(&zc)) { |
e19572e4 YP |
168 | dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP); |
169 | dle->dle_mintxg = zfs_strtonum(za.za_name, NULL); | |
325d288c MA |
170 | |
171 | /* | |
172 | * Prefetch all the bpobj's so that we do that i/o | |
173 | * in parallel. Then open them all in a second pass. | |
174 | */ | |
175 | dle->dle_bpobj.bpo_object = za.za_first_integer; | |
176 | dmu_prefetch(dl->dl_os, dle->dle_bpobj.bpo_object, | |
177 | 0, 0, 0, ZIO_PRIORITY_SYNC_READ); | |
178 | ||
428870ff BB |
179 | avl_add(&dl->dl_tree, dle); |
180 | } | |
d87676a9 | 181 | VERIFY3U(error, ==, ENOENT); |
428870ff | 182 | zap_cursor_fini(&zc); |
325d288c MA |
183 | |
184 | for (dsl_deadlist_entry_t *dle = avl_first(&dl->dl_tree); | |
185 | dle != NULL; dle = AVL_NEXT(&dl->dl_tree, dle)) { | |
186 | VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, | |
187 | dle->dle_bpobj.bpo_object)); | |
188 | } | |
428870ff BB |
189 | dl->dl_havetree = B_TRUE; |
190 | } | |
191 | ||
325d288c MA |
192 | /* |
193 | * Load only the non-empty bpobj's into the dl_cache. The cache is an analog | |
194 | * of the dl_tree, but contains only non-empty_bpobj nodes from the ZAP. It | |
195 | * is used only for gathering space statistics. The dl_cache has two | |
196 | * advantages over the dl_tree: | |
197 | * | |
198 | * 1. Loading the dl_cache is ~5x faster than loading the dl_tree (if it's | |
199 | * mostly empty_bpobj's), due to less CPU overhead to open the empty_bpobj | |
200 | * many times and to inquire about its (zero) space stats many times. | |
201 | * | |
202 | * 2. The dl_cache uses less memory than the dl_tree. We only need to load | |
203 | * the dl_tree of snapshots when deleting a snapshot, after which we free the | |
204 | * dl_tree with dsl_deadlist_discard_tree | |
205 | */ | |
206 | static void | |
207 | dsl_deadlist_load_cache(dsl_deadlist_t *dl) | |
208 | { | |
209 | zap_cursor_t zc; | |
210 | zap_attribute_t za; | |
d87676a9 | 211 | int error; |
325d288c MA |
212 | |
213 | ASSERT(MUTEX_HELD(&dl->dl_lock)); | |
214 | ||
215 | ASSERT(!dl->dl_oldfmt); | |
216 | if (dl->dl_havecache) | |
217 | return; | |
218 | ||
219 | uint64_t empty_bpobj = dmu_objset_pool(dl->dl_os)->dp_empty_bpobj; | |
220 | ||
221 | avl_create(&dl->dl_cache, dsl_deadlist_cache_compare, | |
222 | sizeof (dsl_deadlist_cache_entry_t), | |
223 | offsetof(dsl_deadlist_cache_entry_t, dlce_node)); | |
224 | for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); | |
d87676a9 | 225 | (error = zap_cursor_retrieve(&zc, &za)) == 0; |
325d288c MA |
226 | zap_cursor_advance(&zc)) { |
227 | if (za.za_first_integer == empty_bpobj) | |
228 | continue; | |
229 | dsl_deadlist_cache_entry_t *dlce = | |
230 | kmem_zalloc(sizeof (*dlce), KM_SLEEP); | |
231 | dlce->dlce_mintxg = zfs_strtonum(za.za_name, NULL); | |
232 | ||
233 | /* | |
234 | * Prefetch all the bpobj's so that we do that i/o | |
235 | * in parallel. Then open them all in a second pass. | |
236 | */ | |
237 | dlce->dlce_bpobj = za.za_first_integer; | |
238 | dmu_prefetch(dl->dl_os, dlce->dlce_bpobj, | |
239 | 0, 0, 0, ZIO_PRIORITY_SYNC_READ); | |
240 | avl_add(&dl->dl_cache, dlce); | |
241 | } | |
d87676a9 | 242 | VERIFY3U(error, ==, ENOENT); |
325d288c MA |
243 | zap_cursor_fini(&zc); |
244 | ||
245 | for (dsl_deadlist_cache_entry_t *dlce = avl_first(&dl->dl_cache); | |
246 | dlce != NULL; dlce = AVL_NEXT(&dl->dl_cache, dlce)) { | |
247 | bpobj_t bpo; | |
248 | VERIFY0(bpobj_open(&bpo, dl->dl_os, dlce->dlce_bpobj)); | |
249 | ||
250 | VERIFY0(bpobj_space(&bpo, | |
251 | &dlce->dlce_bytes, &dlce->dlce_comp, &dlce->dlce_uncomp)); | |
252 | bpobj_close(&bpo); | |
253 | } | |
254 | dl->dl_havecache = B_TRUE; | |
255 | } | |
256 | ||
257 | /* | |
258 | * Discard the tree to save memory. | |
259 | */ | |
260 | void | |
261 | dsl_deadlist_discard_tree(dsl_deadlist_t *dl) | |
262 | { | |
263 | mutex_enter(&dl->dl_lock); | |
264 | ||
265 | if (!dl->dl_havetree) { | |
266 | mutex_exit(&dl->dl_lock); | |
267 | return; | |
268 | } | |
269 | dsl_deadlist_entry_t *dle; | |
270 | void *cookie = NULL; | |
271 | while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) != NULL) { | |
272 | bpobj_close(&dle->dle_bpobj); | |
273 | kmem_free(dle, sizeof (*dle)); | |
274 | } | |
275 | avl_destroy(&dl->dl_tree); | |
276 | ||
277 | dl->dl_havetree = B_FALSE; | |
278 | mutex_exit(&dl->dl_lock); | |
279 | } | |
280 | ||
37f03da8 SH |
281 | void |
282 | dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *args) | |
283 | { | |
284 | dsl_deadlist_entry_t *dle; | |
285 | ||
286 | ASSERT(dsl_deadlist_is_open(dl)); | |
287 | ||
288 | mutex_enter(&dl->dl_lock); | |
289 | dsl_deadlist_load_tree(dl); | |
290 | mutex_exit(&dl->dl_lock); | |
291 | for (dle = avl_first(&dl->dl_tree); dle != NULL; | |
292 | dle = AVL_NEXT(&dl->dl_tree, dle)) { | |
293 | if (func(args, dle) != 0) | |
294 | break; | |
295 | } | |
296 | } | |
297 | ||
428870ff BB |
298 | void |
299 | dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) | |
300 | { | |
301 | dmu_object_info_t doi; | |
302 | ||
a1d477c2 MA |
303 | ASSERT(!dsl_deadlist_is_open(dl)); |
304 | ||
428870ff BB |
305 | mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL); |
306 | dl->dl_os = os; | |
307 | dl->dl_object = object; | |
30af21b0 | 308 | VERIFY0(dmu_bonus_hold(os, object, dl, &dl->dl_dbuf)); |
428870ff BB |
309 | dmu_object_info_from_db(dl->dl_dbuf, &doi); |
310 | if (doi.doi_type == DMU_OT_BPOBJ) { | |
311 | dmu_buf_rele(dl->dl_dbuf, dl); | |
312 | dl->dl_dbuf = NULL; | |
313 | dl->dl_oldfmt = B_TRUE; | |
30af21b0 | 314 | VERIFY0(bpobj_open(&dl->dl_bpobj, os, object)); |
428870ff BB |
315 | return; |
316 | } | |
317 | ||
318 | dl->dl_oldfmt = B_FALSE; | |
319 | dl->dl_phys = dl->dl_dbuf->db_data; | |
320 | dl->dl_havetree = B_FALSE; | |
325d288c | 321 | dl->dl_havecache = B_FALSE; |
428870ff BB |
322 | } |
323 | ||
a1d477c2 MA |
324 | boolean_t |
325 | dsl_deadlist_is_open(dsl_deadlist_t *dl) | |
326 | { | |
327 | return (dl->dl_os != NULL); | |
328 | } | |
329 | ||
428870ff BB |
330 | void |
331 | dsl_deadlist_close(dsl_deadlist_t *dl) | |
332 | { | |
a1d477c2 | 333 | ASSERT(dsl_deadlist_is_open(dl)); |
c17486b2 | 334 | mutex_destroy(&dl->dl_lock); |
0c66c32d | 335 | |
428870ff BB |
336 | if (dl->dl_oldfmt) { |
337 | dl->dl_oldfmt = B_FALSE; | |
338 | bpobj_close(&dl->dl_bpobj); | |
a1d477c2 MA |
339 | dl->dl_os = NULL; |
340 | dl->dl_object = 0; | |
428870ff BB |
341 | return; |
342 | } | |
343 | ||
344 | if (dl->dl_havetree) { | |
325d288c MA |
345 | dsl_deadlist_entry_t *dle; |
346 | void *cookie = NULL; | |
428870ff BB |
347 | while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) |
348 | != NULL) { | |
349 | bpobj_close(&dle->dle_bpobj); | |
350 | kmem_free(dle, sizeof (*dle)); | |
351 | } | |
352 | avl_destroy(&dl->dl_tree); | |
353 | } | |
325d288c MA |
354 | if (dl->dl_havecache) { |
355 | dsl_deadlist_cache_entry_t *dlce; | |
356 | void *cookie = NULL; | |
357 | while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie)) | |
358 | != NULL) { | |
359 | kmem_free(dlce, sizeof (*dlce)); | |
360 | } | |
361 | avl_destroy(&dl->dl_cache); | |
362 | } | |
428870ff | 363 | dmu_buf_rele(dl->dl_dbuf, dl); |
428870ff BB |
364 | dl->dl_dbuf = NULL; |
365 | dl->dl_phys = NULL; | |
a1d477c2 MA |
366 | dl->dl_os = NULL; |
367 | dl->dl_object = 0; | |
428870ff BB |
368 | } |
369 | ||
370 | uint64_t | |
371 | dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx) | |
372 | { | |
373 | if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) | |
f1512ee6 | 374 | return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx)); |
428870ff BB |
375 | return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR, |
376 | sizeof (dsl_deadlist_phys_t), tx)); | |
377 | } | |
378 | ||
379 | void | |
380 | dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) | |
381 | { | |
382 | dmu_object_info_t doi; | |
383 | zap_cursor_t zc; | |
384 | zap_attribute_t za; | |
d87676a9 | 385 | int error; |
428870ff | 386 | |
30af21b0 | 387 | VERIFY0(dmu_object_info(os, dlobj, &doi)); |
428870ff BB |
388 | if (doi.doi_type == DMU_OT_BPOBJ) { |
389 | bpobj_free(os, dlobj, tx); | |
390 | return; | |
391 | } | |
392 | ||
393 | for (zap_cursor_init(&zc, os, dlobj); | |
d87676a9 | 394 | (error = zap_cursor_retrieve(&zc, &za)) == 0; |
753c3839 MA |
395 | zap_cursor_advance(&zc)) { |
396 | uint64_t obj = za.za_first_integer; | |
397 | if (obj == dmu_objset_pool(os)->dp_empty_bpobj) | |
398 | bpobj_decr_empty(os, tx); | |
399 | else | |
400 | bpobj_free(os, obj, tx); | |
401 | } | |
d87676a9 | 402 | VERIFY3U(error, ==, ENOENT); |
428870ff | 403 | zap_cursor_fini(&zc); |
30af21b0 | 404 | VERIFY0(dmu_object_free(os, dlobj, tx)); |
428870ff BB |
405 | } |
406 | ||
753c3839 MA |
407 | static void |
408 | dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, | |
37f03da8 | 409 | const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) |
753c3839 | 410 | { |
df7eeccc | 411 | ASSERT(MUTEX_HELD(&dl->dl_lock)); |
753c3839 MA |
412 | if (dle->dle_bpobj.bpo_object == |
413 | dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { | |
f1512ee6 | 414 | uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); |
753c3839 MA |
415 | bpobj_close(&dle->dle_bpobj); |
416 | bpobj_decr_empty(dl->dl_os, tx); | |
30af21b0 PD |
417 | VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); |
418 | VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object, | |
753c3839 MA |
419 | dle->dle_mintxg, obj, tx)); |
420 | } | |
37f03da8 | 421 | bpobj_enqueue(&dle->dle_bpobj, bp, bp_freed, tx); |
753c3839 MA |
422 | } |
423 | ||
424 | static void | |
425 | dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, | |
426 | uint64_t obj, dmu_tx_t *tx) | |
427 | { | |
df7eeccc | 428 | ASSERT(MUTEX_HELD(&dl->dl_lock)); |
753c3839 MA |
429 | if (dle->dle_bpobj.bpo_object != |
430 | dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { | |
431 | bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx); | |
432 | } else { | |
433 | bpobj_close(&dle->dle_bpobj); | |
434 | bpobj_decr_empty(dl->dl_os, tx); | |
30af21b0 PD |
435 | VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); |
436 | VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object, | |
753c3839 MA |
437 | dle->dle_mintxg, obj, tx)); |
438 | } | |
439 | } | |
440 | ||
428870ff | 441 | void |
37f03da8 SH |
442 | dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed, |
443 | dmu_tx_t *tx) | |
428870ff BB |
444 | { |
445 | dsl_deadlist_entry_t dle_tofind; | |
446 | dsl_deadlist_entry_t *dle; | |
447 | avl_index_t where; | |
448 | ||
449 | if (dl->dl_oldfmt) { | |
37f03da8 | 450 | bpobj_enqueue(&dl->dl_bpobj, bp, bp_freed, tx); |
428870ff BB |
451 | return; |
452 | } | |
453 | ||
df7eeccc | 454 | mutex_enter(&dl->dl_lock); |
428870ff BB |
455 | dsl_deadlist_load_tree(dl); |
456 | ||
457 | dmu_buf_will_dirty(dl->dl_dbuf, tx); | |
37f03da8 SH |
458 | |
459 | int sign = bp_freed ? -1 : +1; | |
428870ff | 460 | dl->dl_phys->dl_used += |
37f03da8 SH |
461 | sign * bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); |
462 | dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp); | |
463 | dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp); | |
428870ff BB |
464 | |
465 | dle_tofind.dle_mintxg = bp->blk_birth; | |
466 | dle = avl_find(&dl->dl_tree, &dle_tofind, &where); | |
467 | if (dle == NULL) | |
468 | dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); | |
469 | else | |
470 | dle = AVL_PREV(&dl->dl_tree, dle); | |
eba9e745 BB |
471 | |
472 | if (dle == NULL) { | |
473 | zfs_panic_recover("blkptr at %p has invalid BLK_BIRTH %llu", | |
474 | bp, (longlong_t)bp->blk_birth); | |
475 | dle = avl_first(&dl->dl_tree); | |
476 | } | |
477 | ||
478 | ASSERT3P(dle, !=, NULL); | |
37f03da8 | 479 | dle_enqueue(dl, dle, bp, bp_freed, tx); |
df7eeccc | 480 | mutex_exit(&dl->dl_lock); |
428870ff BB |
481 | } |
482 | ||
37f03da8 SH |
483 | int |
484 | dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) | |
485 | { | |
486 | dsl_deadlist_t *dl = arg; | |
487 | dsl_deadlist_insert(dl, bp, B_FALSE, tx); | |
488 | return (0); | |
489 | } | |
490 | ||
491 | int | |
492 | dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) | |
493 | { | |
494 | dsl_deadlist_t *dl = arg; | |
495 | dsl_deadlist_insert(dl, bp, B_TRUE, tx); | |
496 | return (0); | |
497 | } | |
498 | ||
428870ff BB |
499 | /* |
500 | * Insert new key in deadlist, which must be > all current entries. | |
501 | * mintxg is not inclusive. | |
502 | */ | |
503 | void | |
504 | dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) | |
505 | { | |
506 | uint64_t obj; | |
507 | dsl_deadlist_entry_t *dle; | |
508 | ||
509 | if (dl->dl_oldfmt) | |
510 | return; | |
511 | ||
79c76d5b | 512 | dle = kmem_alloc(sizeof (*dle), KM_SLEEP); |
428870ff | 513 | dle->dle_mintxg = mintxg; |
df7eeccc MA |
514 | |
515 | mutex_enter(&dl->dl_lock); | |
516 | dsl_deadlist_load_tree(dl); | |
517 | ||
f1512ee6 | 518 | obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); |
30af21b0 | 519 | VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); |
428870ff BB |
520 | avl_add(&dl->dl_tree, dle); |
521 | ||
30af21b0 | 522 | VERIFY0(zap_add_int_key(dl->dl_os, dl->dl_object, |
428870ff | 523 | mintxg, obj, tx)); |
df7eeccc | 524 | mutex_exit(&dl->dl_lock); |
428870ff BB |
525 | } |
526 | ||
527 | /* | |
528 | * Remove this key, merging its entries into the previous key. | |
529 | */ | |
530 | void | |
531 | dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) | |
532 | { | |
533 | dsl_deadlist_entry_t dle_tofind; | |
534 | dsl_deadlist_entry_t *dle, *dle_prev; | |
535 | ||
536 | if (dl->dl_oldfmt) | |
537 | return; | |
df7eeccc | 538 | mutex_enter(&dl->dl_lock); |
428870ff BB |
539 | dsl_deadlist_load_tree(dl); |
540 | ||
541 | dle_tofind.dle_mintxg = mintxg; | |
542 | dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); | |
30af21b0 | 543 | ASSERT3P(dle, !=, NULL); |
428870ff BB |
544 | dle_prev = AVL_PREV(&dl->dl_tree, dle); |
545 | ||
753c3839 | 546 | dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx); |
428870ff BB |
547 | |
548 | avl_remove(&dl->dl_tree, dle); | |
549 | bpobj_close(&dle->dle_bpobj); | |
550 | kmem_free(dle, sizeof (*dle)); | |
551 | ||
30af21b0 | 552 | VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx)); |
df7eeccc | 553 | mutex_exit(&dl->dl_lock); |
428870ff BB |
554 | } |
555 | ||
37f03da8 SH |
556 | /* |
557 | * Remove a deadlist entry and all of its contents by removing the entry from | |
558 | * the deadlist's avl tree, freeing the entry's bpobj and adjusting the | |
559 | * deadlist's space accounting accordingly. | |
560 | */ | |
561 | void | |
562 | dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) | |
563 | { | |
564 | uint64_t used, comp, uncomp; | |
565 | dsl_deadlist_entry_t dle_tofind; | |
566 | dsl_deadlist_entry_t *dle; | |
567 | objset_t *os = dl->dl_os; | |
568 | ||
569 | if (dl->dl_oldfmt) | |
570 | return; | |
571 | ||
572 | mutex_enter(&dl->dl_lock); | |
573 | dsl_deadlist_load_tree(dl); | |
574 | ||
575 | dle_tofind.dle_mintxg = mintxg; | |
576 | dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); | |
577 | VERIFY3P(dle, !=, NULL); | |
578 | ||
579 | avl_remove(&dl->dl_tree, dle); | |
580 | VERIFY0(zap_remove_int(os, dl->dl_object, mintxg, tx)); | |
581 | VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); | |
325d288c | 582 | dmu_buf_will_dirty(dl->dl_dbuf, tx); |
37f03da8 SH |
583 | dl->dl_phys->dl_used -= used; |
584 | dl->dl_phys->dl_comp -= comp; | |
585 | dl->dl_phys->dl_uncomp -= uncomp; | |
586 | if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) { | |
587 | bpobj_decr_empty(os, tx); | |
588 | } else { | |
589 | bpobj_free(os, dle->dle_bpobj.bpo_object, tx); | |
590 | } | |
591 | bpobj_close(&dle->dle_bpobj); | |
592 | kmem_free(dle, sizeof (*dle)); | |
593 | mutex_exit(&dl->dl_lock); | |
594 | } | |
595 | ||
596 | /* | |
597 | * Clear out the contents of a deadlist_entry by freeing its bpobj, | |
598 | * replacing it with an empty bpobj and adjusting the deadlist's | |
599 | * space accounting | |
600 | */ | |
601 | void | |
602 | dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl, | |
603 | dmu_tx_t *tx) | |
604 | { | |
605 | uint64_t new_obj, used, comp, uncomp; | |
606 | objset_t *os = dl->dl_os; | |
607 | ||
608 | mutex_enter(&dl->dl_lock); | |
609 | VERIFY0(zap_remove_int(os, dl->dl_object, dle->dle_mintxg, tx)); | |
610 | VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); | |
325d288c | 611 | dmu_buf_will_dirty(dl->dl_dbuf, tx); |
37f03da8 SH |
612 | dl->dl_phys->dl_used -= used; |
613 | dl->dl_phys->dl_comp -= comp; | |
614 | dl->dl_phys->dl_uncomp -= uncomp; | |
615 | if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) | |
616 | bpobj_decr_empty(os, tx); | |
617 | else | |
618 | bpobj_free(os, dle->dle_bpobj.bpo_object, tx); | |
619 | bpobj_close(&dle->dle_bpobj); | |
620 | new_obj = bpobj_alloc_empty(os, SPA_OLD_MAXBLOCKSIZE, tx); | |
621 | VERIFY0(bpobj_open(&dle->dle_bpobj, os, new_obj)); | |
622 | VERIFY0(zap_add_int_key(os, dl->dl_object, dle->dle_mintxg, | |
623 | new_obj, tx)); | |
624 | ASSERT(bpobj_is_empty(&dle->dle_bpobj)); | |
625 | mutex_exit(&dl->dl_lock); | |
626 | } | |
627 | ||
628 | /* | |
629 | * Return the first entry in deadlist's avl tree | |
630 | */ | |
631 | dsl_deadlist_entry_t * | |
632 | dsl_deadlist_first(dsl_deadlist_t *dl) | |
633 | { | |
634 | dsl_deadlist_entry_t *dle; | |
635 | ||
636 | mutex_enter(&dl->dl_lock); | |
637 | dsl_deadlist_load_tree(dl); | |
638 | dle = avl_first(&dl->dl_tree); | |
639 | mutex_exit(&dl->dl_lock); | |
640 | ||
641 | return (dle); | |
642 | } | |
643 | ||
644 | /* | |
645 | * Return the last entry in deadlist's avl tree | |
646 | */ | |
647 | dsl_deadlist_entry_t * | |
648 | dsl_deadlist_last(dsl_deadlist_t *dl) | |
649 | { | |
650 | dsl_deadlist_entry_t *dle; | |
651 | ||
652 | mutex_enter(&dl->dl_lock); | |
653 | dsl_deadlist_load_tree(dl); | |
654 | dle = avl_last(&dl->dl_tree); | |
655 | mutex_exit(&dl->dl_lock); | |
656 | ||
657 | return (dle); | |
658 | } | |
659 | ||
428870ff BB |
660 | /* |
661 | * Walk ds's snapshots to regenerate generate ZAP & AVL. | |
662 | */ | |
663 | static void | |
664 | dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj, | |
665 | uint64_t mrs_obj, dmu_tx_t *tx) | |
666 | { | |
a1d477c2 | 667 | dsl_deadlist_t dl = { 0 }; |
428870ff BB |
668 | dsl_pool_t *dp = dmu_objset_pool(os); |
669 | ||
670 | dsl_deadlist_open(&dl, os, dlobj); | |
671 | if (dl.dl_oldfmt) { | |
672 | dsl_deadlist_close(&dl); | |
673 | return; | |
674 | } | |
675 | ||
676 | while (mrs_obj != 0) { | |
677 | dsl_dataset_t *ds; | |
30af21b0 | 678 | VERIFY0(dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds)); |
d683ddbb JG |
679 | dsl_deadlist_add_key(&dl, |
680 | dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); | |
681 | mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; | |
428870ff BB |
682 | dsl_dataset_rele(ds, FTAG); |
683 | } | |
684 | dsl_deadlist_close(&dl); | |
685 | } | |
686 | ||
687 | uint64_t | |
688 | dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, | |
689 | uint64_t mrs_obj, dmu_tx_t *tx) | |
690 | { | |
691 | dsl_deadlist_entry_t *dle; | |
692 | uint64_t newobj; | |
693 | ||
694 | newobj = dsl_deadlist_alloc(dl->dl_os, tx); | |
695 | ||
696 | if (dl->dl_oldfmt) { | |
697 | dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx); | |
698 | return (newobj); | |
699 | } | |
700 | ||
df7eeccc | 701 | mutex_enter(&dl->dl_lock); |
428870ff BB |
702 | dsl_deadlist_load_tree(dl); |
703 | ||
704 | for (dle = avl_first(&dl->dl_tree); dle; | |
705 | dle = AVL_NEXT(&dl->dl_tree, dle)) { | |
706 | uint64_t obj; | |
707 | ||
708 | if (dle->dle_mintxg >= maxtxg) | |
709 | break; | |
710 | ||
f1512ee6 | 711 | obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); |
30af21b0 | 712 | VERIFY0(zap_add_int_key(dl->dl_os, newobj, |
428870ff BB |
713 | dle->dle_mintxg, obj, tx)); |
714 | } | |
df7eeccc | 715 | mutex_exit(&dl->dl_lock); |
428870ff BB |
716 | return (newobj); |
717 | } | |
718 | ||
719 | void | |
720 | dsl_deadlist_space(dsl_deadlist_t *dl, | |
721 | uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) | |
722 | { | |
a1d477c2 | 723 | ASSERT(dsl_deadlist_is_open(dl)); |
428870ff | 724 | if (dl->dl_oldfmt) { |
30af21b0 | 725 | VERIFY0(bpobj_space(&dl->dl_bpobj, |
428870ff BB |
726 | usedp, compp, uncompp)); |
727 | return; | |
728 | } | |
729 | ||
730 | mutex_enter(&dl->dl_lock); | |
731 | *usedp = dl->dl_phys->dl_used; | |
732 | *compp = dl->dl_phys->dl_comp; | |
733 | *uncompp = dl->dl_phys->dl_uncomp; | |
734 | mutex_exit(&dl->dl_lock); | |
735 | } | |
736 | ||
737 | /* | |
738 | * return space used in the range (mintxg, maxtxg]. | |
739 | * Includes maxtxg, does not include mintxg. | |
740 | * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is | |
30af21b0 | 741 | * UINT64_MAX). |
428870ff BB |
742 | */ |
743 | void | |
744 | dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, | |
745 | uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) | |
746 | { | |
325d288c MA |
747 | dsl_deadlist_cache_entry_t *dlce; |
748 | dsl_deadlist_cache_entry_t dlce_tofind; | |
428870ff BB |
749 | avl_index_t where; |
750 | ||
751 | if (dl->dl_oldfmt) { | |
30af21b0 | 752 | VERIFY0(bpobj_space_range(&dl->dl_bpobj, |
428870ff BB |
753 | mintxg, maxtxg, usedp, compp, uncompp)); |
754 | return; | |
755 | } | |
756 | ||
428870ff BB |
757 | *usedp = *compp = *uncompp = 0; |
758 | ||
330d06f9 | 759 | mutex_enter(&dl->dl_lock); |
325d288c MA |
760 | dsl_deadlist_load_cache(dl); |
761 | dlce_tofind.dlce_mintxg = mintxg; | |
762 | dlce = avl_find(&dl->dl_cache, &dlce_tofind, &where); | |
763 | ||
428870ff | 764 | /* |
325d288c MA |
765 | * If this mintxg doesn't exist, it may be an empty_bpobj which |
766 | * is omitted from the sparse tree. Start at the next non-empty | |
767 | * entry. | |
428870ff | 768 | */ |
325d288c MA |
769 | if (dlce == NULL) |
770 | dlce = avl_nearest(&dl->dl_cache, where, AVL_AFTER); | |
771 | ||
772 | for (; dlce && dlce->dlce_mintxg < maxtxg; | |
773 | dlce = AVL_NEXT(&dl->dl_tree, dlce)) { | |
774 | *usedp += dlce->dlce_bytes; | |
775 | *compp += dlce->dlce_comp; | |
776 | *uncompp += dlce->dlce_uncomp; | |
428870ff | 777 | } |
30af21b0 | 778 | |
330d06f9 | 779 | mutex_exit(&dl->dl_lock); |
428870ff BB |
780 | } |
781 | ||
782 | static void | |
783 | dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, | |
784 | dmu_tx_t *tx) | |
785 | { | |
786 | dsl_deadlist_entry_t dle_tofind; | |
787 | dsl_deadlist_entry_t *dle; | |
788 | avl_index_t where; | |
789 | uint64_t used, comp, uncomp; | |
790 | bpobj_t bpo; | |
791 | ||
df7eeccc MA |
792 | ASSERT(MUTEX_HELD(&dl->dl_lock)); |
793 | ||
30af21b0 PD |
794 | VERIFY0(bpobj_open(&bpo, dl->dl_os, obj)); |
795 | VERIFY0(bpobj_space(&bpo, &used, &comp, &uncomp)); | |
428870ff BB |
796 | bpobj_close(&bpo); |
797 | ||
798 | dsl_deadlist_load_tree(dl); | |
799 | ||
800 | dmu_buf_will_dirty(dl->dl_dbuf, tx); | |
428870ff BB |
801 | dl->dl_phys->dl_used += used; |
802 | dl->dl_phys->dl_comp += comp; | |
803 | dl->dl_phys->dl_uncomp += uncomp; | |
428870ff BB |
804 | |
805 | dle_tofind.dle_mintxg = birth; | |
806 | dle = avl_find(&dl->dl_tree, &dle_tofind, &where); | |
807 | if (dle == NULL) | |
808 | dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); | |
753c3839 | 809 | dle_enqueue_subobj(dl, dle, obj, tx); |
428870ff BB |
810 | } |
811 | ||
812 | static int | |
37f03da8 SH |
813 | dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, |
814 | dmu_tx_t *tx) | |
428870ff BB |
815 | { |
816 | dsl_deadlist_t *dl = arg; | |
37f03da8 | 817 | dsl_deadlist_insert(dl, bp, bp_freed, tx); |
428870ff BB |
818 | return (0); |
819 | } | |
820 | ||
821 | /* | |
822 | * Merge the deadlist pointed to by 'obj' into dl. obj will be left as | |
823 | * an empty deadlist. | |
824 | */ | |
825 | void | |
826 | dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) | |
827 | { | |
828 | zap_cursor_t zc; | |
829 | zap_attribute_t za; | |
830 | dmu_buf_t *bonus; | |
831 | dsl_deadlist_phys_t *dlp; | |
832 | dmu_object_info_t doi; | |
d87676a9 | 833 | int error; |
428870ff | 834 | |
30af21b0 | 835 | VERIFY0(dmu_object_info(dl->dl_os, obj, &doi)); |
428870ff BB |
836 | if (doi.doi_type == DMU_OT_BPOBJ) { |
837 | bpobj_t bpo; | |
30af21b0 PD |
838 | VERIFY0(bpobj_open(&bpo, dl->dl_os, obj)); |
839 | VERIFY0(bpobj_iterate(&bpo, dsl_deadlist_insert_cb, dl, tx)); | |
428870ff BB |
840 | bpobj_close(&bpo); |
841 | return; | |
842 | } | |
843 | ||
df7eeccc | 844 | mutex_enter(&dl->dl_lock); |
428870ff | 845 | for (zap_cursor_init(&zc, dl->dl_os, obj); |
d87676a9 | 846 | (error = zap_cursor_retrieve(&zc, &za)) == 0; |
428870ff | 847 | zap_cursor_advance(&zc)) { |
e19572e4 | 848 | uint64_t mintxg = zfs_strtonum(za.za_name, NULL); |
428870ff | 849 | dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx); |
30af21b0 | 850 | VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx)); |
428870ff | 851 | } |
d87676a9 | 852 | VERIFY3U(error, ==, ENOENT); |
428870ff BB |
853 | zap_cursor_fini(&zc); |
854 | ||
30af21b0 | 855 | VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); |
428870ff BB |
856 | dlp = bonus->db_data; |
857 | dmu_buf_will_dirty(bonus, tx); | |
861166b0 | 858 | memset(dlp, 0, sizeof (*dlp)); |
428870ff | 859 | dmu_buf_rele(bonus, FTAG); |
df7eeccc | 860 | mutex_exit(&dl->dl_lock); |
428870ff BB |
861 | } |
862 | ||
863 | /* | |
30af21b0 | 864 | * Remove entries on dl that are born > mintxg, and put them on the bpobj. |
428870ff BB |
865 | */ |
866 | void | |
867 | dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, | |
868 | dmu_tx_t *tx) | |
869 | { | |
870 | dsl_deadlist_entry_t dle_tofind; | |
871 | dsl_deadlist_entry_t *dle; | |
872 | avl_index_t where; | |
873 | ||
874 | ASSERT(!dl->dl_oldfmt); | |
df7eeccc MA |
875 | |
876 | mutex_enter(&dl->dl_lock); | |
428870ff BB |
877 | dmu_buf_will_dirty(dl->dl_dbuf, tx); |
878 | dsl_deadlist_load_tree(dl); | |
879 | ||
880 | dle_tofind.dle_mintxg = mintxg; | |
881 | dle = avl_find(&dl->dl_tree, &dle_tofind, &where); | |
882 | if (dle == NULL) | |
883 | dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER); | |
884 | while (dle) { | |
885 | uint64_t used, comp, uncomp; | |
886 | dsl_deadlist_entry_t *dle_next; | |
887 | ||
888 | bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx); | |
889 | ||
30af21b0 | 890 | VERIFY0(bpobj_space(&dle->dle_bpobj, |
428870ff | 891 | &used, &comp, &uncomp)); |
428870ff BB |
892 | ASSERT3U(dl->dl_phys->dl_used, >=, used); |
893 | ASSERT3U(dl->dl_phys->dl_comp, >=, comp); | |
894 | ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp); | |
895 | dl->dl_phys->dl_used -= used; | |
896 | dl->dl_phys->dl_comp -= comp; | |
897 | dl->dl_phys->dl_uncomp -= uncomp; | |
428870ff | 898 | |
30af21b0 | 899 | VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object, |
428870ff BB |
900 | dle->dle_mintxg, tx)); |
901 | ||
902 | dle_next = AVL_NEXT(&dl->dl_tree, dle); | |
903 | avl_remove(&dl->dl_tree, dle); | |
904 | bpobj_close(&dle->dle_bpobj); | |
905 | kmem_free(dle, sizeof (*dle)); | |
906 | dle = dle_next; | |
907 | } | |
df7eeccc | 908 | mutex_exit(&dl->dl_lock); |
428870ff | 909 | } |
37f03da8 SH |
910 | |
911 | typedef struct livelist_entry { | |
86b5f4c1 SD |
912 | blkptr_t le_bp; |
913 | uint32_t le_refcnt; | |
37f03da8 SH |
914 | avl_node_t le_node; |
915 | } livelist_entry_t; | |
916 | ||
917 | static int | |
918 | livelist_compare(const void *larg, const void *rarg) | |
919 | { | |
86b5f4c1 SD |
920 | const blkptr_t *l = &((livelist_entry_t *)larg)->le_bp; |
921 | const blkptr_t *r = &((livelist_entry_t *)rarg)->le_bp; | |
37f03da8 SH |
922 | |
923 | /* Sort them according to dva[0] */ | |
924 | uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]); | |
925 | uint64_t r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]); | |
926 | ||
927 | if (l_dva0_vdev != r_dva0_vdev) | |
ca577779 | 928 | return (TREE_CMP(l_dva0_vdev, r_dva0_vdev)); |
37f03da8 SH |
929 | |
930 | /* if vdevs are equal, sort by offsets. */ | |
931 | uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]); | |
932 | uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]); | |
933 | if (l_dva0_offset == r_dva0_offset) | |
934 | ASSERT3U(l->blk_birth, ==, r->blk_birth); | |
ca577779 | 935 | return (TREE_CMP(l_dva0_offset, r_dva0_offset)); |
37f03da8 SH |
936 | } |
937 | ||
938 | struct livelist_iter_arg { | |
939 | avl_tree_t *avl; | |
940 | bplist_t *to_free; | |
941 | zthr_t *t; | |
942 | }; | |
943 | ||
944 | /* | |
945 | * Expects an AVL tree which is incrementally filled will FREE blkptrs | |
946 | * and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a | |
947 | * corresponding FREE are stored in the supplied bplist. | |
86b5f4c1 SD |
948 | * |
949 | * Note that multiple FREE and ALLOC entries for the same blkptr may | |
950 | * be encountered when dedup is involved. For this reason we keep a | |
951 | * refcount for all the FREE entries of each blkptr and ensure that | |
952 | * each of those FREE entries has a corresponding ALLOC preceding it. | |
37f03da8 SH |
953 | */ |
954 | static int | |
955 | dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, | |
956 | dmu_tx_t *tx) | |
957 | { | |
958 | struct livelist_iter_arg *lia = arg; | |
959 | avl_tree_t *avl = lia->avl; | |
960 | bplist_t *to_free = lia->to_free; | |
961 | zthr_t *t = lia->t; | |
962 | ASSERT(tx == NULL); | |
963 | ||
964 | if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t))) | |
965 | return (SET_ERROR(EINTR)); | |
86b5f4c1 SD |
966 | |
967 | livelist_entry_t node; | |
968 | node.le_bp = *bp; | |
969 | livelist_entry_t *found = avl_find(avl, &node, NULL); | |
37f03da8 | 970 | if (bp_freed) { |
86b5f4c1 SD |
971 | if (found == NULL) { |
972 | /* first free entry for this blkptr */ | |
973 | livelist_entry_t *e = | |
974 | kmem_alloc(sizeof (livelist_entry_t), KM_SLEEP); | |
975 | e->le_bp = *bp; | |
976 | e->le_refcnt = 1; | |
977 | avl_add(avl, e); | |
37f03da8 | 978 | } else { |
86b5f4c1 SD |
979 | /* dedup block free */ |
980 | ASSERT(BP_GET_DEDUP(bp)); | |
981 | ASSERT3U(BP_GET_CHECKSUM(bp), ==, | |
982 | BP_GET_CHECKSUM(&found->le_bp)); | |
983 | ASSERT3U(found->le_refcnt + 1, >, found->le_refcnt); | |
984 | found->le_refcnt++; | |
985 | } | |
986 | } else { | |
987 | if (found == NULL) { | |
988 | /* block is currently marked as allocated */ | |
37f03da8 | 989 | bplist_append(to_free, bp); |
86b5f4c1 SD |
990 | } else { |
991 | /* alloc matches a free entry */ | |
992 | ASSERT3U(found->le_refcnt, !=, 0); | |
993 | found->le_refcnt--; | |
994 | if (found->le_refcnt == 0) { | |
995 | /* all tracked free pairs have been matched */ | |
996 | avl_remove(avl, found); | |
997 | kmem_free(found, sizeof (livelist_entry_t)); | |
998 | } else { | |
999 | /* | |
1000 | * This is definitely a deduped blkptr so | |
1001 | * let's validate it. | |
1002 | */ | |
1003 | ASSERT(BP_GET_DEDUP(bp)); | |
1004 | ASSERT3U(BP_GET_CHECKSUM(bp), ==, | |
1005 | BP_GET_CHECKSUM(&found->le_bp)); | |
1006 | } | |
37f03da8 SH |
1007 | } |
1008 | } | |
1009 | return (0); | |
1010 | } | |
1011 | ||
1012 | /* | |
1013 | * Accepts a bpobj and a bplist. Will insert into the bplist the blkptrs | |
1014 | * which have an ALLOC entry but no matching FREE | |
1015 | */ | |
1016 | int | |
1017 | dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t, | |
1018 | uint64_t *size) | |
1019 | { | |
1020 | avl_tree_t avl; | |
1021 | avl_create(&avl, livelist_compare, sizeof (livelist_entry_t), | |
1022 | offsetof(livelist_entry_t, le_node)); | |
1023 | ||
1024 | /* process the sublist */ | |
1025 | struct livelist_iter_arg arg = { | |
1026 | .avl = &avl, | |
1027 | .to_free = to_free, | |
1028 | .t = t | |
1029 | }; | |
1030 | int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size); | |
4acc36ed | 1031 | VERIFY(err != 0 || avl_numnodes(&avl) == 0); |
37f03da8 | 1032 | |
4acc36ed SD |
1033 | void *cookie = NULL; |
1034 | livelist_entry_t *le = NULL; | |
1035 | while ((le = avl_destroy_nodes(&avl, &cookie)) != NULL) { | |
1036 | kmem_free(le, sizeof (livelist_entry_t)); | |
1037 | } | |
37f03da8 SH |
1038 | avl_destroy(&avl); |
1039 | return (err); | |
1040 | } | |
1041 | ||
03fdcb9a | 1042 | ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, max_entries, ULONG, ZMOD_RW, |
37f03da8 SH |
1043 | "Size to start the next sub-livelist in a livelist"); |
1044 | ||
03fdcb9a | 1045 | ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, min_percent_shared, INT, ZMOD_RW, |
37f03da8 | 1046 | "Threshold at which livelist is disabled"); |