/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2016 by Delphix. All rights reserved.
 * Copyright (c) 2017 Datto Inc.
 */

#include <sys/bpobj.h>
#include <sys/zfs_context.h>
#include <sys/refcount.h>
#include <sys/dsl_pool.h>
#include <sys/zfeature.h>
#include <sys/zap.h>

/*
 * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj).
 */
uint64_t
bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx)
{
        spa_t *spa = dmu_objset_spa(os);
        dsl_pool_t *dp = dmu_objset_pool(os);

        if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
                if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) {
                        ASSERT0(dp->dp_empty_bpobj);
                        dp->dp_empty_bpobj =
                            bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx);
                        VERIFY(zap_add(os,
                            DMU_POOL_DIRECTORY_OBJECT,
                            DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
                            &dp->dp_empty_bpobj, tx) == 0);
                }
                spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx);
                ASSERT(dp->dp_empty_bpobj != 0);
                return (dp->dp_empty_bpobj);
        } else {
                return (bpobj_alloc(os, blocksize, tx));
        }
}

void
bpobj_decr_empty(objset_t *os, dmu_tx_t *tx)
{
        dsl_pool_t *dp = dmu_objset_pool(os);

        spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx);
        if (!spa_feature_is_active(dmu_objset_spa(os),
            SPA_FEATURE_EMPTY_BPOBJ)) {
                VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset,
                    DMU_POOL_DIRECTORY_OBJECT,
                    DMU_POOL_EMPTY_BPOBJ, tx));
                VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx));
                dp->dp_empty_bpobj = 0;
        }
}

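/*
 * Allocate a new bpobj object.  The bonus buffer size (and thus which
 * accounting fields exist on disk) depends on the pool version: V0
 * tracks only the bp count and used bytes, V1 adds compressed and
 * uncompressed byte accounting, and pools with deadlist support also
 * track a list of subordinate bpobjs.
 */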
uint64_t
bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
{
        int size;

        if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT)
                size = BPOBJ_SIZE_V0;
        else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
                size = BPOBJ_SIZE_V1;
        else
                size = sizeof (bpobj_phys_t);

        return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize,
            DMU_OT_BPOBJ_HDR, size, tx));
}

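/*
 * Free the bpobj and everything it references: any nested subobjs are
 * freed recursively (in reverse order), then the subobj array itself,
 * and finally the bpobj object.  Must not be called on dp_empty_bpobj.
 */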
void
bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx)
{
        int64_t i;
        bpobj_t bpo;
        dmu_object_info_t doi;
        int epb;
        dmu_buf_t *dbuf = NULL;

        ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj);
        VERIFY3U(0, ==, bpobj_open(&bpo, os, obj));

        mutex_enter(&bpo.bpo_lock);

        if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0)
                goto out;

        VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi));
        epb = doi.doi_data_block_size / sizeof (uint64_t);

        for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
                uint64_t *objarray;
                uint64_t offset, blkoff;

                offset = i * sizeof (uint64_t);
                blkoff = P2PHASE(i, epb);

                if (dbuf == NULL || dbuf->db_offset > offset) {
                        if (dbuf)
                                dmu_buf_rele(dbuf, FTAG);
                        VERIFY3U(0, ==, dmu_buf_hold(os,
                            bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0));
                }

                ASSERT3U(offset, >=, dbuf->db_offset);
                ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

                objarray = dbuf->db_data;
                bpobj_free(os, objarray[blkoff], tx);
        }
        if (dbuf) {
                dmu_buf_rele(dbuf, FTAG);
                dbuf = NULL;
        }
        VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx));

out:
        mutex_exit(&bpo.bpo_lock);
        bpobj_close(&bpo);

        VERIFY3U(0, ==, dmu_object_free(os, obj, tx));
}

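/*
 * Open the bpobj with the given object number: take a hold on its bonus
 * buffer and fill in the in-core bpobj_t.  Returns an error if the
 * object's info cannot be read; on success the caller must eventually
 * call bpobj_close().
 */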
int
bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object)
{
        dmu_object_info_t doi;
        int err;

        err = dmu_object_info(os, object, &doi);
        if (err)
                return (err);

        bzero(bpo, sizeof (*bpo));
        mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL);

        ASSERT(bpo->bpo_dbuf == NULL);
        ASSERT(bpo->bpo_phys == NULL);
        ASSERT(object != 0);
        ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ);
        ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR);

        err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf);
        if (err)
                return (err);

        bpo->bpo_os = os;
        bpo->bpo_object = object;
        bpo->bpo_epb = doi.doi_data_block_size >> SPA_BLKPTRSHIFT;
        bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
        bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
        bpo->bpo_phys = bpo->bpo_dbuf->db_data;
        return (0);
}

boolean_t
bpobj_is_open(const bpobj_t *bpo)
{
        return (bpo->bpo_object != 0);
}

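/*
 * Release the holds taken by bpobj_open() (and any cached data buffer)
 * and reset the in-core state.
 */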
void
bpobj_close(bpobj_t *bpo)
{
        /* Lame workaround for closing a bpobj that was never opened. */
        if (bpo->bpo_object == 0)
                return;

        dmu_buf_rele(bpo->bpo_dbuf, bpo);
        if (bpo->bpo_cached_dbuf != NULL)
                dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
        bpo->bpo_dbuf = NULL;
        bpo->bpo_phys = NULL;
        bpo->bpo_cached_dbuf = NULL;
        bpo->bpo_object = 0;

        mutex_destroy(&bpo->bpo_lock);
}

boolean_t
bpobj_is_empty(bpobj_t *bpo)
{
        return (bpo->bpo_phys->bpo_num_blkptrs == 0 &&
            (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0));
}

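/*
 * Iterate over the block pointers in the bpobj, calling func(arg, bp, tx)
 * on each one.  BPs are visited in reverse order: first the bpobj's own
 * bp array (last entry first), then each subobj (again last first),
 * recursing into nested subobjs.  If free is B_TRUE, each visited entry
 * is removed and the on-disk space and accounting are trimmed as we go,
 * so the object and its accounting stay in sync even if iteration stops
 * early.
 */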
static int
bpobj_iterate_impl(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx,
    boolean_t free)
{
        dmu_object_info_t doi;
        int epb;
        int64_t i;
        int err = 0;
        dmu_buf_t *dbuf = NULL;

        ASSERT(bpobj_is_open(bpo));
        mutex_enter(&bpo->bpo_lock);

        if (free)
                dmu_buf_will_dirty(bpo->bpo_dbuf, tx);

        for (i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= 0; i--) {
                blkptr_t *bparray;
                blkptr_t *bp;
                uint64_t offset, blkoff;

                offset = i * sizeof (blkptr_t);
                blkoff = P2PHASE(i, bpo->bpo_epb);

                if (dbuf == NULL || dbuf->db_offset > offset) {
                        if (dbuf)
                                dmu_buf_rele(dbuf, FTAG);
                        err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, offset,
                            FTAG, &dbuf, 0);
                        if (err)
                                break;
                }

                ASSERT3U(offset, >=, dbuf->db_offset);
                ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

                bparray = dbuf->db_data;
                bp = &bparray[blkoff];
                err = func(arg, bp, tx);
                if (err)
                        break;
                if (free) {
                        bpo->bpo_phys->bpo_bytes -=
                            bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
                        ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
                        if (bpo->bpo_havecomp) {
                                bpo->bpo_phys->bpo_comp -= BP_GET_PSIZE(bp);
                                bpo->bpo_phys->bpo_uncomp -= BP_GET_UCSIZE(bp);
                        }
                        bpo->bpo_phys->bpo_num_blkptrs--;
                        ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0);
                }
        }
        if (dbuf) {
                dmu_buf_rele(dbuf, FTAG);
                dbuf = NULL;
        }
        if (free) {
                VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os, bpo->bpo_object,
                    (i + 1) * sizeof (blkptr_t), DMU_OBJECT_END, tx));
        }
        if (err || !bpo->bpo_havesubobj || bpo->bpo_phys->bpo_subobjs == 0)
                goto out;

        ASSERT(bpo->bpo_havecomp);
        err = dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, &doi);
        if (err) {
                mutex_exit(&bpo->bpo_lock);
                return (err);
        }
        ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
        epb = doi.doi_data_block_size / sizeof (uint64_t);

        for (i = bpo->bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) {
                uint64_t *objarray;
                uint64_t offset, blkoff;
                bpobj_t sublist;
                uint64_t used_before, comp_before, uncomp_before;
                uint64_t used_after, comp_after, uncomp_after;

                offset = i * sizeof (uint64_t);
                blkoff = P2PHASE(i, epb);

                if (dbuf == NULL || dbuf->db_offset > offset) {
                        if (dbuf)
                                dmu_buf_rele(dbuf, FTAG);
                        err = dmu_buf_hold(bpo->bpo_os,
                            bpo->bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0);
                        if (err)
                                break;
                }

                ASSERT3U(offset, >=, dbuf->db_offset);
                ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size);

                objarray = dbuf->db_data;
                err = bpobj_open(&sublist, bpo->bpo_os, objarray[blkoff]);
                if (err)
                        break;
                if (free) {
                        err = bpobj_space(&sublist,
                            &used_before, &comp_before, &uncomp_before);
                        if (err != 0) {
                                bpobj_close(&sublist);
                                break;
                        }
                }
                err = bpobj_iterate_impl(&sublist, func, arg, tx, free);
                if (free) {
                        VERIFY3U(0, ==, bpobj_space(&sublist,
                            &used_after, &comp_after, &uncomp_after));
                        bpo->bpo_phys->bpo_bytes -= used_before - used_after;
                        ASSERT3S(bpo->bpo_phys->bpo_bytes, >=, 0);
                        bpo->bpo_phys->bpo_comp -= comp_before - comp_after;
                        bpo->bpo_phys->bpo_uncomp -=
                            uncomp_before - uncomp_after;
                }

                bpobj_close(&sublist);
                if (err)
                        break;
                if (free) {
                        err = dmu_object_free(bpo->bpo_os,
                            objarray[blkoff], tx);
                        if (err)
                                break;
                        bpo->bpo_phys->bpo_num_subobjs--;
                        ASSERT3S(bpo->bpo_phys->bpo_num_subobjs, >=, 0);
                }
        }
        if (dbuf) {
                dmu_buf_rele(dbuf, FTAG);
                dbuf = NULL;
        }
        if (free) {
                VERIFY3U(0, ==, dmu_free_range(bpo->bpo_os,
                    bpo->bpo_phys->bpo_subobjs,
                    (i + 1) * sizeof (uint64_t), DMU_OBJECT_END, tx));
        }

out:
        /* If there are no entries, there should be no bytes. */
        if (bpobj_is_empty(bpo)) {
                ASSERT0(bpo->bpo_phys->bpo_bytes);
                ASSERT0(bpo->bpo_phys->bpo_comp);
                ASSERT0(bpo->bpo_phys->bpo_uncomp);
        }

        mutex_exit(&bpo->bpo_lock);
        return (err);
}

/*
 * Iterate and remove the entries.  If func returns nonzero, iteration
 * will stop and that entry will not be removed.
 */
int
bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
{
        return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE));
}

/*
 * Iterate the entries.  If func returns nonzero, iteration will stop.
 */
int
bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx)
{
        return (bpobj_iterate_impl(bpo, func, arg, tx, B_FALSE));
}
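
/*
 * A minimal usage sketch (count_cb is hypothetical, not part of this
 * file): tally the bp's in a bpobj without modifying it.  The callback
 * must match bpobj_itor_t, i.e.
 * int (*)(void *arg, const blkptr_t *bp, dmu_tx_t *tx):
 *
 *	static int
 *	count_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 *	{
 *		(*(uint64_t *)arg)++;
 *		return (0);
 *	}
 *
 *	uint64_t count = 0;
 *	err = bpobj_iterate_nofree(bpo, count_cb, &count, NULL);
 *
 * Since nothing is freed, no open transaction is required and tx may be
 * NULL, as bpobj_space_range() below demonstrates.
 */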

/*
 * Logically add subobj's contents to the parent bpobj.
 *
 * In the most general case, this is accomplished in constant time by adding
 * a reference to subobj.  This case is used when enqueuing a large subobj:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *
 * +--------------+                        +--------------+
 * | sub-bpobj    |----------------------> | subsubobj    |
 * +----+----+----+----+---------+----+    +-----+-----+--+--------+-----+
 * | bp | bp | bp | bp |   ...   | bp |    | obj | obj |    ...    | obj |
 * +----+----+----+----+---------+----+    +-----+-----+-----------+-----+
 *
 * Result: sub-bpobj added to parent's subobj list.
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+-----+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj | OBJ |
 * +----+----+----+----+----+              +-----+-----+-----+--|--+
 *                                                              |
 *       /-----------------------------------------------------/
 *       v
 * +--------------+                        +--------------+
 * | sub-bpobj    |----------------------> | subsubobj    |
 * +----+----+----+----+---------+----+    +-----+-----+--+--------+-----+
 * | bp | bp | bp | bp |   ...   | bp |    | obj | obj |    ...    | obj |
 * +----+----+----+----+---------+----+    +-----+-----+-----------+-----+
 *
 *
 * In a common case, the subobj is small: its bp's and its list of subobj's
 * are each stored in a single block.  In this case we copy the subobj's
 * contents to the parent:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *                          ^                                ^
 * +--------------+         |              +--------------+ |
 * | sub-bpobj    |---------^------------> | subsubobj    | ^
 * +----+----+----+         |              +-----+-----+--+ |
 * | BP | BP |-->-->-->-->--/              | OBJ | OBJ |-->-/
 * +----+----+                             +-----+-----+
 *
 * Result: subobj destroyed, contents copied to parent:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+----+----+    +-----+-----+--+--+-----+-----+
 * | bp | bp | bp | bp | bp | BP | BP |    | obj | obj | obj | OBJ | OBJ |
 * +----+----+----+----+----+----+----+    +-----+-----+-----+-----+-----+
 *
 *
 * If the subobj has many BP's but few subobj's, we can copy the sub-subobj's
 * but retain the sub-bpobj:
 * +--------------+                        +--------------+
 * | bpobj        |----------------------->| subobj list  |
 * +----+----+----+----+----+              +-----+-----+--+--+
 * | bp | bp | bp | bp | bp |              | obj | obj | obj |
 * +----+----+----+----+----+              +-----+-----+-----+
 *                                                           ^
 * +--------------+                        +--------------+  |
 * | sub-bpobj    |----------------------> | subsubobj    |  ^
 * +----+----+----+----+---------+----+    +-----+-----+--+  |
 * | bp | bp | bp | bp |   ...   | bp |    | OBJ | OBJ |-->--/
 * +----+----+----+----+---------+----+    +-----+-----+
 *
 * Result: sub-sub-bpobjs and subobj added to parent's subobj list.
 * +--------------+                     +--------------+
 * | bpobj        |-------------------->| subobj list  |
 * +----+----+----+----+----+           +-----+-----+--+--+-----+-----+------+
 * | bp | bp | bp | bp | bp |           | obj | obj | obj | OBJ | OBJ | OBJ* |
 * +----+----+----+----+----+           +-----+-----+-----+-----+-----+--|---+
 *                                                                       |
 *          /------------------------------------------------------------/
 *          v
 * +--------------+
 * | sub-bpobj    |
 * +----+----+----+----+---------+----+
 * | bp | bp | bp | bp |   ...   | bp |
 * +----+----+----+----+---------+----+
 */
void
bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx)
{
        bpobj_t subbpo;
        uint64_t used, comp, uncomp, subsubobjs;
        boolean_t copy_subsub = B_TRUE;
        boolean_t copy_bps = B_TRUE;

        ASSERT(bpobj_is_open(bpo));
        ASSERT(subobj != 0);
        ASSERT(bpo->bpo_havesubobj);
        ASSERT(bpo->bpo_havecomp);
        ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

        if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) {
                bpobj_decr_empty(bpo->bpo_os, tx);
                return;
        }

        VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj));
        VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp));

        if (bpobj_is_empty(&subbpo)) {
                /* No point in having an empty subobj. */
                bpobj_close(&subbpo);
                bpobj_free(bpo->bpo_os, subobj, tx);
                return;
        }

        mutex_enter(&bpo->bpo_lock);
        dmu_buf_will_dirty(bpo->bpo_dbuf, tx);

        dmu_object_info_t doi;

        if (bpo->bpo_phys->bpo_subobjs != 0) {
                ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
                    &doi));
                ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ);
        }

        /*
         * If subobj has only one block of subobjs, then move subobj's
         * subobjs to bpo's subobj list directly.  This reduces recursion in
         * bpobj_iterate due to nested subobjs.
         */
        subsubobjs = subbpo.bpo_phys->bpo_subobjs;
        if (subsubobjs != 0) {
                VERIFY0(dmu_object_info(bpo->bpo_os, subsubobjs, &doi));
                if (doi.doi_max_offset > doi.doi_data_block_size) {
                        copy_subsub = B_FALSE;
                }
        }

        /*
         * If, in addition to having only one block of subobj's, subobj has
         * only one block of bp's, then move subobj's bp's to bpo's bp list
         * directly.  This reduces recursion in bpobj_iterate due to nested
         * subobjs.
         */
        VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subobj, &doi));
        if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) {
                copy_bps = B_FALSE;
        }

        if (copy_subsub && subsubobjs != 0) {
                dmu_buf_t *subdb;
                uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs;

                VERIFY0(dmu_buf_hold(bpo->bpo_os, subsubobjs,
                    0, FTAG, &subdb, 0));
                /*
                 * Make sure that we are not asking dmu_write()
                 * to write more data than we have in our buffer.
                 */
                VERIFY3U(subdb->db_size, >=,
                    numsubsub * sizeof (subobj));
                if (bpo->bpo_phys->bpo_subobjs == 0) {
                        bpo->bpo_phys->bpo_subobjs =
                            dmu_object_alloc(bpo->bpo_os,
                            DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
                            DMU_OT_NONE, 0, tx);
                }
                dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
                    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
                    numsubsub * sizeof (subobj), subdb->db_data, tx);
                dmu_buf_rele(subdb, FTAG);
                bpo->bpo_phys->bpo_num_subobjs += numsubsub;

                dmu_buf_will_dirty(subbpo.bpo_dbuf, tx);
                subbpo.bpo_phys->bpo_subobjs = 0;
                VERIFY0(dmu_object_free(bpo->bpo_os, subsubobjs, tx));
        }

        if (copy_bps) {
                dmu_buf_t *bps;
                uint64_t numbps = subbpo.bpo_phys->bpo_num_blkptrs;

                ASSERT(copy_subsub);
                VERIFY0(dmu_buf_hold(bpo->bpo_os, subobj,
                    0, FTAG, &bps, 0));

                /*
                 * Make sure that we are not asking dmu_write()
                 * to write more data than we have in our buffer.
                 */
                VERIFY3U(bps->db_size, >=, numbps * sizeof (blkptr_t));
                dmu_write(bpo->bpo_os, bpo->bpo_object,
                    bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t),
                    numbps * sizeof (blkptr_t),
                    bps->db_data, tx);
                dmu_buf_rele(bps, FTAG);
                bpo->bpo_phys->bpo_num_blkptrs += numbps;

                bpobj_close(&subbpo);
                VERIFY0(dmu_object_free(bpo->bpo_os, subobj, tx));
        } else {
                bpobj_close(&subbpo);
                if (bpo->bpo_phys->bpo_subobjs == 0) {
                        bpo->bpo_phys->bpo_subobjs =
                            dmu_object_alloc(bpo->bpo_os,
                            DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE,
                            DMU_OT_NONE, 0, tx);
                }

                dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs,
                    bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj),
                    sizeof (subobj), &subobj, tx);
                bpo->bpo_phys->bpo_num_subobjs++;
        }

        bpo->bpo_phys->bpo_bytes += used;
        bpo->bpo_phys->bpo_comp += comp;
        bpo->bpo_phys->bpo_uncomp += uncomp;
        mutex_exit(&bpo->bpo_lock);
}

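/*
 * Append a copy of *bp to the end of this bpobj's bp array and update
 * the space accounting.  The stored copy may be stripped down (see the
 * comments in the function body) so that the bpobj compresses well.
 */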
void
bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, dmu_tx_t *tx)
{
        blkptr_t stored_bp = *bp;
        uint64_t offset;
        int blkoff;
        blkptr_t *bparray;

        ASSERT(bpobj_is_open(bpo));
        ASSERT(!BP_IS_HOLE(bp));
        ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj);

        if (BP_IS_EMBEDDED(bp)) {
                /*
                 * The bpobj will compress better without the payload.
                 *
                 * Note that we store EMBEDDED bp's because they have an
                 * uncompressed size, which must be accounted for.  An
                 * alternative would be to add their size to bpo_uncomp
                 * without storing the bp, but that would create additional
                 * complications: bpo_uncomp would be inconsistent with the
                 * set of BP's stored, and bpobj_iterate() wouldn't visit
                 * all the space accounted for in the bpobj.
                 */
                bzero(&stored_bp, sizeof (stored_bp));
                stored_bp.blk_prop = bp->blk_prop;
                stored_bp.blk_birth = bp->blk_birth;
        } else if (!BP_GET_DEDUP(bp)) {
                /* The bpobj will compress better without the checksum */
                bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum));
        }

        /* We never need the fill count. */
        stored_bp.blk_fill = 0;

        mutex_enter(&bpo->bpo_lock);

        offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp);
        blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb);

        if (bpo->bpo_cached_dbuf == NULL ||
            offset < bpo->bpo_cached_dbuf->db_offset ||
            offset >= bpo->bpo_cached_dbuf->db_offset +
            bpo->bpo_cached_dbuf->db_size) {
                if (bpo->bpo_cached_dbuf)
                        dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
                VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object,
                    offset, bpo, &bpo->bpo_cached_dbuf, 0));
        }

        dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx);
        bparray = bpo->bpo_cached_dbuf->db_data;
        bparray[blkoff] = stored_bp;

        dmu_buf_will_dirty(bpo->bpo_dbuf, tx);
        bpo->bpo_phys->bpo_num_blkptrs++;
        bpo->bpo_phys->bpo_bytes +=
            bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp);
        if (bpo->bpo_havecomp) {
                bpo->bpo_phys->bpo_comp += BP_GET_PSIZE(bp);
                bpo->bpo_phys->bpo_uncomp += BP_GET_UCSIZE(bp);
        }
        mutex_exit(&bpo->bpo_lock);
}

struct space_range_arg {
        spa_t *spa;
        uint64_t mintxg;
        uint64_t maxtxg;
        uint64_t used;
        uint64_t comp;
        uint64_t uncomp;
};

/* ARGSUSED */
static int
space_range_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
        struct space_range_arg *sra = arg;

        if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) {
                if (dsl_pool_sync_context(spa_get_dsl(sra->spa)))
                        sra->used += bp_get_dsize_sync(sra->spa, bp);
                else
                        sra->used += bp_get_dsize(sra->spa, bp);
                sra->comp += BP_GET_PSIZE(bp);
                sra->uncomp += BP_GET_UCSIZE(bp);
        }
        return (0);
}

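/*
 * Return the total space referenced by the bpobj.  When the bpobj has
 * comp/uncomp accounting (post-V0 pools), this reads the cached totals
 * from the bonus buffer; otherwise it falls back to iterating over
 * every bp via bpobj_space_range().
 */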
int
bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
        ASSERT(bpobj_is_open(bpo));
        mutex_enter(&bpo->bpo_lock);

        *usedp = bpo->bpo_phys->bpo_bytes;
        if (bpo->bpo_havecomp) {
                *compp = bpo->bpo_phys->bpo_comp;
                *uncompp = bpo->bpo_phys->bpo_uncomp;
                mutex_exit(&bpo->bpo_lock);
                return (0);
        } else {
                mutex_exit(&bpo->bpo_lock);
                return (bpobj_space_range(bpo, 0, UINT64_MAX,
                    usedp, compp, uncompp));
        }
}

/*
 * Return the amount of space in the bpobj which is:
 * mintxg < blk_birth <= maxtxg
 */
int
bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
        struct space_range_arg sra = { 0 };
        int err;

        ASSERT(bpobj_is_open(bpo));

        /*
         * As an optimization, if they want the whole txg range, just
         * get bpo_bytes rather than iterating over the bps.
         */
        if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp)
                return (bpobj_space(bpo, usedp, compp, uncompp));

        sra.spa = dmu_objset_spa(bpo->bpo_os);
        sra.mintxg = mintxg;
        sra.maxtxg = maxtxg;

        err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL);
        *usedp = sra.used;
        *compp = sra.comp;
        *uncompp = sra.uncomp;
        return (err);
}