/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
 * Copyright 2016 RackTop Systems.
 * Copyright (c) 2016 Actifio, Inc. All rights reserved.
 * Copyright (c) 2019, Klara Inc.
 * Copyright (c) 2019, Allan Jude
 */
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/spa_impl.h>
#include <sys/zfs_ioctl.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_znode.h>
#include <zfs_fletcher.h>
#include <sys/zfs_onexit.h>
#include <sys/dmu_send.h>
#include <sys/dmu_recv.h>
#include <sys/dsl_destroy.h>
#include <sys/blkptr.h>
#include <sys/dsl_bookmark.h>
#include <sys/zfeature.h>
#include <sys/bqueue.h>
#include <sys/policy.h>
#include <sys/objlist.h>
#include <sys/zfs_vfsops.h>
/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
static int zfs_send_corrupt_data = B_FALSE;

/*
 * This tunable controls the amount of data (measured in bytes) that will be
 * prefetched by zfs send. If the main thread is blocking on reads that haven't
 * completed, this variable might need to be increased. If instead the main
 * thread is issuing new reads because the prefetches have fallen out of the
 * cache, this may need to be decreased.
 */
static int zfs_send_queue_length = SPA_MAXBLOCKSIZE;

/*
 * This tunable controls the length of the queues that zfs send worker threads
 * use to communicate. If the send_main_thread is blocking on these queues,
 * this variable may need to be increased. If there is a significant slowdown
 * at the start of a send as these threads consume all the available IO
 * resources, this variable may need to be decreased.
 */
static int zfs_send_no_prefetch_queue_length = 1024 * 1024;

/*
 * These tunables control the fill fraction of the queues by zfs send. The fill
 * fraction controls the frequency with which threads have to be cv_signaled.
 * If a lot of cpu time is being spent on cv_signal, then these should be tuned
 * down. If the queues empty before the signalled thread can catch up, then
 * these should be tuned up.
 */
static int zfs_send_queue_ff = 20;
static int zfs_send_no_prefetch_queue_ff = 20;

/*
 * Use this to override the recordsize calculation for fast zfs send estimates.
 */
static int zfs_override_estimate_recordsize = 0;

/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */
static const boolean_t zfs_send_set_freerecords_bit = B_TRUE;
/* Set this tunable to FALSE to disable sending of unmodified spill blocks. */
static int zfs_send_unmodified_spill_blocks = B_TRUE;
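
/*
 * Editorial note (not in the original source): on Linux builds of OpenZFS,
 * tunables like the ones above are typically exported as module parameters
 * (e.g. /sys/module/zfs/parameters/zfs_send_queue_length) via
 * ZFS_MODULE_PARAM() declarations elsewhere in the file; the exact set of
 * exported names may vary by platform and release.
 */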
static inline boolean_t
overflow_multiply(uint64_t a, uint64_t b, uint64_t *c)
{
	uint64_t temp = a * b;
	if (b != 0 && temp / b != a)
		return (B_FALSE);
	*c = temp;
	return (B_TRUE);
}
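
/*
 * Illustrative usage (mirrors the callers later in this file): convert a
 * block id to a byte offset, bailing out if the product would wrap:
 *
 *	uint64_t offset;
 *	if (!overflow_multiply(start_blkid, datablksz, &offset))
 *		return (0);	// product wrapped; no record is needed
 */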
struct send_thread_arg {
	bqueue_t q;
	objset_t *os;		/* Objset to traverse */
	uint64_t fromtxg;	/* Traverse from this txg */
	int flags;		/* flags to pass to traverse_dataset */
	int error_code;
	boolean_t cancel;
	zbookmark_phys_t resume;
	uint64_t *num_blocks_visited;
};
struct redact_list_thread_arg {
	boolean_t cancel;
	bqueue_t q;
	zbookmark_phys_t resume;
	redaction_list_t *rl;
	boolean_t mark_redact;
	int error_code;
	uint64_t *num_blocks_visited;
};
struct send_merge_thread_arg {
	bqueue_t q;
	objset_t *os;
	struct redact_list_thread_arg *from_arg;
	struct send_thread_arg *to_arg;
	struct redact_list_thread_arg *redact_arg;
	int error;
	boolean_t cancel;
};

struct send_range {
	boolean_t eos_marker; /* Marks the end of the stream */
	uint64_t object;
	uint64_t start_blkid;
	uint64_t end_blkid;
	bqueue_node_t ln;
	enum type {DATA, HOLE, OBJECT, OBJECT_RANGE, REDACT,
	    PREVIOUSLY_REDACTED} type;
	union {
		struct srd {
			dmu_object_type_t obj_type;
			uint32_t datablksz; // logical size
			uint32_t datasz; // payload size
			blkptr_t bp;
			arc_buf_t *abuf;
			abd_t *abd;
			kmutex_t lock;
			kcondvar_t cv;
			boolean_t io_outstanding;
			boolean_t io_compressed;
			int io_err;
		} data;
		struct srh {
			uint32_t datablksz;
		} hole;
		struct sro {
			/*
			 * This is a pointer because embedding it in the
			 * struct causes these structures to be massively larger
			 * for all range types; this makes the code much less
			 * memory efficient.
			 */
			dnode_phys_t *dnp;
			blkptr_t bp;
		} object;
		struct srhp {
			uint32_t datablksz;
		} redact;
		struct sror {
			blkptr_t bp;
		} object_range;
	} sru;
};

/*
 * The list of data whose inclusion in a send stream can be pending from
 * one call to backup_cb to another.  Multiple calls to dump_free(),
 * dump_freeobjects(), and dump_redact() can be aggregated into a single
 * DRR_FREE, DRR_FREEOBJECTS, or DRR_REDACT replay record.
 */
typedef enum {
	PENDING_NONE,
	PENDING_FREE,
	PENDING_FREEOBJECTS,
	PENDING_REDACT
} dmu_pendop_t;
typedef struct dmu_send_cookie {
	dmu_replay_record_t *dsc_drr;
	dmu_send_outparams_t *dsc_dso;
	offset_t *dsc_off;
	objset_t *dsc_os;
	zio_cksum_t dsc_zc;
	uint64_t dsc_toguid;
	uint64_t dsc_fromtxg;
	int dsc_err;
	dmu_pendop_t dsc_pending_op;
	uint64_t dsc_featureflags;
	uint64_t dsc_last_data_object;
	uint64_t dsc_last_data_offset;
	uint64_t dsc_resume_object;
	uint64_t dsc_resume_offset;
	boolean_t dsc_sent_begin;
	boolean_t dsc_sent_end;
} dmu_send_cookie_t;
static int do_dump(dmu_send_cookie_t *dscp, struct send_range *range);
static void
range_free(struct send_range *range)
{
	if (range->type == OBJECT) {
		size_t size = sizeof (dnode_phys_t) *
		    (range->sru.object.dnp->dn_extra_slots + 1);
		kmem_free(range->sru.object.dnp, size);
	} else if (range->type == DATA) {
		mutex_enter(&range->sru.data.lock);
		while (range->sru.data.io_outstanding)
			cv_wait(&range->sru.data.cv, &range->sru.data.lock);
		if (range->sru.data.abd != NULL)
			abd_free(range->sru.data.abd);
		if (range->sru.data.abuf != NULL) {
			arc_buf_destroy(range->sru.data.abuf,
			    &range->sru.data.abuf);
		}
		mutex_exit(&range->sru.data.lock);

		cv_destroy(&range->sru.data.cv);
		mutex_destroy(&range->sru.data.lock);
	}
	kmem_free(range, sizeof (*range));
}
/*
 * For all record types except BEGIN, fill in the checksum (overlaid in
 * drr_u.drr_checksum.drr_checksum).  The checksum verifies everything
 * up to the start of the checksum itself.
 */
static int
dump_record(dmu_send_cookie_t *dscp, void *payload, int payload_len)
{
	dmu_send_outparams_t *dso = dscp->dsc_dso;
	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
	(void) fletcher_4_incremental_native(dscp->dsc_drr,
	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
	    &dscp->dsc_zc);
	if (dscp->dsc_drr->drr_type == DRR_BEGIN) {
		dscp->dsc_sent_begin = B_TRUE;
	} else {
		ASSERT(ZIO_CHECKSUM_IS_ZERO(&dscp->dsc_drr->drr_u.
		    drr_checksum.drr_checksum));
		dscp->dsc_drr->drr_u.drr_checksum.drr_checksum = dscp->dsc_zc;
	}
	if (dscp->dsc_drr->drr_type == DRR_END) {
		dscp->dsc_sent_end = B_TRUE;
	}
	(void) fletcher_4_incremental_native(&dscp->dsc_drr->
	    drr_u.drr_checksum.drr_checksum,
	    sizeof (zio_cksum_t), &dscp->dsc_zc);
	*dscp->dsc_off += sizeof (dmu_replay_record_t);
	dscp->dsc_err = dso->dso_outfunc(dscp->dsc_os, dscp->dsc_drr,
	    sizeof (dmu_replay_record_t), dso->dso_arg);
	if (dscp->dsc_err != 0)
		return (SET_ERROR(EINTR));
	if (payload_len != 0) {
		*dscp->dsc_off += payload_len;
		/*
		 * payload is null when dso_dryrun == B_TRUE (i.e. when we're
		 * doing a send size calculation)
		 */
		if (payload != NULL) {
			(void) fletcher_4_incremental_native(
			    payload, payload_len, &dscp->dsc_zc);
		}

		/*
		 * The code does not rely on this (len being a multiple of 8).
		 * We keep this assertion because of the corresponding assertion
		 * in receive_read().  Keeping this assertion ensures that we do
		 * not inadvertently break backwards compatibility (causing the
		 * assertion in receive_read() to trigger on old software).
		 *
		 * Raw sends cannot be received on old software, and so can
		 * bypass this assertion.
		 */
		ASSERT((payload_len % 8 == 0) ||
		    (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW));

		dscp->dsc_err = dso->dso_outfunc(dscp->dsc_os, payload,
		    payload_len, dso->dso_arg);
		if (dscp->dsc_err != 0)
			return (SET_ERROR(EINTR));
	}
	return (0);
}
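
/*
 * Illustrative note (not in the original source): on the wire, the stream
 * produced by dump_record() is a sequence of fixed-size dmu_replay_record_t
 * headers, each optionally followed by its payload, with the running
 * fletcher-4 checksum stored in the header itself (except for DRR_BEGIN):
 *
 *	| header (embedded cksum) | payload | header | payload | ... |
 */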
/*
 * Fill in the drr_free struct, or perform aggregation if the previous record is
 * also a free record, and the two are adjacent.
 *
 * Note that we send free records even for a full send, because we want to be
 * able to receive a full send as a clone, which requires a list of all the free
 * and freeobject records that were generated on the source.
 */
static int
dump_free(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset,
    uint64_t length)
{
	struct drr_free *drrf = &(dscp->dsc_drr->drr_u.drr_free);

	/*
	 * When we receive a free record, dbuf_free_range() assumes
	 * that the receiving system doesn't have any dbufs in the range
	 * being freed.  This is always true because there is a one-record
	 * constraint: we only send one WRITE record for any given
	 * object,offset.  We know that the one-record constraint is
	 * true because we always send data in increasing order by
	 * object,offset.
	 *
	 * If the increasing-order constraint ever changes, we should find
	 * another way to assert that the one-record constraint is still
	 * satisfied.
	 */
	ASSERT(object > dscp->dsc_last_data_object ||
	    (object == dscp->dsc_last_data_object &&
	    offset > dscp->dsc_last_data_offset));

	/*
	 * If there is a pending op, but it's not PENDING_FREE, push it out,
	 * since free block aggregation can only be done for blocks of the
	 * same type (i.e., DRR_FREE records can only be aggregated with
	 * other DRR_FREE records.  DRR_FREEOBJECTS records can only be
	 * aggregated with other DRR_FREEOBJECTS records).
	 */
	if (dscp->dsc_pending_op != PENDING_NONE &&
	    dscp->dsc_pending_op != PENDING_FREE) {
		if (dump_record(dscp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dscp->dsc_pending_op = PENDING_NONE;
	}

	if (dscp->dsc_pending_op == PENDING_FREE) {
		/*
		 * Check to see whether this free block can be aggregated
		 * with the pending one.
		 */
		if (drrf->drr_object == object && drrf->drr_offset +
		    drrf->drr_length == offset) {
			if (offset + length < offset || length == UINT64_MAX)
				drrf->drr_length = UINT64_MAX;
			else
				drrf->drr_length += length;
			return (0);
		} else {
			/* not a continuation.  Push out pending record */
			if (dump_record(dscp, NULL, 0) != 0)
				return (SET_ERROR(EINTR));
			dscp->dsc_pending_op = PENDING_NONE;
		}
	}
	/* create a FREE record and make it pending */
	bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
	dscp->dsc_drr->drr_type = DRR_FREE;
	drrf->drr_object = object;
	drrf->drr_offset = offset;
	if (offset + length < offset)
		drrf->drr_length = DMU_OBJECT_END;
	else
		drrf->drr_length = length;
	drrf->drr_toguid = dscp->dsc_toguid;
	if (length == DMU_OBJECT_END) {
		if (dump_record(dscp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
	} else {
		dscp->dsc_pending_op = PENDING_FREE;
	}

	return (0);
}
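
/*
 * Example (illustrative): for object 5 with 4K blocks, the calls
 * dump_free(dscp, 5, 0, 4096), dump_free(dscp, 5, 4096, 4096) and
 * dump_free(dscp, 5, 8192, 4096) leave one pending DRR_FREE covering
 * [0, 12288); it is only emitted once a non-adjacent or differently
 * typed record forces the pending op out.
 */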
/*
 * Fill in the drr_redact struct, or perform aggregation if the previous record
 * is also a redaction record, and the two are adjacent.
 */
static int
dump_redact(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset,
    uint64_t length)
{
	struct drr_redact *drrr = &dscp->dsc_drr->drr_u.drr_redact;

	/*
	 * If there is a pending op, but it's not PENDING_REDACT, push it out,
	 * since free block aggregation can only be done for blocks of the
	 * same type (i.e., DRR_REDACT records can only be aggregated with
	 * other DRR_REDACT records).
	 */
	if (dscp->dsc_pending_op != PENDING_NONE &&
	    dscp->dsc_pending_op != PENDING_REDACT) {
		if (dump_record(dscp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dscp->dsc_pending_op = PENDING_NONE;
	}

	if (dscp->dsc_pending_op == PENDING_REDACT) {
		/*
		 * Check to see whether this redacted block can be aggregated
		 * with the pending one.
		 */
		if (drrr->drr_object == object && drrr->drr_offset +
		    drrr->drr_length == offset) {
			drrr->drr_length += length;
			return (0);
		} else {
			/* not a continuation.  Push out pending record */
			if (dump_record(dscp, NULL, 0) != 0)
				return (SET_ERROR(EINTR));
			dscp->dsc_pending_op = PENDING_NONE;
		}
	}
	/* create a REDACT record and make it pending */
	bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
	dscp->dsc_drr->drr_type = DRR_REDACT;
	drrr->drr_object = object;
	drrr->drr_offset = offset;
	drrr->drr_length = length;
	drrr->drr_toguid = dscp->dsc_toguid;
	dscp->dsc_pending_op = PENDING_REDACT;

	return (0);
}
static int
dmu_dump_write(dmu_send_cookie_t *dscp, dmu_object_type_t type, uint64_t object,
    uint64_t offset, int lsize, int psize, const blkptr_t *bp,
    boolean_t io_compressed, void *data)
{
	uint64_t payload_size;
	boolean_t raw = (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW);
	struct drr_write *drrw = &(dscp->dsc_drr->drr_u.drr_write);

	/*
	 * We send data in increasing object, offset order.
	 * See comment in dump_free() for details.
	 */
	ASSERT(object > dscp->dsc_last_data_object ||
	    (object == dscp->dsc_last_data_object &&
	    offset > dscp->dsc_last_data_offset));
	dscp->dsc_last_data_object = object;
	dscp->dsc_last_data_offset = offset + lsize - 1;

	/*
	 * If there is any kind of pending aggregation (currently either
	 * a grouping of free objects or free blocks), push it out to
	 * the stream, since aggregation can't be done across operations
	 * of different types.
	 */
	if (dscp->dsc_pending_op != PENDING_NONE) {
		if (dump_record(dscp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dscp->dsc_pending_op = PENDING_NONE;
	}
	/* write a WRITE record */
	bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
	dscp->dsc_drr->drr_type = DRR_WRITE;
	drrw->drr_object = object;
	drrw->drr_type = type;
	drrw->drr_offset = offset;
	drrw->drr_toguid = dscp->dsc_toguid;
	drrw->drr_logical_size = lsize;

	/* only set the compression fields if the buf is compressed or raw */
	boolean_t compressed =
	    (bp != NULL ? BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
	    io_compressed : lsize != psize);
	if (raw || compressed) {
		ASSERT(raw || dscp->dsc_featureflags &
		    DMU_BACKUP_FEATURE_COMPRESSED);
		ASSERT(!BP_IS_EMBEDDED(bp));
		ASSERT3S(psize, >, 0);

		if (raw) {
			ASSERT(BP_IS_PROTECTED(bp));

			/*
			 * This is a raw protected block so we need to pass
			 * along everything the receiving side will need to
			 * interpret this block, including the byteswap, salt,
			 * IV, and MAC.
			 */
			if (BP_SHOULD_BYTESWAP(bp))
				drrw->drr_flags |= DRR_RAW_BYTESWAP;
			zio_crypt_decode_params_bp(bp, drrw->drr_salt,
			    drrw->drr_iv);
			zio_crypt_decode_mac_bp(bp, drrw->drr_mac);
		} else {
			/* this is a compressed block */
			ASSERT(dscp->dsc_featureflags &
			    DMU_BACKUP_FEATURE_COMPRESSED);
			ASSERT(!BP_SHOULD_BYTESWAP(bp));
			ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)));
			ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF);
			ASSERT3S(lsize, >=, psize);
		}

		/* set fields common to compressed and raw sends */
		drrw->drr_compressiontype = BP_GET_COMPRESS(bp);
		drrw->drr_compressed_size = psize;
		payload_size = drrw->drr_compressed_size;
	} else {
		payload_size = drrw->drr_logical_size;
	}

	if (bp == NULL || BP_IS_EMBEDDED(bp) || (BP_IS_PROTECTED(bp) && !raw)) {
		/*
		 * There's no pre-computed checksum for partial-block writes,
		 * embedded BP's, or encrypted BP's that are being sent as
		 * plaintext, so (like fletcher4-checksummed blocks) userland
		 * will have to compute a dedup-capable checksum itself.
		 */
		drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
	} else {
		drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
		if (zio_checksum_table[drrw->drr_checksumtype].ci_flags &
		    ZCHECKSUM_FLAG_DEDUP)
			drrw->drr_flags |= DRR_CHECKSUM_DEDUP;
		DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
		DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
		DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
		DDK_SET_CRYPT(&drrw->drr_key, BP_IS_PROTECTED(bp));
		drrw->drr_key.ddk_cksum = bp->blk_cksum;
	}

	if (dump_record(dscp, data, payload_size) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}
static int
dump_write_embedded(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset,
    int blksz, const blkptr_t *bp)
{
	char buf[BPE_PAYLOAD_SIZE];
	struct drr_write_embedded *drrw =
	    &(dscp->dsc_drr->drr_u.drr_write_embedded);

	if (dscp->dsc_pending_op != PENDING_NONE) {
		if (dump_record(dscp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dscp->dsc_pending_op = PENDING_NONE;
	}

	ASSERT(BP_IS_EMBEDDED(bp));

	bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
	dscp->dsc_drr->drr_type = DRR_WRITE_EMBEDDED;
	drrw->drr_object = object;
	drrw->drr_offset = offset;
	drrw->drr_length = blksz;
	drrw->drr_toguid = dscp->dsc_toguid;
	drrw->drr_compression = BP_GET_COMPRESS(bp);
	drrw->drr_etype = BPE_GET_ETYPE(bp);
	drrw->drr_lsize = BPE_GET_LSIZE(bp);
	drrw->drr_psize = BPE_GET_PSIZE(bp);

	decode_embedded_bp_compressed(bp, buf);

	if (dump_record(dscp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}
static int
dump_spill(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object,
    void *data)
{
	struct drr_spill *drrs = &(dscp->dsc_drr->drr_u.drr_spill);
	uint64_t blksz = BP_GET_LSIZE(bp);
	uint64_t payload_size = blksz;

	if (dscp->dsc_pending_op != PENDING_NONE) {
		if (dump_record(dscp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dscp->dsc_pending_op = PENDING_NONE;
	}

	/* write a SPILL record */
	bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
	dscp->dsc_drr->drr_type = DRR_SPILL;
	drrs->drr_object = object;
	drrs->drr_length = blksz;
	drrs->drr_toguid = dscp->dsc_toguid;

	/* See comment in dump_dnode() for full details */
	if (zfs_send_unmodified_spill_blocks &&
	    (bp->blk_birth <= dscp->dsc_fromtxg)) {
		drrs->drr_flags |= DRR_SPILL_UNMODIFIED;
	}

	/* handle raw send fields */
	if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) {
		ASSERT(BP_IS_PROTECTED(bp));

		if (BP_SHOULD_BYTESWAP(bp))
			drrs->drr_flags |= DRR_RAW_BYTESWAP;
		drrs->drr_compressiontype = BP_GET_COMPRESS(bp);
		drrs->drr_compressed_size = BP_GET_PSIZE(bp);
		zio_crypt_decode_params_bp(bp, drrs->drr_salt, drrs->drr_iv);
		zio_crypt_decode_mac_bp(bp, drrs->drr_mac);
		payload_size = drrs->drr_compressed_size;
	}

	if (dump_record(dscp, data, payload_size) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}
static int
dump_freeobjects(dmu_send_cookie_t *dscp, uint64_t firstobj, uint64_t numobjs)
{
	struct drr_freeobjects *drrfo = &(dscp->dsc_drr->drr_u.drr_freeobjects);
	uint64_t maxobj = DNODES_PER_BLOCK *
	    (DMU_META_DNODE(dscp->dsc_os)->dn_maxblkid + 1);

	/*
	 * ZoL < 0.7 does not handle large FREEOBJECTS records correctly,
	 * leading to zfs recv never completing. To avoid this issue, don't
	 * send FREEOBJECTS records for object IDs which cannot exist on the
	 * receiving side.
	 */
	if (maxobj > 0) {
		if (maxobj <= firstobj)
			return (0);

		if (maxobj < firstobj + numobjs)
			numobjs = maxobj - firstobj;
	}

	/*
	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
	 * push it out, since free block aggregation can only be done for
	 * blocks of the same type (i.e., DRR_FREE records can only be
	 * aggregated with other DRR_FREE records.  DRR_FREEOBJECTS records
	 * can only be aggregated with other DRR_FREEOBJECTS records).
	 */
	if (dscp->dsc_pending_op != PENDING_NONE &&
	    dscp->dsc_pending_op != PENDING_FREEOBJECTS) {
		if (dump_record(dscp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dscp->dsc_pending_op = PENDING_NONE;
	}

	if (dscp->dsc_pending_op == PENDING_FREEOBJECTS) {
		/*
		 * See whether this free object array can be aggregated
		 * with the pending one.
		 */
		if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
			drrfo->drr_numobjs += numobjs;
			return (0);
		} else {
			/* can't be aggregated.  Push out pending record */
			if (dump_record(dscp, NULL, 0) != 0)
				return (SET_ERROR(EINTR));
			dscp->dsc_pending_op = PENDING_NONE;
		}
	}

	/* write a FREEOBJECTS record */
	bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
	dscp->dsc_drr->drr_type = DRR_FREEOBJECTS;
	drrfo->drr_firstobj = firstobj;
	drrfo->drr_numobjs = numobjs;
	drrfo->drr_toguid = dscp->dsc_toguid;

	dscp->dsc_pending_op = PENDING_FREEOBJECTS;

	return (0);
}
static int
dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object,
    dnode_phys_t *dnp)
{
	struct drr_object *drro = &(dscp->dsc_drr->drr_u.drr_object);
	int bonuslen;

	if (object < dscp->dsc_resume_object) {
		/*
		 * Note: when resuming, we will visit all the dnodes in
		 * the block of dnodes that we are resuming from.  In
		 * this case it's unnecessary to send the dnodes prior to
		 * the one we are resuming from.  We should be at most one
		 * block's worth of dnodes behind the resume point.
		 */
		ASSERT3U(dscp->dsc_resume_object - object, <,
		    1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT));
		return (0);
	}

	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
		return (dump_freeobjects(dscp, object, 1));

	if (dscp->dsc_pending_op != PENDING_NONE) {
		if (dump_record(dscp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dscp->dsc_pending_op = PENDING_NONE;
	}

	/* write an OBJECT record */
	bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
	dscp->dsc_drr->drr_type = DRR_OBJECT;
	drro->drr_object = object;
	drro->drr_type = dnp->dn_type;
	drro->drr_bonustype = dnp->dn_bonustype;
	drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	drro->drr_bonuslen = dnp->dn_bonuslen;
	drro->drr_dn_slots = dnp->dn_extra_slots + 1;
	drro->drr_checksumtype = dnp->dn_checksum;
	drro->drr_compress = dnp->dn_compress;
	drro->drr_toguid = dscp->dsc_toguid;

	if (!(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
	    drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
		drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;

	bonuslen = P2ROUNDUP(dnp->dn_bonuslen, 8);

	if ((dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW)) {
		ASSERT(BP_IS_ENCRYPTED(bp));

		if (BP_SHOULD_BYTESWAP(bp))
			drro->drr_flags |= DRR_RAW_BYTESWAP;

		/* needed for reconstructing dnp on recv side */
		drro->drr_maxblkid = dnp->dn_maxblkid;
		drro->drr_indblkshift = dnp->dn_indblkshift;
		drro->drr_nlevels = dnp->dn_nlevels;
		drro->drr_nblkptr = dnp->dn_nblkptr;

		/*
		 * Since we encrypt the entire bonus area, the (raw) part
		 * beyond the bonuslen is actually nonzero, so we need
		 * to send it.
		 */
		if (bonuslen != 0) {
			drro->drr_raw_bonuslen = DN_MAX_BONUS_LEN(dnp);
			bonuslen = drro->drr_raw_bonuslen;
		}
	}

	/*
	 * DRR_OBJECT_SPILL is set for every dnode which references a
	 * spill block.  This allows the receiving pool to definitively
	 * determine when a spill block should be kept or freed.
	 */
	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
		drro->drr_flags |= DRR_OBJECT_SPILL;

	if (dump_record(dscp, DN_BONUS(dnp), bonuslen) != 0)
		return (SET_ERROR(EINTR));

	/* Free anything past the end of the file. */
	if (dump_free(dscp, object, (dnp->dn_maxblkid + 1) *
	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0)
		return (SET_ERROR(EINTR));

	/*
	 * Send DRR_SPILL records for unmodified spill blocks.  This is useful
	 * because changing certain attributes of the object (e.g. blocksize)
	 * can cause old versions of ZFS to incorrectly remove a spill block.
	 * Including these records in the stream forces an up to date version
	 * to always be written ensuring they're never lost.  Current versions
	 * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can
	 * ignore these unmodified spill blocks.
	 */
	if (zfs_send_unmodified_spill_blocks &&
	    (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
	    (DN_SPILL_BLKPTR(dnp)->blk_birth <= dscp->dsc_fromtxg)) {
		struct send_range record;
		blkptr_t *bp = DN_SPILL_BLKPTR(dnp);

		bzero(&record, sizeof (struct send_range));
		record.type = DATA;
		record.object = object;
		record.eos_marker = B_FALSE;
		record.start_blkid = DMU_SPILL_BLKID;
		record.end_blkid = record.start_blkid + 1;
		record.sru.data.bp = *bp;
		record.sru.data.obj_type = dnp->dn_type;
		record.sru.data.datablksz = BP_GET_LSIZE(bp);

		if (do_dump(dscp, &record) != 0)
			return (SET_ERROR(EINTR));
	}

	if (dscp->dsc_err != 0)
		return (SET_ERROR(EINTR));

	return (0);
}
static int
dump_object_range(dmu_send_cookie_t *dscp, const blkptr_t *bp,
    uint64_t firstobj, uint64_t numslots)
{
	struct drr_object_range *drror =
	    &(dscp->dsc_drr->drr_u.drr_object_range);

	/* we only use this record type for raw sends */
	ASSERT(BP_IS_PROTECTED(bp));
	ASSERT(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW);
	ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
	ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_DNODE);
	ASSERT0(BP_GET_LEVEL(bp));

	if (dscp->dsc_pending_op != PENDING_NONE) {
		if (dump_record(dscp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dscp->dsc_pending_op = PENDING_NONE;
	}

	bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
	dscp->dsc_drr->drr_type = DRR_OBJECT_RANGE;
	drror->drr_firstobj = firstobj;
	drror->drr_numslots = numslots;
	drror->drr_toguid = dscp->dsc_toguid;
	if (BP_SHOULD_BYTESWAP(bp))
		drror->drr_flags |= DRR_RAW_BYTESWAP;
	zio_crypt_decode_params_bp(bp, drror->drr_salt, drror->drr_iv);
	zio_crypt_decode_mac_bp(bp, drror->drr_mac);

	if (dump_record(dscp, NULL, 0) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}
static boolean_t
send_do_embed(const blkptr_t *bp, uint64_t featureflags)
{
	if (!BP_IS_EMBEDDED(bp))
		return (B_FALSE);

	/*
	 * Compression function must be legacy, or explicitly enabled.
	 */
	if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
	    !(featureflags & DMU_BACKUP_FEATURE_LZ4)))
		return (B_FALSE);

	/*
	 * If we have not set the ZSTD feature flag, we can't send ZSTD
	 * compressed embedded blocks, as the receiver may not support them.
	 */
	if ((BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD &&
	    !(featureflags & DMU_BACKUP_FEATURE_ZSTD)))
		return (B_FALSE);

	/*
	 * Embed type must be explicitly enabled.
	 */
	switch (BPE_GET_ETYPE(bp)) {
	case BP_EMBEDDED_TYPE_DATA:
		if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
			return (B_TRUE);
		break;
	default:
		return (B_FALSE);
	}
	return (B_FALSE);
}
/*
 * This function actually handles figuring out what kind of record needs to be
 * dumped, and calling the appropriate helper function.  In most cases,
 * the data has already been read by send_reader_thread().
 */
static int
do_dump(dmu_send_cookie_t *dscp, struct send_range *range)
{
	int err = 0;
	switch (range->type) {
	case OBJECT:
		err = dump_dnode(dscp, &range->sru.object.bp, range->object,
		    range->sru.object.dnp);
		return (err);
	case OBJECT_RANGE: {
		ASSERT3U(range->start_blkid + 1, ==, range->end_blkid);
		if (!(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW)) {
			return (0);
		}
		uint64_t epb = BP_GET_LSIZE(&range->sru.object_range.bp) >>
		    DNODE_SHIFT;
		uint64_t firstobj = range->start_blkid * epb;
		err = dump_object_range(dscp, &range->sru.object_range.bp,
		    firstobj, epb);
		return (err);
	}
	case REDACT: {
		struct srr *srrp = &range->sru.redact;
		err = dump_redact(dscp, range->object, range->start_blkid *
		    srrp->datablksz, (range->end_blkid - range->start_blkid) *
		    srrp->datablksz);
		return (err);
	}
	case DATA: {
		struct srd *srdp = &range->sru.data;
		blkptr_t *bp = &srdp->bp;
		spa_t *spa = dmu_objset_spa(dscp->dsc_os);

		ASSERT3U(srdp->datablksz, ==, BP_GET_LSIZE(bp));
		ASSERT3U(range->start_blkid + 1, ==, range->end_blkid);
		if (BP_GET_TYPE(bp) == DMU_OT_SA) {
			arc_flags_t aflags = ARC_FLAG_WAIT;
			enum zio_flag zioflags = ZIO_FLAG_CANFAIL;

			if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) {
				ASSERT(BP_IS_PROTECTED(bp));
				zioflags |= ZIO_FLAG_RAW;
			}

			zbookmark_phys_t zb;
			ASSERT3U(range->start_blkid, ==, DMU_SPILL_BLKID);
			zb.zb_objset = dmu_objset_id(dscp->dsc_os);
			zb.zb_object = range->object;
			zb.zb_level = 0;
			zb.zb_blkid = range->start_blkid;

			arc_buf_t *abuf = NULL;
			if (!dscp->dsc_dso->dso_dryrun && arc_read(NULL, spa,
			    bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
			    zioflags, &aflags, &zb) != 0)
				return (SET_ERROR(EIO));

			err = dump_spill(dscp, bp, zb.zb_object,
			    (abuf == NULL ? NULL : abuf->b_data));
			if (abuf != NULL)
				arc_buf_destroy(abuf, &abuf);
			return (err);
		}
		if (send_do_embed(bp, dscp->dsc_featureflags)) {
			err = dump_write_embedded(dscp, range->object,
			    range->start_blkid * srdp->datablksz,
			    srdp->datablksz, bp);
			return (err);
		}
		ASSERT(range->object > dscp->dsc_resume_object ||
		    (range->object == dscp->dsc_resume_object &&
		    range->start_blkid * srdp->datablksz >=
		    dscp->dsc_resume_offset));
		/* it's a level-0 block of a regular object */

		mutex_enter(&srdp->lock);
		while (srdp->io_outstanding)
			cv_wait(&srdp->cv, &srdp->lock);
		err = srdp->io_err;
		mutex_exit(&srdp->lock);

		if (err != 0) {
			if (zfs_send_corrupt_data &&
			    !dscp->dsc_dso->dso_dryrun) {
				/*
				 * Send a block filled with 0x"zfs badd bloc"
				 */
				srdp->abuf = arc_alloc_buf(spa, &srdp->abuf,
				    ARC_BUFC_DATA, srdp->datablksz);
				uint64_t *ptr;
				for (ptr = srdp->abuf->b_data;
				    (char *)ptr < (char *)srdp->abuf->b_data +
				    srdp->datablksz; ptr++)
					*ptr = 0x2f5baddb10cULL;
			} else {
				return (SET_ERROR(EIO));
			}
		}

		ASSERT(dscp->dsc_dso->dso_dryrun ||
		    srdp->abuf != NULL || srdp->abd != NULL);

		uint64_t offset = range->start_blkid * srdp->datablksz;

		char *data = NULL;
		if (srdp->abd != NULL) {
			data = abd_to_buf(srdp->abd);
			ASSERT3P(srdp->abuf, ==, NULL);
		} else if (srdp->abuf != NULL) {
			data = srdp->abuf->b_data;
		}

		/*
		 * If we have large blocks stored on disk but the send flags
		 * don't allow us to send large blocks, we split the data from
		 * the arc buf into chunks.
		 */
		if (srdp->datablksz > SPA_OLD_MAXBLOCKSIZE &&
		    !(dscp->dsc_featureflags &
		    DMU_BACKUP_FEATURE_LARGE_BLOCKS)) {
			while (srdp->datablksz > 0 && err == 0) {
				int n = MIN(srdp->datablksz,
				    SPA_OLD_MAXBLOCKSIZE);
				err = dmu_dump_write(dscp, srdp->obj_type,
				    range->object, offset, n, n, NULL, B_FALSE,
				    data);
				offset += n;
				/*
				 * When doing dry run, data==NULL is used as a
				 * sentinel value by
				 * dmu_dump_write()->dump_record().
				 */
				if (data != NULL)
					data += n;
				srdp->datablksz -= n;
			}
		} else {
			err = dmu_dump_write(dscp, srdp->obj_type,
			    range->object, offset,
			    srdp->datablksz, srdp->datasz, bp,
			    srdp->io_compressed, data);
		}
		return (err);
	}
	case HOLE: {
		struct srh *srhp = &range->sru.hole;
		if (range->object == DMU_META_DNODE_OBJECT) {
			uint32_t span = srhp->datablksz >> DNODE_SHIFT;
			uint64_t first_obj = range->start_blkid * span;
			uint64_t numobj = range->end_blkid * span - first_obj;
			return (dump_freeobjects(dscp, first_obj, numobj));
		}
		uint64_t offset = 0;

		/*
		 * If this multiply overflows, we don't need to send this block.
		 * Even if it has a birth time, it can never not be a hole, so
		 * we don't need to send records for it.
		 */
		if (!overflow_multiply(range->start_blkid, srhp->datablksz,
		    &offset)) {
			return (0);
		}
		uint64_t len = 0;

		if (!overflow_multiply(range->end_blkid, srhp->datablksz, &len))
			len = UINT64_MAX;
		len = len - offset;
		return (dump_free(dscp, range->object, offset, len));
	}
	default:
		panic("Invalid range type in do_dump: %d", range->type);
	}
	return (err);
}
static struct send_range *
range_alloc(enum type type, uint64_t object, uint64_t start_blkid,
    uint64_t end_blkid, boolean_t eos)
{
	struct send_range *range = kmem_alloc(sizeof (*range), KM_SLEEP);
	range->type = type;
	range->object = object;
	range->start_blkid = start_blkid;
	range->end_blkid = end_blkid;
	range->eos_marker = eos;
	if (type == DATA) {
		range->sru.data.abd = NULL;
		range->sru.data.abuf = NULL;
		mutex_init(&range->sru.data.lock, NULL, MUTEX_DEFAULT, NULL);
		cv_init(&range->sru.data.cv, NULL, CV_DEFAULT, NULL);
		range->sru.data.io_outstanding = 0;
		range->sru.data.io_err = 0;
		range->sru.data.io_compressed = B_FALSE;
	}
	return (range);
}
/*
 * This is the callback function to traverse_dataset that acts as a worker
 * thread for dmu_send_impl.
 */
static int
send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
{
	struct send_thread_arg *sta = arg;
	struct send_range *record;

	ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
	    zb->zb_object >= sta->resume.zb_object);

	/*
	 * All bps of an encrypted os should have the encryption bit set.
	 * If this is not true it indicates tampering and we report an error.
	 */
	if (sta->os->os_encrypted &&
	    !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) {
		spa_log_error(spa, zb);
		zfs_panic_recover("unencrypted block in encrypted "
		    "object set %llu", dmu_objset_id(sta->os));
		return (SET_ERROR(EIO));
	}

	if (sta->cancel)
		return (SET_ERROR(EINTR));
	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
	    DMU_OBJECT_IS_SPECIAL(zb->zb_object))
		return (0);
	atomic_inc_64(sta->num_blocks_visited);

	if (zb->zb_level == ZB_DNODE_LEVEL) {
		if (zb->zb_object == DMU_META_DNODE_OBJECT)
			return (0);
		record = range_alloc(OBJECT, zb->zb_object, 0, 0, B_FALSE);
		record->sru.object.bp = *bp;
		size_t size = sizeof (*dnp) * (dnp->dn_extra_slots + 1);
		record->sru.object.dnp = kmem_alloc(size, KM_SLEEP);
		bcopy(dnp, record->sru.object.dnp, size);
		bqueue_enqueue(&sta->q, record, sizeof (*record));
		return (0);
	}
	if (zb->zb_level == 0 && zb->zb_object == DMU_META_DNODE_OBJECT &&
	    !BP_IS_HOLE(bp)) {
		record = range_alloc(OBJECT_RANGE, 0, zb->zb_blkid,
		    zb->zb_blkid + 1, B_FALSE);
		record->sru.object_range.bp = *bp;
		bqueue_enqueue(&sta->q, record, sizeof (*record));
		return (0);
	}
	if (zb->zb_level < 0 || (zb->zb_level > 0 && !BP_IS_HOLE(bp)))
		return (0);
	if (zb->zb_object == DMU_META_DNODE_OBJECT && !BP_IS_HOLE(bp))
		return (0);

	uint64_t span = bp_span_in_blocks(dnp->dn_indblkshift, zb->zb_level);
	uint64_t start;

	/*
	 * If this multiply overflows, we don't need to send this block.
	 * Even if it has a birth time, it can never not be a hole, so
	 * we don't need to send records for it.
	 */
	if (!overflow_multiply(span, zb->zb_blkid, &start) || (!(zb->zb_blkid ==
	    DMU_SPILL_BLKID || DMU_OT_IS_METADATA(dnp->dn_type)) &&
	    span * zb->zb_blkid > dnp->dn_maxblkid)) {
		ASSERT(BP_IS_HOLE(bp));
		return (0);
	}

	if (zb->zb_blkid == DMU_SPILL_BLKID)
		ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_SA);

	enum type record_type = DATA;
	if (BP_IS_HOLE(bp))
		record_type = HOLE;
	else if (BP_IS_REDACTED(bp))
		record_type = REDACT;
	else
		record_type = DATA;

	record = range_alloc(record_type, zb->zb_object, start,
	    (start + span < start ? 0 : start + span), B_FALSE);

	uint64_t datablksz = (zb->zb_blkid == DMU_SPILL_BLKID ?
	    BP_GET_LSIZE(bp) : dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);

	if (BP_IS_HOLE(bp)) {
		record->sru.hole.datablksz = datablksz;
	} else if (BP_IS_REDACTED(bp)) {
		record->sru.redact.datablksz = datablksz;
	} else {
		record->sru.data.datablksz = datablksz;
		record->sru.data.obj_type = dnp->dn_type;
		record->sru.data.bp = *bp;
	}

	bqueue_enqueue(&sta->q, record, sizeof (*record));
	return (0);
}
{
1201 uint64_t *num_blocks_visited
;
1204 boolean_t mark_redact
;
1208 redact_list_cb(redact_block_phys_t
*rb
, void *arg
)
1210 struct redact_list_cb_arg
*rlcap
= arg
;
1212 atomic_inc_64(rlcap
->num_blocks_visited
);
1216 struct send_range
*data
= range_alloc(REDACT
, rb
->rbp_object
,
1217 rb
->rbp_blkid
, rb
->rbp_blkid
+ redact_block_get_count(rb
), B_FALSE
);
1218 ASSERT3U(data
->end_blkid
, >, rb
->rbp_blkid
);
1219 if (rlcap
->mark_redact
) {
1220 data
->type
= REDACT
;
1221 data
->sru
.redact
.datablksz
= redact_block_get_size(rb
);
1223 data
->type
= PREVIOUSLY_REDACTED
;
1225 bqueue_enqueue(rlcap
->q
, data
, sizeof (*data
));
/*
 * This function kicks off the traverse_dataset.  It also handles setting the
 * error code of the thread in case something goes wrong, and pushes the End of
 * Stream record when the traverse_dataset call has finished.
 */
static void
send_traverse_thread(void *arg)
{
	struct send_thread_arg *st_arg = arg;
	int err = 0;
	struct send_range *data;
	fstrans_cookie_t cookie = spl_fstrans_mark();

	err = traverse_dataset_resume(st_arg->os->os_dsl_dataset,
	    st_arg->fromtxg, &st_arg->resume,
	    st_arg->flags, send_cb, st_arg);

	if (err != EINTR)
		st_arg->error_code = err;
	data = range_alloc(DATA, 0, 0, 0, B_TRUE);
	bqueue_enqueue_flush(&st_arg->q, data, sizeof (*data));
	spl_fstrans_unmark(cookie);
	thread_exit();
}
/*
 * Utility function that causes End of Stream records to compare after all
 * others, so that other threads' comparison logic can stay simple.
 */
static int __attribute__((unused))
send_range_after(const struct send_range *from, const struct send_range *to)
{
	if (from->eos_marker == B_TRUE)
		return (1);
	if (to->eos_marker == B_TRUE)
		return (-1);

	uint64_t from_obj = from->object;
	uint64_t from_end_obj = from->object + 1;
	uint64_t to_obj = to->object;
	uint64_t to_end_obj = to->object + 1;
	if (from_obj == 0) {
		ASSERT(from->type == HOLE || from->type == OBJECT_RANGE);
		from_obj = from->start_blkid << DNODES_PER_BLOCK_SHIFT;
		from_end_obj = from->end_blkid << DNODES_PER_BLOCK_SHIFT;
	}
	if (to_obj == 0) {
		ASSERT(to->type == HOLE || to->type == OBJECT_RANGE);
		to_obj = to->start_blkid << DNODES_PER_BLOCK_SHIFT;
		to_end_obj = to->end_blkid << DNODES_PER_BLOCK_SHIFT;
	}

	if (from_end_obj <= to_obj)
		return (-1);
	if (from_obj >= to_end_obj)
		return (1);
	int64_t cmp = TREE_CMP(to->type == OBJECT_RANGE, from->type ==
	    OBJECT_RANGE);
	if (cmp != 0)
		return (cmp);
	cmp = TREE_CMP(to->type == OBJECT, from->type == OBJECT);
	if (cmp != 0)
		return (cmp);

	if (from->end_blkid <= to->start_blkid)
		return (-1);
	if (from->start_blkid >= to->end_blkid)
		return (1);

	return (0);
}
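
/*
 * Example (illustrative, assuming 32 dnodes per meta-dnode block): a HOLE
 * range over meta-dnode blocks [2, 4) stands for objects [64, 128), so it
 * compares before (-1) any record for object 130 and after (1) any record
 * for object 10; an End of Stream marker always compares after both.
 */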
/*
 * Pop the new data off the queue, check that the records we receive are in
 * the right order, but do not free the old data.  This is used so that the
 * records can be sent on to the main thread without copying the data.
 */
static struct send_range *
get_next_range_nofree(bqueue_t *bq, struct send_range *prev)
{
	struct send_range *next = bqueue_dequeue(bq);
	ASSERT3S(send_range_after(prev, next), ==, -1);
	return (next);
}

/*
 * Pop the new data off the queue, check that the records we receive are in
 * the right order, and free the old data.
 */
static struct send_range *
get_next_range(bqueue_t *bq, struct send_range *prev)
{
	struct send_range *next = get_next_range_nofree(bq, prev);
	range_free(prev);
	return (next);
}
)
1328 struct redact_list_thread_arg
*rlt_arg
= arg
;
1329 struct send_range
*record
;
1330 fstrans_cookie_t cookie
= spl_fstrans_mark();
1331 if (rlt_arg
->rl
!= NULL
) {
1332 struct redact_list_cb_arg rlcba
= {0};
1333 rlcba
.cancel
= &rlt_arg
->cancel
;
1334 rlcba
.q
= &rlt_arg
->q
;
1335 rlcba
.num_blocks_visited
= rlt_arg
->num_blocks_visited
;
1336 rlcba
.mark_redact
= rlt_arg
->mark_redact
;
1337 int err
= dsl_redaction_list_traverse(rlt_arg
->rl
,
1338 &rlt_arg
->resume
, redact_list_cb
, &rlcba
);
1340 rlt_arg
->error_code
= err
;
1342 record
= range_alloc(DATA
, 0, 0, 0, B_TRUE
);
1343 bqueue_enqueue_flush(&rlt_arg
->q
, record
, sizeof (*record
));
1344 spl_fstrans_unmark(cookie
);
/*
 * Compare the start point of the two provided ranges.  End of stream ranges
 * compare last, objects compare before any data or hole inside that object and
 * multi-object holes that start at the same object.
 */
static int
send_range_start_compare(struct send_range *r1, struct send_range *r2)
{
	uint64_t r1_objequiv = r1->object;
	uint64_t r1_l0equiv = r1->start_blkid;
	uint64_t r2_objequiv = r2->object;
	uint64_t r2_l0equiv = r2->start_blkid;
	int64_t cmp = TREE_CMP(r1->eos_marker, r2->eos_marker);
	if (cmp != 0)
		return (cmp);
	if (r1->object == 0) {
		r1_objequiv = r1->start_blkid * DNODES_PER_BLOCK;
		r1_l0equiv = 0;
	}
	if (r2->object == 0) {
		r2_objequiv = r2->start_blkid * DNODES_PER_BLOCK;
		r2_l0equiv = 0;
	}

	cmp = TREE_CMP(r1_objequiv, r2_objequiv);
	if (cmp != 0)
		return (cmp);
	cmp = TREE_CMP(r2->type == OBJECT_RANGE, r1->type == OBJECT_RANGE);
	if (cmp != 0)
		return (cmp);
	cmp = TREE_CMP(r2->type == OBJECT, r1->type == OBJECT);
	if (cmp != 0)
		return (cmp);

	return (TREE_CMP(r1_l0equiv, r2_l0equiv));
}
enum q_idx {
	REDACT_IDX = 0,
	TO_IDX,
	FROM_IDX,
	NUM_THREADS
};

/*
 * This function returns the next range the send_merge_thread should operate on.
 * The inputs are two arrays; the first one stores the range at the front of the
 * queues stored in the second one.  The ranges are sorted in descending
 * priority order; the metadata from earlier ranges overrules metadata from
 * later ranges.  out_mask is used to return which threads the ranges came from;
 * bit i is set if ranges[i] started at the same place as the returned range.
 *
 * This code is not hardcoded to compare a specific number of threads; it could
 * be used with any number, just by changing the q_idx enum.
 *
 * The "next range" is the one with the earliest start; if two starts are equal,
 * the highest-priority range is the next to operate on.  If a higher-priority
 * range starts in the middle of the first range, then the first range will be
 * truncated to end where the higher-priority range starts, and we will operate
 * on that one next time.  In this way, we make sure that each block covered by
 * some range gets covered by a returned range, and each block covered is
 * returned using the metadata of the highest-priority range it appears in.
 *
 * For example, if the three ranges at the front of the queues were [2,4),
 * [3,5), and [1,3), then the ranges returned would be [1,2) with the metadata
 * from the third range, [2,4) with the metadata from the first range, and then
 * [4,5) with the metadata from the second.
 */
static struct send_range *
find_next_range(struct send_range **ranges, bqueue_t **qs, uint64_t *out_mask)
{
	int idx = 0; // index of the range with the earliest start
	int i;
	uint64_t bmask = 0;
	for (i = 1; i < NUM_THREADS; i++) {
		if (send_range_start_compare(ranges[i], ranges[idx]) < 0)
			idx = i;
	}
	if (ranges[idx]->eos_marker) {
		struct send_range *ret = range_alloc(DATA, 0, 0, 0, B_TRUE);
		*out_mask = 0;
		return (ret);
	}
	/*
	 * Find all the ranges that start at that same point.
	 */
	for (i = 0; i < NUM_THREADS; i++) {
		if (send_range_start_compare(ranges[i], ranges[idx]) == 0)
			bmask |= 1 << i;
	}
	*out_mask = bmask;
	/*
	 * OBJECT_RANGE records only come from the TO thread, and should always
	 * be treated as overlapping with nothing and sent on immediately.  They
	 * are only used in raw sends, and are never redacted.
	 */
	if (ranges[idx]->type == OBJECT_RANGE) {
		ASSERT3U(idx, ==, TO_IDX);
		ASSERT3U(*out_mask, ==, 1 << TO_IDX);
		struct send_range *ret = ranges[idx];
		ranges[idx] = get_next_range_nofree(qs[idx], ranges[idx]);
		return (ret);
	}
	/*
	 * Find the first start or end point after the start of the first range.
	 */
	uint64_t first_change = ranges[idx]->end_blkid;
	for (i = 0; i < NUM_THREADS; i++) {
		if (i == idx || ranges[i]->eos_marker ||
		    ranges[i]->object > ranges[idx]->object ||
		    ranges[i]->object == DMU_META_DNODE_OBJECT)
			continue;
		ASSERT3U(ranges[i]->object, ==, ranges[idx]->object);
		if (first_change > ranges[i]->start_blkid &&
		    (bmask & (1 << i)) == 0)
			first_change = ranges[i]->start_blkid;
		else if (first_change > ranges[i]->end_blkid)
			first_change = ranges[i]->end_blkid;
	}
	/*
	 * Update all ranges to no longer overlap with the range we're
	 * returning.  All such ranges must start at the same place as the range
	 * being returned, and end at or after first_change.  Thus we update
	 * their start to first_change.  If that makes them size 0, then free
	 * them and pull a new range from that thread.
	 */
	for (i = 0; i < NUM_THREADS; i++) {
		if (i == idx || (bmask & (1 << i)) == 0)
			continue;
		ASSERT3U(first_change, >, ranges[i]->start_blkid);
		ranges[i]->start_blkid = first_change;
		ASSERT3U(ranges[i]->start_blkid, <=, ranges[i]->end_blkid);
		if (ranges[i]->start_blkid == ranges[i]->end_blkid)
			ranges[i] = get_next_range(qs[i], ranges[i]);
	}
	/*
	 * Short-circuit the simple case; if the range doesn't overlap with
	 * anything else, or it only overlaps with things that start at the same
	 * place and are longer, send it on.
	 */
	if (first_change == ranges[idx]->end_blkid) {
		struct send_range *ret = ranges[idx];
		ranges[idx] = get_next_range_nofree(qs[idx], ranges[idx]);
		return (ret);
	}

	/*
	 * Otherwise, return a truncated copy of ranges[idx] and move the start
	 * of ranges[idx] back to first_change.
	 */
	struct send_range *ret = kmem_alloc(sizeof (*ret), KM_SLEEP);
	*ret = *ranges[idx];
	ret->end_blkid = first_change;
	ranges[idx]->start_blkid = first_change;
	return (ret);
}
#define	FROM_AND_REDACT_BITS ((1 << REDACT_IDX) | (1 << FROM_IDX))

/*
 * Merge the results from the from thread and the to thread, and then hand the
 * records off to send_prefetch_thread to prefetch them.  If this is not a
 * send from a redaction bookmark, the from thread will push an end of stream
 * record and stop, and we'll just send everything that was changed in the
 * to_ds since the ancestor's creation txg.  If it is, then since
 * traverse_dataset has a canonical order, we can compare each change as
 * they're pulled off the queues.  That will give us a stream that is
 * appropriately sorted, and covers all records.  In addition, we pull the
 * data from the redact_list_thread and use that to determine which blocks
 * should be redacted.
 */
static void
send_merge_thread(void *arg)
{
	struct send_merge_thread_arg *smt_arg = arg;
	struct send_range *front_ranges[NUM_THREADS];
	bqueue_t *queues[NUM_THREADS];
	int err = 0;
	fstrans_cookie_t cookie = spl_fstrans_mark();

	if (smt_arg->redact_arg == NULL) {
		front_ranges[REDACT_IDX] =
		    kmem_zalloc(sizeof (struct send_range), KM_SLEEP);
		front_ranges[REDACT_IDX]->eos_marker = B_TRUE;
		front_ranges[REDACT_IDX]->type = REDACT;
		queues[REDACT_IDX] = NULL;
	} else {
		front_ranges[REDACT_IDX] =
		    bqueue_dequeue(&smt_arg->redact_arg->q);
		queues[REDACT_IDX] = &smt_arg->redact_arg->q;
	}
	front_ranges[TO_IDX] = bqueue_dequeue(&smt_arg->to_arg->q);
	queues[TO_IDX] = &smt_arg->to_arg->q;
	front_ranges[FROM_IDX] = bqueue_dequeue(&smt_arg->from_arg->q);
	queues[FROM_IDX] = &smt_arg->from_arg->q;
	uint64_t mask = 0;
	struct send_range *range;
	for (range = find_next_range(front_ranges, queues, &mask);
	    !range->eos_marker && err == 0 && !smt_arg->cancel;
	    range = find_next_range(front_ranges, queues, &mask)) {
		/*
		 * If the range in question was in both the from redact bookmark
		 * and the bookmark we're using to redact, then don't send it.
		 * It's already redacted on the receiving system, so a redaction
		 * record would be redundant.
		 */
		if ((mask & FROM_AND_REDACT_BITS) == FROM_AND_REDACT_BITS) {
			ASSERT3U(range->type, ==, REDACT);
			range_free(range);
			continue;
		}
		bqueue_enqueue(&smt_arg->q, range, sizeof (*range));

		if (smt_arg->to_arg->error_code != 0) {
			err = smt_arg->to_arg->error_code;
		} else if (smt_arg->from_arg->error_code != 0) {
			err = smt_arg->from_arg->error_code;
		} else if (smt_arg->redact_arg != NULL &&
		    smt_arg->redact_arg->error_code != 0) {
			err = smt_arg->redact_arg->error_code;
		}
	}
	if (smt_arg->cancel && err == 0)
		err = SET_ERROR(EINTR);
	smt_arg->error = err;
	if (smt_arg->error != 0) {
		smt_arg->to_arg->cancel = B_TRUE;
		smt_arg->from_arg->cancel = B_TRUE;
		if (smt_arg->redact_arg != NULL)
			smt_arg->redact_arg->cancel = B_TRUE;
	}
	for (int i = 0; i < NUM_THREADS; i++) {
		while (!front_ranges[i]->eos_marker) {
			front_ranges[i] = get_next_range(queues[i],
			    front_ranges[i]);
		}
		range_free(front_ranges[i]);
	}
	range = kmem_zalloc(sizeof (*range), KM_SLEEP);
	range->eos_marker = B_TRUE;
	bqueue_enqueue_flush(&smt_arg->q, range, 1);
	spl_fstrans_unmark(cookie);
	thread_exit();
}
struct send_reader_thread_arg {
	struct send_merge_thread_arg *smta;
	bqueue_t q;
	boolean_t cancel;
	boolean_t issue_reads;
	uint64_t featureflags;
	int error;
};
static void
dmu_send_read_done(zio_t *zio)
{
	struct send_range *range = zio->io_private;

	mutex_enter(&range->sru.data.lock);
	if (zio->io_error != 0) {
		abd_free(range->sru.data.abd);
		range->sru.data.abd = NULL;
		range->sru.data.io_err = zio->io_error;
	}

	ASSERT(range->sru.data.io_outstanding);
	range->sru.data.io_outstanding = B_FALSE;
	cv_broadcast(&range->sru.data.cv);
	mutex_exit(&range->sru.data.lock);
}
static void
issue_data_read(struct send_reader_thread_arg *srta, struct send_range *range)
{
	struct srd *srdp = &range->sru.data;
	blkptr_t *bp = &srdp->bp;
	objset_t *os = srta->smta->os;

	ASSERT3U(range->type, ==, DATA);
	ASSERT3U(range->start_blkid + 1, ==, range->end_blkid);
	/*
	 * If we have large blocks stored on disk but
	 * the send flags don't allow us to send large
	 * blocks, we split the data from the arc buf
	 * into chunks.
	 */
	boolean_t split_large_blocks =
	    srdp->datablksz > SPA_OLD_MAXBLOCKSIZE &&
	    !(srta->featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS);
	/*
	 * We should only request compressed data from the ARC if all
	 * the following are true:
	 *  - stream compression was requested
	 *  - we aren't splitting large blocks into smaller chunks
	 *  - the data won't need to be byteswapped before sending
	 *  - this isn't an embedded block
	 *  - this isn't metadata (if receiving on a different endian
	 *    system it can be byteswapped more easily)
	 */
	boolean_t request_compressed =
	    (srta->featureflags & DMU_BACKUP_FEATURE_COMPRESSED) &&
	    !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) &&
	    !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp));

	enum zio_flag zioflags = ZIO_FLAG_CANFAIL;

	if (srta->featureflags & DMU_BACKUP_FEATURE_RAW) {
		zioflags |= ZIO_FLAG_RAW;
		srdp->io_compressed = B_TRUE;
	} else if (request_compressed) {
		zioflags |= ZIO_FLAG_RAW_COMPRESS;
		srdp->io_compressed = B_TRUE;
	}

	srdp->datasz = (zioflags & ZIO_FLAG_RAW_COMPRESS) ?
	    BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp);

	if (!srta->issue_reads)
		return;
	if (BP_IS_REDACTED(bp))
		return;
	if (send_do_embed(bp, srta->featureflags))
		return;

	zbookmark_phys_t zb = {
	    .zb_objset = dmu_objset_id(os),
	    .zb_object = range->object,
	    .zb_level = 0,
	    .zb_blkid = range->start_blkid,
	};

	arc_flags_t aflags = ARC_FLAG_CACHED_ONLY;

	int arc_err = arc_read(NULL, os->os_spa, bp,
	    arc_getbuf_func, &srdp->abuf, ZIO_PRIORITY_ASYNC_READ,
	    zioflags, &aflags, &zb);
	/*
	 * If the data is not already cached in the ARC, we read directly
	 * from zio.  This avoids the performance overhead of adding a new
	 * entry to the ARC, and we also avoid polluting the ARC cache with
	 * data that is not likely to be used in the future.
	 */
	if (arc_err != 0) {
		srdp->abd = abd_alloc_linear(srdp->datasz, B_FALSE);
		srdp->io_outstanding = B_TRUE;
		zio_nowait(zio_read(NULL, os->os_spa, bp, srdp->abd,
		    srdp->datasz, dmu_send_read_done, range,
		    ZIO_PRIORITY_ASYNC_READ, zioflags, &zb));
	}
}
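
/*
 * Editorial note (not in the original source): ARC_FLAG_CACHED_ONLY makes
 * the arc_read() above fail fast on a cache miss rather than pull the block
 * into the ARC; the miss path then reads straight into a private abd via
 * zio_read(), so a large send does not evict more useful data from the
 * cache.
 */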
/*
 * Create a new record with the given values.
 */
static void
enqueue_range(struct send_reader_thread_arg *srta, bqueue_t *q, dnode_t *dn,
    uint64_t blkid, uint64_t count, const blkptr_t *bp, uint32_t datablksz)
{
	enum type range_type = (bp == NULL || BP_IS_HOLE(bp) ? HOLE :
	    (BP_IS_REDACTED(bp) ? REDACT : DATA));

	struct send_range *range = range_alloc(range_type, dn->dn_object,
	    blkid, blkid + count, B_FALSE);

	if (blkid == DMU_SPILL_BLKID)
		ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_SA);

	switch (range_type) {
	case HOLE:
		range->sru.hole.datablksz = datablksz;
		break;
	case DATA:
		ASSERT3U(count, ==, 1);
		range->sru.data.datablksz = datablksz;
		range->sru.data.obj_type = dn->dn_type;
		range->sru.data.bp = *bp;
		issue_data_read(srta, range);
		break;
	case REDACT:
		range->sru.redact.datablksz = datablksz;
		break;
	default:
		break;
	}
	bqueue_enqueue(q, range, datablksz);
}
1739 * This thread is responsible for two things: First, it retrieves the correct
1740 * blkptr in the to ds if we need to send the data because of something from
1741 * the from thread. As a result of this, we're the first ones to discover that
1742 * some indirect blocks can be discarded because they're not holes. Second,
1743 * it issues prefetches for the data we need to send.
1746 send_reader_thread(void *arg
)
1748 struct send_reader_thread_arg
*srta
= arg
;
1749 struct send_merge_thread_arg
*smta
= srta
->smta
;
1750 bqueue_t
*inq
= &smta
->q
;
1751 bqueue_t
*outq
= &srta
->q
;
1752 objset_t
*os
= smta
->os
;
1753 fstrans_cookie_t cookie
= spl_fstrans_mark();
1754 struct send_range
*range
= bqueue_dequeue(inq
);
	/*
	 * If the record we're analyzing is from a redaction bookmark from the
	 * fromds, then we need to know whether or not it exists in the tods so
	 * we know whether to create records for it or not. If it does, we need
	 * the datablksz so we can generate an appropriate record for it.
	 * Finally, if it isn't redacted, we need the blkptr so that we can send
	 * a WRITE record containing the actual data.
	 */
	uint64_t last_obj = UINT64_MAX;
	uint64_t last_obj_exists = B_TRUE;
	while (!range->eos_marker && !srta->cancel && smta->error == 0 &&
	    err == 0) {
		switch (range->type) {
		case DATA:
			issue_data_read(srta, range);
			bqueue_enqueue(outq, range, range->sru.data.datablksz);
			range = get_next_range_nofree(inq, range);
			break;
		case HOLE:
		case OBJECT:
		case OBJECT_RANGE:
		case REDACT: // Redacted blocks must exist
			bqueue_enqueue(outq, range, sizeof (*range));
			range = get_next_range_nofree(inq, range);
			break;
		case PREVIOUSLY_REDACTED: {
			/*
			 * This entry came from the "from bookmark" when
			 * sending from a bookmark that has a redaction
			 * list. We need to check if this object/blkid
			 * exists in the target ("to") dataset, and if
			 * not then we drop this entry. We also need
			 * to fill in the block pointer so that we know
			 * what to prefetch.
			 *
			 * To accomplish the above, we first cache whether or
			 * not the last object we examined exists. If it
			 * doesn't, we can drop this record. If it does, we hold
			 * the dnode and use it to call dbuf_dnode_findbp. We do
			 * this instead of dbuf_bookmark_findbp because we will
			 * often operate on large ranges, and holding the dnode
			 * once is more efficient.
			 */
			boolean_t object_exists = B_TRUE;
			/*
			 * If the data is redacted, we only care if it exists,
			 * so that we don't send records for objects that have
			 * been deleted.
			 */
			dnode_t *dn;
			if (range->object == last_obj && !last_obj_exists) {
				/*
				 * If we're still examining the same object as
				 * previously, and it doesn't exist, we don't
				 * need to call dbuf_bookmark_findbp.
				 */
				object_exists = B_FALSE;
			} else {
				err = dnode_hold(os, range->object, FTAG, &dn);
				if (err == ENOENT) {
					object_exists = B_FALSE;
					err = 0;
				}
				last_obj = range->object;
				last_obj_exists = object_exists;
			}

			if (err != 0) {
				break;
			} else if (!object_exists) {
				/*
				 * The block was modified, but doesn't
				 * exist in the to dataset; if it was
				 * deleted in the to dataset, then we'll
				 * visit the hole bp for it at some point.
				 */
				range = get_next_range(inq, range);
				continue;
			}
			uint64_t file_max =
			    (dn->dn_maxblkid < range->end_blkid ?
			    dn->dn_maxblkid : range->end_blkid);
			/*
			 * The object exists, so we need to try to find the
			 * blkptr for each block in the range we're processing.
			 */
			rw_enter(&dn->dn_struct_rwlock, RW_READER);
			for (uint64_t blkid = range->start_blkid;
			    blkid < file_max; blkid++) {
				blkptr_t bp;
				uint32_t datablksz =
				    dn->dn_phys->dn_datablkszsec <<
				    SPA_MINBLOCKSHIFT;
				uint64_t offset = blkid * datablksz;
				/*
				 * This call finds the next non-hole block in
				 * the object. This is to prevent a
				 * performance problem where we're unredacting
				 * a large hole. Using dnode_next_offset to
				 * skip over the large hole avoids iterating
				 * over every block in it.
				 */
				err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK,
				    &offset, 1, 1, 0);
				if (err == ESRCH) {
					offset = UINT64_MAX;
					err = 0;
				} else if (err != 0) {
					break;
				}
				if (offset != blkid * datablksz) {
					/*
					 * if there is a hole from here
					 * (blkid) to offset
					 */
					offset = MIN(offset, file_max *
					    datablksz);
					uint64_t nblks = (offset / datablksz) -
					    blkid;
					enqueue_range(srta, outq, dn, blkid,
					    nblks, NULL, datablksz);
					blkid += nblks;
				}
				if (blkid >= file_max)
					break;
				err = dbuf_dnode_findbp(dn, 0, blkid, &bp,
				    NULL, NULL);
				if (err != 0)
					break;
				ASSERT(!BP_IS_HOLE(&bp));
				enqueue_range(srta, outq, dn, blkid, 1, &bp,
				    datablksz);
			}
			rw_exit(&dn->dn_struct_rwlock);
			dnode_rele(dn, FTAG);
			range = get_next_range(inq, range);
		}
		}
	}
	if (srta->cancel || err != 0) {
		smta->cancel = B_TRUE;
		srta->error = err;
	} else if (smta->error != 0) {
		srta->error = smta->error;
	}
	while (!range->eos_marker)
		range = get_next_range(inq, range);

	bqueue_enqueue_flush(outq, range, 1);
	spl_fstrans_unmark(cookie);
	thread_exit();
}
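
/*
 * Sentinel for dmu_send_params.numfromredactsnaps: the stream is not being
 * sent from a redaction bookmark.
 */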
#define	NUM_SNAPS_NOT_REDACTED	UINT64_MAX
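
/*
 * Everything needed to describe a single send, filled in by the dmu_send()
 * and dmu_send_obj() entry points below and consumed by dmu_send_impl().
 */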
struct dmu_send_params {
	/* Pool args */
	void *tag; // Tag that dp was held with, will be used to release dp.
	dsl_pool_t *dp;
	/* To snapshot args */
	const char *tosnap;
	dsl_dataset_t *to_ds;
	/* From snapshot args */
	zfs_bookmark_phys_t ancestor_zb;
	uint64_t *fromredactsnaps;
	/* NUM_SNAPS_NOT_REDACTED if not sending from redaction bookmark */
	uint64_t numfromredactsnaps;
	/* Stream params */
	boolean_t is_clone;
	boolean_t embedok;
	boolean_t large_block_ok;
	boolean_t compressok;
	boolean_t rawok;
	boolean_t savedok;
	uint64_t resumeobj;
	uint64_t resumeoff;
	uint64_t saved_guid;
	zfs_bookmark_phys_t *redactbook;
	/* Stream output params */
	dmu_send_outparams_t *dso;

	/* Stream progress params */
	offset_t *off;
	int outfd;
	char saved_toname[MAXNAMELEN];
};
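
/*
 * Derive the DMU_BACKUP_FEATURE_* flags that will be advertised in the BEGIN
 * record from the caller's request and the features active on the dataset.
 */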
static int
setup_featureflags(struct dmu_send_params *dspp, objset_t *os,
    uint64_t *featureflags)
{
	dsl_dataset_t *to_ds = dspp->to_ds;
	dsl_pool_t *dp = dspp->dp;

	if (dmu_objset_type(os) == DMU_OST_ZFS) {
		uint64_t version;
		if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0)
			return (SET_ERROR(EINVAL));

		if (version >= ZPL_VERSION_SA)
			*featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
	}

	/* raw sends imply large_block_ok */
	if ((dspp->rawok || dspp->large_block_ok) &&
	    dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_BLOCKS)) {
		*featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
	}

	/* encrypted datasets will not have embedded blocks */
	if ((dspp->embedok || dspp->rawok) && !os->os_encrypted &&
	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
		*featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
	}

	/* raw send implies compressok */
	if (dspp->compressok || dspp->rawok)
		*featureflags |= DMU_BACKUP_FEATURE_COMPRESSED;

	if (dspp->rawok && os->os_encrypted)
		*featureflags |= DMU_BACKUP_FEATURE_RAW;

	if ((*featureflags &
	    (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED |
	    DMU_BACKUP_FEATURE_RAW)) != 0 &&
	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) {
		*featureflags |= DMU_BACKUP_FEATURE_LZ4;
	}

	/*
	 * We specifically do not include DMU_BACKUP_FEATURE_EMBED_DATA here to
	 * allow sending ZSTD compressed datasets to a receiver that does not
	 * support ZSTD.
	 */
	if ((*featureflags &
	    (DMU_BACKUP_FEATURE_COMPRESSED | DMU_BACKUP_FEATURE_RAW)) != 0 &&
	    dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_ZSTD_COMPRESS)) {
		*featureflags |= DMU_BACKUP_FEATURE_ZSTD;
	}

	if (dspp->resumeobj != 0 || dspp->resumeoff != 0) {
		*featureflags |= DMU_BACKUP_FEATURE_RESUMING;
	}

	if (dspp->redactbook != NULL) {
		*featureflags |= DMU_BACKUP_FEATURE_REDACTED;
	}

	if (dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_DNODE)) {
		*featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE;
	}
	return (0);
}
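
/*
 * Allocate and populate the DRR_BEGIN record that opens the send stream.
 */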
static dmu_replay_record_t *
create_begin_record(struct dmu_send_params *dspp, objset_t *os,
    uint64_t featureflags)
{
	dmu_replay_record_t *drr = kmem_zalloc(sizeof (dmu_replay_record_t),
	    KM_SLEEP);
	drr->drr_type = DRR_BEGIN;

	struct drr_begin *drrb = &drr->drr_u.drr_begin;
	dsl_dataset_t *to_ds = dspp->to_ds;

	drrb->drr_magic = DMU_BACKUP_MAGIC;
	drrb->drr_creation_time = dsl_dataset_phys(to_ds)->ds_creation_time;
	drrb->drr_type = dmu_objset_type(os);
	drrb->drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
	drrb->drr_fromguid = dspp->ancestor_zb.zbm_guid;

	DMU_SET_STREAM_HDRTYPE(drrb->drr_versioninfo, DMU_SUBSTREAM);
	DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, featureflags);

	if (dspp->is_clone)
		drrb->drr_flags |= DRR_FLAG_CLONE;
	if (dsl_dataset_phys(dspp->to_ds)->ds_flags & DS_FLAG_CI_DATASET)
		drrb->drr_flags |= DRR_FLAG_CI_DATA;
	if (zfs_send_set_freerecords_bit)
		drrb->drr_flags |= DRR_FLAG_FREERECORDS;
	drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_SPILL_BLOCK;

	if (dspp->savedok) {
		drrb->drr_toguid = dspp->saved_guid;
		strlcpy(drrb->drr_toname, dspp->saved_toname,
		    sizeof (drrb->drr_toname));
	} else {
		dsl_dataset_name(to_ds, drrb->drr_toname);
		if (!to_ds->ds_is_snapshot) {
			(void) strlcat(drrb->drr_toname, "@--head--",
			    sizeof (drrb->drr_toname));
		}
	}
	return (drr);
}
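
/*
 * Kick off the traversal thread for to_ds, which enqueues the blocks that
 * changed since ancestor_zb (see the data-flow diagrams before
 * dmu_send_impl() below).
 */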
static void
setup_to_thread(struct send_thread_arg *to_arg, objset_t *to_os,
    dmu_sendstatus_t *dssp, uint64_t fromtxg, boolean_t rawok)
{
	VERIFY0(bqueue_init(&to_arg->q, zfs_send_no_prefetch_queue_ff,
	    MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize),
	    offsetof(struct send_range, ln)));
	to_arg->error_code = 0;
	to_arg->cancel = B_FALSE;
	to_arg->os = to_os;
	to_arg->fromtxg = fromtxg;
	to_arg->flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA;
	if (rawok)
		to_arg->flags |= TRAVERSE_NO_DECRYPT;
	if (zfs_send_corrupt_data)
		to_arg->flags |= TRAVERSE_HARD;
	to_arg->num_blocks_visited = &dssp->dss_blocks;
	(void) thread_create(NULL, 0, send_traverse_thread, to_arg, 0,
	    curproc, TS_RUN, minclsyspri);
}
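
/*
 * Kick off the thread that walks the from bookmark's redaction list; with no
 * redaction list it simply enqueues an eos marker, as noted below.
 */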
static void
setup_from_thread(struct redact_list_thread_arg *from_arg,
    redaction_list_t *from_rl, dmu_sendstatus_t *dssp)
{
	VERIFY0(bqueue_init(&from_arg->q, zfs_send_no_prefetch_queue_ff,
	    MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize),
	    offsetof(struct send_range, ln)));
	from_arg->error_code = 0;
	from_arg->cancel = B_FALSE;
	from_arg->rl = from_rl;
	from_arg->mark_redact = B_FALSE;
	from_arg->num_blocks_visited = &dssp->dss_blocks;
	/*
	 * If from_ds is null, send_traverse_thread just returns success and
	 * enqueues an eos marker.
	 */
	(void) thread_create(NULL, 0, redact_list_thread, from_arg, 0,
	    curproc, TS_RUN, minclsyspri);
}
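
/*
 * Kick off the redact list thread for a redacted send; the records that it
 * enqueues are marked for redaction. This is a no-op when no redaction
 * bookmark was supplied.
 */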
static void
setup_redact_list_thread(struct redact_list_thread_arg *rlt_arg,
    struct dmu_send_params *dspp, redaction_list_t *rl, dmu_sendstatus_t *dssp)
{
	if (dspp->redactbook == NULL)
		return;

	rlt_arg->cancel = B_FALSE;
	VERIFY0(bqueue_init(&rlt_arg->q, zfs_send_no_prefetch_queue_ff,
	    MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize),
	    offsetof(struct send_range, ln)));
	rlt_arg->error_code = 0;
	rlt_arg->mark_redact = B_TRUE;
	rlt_arg->rl = rl;
	rlt_arg->num_blocks_visited = &dssp->dss_blocks;

	(void) thread_create(NULL, 0, redact_list_thread, rlt_arg, 0,
	    curproc, TS_RUN, minclsyspri);
}
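
/*
 * Kick off the merge thread, which combines the to_ds, from, and redact list
 * record streams into the single ordered stream consumed by the reader.
 */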
static void
setup_merge_thread(struct send_merge_thread_arg *smt_arg,
    struct dmu_send_params *dspp, struct redact_list_thread_arg *from_arg,
    struct send_thread_arg *to_arg, struct redact_list_thread_arg *rlt_arg,
    objset_t *os)
{
	VERIFY0(bqueue_init(&smt_arg->q, zfs_send_no_prefetch_queue_ff,
	    MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize),
	    offsetof(struct send_range, ln)));
	smt_arg->cancel = B_FALSE;
	smt_arg->error = 0;
	smt_arg->from_arg = from_arg;
	smt_arg->to_arg = to_arg;
	if (dspp->redactbook != NULL)
		smt_arg->redact_arg = rlt_arg;

	smt_arg->os = os;
	(void) thread_create(NULL, 0, send_merge_thread, smt_arg, 0, curproc,
	    TS_RUN, minclsyspri);
}
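
/*
 * Kick off the reader (prefetch) thread; it issues reads for data that will
 * actually be sent, unless this is a dry run (dso_dryrun).
 */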
static void
setup_reader_thread(struct send_reader_thread_arg *srt_arg,
    struct dmu_send_params *dspp, struct send_merge_thread_arg *smt_arg,
    uint64_t featureflags)
{
	VERIFY0(bqueue_init(&srt_arg->q, zfs_send_queue_ff,
	    MAX(zfs_send_queue_length, 2 * zfs_max_recordsize),
	    offsetof(struct send_range, ln)));
	srt_arg->smta = smt_arg;
	srt_arg->issue_reads = !dspp->dso->dso_dryrun;
	srt_arg->featureflags = featureflags;
	(void) thread_create(NULL, 0, send_reader_thread, srt_arg, 0,
	    curproc, TS_RUN, minclsyspri);
}
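
/*
 * For a resumed send, point each traversal thread at the resume object and
 * block, and record the resume point in the BEGIN record's nvlist.
 */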
static int
setup_resume_points(struct dmu_send_params *dspp,
    struct send_thread_arg *to_arg, struct redact_list_thread_arg *from_arg,
    struct redact_list_thread_arg *rlt_arg,
    struct send_merge_thread_arg *smt_arg, boolean_t resuming, objset_t *os,
    redaction_list_t *redact_rl, nvlist_t *nvl)
{
	dsl_dataset_t *to_ds = dspp->to_ds;
	int err = 0;

	uint64_t obj = 0;
	uint64_t blkid = 0;
	if (resuming) {
		obj = dspp->resumeobj;
		dmu_object_info_t to_doi;
		err = dmu_object_info(os, obj, &to_doi);
		if (err != 0)
			return (err);

		blkid = dspp->resumeoff / to_doi.doi_data_block_size;
	}
	/*
	 * If we're resuming a redacted send, we can skip to the appropriate
	 * point in the redaction bookmark by binary searching through it.
	 */
	if (redact_rl != NULL) {
		SET_BOOKMARK(&rlt_arg->resume, to_ds->ds_object, obj, 0, blkid);
	}

	SET_BOOKMARK(&to_arg->resume, to_ds->ds_object, obj, 0, blkid);
	if (nvlist_exists(nvl, BEGINNV_REDACT_FROM_SNAPS)) {
		uint64_t objset = dspp->ancestor_zb.zbm_redaction_obj;
		/*
		 * Note: If the resume point is in an object whose
		 * blocksize is different in the from vs to snapshots,
		 * we will have divided by the "wrong" blocksize.
		 * However, in this case fromsnap's send_cb() will
		 * detect that the blocksize has changed and therefore
		 * ignore this object.
		 *
		 * If we're resuming a send from a redaction bookmark,
		 * we still cannot accidentally suggest blocks behind
		 * the to_ds. In addition, we know that any blocks in
		 * the object in the to_ds will have to be sent, since
		 * the size changed. Therefore, we can't cause any harm
		 * this way either.
		 */
		SET_BOOKMARK(&from_arg->resume, objset, obj, 0, blkid);
	}
	if (resuming) {
		fnvlist_add_uint64(nvl, BEGINNV_RESUME_OBJECT, dspp->resumeobj);
		fnvlist_add_uint64(nvl, BEGINNV_RESUME_OFFSET, dspp->resumeoff);
	}
	return (0);
}
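
/*
 * Allocate a progress-tracking structure and publish it on the dataset's
 * list of active send streams so the in-flight send can be observed.
 */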
static dmu_sendstatus_t *
setup_send_progress(struct dmu_send_params *dspp)
{
	dmu_sendstatus_t *dssp = kmem_zalloc(sizeof (*dssp), KM_SLEEP);
	dssp->dss_outfd = dspp->outfd;
	dssp->dss_off = dspp->off;
	dssp->dss_proc = curproc;
	mutex_enter(&dspp->to_ds->ds_sendstream_lock);
	list_insert_head(&dspp->to_ds->ds_sendstreams, dssp);
	mutex_exit(&dspp->to_ds->ds_sendstream_lock);
	return (dssp);
}
/*
 * Actually do the bulk of the work in a zfs send.
 *
 * The idea is that we want to do a send from ancestor_zb to to_ds. We also
 * want to not send any data that has been modified by all the datasets in
 * redactsnaparr, and store the list of blocks that are redacted in this way in
 * a bookmark named redactbook, created on the to_ds. We do this by creating
 * several worker threads, whose function is described below.
 *
 * There are three cases.
 * The first case is a redacted zfs send. In this case there are 5 threads.
 * The first thread is the to_ds traversal thread: it calls dataset_traverse on
 * the to_ds and finds all the blocks that have changed since ancestor_zb (if
 * it's a full send, that's all blocks in the dataset). It then sends those
 * blocks on to the send merge thread. The redact list thread takes the data
 * from the redaction bookmark and sends those blocks on to the send merge
 * thread. The send merge thread takes the data from the to_ds traversal
 * thread, and combines it with the redaction records from the redact list
 * thread. If a block appears in both the to_ds's data and the redaction data,
 * the send merge thread will mark it as redacted and send it on to the prefetch
 * thread. Otherwise, the send merge thread will send the block on to the
 * prefetch thread unchanged. The prefetch thread will issue prefetch reads for
 * any data that isn't redacted, and then send the data on to the main thread.
 * The main thread behaves the same as in a normal send case, issuing demand
 * reads for data blocks and sending out records over the network.
 *
 * The graphic below diagrams the flow of data in the case of a redacted zfs
 * send. Each box represents a thread, and each line represents the flow of
 * data.
 *
 *             Records from the |
 *           redaction bookmark |
 * +--------------------+       |  +---------------------------+
 * |                    |       v  | Send Merge Thread         |
 * | Redact List Thread +----------> Apply redaction marks to  |
 * |                    |          | records as specified by   |
 * +--------------------+          | redaction ranges          |
 *                                 +----^---------------+------+
 *                                      |               | Merged data
 *                                      |               |
 *                                      |  +------------v--------+
 *                                      |  | Prefetch Thread     |
 * +--------------------+               |  | Issues prefetch     |
 * | to_ds Traversal    |               |  | reads of data blocks|
 * | Thread (finds      +---------------+  +------------+--------+
 * | candidate blocks)  |  Blocks modified              | Prefetched data
 * +--------------------+  by to_ds since               |
 *                         ancestor_zb     +------------v----+
 *                                         | Main Thread     |  File Descriptor
 *                                         | Sends data over +->(to zfs receive)
 *                                         | wire            |
 *                                         +-----------------+
 *
 * The second case is an incremental send from a redaction bookmark. The to_ds
 * traversal thread and the main thread behave the same as in the redacted
 * send case. The new thread is the from bookmark traversal thread. It
 * iterates over the redaction list in the redaction bookmark, and enqueues
 * records for each block that was redacted in the original send. The send
 * merge thread now has to merge the data from the two threads. For details
 * about that process, see the header comment of send_merge_thread(). Any data
 * it decides to send on will be prefetched by the prefetch thread. Note that
 * you can perform a redacted send from a redaction bookmark; in that case,
 * the data flow behaves very similarly to the flow in the redacted send case,
 * except with the addition of the bookmark traversal thread iterating over the
 * redaction bookmark. The send_merge_thread also has to take on the
 * responsibility of merging the redact list thread's records, the bookmark
 * traversal thread's records, and the to_ds records.
 *
 * +---------------------+
 * |                     |
 * | Redact List Thread  +--------------+
 * |                     |              |
 * +---------------------+              |
 *        Blocks in redaction list      | Ranges modified by every secure snap
 *         of from bookmark             |  (or EOS if not redacted)
 *                                      |
 * +---------------------+   |     +----v----------------------+
 * | bookmark Traversal  |   v     | Send Merge Thread         |
 * | Thread (finds       +---------> Merges bookmark, rlt, and |
 * | candidate blocks)   |         | to_ds send records        |
 * +---------------------+         +----^---------------+------+
 *                                      |               | Merged data
 *                                      |  +------------v--------+
 *                                      |  | Prefetch Thread     |
 * +--------------------+               |  | Issues prefetch     |
 * | to_ds Traversal    |               |  | reads of data blocks|
 * | Thread (finds      +---------------+  +------------+--------+
 * | candidate blocks)  |  Blocks modified              | Prefetched data
 * +--------------------+  by to_ds since               +------------v----+
 *                         ancestor_zb                  | Main Thread     |  File Descriptor
 *                                                      | Sends data over +->(to zfs receive)
 *                                                      | wire            |
 *                                                      +-----------------+
 *
 * The final case is a simple zfs full or incremental send. The to_ds traversal
 * thread behaves the same as always. The redact list thread is never started.
 * The send merge thread takes all the blocks that the to_ds traversal thread
 * sends it, prefetches the data, and sends the blocks on to the main thread.
 * The main thread sends the data over the wire.
 *
 * To keep performance acceptable, we want to prefetch the data in the worker
 * threads. While the to_ds thread could simply use the TRAVERSE_PREFETCH
 * feature built into traverse_dataset, the combining and deletion of records
 * due to redaction and sends from redaction bookmarks mean that we could
 * issue many unnecessary prefetches. As a result, we only prefetch data
 * after we've determined that the record is not going to be redacted. To
 * prevent the prefetching from getting too far ahead of the main thread, the
 * blocking queues that are used for communication are capped not by the
 * number of entries in the queue, but by the sum of the size of the
 * prefetches associated with them. The limit on the amount of data that the
 * thread can prefetch beyond what the main thread has reached is controlled
 * by the global variable zfs_send_queue_length. In addition, to prevent poor
 * performance in the beginning of a send, we also limit the distance ahead
 * that the traversal threads can be. That distance is controlled by the
 * zfs_send_no_prefetch_queue_length tunable.
 *
 * Note: Releases dp using the specified tag.
 */
static int
dmu_send_impl(struct dmu_send_params *dspp)
{
	objset_t *os;
	dmu_replay_record_t *drr;
	dmu_sendstatus_t *dssp;
	dmu_send_cookie_t dsc = {0};
	int err;
	uint64_t fromtxg = dspp->ancestor_zb.zbm_creation_txg;
	uint64_t featureflags = 0;
	struct redact_list_thread_arg *from_arg;
	struct send_thread_arg *to_arg;
	struct redact_list_thread_arg *rlt_arg;
	struct send_merge_thread_arg *smt_arg;
	struct send_reader_thread_arg *srt_arg;
	struct send_range *range;
	redaction_list_t *from_rl = NULL;
	redaction_list_t *redact_rl = NULL;
	boolean_t resuming = (dspp->resumeobj != 0 || dspp->resumeoff != 0);
	boolean_t book_resuming = resuming;

	dsl_dataset_t *to_ds = dspp->to_ds;
	zfs_bookmark_phys_t *ancestor_zb = &dspp->ancestor_zb;
	dsl_pool_t *dp = dspp->dp;
	void *tag = dspp->tag;

	err = dmu_objset_from_ds(to_ds, &os);
	if (err != 0) {
		dsl_pool_rele(dp, tag);
		return (err);
	}

	/*
	 * If this is a non-raw send of an encrypted ds, we can ensure that
	 * the objset_phys_t is authenticated. This is safe because this is
	 * either a snapshot or we have owned the dataset, ensuring that
	 * it can't be modified.
	 */
	if (!dspp->rawok && os->os_encrypted &&
	    arc_is_unauthenticated(os->os_phys_buf)) {
		zbookmark_phys_t zb;

		SET_BOOKMARK(&zb, to_ds->ds_object, ZB_ROOT_OBJECT,
		    ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
		err = arc_untransform(os->os_phys_buf, os->os_spa,
		    &zb, B_FALSE);
		if (err != 0) {
			dsl_pool_rele(dp, tag);
			return (err);
		}

		ASSERT0(arc_is_unauthenticated(os->os_phys_buf));
	}

	if ((err = setup_featureflags(dspp, os, &featureflags)) != 0) {
		dsl_pool_rele(dp, tag);
		return (err);
	}

	/*
	 * If we're doing a redacted send, hold the bookmark's redaction list.
	 */
	if (dspp->redactbook != NULL) {
		err = dsl_redaction_list_hold_obj(dp,
		    dspp->redactbook->zbm_redaction_obj, FTAG,
		    &redact_rl);
		if (err != 0) {
			dsl_pool_rele(dp, tag);
			return (SET_ERROR(EINVAL));
		}
		dsl_redaction_list_long_hold(dp, redact_rl, FTAG);
	}

	/*
	 * If we're sending from a redaction bookmark, hold the redaction list
	 * so that we can consider sending the redacted blocks.
	 */
	if (ancestor_zb->zbm_redaction_obj != 0) {
		err = dsl_redaction_list_hold_obj(dp,
		    ancestor_zb->zbm_redaction_obj, FTAG, &from_rl);
		if (err != 0) {
			if (redact_rl != NULL) {
				dsl_redaction_list_long_rele(redact_rl, FTAG);
				dsl_redaction_list_rele(redact_rl, FTAG);
			}
			dsl_pool_rele(dp, tag);
			return (SET_ERROR(EINVAL));
		}
		dsl_redaction_list_long_hold(dp, from_rl, FTAG);
	}

	dsl_dataset_long_hold(to_ds, FTAG);

	from_arg = kmem_zalloc(sizeof (*from_arg), KM_SLEEP);
	to_arg = kmem_zalloc(sizeof (*to_arg), KM_SLEEP);
	rlt_arg = kmem_zalloc(sizeof (*rlt_arg), KM_SLEEP);
	smt_arg = kmem_zalloc(sizeof (*smt_arg), KM_SLEEP);
	srt_arg = kmem_zalloc(sizeof (*srt_arg), KM_SLEEP);

	drr = create_begin_record(dspp, os, featureflags);
	dssp = setup_send_progress(dspp);

	dsc.dsc_drr = drr;
	dsc.dsc_dso = dspp->dso;
	dsc.dsc_os = os;
	dsc.dsc_off = dspp->off;
	dsc.dsc_toguid = dsl_dataset_phys(to_ds)->ds_guid;
	dsc.dsc_fromtxg = fromtxg;
	dsc.dsc_pending_op = PENDING_NONE;
	dsc.dsc_featureflags = featureflags;
	dsc.dsc_resume_object = dspp->resumeobj;
	dsc.dsc_resume_offset = dspp->resumeoff;

	dsl_pool_rele(dp, tag);

	void *payload = NULL;
	size_t payload_len = 0;
	nvlist_t *nvl = fnvlist_alloc();

	/*
	 * If we're doing a redacted send, we include the snapshots we're
	 * redacted with respect to so that the target system knows what send
	 * streams can be correctly received on top of this dataset. If we're
	 * instead sending a redacted dataset, we include the snapshots that the
	 * dataset was created with respect to.
	 */
	if (dspp->redactbook != NULL) {
		fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_SNAPS,
		    redact_rl->rl_phys->rlp_snaps,
		    redact_rl->rl_phys->rlp_num_snaps);
	} else if (dsl_dataset_feature_is_active(to_ds,
	    SPA_FEATURE_REDACTED_DATASETS)) {
		uint64_t *tods_guids;
		uint64_t length;
		VERIFY(dsl_dataset_get_uint64_array_feature(to_ds,
		    SPA_FEATURE_REDACTED_DATASETS, &length, &tods_guids));
		fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_SNAPS, tods_guids,
		    length);
	}

	/*
	 * If we're sending from a redaction bookmark, then we should retrieve
	 * the guids of that bookmark so we can send them over the wire.
	 */
	if (from_rl != NULL) {
		fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_FROM_SNAPS,
		    from_rl->rl_phys->rlp_snaps,
		    from_rl->rl_phys->rlp_num_snaps);
	}

	/*
	 * If the snapshot we're sending from is redacted, include the redaction
	 * list in the stream.
	 */
	if (dspp->numfromredactsnaps != NUM_SNAPS_NOT_REDACTED) {
		ASSERT3P(from_rl, ==, NULL);
		fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_FROM_SNAPS,
		    dspp->fromredactsnaps, (uint_t)dspp->numfromredactsnaps);
		if (dspp->numfromredactsnaps > 0) {
			kmem_free(dspp->fromredactsnaps,
			    dspp->numfromredactsnaps * sizeof (uint64_t));
			dspp->fromredactsnaps = NULL;
		}
	}

	if (resuming || book_resuming) {
		err = setup_resume_points(dspp, to_arg, from_arg,
		    rlt_arg, smt_arg, resuming, os, redact_rl, nvl);
		if (err != 0)
			goto out;
	}

	if (featureflags & DMU_BACKUP_FEATURE_RAW) {
		uint64_t ivset_guid = (ancestor_zb != NULL) ?
		    ancestor_zb->zbm_ivset_guid : 0;
		nvlist_t *keynvl = NULL;
		ASSERT(os->os_encrypted);

		err = dsl_crypto_populate_key_nvlist(os, ivset_guid,
		    &keynvl);
		if (err != 0) {
			fnvlist_free(nvl);
			goto out;
		}

		fnvlist_add_nvlist(nvl, "crypt_keydata", keynvl);
		fnvlist_free(keynvl);
	}

	if (!nvlist_empty(nvl)) {
		payload = fnvlist_pack(nvl, &payload_len);
		drr->drr_payloadlen = payload_len;
	}

	fnvlist_free(nvl);
	err = dump_record(&dsc, payload, payload_len);
	fnvlist_pack_free(payload, payload_len);
	if (err != 0) {
		err = dsc.dsc_err;
		goto out;
	}

	setup_to_thread(to_arg, os, dssp, fromtxg, dspp->rawok);
	setup_from_thread(from_arg, from_rl, dssp);
	setup_redact_list_thread(rlt_arg, dspp, redact_rl, dssp);
	setup_merge_thread(smt_arg, dspp, from_arg, to_arg, rlt_arg, os);
	setup_reader_thread(srt_arg, dspp, smt_arg, featureflags);

	range = bqueue_dequeue(&srt_arg->q);
	while (err == 0 && !range->eos_marker) {
		err = do_dump(&dsc, range);
		range = get_next_range(&srt_arg->q, range);
		if (issig(JUSTLOOKING) && issig(FORREAL))
			err = SET_ERROR(EINTR);
	}

	/*
	 * If we hit an error or are interrupted, cancel our worker threads and
	 * clear the queue of any pending records. The threads will pass the
	 * cancel up the tree of worker threads, and each one will clean up any
	 * pending records before exiting.
	 */
	if (err != 0) {
		srt_arg->cancel = B_TRUE;
		while (!range->eos_marker) {
			range = get_next_range(&srt_arg->q, range);
		}
	}
	range_free(range);

	bqueue_destroy(&srt_arg->q);
	bqueue_destroy(&smt_arg->q);
	if (dspp->redactbook != NULL)
		bqueue_destroy(&rlt_arg->q);
	bqueue_destroy(&to_arg->q);
	bqueue_destroy(&from_arg->q);

	if (err == 0 && srt_arg->error != 0)
		err = srt_arg->error;

	if (err != 0)
		goto out;

	if (dsc.dsc_pending_op != PENDING_NONE)
		if (dump_record(&dsc, NULL, 0) != 0)
			err = SET_ERROR(EINTR);

	if (err != 0) {
		if (err == EINTR && dsc.dsc_err != 0)
			err = dsc.dsc_err;
		goto out;
	}

	/*
	 * Send the DRR_END record if this is not a saved stream.
	 * Otherwise, the omitted DRR_END record will signal to
	 * the receive side that the stream is incomplete.
	 */
	if (!dspp->savedok) {
		bzero(drr, sizeof (dmu_replay_record_t));
		drr->drr_type = DRR_END;
		drr->drr_u.drr_end.drr_checksum = dsc.dsc_zc;
		drr->drr_u.drr_end.drr_toguid = dsc.dsc_toguid;

		if (dump_record(&dsc, NULL, 0) != 0)
			err = dsc.dsc_err;
	}
out:
	mutex_enter(&to_ds->ds_sendstream_lock);
	list_remove(&to_ds->ds_sendstreams, dssp);
	mutex_exit(&to_ds->ds_sendstream_lock);

	VERIFY(err != 0 || (dsc.dsc_sent_begin &&
	    (dsc.dsc_sent_end || dspp->savedok)));

	kmem_free(drr, sizeof (dmu_replay_record_t));
	kmem_free(dssp, sizeof (dmu_sendstatus_t));
	kmem_free(from_arg, sizeof (*from_arg));
	kmem_free(to_arg, sizeof (*to_arg));
	kmem_free(rlt_arg, sizeof (*rlt_arg));
	kmem_free(smt_arg, sizeof (*smt_arg));
	kmem_free(srt_arg, sizeof (*srt_arg));

	dsl_dataset_long_rele(to_ds, FTAG);
	if (from_rl != NULL) {
		dsl_redaction_list_long_rele(from_rl, FTAG);
		dsl_redaction_list_rele(from_rl, FTAG);
	}
	if (redact_rl != NULL) {
		dsl_redaction_list_long_rele(redact_rl, FTAG);
		dsl_redaction_list_rele(redact_rl, FTAG);
	}

	return (err);
}
int
dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
    boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
    boolean_t rawok, boolean_t savedok, int outfd, offset_t *off,
    dmu_send_outparams_t *dsop)
{
	int err;
	dsl_dataset_t *fromds;
	ds_hold_flags_t dsflags;
	struct dmu_send_params dspp = {0};
	dspp.embedok = embedok;
	dspp.large_block_ok = large_block_ok;
	dspp.compressok = compressok;
	dspp.outfd = outfd;
	dspp.off = off;
	dspp.dso = dsop;
	dspp.tag = FTAG;
	dspp.rawok = rawok;
	dspp.savedok = savedok;

	dsflags = (rawok) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT;
	err = dsl_pool_hold(pool, FTAG, &dspp.dp);
	if (err != 0)
		return (err);

	err = dsl_dataset_hold_obj_flags(dspp.dp, tosnap, dsflags, FTAG,
	    &dspp.to_ds);
	if (err != 0) {
		dsl_pool_rele(dspp.dp, FTAG);
		return (err);
	}

	if (fromsnap != 0) {
		err = dsl_dataset_hold_obj_flags(dspp.dp, fromsnap, dsflags,
		    FTAG, &fromds);
		if (err != 0) {
			dsl_dataset_rele_flags(dspp.to_ds, dsflags, FTAG);
			dsl_pool_rele(dspp.dp, FTAG);
			return (err);
		}
		dspp.ancestor_zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
		dspp.ancestor_zb.zbm_creation_txg =
		    dsl_dataset_phys(fromds)->ds_creation_txg;
		dspp.ancestor_zb.zbm_creation_time =
		    dsl_dataset_phys(fromds)->ds_creation_time;

		if (dsl_dataset_is_zapified(fromds)) {
			(void) zap_lookup(dspp.dp->dp_meta_objset,
			    fromds->ds_object, DS_FIELD_IVSET_GUID, 8, 1,
			    &dspp.ancestor_zb.zbm_ivset_guid);
		}

		/* See dmu_send for the reasons behind this. */
		uint64_t *fromredact;

		if (!dsl_dataset_get_uint64_array_feature(fromds,
		    SPA_FEATURE_REDACTED_DATASETS,
		    &dspp.numfromredactsnaps,
		    &fromredact)) {
			dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED;
		} else if (dspp.numfromredactsnaps > 0) {
			uint64_t size = dspp.numfromredactsnaps *
			    sizeof (uint64_t);
			dspp.fromredactsnaps = kmem_zalloc(size, KM_SLEEP);
			bcopy(fromredact, dspp.fromredactsnaps, size);
		}

		boolean_t is_before =
		    dsl_dataset_is_before(dspp.to_ds, fromds, 0);
		dspp.is_clone = (dspp.to_ds->ds_dir !=
		    fromds->ds_dir);
		dsl_dataset_rele(fromds, FTAG);
		if (!is_before) {
			dsl_pool_rele(dspp.dp, FTAG);
			err = SET_ERROR(EXDEV);
		} else {
			err = dmu_send_impl(&dspp);
		}
	} else {
		dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED;
		err = dmu_send_impl(&dspp);
	}
	dsl_dataset_rele(dspp.to_ds, FTAG);
	return (err);
}
int
dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
    boolean_t large_block_ok, boolean_t compressok, boolean_t rawok,
    boolean_t savedok, uint64_t resumeobj, uint64_t resumeoff,
    const char *redactbook, int outfd, offset_t *off,
    dmu_send_outparams_t *dsop)
{
	int err = 0;
	ds_hold_flags_t dsflags;
	boolean_t owned = B_FALSE;
	dsl_dataset_t *fromds = NULL;
	zfs_bookmark_phys_t book = {0};
	struct dmu_send_params dspp = {0};

	dsflags = (rawok) ? DS_HOLD_FLAG_NONE : DS_HOLD_FLAG_DECRYPT;
	dspp.tosnap = tosnap;
	dspp.embedok = embedok;
	dspp.large_block_ok = large_block_ok;
	dspp.compressok = compressok;
	dspp.outfd = outfd;
	dspp.off = off;
	dspp.dso = dsop;
	dspp.tag = FTAG;
	dspp.resumeobj = resumeobj;
	dspp.resumeoff = resumeoff;
	dspp.rawok = rawok;
	dspp.savedok = savedok;

	if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL)
		return (SET_ERROR(EINVAL));

	err = dsl_pool_hold(tosnap, FTAG, &dspp.dp);
	if (err != 0)
		return (err);

	if (strchr(tosnap, '@') == NULL && spa_writeable(dspp.dp->dp_spa)) {
		/*
		 * We are sending a filesystem or volume. Ensure
		 * that it doesn't change by owning the dataset.
		 */

		if (savedok) {
			/*
			 * We are looking for the dataset that represents the
			 * partially received send stream. If this stream was
			 * received as a new snapshot of an existing dataset,
			 * this will be saved in a hidden clone named
			 * "<pool>/<dataset>/%recv". Otherwise, the stream
			 * will be saved in the live dataset itself. In
			 * either case we need to use dsl_dataset_own_force()
			 * because the stream is marked as inconsistent,
			 * which would normally make it unavailable to be
			 * owned.
			 */
			char *name = kmem_asprintf("%s/%s", tosnap,
			    recv_clone_name);
			err = dsl_dataset_own_force(dspp.dp, name, dsflags,
			    FTAG, &dspp.to_ds);
			if (err == ENOENT) {
				err = dsl_dataset_own_force(dspp.dp, tosnap,
				    dsflags, FTAG, &dspp.to_ds);
			}

			if (err == 0) {
				err = zap_lookup(dspp.dp->dp_meta_objset,
				    dspp.to_ds->ds_object,
				    DS_FIELD_RESUME_TOGUID, 8, 1,
				    &dspp.saved_guid);
			}

			if (err == 0) {
				err = zap_lookup(dspp.dp->dp_meta_objset,
				    dspp.to_ds->ds_object,
				    DS_FIELD_RESUME_TONAME, 1,
				    sizeof (dspp.saved_toname),
				    dspp.saved_toname);
			}
			if (err != 0)
				dsl_dataset_disown(dspp.to_ds, dsflags, FTAG);

			kmem_strfree(name);
		} else {
			err = dsl_dataset_own(dspp.dp, tosnap, dsflags,
			    FTAG, &dspp.to_ds);
		}
		owned = B_TRUE;
	} else {
		err = dsl_dataset_hold_flags(dspp.dp, tosnap, dsflags, FTAG,
		    &dspp.to_ds);
	}

	if (err != 0) {
		dsl_pool_rele(dspp.dp, FTAG);
		return (err);
	}

	if (redactbook != NULL) {
		char path[ZFS_MAX_DATASET_NAME_LEN];
		(void) strlcpy(path, tosnap, sizeof (path));
		char *at = strchr(path, '@');
		if (at == NULL) {
			err = EINVAL;
		} else {
			(void) snprintf(at, sizeof (path) - (at - path), "#%s",
			    redactbook);
			err = dsl_bookmark_lookup(dspp.dp, path,
			    NULL, &book);
			dspp.redactbook = &book;
		}
	}

	if (err != 0) {
		dsl_pool_rele(dspp.dp, FTAG);
		if (owned)
			dsl_dataset_disown(dspp.to_ds, dsflags, FTAG);
		else
			dsl_dataset_rele_flags(dspp.to_ds, dsflags, FTAG);
		return (err);
	}

	if (fromsnap != NULL) {
		zfs_bookmark_phys_t *zb = &dspp.ancestor_zb;
		int fsnamelen;
		if (strpbrk(tosnap, "@#") != NULL)
			fsnamelen = strpbrk(tosnap, "@#") - tosnap;
		else
			fsnamelen = strlen(tosnap);

		/*
		 * If the fromsnap is in a different filesystem, then
		 * mark the send stream as a clone.
		 */
		if (strncmp(tosnap, fromsnap, fsnamelen) != 0 ||
		    (fromsnap[fsnamelen] != '@' &&
		    fromsnap[fsnamelen] != '#')) {
			dspp.is_clone = B_TRUE;
		}

		if (strchr(fromsnap, '@') != NULL) {
			err = dsl_dataset_hold(dspp.dp, fromsnap, FTAG,
			    &fromds);

			if (err != 0) {
				ASSERT3P(fromds, ==, NULL);
			} else {
				/*
				 * We need to make a deep copy of the redact
				 * snapshots of the from snapshot, because the
				 * array will be freed when we evict from_ds.
				 */
				uint64_t *fromredact;
				if (!dsl_dataset_get_uint64_array_feature(
				    fromds, SPA_FEATURE_REDACTED_DATASETS,
				    &dspp.numfromredactsnaps,
				    &fromredact)) {
					dspp.numfromredactsnaps =
					    NUM_SNAPS_NOT_REDACTED;
				} else if (dspp.numfromredactsnaps > 0) {
					uint64_t size =
					    dspp.numfromredactsnaps *
					    sizeof (uint64_t);
					dspp.fromredactsnaps = kmem_zalloc(size,
					    KM_SLEEP);
					bcopy(fromredact, dspp.fromredactsnaps,
					    size);
				}
				if (!dsl_dataset_is_before(dspp.to_ds, fromds,
				    0)) {
					err = SET_ERROR(EXDEV);
				} else {
					zb->zbm_creation_txg =
					    dsl_dataset_phys(fromds)->
					    ds_creation_txg;
					zb->zbm_creation_time =
					    dsl_dataset_phys(fromds)->
					    ds_creation_time;
					zb->zbm_guid =
					    dsl_dataset_phys(fromds)->ds_guid;
					zb->zbm_redaction_obj = 0;

					if (dsl_dataset_is_zapified(fromds)) {
						(void) zap_lookup(
						    dspp.dp->dp_meta_objset,
						    fromds->ds_object,
						    DS_FIELD_IVSET_GUID, 8, 1,
						    &zb->zbm_ivset_guid);
					}
				}
				dsl_dataset_rele(fromds, FTAG);
			}
		} else {
			dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED;
			err = dsl_bookmark_lookup(dspp.dp, fromsnap, dspp.to_ds,
			    zb);
			if (err == EXDEV && zb->zbm_redaction_obj != 0 &&
			    zb->zbm_guid ==
			    dsl_dataset_phys(dspp.to_ds)->ds_guid)
				err = 0;
		}

		if (err == 0) {
			/* dmu_send_impl will call dsl_pool_rele for us. */
			err = dmu_send_impl(&dspp);
		} else {
			dsl_pool_rele(dspp.dp, FTAG);
		}
	} else {
		dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED;
		err = dmu_send_impl(&dspp);
	}
	if (owned)
		dsl_dataset_disown(dspp.to_ds, dsflags, FTAG);
	else
		dsl_dataset_rele_flags(dspp.to_ds, dsflags, FTAG);
	return (err);
}
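
/*
 * Turn a raw space figure into a stream-length estimate: assume
 * recordsize-sized data blocks, subtract the approximate indirect-block
 * overhead, and add the per-block record headers.
 */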
static int
dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed,
    uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep)
{
	int err = 0;
	uint64_t size;
	/*
	 * Assume that space (both on-disk and in-stream) is dominated by
	 * data. We will adjust for indirect blocks and the copies property,
	 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
	 */

	uint64_t recordsize;
	uint64_t record_count;
	objset_t *os;
	VERIFY0(dmu_objset_from_ds(ds, &os));

	/* Assume all (uncompressed) blocks are recordsize. */
	if (zfs_override_estimate_recordsize != 0) {
		recordsize = zfs_override_estimate_recordsize;
	} else if (os->os_phys->os_type == DMU_OST_ZVOL) {
		err = dsl_prop_get_int_ds(ds,
		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &recordsize);
	} else {
		err = dsl_prop_get_int_ds(ds,
		    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), &recordsize);
	}
	if (err != 0)
		return (err);
	record_count = uncompressed / recordsize;

	/*
	 * If we're estimating a send size for a compressed stream, use the
	 * compressed data size to estimate the stream size. Otherwise, use the
	 * uncompressed data size.
	 */
	size = stream_compressed ? compressed : uncompressed;

	/*
	 * Subtract out approximate space used by indirect blocks.
	 * Assume most space is used by data blocks (non-indirect, non-dnode).
	 * Assume no ditto blocks or internal fragmentation.
	 *
	 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
	 * block.
	 */
	size -= record_count * sizeof (blkptr_t);

	/* Add in the space for the record associated with each block. */
	size += record_count * sizeof (dmu_replay_record_t);

	*sizep = size;

	return (0);
}
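
/*
 * Estimate the size of a send stream using only dataset accounting, without
 * traversing the data. The estimate includes the BEGIN and END records.
 */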
int
dmu_send_estimate_fast(dsl_dataset_t *origds, dsl_dataset_t *fromds,
    zfs_bookmark_phys_t *frombook, boolean_t stream_compressed,
    boolean_t saved, uint64_t *sizep)
{
	int err;
	dsl_dataset_t *ds = origds;
	uint64_t uncomp, comp;

	ASSERT(dsl_pool_config_held(origds->ds_dir->dd_pool));
	ASSERT(fromds == NULL || frombook == NULL);

	/*
	 * If this is a saved send we may actually be sending
	 * from the %recv clone used for resuming.
	 */
	if (saved) {
		objset_t *mos = origds->ds_dir->dd_pool->dp_meta_objset;
		uint64_t guid;
		char dsname[ZFS_MAX_DATASET_NAME_LEN + 6];

		dsl_dataset_name(origds, dsname);
		(void) strcat(dsname, "/");
		(void) strcat(dsname, recv_clone_name);

		err = dsl_dataset_hold(origds->ds_dir->dd_pool,
		    dsname, FTAG, &ds);
		if (err != ENOENT && err != 0) {
			return (err);
		} else if (err == ENOENT) {
			ds = origds;
		}

		/* check that this dataset has partially received data */
		err = zap_lookup(mos, ds->ds_object,
		    DS_FIELD_RESUME_TOGUID, 8, 1, &guid);
		if (err != 0) {
			err = SET_ERROR(err == ENOENT ? EINVAL : err);
			goto out;
		}

		err = zap_lookup(mos, ds->ds_object,
		    DS_FIELD_RESUME_TONAME, 1, sizeof (dsname), dsname);
		if (err != 0) {
			err = SET_ERROR(err == ENOENT ? EINVAL : err);
			goto out;
		}
	}

	/* tosnap must be a snapshot or the target of a saved send */
	if (!ds->ds_is_snapshot && ds == origds)
		return (SET_ERROR(EINVAL));

	if (fromds != NULL) {
		uint64_t used;
		if (!fromds->ds_is_snapshot) {
			err = SET_ERROR(EINVAL);
			goto out;
		}

		if (!dsl_dataset_is_before(ds, fromds, 0)) {
			err = SET_ERROR(EXDEV);
			goto out;
		}

		err = dsl_dataset_space_written(fromds, ds, &used, &comp,
		    &uncomp);
		if (err != 0)
			goto out;
	} else if (frombook != NULL) {
		uint64_t used;
		err = dsl_dataset_space_written_bookmark(frombook, ds, &used,
		    &comp, &uncomp);
		if (err != 0)
			goto out;
	} else {
		uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
		comp = dsl_dataset_phys(ds)->ds_compressed_bytes;
	}

	err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp,
	    stream_compressed, sizep);
	/*
	 * Add the size of the BEGIN and END records to the estimate.
	 */
	*sizep += 2 * sizeof (dmu_replay_record_t);

out:
	if (ds != origds)
		dsl_dataset_rele(ds, FTAG);
	return (err);
}
ZFS_MODULE_PARAM(zfs_send, zfs_send_, corrupt_data, INT, ZMOD_RW,
	"Allow sending corrupt data");

ZFS_MODULE_PARAM(zfs_send, zfs_send_, queue_length, INT, ZMOD_RW,
	"Maximum send queue length");

ZFS_MODULE_PARAM(zfs_send, zfs_send_, unmodified_spill_blocks, INT, ZMOD_RW,
	"Send unmodified spill blocks");

ZFS_MODULE_PARAM(zfs_send, zfs_send_, no_prefetch_queue_length, INT, ZMOD_RW,
	"Maximum send queue length for non-prefetch queues");

ZFS_MODULE_PARAM(zfs_send, zfs_send_, queue_ff, INT, ZMOD_RW,
	"Send queue fill fraction");

ZFS_MODULE_PARAM(zfs_send, zfs_send_, no_prefetch_queue_ff, INT, ZMOD_RW,
	"Send queue fill fraction for non-prefetch queues");

ZFS_MODULE_PARAM(zfs_send, zfs_, override_estimate_recordsize, INT, ZMOD_RW,
	"Override block size estimate with fixed size");