4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or https://opensource.org/licenses/CDDL-1.0.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
25 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
26 * Copyright 2014 HybridCluster. All rights reserved.
27 * Copyright 2016 RackTop Systems.
28 * Copyright (c) 2016 Actifio, Inc. All rights reserved.
29 * Copyright (c) 2019, Klara Inc.
30 * Copyright (c) 2019, Allan Jude
34 #include <sys/dmu_impl.h>
35 #include <sys/dmu_tx.h>
37 #include <sys/dnode.h>
38 #include <sys/zfs_context.h>
39 #include <sys/dmu_objset.h>
40 #include <sys/dmu_traverse.h>
41 #include <sys/dsl_dataset.h>
42 #include <sys/dsl_dir.h>
43 #include <sys/dsl_prop.h>
44 #include <sys/dsl_pool.h>
45 #include <sys/dsl_synctask.h>
46 #include <sys/spa_impl.h>
47 #include <sys/zfs_ioctl.h>
49 #include <sys/zio_checksum.h>
50 #include <sys/zfs_znode.h>
51 #include <zfs_fletcher.h>
54 #include <sys/zfs_onexit.h>
55 #include <sys/dmu_send.h>
56 #include <sys/dmu_recv.h>
57 #include <sys/dsl_destroy.h>
58 #include <sys/blkptr.h>
59 #include <sys/dsl_bookmark.h>
60 #include <sys/zfeature.h>
61 #include <sys/bqueue.h>
63 #include <sys/policy.h>
64 #include <sys/objlist.h>
66 #include <sys/zfs_vfsops.h>
69 /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
70 static int zfs_send_corrupt_data
= B_FALSE
;
72 * This tunable controls the amount of data (measured in bytes) that will be
73 * prefetched by zfs send. If the main thread is blocking on reads that haven't
74 * completed, this variable might need to be increased. If instead the main
75 * thread is issuing new reads because the prefetches have fallen out of the
76 * cache, this may need to be decreased.
78 static int zfs_send_queue_length
= SPA_MAXBLOCKSIZE
;
80 * This tunable controls the length of the queues that zfs send worker threads
81 * use to communicate. If the send_main_thread is blocking on these queues,
82 * this variable may need to be increased. If there is a significant slowdown
83 * at the start of a send as these threads consume all the available IO
84 * resources, this variable may need to be decreased.
86 static int zfs_send_no_prefetch_queue_length
= 1024 * 1024;
88 * These tunables control the fill fraction of the queues by zfs send. The fill
89 * fraction controls the frequency with which threads have to be cv_signaled.
90 * If a lot of cpu time is being spent on cv_signal, then these should be tuned
91 * down. If the queues empty before the signalled thread can catch up, then
92 * these should be tuned up.
94 static int zfs_send_queue_ff
= 20;
95 static int zfs_send_no_prefetch_queue_ff
= 20;
98 * Use this to override the recordsize calculation for fast zfs send estimates.
100 static int zfs_override_estimate_recordsize
= 0;
102 /* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */
103 static const boolean_t zfs_send_set_freerecords_bit
= B_TRUE
;
105 /* Set this tunable to FALSE is disable sending unmodified spill blocks. */
106 static int zfs_send_unmodified_spill_blocks
= B_TRUE
;
108 static inline boolean_t
109 overflow_multiply(uint64_t a
, uint64_t b
, uint64_t *c
)
111 uint64_t temp
= a
* b
;
112 if (b
!= 0 && temp
/ b
!= a
)
118 struct send_thread_arg
{
120 objset_t
*os
; /* Objset to traverse */
121 uint64_t fromtxg
; /* Traverse from this txg */
122 int flags
; /* flags to pass to traverse_dataset */
125 zbookmark_phys_t resume
;
126 uint64_t *num_blocks_visited
;
129 struct redact_list_thread_arg
{
132 zbookmark_phys_t resume
;
133 redaction_list_t
*rl
;
134 boolean_t mark_redact
;
136 uint64_t *num_blocks_visited
;
139 struct send_merge_thread_arg
{
142 struct redact_list_thread_arg
*from_arg
;
143 struct send_thread_arg
*to_arg
;
144 struct redact_list_thread_arg
*redact_arg
;
150 boolean_t eos_marker
; /* Marks the end of the stream */
152 uint64_t start_blkid
;
155 enum type
{DATA
, HOLE
, OBJECT
, OBJECT_RANGE
, REDACT
,
156 PREVIOUSLY_REDACTED
} type
;
159 dmu_object_type_t obj_type
;
160 uint32_t datablksz
; // logical size
161 uint32_t datasz
; // payload size
167 boolean_t io_outstanding
;
168 boolean_t io_compressed
;
176 * This is a pointer because embedding it in the
177 * struct causes these structures to be massively larger
178 * for all range types; this makes the code much less
194 * The list of data whose inclusion in a send stream can be pending from
195 * one call to backup_cb to another. Multiple calls to dump_free(),
196 * dump_freeobjects(), and dump_redact() can be aggregated into a single
197 * DRR_FREE, DRR_FREEOBJECTS, or DRR_REDACT replay record.
206 typedef struct dmu_send_cookie
{
207 dmu_replay_record_t
*dsc_drr
;
208 dmu_send_outparams_t
*dsc_dso
;
213 uint64_t dsc_fromtxg
;
215 dmu_pendop_t dsc_pending_op
;
216 uint64_t dsc_featureflags
;
217 uint64_t dsc_last_data_object
;
218 uint64_t dsc_last_data_offset
;
219 uint64_t dsc_resume_object
;
220 uint64_t dsc_resume_offset
;
221 boolean_t dsc_sent_begin
;
222 boolean_t dsc_sent_end
;
225 static int do_dump(dmu_send_cookie_t
*dscp
, struct send_range
*range
);
228 range_free(struct send_range
*range
)
230 if (range
->type
== OBJECT
) {
231 size_t size
= sizeof (dnode_phys_t
) *
232 (range
->sru
.object
.dnp
->dn_extra_slots
+ 1);
233 kmem_free(range
->sru
.object
.dnp
, size
);
234 } else if (range
->type
== DATA
) {
235 mutex_enter(&range
->sru
.data
.lock
);
236 while (range
->sru
.data
.io_outstanding
)
237 cv_wait(&range
->sru
.data
.cv
, &range
->sru
.data
.lock
);
238 if (range
->sru
.data
.abd
!= NULL
)
239 abd_free(range
->sru
.data
.abd
);
240 if (range
->sru
.data
.abuf
!= NULL
) {
241 arc_buf_destroy(range
->sru
.data
.abuf
,
242 &range
->sru
.data
.abuf
);
244 mutex_exit(&range
->sru
.data
.lock
);
246 cv_destroy(&range
->sru
.data
.cv
);
247 mutex_destroy(&range
->sru
.data
.lock
);
249 kmem_free(range
, sizeof (*range
));
253 * For all record types except BEGIN, fill in the checksum (overlaid in
254 * drr_u.drr_checksum.drr_checksum). The checksum verifies everything
255 * up to the start of the checksum itself.
258 dump_record(dmu_send_cookie_t
*dscp
, void *payload
, int payload_len
)
260 dmu_send_outparams_t
*dso
= dscp
->dsc_dso
;
261 ASSERT3U(offsetof(dmu_replay_record_t
, drr_u
.drr_checksum
.drr_checksum
),
262 ==, sizeof (dmu_replay_record_t
) - sizeof (zio_cksum_t
));
263 (void) fletcher_4_incremental_native(dscp
->dsc_drr
,
264 offsetof(dmu_replay_record_t
, drr_u
.drr_checksum
.drr_checksum
),
266 if (dscp
->dsc_drr
->drr_type
== DRR_BEGIN
) {
267 dscp
->dsc_sent_begin
= B_TRUE
;
269 ASSERT(ZIO_CHECKSUM_IS_ZERO(&dscp
->dsc_drr
->drr_u
.
270 drr_checksum
.drr_checksum
));
271 dscp
->dsc_drr
->drr_u
.drr_checksum
.drr_checksum
= dscp
->dsc_zc
;
273 if (dscp
->dsc_drr
->drr_type
== DRR_END
) {
274 dscp
->dsc_sent_end
= B_TRUE
;
276 (void) fletcher_4_incremental_native(&dscp
->dsc_drr
->
277 drr_u
.drr_checksum
.drr_checksum
,
278 sizeof (zio_cksum_t
), &dscp
->dsc_zc
);
279 *dscp
->dsc_off
+= sizeof (dmu_replay_record_t
);
280 dscp
->dsc_err
= dso
->dso_outfunc(dscp
->dsc_os
, dscp
->dsc_drr
,
281 sizeof (dmu_replay_record_t
), dso
->dso_arg
);
282 if (dscp
->dsc_err
!= 0)
283 return (SET_ERROR(EINTR
));
284 if (payload_len
!= 0) {
285 *dscp
->dsc_off
+= payload_len
;
287 * payload is null when dso_dryrun == B_TRUE (i.e. when we're
288 * doing a send size calculation)
290 if (payload
!= NULL
) {
291 (void) fletcher_4_incremental_native(
292 payload
, payload_len
, &dscp
->dsc_zc
);
296 * The code does not rely on this (len being a multiple of 8).
297 * We keep this assertion because of the corresponding assertion
298 * in receive_read(). Keeping this assertion ensures that we do
299 * not inadvertently break backwards compatibility (causing the
300 * assertion in receive_read() to trigger on old software).
302 * Raw sends cannot be received on old software, and so can
303 * bypass this assertion.
306 ASSERT((payload_len
% 8 == 0) ||
307 (dscp
->dsc_featureflags
& DMU_BACKUP_FEATURE_RAW
));
309 dscp
->dsc_err
= dso
->dso_outfunc(dscp
->dsc_os
, payload
,
310 payload_len
, dso
->dso_arg
);
311 if (dscp
->dsc_err
!= 0)
312 return (SET_ERROR(EINTR
));
318 * Fill in the drr_free struct, or perform aggregation if the previous record is
319 * also a free record, and the two are adjacent.
321 * Note that we send free records even for a full send, because we want to be
322 * able to receive a full send as a clone, which requires a list of all the free
323 * and freeobject records that were generated on the source.
326 dump_free(dmu_send_cookie_t
*dscp
, uint64_t object
, uint64_t offset
,
329 struct drr_free
*drrf
= &(dscp
->dsc_drr
->drr_u
.drr_free
);
332 * When we receive a free record, dbuf_free_range() assumes
333 * that the receiving system doesn't have any dbufs in the range
334 * being freed. This is always true because there is a one-record
335 * constraint: we only send one WRITE record for any given
336 * object,offset. We know that the one-record constraint is
337 * true because we always send data in increasing order by
340 * If the increasing-order constraint ever changes, we should find
341 * another way to assert that the one-record constraint is still
344 ASSERT(object
> dscp
->dsc_last_data_object
||
345 (object
== dscp
->dsc_last_data_object
&&
346 offset
> dscp
->dsc_last_data_offset
));
349 * If there is a pending op, but it's not PENDING_FREE, push it out,
350 * since free block aggregation can only be done for blocks of the
351 * same type (i.e., DRR_FREE records can only be aggregated with
352 * other DRR_FREE records. DRR_FREEOBJECTS records can only be
353 * aggregated with other DRR_FREEOBJECTS records).
355 if (dscp
->dsc_pending_op
!= PENDING_NONE
&&
356 dscp
->dsc_pending_op
!= PENDING_FREE
) {
357 if (dump_record(dscp
, NULL
, 0) != 0)
358 return (SET_ERROR(EINTR
));
359 dscp
->dsc_pending_op
= PENDING_NONE
;
362 if (dscp
->dsc_pending_op
== PENDING_FREE
) {
364 * Check to see whether this free block can be aggregated
367 if (drrf
->drr_object
== object
&& drrf
->drr_offset
+
368 drrf
->drr_length
== offset
) {
369 if (offset
+ length
< offset
|| length
== UINT64_MAX
)
370 drrf
->drr_length
= UINT64_MAX
;
372 drrf
->drr_length
+= length
;
375 /* not a continuation. Push out pending record */
376 if (dump_record(dscp
, NULL
, 0) != 0)
377 return (SET_ERROR(EINTR
));
378 dscp
->dsc_pending_op
= PENDING_NONE
;
381 /* create a FREE record and make it pending */
382 memset(dscp
->dsc_drr
, 0, sizeof (dmu_replay_record_t
));
383 dscp
->dsc_drr
->drr_type
= DRR_FREE
;
384 drrf
->drr_object
= object
;
385 drrf
->drr_offset
= offset
;
386 if (offset
+ length
< offset
)
387 drrf
->drr_length
= DMU_OBJECT_END
;
389 drrf
->drr_length
= length
;
390 drrf
->drr_toguid
= dscp
->dsc_toguid
;
391 if (length
== DMU_OBJECT_END
) {
392 if (dump_record(dscp
, NULL
, 0) != 0)
393 return (SET_ERROR(EINTR
));
395 dscp
->dsc_pending_op
= PENDING_FREE
;
402 * Fill in the drr_redact struct, or perform aggregation if the previous record
403 * is also a redaction record, and the two are adjacent.
406 dump_redact(dmu_send_cookie_t
*dscp
, uint64_t object
, uint64_t offset
,
409 struct drr_redact
*drrr
= &dscp
->dsc_drr
->drr_u
.drr_redact
;
412 * If there is a pending op, but it's not PENDING_REDACT, push it out,
413 * since free block aggregation can only be done for blocks of the
414 * same type (i.e., DRR_REDACT records can only be aggregated with
415 * other DRR_REDACT records).
417 if (dscp
->dsc_pending_op
!= PENDING_NONE
&&
418 dscp
->dsc_pending_op
!= PENDING_REDACT
) {
419 if (dump_record(dscp
, NULL
, 0) != 0)
420 return (SET_ERROR(EINTR
));
421 dscp
->dsc_pending_op
= PENDING_NONE
;
424 if (dscp
->dsc_pending_op
== PENDING_REDACT
) {
426 * Check to see whether this redacted block can be aggregated
429 if (drrr
->drr_object
== object
&& drrr
->drr_offset
+
430 drrr
->drr_length
== offset
) {
431 drrr
->drr_length
+= length
;
434 /* not a continuation. Push out pending record */
435 if (dump_record(dscp
, NULL
, 0) != 0)
436 return (SET_ERROR(EINTR
));
437 dscp
->dsc_pending_op
= PENDING_NONE
;
440 /* create a REDACT record and make it pending */
441 memset(dscp
->dsc_drr
, 0, sizeof (dmu_replay_record_t
));
442 dscp
->dsc_drr
->drr_type
= DRR_REDACT
;
443 drrr
->drr_object
= object
;
444 drrr
->drr_offset
= offset
;
445 drrr
->drr_length
= length
;
446 drrr
->drr_toguid
= dscp
->dsc_toguid
;
447 dscp
->dsc_pending_op
= PENDING_REDACT
;
453 dmu_dump_write(dmu_send_cookie_t
*dscp
, dmu_object_type_t type
, uint64_t object
,
454 uint64_t offset
, int lsize
, int psize
, const blkptr_t
*bp
,
455 boolean_t io_compressed
, void *data
)
457 uint64_t payload_size
;
458 boolean_t raw
= (dscp
->dsc_featureflags
& DMU_BACKUP_FEATURE_RAW
);
459 struct drr_write
*drrw
= &(dscp
->dsc_drr
->drr_u
.drr_write
);
462 * We send data in increasing object, offset order.
463 * See comment in dump_free() for details.
465 ASSERT(object
> dscp
->dsc_last_data_object
||
466 (object
== dscp
->dsc_last_data_object
&&
467 offset
> dscp
->dsc_last_data_offset
));
468 dscp
->dsc_last_data_object
= object
;
469 dscp
->dsc_last_data_offset
= offset
+ lsize
- 1;
472 * If there is any kind of pending aggregation (currently either
473 * a grouping of free objects or free blocks), push it out to
474 * the stream, since aggregation can't be done across operations
475 * of different types.
477 if (dscp
->dsc_pending_op
!= PENDING_NONE
) {
478 if (dump_record(dscp
, NULL
, 0) != 0)
479 return (SET_ERROR(EINTR
));
480 dscp
->dsc_pending_op
= PENDING_NONE
;
482 /* write a WRITE record */
483 memset(dscp
->dsc_drr
, 0, sizeof (dmu_replay_record_t
));
484 dscp
->dsc_drr
->drr_type
= DRR_WRITE
;
485 drrw
->drr_object
= object
;
486 drrw
->drr_type
= type
;
487 drrw
->drr_offset
= offset
;
488 drrw
->drr_toguid
= dscp
->dsc_toguid
;
489 drrw
->drr_logical_size
= lsize
;
491 /* only set the compression fields if the buf is compressed or raw */
492 boolean_t compressed
=
493 (bp
!= NULL
? BP_GET_COMPRESS(bp
) != ZIO_COMPRESS_OFF
&&
494 io_compressed
: lsize
!= psize
);
495 if (raw
|| compressed
) {
497 ASSERT(raw
|| dscp
->dsc_featureflags
&
498 DMU_BACKUP_FEATURE_COMPRESSED
);
499 ASSERT(!BP_IS_EMBEDDED(bp
));
500 ASSERT3S(psize
, >, 0);
503 ASSERT(BP_IS_PROTECTED(bp
));
506 * This is a raw protected block so we need to pass
507 * along everything the receiving side will need to
508 * interpret this block, including the byteswap, salt,
511 if (BP_SHOULD_BYTESWAP(bp
))
512 drrw
->drr_flags
|= DRR_RAW_BYTESWAP
;
513 zio_crypt_decode_params_bp(bp
, drrw
->drr_salt
,
515 zio_crypt_decode_mac_bp(bp
, drrw
->drr_mac
);
517 /* this is a compressed block */
518 ASSERT(dscp
->dsc_featureflags
&
519 DMU_BACKUP_FEATURE_COMPRESSED
);
520 ASSERT(!BP_SHOULD_BYTESWAP(bp
));
521 ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp
)));
522 ASSERT3U(BP_GET_COMPRESS(bp
), !=, ZIO_COMPRESS_OFF
);
523 ASSERT3S(lsize
, >=, psize
);
526 /* set fields common to compressed and raw sends */
527 drrw
->drr_compressiontype
= BP_GET_COMPRESS(bp
);
528 drrw
->drr_compressed_size
= psize
;
529 payload_size
= drrw
->drr_compressed_size
;
531 payload_size
= drrw
->drr_logical_size
;
534 if (bp
== NULL
|| BP_IS_EMBEDDED(bp
) || (BP_IS_PROTECTED(bp
) && !raw
)) {
536 * There's no pre-computed checksum for partial-block writes,
537 * embedded BP's, or encrypted BP's that are being sent as
538 * plaintext, so (like fletcher4-checksummed blocks) userland
539 * will have to compute a dedup-capable checksum itself.
541 drrw
->drr_checksumtype
= ZIO_CHECKSUM_OFF
;
543 drrw
->drr_checksumtype
= BP_GET_CHECKSUM(bp
);
544 if (zio_checksum_table
[drrw
->drr_checksumtype
].ci_flags
&
545 ZCHECKSUM_FLAG_DEDUP
)
546 drrw
->drr_flags
|= DRR_CHECKSUM_DEDUP
;
547 DDK_SET_LSIZE(&drrw
->drr_key
, BP_GET_LSIZE(bp
));
548 DDK_SET_PSIZE(&drrw
->drr_key
, BP_GET_PSIZE(bp
));
549 DDK_SET_COMPRESS(&drrw
->drr_key
, BP_GET_COMPRESS(bp
));
550 DDK_SET_CRYPT(&drrw
->drr_key
, BP_IS_PROTECTED(bp
));
551 drrw
->drr_key
.ddk_cksum
= bp
->blk_cksum
;
554 if (dump_record(dscp
, data
, payload_size
) != 0)
555 return (SET_ERROR(EINTR
));
560 dump_write_embedded(dmu_send_cookie_t
*dscp
, uint64_t object
, uint64_t offset
,
561 int blksz
, const blkptr_t
*bp
)
563 char buf
[BPE_PAYLOAD_SIZE
];
564 struct drr_write_embedded
*drrw
=
565 &(dscp
->dsc_drr
->drr_u
.drr_write_embedded
);
567 if (dscp
->dsc_pending_op
!= PENDING_NONE
) {
568 if (dump_record(dscp
, NULL
, 0) != 0)
569 return (SET_ERROR(EINTR
));
570 dscp
->dsc_pending_op
= PENDING_NONE
;
573 ASSERT(BP_IS_EMBEDDED(bp
));
575 memset(dscp
->dsc_drr
, 0, sizeof (dmu_replay_record_t
));
576 dscp
->dsc_drr
->drr_type
= DRR_WRITE_EMBEDDED
;
577 drrw
->drr_object
= object
;
578 drrw
->drr_offset
= offset
;
579 drrw
->drr_length
= blksz
;
580 drrw
->drr_toguid
= dscp
->dsc_toguid
;
581 drrw
->drr_compression
= BP_GET_COMPRESS(bp
);
582 drrw
->drr_etype
= BPE_GET_ETYPE(bp
);
583 drrw
->drr_lsize
= BPE_GET_LSIZE(bp
);
584 drrw
->drr_psize
= BPE_GET_PSIZE(bp
);
586 decode_embedded_bp_compressed(bp
, buf
);
588 if (dump_record(dscp
, buf
, P2ROUNDUP(drrw
->drr_psize
, 8)) != 0)
589 return (SET_ERROR(EINTR
));
594 dump_spill(dmu_send_cookie_t
*dscp
, const blkptr_t
*bp
, uint64_t object
,
597 struct drr_spill
*drrs
= &(dscp
->dsc_drr
->drr_u
.drr_spill
);
598 uint64_t blksz
= BP_GET_LSIZE(bp
);
599 uint64_t payload_size
= blksz
;
601 if (dscp
->dsc_pending_op
!= PENDING_NONE
) {
602 if (dump_record(dscp
, NULL
, 0) != 0)
603 return (SET_ERROR(EINTR
));
604 dscp
->dsc_pending_op
= PENDING_NONE
;
607 /* write a SPILL record */
608 memset(dscp
->dsc_drr
, 0, sizeof (dmu_replay_record_t
));
609 dscp
->dsc_drr
->drr_type
= DRR_SPILL
;
610 drrs
->drr_object
= object
;
611 drrs
->drr_length
= blksz
;
612 drrs
->drr_toguid
= dscp
->dsc_toguid
;
614 /* See comment in dump_dnode() for full details */
615 if (zfs_send_unmodified_spill_blocks
&&
616 (bp
->blk_birth
<= dscp
->dsc_fromtxg
)) {
617 drrs
->drr_flags
|= DRR_SPILL_UNMODIFIED
;
620 /* handle raw send fields */
621 if (dscp
->dsc_featureflags
& DMU_BACKUP_FEATURE_RAW
) {
622 ASSERT(BP_IS_PROTECTED(bp
));
624 if (BP_SHOULD_BYTESWAP(bp
))
625 drrs
->drr_flags
|= DRR_RAW_BYTESWAP
;
626 drrs
->drr_compressiontype
= BP_GET_COMPRESS(bp
);
627 drrs
->drr_compressed_size
= BP_GET_PSIZE(bp
);
628 zio_crypt_decode_params_bp(bp
, drrs
->drr_salt
, drrs
->drr_iv
);
629 zio_crypt_decode_mac_bp(bp
, drrs
->drr_mac
);
630 payload_size
= drrs
->drr_compressed_size
;
633 if (dump_record(dscp
, data
, payload_size
) != 0)
634 return (SET_ERROR(EINTR
));
639 dump_freeobjects(dmu_send_cookie_t
*dscp
, uint64_t firstobj
, uint64_t numobjs
)
641 struct drr_freeobjects
*drrfo
= &(dscp
->dsc_drr
->drr_u
.drr_freeobjects
);
642 uint64_t maxobj
= DNODES_PER_BLOCK
*
643 (DMU_META_DNODE(dscp
->dsc_os
)->dn_maxblkid
+ 1);
646 * ZoL < 0.7 does not handle large FREEOBJECTS records correctly,
647 * leading to zfs recv never completing. to avoid this issue, don't
648 * send FREEOBJECTS records for object IDs which cannot exist on the
652 if (maxobj
<= firstobj
)
655 if (maxobj
< firstobj
+ numobjs
)
656 numobjs
= maxobj
- firstobj
;
660 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
661 * push it out, since free block aggregation can only be done for
662 * blocks of the same type (i.e., DRR_FREE records can only be
663 * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records
664 * can only be aggregated with other DRR_FREEOBJECTS records).
666 if (dscp
->dsc_pending_op
!= PENDING_NONE
&&
667 dscp
->dsc_pending_op
!= PENDING_FREEOBJECTS
) {
668 if (dump_record(dscp
, NULL
, 0) != 0)
669 return (SET_ERROR(EINTR
));
670 dscp
->dsc_pending_op
= PENDING_NONE
;
673 if (dscp
->dsc_pending_op
== PENDING_FREEOBJECTS
) {
675 * See whether this free object array can be aggregated
678 if (drrfo
->drr_firstobj
+ drrfo
->drr_numobjs
== firstobj
) {
679 drrfo
->drr_numobjs
+= numobjs
;
682 /* can't be aggregated. Push out pending record */
683 if (dump_record(dscp
, NULL
, 0) != 0)
684 return (SET_ERROR(EINTR
));
685 dscp
->dsc_pending_op
= PENDING_NONE
;
689 /* write a FREEOBJECTS record */
690 memset(dscp
->dsc_drr
, 0, sizeof (dmu_replay_record_t
));
691 dscp
->dsc_drr
->drr_type
= DRR_FREEOBJECTS
;
692 drrfo
->drr_firstobj
= firstobj
;
693 drrfo
->drr_numobjs
= numobjs
;
694 drrfo
->drr_toguid
= dscp
->dsc_toguid
;
696 dscp
->dsc_pending_op
= PENDING_FREEOBJECTS
;
702 dump_dnode(dmu_send_cookie_t
*dscp
, const blkptr_t
*bp
, uint64_t object
,
705 struct drr_object
*drro
= &(dscp
->dsc_drr
->drr_u
.drr_object
);
708 if (object
< dscp
->dsc_resume_object
) {
710 * Note: when resuming, we will visit all the dnodes in
711 * the block of dnodes that we are resuming from. In
712 * this case it's unnecessary to send the dnodes prior to
713 * the one we are resuming from. We should be at most one
714 * block's worth of dnodes behind the resume point.
716 ASSERT3U(dscp
->dsc_resume_object
- object
, <,
717 1 << (DNODE_BLOCK_SHIFT
- DNODE_SHIFT
));
721 if (dnp
== NULL
|| dnp
->dn_type
== DMU_OT_NONE
)
722 return (dump_freeobjects(dscp
, object
, 1));
724 if (dscp
->dsc_pending_op
!= PENDING_NONE
) {
725 if (dump_record(dscp
, NULL
, 0) != 0)
726 return (SET_ERROR(EINTR
));
727 dscp
->dsc_pending_op
= PENDING_NONE
;
730 /* write an OBJECT record */
731 memset(dscp
->dsc_drr
, 0, sizeof (dmu_replay_record_t
));
732 dscp
->dsc_drr
->drr_type
= DRR_OBJECT
;
733 drro
->drr_object
= object
;
734 drro
->drr_type
= dnp
->dn_type
;
735 drro
->drr_bonustype
= dnp
->dn_bonustype
;
736 drro
->drr_blksz
= dnp
->dn_datablkszsec
<< SPA_MINBLOCKSHIFT
;
737 drro
->drr_bonuslen
= dnp
->dn_bonuslen
;
738 drro
->drr_dn_slots
= dnp
->dn_extra_slots
+ 1;
739 drro
->drr_checksumtype
= dnp
->dn_checksum
;
740 drro
->drr_compress
= dnp
->dn_compress
;
741 drro
->drr_toguid
= dscp
->dsc_toguid
;
743 if (!(dscp
->dsc_featureflags
& DMU_BACKUP_FEATURE_LARGE_BLOCKS
) &&
744 drro
->drr_blksz
> SPA_OLD_MAXBLOCKSIZE
)
745 drro
->drr_blksz
= SPA_OLD_MAXBLOCKSIZE
;
747 bonuslen
= P2ROUNDUP(dnp
->dn_bonuslen
, 8);
749 if ((dscp
->dsc_featureflags
& DMU_BACKUP_FEATURE_RAW
)) {
750 ASSERT(BP_IS_ENCRYPTED(bp
));
752 if (BP_SHOULD_BYTESWAP(bp
))
753 drro
->drr_flags
|= DRR_RAW_BYTESWAP
;
755 /* needed for reconstructing dnp on recv side */
756 drro
->drr_maxblkid
= dnp
->dn_maxblkid
;
757 drro
->drr_indblkshift
= dnp
->dn_indblkshift
;
758 drro
->drr_nlevels
= dnp
->dn_nlevels
;
759 drro
->drr_nblkptr
= dnp
->dn_nblkptr
;
762 * Since we encrypt the entire bonus area, the (raw) part
763 * beyond the bonuslen is actually nonzero, so we need
767 if (drro
->drr_bonuslen
> DN_MAX_BONUS_LEN(dnp
))
768 return (SET_ERROR(EINVAL
));
769 drro
->drr_raw_bonuslen
= DN_MAX_BONUS_LEN(dnp
);
770 bonuslen
= drro
->drr_raw_bonuslen
;
775 * DRR_OBJECT_SPILL is set for every dnode which references a
776 * spill block. This allows the receiving pool to definitively
777 * determine when a spill block should be kept or freed.
779 if (dnp
->dn_flags
& DNODE_FLAG_SPILL_BLKPTR
)
780 drro
->drr_flags
|= DRR_OBJECT_SPILL
;
782 if (dump_record(dscp
, DN_BONUS(dnp
), bonuslen
) != 0)
783 return (SET_ERROR(EINTR
));
785 /* Free anything past the end of the file. */
786 if (dump_free(dscp
, object
, (dnp
->dn_maxblkid
+ 1) *
787 (dnp
->dn_datablkszsec
<< SPA_MINBLOCKSHIFT
), DMU_OBJECT_END
) != 0)
788 return (SET_ERROR(EINTR
));
791 * Send DRR_SPILL records for unmodified spill blocks. This is useful
792 * because changing certain attributes of the object (e.g. blocksize)
793 * can cause old versions of ZFS to incorrectly remove a spill block.
794 * Including these records in the stream forces an up to date version
795 * to always be written ensuring they're never lost. Current versions
796 * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can
797 * ignore these unmodified spill blocks.
799 if (zfs_send_unmodified_spill_blocks
&&
800 (dnp
->dn_flags
& DNODE_FLAG_SPILL_BLKPTR
) &&
801 (DN_SPILL_BLKPTR(dnp
)->blk_birth
<= dscp
->dsc_fromtxg
)) {
802 struct send_range record
;
803 blkptr_t
*bp
= DN_SPILL_BLKPTR(dnp
);
805 memset(&record
, 0, sizeof (struct send_range
));
807 record
.object
= object
;
808 record
.eos_marker
= B_FALSE
;
809 record
.start_blkid
= DMU_SPILL_BLKID
;
810 record
.end_blkid
= record
.start_blkid
+ 1;
811 record
.sru
.data
.bp
= *bp
;
812 record
.sru
.data
.obj_type
= dnp
->dn_type
;
813 record
.sru
.data
.datablksz
= BP_GET_LSIZE(bp
);
815 if (do_dump(dscp
, &record
) != 0)
816 return (SET_ERROR(EINTR
));
819 if (dscp
->dsc_err
!= 0)
820 return (SET_ERROR(EINTR
));
826 dump_object_range(dmu_send_cookie_t
*dscp
, const blkptr_t
*bp
,
827 uint64_t firstobj
, uint64_t numslots
)
829 struct drr_object_range
*drror
=
830 &(dscp
->dsc_drr
->drr_u
.drr_object_range
);
832 /* we only use this record type for raw sends */
833 ASSERT(BP_IS_PROTECTED(bp
));
834 ASSERT(dscp
->dsc_featureflags
& DMU_BACKUP_FEATURE_RAW
);
835 ASSERT3U(BP_GET_COMPRESS(bp
), ==, ZIO_COMPRESS_OFF
);
836 ASSERT3U(BP_GET_TYPE(bp
), ==, DMU_OT_DNODE
);
837 ASSERT0(BP_GET_LEVEL(bp
));
839 if (dscp
->dsc_pending_op
!= PENDING_NONE
) {
840 if (dump_record(dscp
, NULL
, 0) != 0)
841 return (SET_ERROR(EINTR
));
842 dscp
->dsc_pending_op
= PENDING_NONE
;
845 memset(dscp
->dsc_drr
, 0, sizeof (dmu_replay_record_t
));
846 dscp
->dsc_drr
->drr_type
= DRR_OBJECT_RANGE
;
847 drror
->drr_firstobj
= firstobj
;
848 drror
->drr_numslots
= numslots
;
849 drror
->drr_toguid
= dscp
->dsc_toguid
;
850 if (BP_SHOULD_BYTESWAP(bp
))
851 drror
->drr_flags
|= DRR_RAW_BYTESWAP
;
852 zio_crypt_decode_params_bp(bp
, drror
->drr_salt
, drror
->drr_iv
);
853 zio_crypt_decode_mac_bp(bp
, drror
->drr_mac
);
855 if (dump_record(dscp
, NULL
, 0) != 0)
856 return (SET_ERROR(EINTR
));
861 send_do_embed(const blkptr_t
*bp
, uint64_t featureflags
)
863 if (!BP_IS_EMBEDDED(bp
))
867 * Compression function must be legacy, or explicitly enabled.
869 if ((BP_GET_COMPRESS(bp
) >= ZIO_COMPRESS_LEGACY_FUNCTIONS
&&
870 !(featureflags
& DMU_BACKUP_FEATURE_LZ4
)))
874 * If we have not set the ZSTD feature flag, we can't send ZSTD
875 * compressed embedded blocks, as the receiver may not support them.
877 if ((BP_GET_COMPRESS(bp
) == ZIO_COMPRESS_ZSTD
&&
878 !(featureflags
& DMU_BACKUP_FEATURE_ZSTD
)))
882 * Embed type must be explicitly enabled.
884 switch (BPE_GET_ETYPE(bp
)) {
885 case BP_EMBEDDED_TYPE_DATA
:
886 if (featureflags
& DMU_BACKUP_FEATURE_EMBED_DATA
)
896 * This function actually handles figuring out what kind of record needs to be
897 * dumped, and calling the appropriate helper function. In most cases,
898 * the data has already been read by send_reader_thread().
901 do_dump(dmu_send_cookie_t
*dscp
, struct send_range
*range
)
904 switch (range
->type
) {
906 err
= dump_dnode(dscp
, &range
->sru
.object
.bp
, range
->object
,
907 range
->sru
.object
.dnp
);
910 ASSERT3U(range
->start_blkid
+ 1, ==, range
->end_blkid
);
911 if (!(dscp
->dsc_featureflags
& DMU_BACKUP_FEATURE_RAW
)) {
914 uint64_t epb
= BP_GET_LSIZE(&range
->sru
.object_range
.bp
) >>
916 uint64_t firstobj
= range
->start_blkid
* epb
;
917 err
= dump_object_range(dscp
, &range
->sru
.object_range
.bp
,
922 struct srr
*srrp
= &range
->sru
.redact
;
923 err
= dump_redact(dscp
, range
->object
, range
->start_blkid
*
924 srrp
->datablksz
, (range
->end_blkid
- range
->start_blkid
) *
929 struct srd
*srdp
= &range
->sru
.data
;
930 blkptr_t
*bp
= &srdp
->bp
;
932 dmu_objset_spa(dscp
->dsc_os
);
934 ASSERT3U(srdp
->datablksz
, ==, BP_GET_LSIZE(bp
));
935 ASSERT3U(range
->start_blkid
+ 1, ==, range
->end_blkid
);
936 if (BP_GET_TYPE(bp
) == DMU_OT_SA
) {
937 arc_flags_t aflags
= ARC_FLAG_WAIT
;
938 enum zio_flag zioflags
= ZIO_FLAG_CANFAIL
;
940 if (dscp
->dsc_featureflags
& DMU_BACKUP_FEATURE_RAW
) {
941 ASSERT(BP_IS_PROTECTED(bp
));
942 zioflags
|= ZIO_FLAG_RAW
;
946 ASSERT3U(range
->start_blkid
, ==, DMU_SPILL_BLKID
);
947 zb
.zb_objset
= dmu_objset_id(dscp
->dsc_os
);
948 zb
.zb_object
= range
->object
;
950 zb
.zb_blkid
= range
->start_blkid
;
952 arc_buf_t
*abuf
= NULL
;
953 if (!dscp
->dsc_dso
->dso_dryrun
&& arc_read(NULL
, spa
,
954 bp
, arc_getbuf_func
, &abuf
, ZIO_PRIORITY_ASYNC_READ
,
955 zioflags
, &aflags
, &zb
) != 0)
956 return (SET_ERROR(EIO
));
958 err
= dump_spill(dscp
, bp
, zb
.zb_object
,
959 (abuf
== NULL
? NULL
: abuf
->b_data
));
961 arc_buf_destroy(abuf
, &abuf
);
964 if (send_do_embed(bp
, dscp
->dsc_featureflags
)) {
965 err
= dump_write_embedded(dscp
, range
->object
,
966 range
->start_blkid
* srdp
->datablksz
,
967 srdp
->datablksz
, bp
);
970 ASSERT(range
->object
> dscp
->dsc_resume_object
||
971 (range
->object
== dscp
->dsc_resume_object
&&
972 range
->start_blkid
* srdp
->datablksz
>=
973 dscp
->dsc_resume_offset
));
974 /* it's a level-0 block of a regular object */
976 mutex_enter(&srdp
->lock
);
977 while (srdp
->io_outstanding
)
978 cv_wait(&srdp
->cv
, &srdp
->lock
);
980 mutex_exit(&srdp
->lock
);
983 if (zfs_send_corrupt_data
&&
984 !dscp
->dsc_dso
->dso_dryrun
) {
986 * Send a block filled with 0x"zfs badd bloc"
988 srdp
->abuf
= arc_alloc_buf(spa
, &srdp
->abuf
,
989 ARC_BUFC_DATA
, srdp
->datablksz
);
991 for (ptr
= srdp
->abuf
->b_data
;
992 (char *)ptr
< (char *)srdp
->abuf
->b_data
+
993 srdp
->datablksz
; ptr
++)
994 *ptr
= 0x2f5baddb10cULL
;
996 return (SET_ERROR(EIO
));
1000 ASSERT(dscp
->dsc_dso
->dso_dryrun
||
1001 srdp
->abuf
!= NULL
|| srdp
->abd
!= NULL
);
1003 uint64_t offset
= range
->start_blkid
* srdp
->datablksz
;
1006 if (srdp
->abd
!= NULL
) {
1007 data
= abd_to_buf(srdp
->abd
);
1008 ASSERT3P(srdp
->abuf
, ==, NULL
);
1009 } else if (srdp
->abuf
!= NULL
) {
1010 data
= srdp
->abuf
->b_data
;
1014 * If we have large blocks stored on disk but the send flags
1015 * don't allow us to send large blocks, we split the data from
1016 * the arc buf into chunks.
1018 if (srdp
->datablksz
> SPA_OLD_MAXBLOCKSIZE
&&
1019 !(dscp
->dsc_featureflags
&
1020 DMU_BACKUP_FEATURE_LARGE_BLOCKS
)) {
1021 if (dscp
->dsc_featureflags
& DMU_BACKUP_FEATURE_RAW
)
1022 return (SET_ERROR(ENOTSUP
));
1024 while (srdp
->datablksz
> 0 && err
== 0) {
1025 int n
= MIN(srdp
->datablksz
,
1026 SPA_OLD_MAXBLOCKSIZE
);
1027 err
= dmu_dump_write(dscp
, srdp
->obj_type
,
1028 range
->object
, offset
, n
, n
, NULL
, B_FALSE
,
1032 * When doing dry run, data==NULL is used as a
1034 * dmu_dump_write()->dump_record().
1038 srdp
->datablksz
-= n
;
1041 err
= dmu_dump_write(dscp
, srdp
->obj_type
,
1042 range
->object
, offset
,
1043 srdp
->datablksz
, srdp
->datasz
, bp
,
1044 srdp
->io_compressed
, data
);
1049 struct srh
*srhp
= &range
->sru
.hole
;
1050 if (range
->object
== DMU_META_DNODE_OBJECT
) {
1051 uint32_t span
= srhp
->datablksz
>> DNODE_SHIFT
;
1052 uint64_t first_obj
= range
->start_blkid
* span
;
1053 uint64_t numobj
= range
->end_blkid
* span
- first_obj
;
1054 return (dump_freeobjects(dscp
, first_obj
, numobj
));
1056 uint64_t offset
= 0;
1059 * If this multiply overflows, we don't need to send this block.
1060 * Even if it has a birth time, it can never not be a hole, so
1061 * we don't need to send records for it.
1063 if (!overflow_multiply(range
->start_blkid
, srhp
->datablksz
,
1069 if (!overflow_multiply(range
->end_blkid
, srhp
->datablksz
, &len
))
1072 return (dump_free(dscp
, range
->object
, offset
, len
));
1075 panic("Invalid range type in do_dump: %d", range
->type
);
/*
 * Allocate a send_range with KM_SLEEP and fill in its object, block-id
 * bounds and end-of-stream flag from the arguments.
 * NOTE(review): original lines 1085, 1090 and 1098-1099 are absent from
 * this extract, so the store of `type`, any guard around the sru.data
 * initialisation below, and the return statement are not visible here.
 */
1080 static struct send_range
*
1081 range_alloc(enum type type
, uint64_t object
, uint64_t start_blkid
,
1082 uint64_t end_blkid
, boolean_t eos
)
1084 struct send_range
*range
= kmem_alloc(sizeof (*range
), KM_SLEEP
);
1086 range
->object
= object
;
1087 range
->start_blkid
= start_blkid
;
1088 range
->end_blkid
= end_blkid
;
1089 range
->eos_marker
= eos
;
/*
 * Reset the sru.data state: no abd or abuf attached yet, a fresh lock and
 * condition variable, no outstanding I/O, no I/O error, not compressed.
 */
1091 range
->sru
.data
.abd
= NULL
;
1092 range
->sru
.data
.abuf
= NULL
;
1093 mutex_init(&range
->sru
.data
.lock
, NULL
, MUTEX_DEFAULT
, NULL
);
1094 cv_init(&range
->sru
.data
.cv
, NULL
, CV_DEFAULT
, NULL
);
1095 range
->sru
.data
.io_outstanding
= 0;
1096 range
->sru
.data
.io_err
= 0;
1097 range
->sru
.data
.io_compressed
= B_FALSE
;
1103 * This is the callback function to traverse_dataset that acts as a worker
1104 * thread for dmu_send_impl.
/*
 * Traversal callback (send_traverse_thread passes it to
 * traverse_dataset_resume with the send_thread_arg as `arg`). Each visited
 * block pointer is turned into a send_range record and enqueued on sta->q.
 * NOTE(review): several original lines (conditions and early returns) are
 * missing from this extract; the comments below describe only what the
 * visible lines do.
 */
1107 send_cb(spa_t
*spa
, zilog_t
*zilog
, const blkptr_t
*bp
,
1108 const zbookmark_phys_t
*zb
, const struct dnode_phys
*dnp
, void *arg
)
1111 struct send_thread_arg
*sta
= arg
;
1112 struct send_range
*record
;
/* Objects before the resume point are only expected for the meta-dnode. */
1114 ASSERT(zb
->zb_object
== DMU_META_DNODE_OBJECT
||
1115 zb
->zb_object
>= sta
->resume
.zb_object
);
1118 * All bps of an encrypted os should have the encryption bit set.
1119 * If this is not true it indicates tampering and we report an error.
1121 if (sta
->os
->os_encrypted
&&
1122 !BP_IS_HOLE(bp
) && !BP_USES_CRYPT(bp
)) {
1123 spa_log_error(spa
, zb
);
1124 zfs_panic_recover("unencrypted block in encrypted "
1125 "object set %llu", dmu_objset_id(sta
->os
));
1126 return (SET_ERROR(EIO
));
1130 return (SET_ERROR(EINTR
));
1131 if (zb
->zb_object
!= DMU_META_DNODE_OBJECT
&&
1132 DMU_OBJECT_IS_SPECIAL(zb
->zb_object
))
/* Progress accounting: one more block pointer visited. */
1134 atomic_inc_64(sta
->num_blocks_visited
);
/* Dnode-level bookmark: emit an OBJECT record carrying a copy of the dnode. */
1136 if (zb
->zb_level
== ZB_DNODE_LEVEL
) {
1137 if (zb
->zb_object
== DMU_META_DNODE_OBJECT
)
1139 record
= range_alloc(OBJECT
, zb
->zb_object
, 0, 0, B_FALSE
);
1140 record
->sru
.object
.bp
= *bp
;
/* The copy spans the dnode plus its extra slots: (dn_extra_slots + 1) entries. */
1141 size_t size
= sizeof (*dnp
) * (dnp
->dn_extra_slots
+ 1);
1142 record
->sru
.object
.dnp
= kmem_alloc(size
, KM_SLEEP
);
1143 memcpy(record
->sru
.object
.dnp
, dnp
, size
);
1144 bqueue_enqueue(&sta
->q
, record
, sizeof (*record
));
/* Level-0 meta-dnode block: emit a one-block OBJECT_RANGE record. */
1147 if (zb
->zb_level
== 0 && zb
->zb_object
== DMU_META_DNODE_OBJECT
&&
1149 record
= range_alloc(OBJECT_RANGE
, 0, zb
->zb_blkid
,
1150 zb
->zb_blkid
+ 1, B_FALSE
);
1151 record
->sru
.object_range
.bp
= *bp
;
1152 bqueue_enqueue(&sta
->q
, record
, sizeof (*record
));
1155 if (zb
->zb_level
< 0 || (zb
->zb_level
> 0 && !BP_IS_HOLE(bp
)))
1157 if (zb
->zb_object
== DMU_META_DNODE_OBJECT
&& !BP_IS_HOLE(bp
))
/* Number of level-0 blocks covered by a block pointer at this level. */
1160 uint64_t span
= bp_span_in_blocks(dnp
->dn_indblkshift
, zb
->zb_level
);
1164 * If this multiply overflows, we don't need to send this block.
1165 * Even if it has a birth time, it can never not be a hole, so
1166 * we don't need to send records for it.
1168 if (!overflow_multiply(span
, zb
->zb_blkid
, &start
) || (!(zb
->zb_blkid
==
1169 DMU_SPILL_BLKID
|| DMU_OT_IS_METADATA(dnp
->dn_type
)) &&
1170 span
* zb
->zb_blkid
> dnp
->dn_maxblkid
)) {
1171 ASSERT(BP_IS_HOLE(bp
));
1175 if (zb
->zb_blkid
== DMU_SPILL_BLKID
)
1176 ASSERT3U(BP_GET_TYPE(bp
), ==, DMU_OT_SA
);
1178 enum type record_type
= DATA
;
1181 else if (BP_IS_REDACTED(bp
))
1182 record_type
= REDACT
;
/* end_blkid is passed as 0 when start + span wraps around. */
1186 record
= range_alloc(record_type
, zb
->zb_object
, start
,
1187 (start
+ span
< start
? 0 : start
+ span
), B_FALSE
);
/* Spill blocks use the bp's logical size; others use the dnode's data block size. */
1189 uint64_t datablksz
= (zb
->zb_blkid
== DMU_SPILL_BLKID
?
1190 BP_GET_LSIZE(bp
) : dnp
->dn_datablkszsec
<< SPA_MINBLOCKSHIFT
);
1192 if (BP_IS_HOLE(bp
)) {
1193 record
->sru
.hole
.datablksz
= datablksz
;
1194 } else if (BP_IS_REDACTED(bp
)) {
1195 record
->sru
.redact
.datablksz
= datablksz
;
1197 record
->sru
.data
.datablksz
= datablksz
;
1198 record
->sru
.data
.obj_type
= dnp
->dn_type
;
1199 record
->sru
.data
.bp
= *bp
;
1202 bqueue_enqueue(&sta
->q
, record
, sizeof (*record
));
/*
 * Argument bundle handed to redact_list_cb.
 * NOTE(review): redact_list_thread also assigns .cancel and .q on this
 * struct; those member declarations are not present in this extract.
 */
1206 struct redact_list_cb_arg
{
/* Counter bumped with atomic_inc_64() on each redact_list_cb call. */
1207 uint64_t *num_blocks_visited
;
/* Selects REDACT (with a datablksz) versus PREVIOUSLY_REDACTED ranges. */
1210 boolean_t mark_redact
;
/*
 * Callback given to dsl_redaction_list_traverse() by redact_list_thread.
 * Converts one redact_block_phys_t into a send_range covering
 * [rbp_blkid, rbp_blkid + redact_block_get_count(rb)) and enqueues it on
 * rlcap->q.
 * NOTE(review): original lines 1219-1221, 1228 and 1232 are missing from
 * this extract (presumably a cancel check, the else, and the return).
 */
1214 redact_list_cb(redact_block_phys_t
*rb
, void *arg
)
1216 struct redact_list_cb_arg
*rlcap
= arg
;
1218 atomic_inc_64(rlcap
->num_blocks_visited
);
1222 struct send_range
*data
= range_alloc(REDACT
, rb
->rbp_object
,
1223 rb
->rbp_blkid
, rb
->rbp_blkid
+ redact_block_get_count(rb
), B_FALSE
);
/* The block count must be non-zero, so the end lies past the start. */
1224 ASSERT3U(data
->end_blkid
, >, rb
->rbp_blkid
);
/* The final record type depends on mark_redact. */
1225 if (rlcap
->mark_redact
) {
1226 data
->type
= REDACT
;
1227 data
->sru
.redact
.datablksz
= redact_block_get_size(rb
);
1229 data
->type
= PREVIOUSLY_REDACTED
;
1231 bqueue_enqueue(rlcap
->q
, data
, sizeof (*data
));
1237 * This function kicks off the traverse_dataset. It also handles setting the
1238 * error code of the thread in case something goes wrong, and pushes the End of
1239 * Stream record when the traverse_dataset call has finished.
/*
 * Thread body started by setup_to_thread. Runs traverse_dataset_resume()
 * over the dataset with send_cb as the callback, stores the result in
 * st_arg->error_code, then enqueues and flushes an end-of-stream range.
 * NOTE(review): original line 1253, which may guard the error_code store,
 * and the thread exit call are missing from this extract.
 */
1241 static __attribute__((noreturn
)) void
1242 send_traverse_thread(void *arg
)
1244 struct send_thread_arg
*st_arg
= arg
;
1246 struct send_range
*data
;
1247 fstrans_cookie_t cookie
= spl_fstrans_mark();
1249 err
= traverse_dataset_resume(st_arg
->os
->os_dsl_dataset
,
1250 st_arg
->fromtxg
, &st_arg
->resume
,
1251 st_arg
->flags
, send_cb
, st_arg
);
1254 st_arg
->error_code
= err
;
/* The end-of-stream marker is a DATA range with eos set. */
1255 data
= range_alloc(DATA
, 0, 0, 0, B_TRUE
);
1256 bqueue_enqueue_flush(&st_arg
->q
, data
, sizeof (*data
));
1257 spl_fstrans_unmark(cookie
);
1262 * Utility function that causes End of Stream records to compare after all
1263 * others, so that other threads' comparison logic can stay simple.
/*
 * Ordering check between two ranges; get_next_range_nofree() asserts that
 * it yields -1 for (prev, next). End-of-stream markers are tested first,
 * then object intervals, then record type, then block-id intervals.
 * NOTE(review): the return statements and some conditions are missing
 * from this extract, so the value returned by each branch cannot be
 * confirmed here.
 */
1265 static int __attribute__((unused
))
1266 send_range_after(const struct send_range
*from
, const struct send_range
*to
)
1268 if (from
->eos_marker
== B_TRUE
)
1270 if (to
->eos_marker
== B_TRUE
)
/* By default a range covers exactly one object: [object, object + 1). */
1273 uint64_t from_obj
= from
->object
;
1274 uint64_t from_end_obj
= from
->object
+ 1;
1275 uint64_t to_obj
= to
->object
;
1276 uint64_t to_end_obj
= to
->object
+ 1;
/*
 * For object 0, block ids are converted to object numbers by shifting by
 * DNODES_PER_BLOCK_SHIFT.
 */
1277 if (from_obj
== 0) {
1278 ASSERT(from
->type
== HOLE
|| from
->type
== OBJECT_RANGE
);
1279 from_obj
= from
->start_blkid
<< DNODES_PER_BLOCK_SHIFT
;
1280 from_end_obj
= from
->end_blkid
<< DNODES_PER_BLOCK_SHIFT
;
1283 ASSERT(to
->type
== HOLE
|| to
->type
== OBJECT_RANGE
);
1284 to_obj
= to
->start_blkid
<< DNODES_PER_BLOCK_SHIFT
;
1285 to_end_obj
= to
->end_blkid
<< DNODES_PER_BLOCK_SHIFT
;
/* Disjoint object intervals decide the order outright. */
1288 if (from_end_obj
<= to_obj
)
1290 if (from_obj
>= to_end_obj
)
/* Overlapping objects: compare by record type (OBJECT_RANGE, then OBJECT). */
1292 int64_t cmp
= TREE_CMP(to
->type
== OBJECT_RANGE
, from
->type
==
1296 cmp
= TREE_CMP(to
->type
== OBJECT
, from
->type
== OBJECT
);
/* Finally compare the block-id intervals. */
1299 if (from
->end_blkid
<= to
->start_blkid
)
1301 if (from
->start_blkid
>= to
->end_blkid
)
1307 * Pop the new data off the queue, check that the records we receive are in
1308 * the right order, but do not free the old data. This is used so that the
1309 * records can be sent on to the main thread without copying the data.
/*
 * Dequeue the next range from bq and assert that it sorts after prev
 * (send_range_after(prev, next) == -1). prev is not freed here.
 * NOTE(review): the return statement is missing from this extract.
 */
1311 static struct send_range
*
1312 get_next_range_nofree(bqueue_t
*bq
, struct send_range
*prev
)
1314 struct send_range
*next
= bqueue_dequeue(bq
);
1315 ASSERT3S(send_range_after(prev
, next
), ==, -1);
1320 * Pop the new data off the queue, check that the records we receive are in
1321 * the right order, and free the old data.
/*
 * Wrapper around get_next_range_nofree().
 * NOTE(review): the lines that release prev and return next are missing
 * from this extract; the file's own header comment states that the old
 * data is freed.
 */
1323 static struct send_range
*
1324 get_next_range(bqueue_t
*bq
, struct send_range
*prev
)
1326 struct send_range
*next
= get_next_range_nofree(bq
, prev
);
/*
 * Thread body started by setup_from_thread and setup_redact_list_thread.
 * When rlt_arg->rl is non-NULL it is walked with
 * dsl_redaction_list_traverse() using redact_list_cb; afterwards an
 * end-of-stream range is enqueued and flushed on rlt_arg->q.
 * NOTE(review): original line 1345, which may guard the error_code store,
 * and the thread exit call are missing from this extract.
 */
1331 static __attribute__((noreturn
)) void
1332 redact_list_thread(void *arg
)
1334 struct redact_list_thread_arg
*rlt_arg
= arg
;
1335 struct send_range
*record
;
1336 fstrans_cookie_t cookie
= spl_fstrans_mark();
1337 if (rlt_arg
->rl
!= NULL
) {
/* Forward this thread's cancel flag, queue, counter and mode to the callback. */
1338 struct redact_list_cb_arg rlcba
= {0};
1339 rlcba
.cancel
= &rlt_arg
->cancel
;
1340 rlcba
.q
= &rlt_arg
->q
;
1341 rlcba
.num_blocks_visited
= rlt_arg
->num_blocks_visited
;
1342 rlcba
.mark_redact
= rlt_arg
->mark_redact
;
1343 int err
= dsl_redaction_list_traverse(rlt_arg
->rl
,
1344 &rlt_arg
->resume
, redact_list_cb
, &rlcba
);
1346 rlt_arg
->error_code
= err
;
/* The end-of-stream marker is a DATA range with eos set. */
1348 record
= range_alloc(DATA
, 0, 0, 0, B_TRUE
);
1349 bqueue_enqueue_flush(&rlt_arg
->q
, record
, sizeof (*record
));
1350 spl_fstrans_unmark(cookie
);
1356 * Compare the start point of the two provided ranges. End of stream ranges
1357 * compare last, objects compare before any data or hole inside that object and
1358 * multi-object holes that start at the same object.
/*
 * Three-way comparison of the start points of r1 and r2, built from
 * successive TREE_CMP() steps: eos_marker, object equivalent, record type
 * (OBJECT_RANGE, then OBJECT), and finally the level-0 block id.
 * NOTE(review): the early returns between the steps and the updates of
 * r1_l0equiv / r2_l0equiv for object 0 are missing from this extract.
 */
1361 send_range_start_compare(struct send_range
*r1
, struct send_range
*r2
)
1363 uint64_t r1_objequiv
= r1
->object
;
1364 uint64_t r1_l0equiv
= r1
->start_blkid
;
1365 uint64_t r2_objequiv
= r2
->object
;
1366 uint64_t r2_l0equiv
= r2
->start_blkid
;
1367 int64_t cmp
= TREE_CMP(r1
->eos_marker
, r2
->eos_marker
);
/* For object 0, a block id maps to the first object number in that block. */
1370 if (r1
->object
== 0) {
1371 r1_objequiv
= r1
->start_blkid
* DNODES_PER_BLOCK
;
1374 if (r2
->object
== 0) {
1375 r2_objequiv
= r2
->start_blkid
* DNODES_PER_BLOCK
;
1379 cmp
= TREE_CMP(r1_objequiv
, r2_objequiv
);
/* Arguments are swapped (r2 first) in the two type comparisons below. */
1382 cmp
= TREE_CMP(r2
->type
== OBJECT_RANGE
, r1
->type
== OBJECT_RANGE
);
1385 cmp
= TREE_CMP(r2
->type
== OBJECT
, r1
->type
== OBJECT
);
1389 return (TREE_CMP(r1_l0equiv
, r2_l0equiv
));
1400 * This function returns the next range the send_merge_thread should operate on.
1401 * The inputs are two arrays; the first one stores the range at the front of the
1402 * queues stored in the second one. The ranges are sorted in descending
1403 * priority order; the metadata from earlier ranges overrules metadata from
1404 * later ranges. out_mask is used to return which threads the ranges came from;
1405 * bit i is set if ranges[i] started at the same place as the returned range.
1407 * This code is not hardcoded to compare a specific number of threads; it could
1408 * be used with any number, just by changing the q_idx enum.
1410 * The "next range" is the one with the earliest start; if two starts are equal,
1411 * the highest-priority range is the next to operate on. If a higher-priority
1412 * range starts in the middle of the first range, then the first range will be
1413 * truncated to end where the higher-priority range starts, and we will operate
1414 * on that one next time. In this way, we make sure that each block covered by
1415 * some range gets covered by a returned range, and each block covered is
1416 * returned using the metadata of the highest-priority range it appears in.
1418 * For example, if the three ranges at the front of the queues were [2,4),
1419 * [3,5), and [1,3), then the ranges returned would be [1,2) with the metadata
1420 * from the third range, [2,4) with the metadata from the first range, and then
1421 * [4,5) with the metadata from the second.
/*
 * Select the next range to process from the fronts of NUM_THREADS queues.
 * ranges[i] is the current front of qs[i]. The earliest-starting range is
 * chosen with send_range_start_compare(); ranges overlapping the returned
 * one are trimmed to first_change and replaced from their queue when they
 * become empty.
 * NOTE(review): the declarations of i and bmask, the loop bodies that
 * update idx and bmask, the store to *out_mask and the return statements
 * are missing from this extract.
 */
1423 static struct send_range
*
1424 find_next_range(struct send_range
**ranges
, bqueue_t
**qs
, uint64_t *out_mask
)
1426 int idx
= 0; // index of the range with the earliest start
1429 for (i
= 1; i
< NUM_THREADS
; i
++) {
1430 if (send_range_start_compare(ranges
[i
], ranges
[idx
]) < 0)
/* If the earliest front is an eos marker, a new eos range is allocated. */
1433 if (ranges
[idx
]->eos_marker
) {
1434 struct send_range
*ret
= range_alloc(DATA
, 0, 0, 0, B_TRUE
);
1439 * Find all the ranges that start at that same point.
1441 for (i
= 0; i
< NUM_THREADS
; i
++) {
1442 if (send_range_start_compare(ranges
[i
], ranges
[idx
]) == 0)
1447 * OBJECT_RANGE records only come from the TO thread, and should always
1448 * be treated as overlapping with nothing and sent on immediately. They
1449 * are only used in raw sends, and are never redacted.
1451 if (ranges
[idx
]->type
== OBJECT_RANGE
) {
1452 ASSERT3U(idx
, ==, TO_IDX
);
1453 ASSERT3U(*out_mask
, ==, 1 << TO_IDX
);
1454 struct send_range
*ret
= ranges
[idx
];
1455 ranges
[idx
] = get_next_range_nofree(qs
[idx
], ranges
[idx
]);
1459 * Find the first start or end point after the start of the first range.
1461 uint64_t first_change
= ranges
[idx
]->end_blkid
;
1462 for (i
= 0; i
< NUM_THREADS
; i
++) {
/* Only other, non-eos, non-meta-dnode ranges in the same object matter. */
1463 if (i
== idx
|| ranges
[i
]->eos_marker
||
1464 ranges
[i
]->object
> ranges
[idx
]->object
||
1465 ranges
[i
]->object
== DMU_META_DNODE_OBJECT
)
1467 ASSERT3U(ranges
[i
]->object
, ==, ranges
[idx
]->object
);
/* A later start only counts for ranges not flagged in bmask. */
1468 if (first_change
> ranges
[i
]->start_blkid
&&
1469 (bmask
& (1 << i
)) == 0)
1470 first_change
= ranges
[i
]->start_blkid
;
1471 else if (first_change
> ranges
[i
]->end_blkid
)
1472 first_change
= ranges
[i
]->end_blkid
;
1475 * Update all ranges to no longer overlap with the range we're
1476 * returning. All such ranges must start at the same place as the range
1477 * being returned, and end at or after first_change. Thus we update
1478 * their start to first_change. If that makes them size 0, then free
1479 * them and pull a new range from that thread.
1481 for (i
= 0; i
< NUM_THREADS
; i
++) {
1482 if (i
== idx
|| (bmask
& (1 << i
)) == 0)
1484 ASSERT3U(first_change
, >, ranges
[i
]->start_blkid
);
1485 ranges
[i
]->start_blkid
= first_change
;
1486 ASSERT3U(ranges
[i
]->start_blkid
, <=, ranges
[i
]->end_blkid
);
1487 if (ranges
[i
]->start_blkid
== ranges
[i
]->end_blkid
)
1488 ranges
[i
] = get_next_range(qs
[i
], ranges
[i
]);
1491 * Short-circuit the simple case; if the range doesn't overlap with
1492 * anything else, or it only overlaps with things that start at the same
1493 * place and are longer, send it on.
1495 if (first_change
== ranges
[idx
]->end_blkid
) {
1496 struct send_range
*ret
= ranges
[idx
];
1497 ranges
[idx
] = get_next_range_nofree(qs
[idx
], ranges
[idx
]);
1502 * Otherwise, return a truncated copy of ranges[idx] and move the start
1503 * of ranges[idx] back to first_change.
/* The copy is a plain struct assignment; only end_blkid differs afterwards. */
1505 struct send_range
*ret
= kmem_alloc(sizeof (*ret
), KM_SLEEP
);
1506 *ret
= *ranges
[idx
];
1507 ret
->end_blkid
= first_change
;
1508 ranges
[idx
]->start_blkid
= first_change
;
/*
 * Mask with both the REDACT_IDX and FROM_IDX bits set; send_merge_thread
 * compares the mask produced by find_next_range() against it.
 */
1512 #define FROM_AND_REDACT_BITS ((1 << REDACT_IDX) | (1 << FROM_IDX))
1515 * Merge the results from the from thread and the to thread, and then hand the
1516 * records off to send_reader_thread to prefetch them. If this is not a
1517 * send from a redaction bookmark, the from thread will push an end of stream
1518 * record and stop, and we'll just send everything that was changed in the
1519 * to_ds since the ancestor's creation txg. If it is, then since
1520 * traverse_dataset has a canonical order, we can compare each change as
1521 * they're pulled off the queues. That will give us a stream that is
1522 * appropriately sorted, and covers all records. In addition, we pull the
1523 * data from the redact_list_thread and use that to determine which blocks
1524 * should be redacted.
/*
 * Thread body started by setup_merge_thread. Seeds front_ranges[] and
 * queues[] from the redact, to and from queues, then repeatedly calls
 * find_next_range() and forwards ranges to smt_arg->q. On exit it picks
 * up the first non-zero producer error, flags cancellation, drains the
 * input queues and enqueues an end-of-stream marker.
 * NOTE(review): the declarations of mask and err, the body of the
 * FROM_AND_REDACT_BITS branch, and the thread exit call are missing from
 * this extract.
 */
1526 static __attribute__((noreturn
)) void
1527 send_merge_thread(void *arg
)
1529 struct send_merge_thread_arg
*smt_arg
= arg
;
1530 struct send_range
*front_ranges
[NUM_THREADS
];
1531 bqueue_t
*queues
[NUM_THREADS
];
1533 fstrans_cookie_t cookie
= spl_fstrans_mark();
/*
 * Without a redaction thread, use a zeroed placeholder already marked
 * end-of-stream so that input is treated as exhausted.
 */
1535 if (smt_arg
->redact_arg
== NULL
) {
1536 front_ranges
[REDACT_IDX
] =
1537 kmem_zalloc(sizeof (struct send_range
), KM_SLEEP
);
1538 front_ranges
[REDACT_IDX
]->eos_marker
= B_TRUE
;
1539 front_ranges
[REDACT_IDX
]->type
= REDACT
;
1540 queues
[REDACT_IDX
] = NULL
;
1542 front_ranges
[REDACT_IDX
] =
1543 bqueue_dequeue(&smt_arg
->redact_arg
->q
);
1544 queues
[REDACT_IDX
] = &smt_arg
->redact_arg
->q
;
1546 front_ranges
[TO_IDX
] = bqueue_dequeue(&smt_arg
->to_arg
->q
);
1547 queues
[TO_IDX
] = &smt_arg
->to_arg
->q
;
1548 front_ranges
[FROM_IDX
] = bqueue_dequeue(&smt_arg
->from_arg
->q
);
1549 queues
[FROM_IDX
] = &smt_arg
->from_arg
->q
;
/* Main merge loop: stops at end of stream, on error, or when cancelled. */
1551 struct send_range
*range
;
1552 for (range
= find_next_range(front_ranges
, queues
, &mask
);
1553 !range
->eos_marker
&& err
== 0 && !smt_arg
->cancel
;
1554 range
= find_next_range(front_ranges
, queues
, &mask
)) {
1556 * If the range in question was in both the from redact bookmark
1557 * and the bookmark we're using to redact, then don't send it.
1558 * It's already redacted on the receiving system, so a redaction
1559 * record would be redundant.
1561 if ((mask
& FROM_AND_REDACT_BITS
) == FROM_AND_REDACT_BITS
) {
1562 ASSERT3U(range
->type
, ==, REDACT
);
1566 bqueue_enqueue(&smt_arg
->q
, range
, sizeof (*range
));
/* Producer errors are checked in the order: to, from, redact. */
1568 if (smt_arg
->to_arg
->error_code
!= 0) {
1569 err
= smt_arg
->to_arg
->error_code
;
1570 } else if (smt_arg
->from_arg
->error_code
!= 0) {
1571 err
= smt_arg
->from_arg
->error_code
;
1572 } else if (smt_arg
->redact_arg
!= NULL
&&
1573 smt_arg
->redact_arg
->error_code
!= 0) {
1574 err
= smt_arg
->redact_arg
->error_code
;
/* Cancellation with no other error is reported as EINTR. */
1577 if (smt_arg
->cancel
&& err
== 0)
1578 err
= SET_ERROR(EINTR
);
1579 smt_arg
->error
= err
;
/* On error, ask every producer thread to stop. */
1580 if (smt_arg
->error
!= 0) {
1581 smt_arg
->to_arg
->cancel
= B_TRUE
;
1582 smt_arg
->from_arg
->cancel
= B_TRUE
;
1583 if (smt_arg
->redact_arg
!= NULL
)
1584 smt_arg
->redact_arg
->cancel
= B_TRUE
;
/* Drain each input queue up to its end-of-stream marker, then free it. */
1586 for (int i
= 0; i
< NUM_THREADS
; i
++) {
1587 while (!front_ranges
[i
]->eos_marker
) {
1588 front_ranges
[i
] = get_next_range(queues
[i
],
1591 range_free(front_ranges
[i
]);
/* The final marker is zero-allocated, and enqueued with a size of 1. */
1594 range
= kmem_zalloc(sizeof (*range
), KM_SLEEP
);
1595 range
->eos_marker
= B_TRUE
;
1596 bqueue_enqueue_flush(&smt_arg
->q
, range
, 1);
1597 spl_fstrans_unmark(cookie
);
/*
 * Per-thread state for send_reader_thread, filled in by
 * setup_reader_thread.
 * NOTE(review): the code also uses ->q, ->cancel and ->error on this
 * struct; those member declarations are missing from this extract.
 */
1601 struct send_reader_thread_arg
{
/* Merge thread whose output queue this reader consumes. */
1602 struct send_merge_thread_arg
*smta
;
/* When false, issue_data_read() does not issue any reads (set from !dso_dryrun). */
1605 boolean_t issue_reads
;
/* DMU_BACKUP_FEATURE_* bits consulted by issue_data_read(). */
1606 uint64_t featureflags
;
/*
 * zio completion callback for the read issued by issue_data_read(); the
 * send_range arrives as zio->io_private. Under sru.data.lock: on error the
 * abd is freed and the error stored in io_err; then io_outstanding is
 * cleared and waiters on the condition variable are woken.
 */
1611 dmu_send_read_done(zio_t
*zio
)
1613 struct send_range
*range
= zio
->io_private
;
1615 mutex_enter(&range
->sru
.data
.lock
);
1616 if (zio
->io_error
!= 0) {
/* Drop the buffer so no stale data is visible alongside io_err. */
1617 abd_free(range
->sru
.data
.abd
);
1618 range
->sru
.data
.abd
= NULL
;
1619 range
->sru
.data
.io_err
= zio
->io_error
;
1622 ASSERT(range
->sru
.data
.io_outstanding
);
1623 range
->sru
.data
.io_outstanding
= B_FALSE
;
1624 cv_broadcast(&range
->sru
.data
.cv
);
1625 mutex_exit(&range
->sru
.data
.lock
);
/*
 * Decide how a single-block DATA range will be read (raw, compressed or
 * plain), record io_compressed and datasz on it, and start the read:
 * first via arc_read() with ARC_FLAG_CACHED_ONLY, otherwise via zio_read()
 * into a newly allocated linear abd.
 * NOTE(review): the early returns after the issue_reads, BP_IS_REDACTED
 * and send_do_embed checks, and the condition guarding the zio_read()
 * path, are missing from this extract.
 */
1629 issue_data_read(struct send_reader_thread_arg
*srta
, struct send_range
*range
)
1631 struct srd
*srdp
= &range
->sru
.data
;
1632 blkptr_t
*bp
= &srdp
->bp
;
1633 objset_t
*os
= srta
->smta
->os
;
1635 ASSERT3U(range
->type
, ==, DATA
);
1636 ASSERT3U(range
->start_blkid
+ 1, ==, range
->end_blkid
);
1638 * If we have large blocks stored on disk but
1639 * the send flags don't allow us to send large
1640 * blocks, we split the data from the arc buf
1643 boolean_t split_large_blocks
=
1644 srdp
->datablksz
> SPA_OLD_MAXBLOCKSIZE
&&
1645 !(srta
->featureflags
& DMU_BACKUP_FEATURE_LARGE_BLOCKS
);
1647 * We should only request compressed data from the ARC if all
1648 * the following are true:
1649 * - stream compression was requested
1650 * - we aren't splitting large blocks into smaller chunks
1651 * - the data won't need to be byteswapped before sending
1652 * - this isn't an embedded block
1653 * - this isn't metadata (if receiving on a different endian
1654 * system it can be byteswapped more easily)
1656 boolean_t request_compressed
=
1657 (srta
->featureflags
& DMU_BACKUP_FEATURE_COMPRESSED
) &&
1658 !split_large_blocks
&& !BP_SHOULD_BYTESWAP(bp
) &&
1659 !BP_IS_EMBEDDED(bp
) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp
));
1661 enum zio_flag zioflags
= ZIO_FLAG_CANFAIL
;
/* Raw sends take precedence over the compressed-only case. */
1663 if (srta
->featureflags
& DMU_BACKUP_FEATURE_RAW
) {
1664 zioflags
|= ZIO_FLAG_RAW
;
1665 srdp
->io_compressed
= B_TRUE
;
1666 } else if (request_compressed
) {
1667 zioflags
|= ZIO_FLAG_RAW_COMPRESS
;
1668 srdp
->io_compressed
= B_TRUE
;
/* Physical size when ZIO_FLAG_RAW_COMPRESS is set in zioflags, else logical size. */
1671 srdp
->datasz
= (zioflags
& ZIO_FLAG_RAW_COMPRESS
) ?
1672 BP_GET_PSIZE(bp
) : BP_GET_LSIZE(bp
);
1674 if (!srta
->issue_reads
)
1676 if (BP_IS_REDACTED(bp
))
1678 if (send_do_embed(bp
, srta
->featureflags
))
1681 zbookmark_phys_t zb
= {
1682 .zb_objset
= dmu_objset_id(os
),
1683 .zb_object
= range
->object
,
1685 .zb_blkid
= range
->start_blkid
,
/* First attempt: only take the buffer if it is already in the ARC. */
1688 arc_flags_t aflags
= ARC_FLAG_CACHED_ONLY
;
1690 int arc_err
= arc_read(NULL
, os
->os_spa
, bp
,
1691 arc_getbuf_func
, &srdp
->abuf
, ZIO_PRIORITY_ASYNC_READ
,
1692 zioflags
, &aflags
, &zb
);
1694 * If the data is not already cached in the ARC, we read directly
1695 * from zio. This avoids the performance overhead of adding a new
1696 * entry to the ARC, and we also avoid polluting the ARC cache with
1697 * data that is not likely to be used in the future.
/* io_outstanding is set here and cleared by dmu_send_read_done(). */
1700 srdp
->abd
= abd_alloc_linear(srdp
->datasz
, B_FALSE
);
1701 srdp
->io_outstanding
= B_TRUE
;
1702 zio_nowait(zio_read(NULL
, os
->os_spa
, bp
, srdp
->abd
,
1703 srdp
->datasz
, dmu_send_read_done
, range
,
1704 ZIO_PRIORITY_ASYNC_READ
, zioflags
, &zb
));
1709 * Create a new record with the given values.
/*
 * Build a range of `count` blocks starting at blkid for dnode dn and
 * enqueue it on q, with datablksz as the queue item size. The type is HOLE
 * when bp is NULL or a hole, REDACT when bp is redacted, otherwise DATA.
 * DATA ranges must cover exactly one block and have their read issued
 * here.
 * NOTE(review): the case labels and break statements of the switch are
 * missing from this extract.
 */
1712 enqueue_range(struct send_reader_thread_arg
*srta
, bqueue_t
*q
, dnode_t
*dn
,
1713 uint64_t blkid
, uint64_t count
, const blkptr_t
*bp
, uint32_t datablksz
)
1715 enum type range_type
= (bp
== NULL
|| BP_IS_HOLE(bp
) ? HOLE
:
1716 (BP_IS_REDACTED(bp
) ? REDACT
: DATA
));
1718 struct send_range
*range
= range_alloc(range_type
, dn
->dn_object
,
1719 blkid
, blkid
+ count
, B_FALSE
);
1721 if (blkid
== DMU_SPILL_BLKID
)
1722 ASSERT3U(BP_GET_TYPE(bp
), ==, DMU_OT_SA
);
/* Fill in the per-type payload. */
1724 switch (range_type
) {
1726 range
->sru
.hole
.datablksz
= datablksz
;
1729 ASSERT3U(count
, ==, 1);
1730 range
->sru
.data
.datablksz
= datablksz
;
1731 range
->sru
.data
.obj_type
= dn
->dn_type
;
1732 range
->sru
.data
.bp
= *bp
;
1733 issue_data_read(srta
, range
);
1736 range
->sru
.redact
.datablksz
= datablksz
;
1741 bqueue_enqueue(q
, range
, datablksz
);
1745 * This thread is responsible for two things: First, it retrieves the correct
1746 * blkptr in the to ds if we need to send the data because of something from
1747 * the from thread. As a result of this, we're the first ones to discover that
1748 * some indirect blocks can be discarded because they're not holes. Second,
1749 * it issues prefetches for the data we need to send.
1751 static __attribute__((noreturn
)) void
1752 send_reader_thread(void *arg
)
1754 struct send_reader_thread_arg
*srta
= arg
;
1755 struct send_merge_thread_arg
*smta
= srta
->smta
;
1756 bqueue_t
*inq
= &smta
->q
;
1757 bqueue_t
*outq
= &srta
->q
;
1758 objset_t
*os
= smta
->os
;
1759 fstrans_cookie_t cookie
= spl_fstrans_mark();
1760 struct send_range
*range
= bqueue_dequeue(inq
);
1764 * If the record we're analyzing is from a redaction bookmark from the
1765 * fromds, then we need to know whether or not it exists in the tods so
1766 * we know whether to create records for it or not. If it does, we need
1767 * the datablksz so we can generate an appropriate record for it.
1768 * Finally, if it isn't redacted, we need the blkptr so that we can send
1769 * a WRITE record containing the actual data.
1771 uint64_t last_obj
= UINT64_MAX
;
1772 uint64_t last_obj_exists
= B_TRUE
;
1773 while (!range
->eos_marker
&& !srta
->cancel
&& smta
->error
== 0 &&
1775 switch (range
->type
) {
1777 issue_data_read(srta
, range
);
1778 bqueue_enqueue(outq
, range
, range
->sru
.data
.datablksz
);
1779 range
= get_next_range_nofree(inq
, range
);
1784 case REDACT
: // Redacted blocks must exist
1785 bqueue_enqueue(outq
, range
, sizeof (*range
));
1786 range
= get_next_range_nofree(inq
, range
);
1788 case PREVIOUSLY_REDACTED
: {
1790 * This entry came from the "from bookmark" when
1791 * sending from a bookmark that has a redaction
1792 * list. We need to check if this object/blkid
1793 * exists in the target ("to") dataset, and if
1794 * not then we drop this entry. We also need
1795 * to fill in the block pointer so that we know
1798 * To accomplish the above, we first cache whether or
1799 * not the last object we examined exists. If it
1800 * doesn't, we can drop this record. If it does, we hold
1801 * the dnode and use it to call dbuf_dnode_findbp. We do
1802 * this instead of dbuf_bookmark_findbp because we will
1803 * often operate on large ranges, and holding the dnode
1804 * once is more efficient.
1806 boolean_t object_exists
= B_TRUE
;
1808 * If the data is redacted, we only care if it exists,
1809 * so that we don't send records for objects that have
1813 if (range
->object
== last_obj
&& !last_obj_exists
) {
1815 * If we're still examining the same object as
1816 * previously, and it doesn't exist, we don't
1817 * need to call dnode_hold again.
1819 object_exists
= B_FALSE
;
1821 err
= dnode_hold(os
, range
->object
, FTAG
, &dn
);
1822 if (err
== ENOENT
) {
1823 object_exists
= B_FALSE
;
1826 last_obj
= range
->object
;
1827 last_obj_exists
= object_exists
;
1832 } else if (!object_exists
) {
1834 * The block was modified, but doesn't
1835 * exist in the to dataset; if it was
1836 * deleted in the to dataset, then we'll
1837 * visit the hole bp for it at some point.
1839 range
= get_next_range(inq
, range
);
1843 (dn
->dn_maxblkid
< range
->end_blkid
?
1844 dn
->dn_maxblkid
: range
->end_blkid
);
1846 * The object exists, so we need to try to find the
1847 * blkptr for each block in the range we're processing.
1849 rw_enter(&dn
->dn_struct_rwlock
, RW_READER
);
1850 for (uint64_t blkid
= range
->start_blkid
;
1851 blkid
< file_max
; blkid
++) {
1853 uint32_t datablksz
=
1854 dn
->dn_phys
->dn_datablkszsec
<<
1856 uint64_t offset
= blkid
* datablksz
;
1858 * This call finds the next non-hole block in
1859 * the object. This is to prevent a
1860 * performance problem where we're unredacting
1861 * a large hole. Using dnode_next_offset to
1862 * skip over the large hole avoids iterating
1863 * over every block in it.
1865 err
= dnode_next_offset(dn
, DNODE_FIND_HAVELOCK
,
1868 offset
= UINT64_MAX
;
1870 } else if (err
!= 0) {
1873 if (offset
!= blkid
* datablksz
) {
1875 * if there is a hole from here
1878 offset
= MIN(offset
, file_max
*
1880 uint64_t nblks
= (offset
/ datablksz
) -
1882 enqueue_range(srta
, outq
, dn
, blkid
,
1883 nblks
, NULL
, datablksz
);
1886 if (blkid
>= file_max
)
1888 err
= dbuf_dnode_findbp(dn
, 0, blkid
, &bp
,
1892 ASSERT(!BP_IS_HOLE(&bp
));
1893 enqueue_range(srta
, outq
, dn
, blkid
, 1, &bp
,
1896 rw_exit(&dn
->dn_struct_rwlock
);
1897 dnode_rele(dn
, FTAG
);
1898 range
= get_next_range(inq
, range
);
1902 if (srta
->cancel
|| err
!= 0) {
1903 smta
->cancel
= B_TRUE
;
1905 } else if (smta
->error
!= 0) {
1906 srta
->error
= smta
->error
;
1908 while (!range
->eos_marker
)
1909 range
= get_next_range(inq
, range
);
1911 bqueue_enqueue_flush(outq
, range
, 1);
1912 spl_fstrans_unmark(cookie
);
/*
 * Sentinel for dmu_send_params.numfromredactsnaps: the send is not from a
 * redaction bookmark.
 */
1916 #define NUM_SNAPS_NOT_REDACTED UINT64_MAX
/*
 * Parameters describing one send operation; read by setup_featureflags()
 * and create_begin_record(), among others.
 * NOTE(review): several members used elsewhere in the file (dp, rawok,
 * embedok, savedok, resumeobj, resumeoff, ...) are missing from this
 * extract.
 */
1918 struct dmu_send_params
{
1920 const void *tag
; // Tag dp was held with, will be used to release dp.
1922 /* To snapshot args */
1924 dsl_dataset_t
*to_ds
;
1925 /* From snapshot args */
1926 zfs_bookmark_phys_t ancestor_zb
;
1927 uint64_t *fromredactsnaps
;
1928 /* NUM_SNAPS_NOT_REDACTED if not sending from redaction bookmark */
1929 uint64_t numfromredactsnaps
;
1933 boolean_t large_block_ok
;
1934 boolean_t compressok
;
/* Copied into drr_toguid by create_begin_record() when savedok is set. */
1939 uint64_t saved_guid
;
/* Non-NULL makes setup_featureflags() set DMU_BACKUP_FEATURE_REDACTED. */
1940 zfs_bookmark_phys_t
*redactbook
;
1941 /* Stream output params */
1942 dmu_send_outparams_t
*dso
;
1944 /* Stream progress params */
/* Copied into drr_toname by create_begin_record() when savedok is set. */
1947 char saved_toname
[MAXNAMELEN
];
/*
 * Accumulate the DMU_BACKUP_FEATURE_* bits for this send into
 * *featureflags, based on the send options in dspp and on which features
 * are active on the dataset and pool. Returns EINVAL if the ZPL version
 * property cannot be read.
 * NOTE(review): the declaration of `version` and the final return are
 * missing from this extract.
 */
1951 setup_featureflags(struct dmu_send_params
*dspp
, objset_t
*os
,
1952 uint64_t *featureflags
)
1954 dsl_dataset_t
*to_ds
= dspp
->to_ds
;
1955 dsl_pool_t
*dp
= dspp
->dp
;
/* ZPL datasets at or above ZPL_VERSION_SA advertise SA_SPILL. */
1957 if (dmu_objset_type(os
) == DMU_OST_ZFS
) {
1959 if (zfs_get_zplprop(os
, ZFS_PROP_VERSION
, &version
) != 0)
1960 return (SET_ERROR(EINVAL
));
1962 if (version
>= ZPL_VERSION_SA
)
1963 *featureflags
|= DMU_BACKUP_FEATURE_SA_SPILL
;
1967 /* raw sends imply large_block_ok */
1968 if ((dspp
->rawok
|| dspp
->large_block_ok
) &&
1969 dsl_dataset_feature_is_active(to_ds
, SPA_FEATURE_LARGE_BLOCKS
)) {
1970 *featureflags
|= DMU_BACKUP_FEATURE_LARGE_BLOCKS
;
1973 /* encrypted datasets will not have embedded blocks */
1974 if ((dspp
->embedok
|| dspp
->rawok
) && !os
->os_encrypted
&&
1975 spa_feature_is_active(dp
->dp_spa
, SPA_FEATURE_EMBEDDED_DATA
)) {
1976 *featureflags
|= DMU_BACKUP_FEATURE_EMBED_DATA
;
1979 /* raw send implies compressok */
1980 if (dspp
->compressok
|| dspp
->rawok
)
1981 *featureflags
|= DMU_BACKUP_FEATURE_COMPRESSED
;
/* The RAW feature bit is only set for encrypted object sets. */
1983 if (dspp
->rawok
&& os
->os_encrypted
)
1984 *featureflags
|= DMU_BACKUP_FEATURE_RAW
;
/* LZ4 is flagged when any of EMBED_DATA, COMPRESSED or RAW is set and the pool feature is active. */
1986 if ((*featureflags
&
1987 (DMU_BACKUP_FEATURE_EMBED_DATA
| DMU_BACKUP_FEATURE_COMPRESSED
|
1988 DMU_BACKUP_FEATURE_RAW
)) != 0 &&
1989 spa_feature_is_active(dp
->dp_spa
, SPA_FEATURE_LZ4_COMPRESS
)) {
1990 *featureflags
|= DMU_BACKUP_FEATURE_LZ4
;
1994 * We specifically do not include DMU_BACKUP_FEATURE_EMBED_DATA here to
1995 * allow sending ZSTD compressed datasets to a receiver that does not
1998 if ((*featureflags
&
1999 (DMU_BACKUP_FEATURE_COMPRESSED
| DMU_BACKUP_FEATURE_RAW
)) != 0 &&
2000 dsl_dataset_feature_is_active(to_ds
, SPA_FEATURE_ZSTD_COMPRESS
)) {
2001 *featureflags
|= DMU_BACKUP_FEATURE_ZSTD
;
/* A non-zero resume object or offset marks the stream as resuming. */
2004 if (dspp
->resumeobj
!= 0 || dspp
->resumeoff
!= 0) {
2005 *featureflags
|= DMU_BACKUP_FEATURE_RESUMING
;
2008 if (dspp
->redactbook
!= NULL
) {
2009 *featureflags
|= DMU_BACKUP_FEATURE_REDACTED
;
2012 if (dsl_dataset_feature_is_active(to_ds
, SPA_FEATURE_LARGE_DNODE
)) {
2013 *featureflags
|= DMU_BACKUP_FEATURE_LARGE_DNODE
;
/*
 * Allocate a zeroed dmu_replay_record_t and fill it in as the DRR_BEGIN
 * record for this send: magic, creation time, objset type, to/from guids,
 * stream header type, feature flags, flags and the destination name.
 * NOTE(review): the kmem_zalloc flag argument, the condition guarding
 * DRR_FLAG_CLONE, the else between the savedok branch and
 * dsl_dataset_name(), and the return are missing from this extract.
 */
2018 static dmu_replay_record_t
*
2019 create_begin_record(struct dmu_send_params
*dspp
, objset_t
*os
,
2020 uint64_t featureflags
)
2022 dmu_replay_record_t
*drr
= kmem_zalloc(sizeof (dmu_replay_record_t
),
2024 drr
->drr_type
= DRR_BEGIN
;
2026 struct drr_begin
*drrb
= &drr
->drr_u
.drr_begin
;
2027 dsl_dataset_t
*to_ds
= dspp
->to_ds
;
2029 drrb
->drr_magic
= DMU_BACKUP_MAGIC
;
2030 drrb
->drr_creation_time
= dsl_dataset_phys(to_ds
)->ds_creation_time
;
2031 drrb
->drr_type
= dmu_objset_type(os
);
2032 drrb
->drr_toguid
= dsl_dataset_phys(to_ds
)->ds_guid
;
2033 drrb
->drr_fromguid
= dspp
->ancestor_zb
.zbm_guid
;
2035 DMU_SET_STREAM_HDRTYPE(drrb
->drr_versioninfo
, DMU_SUBSTREAM
);
2036 DMU_SET_FEATUREFLAGS(drrb
->drr_versioninfo
, featureflags
);
2039 drrb
->drr_flags
|= DRR_FLAG_CLONE
;
2040 if (dsl_dataset_phys(dspp
->to_ds
)->ds_flags
& DS_FLAG_CI_DATASET
)
2041 drrb
->drr_flags
|= DRR_FLAG_CI_DATA
;
2042 if (zfs_send_set_freerecords_bit
)
2043 drrb
->drr_flags
|= DRR_FLAG_FREERECORDS
;
/* Same field as drrb->drr_flags, reached through drr. */
2044 drr
->drr_u
.drr_begin
.drr_flags
|= DRR_FLAG_SPILL_BLOCK
;
/* With savedok, the saved guid and name replace the dataset's own. */
2046 if (dspp
->savedok
) {
2047 drrb
->drr_toguid
= dspp
->saved_guid
;
2048 strlcpy(drrb
->drr_toname
, dspp
->saved_toname
,
2049 sizeof (drrb
->drr_toname
));
2051 dsl_dataset_name(to_ds
, drrb
->drr_toname
);
/* A non-snapshot source gets the "@--head--" suffix appended to its name. */
2052 if (!to_ds
->ds_is_snapshot
) {
2053 (void) strlcat(drrb
->drr_toname
, "@--head--",
2054 sizeof (drrb
->drr_toname
));
/*
 * Initialise to_arg (queue, error and cancel state, fromtxg, traverse
 * flags, block counter) and start send_traverse_thread on it.
 * NOTE(review): original lines 2069 and 2072 are missing from this
 * extract; presumably they store to_os and test rawok before
 * TRAVERSE_NO_DECRYPT is added. Confirm against the full file.
 */
2061 setup_to_thread(struct send_thread_arg
*to_arg
, objset_t
*to_os
,
2062 dmu_sendstatus_t
*dssp
, uint64_t fromtxg
, boolean_t rawok
)
/* The queue is sized to at least twice the maximum record size. */
2064 VERIFY0(bqueue_init(&to_arg
->q
, zfs_send_no_prefetch_queue_ff
,
2065 MAX(zfs_send_no_prefetch_queue_length
, 2 * zfs_max_recordsize
),
2066 offsetof(struct send_range
, ln
)));
2067 to_arg
->error_code
= 0;
2068 to_arg
->cancel
= B_FALSE
;
2070 to_arg
->fromtxg
= fromtxg
;
2071 to_arg
->flags
= TRAVERSE_PRE
| TRAVERSE_PREFETCH_METADATA
;
2073 to_arg
->flags
|= TRAVERSE_NO_DECRYPT
;
2074 if (zfs_send_corrupt_data
)
2075 to_arg
->flags
|= TRAVERSE_HARD
;
/* Visited-block counts go into the send status's dss_blocks. */
2076 to_arg
->num_blocks_visited
= &dssp
->dss_blocks
;
2077 (void) thread_create(NULL
, 0, send_traverse_thread
, to_arg
, 0,
2078 curproc
, TS_RUN
, minclsyspri
);
2082 setup_from_thread(struct redact_list_thread_arg
*from_arg
,
2083 redaction_list_t
*from_rl
, dmu_sendstatus_t
*dssp
)
2085 VERIFY0(bqueue_init(&from_arg
->q
, zfs_send_no_prefetch_queue_ff
,
2086 MAX(zfs_send_no_prefetch_queue_length
, 2 * zfs_max_recordsize
),
2087 offsetof(struct send_range
, ln
)));
2088 from_arg
->error_code
= 0;
2089 from_arg
->cancel
= B_FALSE
;
2090 from_arg
->rl
= from_rl
;
2091 from_arg
->mark_redact
= B_FALSE
;
2092 from_arg
->num_blocks_visited
= &dssp
->dss_blocks
;
2094 * If from_rl is NULL, redact_list_thread skips the traversal and just
2095 * enqueues an eos marker.
2097 (void) thread_create(NULL
, 0, redact_list_thread
, from_arg
, 0,
2098 curproc
, TS_RUN
, minclsyspri
);
/*
 * Initialise rlt_arg for the redaction-list thread (mark_redact = B_TRUE)
 * and start redact_list_thread on it. The redactbook == NULL check at the
 * top guards the setup.
 * NOTE(review): the statement under that check (presumably an early
 * return) and original line 2114 (presumably the store of rl) are missing
 * from this extract.
 */
2102 setup_redact_list_thread(struct redact_list_thread_arg
*rlt_arg
,
2103 struct dmu_send_params
*dspp
, redaction_list_t
*rl
, dmu_sendstatus_t
*dssp
)
2105 if (dspp
->redactbook
== NULL
)
2108 rlt_arg
->cancel
= B_FALSE
;
/* The queue is sized to at least twice the maximum record size. */
2109 VERIFY0(bqueue_init(&rlt_arg
->q
, zfs_send_no_prefetch_queue_ff
,
2110 MAX(zfs_send_no_prefetch_queue_length
, 2 * zfs_max_recordsize
),
2111 offsetof(struct send_range
, ln
)));
2112 rlt_arg
->error_code
= 0;
/* B_TRUE makes redact_list_cb emit REDACT ranges for this list. */
2113 rlt_arg
->mark_redact
= B_TRUE
;
2115 rlt_arg
->num_blocks_visited
= &dssp
->dss_blocks
;
2117 (void) thread_create(NULL
, 0, redact_list_thread
, rlt_arg
, 0,
2118 curproc
, TS_RUN
, minclsyspri
);
/*
 * setup_merge_thread: prepare smt_arg and launch send_merge_thread.
 *
 * Visible steps: initialize smt_arg->q with bqueue_init (fill fraction
 * zfs_send_no_prefetch_queue_ff, size the larger of
 * zfs_send_no_prefetch_queue_length and 2 * zfs_max_recordsize), clear
 * cancel, store from_arg and to_arg, store rlt_arg as redact_arg only when
 * dspp->redactbook is not NULL, and call thread_create.
 *
 * NOTE(review): the end of the parameter list (original line 2125) is
 * missing from this extraction; the call in dmu_send_impl passes os as a
 * sixth argument. Original lines 2131 and 2136-2137 are missing as well, so
 * any further smt_arg fields set there are not visible - confirm against the
 * full file.
 */
2122 setup_merge_thread(struct send_merge_thread_arg
*smt_arg
,
2123 struct dmu_send_params
*dspp
, struct redact_list_thread_arg
*from_arg
,
2124 struct send_thread_arg
*to_arg
, struct redact_list_thread_arg
*rlt_arg
,
2127 VERIFY0(bqueue_init(&smt_arg
->q
, zfs_send_no_prefetch_queue_ff
,
2128 MAX(zfs_send_no_prefetch_queue_length
, 2 * zfs_max_recordsize
),
2129 offsetof(struct send_range
, ln
)));
2130 smt_arg
->cancel
= B_FALSE
;
2132 smt_arg
->from_arg
= from_arg
;
2133 smt_arg
->to_arg
= to_arg
;
2134 if (dspp
->redactbook
!= NULL
)
2135 smt_arg
->redact_arg
= rlt_arg
;
2138 (void) thread_create(NULL
, 0, send_merge_thread
, smt_arg
, 0, curproc
,
2139 TS_RUN
, minclsyspri
);
2143 setup_reader_thread(struct send_reader_thread_arg
*srt_arg
,
2144 struct dmu_send_params
*dspp
, struct send_merge_thread_arg
*smt_arg
,
2145 uint64_t featureflags
)
2147 VERIFY0(bqueue_init(&srt_arg
->q
, zfs_send_queue_ff
,
2148 MAX(zfs_send_queue_length
, 2 * zfs_max_recordsize
),
2149 offsetof(struct send_range
, ln
)));
2150 srt_arg
->smta
= smt_arg
;
2151 srt_arg
->issue_reads
= !dspp
->dso
->dso_dryrun
;
2152 srt_arg
->featureflags
= featureflags
;
2153 (void) thread_create(NULL
, 0, send_reader_thread
, srt_arg
, 0,
2154 curproc
, TS_RUN
, minclsyspri
);
/*
 * setup_resume_points: position the worker threads at the resume point of a
 * resumed send and record that point in the BEGIN nvlist.
 *
 * Visible steps: take obj from dspp->resumeobj, query it with
 * dmu_object_info on os, and derive blkid as dspp->resumeoff divided by the
 * data block size of that object. When redact_rl is not NULL the redact list
 * thread resume bookmark is set; the to thread resume bookmark is set with
 * the same object and block id; when nvl contains BEGINNV_REDACT_FROM_SNAPS
 * the from thread resume bookmark is set using
 * dspp->ancestor_zb.zbm_redaction_obj as the objset. Finally
 * BEGINNV_RESUME_OBJECT and BEGINNV_RESUME_OFFSET are added to nvl.
 *
 * NOTE(review): the embedded numbering skips the declarations of obj, blkid
 * and err, every condition on resuming, the handling of the dmu_object_info
 * result, and the return statement, so nesting and error behaviour cannot be
 * read from this extraction. The smt_arg parameter has no visible use.
 * The division assumes doi_data_block_size is nonzero - confirm.
 */
2158 setup_resume_points(struct dmu_send_params
*dspp
,
2159 struct send_thread_arg
*to_arg
, struct redact_list_thread_arg
*from_arg
,
2160 struct redact_list_thread_arg
*rlt_arg
,
2161 struct send_merge_thread_arg
*smt_arg
, boolean_t resuming
, objset_t
*os
,
2162 redaction_list_t
*redact_rl
, nvlist_t
*nvl
)
2165 dsl_dataset_t
*to_ds
= dspp
->to_ds
;
2171 obj
= dspp
->resumeobj
;
2172 dmu_object_info_t to_doi
;
2173 err
= dmu_object_info(os
, obj
, &to_doi
);
2177 blkid
= dspp
->resumeoff
/ to_doi
.doi_data_block_size
;
2180 * If we're resuming a redacted send, we can skip to the appropriate
2181 * point in the redaction bookmark by binary searching through it.
2183 if (redact_rl
!= NULL
) {
2184 SET_BOOKMARK(&rlt_arg
->resume
, to_ds
->ds_object
, obj
, 0, blkid
);
2187 SET_BOOKMARK(&to_arg
->resume
, to_ds
->ds_object
, obj
, 0, blkid
);
2188 if (nvlist_exists(nvl
, BEGINNV_REDACT_FROM_SNAPS
)) {
2189 uint64_t objset
= dspp
->ancestor_zb
.zbm_redaction_obj
;
2191 * Note: If the resume point is in an object whose
2192 * blocksize is different in the from vs to snapshots,
2193 * we will have divided by the "wrong" blocksize.
2194 * However, in this case fromsnap's send_cb() will
2195 * detect that the blocksize has changed and therefore
2196 * ignore this object.
2198 * If we're resuming a send from a redaction bookmark,
2199 * we still cannot accidentally suggest blocks behind
2200 * the to_ds. In addition, we know that any blocks in
2201 * the object in the to_ds will have to be sent, since
2202 * the size changed. Therefore, we can't cause any harm
2205 SET_BOOKMARK(&from_arg
->resume
, objset
, obj
, 0, blkid
);
2208 fnvlist_add_uint64(nvl
, BEGINNV_RESUME_OBJECT
, dspp
->resumeobj
);
2209 fnvlist_add_uint64(nvl
, BEGINNV_RESUME_OFFSET
, dspp
->resumeoff
);
2214 static dmu_sendstatus_t
*
2215 setup_send_progress(struct dmu_send_params
*dspp
)
2217 dmu_sendstatus_t
*dssp
= kmem_zalloc(sizeof (*dssp
), KM_SLEEP
);
2218 dssp
->dss_outfd
= dspp
->outfd
;
2219 dssp
->dss_off
= dspp
->off
;
2220 dssp
->dss_proc
= curproc
;
2221 mutex_enter(&dspp
->to_ds
->ds_sendstream_lock
);
2222 list_insert_head(&dspp
->to_ds
->ds_sendstreams
, dssp
);
2223 mutex_exit(&dspp
->to_ds
->ds_sendstream_lock
);
2228 * Actually do the bulk of the work in a zfs send.
2230 * The idea is that we want to do a send from ancestor_zb to to_ds. We also
2231 * want to not send any data that has been modified by all the datasets in
2232 * redactsnaparr, and store the list of blocks that are redacted in this way in
2233 * a bookmark named redactbook, created on the to_ds. We do this by creating
2234 * several worker threads, whose function is described below.
2236 * There are three cases.
2237 * The first case is a redacted zfs send. In this case there are 5 threads.
2238 * The first thread is the to_ds traversal thread: it calls dataset_traverse on
2239 * the to_ds and finds all the blocks that have changed since ancestor_zb (if
2240 * it's a full send, that's all blocks in the dataset). It then sends those
2241 * blocks on to the send merge thread. The redact list thread takes the data
2242 * from the redaction bookmark and sends those blocks on to the send merge
2243 * thread. The send merge thread takes the data from the to_ds traversal
2244 * thread, and combines it with the redaction records from the redact list
2245 * thread. If a block appears in both the to_ds's data and the redaction data,
2246 * the send merge thread will mark it as redacted and send it on to the prefetch
2247 * thread. Otherwise, the send merge thread will send the block on to the
2248 * prefetch thread unchanged. The prefetch thread will issue prefetch reads for
2249 * any data that isn't redacted, and then send the data on to the main thread.
2250 * The main thread behaves the same as in a normal send case, issuing demand
2251 * reads for data blocks and sending out records over the network.
2253 * The graphic below diagrams the flow of data in the case of a redacted zfs
2254 * send. Each box represents a thread, and each line represents the flow of
2257 * Records from the |
2258 * redaction bookmark |
2259 * +--------------------+ | +---------------------------+
2260 * | | v | Send Merge Thread |
2261 * | Redact List Thread +----------> Apply redaction marks to |
2262 * | | | records as specified by |
2263 * +--------------------+ | redaction ranges |
2264 * +----^---------------+------+
2267 * | +------------v--------+
2268 * | | Prefetch Thread |
2269 * +--------------------+ | | Issues prefetch |
2270 * | to_ds Traversal | | | reads of data blocks|
2271 * | Thread (finds +---------------+ +------------+--------+
2272 * | candidate blocks) | Blocks modified | Prefetched data
2273 * +--------------------+ by to_ds since |
2274 * ancestor_zb +------------v----+
2275 * | Main Thread | File Descriptor
2276 * | Sends data over +->(to zfs receive)
2278 * +-----------------+
2280 * The second case is an incremental send from a redaction bookmark. The to_ds
2281 * traversal thread and the main thread behave the same as in the redacted
2282 * send case. The new thread is the from bookmark traversal thread. It
2283 * iterates over the redaction list in the redaction bookmark, and enqueues
2284 * records for each block that was redacted in the original send. The send
2285 * merge thread now has to merge the data from the two threads. For details
2286 * about that process, see the header comment of send_merge_thread(). Any data
2287 * it decides to send on will be prefetched by the prefetch thread. Note that
2288 * you can perform a redacted send from a redaction bookmark; in that case,
2289 * the data flow behaves very similarly to the flow in the redacted send case,
2290 * except with the addition of the bookmark traversal thread iterating over the
2291 * redaction bookmark. The send_merge_thread also has to take on the
2292 * responsibility of merging the redact list thread's records, the bookmark
2293 * traversal thread's records, and the to_ds records.
2295 * +---------------------+
2297 * | Redact List Thread +--------------+
2299 * +---------------------+ |
2300 * Blocks in redaction list | Ranges modified by every secure snap
2301 * of from bookmark | (or EOS if not redacted)
2303 * +---------------------+ | +----v----------------------+
2304 * | bookmark Traversal | v | Send Merge Thread |
2305 * | Thread (finds +---------> Merges bookmark, rlt, and |
2306 * | candidate blocks) | | to_ds send records |
2307 * +---------------------+ +----^---------------+------+
2309 * | +------------v--------+
2310 * | | Prefetch Thread |
2311 * +--------------------+ | | Issues prefetch |
2312 * | to_ds Traversal | | | reads of data blocks|
2313 * | Thread (finds +---------------+ +------------+--------+
2314 * | candidate blocks) | Blocks modified | Prefetched data
2315 * +--------------------+ by to_ds since +------------v----+
2316 * ancestor_zb | Main Thread | File Descriptor
2317 * | Sends data over +->(to zfs receive)
2319 * +-----------------+
2321 * The final case is a simple zfs full or incremental send. The to_ds traversal
2322 * thread behaves the same as always. The redact list thread is never started.
2323 * The send merge thread takes all the blocks that the to_ds traversal thread
2324 * sends it, prefetches the data, and sends the blocks on to the main thread.
2325 * The main thread sends the data over the wire.
2327 * To keep performance acceptable, we want to prefetch the data in the worker
2328 * threads. While the to_ds thread could simply use the TRAVERSE_PREFETCH
2329 * feature built into traverse_dataset, the combining and deletion of records
2330 * due to redaction and sends from redaction bookmarks mean that we could
2331 * issue many unnecessary prefetches. As a result, we only prefetch data
2332 * after we've determined that the record is not going to be redacted. To
2333 * prevent the prefetching from getting too far ahead of the main thread, the
2334 * blocking queues that are used for communication are capped not by the
2335 * number of entries in the queue, but by the sum of the size of the
2336 * prefetches associated with them. The limit on the amount of data that the
2337 * thread can prefetch beyond what the main thread has reached is controlled
2338 * by the global variable zfs_send_queue_length. In addition, to prevent poor
2339 * performance in the beginning of a send, we also limit the distance ahead
2340 * that the traversal threads can be. That distance is controlled by the
2341 * zfs_send_no_prefetch_queue_length tunable.
2343 * Note: Releases dp using the specified tag.
/*
 * dmu_send_impl: drive one send stream described by dspp.
 *
 * Phases visible in this extraction, in order:
 *  - look up the objset of dspp->to_ds (dmu_objset_from_ds);
 *  - for a non-raw send of an encrypted objset whose os_phys_buf is still
 *    unauthenticated, call arc_untransform on it;
 *  - compute featureflags (setup_featureflags);
 *  - hold and long-hold the redaction list of dspp->redactbook (redact_rl)
 *    and, when ancestor_zb->zbm_redaction_obj is nonzero, the redaction list
 *    of the from bookmark (from_rl); the failure path of the second hold
 *    releases redact_rl before returning EINVAL;
 *  - long-hold to_ds, allocate the five thread argument structs, build the
 *    BEGIN record (create_begin_record), register progress state
 *    (setup_send_progress) and fill in the dmu_send_cookie_t;
 *  - release the pool hold with dsl_pool_rele(dp, tag);
 *  - assemble the BEGIN payload nvlist (BEGINNV_REDACT_SNAPS,
 *    BEGINNV_REDACT_FROM_SNAPS, resume points via setup_resume_points, and
 *    for raw streams the crypt_keydata nvlist), pack it and emit it with
 *    dump_record;
 *  - start the to, from, redact list, merge and reader threads;
 *  - dequeue ranges from srt_arg->q and pass each to do_dump until the eos
 *    marker is seen, an error occurs, or a signal is pending (EINTR);
 *  - on the cancel path set srt_arg->cancel and drain the queue up to the eos
 *    marker; destroy the queues (rlt_arg->q only when redactbook is set);
 *  - pick up srt_arg->error, flush any pending op, and unless dspp->savedok
 *    emit the DRR_END record carrying the checksum and the to guid;
 *  - remove the progress state from to_ds->ds_sendstreams, free the
 *    allocations and drop the long holds and redaction list holds.
 *
 * NOTE(review): the embedded line numbers skip many original lines (the
 * declarations of os and err, most error checks, labels, gotos and returns),
 * so the exact control flow between these phases cannot be read from here.
 * NOTE(review): ancestor_zb is assigned &dspp->ancestor_zb, so the
 * ancestor_zb != NULL test feeding ivset_guid looks always true - confirm.
 */
2346 dmu_send_impl(struct dmu_send_params
*dspp
)
2349 dmu_replay_record_t
*drr
;
2350 dmu_sendstatus_t
*dssp
;
2351 dmu_send_cookie_t dsc
= {0};
2353 uint64_t fromtxg
= dspp
->ancestor_zb
.zbm_creation_txg
;
2354 uint64_t featureflags
= 0;
2355 struct redact_list_thread_arg
*from_arg
;
2356 struct send_thread_arg
*to_arg
;
2357 struct redact_list_thread_arg
*rlt_arg
;
2358 struct send_merge_thread_arg
*smt_arg
;
2359 struct send_reader_thread_arg
*srt_arg
;
2360 struct send_range
*range
;
2361 redaction_list_t
*from_rl
= NULL
;
2362 redaction_list_t
*redact_rl
= NULL
;
2363 boolean_t resuming
= (dspp
->resumeobj
!= 0 || dspp
->resumeoff
!= 0);
2364 boolean_t book_resuming
= resuming
;
2366 dsl_dataset_t
*to_ds
= dspp
->to_ds
;
2367 zfs_bookmark_phys_t
*ancestor_zb
= &dspp
->ancestor_zb
;
2368 dsl_pool_t
*dp
= dspp
->dp
;
2369 const void *tag
= dspp
->tag
;
2371 err
= dmu_objset_from_ds(to_ds
, &os
);
2373 dsl_pool_rele(dp
, tag
);
2378 * If this is a non-raw send of an encrypted ds, we can ensure that
2379 * the objset_phys_t is authenticated. This is safe because this is
2380 * either a snapshot or we have owned the dataset, ensuring that
2381 * it can't be modified.
2383 if (!dspp
->rawok
&& os
->os_encrypted
&&
2384 arc_is_unauthenticated(os
->os_phys_buf
)) {
2385 zbookmark_phys_t zb
;
2387 SET_BOOKMARK(&zb
, to_ds
->ds_object
, ZB_ROOT_OBJECT
,
2388 ZB_ROOT_LEVEL
, ZB_ROOT_BLKID
);
2389 err
= arc_untransform(os
->os_phys_buf
, os
->os_spa
,
2392 dsl_pool_rele(dp
, tag
);
2396 ASSERT0(arc_is_unauthenticated(os
->os_phys_buf
));
2399 if ((err
= setup_featureflags(dspp
, os
, &featureflags
)) != 0) {
2400 dsl_pool_rele(dp
, tag
);
2405 * If we're doing a redacted send, hold the bookmark's redaction list.
2407 if (dspp
->redactbook
!= NULL
) {
2408 err
= dsl_redaction_list_hold_obj(dp
,
2409 dspp
->redactbook
->zbm_redaction_obj
, FTAG
,
2412 dsl_pool_rele(dp
, tag
);
2413 return (SET_ERROR(EINVAL
));
2415 dsl_redaction_list_long_hold(dp
, redact_rl
, FTAG
);
2419 * If we're sending from a redaction bookmark, hold the redaction list
2420 * so that we can consider sending the redacted blocks.
2422 if (ancestor_zb
->zbm_redaction_obj
!= 0) {
2423 err
= dsl_redaction_list_hold_obj(dp
,
2424 ancestor_zb
->zbm_redaction_obj
, FTAG
, &from_rl
);
2426 if (redact_rl
!= NULL
) {
2427 dsl_redaction_list_long_rele(redact_rl
, FTAG
);
2428 dsl_redaction_list_rele(redact_rl
, FTAG
);
2430 dsl_pool_rele(dp
, tag
);
2431 return (SET_ERROR(EINVAL
));
2433 dsl_redaction_list_long_hold(dp
, from_rl
, FTAG
);
2436 dsl_dataset_long_hold(to_ds
, FTAG
);
2438 from_arg
= kmem_zalloc(sizeof (*from_arg
), KM_SLEEP
);
2439 to_arg
= kmem_zalloc(sizeof (*to_arg
), KM_SLEEP
);
2440 rlt_arg
= kmem_zalloc(sizeof (*rlt_arg
), KM_SLEEP
);
2441 smt_arg
= kmem_zalloc(sizeof (*smt_arg
), KM_SLEEP
);
2442 srt_arg
= kmem_zalloc(sizeof (*srt_arg
), KM_SLEEP
);
2444 drr
= create_begin_record(dspp
, os
, featureflags
);
2445 dssp
= setup_send_progress(dspp
);
2448 dsc
.dsc_dso
= dspp
->dso
;
2450 dsc
.dsc_off
= dspp
->off
;
2451 dsc
.dsc_toguid
= dsl_dataset_phys(to_ds
)->ds_guid
;
2452 dsc
.dsc_fromtxg
= fromtxg
;
2453 dsc
.dsc_pending_op
= PENDING_NONE
;
2454 dsc
.dsc_featureflags
= featureflags
;
2455 dsc
.dsc_resume_object
= dspp
->resumeobj
;
2456 dsc
.dsc_resume_offset
= dspp
->resumeoff
;
2458 dsl_pool_rele(dp
, tag
);
2460 void *payload
= NULL
;
2461 size_t payload_len
= 0;
2462 nvlist_t
*nvl
= fnvlist_alloc();
2465 * If we're doing a redacted send, we include the snapshots we're
2466 * redacted with respect to so that the target system knows what send
2467 * streams can be correctly received on top of this dataset. If we're
2468 * instead sending a redacted dataset, we include the snapshots that the
2469 * dataset was created with respect to.
2471 if (dspp
->redactbook
!= NULL
) {
2472 fnvlist_add_uint64_array(nvl
, BEGINNV_REDACT_SNAPS
,
2473 redact_rl
->rl_phys
->rlp_snaps
,
2474 redact_rl
->rl_phys
->rlp_num_snaps
);
2475 } else if (dsl_dataset_feature_is_active(to_ds
,
2476 SPA_FEATURE_REDACTED_DATASETS
)) {
2477 uint64_t *tods_guids
;
2479 VERIFY(dsl_dataset_get_uint64_array_feature(to_ds
,
2480 SPA_FEATURE_REDACTED_DATASETS
, &length
, &tods_guids
));
2481 fnvlist_add_uint64_array(nvl
, BEGINNV_REDACT_SNAPS
, tods_guids
,
2486 * If we're sending from a redaction bookmark, then we should retrieve
2487 * the guids of that bookmark so we can send them over the wire.
2489 if (from_rl
!= NULL
) {
2490 fnvlist_add_uint64_array(nvl
, BEGINNV_REDACT_FROM_SNAPS
,
2491 from_rl
->rl_phys
->rlp_snaps
,
2492 from_rl
->rl_phys
->rlp_num_snaps
);
2496 * If the snapshot we're sending from is redacted, include the redaction
2497 * list in the stream.
2499 if (dspp
->numfromredactsnaps
!= NUM_SNAPS_NOT_REDACTED
) {
2500 ASSERT3P(from_rl
, ==, NULL
);
2501 fnvlist_add_uint64_array(nvl
, BEGINNV_REDACT_FROM_SNAPS
,
2502 dspp
->fromredactsnaps
, (uint_t
)dspp
->numfromredactsnaps
);
2503 if (dspp
->numfromredactsnaps
> 0) {
2504 kmem_free(dspp
->fromredactsnaps
,
2505 dspp
->numfromredactsnaps
* sizeof (uint64_t));
2506 dspp
->fromredactsnaps
= NULL
;
2510 if (resuming
|| book_resuming
) {
2511 err
= setup_resume_points(dspp
, to_arg
, from_arg
,
2512 rlt_arg
, smt_arg
, resuming
, os
, redact_rl
, nvl
);
2517 if (featureflags
& DMU_BACKUP_FEATURE_RAW
) {
2518 uint64_t ivset_guid
= (ancestor_zb
!= NULL
) ?
2519 ancestor_zb
->zbm_ivset_guid
: 0;
2520 nvlist_t
*keynvl
= NULL
;
2521 ASSERT(os
->os_encrypted
);
2523 err
= dsl_crypto_populate_key_nvlist(os
, ivset_guid
,
2530 fnvlist_add_nvlist(nvl
, "crypt_keydata", keynvl
);
2531 fnvlist_free(keynvl
);
2534 if (!nvlist_empty(nvl
)) {
2535 payload
= fnvlist_pack(nvl
, &payload_len
);
2536 drr
->drr_payloadlen
= payload_len
;
2540 err
= dump_record(&dsc
, payload
, payload_len
);
2541 fnvlist_pack_free(payload
, payload_len
);
2547 setup_to_thread(to_arg
, os
, dssp
, fromtxg
, dspp
->rawok
);
2548 setup_from_thread(from_arg
, from_rl
, dssp
);
2549 setup_redact_list_thread(rlt_arg
, dspp
, redact_rl
, dssp
);
2550 setup_merge_thread(smt_arg
, dspp
, from_arg
, to_arg
, rlt_arg
, os
);
2551 setup_reader_thread(srt_arg
, dspp
, smt_arg
, featureflags
);
2553 range
= bqueue_dequeue(&srt_arg
->q
);
2554 while (err
== 0 && !range
->eos_marker
) {
2555 err
= do_dump(&dsc
, range
);
2556 range
= get_next_range(&srt_arg
->q
, range
);
2557 if (issig(JUSTLOOKING
) && issig(FORREAL
))
2558 err
= SET_ERROR(EINTR
);
2562 * If we hit an error or are interrupted, cancel our worker threads and
2563 * clear the queue of any pending records. The threads will pass the
2564 * cancel up the tree of worker threads, and each one will clean up any
2565 * pending records before exiting.
2568 srt_arg
->cancel
= B_TRUE
;
2569 while (!range
->eos_marker
) {
2570 range
= get_next_range(&srt_arg
->q
, range
);
2575 bqueue_destroy(&srt_arg
->q
);
2576 bqueue_destroy(&smt_arg
->q
);
2577 if (dspp
->redactbook
!= NULL
)
2578 bqueue_destroy(&rlt_arg
->q
);
2579 bqueue_destroy(&to_arg
->q
);
2580 bqueue_destroy(&from_arg
->q
);
2582 if (err
== 0 && srt_arg
->error
!= 0)
2583 err
= srt_arg
->error
;
2588 if (dsc
.dsc_pending_op
!= PENDING_NONE
)
2589 if (dump_record(&dsc
, NULL
, 0) != 0)
2590 err
= SET_ERROR(EINTR
);
2593 if (err
== EINTR
&& dsc
.dsc_err
!= 0)
2599 * Send the DRR_END record if this is not a saved stream.
2600 * Otherwise, the omitted DRR_END record will signal to
2601 * the receive side that the stream is incomplete.
2603 if (!dspp
->savedok
) {
2604 memset(drr
, 0, sizeof (dmu_replay_record_t
));
2605 drr
->drr_type
= DRR_END
;
2606 drr
->drr_u
.drr_end
.drr_checksum
= dsc
.dsc_zc
;
2607 drr
->drr_u
.drr_end
.drr_toguid
= dsc
.dsc_toguid
;
2609 if (dump_record(&dsc
, NULL
, 0) != 0)
2613 mutex_enter(&to_ds
->ds_sendstream_lock
);
2614 list_remove(&to_ds
->ds_sendstreams
, dssp
);
2615 mutex_exit(&to_ds
->ds_sendstream_lock
);
2617 VERIFY(err
!= 0 || (dsc
.dsc_sent_begin
&&
2618 (dsc
.dsc_sent_end
|| dspp
->savedok
)));
2620 kmem_free(drr
, sizeof (dmu_replay_record_t
));
2621 kmem_free(dssp
, sizeof (dmu_sendstatus_t
));
2622 kmem_free(from_arg
, sizeof (*from_arg
));
2623 kmem_free(to_arg
, sizeof (*to_arg
));
2624 kmem_free(rlt_arg
, sizeof (*rlt_arg
));
2625 kmem_free(smt_arg
, sizeof (*smt_arg
));
2626 kmem_free(srt_arg
, sizeof (*srt_arg
));
2628 dsl_dataset_long_rele(to_ds
, FTAG
);
2629 if (from_rl
!= NULL
) {
2630 dsl_redaction_list_long_rele(from_rl
, FTAG
);
2631 dsl_redaction_list_rele(from_rl
, FTAG
);
2633 if (redact_rl
!= NULL
) {
2634 dsl_redaction_list_long_rele(redact_rl
, FTAG
);
2635 dsl_redaction_list_rele(redact_rl
, FTAG
);
/*
 * dmu_send_obj: send the dataset with object number tosnap in the named pool,
 * optionally incremental from the dataset with object number fromsnap.
 *
 * Visible flow: copy embedok, large_block_ok, compressok and savedok into a
 * zeroed dmu_send_params, choose DS_HOLD_FLAG_NONE for raw sends and
 * DS_HOLD_FLAG_DECRYPT otherwise, hold the pool, then hold the to dataset
 * with dsl_dataset_hold_obj_flags. When fromsnap != 0 the from dataset is
 * held the same way; its guid, creation txg and creation time (and, if it is
 * zapified, its DS_FIELD_IVSET_GUID entry) are copied into dspp.ancestor_zb;
 * its SPA_FEATURE_REDACTED_DATASETS array is deep-copied into
 * dspp.fromredactsnaps, or numfromredactsnaps is set to
 * NUM_SNAPS_NOT_REDACTED when the feature array is absent; is_before and
 * dspp.is_clone are computed; the from dataset is released; and both an
 * EXDEV path and a dmu_send_impl path follow. When fromsnap == 0,
 * numfromredactsnaps is set to NUM_SNAPS_NOT_REDACTED and dmu_send_impl is
 * called. The to dataset is released at the end.
 *
 * NOTE(review): original lines 2654-2658 (the remaining dspp fields), the
 * declaration of err, the error checks, the condition selecting the EXDEV
 * path and the return statements are missing from this extraction.
 * NOTE(review): both datasets are held with dsflags, and the visible error
 * path releases to_ds with dsl_dataset_rele_flags, but the normal paths use
 * plain dsl_dataset_rele for fromds and to_ds. If dsflags is
 * DS_HOLD_FLAG_DECRYPT this may leave decrypt-related state unreleased -
 * confirm against dsl_dataset_rele_flags.
 * NOTE(review): the zap_lookup result for DS_FIELD_IVSET_GUID is discarded
 * with a (void) cast; presumably a missing entry is acceptable - confirm.
 */
2642 dmu_send_obj(const char *pool
, uint64_t tosnap
, uint64_t fromsnap
,
2643 boolean_t embedok
, boolean_t large_block_ok
, boolean_t compressok
,
2644 boolean_t rawok
, boolean_t savedok
, int outfd
, offset_t
*off
,
2645 dmu_send_outparams_t
*dsop
)
2648 dsl_dataset_t
*fromds
;
2649 ds_hold_flags_t dsflags
;
2650 struct dmu_send_params dspp
= {0};
2651 dspp
.embedok
= embedok
;
2652 dspp
.large_block_ok
= large_block_ok
;
2653 dspp
.compressok
= compressok
;
2659 dspp
.savedok
= savedok
;
2661 dsflags
= (rawok
) ? DS_HOLD_FLAG_NONE
: DS_HOLD_FLAG_DECRYPT
;
2662 err
= dsl_pool_hold(pool
, FTAG
, &dspp
.dp
);
2666 err
= dsl_dataset_hold_obj_flags(dspp
.dp
, tosnap
, dsflags
, FTAG
,
2669 dsl_pool_rele(dspp
.dp
, FTAG
);
2673 if (fromsnap
!= 0) {
2674 err
= dsl_dataset_hold_obj_flags(dspp
.dp
, fromsnap
, dsflags
,
2677 dsl_dataset_rele_flags(dspp
.to_ds
, dsflags
, FTAG
);
2678 dsl_pool_rele(dspp
.dp
, FTAG
);
2681 dspp
.ancestor_zb
.zbm_guid
= dsl_dataset_phys(fromds
)->ds_guid
;
2682 dspp
.ancestor_zb
.zbm_creation_txg
=
2683 dsl_dataset_phys(fromds
)->ds_creation_txg
;
2684 dspp
.ancestor_zb
.zbm_creation_time
=
2685 dsl_dataset_phys(fromds
)->ds_creation_time
;
2687 if (dsl_dataset_is_zapified(fromds
)) {
2688 (void) zap_lookup(dspp
.dp
->dp_meta_objset
,
2689 fromds
->ds_object
, DS_FIELD_IVSET_GUID
, 8, 1,
2690 &dspp
.ancestor_zb
.zbm_ivset_guid
);
2693 /* See dmu_send for the reasons behind this. */
2694 uint64_t *fromredact
;
2696 if (!dsl_dataset_get_uint64_array_feature(fromds
,
2697 SPA_FEATURE_REDACTED_DATASETS
,
2698 &dspp
.numfromredactsnaps
,
2700 dspp
.numfromredactsnaps
= NUM_SNAPS_NOT_REDACTED
;
2701 } else if (dspp
.numfromredactsnaps
> 0) {
2702 uint64_t size
= dspp
.numfromredactsnaps
*
2704 dspp
.fromredactsnaps
= kmem_zalloc(size
, KM_SLEEP
);
2705 memcpy(dspp
.fromredactsnaps
, fromredact
, size
);
2708 boolean_t is_before
=
2709 dsl_dataset_is_before(dspp
.to_ds
, fromds
, 0);
2710 dspp
.is_clone
= (dspp
.to_ds
->ds_dir
!=
2712 dsl_dataset_rele(fromds
, FTAG
);
2714 dsl_pool_rele(dspp
.dp
, FTAG
);
2715 err
= SET_ERROR(EXDEV
);
2717 err
= dmu_send_impl(&dspp
);
2720 dspp
.numfromredactsnaps
= NUM_SNAPS_NOT_REDACTED
;
2721 err
= dmu_send_impl(&dspp
);
2723 dsl_dataset_rele(dspp
.to_ds
, FTAG
);
/*
 * dmu_send: name-based entry point for a send of tosnap, optionally
 * incremental from fromsnap and optionally redacted with redactbook.
 *
 * Visible flow:
 *  - choose DS_HOLD_FLAG_NONE for raw sends, DS_HOLD_FLAG_DECRYPT otherwise,
 *    and copy the options and resume point into a zeroed dmu_send_params;
 *  - reject with EINVAL a fromsnap that contains neither @ nor #;
 *  - hold the pool named by tosnap;
 *  - when tosnap has no @ and the pool is writeable, take ownership of the
 *    dataset: the saved-stream path tries dsl_dataset_own_force on a name
 *    built from tosnap with kmem_asprintf, retries on tosnap itself after
 *    ENOENT, then looks up DS_FIELD_RESUME_TOGUID and DS_FIELD_RESUME_TONAME
 *    (the latter into dspp.saved_toname) and disowns on the failure path;
 *    the other path uses dsl_dataset_own. Otherwise the dataset is held with
 *    dsl_dataset_hold_flags;
 *  - when redactbook is given, copy tosnap into a local path buffer, locate
 *    the @ in it, overwrite from there with # followed by the bookmark name
 *    via snprintf, look the bookmark up and point dspp.redactbook at it;
 *  - when fromsnap is given, compare the filesystem part of tosnap with
 *    fromsnap to set dspp.is_clone; for a snapshot fromsnap, hold it,
 *    deep-copy its redaction snapshot array, check dsl_dataset_is_before
 *    (EXDEV otherwise) and fill the ancestor bookmark from its phys data;
 *    for a bookmark fromsnap, use dsl_bookmark_lookup; then call
 *    dmu_send_impl, which releases the pool hold;
 *  - without fromsnap, set numfromredactsnaps to NUM_SNAPS_NOT_REDACTED and
 *    call dmu_send_impl;
 *  - finally disown or release to_ds.
 *
 * NOTE(review): the embedded numbering skips many original lines (several
 * dspp field assignments, the declarations of err, owned-related branches,
 * fsnamelen and size, most error checks and the returns), so branch nesting
 * and which release call pairs with which acquire cannot be read from this
 * extraction.
 */
2728 dmu_send(const char *tosnap
, const char *fromsnap
, boolean_t embedok
,
2729 boolean_t large_block_ok
, boolean_t compressok
, boolean_t rawok
,
2730 boolean_t savedok
, uint64_t resumeobj
, uint64_t resumeoff
,
2731 const char *redactbook
, int outfd
, offset_t
*off
,
2732 dmu_send_outparams_t
*dsop
)
2735 ds_hold_flags_t dsflags
;
2736 boolean_t owned
= B_FALSE
;
2737 dsl_dataset_t
*fromds
= NULL
;
2738 zfs_bookmark_phys_t book
= {0};
2739 struct dmu_send_params dspp
= {0};
2741 dsflags
= (rawok
) ? DS_HOLD_FLAG_NONE
: DS_HOLD_FLAG_DECRYPT
;
2742 dspp
.tosnap
= tosnap
;
2743 dspp
.embedok
= embedok
;
2744 dspp
.large_block_ok
= large_block_ok
;
2745 dspp
.compressok
= compressok
;
2750 dspp
.resumeobj
= resumeobj
;
2751 dspp
.resumeoff
= resumeoff
;
2753 dspp
.savedok
= savedok
;
2755 if (fromsnap
!= NULL
&& strpbrk(fromsnap
, "@#") == NULL
)
2756 return (SET_ERROR(EINVAL
));
2758 err
= dsl_pool_hold(tosnap
, FTAG
, &dspp
.dp
);
2762 if (strchr(tosnap
, '@') == NULL
&& spa_writeable(dspp
.dp
->dp_spa
)) {
2764 * We are sending a filesystem or volume. Ensure
2765 * that it doesn't change by owning the dataset.
2770 * We are looking for the dataset that represents the
2771 * partially received send stream. If this stream was
2772 * received as a new snapshot of an existing dataset,
2773 * this will be saved in a hidden clone named
2774 * "<pool>/<dataset>/%recv". Otherwise, the stream
2775 * will be saved in the live dataset itself. In
2776 * either case we need to use dsl_dataset_own_force()
2777 * because the stream is marked as inconsistent,
2778 * which would normally make it unavailable to be
2781 char *name
= kmem_asprintf("%s/%s", tosnap
,
2783 err
= dsl_dataset_own_force(dspp
.dp
, name
, dsflags
,
2785 if (err
== ENOENT
) {
2786 err
= dsl_dataset_own_force(dspp
.dp
, tosnap
,
2787 dsflags
, FTAG
, &dspp
.to_ds
);
2791 err
= zap_lookup(dspp
.dp
->dp_meta_objset
,
2792 dspp
.to_ds
->ds_object
,
2793 DS_FIELD_RESUME_TOGUID
, 8, 1,
2798 err
= zap_lookup(dspp
.dp
->dp_meta_objset
,
2799 dspp
.to_ds
->ds_object
,
2800 DS_FIELD_RESUME_TONAME
, 1,
2801 sizeof (dspp
.saved_toname
),
2805 dsl_dataset_disown(dspp
.to_ds
, dsflags
, FTAG
);
2809 err
= dsl_dataset_own(dspp
.dp
, tosnap
, dsflags
,
2814 err
= dsl_dataset_hold_flags(dspp
.dp
, tosnap
, dsflags
, FTAG
,
2819 dsl_pool_rele(dspp
.dp
, FTAG
);
2823 if (redactbook
!= NULL
) {
2824 char path
[ZFS_MAX_DATASET_NAME_LEN
];
2825 (void) strlcpy(path
, tosnap
, sizeof (path
));
2826 char *at
= strchr(path
, '@');
2830 (void) snprintf(at
, sizeof (path
) - (at
- path
), "#%s",
2832 err
= dsl_bookmark_lookup(dspp
.dp
, path
,
2834 dspp
.redactbook
= &book
;
2839 dsl_pool_rele(dspp
.dp
, FTAG
);
2841 dsl_dataset_disown(dspp
.to_ds
, dsflags
, FTAG
);
2843 dsl_dataset_rele_flags(dspp
.to_ds
, dsflags
, FTAG
);
2847 if (fromsnap
!= NULL
) {
2848 zfs_bookmark_phys_t
*zb
= &dspp
.ancestor_zb
;
2850 if (strpbrk(tosnap
, "@#") != NULL
)
2851 fsnamelen
= strpbrk(tosnap
, "@#") - tosnap
;
2853 fsnamelen
= strlen(tosnap
);
2856 * If the fromsnap is in a different filesystem, then
2857 * mark the send stream as a clone.
2859 if (strncmp(tosnap
, fromsnap
, fsnamelen
) != 0 ||
2860 (fromsnap
[fsnamelen
] != '@' &&
2861 fromsnap
[fsnamelen
] != '#')) {
2862 dspp
.is_clone
= B_TRUE
;
2865 if (strchr(fromsnap
, '@') != NULL
) {
2866 err
= dsl_dataset_hold(dspp
.dp
, fromsnap
, FTAG
,
2870 ASSERT3P(fromds
, ==, NULL
);
2873 * We need to make a deep copy of the redact
2874 * snapshots of the from snapshot, because the
2875 * array will be freed when we evict from_ds.
2877 uint64_t *fromredact
;
2878 if (!dsl_dataset_get_uint64_array_feature(
2879 fromds
, SPA_FEATURE_REDACTED_DATASETS
,
2880 &dspp
.numfromredactsnaps
,
2882 dspp
.numfromredactsnaps
=
2883 NUM_SNAPS_NOT_REDACTED
;
2884 } else if (dspp
.numfromredactsnaps
> 0) {
2886 dspp
.numfromredactsnaps
*
2888 dspp
.fromredactsnaps
= kmem_zalloc(size
,
2890 memcpy(dspp
.fromredactsnaps
, fromredact
,
2893 if (!dsl_dataset_is_before(dspp
.to_ds
, fromds
,
2895 err
= SET_ERROR(EXDEV
);
2897 zb
->zbm_creation_txg
=
2898 dsl_dataset_phys(fromds
)->
2900 zb
->zbm_creation_time
=
2901 dsl_dataset_phys(fromds
)->
2904 dsl_dataset_phys(fromds
)->ds_guid
;
2905 zb
->zbm_redaction_obj
= 0;
2907 if (dsl_dataset_is_zapified(fromds
)) {
2909 dspp
.dp
->dp_meta_objset
,
2911 DS_FIELD_IVSET_GUID
, 8, 1,
2912 &zb
->zbm_ivset_guid
);
2915 dsl_dataset_rele(fromds
, FTAG
);
2918 dspp
.numfromredactsnaps
= NUM_SNAPS_NOT_REDACTED
;
2919 err
= dsl_bookmark_lookup(dspp
.dp
, fromsnap
, dspp
.to_ds
,
2921 if (err
== EXDEV
&& zb
->zbm_redaction_obj
!= 0 &&
2923 dsl_dataset_phys(dspp
.to_ds
)->ds_guid
)
2928 /* dmu_send_impl will call dsl_pool_rele for us. */
2929 err
= dmu_send_impl(&dspp
);
2931 dsl_pool_rele(dspp
.dp
, FTAG
);
2934 dspp
.numfromredactsnaps
= NUM_SNAPS_NOT_REDACTED
;
2935 err
= dmu_send_impl(&dspp
);
2938 dsl_dataset_disown(dspp
.to_ds
, dsflags
, FTAG
);
2940 dsl_dataset_rele_flags(dspp
.to_ds
, dsflags
, FTAG
);
/*
 * dmu_adjust_send_estimate_for_indirects: turn raw byte counts for a dataset
 * into an estimated stream size.
 *
 * Visible steps: obtain the objset of ds; choose recordsize from
 * zfs_override_estimate_recordsize when that tunable is nonzero, else from
 * the volblocksize property for a DMU_OST_ZVOL objset, else from the
 * recordsize property; compute record_count as uncompressed / recordsize;
 * start from compressed when stream_compressed is set and from uncompressed
 * otherwise; subtract record_count * sizeof (blkptr_t) for indirect blocks
 * and add record_count * sizeof (dmu_replay_record_t) for the per-block
 * record headers.
 *
 * NOTE(review): the declarations of err, size and os, the checks on the
 * property lookups, the store through sizep and the return statement are
 * missing from this extraction. The division assumes recordsize is nonzero -
 * confirm the property lookups guarantee that.
 */
2945 dmu_adjust_send_estimate_for_indirects(dsl_dataset_t
*ds
, uint64_t uncompressed
,
2946 uint64_t compressed
, boolean_t stream_compressed
, uint64_t *sizep
)
2951 * Assume that space (both on-disk and in-stream) is dominated by
2952 * data. We will adjust for indirect blocks and the copies property,
2953 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
2956 uint64_t recordsize
;
2957 uint64_t record_count
;
2959 VERIFY0(dmu_objset_from_ds(ds
, &os
));
2961 /* Assume all (uncompressed) blocks are recordsize. */
2962 if (zfs_override_estimate_recordsize
!= 0) {
2963 recordsize
= zfs_override_estimate_recordsize
;
2964 } else if (os
->os_phys
->os_type
== DMU_OST_ZVOL
) {
2965 err
= dsl_prop_get_int_ds(ds
,
2966 zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE
), &recordsize
);
2968 err
= dsl_prop_get_int_ds(ds
,
2969 zfs_prop_to_name(ZFS_PROP_RECORDSIZE
), &recordsize
);
2973 record_count
= uncompressed
/ recordsize
;
2976 * If we're estimating a send size for a compressed stream, use the
2977 * compressed data size to estimate the stream size. Otherwise, use the
2978 * uncompressed data size.
2980 size
= stream_compressed
? compressed
: uncompressed
;
2983 * Subtract out approximate space used by indirect blocks.
2984 * Assume most space is used by data blocks (non-indirect, non-dnode).
2985 * Assume no ditto blocks or internal fragmentation.
2987 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
2990 size
-= record_count
* sizeof (blkptr_t
);
2992 /* Add in the space for the record associated with each block. */
2993 size
+= record_count
* sizeof (dmu_replay_record_t
);
/*
 * dmu_send_estimate_fast: estimate the size of a send of origds, either full
 * or incremental from fromds or frombook (at most one of the two may be
 * non-NULL, as asserted), and store the result through sizep.
 *
 * Visible steps: assert that the pool config is held; on the saved-send
 * path build the name of the receive clone under origds (dataset name, a
 * slash, recv_clone_name), try to hold it, and then require the
 * DS_FIELD_RESUME_TOGUID and DS_FIELD_RESUME_TONAME entries on ds, mapping
 * ENOENT to EINVAL; reject with EINVAL a ds that is neither a snapshot nor
 * different from origds; obtain byte counts from dsl_dataset_space_written
 * (fromds must be a snapshot, else EINVAL, and must precede ds, else EXDEV),
 * from dsl_dataset_space_written_bookmark (frombook), or from the
 * ds_uncompressed_bytes and ds_compressed_bytes of ds itself; pass them to
 * dmu_adjust_send_estimate_for_indirects; add two dmu_replay_record_t sizes
 * for the BEGIN and END records; release ds on the visible cleanup path.
 *
 * NOTE(review): the declarations of err, used and guid, the saved test, the
 * branch bodies for the ENOENT cases, the gotos and the returns are missing
 * from this extraction, so the cleanup conditions cannot be confirmed here.
 * NOTE(review): dsname is ZFS_MAX_DATASET_NAME_LEN + 6 bytes and is extended
 * with two unbounded strcat calls; presumably the 6 extra bytes cover the
 * slash plus recv_clone_name - confirm, or switch to strlcat.
 */
3001 dmu_send_estimate_fast(dsl_dataset_t
*origds
, dsl_dataset_t
*fromds
,
3002 zfs_bookmark_phys_t
*frombook
, boolean_t stream_compressed
,
3003 boolean_t saved
, uint64_t *sizep
)
3006 dsl_dataset_t
*ds
= origds
;
3007 uint64_t uncomp
, comp
;
3009 ASSERT(dsl_pool_config_held(origds
->ds_dir
->dd_pool
));
3010 ASSERT(fromds
== NULL
|| frombook
== NULL
);
3013 * If this is a saved send we may actually be sending
3014 * from the %recv clone used for resuming.
3017 objset_t
*mos
= origds
->ds_dir
->dd_pool
->dp_meta_objset
;
3019 char dsname
[ZFS_MAX_DATASET_NAME_LEN
+ 6];
3021 dsl_dataset_name(origds
, dsname
);
3022 (void) strcat(dsname
, "/");
3023 (void) strcat(dsname
, recv_clone_name
);
3025 err
= dsl_dataset_hold(origds
->ds_dir
->dd_pool
,
3027 if (err
!= ENOENT
&& err
!= 0) {
3029 } else if (err
== ENOENT
) {
3033 /* check that this dataset has partially received data */
3034 err
= zap_lookup(mos
, ds
->ds_object
,
3035 DS_FIELD_RESUME_TOGUID
, 8, 1, &guid
);
3037 err
= SET_ERROR(err
== ENOENT
? EINVAL
: err
);
3041 err
= zap_lookup(mos
, ds
->ds_object
,
3042 DS_FIELD_RESUME_TONAME
, 1, sizeof (dsname
), dsname
);
3044 err
= SET_ERROR(err
== ENOENT
? EINVAL
: err
);
3049 /* tosnap must be a snapshot or the target of a saved send */
3050 if (!ds
->ds_is_snapshot
&& ds
== origds
)
3051 return (SET_ERROR(EINVAL
));
3053 if (fromds
!= NULL
) {
3055 if (!fromds
->ds_is_snapshot
) {
3056 err
= SET_ERROR(EINVAL
);
3060 if (!dsl_dataset_is_before(ds
, fromds
, 0)) {
3061 err
= SET_ERROR(EXDEV
);
3065 err
= dsl_dataset_space_written(fromds
, ds
, &used
, &comp
,
3069 } else if (frombook
!= NULL
) {
3071 err
= dsl_dataset_space_written_bookmark(frombook
, ds
, &used
,
3076 uncomp
= dsl_dataset_phys(ds
)->ds_uncompressed_bytes
;
3077 comp
= dsl_dataset_phys(ds
)->ds_compressed_bytes
;
3080 err
= dmu_adjust_send_estimate_for_indirects(ds
, uncomp
, comp
,
3081 stream_compressed
, sizep
);
3083 * Add the size of the BEGIN and END records to the estimate.
3085 *sizep
+= 2 * sizeof (dmu_replay_record_t
);
3089 dsl_dataset_rele(ds
, FTAG
);
3093 ZFS_MODULE_PARAM(zfs_send
, zfs_send_
, corrupt_data
, INT
, ZMOD_RW
,
3094 "Allow sending corrupt data");
3096 ZFS_MODULE_PARAM(zfs_send
, zfs_send_
, queue_length
, INT
, ZMOD_RW
,
3097 "Maximum send queue length");
3099 ZFS_MODULE_PARAM(zfs_send
, zfs_send_
, unmodified_spill_blocks
, INT
, ZMOD_RW
,
3100 "Send unmodified spill blocks");
3102 ZFS_MODULE_PARAM(zfs_send
, zfs_send_
, no_prefetch_queue_length
, INT
, ZMOD_RW
,
3103 "Maximum send queue length for non-prefetch queues");
3105 ZFS_MODULE_PARAM(zfs_send
, zfs_send_
, queue_ff
, INT
, ZMOD_RW
,
3106 "Send queue fill fraction");
3108 ZFS_MODULE_PARAM(zfs_send
, zfs_send_
, no_prefetch_queue_ff
, INT
, ZMOD_RW
,
3109 "Send queue fill fraction for non-prefetch queues");
3111 ZFS_MODULE_PARAM(zfs_send
, zfs_
, override_estimate_recordsize
, INT
, ZMOD_RW
,
3112 "Override block size estimate with fixed size");