/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
 * Copyright 2016 RackTop Systems.
 * Copyright (c) 2016 Actifio, Inc. All rights reserved.
 */
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/spa_impl.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_znode.h>
#include <zfs_fletcher.h>
#include <sys/avl.h>
#include <sys/ddt.h>
#include <sys/zfs_onexit.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
#include <sys/blkptr.h>
#include <sys/dsl_bookmark.h>
#include <sys/zfeature.h>
#include <sys/bqueue.h>
#include <sys/zvol.h>
#include <sys/policy.h>
/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
int zfs_send_corrupt_data = B_FALSE;
int zfs_send_queue_length = SPA_MAXBLOCKSIZE;
/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */
int zfs_send_set_freerecords_bit = B_TRUE;

/*
 * Use this to override the recordsize calculation for fast zfs send estimates.
 */
unsigned long zfs_override_estimate_recordsize = 0;
#define	BP_SPAN(datablkszsec, indblkshift, level) \
	(((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \
	(level) * (indblkshift - SPA_BLKPTRSHIFT)))
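
/*
 * Worked example of BP_SPAN (illustrative, using SPA_MINBLOCKSHIFT = 9 and
 * SPA_BLKPTRSHIFT = 7): for a 128K data block, datablkszsec = 128K / 512 =
 * 256.  At level 0 the span is just the block size, 256 << 9 = 128K.  With
 * indblkshift = 17 (128K indirect blocks holding 1024 block pointers), each
 * level multiplies the span by 2^(17 - 7) = 1024, so a level-1 indirect
 * block spans 256 << (9 + 10) = 128M of object data.
 */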
struct send_thread_arg {
	bqueue_t	q;
	dsl_dataset_t	*ds;		/* Dataset to traverse */
	uint64_t	fromtxg;	/* Traverse from this txg */
	int		flags;		/* flags to pass to traverse_dataset */
	int		error_code;
	boolean_t	cancel;
	zbookmark_phys_t resume;
};
struct send_block_record {
	boolean_t	eos_marker;	/* Marks the end of the stream */
	blkptr_t	bp;
	zbookmark_phys_t zb;
	uint8_t		indblkshift;
	uint16_t	datablkszsec;
	bqueue_node_t	ln;
};
typedef struct dump_bytes_io {
	dmu_sendarg_t	*dbi_dsp;
	void		*dbi_buf;
	int		dbi_len;
} dump_bytes_io_t;
static void
dump_bytes_cb(void *arg)
{
	dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg;
	dmu_sendarg_t *dsp = dbi->dbi_dsp;
	dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os);
	ssize_t resid; /* have to get resid to get detailed errno */

	/*
	 * The code does not rely on len being a multiple of 8.  We keep
	 * this assertion because of the corresponding assertion in
	 * receive_read().  Keeping this assertion ensures that we do not
	 * inadvertently break backwards compatibility (causing the assertion
	 * in receive_read() to trigger on old software).  Newer feature flags
	 * (such as raw send) may break this assertion since they were
	 * introduced after the requirement was made obsolete.
	 */
	ASSERT(dbi->dbi_len % 8 == 0 ||
	    (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) != 0);

	dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp,
	    (caddr_t)dbi->dbi_buf, dbi->dbi_len,
	    0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);

	mutex_enter(&ds->ds_sendstream_lock);
	*dsp->dsa_off += dbi->dbi_len;
	mutex_exit(&ds->ds_sendstream_lock);
}
static int
dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
{
	dump_bytes_io_t dbi;

	dbi.dbi_dsp = dsp;
	dbi.dbi_buf = buf;
	dbi.dbi_len = len;

#if defined(HAVE_LARGE_STACKS)
	dump_bytes_cb(&dbi);
#else
	/*
	 * The vn_rdwr() call is performed in a taskq to ensure that there is
	 * always enough stack space to write safely to the target filesystem.
	 * The ZIO_TYPE_FREE threads are used because there can be a lot of
	 * them and they are used in vdev_file.c for a similar purpose.
	 */
	spa_taskq_dispatch_sync(dmu_objset_spa(dsp->dsa_os), ZIO_TYPE_FREE,
	    ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP);
#endif /* HAVE_LARGE_STACKS */

	return (dsp->dsa_err);
}
/*
 * For all record types except BEGIN, fill in the checksum (overlaid in
 * drr_u.drr_checksum.drr_checksum).  The checksum verifies everything
 * up to the start of the checksum itself.
 */
static int
dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
{
	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
	(void) fletcher_4_incremental_native(dsp->dsa_drr,
	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
	    &dsp->dsa_zc);
	if (dsp->dsa_drr->drr_type == DRR_BEGIN) {
		dsp->dsa_sent_begin = B_TRUE;
	} else {
		ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u.
		    drr_checksum.drr_checksum));
		dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc;
	}
	if (dsp->dsa_drr->drr_type == DRR_END) {
		dsp->dsa_sent_end = B_TRUE;
	}
	(void) fletcher_4_incremental_native(&dsp->dsa_drr->
	    drr_u.drr_checksum.drr_checksum,
	    sizeof (zio_cksum_t), &dsp->dsa_zc);
	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (SET_ERROR(EINTR));
	if (payload_len != 0) {
		(void) fletcher_4_incremental_native(payload, payload_len,
		    &dsp->dsa_zc);
		if (dump_bytes(dsp, payload, payload_len) != 0)
			return (SET_ERROR(EINTR));
	}
	return (0);
}
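
/*
 * Illustrative sketch (not part of the build): how a stream consumer could
 * mirror the checksum chaining above.  Every record except BEGIN carries, in
 * drr_u.drr_checksum.drr_checksum, the running fletcher-4 of all preceding
 * stream bytes; payloads, when present, must be folded into the running
 * value the same way.  The function name below is hypothetical.
 */
#if 0
static boolean_t
example_verify_record(dmu_replay_record_t *drr, zio_cksum_t *zc)
{
	zio_cksum_t expected = drr->drr_u.drr_checksum.drr_checksum;

	/* accumulate the record up to the start of the checksum field */
	(void) fletcher_4_incremental_native(drr,
	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
	    zc);
	if (drr->drr_type != DRR_BEGIN && !ZIO_CHECKSUM_EQUAL(expected, *zc))
		return (B_FALSE);

	/* fold the checksum field itself in, exactly as the sender does */
	(void) fletcher_4_incremental_native(
	    &drr->drr_u.drr_checksum.drr_checksum,
	    sizeof (zio_cksum_t), zc);
	return (B_TRUE);
}
#endif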
/*
 * Fill in the drr_free struct, or perform aggregation if the previous record is
 * also a free record, and the two are adjacent.
 *
 * Note that we send free records even for a full send, because we want to be
 * able to receive a full send as a clone, which requires a list of all the free
 * and freeobject records that were generated on the source.
 */
static int
dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
    uint64_t length)
{
	struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);

	/*
	 * When we receive a free record, dbuf_free_range() assumes
	 * that the receiving system doesn't have any dbufs in the range
	 * being freed.  This is always true because there is a one-record
	 * constraint: we only send one WRITE record for any given
	 * object,offset.  We know that the one-record constraint is
	 * true because we always send data in increasing order by
	 * object,offset.
	 *
	 * If the increasing-order constraint ever changes, we should find
	 * another way to assert that the one-record constraint is still
	 * satisfied.
	 */
	ASSERT(object > dsp->dsa_last_data_object ||
	    (object == dsp->dsa_last_data_object &&
	    offset > dsp->dsa_last_data_offset));

	/*
	 * If there is a pending op, but it's not PENDING_FREE, push it out,
	 * since free block aggregation can only be done for blocks of the
	 * same type (i.e., DRR_FREE records can only be aggregated with
	 * other DRR_FREE records.  DRR_FREEOBJECTS records can only be
	 * aggregated with other DRR_FREEOBJECTS records).
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	if (dsp->dsa_pending_op == PENDING_FREE) {
		/*
		 * There should never be a PENDING_FREE if length is
		 * DMU_OBJECT_END (because dump_dnode is the only place where
		 * this function is called with a DMU_OBJECT_END, and only after
		 * flushing any pending record).
		 */
		ASSERT(length != DMU_OBJECT_END);
		/*
		 * Check to see whether this free block can be aggregated
		 * with pending one.
		 */
		if (drrf->drr_object == object && drrf->drr_offset +
		    drrf->drr_length == offset) {
			if (offset + length < offset)
				drrf->drr_length = DMU_OBJECT_END;
			else
				drrf->drr_length += length;
			return (0);
		} else {
			/* not a continuation.  Push out pending record */
			if (dump_record(dsp, NULL, 0) != 0)
				return (SET_ERROR(EINTR));
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}
	/* create a FREE record and make it pending */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREE;
	drrf->drr_object = object;
	drrf->drr_offset = offset;
	if (offset + length < offset)
		drrf->drr_length = DMU_OBJECT_END;
	else
		drrf->drr_length = length;
	drrf->drr_toguid = dsp->dsa_toguid;
	if (length == DMU_OBJECT_END) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
	} else {
		dsp->dsa_pending_op = PENDING_FREE;
	}

	return (0);
}
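
/*
 * Aggregation example (illustrative): with a pending FREE record of
 * { object 7, offset 0, length 128K }, a call for object 7, offset 128K,
 * length 128K is adjacent and simply extends the pending record to length
 * 256K; nothing extra hits the stream.  A call for object 8 instead flushes
 * the pending record and starts a new one.  The (offset + length < offset)
 * test catches unsigned wrap-around and expresses "free to the end of the
 * object" as DMU_OBJECT_END.
 */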
static int
dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, uint64_t object,
    uint64_t offset, int lsize, int psize, const blkptr_t *bp, void *data)
{
	uint64_t payload_size;
	boolean_t raw = (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW);
	struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);

	/*
	 * We send data in increasing object, offset order.
	 * See comment in dump_free() for details.
	 */
	ASSERT(object > dsp->dsa_last_data_object ||
	    (object == dsp->dsa_last_data_object &&
	    offset > dsp->dsa_last_data_offset));
	dsp->dsa_last_data_object = object;
	dsp->dsa_last_data_offset = offset + lsize - 1;

	/*
	 * If there is any kind of pending aggregation (currently either
	 * a grouping of free objects or free blocks), push it out to
	 * the stream, since aggregation can't be done across operations
	 * of different types.
	 */
	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}
	/* write a WRITE record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_WRITE;
	drrw->drr_object = object;
	drrw->drr_type = type;
	drrw->drr_offset = offset;
	drrw->drr_toguid = dsp->dsa_toguid;
	drrw->drr_logical_size = lsize;

	/* only set the compression fields if the buf is compressed or raw */
	if (raw || lsize != psize) {
		ASSERT(!BP_IS_EMBEDDED(bp));
		ASSERT3S(psize, >, 0);

		if (raw) {
			ASSERT(BP_IS_PROTECTED(bp));

			/*
			 * This is a raw protected block so we need to pass
			 * along everything the receiving side will need to
			 * interpret this block, including the byteswap, salt,
			 * IV, and MAC.
			 */
			if (BP_SHOULD_BYTESWAP(bp))
				drrw->drr_flags |= DRR_RAW_BYTESWAP;
			zio_crypt_decode_params_bp(bp, drrw->drr_salt,
			    drrw->drr_iv);
			zio_crypt_decode_mac_bp(bp, drrw->drr_mac);
		} else {
			/* this is a compressed block */
			ASSERT(dsp->dsa_featureflags &
			    DMU_BACKUP_FEATURE_COMPRESSED);
			ASSERT(!BP_SHOULD_BYTESWAP(bp));
			ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)));
			ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF);
			ASSERT3S(lsize, >=, psize);
		}

		/* set fields common to compressed and raw sends */
		drrw->drr_compressiontype = BP_GET_COMPRESS(bp);
		drrw->drr_compressed_size = psize;
		payload_size = drrw->drr_compressed_size;
	} else {
		payload_size = drrw->drr_logical_size;
	}

	if (bp == NULL || BP_IS_EMBEDDED(bp) || (BP_IS_PROTECTED(bp) && !raw)) {
		/*
		 * There's no pre-computed checksum for partial-block writes,
		 * embedded BP's, or encrypted BP's that are being sent as
		 * plaintext, so (like fletcher4-checksummed blocks) userland
		 * will have to compute a dedup-capable checksum itself.
		 */
		drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
	} else {
		drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
		if (zio_checksum_table[drrw->drr_checksumtype].ci_flags &
		    ZCHECKSUM_FLAG_DEDUP)
			drrw->drr_flags |= DRR_CHECKSUM_DEDUP;
		DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
		DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
		DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
		DDK_SET_CRYPT(&drrw->drr_key, BP_IS_PROTECTED(bp));
		drrw->drr_key.ddk_cksum = bp->blk_cksum;
	}

	if (dump_record(dsp, data, payload_size) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}
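
/*
 * Size-field example (illustrative): for a compressed send of a 128K
 * logical block that lz4-compressed to 16K on disk, lsize = 128K and
 * psize = 16K, so drr_logical_size = 128K, drr_compressed_size = 16K, and
 * only the 16K compressed payload follows the record on the stream.  For a
 * plain send lsize == psize, the compression fields stay zero, and the full
 * logical payload is sent.
 */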
static int
dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
    int blksz, const blkptr_t *bp)
{
	char buf[BPE_PAYLOAD_SIZE];
	struct drr_write_embedded *drrw =
	    &(dsp->dsa_drr->drr_u.drr_write_embedded);

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	ASSERT(BP_IS_EMBEDDED(bp));

	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED;
	drrw->drr_object = object;
	drrw->drr_offset = offset;
	drrw->drr_length = blksz;
	drrw->drr_toguid = dsp->dsa_toguid;
	drrw->drr_compression = BP_GET_COMPRESS(bp);
	drrw->drr_etype = BPE_GET_ETYPE(bp);
	drrw->drr_lsize = BPE_GET_LSIZE(bp);
	drrw->drr_psize = BPE_GET_PSIZE(bp);

	decode_embedded_bp_compressed(bp, buf);

	if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}
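
/*
 * Payload example (illustrative): an embedded block stores its compressed
 * payload inside the blkptr itself, at most BPE_PAYLOAD_SIZE (112) bytes.
 * A 4K logical block compressed to 50 bytes yields drr_lsize = 4096,
 * drr_psize = 50, and P2ROUNDUP(50, 8) = 56 bytes of payload on the stream.
 */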
static int
dump_spill(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object, void *data)
{
	struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);
	uint64_t blksz = BP_GET_LSIZE(bp);
	uint64_t payload_size = blksz;

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write a SPILL record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_SPILL;
	drrs->drr_object = object;
	drrs->drr_length = blksz;
	drrs->drr_toguid = dsp->dsa_toguid;

	/* handle raw send fields */
	if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
		ASSERT(BP_IS_PROTECTED(bp));

		if (BP_SHOULD_BYTESWAP(bp))
			drrs->drr_flags |= DRR_RAW_BYTESWAP;
		drrs->drr_compressiontype = BP_GET_COMPRESS(bp);
		drrs->drr_compressed_size = BP_GET_PSIZE(bp);
		zio_crypt_decode_params_bp(bp, drrs->drr_salt, drrs->drr_iv);
		zio_crypt_decode_mac_bp(bp, drrs->drr_mac);
		payload_size = drrs->drr_compressed_size;
	}

	if (dump_record(dsp, data, payload_size) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}
static int
dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
{
	struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);
	uint64_t maxobj = DNODES_PER_BLOCK *
	    (DMU_META_DNODE(dsp->dsa_os)->dn_maxblkid + 1);

	/*
	 * ZoL < 0.7 does not handle large FREEOBJECTS records correctly,
	 * leading to zfs recv never completing.  To avoid this issue, don't
	 * send FREEOBJECTS records for object IDs which cannot exist on the
	 * receiving side.
	 */
	if (maxobj > 0) {
		if (maxobj < firstobj)
			return (0);

		if (maxobj < firstobj + numobjs)
			numobjs = maxobj - firstobj;
	}

	/*
	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
	 * push it out, since free block aggregation can only be done for
	 * blocks of the same type (i.e., DRR_FREE records can only be
	 * aggregated with other DRR_FREE records.  DRR_FREEOBJECTS records
	 * can only be aggregated with other DRR_FREEOBJECTS records).
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREEOBJECTS) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}
	if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) {
		/*
		 * See whether this free object array can be aggregated
		 * with pending one.
		 */
		if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
			drrfo->drr_numobjs += numobjs;
			return (0);
		} else {
			/* can't be aggregated.  Push out pending record */
			if (dump_record(dsp, NULL, 0) != 0)
				return (SET_ERROR(EINTR));
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}

	/* write a FREEOBJECTS record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREEOBJECTS;
	drrfo->drr_firstobj = firstobj;
	drrfo->drr_numobjs = numobjs;
	drrfo->drr_toguid = dsp->dsa_toguid;

	dsp->dsa_pending_op = PENDING_FREEOBJECTS;

	return (0);
}
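
/*
 * Clamp example (illustrative): with 16K dnode blocks and 512-byte dnodes,
 * DNODES_PER_BLOCK is 32.  If the meta-dnode's dn_maxblkid is 99, then
 * maxobj = 32 * 100 = 3200; a request to free 500 objects starting at 3000
 * is clamped to numobjs = 200, and a request starting at 4000 is dropped
 * entirely, since no such objects can exist on the receiving side.
 */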
static int
dump_dnode(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object,
    dnode_phys_t *dnp)
{
	struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);
	int bonuslen;

	if (object < dsp->dsa_resume_object) {
		/*
		 * Note: when resuming, we will visit all the dnodes in
		 * the block of dnodes that we are resuming from.  In
		 * this case it's unnecessary to send the dnodes prior to
		 * the one we are resuming from.  We should be at most one
		 * block's worth of dnodes behind the resume point.
		 */
		ASSERT3U(dsp->dsa_resume_object - object, <,
		    1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT));
		return (0);
	}

	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
		return (dump_freeobjects(dsp, object, 1));

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write an OBJECT record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_OBJECT;
	drro->drr_object = object;
	drro->drr_type = dnp->dn_type;
	drro->drr_bonustype = dnp->dn_bonustype;
	drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	drro->drr_bonuslen = dnp->dn_bonuslen;
	drro->drr_dn_slots = dnp->dn_extra_slots + 1;
	drro->drr_checksumtype = dnp->dn_checksum;
	drro->drr_compress = dnp->dn_compress;
	drro->drr_toguid = dsp->dsa_toguid;

	if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
	    drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
		drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;

	bonuslen = P2ROUNDUP(dnp->dn_bonuslen, 8);

	if ((dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW)) {
		ASSERT(BP_IS_ENCRYPTED(bp));

		if (BP_SHOULD_BYTESWAP(bp))
			drro->drr_flags |= DRR_RAW_BYTESWAP;

		/* needed for reconstructing dnp on recv side */
		drro->drr_maxblkid = dnp->dn_maxblkid;
		drro->drr_indblkshift = dnp->dn_indblkshift;
		drro->drr_nlevels = dnp->dn_nlevels;
		drro->drr_nblkptr = dnp->dn_nblkptr;

		/*
		 * Since we encrypt the entire bonus area, the (raw) part
		 * beyond the bonuslen is actually nonzero, so we need
		 * to send it.
		 */
		if (bonuslen != 0) {
			drro->drr_raw_bonuslen = DN_MAX_BONUS_LEN(dnp);
			bonuslen = drro->drr_raw_bonuslen;
		}
	}

	if (dump_record(dsp, DN_BONUS(dnp), bonuslen) != 0)
		return (SET_ERROR(EINTR));

	/* Free anything past the end of the file. */
	if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0)
		return (SET_ERROR(EINTR));
	if (dsp->dsa_err != 0)
		return (SET_ERROR(EINTR));
	return (0);
}
static int
dump_object_range(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t firstobj,
    uint64_t numslots)
{
	struct drr_object_range *drror =
	    &(dsp->dsa_drr->drr_u.drr_object_range);

	/* we only use this record type for raw sends */
	ASSERT(BP_IS_PROTECTED(bp));
	ASSERT(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW);
	ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
	ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_DNODE);
	ASSERT0(BP_GET_LEVEL(bp));

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_OBJECT_RANGE;
	drror->drr_firstobj = firstobj;
	drror->drr_numslots = numslots;
	drror->drr_toguid = dsp->dsa_toguid;
	if (BP_SHOULD_BYTESWAP(bp))
		drror->drr_flags |= DRR_RAW_BYTESWAP;
	zio_crypt_decode_params_bp(bp, drror->drr_salt, drror->drr_iv);
	zio_crypt_decode_mac_bp(bp, drror->drr_mac);

	if (dump_record(dsp, NULL, 0) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}
static boolean_t
backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
{
	if (!BP_IS_EMBEDDED(bp))
		return (B_FALSE);

	/*
	 * Compression function must be legacy, or explicitly enabled.
	 */
	if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
	    !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LZ4)))
		return (B_FALSE);

	/*
	 * Embed type must be explicitly enabled.
	 */
	switch (BPE_GET_ETYPE(bp)) {
	case BP_EMBEDDED_TYPE_DATA:
		if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
			return (B_TRUE);
		break;
	default:
		return (B_FALSE);
	}
	return (B_FALSE);
}
/*
 * This is the callback function to traverse_dataset that acts as the worker
 * thread for dmu_send_impl.
 */
static int
send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
{
	struct send_thread_arg *sta = arg;
	struct send_block_record *record;
	uint64_t record_size;
	int err = 0;

	ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
	    zb->zb_object >= sta->resume.zb_object);
	ASSERT3P(sta->ds, !=, NULL);

	if (sta->cancel)
		return (SET_ERROR(EINTR));

	if (bp == NULL) {
		ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL);
		return (0);
	} else if (zb->zb_level < 0) {
		return (0);
	}

	record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP);
	record->eos_marker = B_FALSE;
	record->bp = *bp;
	record->zb = *zb;
	record->indblkshift = dnp->dn_indblkshift;
	record->datablkszsec = dnp->dn_datablkszsec;
	record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	bqueue_enqueue(&sta->q, record, record_size);

	return (err);
}
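
/*
 * Backpressure arithmetic (illustrative): bqueue_enqueue() blocks once the
 * queue holds its configured limit of enqueued bytes, and each record here
 * is charged its data block size (e.g. 256 << 9 = 128K for a 128K block).
 * With the default zfs_send_queue_length of SPA_MAXBLOCKSIZE (16M), roughly
 * 128 such records can sit between this traversal callback and the consumer
 * loop in dmu_send_impl() before the producer stalls.
 */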
/*
 * This function kicks off the traverse_dataset.  It also handles setting the
 * error code of the thread in case something goes wrong, and pushes the End of
 * Stream record when the traverse_dataset call has finished.  If there is no
 * dataset to traverse, the thread immediately pushes an End of Stream marker.
 */
static void
send_traverse_thread(void *arg)
{
	struct send_thread_arg *st_arg = arg;
	int err = 0;
	struct send_block_record *data;
	fstrans_cookie_t cookie = spl_fstrans_mark();

	if (st_arg->ds != NULL) {
		err = traverse_dataset_resume(st_arg->ds,
		    st_arg->fromtxg, &st_arg->resume,
		    st_arg->flags, send_cb, st_arg);

		if (err != EINTR)
			st_arg->error_code = err;
	}
	data = kmem_zalloc(sizeof (*data), KM_SLEEP);
	data->eos_marker = B_TRUE;
	bqueue_enqueue(&st_arg->q, data, 1);
	spl_fstrans_unmark(cookie);
	thread_exit();
}
/*
 * This function actually handles figuring out what kind of record needs to be
 * dumped, reading the data (which has hopefully been prefetched), and calling
 * the appropriate helper function.
 */
static int
do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
{
	dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os);
	const blkptr_t *bp = &data->bp;
	const zbookmark_phys_t *zb = &data->zb;
	uint8_t indblkshift = data->indblkshift;
	uint16_t dblkszsec = data->datablkszsec;
	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
	int err = 0;

	ASSERT3U(zb->zb_level, >=, 0);

	ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
	    zb->zb_object >= dsa->dsa_resume_object);

	/*
	 * All bps of an encrypted os should have the encryption bit set.
	 * If this is not true it indicates tampering and we report an error.
	 */
	if (dsa->dsa_os->os_encrypted &&
	    !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) {
		spa_log_error(spa, zb);
		zfs_panic_recover("unencrypted block in encrypted "
		    "object set %llu", ds->ds_object);
		return (SET_ERROR(EIO));
	}

	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
	    DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
		return (0);
	} else if (BP_IS_HOLE(bp) &&
	    zb->zb_object == DMU_META_DNODE_OBJECT) {
		uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
		err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT);
	} else if (BP_IS_HOLE(bp)) {
		uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
		uint64_t offset = zb->zb_blkid * span;
		/* Don't dump free records for offsets > DMU_OBJECT_END */
		if (zb->zb_blkid == 0 || span <= DMU_OBJECT_END / zb->zb_blkid)
			err = dump_free(dsa, zb->zb_object, offset, span);
	} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
		return (0);
	} else if (type == DMU_OT_DNODE) {
		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
		arc_flags_t aflags = ARC_FLAG_WAIT;
		arc_buf_t *abuf;
		enum zio_flag zioflags = ZIO_FLAG_CANFAIL;

		if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
			ASSERT(BP_IS_ENCRYPTED(bp));
			ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
			zioflags |= ZIO_FLAG_RAW;
		}

		ASSERT0(zb->zb_level);

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0)
			return (SET_ERROR(EIO));

		dnode_phys_t *blk = abuf->b_data;
		uint64_t dnobj = zb->zb_blkid * epb;

		/*
		 * Raw sends require sending encryption parameters for the
		 * block of dnodes.  Regular sends do not need to send this
		 * info.
		 */
		if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
			ASSERT(arc_is_encrypted(abuf));
			err = dump_object_range(dsa, bp, dnobj, epb);
		}

		if (err == 0) {
			for (int i = 0; i < epb;
			    i += blk[i].dn_extra_slots + 1) {
				err = dump_dnode(dsa, bp, dnobj + i, blk + i);
				if (err != 0)
					break;
			}
		}
		arc_buf_destroy(abuf, &abuf);
	} else if (type == DMU_OT_SA) {
		arc_flags_t aflags = ARC_FLAG_WAIT;
		arc_buf_t *abuf;
		enum zio_flag zioflags = ZIO_FLAG_CANFAIL;

		if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
			ASSERT(BP_IS_PROTECTED(bp));
			zioflags |= ZIO_FLAG_RAW;
		}

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0)
			return (SET_ERROR(EIO));

		err = dump_spill(dsa, bp, zb->zb_object, abuf->b_data);
		arc_buf_destroy(abuf, &abuf);
	} else if (backup_do_embed(dsa, bp)) {
		/* it's an embedded level-0 block of a regular object */
		int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
		ASSERT0(zb->zb_level);
		err = dump_write_embedded(dsa, zb->zb_object,
		    zb->zb_blkid * blksz, blksz, bp);
	} else {
		/* it's a level-0 block of a regular object */
		arc_flags_t aflags = ARC_FLAG_WAIT;
		arc_buf_t *abuf;
		int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
		uint64_t offset;

		/*
		 * If we have large blocks stored on disk but the send flags
		 * don't allow us to send large blocks, we split the data from
		 * the arc buf into chunks.
		 */
		boolean_t split_large_blocks = blksz > SPA_OLD_MAXBLOCKSIZE &&
		    !(dsa->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS);

		/*
		 * Raw sends require that we always get raw data as it exists
		 * on disk, so we assert that we are not splitting blocks here.
		 */
		boolean_t request_raw =
		    (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) != 0;

		/*
		 * We should only request compressed data from the ARC if all
		 * the following are true:
		 *  - stream compression was requested
		 *  - we aren't splitting large blocks into smaller chunks
		 *  - the data won't need to be byteswapped before sending
		 *  - this isn't an embedded block
		 *  - this isn't metadata (if receiving on a different endian
		 *    system it can be byteswapped more easily)
		 */
		boolean_t request_compressed =
		    (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED) &&
		    !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) &&
		    !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp));

		IMPLY(request_raw, !split_large_blocks);
		IMPLY(request_raw, BP_IS_PROTECTED(bp));
		ASSERT0(zb->zb_level);
		ASSERT(zb->zb_object > dsa->dsa_resume_object ||
		    (zb->zb_object == dsa->dsa_resume_object &&
		    zb->zb_blkid * blksz >= dsa->dsa_resume_offset));

		ASSERT3U(blksz, ==, BP_GET_LSIZE(bp));

		enum zio_flag zioflags = ZIO_FLAG_CANFAIL;
		if (request_raw)
			zioflags |= ZIO_FLAG_RAW;
		else if (request_compressed)
			zioflags |= ZIO_FLAG_RAW_COMPRESS;

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) {
			if (zfs_send_corrupt_data) {
				uint64_t *ptr;
				/* Send a block filled with 0x"zfs badd bloc" */
				abuf = arc_alloc_buf(spa, &abuf, ARC_BUFC_DATA,
				    blksz);
				for (ptr = abuf->b_data;
				    (char *)ptr < (char *)abuf->b_data + blksz;
				    ptr++)
					*ptr = 0x2f5baddb10cULL;
			} else {
				return (SET_ERROR(EIO));
			}
		}

		offset = zb->zb_blkid * blksz;

		if (split_large_blocks) {
			ASSERT0(arc_is_encrypted(abuf));
			ASSERT3U(arc_get_compression(abuf), ==,
			    ZIO_COMPRESS_OFF);
			char *buf = abuf->b_data;
			while (blksz > 0 && err == 0) {
				int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
				err = dump_write(dsa, type, zb->zb_object,
				    offset, n, n, NULL, buf);
				offset += n;
				buf += n;
				blksz -= n;
			}
		} else {
			err = dump_write(dsa, type, zb->zb_object, offset,
			    blksz, arc_buf_size(abuf), bp, abuf->b_data);
		}
		arc_buf_destroy(abuf, &abuf);
	}

	ASSERT(err == 0 || err == EINTR);
	return (err);
}
/*
 * Pop the new data off the queue, and free the old data.
 */
static struct send_block_record *
get_next_record(bqueue_t *bq, struct send_block_record *data)
{
	struct send_block_record *tmp = bqueue_dequeue(bq);
	kmem_free(data, sizeof (*data));
	return (tmp);
}
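
/*
 * Usage sketch (mirrors the consumer loop in dmu_send_impl() below):
 *
 *	data = bqueue_dequeue(&q);
 *	while (!data->eos_marker && err == 0) {
 *		err = do_dump(dsp, data);
 *		data = get_next_record(&q, data);
 *	}
 *
 * The hand-off frees each record exactly once: the current record is freed
 * only after the next one has been dequeued.
 */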
/*
 * Actually do the bulk of the work in a zfs send.
 *
 * Note: Releases dp using the specified tag.
 */
static int
dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
    zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone,
    boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
    boolean_t rawok, int outfd, uint64_t resumeobj, uint64_t resumeoff,
    vnode_t *vp, offset_t *off)
{
	objset_t *os;
	dmu_replay_record_t *drr;
	dmu_sendarg_t *dsp;
	int err;
	uint64_t fromtxg = 0;
	uint64_t featureflags = 0;
	struct send_thread_arg to_arg;
	void *payload = NULL;
	size_t payload_len = 0;
	struct send_block_record *to_data;

	err = dmu_objset_from_ds(to_ds, &os);
	if (err != 0) {
		dsl_pool_rele(dp, tag);
		return (err);
	}

	/*
	 * If this is a non-raw send of an encrypted ds, we can ensure that
	 * the objset_phys_t is authenticated.  This is safe because this is
	 * either a snapshot or we have owned the dataset, ensuring that
	 * it can't be modified.
	 */
	if (!rawok && os->os_encrypted &&
	    arc_is_unauthenticated(os->os_phys_buf)) {
		zbookmark_phys_t zb;

		SET_BOOKMARK(&zb, to_ds->ds_object, ZB_ROOT_OBJECT,
		    ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
		err = arc_untransform(os->os_phys_buf, os->os_spa,
		    &zb, B_FALSE);
		if (err != 0) {
			dsl_pool_rele(dp, tag);
			return (err);
		}

		ASSERT0(arc_is_unauthenticated(os->os_phys_buf));
	}

	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
	drr->drr_type = DRR_BEGIN;
	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
	DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
	    DMU_SUBSTREAM);

	bzero(&to_arg, sizeof (to_arg));

#ifdef _KERNEL
	if (dmu_objset_type(os) == DMU_OST_ZFS) {
		uint64_t version;
		if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) {
			kmem_free(drr, sizeof (dmu_replay_record_t));
			dsl_pool_rele(dp, tag);
			return (SET_ERROR(EINVAL));
		}
		if (version >= ZPL_VERSION_SA) {
			featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
		}
	}
#endif

	/* raw sends imply large_block_ok */
	if ((large_block_ok || rawok) &&
	    dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_BLOCKS))
		featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
	if (dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_DNODE))
		featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE;

	/* encrypted datasets will not have embedded blocks */
	if ((embedok || rawok) && !os->os_encrypted &&
	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
		featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
	}

	/* raw send implies compressok */
	if (compressok || rawok)
		featureflags |= DMU_BACKUP_FEATURE_COMPRESSED;
	if (rawok && os->os_encrypted)
		featureflags |= DMU_BACKUP_FEATURE_RAW;

	if ((featureflags &
	    (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED |
	    DMU_BACKUP_FEATURE_RAW)) != 0 &&
	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) {
		featureflags |= DMU_BACKUP_FEATURE_LZ4;
	}

	if (resumeobj != 0 || resumeoff != 0) {
		featureflags |= DMU_BACKUP_FEATURE_RESUMING;
	}

	DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
	    featureflags);

	drr->drr_u.drr_begin.drr_creation_time =
	    dsl_dataset_phys(to_ds)->ds_creation_time;
	drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
	if (is_clone)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
	drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
	if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
	if (zfs_send_set_freerecords_bit)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS;

	if (ancestor_zb != NULL) {
		drr->drr_u.drr_begin.drr_fromguid =
		    ancestor_zb->zbm_guid;
		fromtxg = ancestor_zb->zbm_creation_txg;
	}
	dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname);
	if (!to_ds->ds_is_snapshot) {
		(void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--",
		    sizeof (drr->drr_u.drr_begin.drr_toname));
	}

	dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);

	dsp->dsa_drr = drr;
	dsp->dsa_vp = vp;
	dsp->dsa_outfd = outfd;
	dsp->dsa_proc = curproc;
	dsp->dsa_os = os;
	dsp->dsa_off = off;
	dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
	dsp->dsa_pending_op = PENDING_NONE;
	dsp->dsa_featureflags = featureflags;
	dsp->dsa_resume_object = resumeobj;
	dsp->dsa_resume_offset = resumeoff;

	mutex_enter(&to_ds->ds_sendstream_lock);
	list_insert_head(&to_ds->ds_sendstreams, dsp);
	mutex_exit(&to_ds->ds_sendstream_lock);

	dsl_dataset_long_hold(to_ds, FTAG);
	dsl_pool_rele(dp, tag);

	/* handle features that require a DRR_BEGIN payload */
	if (featureflags &
	    (DMU_BACKUP_FEATURE_RESUMING | DMU_BACKUP_FEATURE_RAW)) {
		nvlist_t *keynvl = NULL;
		nvlist_t *nvl = fnvlist_alloc();

		if (featureflags & DMU_BACKUP_FEATURE_RESUMING) {
			dmu_object_info_t to_doi;
			err = dmu_object_info(os, resumeobj, &to_doi);
			if (err != 0) {
				fnvlist_free(nvl);
				goto out;
			}

			SET_BOOKMARK(&to_arg.resume, to_ds->ds_object,
			    resumeobj, 0,
			    resumeoff / to_doi.doi_data_block_size);

			fnvlist_add_uint64(nvl, "resume_object", resumeobj);
			fnvlist_add_uint64(nvl, "resume_offset", resumeoff);
		}

		if (featureflags & DMU_BACKUP_FEATURE_RAW) {
			uint64_t ivset_guid = (ancestor_zb != NULL) ?
			    ancestor_zb->zbm_ivset_guid : 0;

			ASSERT(os->os_encrypted);

			err = dsl_crypto_populate_key_nvlist(to_ds,
			    ivset_guid, &keynvl);
			if (err != 0) {
				fnvlist_free(nvl);
				goto out;
			}

			fnvlist_add_nvlist(nvl, "crypt_keydata", keynvl);
		}

		payload = fnvlist_pack(nvl, &payload_len);
		drr->drr_payloadlen = payload_len;
		fnvlist_free(keynvl);
		fnvlist_free(nvl);
	}

	err = dump_record(dsp, payload, payload_len);
	fnvlist_pack_free(payload, payload_len);
	if (err != 0) {
		err = dsp->dsa_err;
		goto out;
	}

	err = bqueue_init(&to_arg.q,
	    MAX(zfs_send_queue_length, 2 * zfs_max_recordsize),
	    offsetof(struct send_block_record, ln));
	to_arg.error_code = 0;
	to_arg.cancel = B_FALSE;
	to_arg.ds = to_ds;
	to_arg.fromtxg = fromtxg;
	to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH;
	if (rawok)
		to_arg.flags |= TRAVERSE_NO_DECRYPT;
	(void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, curproc,
	    TS_RUN, minclsyspri);

	to_data = bqueue_dequeue(&to_arg.q);

	while (!to_data->eos_marker && err == 0) {
		err = do_dump(dsp, to_data);
		to_data = get_next_record(&to_arg.q, to_data);
		if (issig(JUSTLOOKING) && issig(FORREAL))
			err = SET_ERROR(EINTR);
	}

	if (err != 0) {
		to_arg.cancel = B_TRUE;
		while (!to_data->eos_marker) {
			to_data = get_next_record(&to_arg.q, to_data);
		}
	}
	kmem_free(to_data, sizeof (*to_data));

	bqueue_destroy(&to_arg.q);

	if (err == 0 && to_arg.error_code != 0)
		err = to_arg.error_code;

	if (err != 0)
		goto out;

	if (dsp->dsa_pending_op != PENDING_NONE)
		if (dump_record(dsp, NULL, 0) != 0)
			err = SET_ERROR(EINTR);

	if (err != 0) {
		if (err == EINTR && dsp->dsa_err != 0)
			err = dsp->dsa_err;
		goto out;
	}

	bzero(drr, sizeof (dmu_replay_record_t));
	drr->drr_type = DRR_END;
	drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
	drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;

	if (dump_record(dsp, NULL, 0) != 0)
		err = dsp->dsa_err;
out:
	mutex_enter(&to_ds->ds_sendstream_lock);
	list_remove(&to_ds->ds_sendstreams, dsp);
	mutex_exit(&to_ds->ds_sendstream_lock);

	VERIFY(err != 0 || (dsp->dsa_sent_begin && dsp->dsa_sent_end));

	kmem_free(drr, sizeof (dmu_replay_record_t));
	kmem_free(dsp, sizeof (dmu_sendarg_t));

	dsl_dataset_long_rele(to_ds, FTAG);

	return (err);
}
int
dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
    boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
    boolean_t rawok, int outfd, vnode_t *vp, offset_t *off)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	dsl_dataset_t *fromds = NULL;
	ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT;
	int err;

	err = dsl_pool_hold(pool, FTAG, &dp);
	if (err != 0)
		return (err);

	err = dsl_dataset_hold_obj_flags(dp, tosnap, dsflags, FTAG, &ds);
	if (err != 0) {
		dsl_pool_rele(dp, FTAG);
		return (err);
	}

	if (fromsnap != 0) {
		zfs_bookmark_phys_t zb = { 0 };
		boolean_t is_clone;

		err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds);
		if (err != 0) {
			dsl_dataset_rele_flags(ds, dsflags, FTAG);
			dsl_pool_rele(dp, FTAG);
			return (err);
		}
		if (!dsl_dataset_is_before(ds, fromds, 0)) {
			err = SET_ERROR(EXDEV);
			dsl_dataset_rele(fromds, FTAG);
			dsl_dataset_rele_flags(ds, dsflags, FTAG);
			dsl_pool_rele(dp, FTAG);
			return (err);
		}

		zb.zbm_creation_time =
		    dsl_dataset_phys(fromds)->ds_creation_time;
		zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg;
		zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;

		if (dsl_dataset_is_zapified(fromds)) {
			(void) zap_lookup(dp->dp_meta_objset,
			    fromds->ds_object, DS_FIELD_IVSET_GUID, 8, 1,
			    &zb.zbm_ivset_guid);
		}

		is_clone = (fromds->ds_dir != ds->ds_dir);
		dsl_dataset_rele(fromds, FTAG);
		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
		    embedok, large_block_ok, compressok, rawok, outfd,
		    0, 0, vp, off);
	} else {
		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
		    embedok, large_block_ok, compressok, rawok, outfd,
		    0, 0, vp, off);
	}
	dsl_dataset_rele_flags(ds, dsflags, FTAG);
	return (err);
}
int
dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
    boolean_t large_block_ok, boolean_t compressok, boolean_t rawok,
    int outfd, uint64_t resumeobj, uint64_t resumeoff, vnode_t *vp,
    offset_t *off)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	int err;
	ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT;
	boolean_t owned = B_FALSE;

	if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL)
		return (SET_ERROR(EINVAL));

	err = dsl_pool_hold(tosnap, FTAG, &dp);
	if (err != 0)
		return (err);

	if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) {
		/*
		 * We are sending a filesystem or volume.  Ensure
		 * that it doesn't change by owning the dataset.
		 */
		err = dsl_dataset_own(dp, tosnap, dsflags, FTAG, &ds);
		owned = B_TRUE;
	} else {
		err = dsl_dataset_hold_flags(dp, tosnap, dsflags, FTAG, &ds);
	}
	if (err != 0) {
		dsl_pool_rele(dp, FTAG);
		return (err);
	}

	if (fromsnap != NULL) {
		zfs_bookmark_phys_t zb = { 0 };
		boolean_t is_clone = B_FALSE;
		int fsnamelen = strchr(tosnap, '@') - tosnap;

		/*
		 * If the fromsnap is in a different filesystem, then
		 * mark the send stream as a clone.
		 */
		if (strncmp(tosnap, fromsnap, fsnamelen) != 0 ||
		    (fromsnap[fsnamelen] != '@' &&
		    fromsnap[fsnamelen] != '#')) {
			is_clone = B_TRUE;
		}

		if (strchr(fromsnap, '@')) {
			dsl_dataset_t *fromds;
			err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds);
			if (err == 0) {
				if (!dsl_dataset_is_before(ds, fromds, 0))
					err = SET_ERROR(EXDEV);
				zb.zbm_creation_time =
				    dsl_dataset_phys(fromds)->ds_creation_time;
				zb.zbm_creation_txg =
				    dsl_dataset_phys(fromds)->ds_creation_txg;
				zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
				is_clone = (ds->ds_dir != fromds->ds_dir);

				if (dsl_dataset_is_zapified(fromds)) {
					(void) zap_lookup(dp->dp_meta_objset,
					    fromds->ds_object,
					    DS_FIELD_IVSET_GUID, 8, 1,
					    &zb.zbm_ivset_guid);
				}
				dsl_dataset_rele(fromds, FTAG);
			}
		} else {
			err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb);
		}
		if (err != 0) {
			if (owned)
				dsl_dataset_disown(ds, dsflags, FTAG);
			else
				dsl_dataset_rele_flags(ds, dsflags, FTAG);
			dsl_pool_rele(dp, FTAG);
			return (err);
		}
		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
		    embedok, large_block_ok, compressok, rawok,
		    outfd, resumeobj, resumeoff, vp, off);
	} else {
		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
		    embedok, large_block_ok, compressok, rawok,
		    outfd, resumeobj, resumeoff, vp, off);
	}
	if (owned)
		dsl_dataset_disown(ds, dsflags, FTAG);
	else
		dsl_dataset_rele_flags(ds, dsflags, FTAG);
	return (err);
}
static int
dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed,
    uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep)
{
	int err = 0;
	uint64_t size;
	/*
	 * Assume that space (both on-disk and in-stream) is dominated by
	 * data.  We will adjust for indirect blocks and the copies property,
	 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
	 */
	uint64_t recordsize;
	uint64_t record_count;
	objset_t *os;
	VERIFY0(dmu_objset_from_ds(ds, &os));

	/* Assume all (uncompressed) blocks are recordsize. */
	if (zfs_override_estimate_recordsize != 0) {
		recordsize = zfs_override_estimate_recordsize;
	} else if (os->os_phys->os_type == DMU_OST_ZVOL) {
		err = dsl_prop_get_int_ds(ds,
		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &recordsize);
	} else {
		err = dsl_prop_get_int_ds(ds,
		    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), &recordsize);
	}
	if (err != 0)
		return (err);
	record_count = uncompressed / recordsize;

	/*
	 * If we're estimating a send size for a compressed stream, use the
	 * compressed data size to estimate the stream size.  Otherwise, use the
	 * uncompressed data size.
	 */
	size = stream_compressed ? compressed : uncompressed;

	/*
	 * Subtract out approximate space used by indirect blocks.
	 * Assume most space is used by data blocks (non-indirect, non-dnode).
	 * Assume no ditto blocks or internal fragmentation.
	 *
	 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
	 * block.
	 */
	size -= record_count * sizeof (blkptr_t);

	/* Add in the space for the record associated with each block. */
	size += record_count * sizeof (dmu_replay_record_t);

	*sizep = size;

	return (0);
}
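
/*
 * Worked example (illustrative, assuming sizeof (blkptr_t) == 128 and
 * sizeof (dmu_replay_record_t) == 312): for 1G of uncompressed data at
 * recordsize = 128K, record_count = 8192.  The estimate subtracts
 * 8192 * 128 = 1M of indirect-block overhead and adds 8192 * 312 ~= 2.4M
 * of per-record stream headers, so a plain send is estimated slightly
 * above the raw data size.
 */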
int
dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds,
    boolean_t stream_compressed, uint64_t *sizep)
{
	int err;
	uint64_t uncomp, comp;

	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));

	/* tosnap must be a snapshot */
	if (!ds->ds_is_snapshot)
		return (SET_ERROR(EINVAL));

	/* fromsnap, if provided, must be a snapshot */
	if (fromds != NULL && !fromds->ds_is_snapshot)
		return (SET_ERROR(EINVAL));

	/*
	 * fromsnap must be an earlier snapshot from the same fs as tosnap,
	 * or the origin's fs.
	 */
	if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0))
		return (SET_ERROR(EXDEV));

	/* Get compressed and uncompressed size estimates of changed data. */
	if (fromds == NULL) {
		uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
		comp = dsl_dataset_phys(ds)->ds_compressed_bytes;
	} else {
		uint64_t used;
		err = dsl_dataset_space_written(fromds, ds,
		    &used, &comp, &uncomp);
		if (err != 0)
			return (err);
	}

	err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp,
	    stream_compressed, sizep);
	/*
	 * Add the size of the BEGIN and END records to the estimate.
	 */
	*sizep += 2 * sizeof (dmu_replay_record_t);

	return (err);
}
struct calculate_send_arg {
	uint64_t uncompressed;
	uint64_t compressed;
};

/*
 * Simple callback used to traverse the blocks of a snapshot and sum their
 * uncompressed and compressed sizes.
 */
/* ARGSUSED */
static int
dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	struct calculate_send_arg *space = arg;
	if (bp != NULL && !BP_IS_HOLE(bp)) {
		space->uncompressed += BP_GET_UCSIZE(bp);
		space->compressed += BP_GET_PSIZE(bp);
	}
	return (0);
}
/*
 * Given a destination snapshot and a TXG, calculate the approximate size of a
 * send stream sent from that TXG.  from_txg may be zero, indicating that the
 * whole snapshot will be sent.
 */
int
dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg,
    boolean_t stream_compressed, uint64_t *sizep)
{
	int err;
	struct calculate_send_arg size = { 0 };

	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));

	/* tosnap must be a snapshot */
	if (!dsl_dataset_is_snapshot(ds))
		return (SET_ERROR(EINVAL));

	/* verify that from_txg is before the provided snapshot was taken */
	if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) {
		return (SET_ERROR(EXDEV));
	}

	/*
	 * traverse the blocks of the snapshot with birth times after
	 * from_txg, summing their uncompressed size
	 */
	err = traverse_dataset(ds, from_txg,
	    TRAVERSE_POST | TRAVERSE_NO_DECRYPT,
	    dmu_calculate_send_traversal, &size);

	if (err)
		return (err);

	err = dmu_adjust_send_estimate_for_indirects(ds, size.uncompressed,
	    size.compressed, stream_compressed, sizep);
	return (err);
}
#if defined(_KERNEL)
module_param(zfs_override_estimate_recordsize, ulong, 0644);
MODULE_PARM_DESC(zfs_override_estimate_recordsize,
	"Record size calculation override for zfs send estimates");

module_param(zfs_send_corrupt_data, int, 0644);
MODULE_PARM_DESC(zfs_send_corrupt_data, "Allow sending corrupt data");

module_param(zfs_send_queue_length, int, 0644);
MODULE_PARM_DESC(zfs_send_queue_length, "Maximum send queue length");
#endif