/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
 * Copyright 2016 RackTop Systems.
 * Copyright (c) 2016 Actifio, Inc. All rights reserved.
 */
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/spa_impl.h>
#include <sys/zfs_ioctl.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_znode.h>
#include <zfs_fletcher.h>
#include <sys/zfs_onexit.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
#include <sys/blkptr.h>
#include <sys/dsl_bookmark.h>
#include <sys/zfeature.h>
#include <sys/bqueue.h>
#include <sys/policy.h>
/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
int zfs_send_corrupt_data = B_FALSE;
int zfs_send_queue_length = SPA_MAXBLOCKSIZE;
/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */
int zfs_send_set_freerecords_bit = B_TRUE;
/* Set this tunable to FALSE to disable sending unmodified spill blocks. */
int zfs_send_unmodified_spill_blocks = B_TRUE;

/*
 * Use this to override the recordsize calculation for fast zfs send estimates.
 */
unsigned long zfs_override_estimate_recordsize = 0;
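/*
 * On Linux these tunables are exposed as module parameters (see the
 * module_param() declarations at the bottom of this file), so they can
 * be adjusted at runtime, e.g.:
 *   echo 16777216 > /sys/module/zfs/parameters/zfs_send_queue_length
 */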
#define	BP_SPAN(datablkszsec, indblkshift, level) \
	(((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \
	(level) * (indblkshift - SPA_BLKPTRSHIFT)))
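/*
 * BP_SPAN() computes how many bytes of an object a single block pointer
 * at the given level covers.  Worked example: with 128K data blocks
 * (datablkszsec = 256) and a 128K indirect block size (indblkshift = 17),
 * each indirect block holds 2^(17 - SPA_BLKPTRSHIFT) = 1024 block
 * pointers, so a level-1 bp spans 256 << (9 + 1 * (17 - 7)) = 128M.
 */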
struct send_thread_arg {
	bqueue_t	q;
	dsl_dataset_t	*ds;		/* Dataset to traverse */
	uint64_t	fromtxg;	/* Traverse from this txg */
	int		flags;		/* flags to pass to traverse_dataset */
	int		error_code;
	boolean_t	cancel;
	zbookmark_phys_t resume;
};
struct send_block_record {
	boolean_t		eos_marker; /* Marks the end of the stream */
	blkptr_t		bp;
	zbookmark_phys_t	zb;
	uint8_t			indblkshift;
	uint16_t		datablkszsec;
	bqueue_node_t		ln;
};
typedef struct dump_bytes_io {
	dmu_sendarg_t	*dbi_dsp;
	void		*dbi_buf;
	int		dbi_len;
} dump_bytes_io_t;
static int do_dump(dmu_sendarg_t *dsa, struct send_block_record *data);
static void
dump_bytes_cb(void *arg)
{
	dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg;
	dmu_sendarg_t *dsp = dbi->dbi_dsp;
	dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os);
	ssize_t resid; /* have to get resid to get detailed errno */

	/*
	 * The code does not rely on len being a multiple of 8.  We keep
	 * this assertion because of the corresponding assertion in
	 * receive_read().  Keeping this assertion ensures that we do not
	 * inadvertently break backwards compatibility (causing the assertion
	 * in receive_read() to trigger on old software).  Newer feature flags
	 * (such as raw send) may break this assertion since they were
	 * introduced after the requirement was made obsolete.
	 */
	ASSERT(dbi->dbi_len % 8 == 0 ||
	    (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) != 0);

	dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp,
	    (caddr_t)dbi->dbi_buf, dbi->dbi_len,
	    0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);

	mutex_enter(&ds->ds_sendstream_lock);
	*dsp->dsa_off += dbi->dbi_len;
	mutex_exit(&ds->ds_sendstream_lock);
}
static int
dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
{
	dump_bytes_io_t dbi;

	dbi.dbi_dsp = dsp;
	dbi.dbi_buf = buf;
	dbi.dbi_len = len;

#if defined(HAVE_LARGE_STACKS)
	dump_bytes_cb(&dbi);
#else
	/*
	 * The vn_rdwr() call is performed in a taskq to ensure that there is
	 * always enough stack space to write safely to the target filesystem.
	 * The ZIO_TYPE_FREE threads are used because there can be a lot of
	 * them and they are used in vdev_file.c for a similar purpose.
	 */
	spa_taskq_dispatch_sync(dmu_objset_spa(dsp->dsa_os), ZIO_TYPE_FREE,
	    ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP);
#endif /* HAVE_LARGE_STACKS */

	return (dsp->dsa_err);
}
/*
 * For all record types except BEGIN, fill in the checksum (overlaid in
 * drr_u.drr_checksum.drr_checksum).  The checksum verifies everything
 * up to the start of the checksum itself.
 */
static int
dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
{
	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
	(void) fletcher_4_incremental_native(dsp->dsa_drr,
	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
	    &dsp->dsa_zc);
	if (dsp->dsa_drr->drr_type == DRR_BEGIN) {
		dsp->dsa_sent_begin = B_TRUE;
	} else {
		ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u.
		    drr_checksum.drr_checksum));
		dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc;
	}
	if (dsp->dsa_drr->drr_type == DRR_END) {
		dsp->dsa_sent_end = B_TRUE;
	}
	(void) fletcher_4_incremental_native(&dsp->dsa_drr->
	    drr_u.drr_checksum.drr_checksum,
	    sizeof (zio_cksum_t), &dsp->dsa_zc);
	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (SET_ERROR(EINTR));
	if (payload_len != 0) {
		(void) fletcher_4_incremental_native(payload, payload_len,
		    &dsp->dsa_zc);
		if (dump_bytes(dsp, payload, payload_len) != 0)
			return (SET_ERROR(EINTR));
	}

	return (0);
}
/*
 * Fill in the drr_free struct, or perform aggregation if the previous record is
 * also a free record, and the two are adjacent.
 *
 * Note that we send free records even for a full send, because we want to be
 * able to receive a full send as a clone, which requires a list of all the free
 * and freeobject records that were generated on the source.
 */
static int
dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
    uint64_t length)
{
	struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);

	/*
	 * When we receive a free record, dbuf_free_range() assumes
	 * that the receiving system doesn't have any dbufs in the range
	 * being freed.  This is always true because there is a one-record
	 * constraint: we only send one WRITE record for any given
	 * object,offset.  We know that the one-record constraint is
	 * true because we always send data in increasing order by
	 * object,offset.
	 *
	 * If the increasing-order constraint ever changes, we should find
	 * another way to assert that the one-record constraint is still
	 * satisfied.
	 */
	ASSERT(object > dsp->dsa_last_data_object ||
	    (object == dsp->dsa_last_data_object &&
	    offset > dsp->dsa_last_data_offset));

	/*
	 * If there is a pending op, but it's not PENDING_FREE, push it out,
	 * since free block aggregation can only be done for blocks of the
	 * same type (i.e., DRR_FREE records can only be aggregated with
	 * other DRR_FREE records.  DRR_FREEOBJECTS records can only be
	 * aggregated with other DRR_FREEOBJECTS records).
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	if (dsp->dsa_pending_op == PENDING_FREE) {
		/*
		 * There should never be a PENDING_FREE if length is
		 * DMU_OBJECT_END (because dump_dnode is the only place where
		 * this function is called with a DMU_OBJECT_END, and only after
		 * flushing any pending record).
		 */
		ASSERT(length != DMU_OBJECT_END);
		/*
		 * Check to see whether this free block can be aggregated
		 * with the pending one.
		 */
		if (drrf->drr_object == object && drrf->drr_offset +
		    drrf->drr_length == offset) {
			if (offset + length < offset)
				drrf->drr_length = DMU_OBJECT_END;
			else
				drrf->drr_length += length;
			return (0);
		} else {
			/* not a continuation.  Push out pending record */
			if (dump_record(dsp, NULL, 0) != 0)
				return (SET_ERROR(EINTR));
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}
	/* create a FREE record and make it pending */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREE;
	drrf->drr_object = object;
	drrf->drr_offset = offset;
	if (offset + length < offset)
		drrf->drr_length = DMU_OBJECT_END;
	else
		drrf->drr_length = length;
	drrf->drr_toguid = dsp->dsa_toguid;
	if (length == DMU_OBJECT_END) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
	} else {
		dsp->dsa_pending_op = PENDING_FREE;
	}

	return (0);
}
static int
dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, uint64_t object,
    uint64_t offset, int lsize, int psize, const blkptr_t *bp, void *data)
{
	uint64_t payload_size;
	boolean_t raw = (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW);
	struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);

	/*
	 * We send data in increasing object, offset order.
	 * See comment in dump_free() for details.
	 */
	ASSERT(object > dsp->dsa_last_data_object ||
	    (object == dsp->dsa_last_data_object &&
	    offset > dsp->dsa_last_data_offset));
	dsp->dsa_last_data_object = object;
	dsp->dsa_last_data_offset = offset + lsize - 1;

	/*
	 * If there is any kind of pending aggregation (currently either
	 * a grouping of free objects or free blocks), push it out to
	 * the stream, since aggregation can't be done across operations
	 * of different types.
	 */
	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}
	/* write a WRITE record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_WRITE;
	drrw->drr_object = object;
	drrw->drr_type = type;
	drrw->drr_offset = offset;
	drrw->drr_toguid = dsp->dsa_toguid;
	drrw->drr_logical_size = lsize;

	/* only set the compression fields if the buf is compressed or raw */
	if (raw || lsize != psize) {
		ASSERT(!BP_IS_EMBEDDED(bp));
		ASSERT3S(psize, >, 0);

		if (raw) {
			ASSERT(BP_IS_PROTECTED(bp));

			/*
			 * This is a raw protected block so we need to pass
			 * along everything the receiving side will need to
			 * interpret this block, including the byteswap, salt,
			 * IV, and MAC.
			 */
			if (BP_SHOULD_BYTESWAP(bp))
				drrw->drr_flags |= DRR_RAW_BYTESWAP;
			zio_crypt_decode_params_bp(bp, drrw->drr_salt,
			    drrw->drr_iv);
			zio_crypt_decode_mac_bp(bp, drrw->drr_mac);
		} else {
			/* this is a compressed block */
			ASSERT(dsp->dsa_featureflags &
			    DMU_BACKUP_FEATURE_COMPRESSED);
			ASSERT(!BP_SHOULD_BYTESWAP(bp));
			ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)));
			ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF);
			ASSERT3S(lsize, >=, psize);
		}

		/* set fields common to compressed and raw sends */
		drrw->drr_compressiontype = BP_GET_COMPRESS(bp);
		drrw->drr_compressed_size = psize;
		payload_size = drrw->drr_compressed_size;
	} else {
		payload_size = drrw->drr_logical_size;
	}

	if (bp == NULL || BP_IS_EMBEDDED(bp) || (BP_IS_PROTECTED(bp) && !raw)) {
		/*
		 * There's no pre-computed checksum for partial-block writes,
		 * embedded BP's, or encrypted BP's that are being sent as
		 * plaintext, so (like fletcher4-checksummed blocks) userland
		 * will have to compute a dedup-capable checksum itself.
		 */
		drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
	} else {
		drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
		if (zio_checksum_table[drrw->drr_checksumtype].ci_flags &
		    ZCHECKSUM_FLAG_DEDUP)
			drrw->drr_flags |= DRR_CHECKSUM_DEDUP;
		DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
		DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
		DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
		DDK_SET_CRYPT(&drrw->drr_key, BP_IS_PROTECTED(bp));
		drrw->drr_key.ddk_cksum = bp->blk_cksum;
	}

	if (dump_record(dsp, data, payload_size) != 0)
		return (SET_ERROR(EINTR));

	return (0);
}
static int
dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
    int blksz, const blkptr_t *bp)
{
	char buf[BPE_PAYLOAD_SIZE];
	struct drr_write_embedded *drrw =
	    &(dsp->dsa_drr->drr_u.drr_write_embedded);

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	ASSERT(BP_IS_EMBEDDED(bp));

	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED;
	drrw->drr_object = object;
	drrw->drr_offset = offset;
	drrw->drr_length = blksz;
	drrw->drr_toguid = dsp->dsa_toguid;
	drrw->drr_compression = BP_GET_COMPRESS(bp);
	drrw->drr_etype = BPE_GET_ETYPE(bp);
	drrw->drr_lsize = BPE_GET_LSIZE(bp);
	drrw->drr_psize = BPE_GET_PSIZE(bp);

	decode_embedded_bp_compressed(bp, buf);

	if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
		return (SET_ERROR(EINTR));

	return (0);
}
static int
dump_spill(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object, void *data)
{
	struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);
	uint64_t blksz = BP_GET_LSIZE(bp);
	uint64_t payload_size = blksz;

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write a SPILL record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_SPILL;
	drrs->drr_object = object;
	drrs->drr_length = blksz;
	drrs->drr_toguid = dsp->dsa_toguid;

	/* See comment in dump_dnode() for full details */
	if (zfs_send_unmodified_spill_blocks &&
	    (bp->blk_birth <= dsp->dsa_fromtxg)) {
		drrs->drr_flags |= DRR_SPILL_UNMODIFIED;
	}

	/* handle raw send fields */
	if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
		ASSERT(BP_IS_PROTECTED(bp));

		if (BP_SHOULD_BYTESWAP(bp))
			drrs->drr_flags |= DRR_RAW_BYTESWAP;
		drrs->drr_compressiontype = BP_GET_COMPRESS(bp);
		drrs->drr_compressed_size = BP_GET_PSIZE(bp);
		zio_crypt_decode_params_bp(bp, drrs->drr_salt, drrs->drr_iv);
		zio_crypt_decode_mac_bp(bp, drrs->drr_mac);
		payload_size = drrs->drr_compressed_size;
	}

	if (dump_record(dsp, data, payload_size) != 0)
		return (SET_ERROR(EINTR));

	return (0);
}
static int
dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
{
	struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);
	uint64_t maxobj = DNODES_PER_BLOCK *
	    (DMU_META_DNODE(dsp->dsa_os)->dn_maxblkid + 1);

	/*
	 * ZoL < 0.7 does not handle large FREEOBJECTS records correctly,
	 * leading to zfs recv never completing.  To avoid this issue, don't
	 * send FREEOBJECTS records for object IDs which cannot exist on the
	 * receiving side.
	 */
	if (maxobj > 0) {
		if (maxobj < firstobj)
			return (0);

		if (maxobj < firstobj + numobjs)
			numobjs = maxobj - firstobj;
	}

	/*
	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
	 * push it out, since free block aggregation can only be done for
	 * blocks of the same type (i.e., DRR_FREE records can only be
	 * aggregated with other DRR_FREE records.  DRR_FREEOBJECTS records
	 * can only be aggregated with other DRR_FREEOBJECTS records).
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREEOBJECTS) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}
	if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) {
		/*
		 * See whether this free object array can be aggregated
		 * with the pending one.
		 */
		if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
			drrfo->drr_numobjs += numobjs;
			return (0);
		} else {
			/* can't be aggregated.  Push out pending record */
			if (dump_record(dsp, NULL, 0) != 0)
				return (SET_ERROR(EINTR));
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}

	/* write a FREEOBJECTS record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREEOBJECTS;
	drrfo->drr_firstobj = firstobj;
	drrfo->drr_numobjs = numobjs;
	drrfo->drr_toguid = dsp->dsa_toguid;

	dsp->dsa_pending_op = PENDING_FREEOBJECTS;

	return (0);
}
static int
dump_dnode(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object,
    dnode_phys_t *dnp)
{
	struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);
	int bonuslen;

	if (object < dsp->dsa_resume_object) {
		/*
		 * Note: when resuming, we will visit all the dnodes in
		 * the block of dnodes that we are resuming from.  In
		 * this case it's unnecessary to send the dnodes prior to
		 * the one we are resuming from.  We should be at most one
		 * block's worth of dnodes behind the resume point.
		 */
		ASSERT3U(dsp->dsa_resume_object - object, <,
		    1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT));
		return (0);
	}

	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
		return (dump_freeobjects(dsp, object, 1));

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write an OBJECT record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_OBJECT;
	drro->drr_object = object;
	drro->drr_type = dnp->dn_type;
	drro->drr_bonustype = dnp->dn_bonustype;
	drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	drro->drr_bonuslen = dnp->dn_bonuslen;
	drro->drr_dn_slots = dnp->dn_extra_slots + 1;
	drro->drr_checksumtype = dnp->dn_checksum;
	drro->drr_compress = dnp->dn_compress;
	drro->drr_toguid = dsp->dsa_toguid;

	if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
	    drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
		drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;

	bonuslen = P2ROUNDUP(dnp->dn_bonuslen, 8);

	if ((dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW)) {
		ASSERT(BP_IS_ENCRYPTED(bp));

		if (BP_SHOULD_BYTESWAP(bp))
			drro->drr_flags |= DRR_RAW_BYTESWAP;

		/* needed for reconstructing dnp on recv side */
		drro->drr_maxblkid = dnp->dn_maxblkid;
		drro->drr_indblkshift = dnp->dn_indblkshift;
		drro->drr_nlevels = dnp->dn_nlevels;
		drro->drr_nblkptr = dnp->dn_nblkptr;

		/*
		 * Since we encrypt the entire bonus area, the (raw) part
		 * beyond the bonuslen is actually nonzero, so we need
		 * to send it.
		 */
		if (bonuslen != 0) {
			drro->drr_raw_bonuslen = DN_MAX_BONUS_LEN(dnp);
			bonuslen = drro->drr_raw_bonuslen;
		}
	}

	/*
	 * DRR_OBJECT_SPILL is set for every dnode which references a
	 * spill block.  This allows the receiving pool to definitively
	 * determine when a spill block should be kept or freed.
	 */
	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
		drro->drr_flags |= DRR_OBJECT_SPILL;

	if (dump_record(dsp, DN_BONUS(dnp), bonuslen) != 0)
		return (SET_ERROR(EINTR));

	/* Free anything past the end of the file. */
	if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0)
		return (SET_ERROR(EINTR));

	/*
	 * Send DRR_SPILL records for unmodified spill blocks.  This is useful
	 * because changing certain attributes of the object (e.g. blocksize)
	 * can cause old versions of ZFS to incorrectly remove a spill block.
	 * Including these records in the stream forces an up to date version
	 * to always be written ensuring they're never lost.  Current versions
	 * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can
	 * ignore these unmodified spill blocks.
	 */
	if (zfs_send_unmodified_spill_blocks &&
	    (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
	    (DN_SPILL_BLKPTR(dnp)->blk_birth <= dsp->dsa_fromtxg)) {
		struct send_block_record record;

		bzero(&record, sizeof (struct send_block_record));
		record.eos_marker = B_FALSE;
		record.bp = *DN_SPILL_BLKPTR(dnp);
		SET_BOOKMARK(&(record.zb), dmu_objset_id(dsp->dsa_os),
		    object, 0, DMU_SPILL_BLKID);

		if (do_dump(dsp, &record) != 0)
			return (SET_ERROR(EINTR));
	}

	if (dsp->dsa_err != 0)
		return (SET_ERROR(EINTR));

	return (0);
}
static int
dump_object_range(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t firstobj,
    uint64_t numslots)
{
	struct drr_object_range *drror =
	    &(dsp->dsa_drr->drr_u.drr_object_range);

	/* we only use this record type for raw sends */
	ASSERT(BP_IS_PROTECTED(bp));
	ASSERT(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW);
	ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
	ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_DNODE);
	ASSERT0(BP_GET_LEVEL(bp));

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_OBJECT_RANGE;
	drror->drr_firstobj = firstobj;
	drror->drr_numslots = numslots;
	drror->drr_toguid = dsp->dsa_toguid;
	if (BP_SHOULD_BYTESWAP(bp))
		drror->drr_flags |= DRR_RAW_BYTESWAP;
	zio_crypt_decode_params_bp(bp, drror->drr_salt, drror->drr_iv);
	zio_crypt_decode_mac_bp(bp, drror->drr_mac);

	if (dump_record(dsp, NULL, 0) != 0)
		return (SET_ERROR(EINTR));

	return (0);
}
static boolean_t
backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
{
	if (!BP_IS_EMBEDDED(bp))
		return (B_FALSE);

	/*
	 * Compression function must be legacy, or explicitly enabled.
	 */
	if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
	    !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LZ4)))
		return (B_FALSE);

	/*
	 * Embed type must be explicitly enabled.
	 */
	switch (BPE_GET_ETYPE(bp)) {
	case BP_EMBEDDED_TYPE_DATA:
		if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
			return (B_TRUE);
		break;
	default:
		return (B_FALSE);
	}
	return (B_FALSE);
}
/*
 * This is the callback function to traverse_dataset that acts as the worker
 * thread for dmu_send_impl.
 */
static int
send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
{
	struct send_thread_arg *sta = arg;
	struct send_block_record *record;
	uint64_t record_size;

	ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
	    zb->zb_object >= sta->resume.zb_object);
	ASSERT3P(sta->ds, !=, NULL);

	if (sta->cancel)
		return (SET_ERROR(EINTR));

	if (bp == NULL) {
		ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL);
		return (0);
	} else if (zb->zb_level < 0) {
		return (0);
	}

	record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP);
	record->eos_marker = B_FALSE;
	record->bp = *bp;
	record->zb = *zb;
	record->indblkshift = dnp->dn_indblkshift;
	record->datablkszsec = dnp->dn_datablkszsec;
	record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	bqueue_enqueue(&sta->q, record, record_size);

	return (0);
}
/*
 * This function kicks off the traverse_dataset.  It also handles setting the
 * error code of the thread in case something goes wrong, and pushes the End of
 * Stream record when the traverse_dataset call has finished.  If there is no
 * dataset to traverse, the thread immediately pushes the End of Stream marker.
 */
static void
send_traverse_thread(void *arg)
{
	struct send_thread_arg *st_arg = arg;
	int err = 0;
	struct send_block_record *data;
	fstrans_cookie_t cookie = spl_fstrans_mark();

	if (st_arg->ds != NULL) {
		err = traverse_dataset_resume(st_arg->ds,
		    st_arg->fromtxg, &st_arg->resume,
		    st_arg->flags, send_cb, st_arg);

		if (err != EINTR)
			st_arg->error_code = err;
	}
	data = kmem_zalloc(sizeof (*data), KM_SLEEP);
	data->eos_marker = B_TRUE;
	bqueue_enqueue(&st_arg->q, data, 1);
	spl_fstrans_unmark(cookie);
	thread_exit();
}
/*
 * This function actually handles figuring out what kind of record needs to be
 * dumped, reading the data (which has hopefully been prefetched), and calling
 * the appropriate helper function.
 */
static int
do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
{
	dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os);
	const blkptr_t *bp = &data->bp;
	const zbookmark_phys_t *zb = &data->zb;
	uint8_t indblkshift = data->indblkshift;
	uint16_t dblkszsec = data->datablkszsec;
	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
	int err = 0;

	ASSERT3U(zb->zb_level, >=, 0);

	ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
	    zb->zb_object >= dsa->dsa_resume_object);

	/*
	 * All bps of an encrypted os should have the encryption bit set.
	 * If this is not true it indicates tampering and we report an error.
	 */
	if (dsa->dsa_os->os_encrypted &&
	    !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) {
		spa_log_error(spa, zb);
		zfs_panic_recover("unencrypted block in encrypted "
		    "object set %llu", ds->ds_object);
		return (SET_ERROR(EIO));
	}

	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
	    DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
		return (0);
	} else if (BP_IS_HOLE(bp) &&
	    zb->zb_object == DMU_META_DNODE_OBJECT) {
		uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
		err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT);
	} else if (BP_IS_HOLE(bp)) {
		uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
		uint64_t offset = zb->zb_blkid * span;
		/* Don't dump free records for offsets > DMU_OBJECT_END */
		if (zb->zb_blkid == 0 || span <= DMU_OBJECT_END / zb->zb_blkid)
			err = dump_free(dsa, zb->zb_object, offset, span);
	} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
		return (0);
	} else if (type == DMU_OT_DNODE) {
		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
		arc_flags_t aflags = ARC_FLAG_WAIT;
		arc_buf_t *abuf;
		enum zio_flag zioflags = ZIO_FLAG_CANFAIL;

		if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
			ASSERT(BP_IS_ENCRYPTED(bp));
			ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
			zioflags |= ZIO_FLAG_RAW;
		}

		ASSERT0(zb->zb_level);

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0)
			return (SET_ERROR(EIO));

		dnode_phys_t *blk = abuf->b_data;
		uint64_t dnobj = zb->zb_blkid * epb;

		/*
		 * Raw sends require sending encryption parameters for the
		 * block of dnodes.  Regular sends do not need to send this
		 * info.
		 */
		if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
			ASSERT(arc_is_encrypted(abuf));
			err = dump_object_range(dsa, bp, dnobj, epb);
		}

		if (err == 0) {
			for (int i = 0; i < epb;
			    i += blk[i].dn_extra_slots + 1) {
				err = dump_dnode(dsa, bp, dnobj + i, blk + i);
				if (err != 0)
					break;
			}
		}
		arc_buf_destroy(abuf, &abuf);
	} else if (type == DMU_OT_SA) {
		arc_flags_t aflags = ARC_FLAG_WAIT;
		arc_buf_t *abuf;
		enum zio_flag zioflags = ZIO_FLAG_CANFAIL;

		if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
			ASSERT(BP_IS_PROTECTED(bp));
			zioflags |= ZIO_FLAG_RAW;
		}

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0)
			return (SET_ERROR(EIO));

		err = dump_spill(dsa, bp, zb->zb_object, abuf->b_data);
		arc_buf_destroy(abuf, &abuf);
	} else if (backup_do_embed(dsa, bp)) {
		/* it's an embedded level-0 block of a regular object */
		int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
		ASSERT0(zb->zb_level);
		err = dump_write_embedded(dsa, zb->zb_object,
		    zb->zb_blkid * blksz, blksz, bp);
	} else {
		/* it's a level-0 block of a regular object */
		arc_flags_t aflags = ARC_FLAG_WAIT;
		arc_buf_t *abuf;
		int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
		uint64_t offset;

		/*
		 * If we have large blocks stored on disk but the send flags
		 * don't allow us to send large blocks, we split the data from
		 * the arc buf into chunks.
		 */
		boolean_t split_large_blocks = blksz > SPA_OLD_MAXBLOCKSIZE &&
		    !(dsa->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS);

		/*
		 * Raw sends require that we always get raw data as it exists
		 * on disk, so we assert that we are not splitting blocks here.
		 */
		boolean_t request_raw =
		    (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) != 0;

		/*
		 * We should only request compressed data from the ARC if all
		 * the following are true:
		 *  - stream compression was requested
		 *  - we aren't splitting large blocks into smaller chunks
		 *  - the data won't need to be byteswapped before sending
		 *  - this isn't an embedded block
		 *  - this isn't metadata (if receiving on a different endian
		 *    system it can be byteswapped more easily)
		 */
		boolean_t request_compressed =
		    (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED) &&
		    !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) &&
		    !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp));

		IMPLY(request_raw, !split_large_blocks);
		IMPLY(request_raw, BP_IS_PROTECTED(bp));
		ASSERT0(zb->zb_level);
		ASSERT(zb->zb_object > dsa->dsa_resume_object ||
		    (zb->zb_object == dsa->dsa_resume_object &&
		    zb->zb_blkid * blksz >= dsa->dsa_resume_offset));

		ASSERT3U(blksz, ==, BP_GET_LSIZE(bp));

		enum zio_flag zioflags = ZIO_FLAG_CANFAIL;
		if (request_raw)
			zioflags |= ZIO_FLAG_RAW;
		else if (request_compressed)
			zioflags |= ZIO_FLAG_RAW_COMPRESS;

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) {
			if (zfs_send_corrupt_data) {
				uint64_t *ptr;
				/* Send a block filled with 0x"zfs badd bloc" */
				abuf = arc_alloc_buf(spa, &abuf, ARC_BUFC_DATA,
				    blksz);
				for (ptr = abuf->b_data;
				    (char *)ptr < (char *)abuf->b_data + blksz;
				    ptr++)
					*ptr = 0x2f5baddb10cULL;
			} else {
				return (SET_ERROR(EIO));
			}
		}

		offset = zb->zb_blkid * blksz;

		if (split_large_blocks) {
			ASSERT0(arc_is_encrypted(abuf));
			ASSERT3U(arc_get_compression(abuf), ==,
			    ZIO_COMPRESS_OFF);
			char *buf = abuf->b_data;
			while (blksz > 0 && err == 0) {
				int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
				err = dump_write(dsa, type, zb->zb_object,
				    offset, n, n, NULL, buf);
				offset += n;
				buf += n;
				blksz -= n;
			}
		} else {
			err = dump_write(dsa, type, zb->zb_object, offset,
			    blksz, arc_buf_size(abuf), bp, abuf->b_data);
		}
		arc_buf_destroy(abuf, &abuf);
	}

	ASSERT(err == 0 || err == EINTR);
	return (err);
}
/*
 * Pop the new data off the queue, and free the old data.
 */
static struct send_block_record *
get_next_record(bqueue_t *bq, struct send_block_record *data)
{
	struct send_block_record *tmp = bqueue_dequeue(bq);
	kmem_free(data, sizeof (*data));
	return (tmp);
}
/*
 * Actually do the bulk of the work in a zfs send.
 *
 * Note: Releases dp using the specified tag.
 */
static int
dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
    zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone,
    boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
    boolean_t rawok, int outfd, uint64_t resumeobj, uint64_t resumeoff,
    vnode_t *vp, offset_t *off)
{
	objset_t *os;
	dmu_replay_record_t *drr;
	dmu_sendarg_t *dsp;
	int err;
	uint64_t fromtxg = 0;
	uint64_t featureflags = 0;
	struct send_thread_arg to_arg;
	void *payload = NULL;
	size_t payload_len = 0;
	struct send_block_record *to_data;

	err = dmu_objset_from_ds(to_ds, &os);
	if (err != 0) {
		dsl_pool_rele(dp, tag);
		return (err);
	}

	/*
	 * If this is a non-raw send of an encrypted ds, we can ensure that
	 * the objset_phys_t is authenticated.  This is safe because this is
	 * either a snapshot or we have owned the dataset, ensuring that
	 * it can't be modified.
	 */
	if (!rawok && os->os_encrypted &&
	    arc_is_unauthenticated(os->os_phys_buf)) {
		zbookmark_phys_t zb;

		SET_BOOKMARK(&zb, to_ds->ds_object, ZB_ROOT_OBJECT,
		    ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
		err = arc_untransform(os->os_phys_buf, os->os_spa,
		    &zb, B_FALSE);
		if (err != 0) {
			dsl_pool_rele(dp, tag);
			return (err);
		}

		ASSERT0(arc_is_unauthenticated(os->os_phys_buf));
	}

	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
	drr->drr_type = DRR_BEGIN;
	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
	DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
	    DMU_SUBSTREAM);

	bzero(&to_arg, sizeof (to_arg));

#ifdef _KERNEL
	if (dmu_objset_type(os) == DMU_OST_ZFS) {
		uint64_t version;
		if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) {
			kmem_free(drr, sizeof (dmu_replay_record_t));
			dsl_pool_rele(dp, tag);
			return (SET_ERROR(EINVAL));
		}
		if (version >= ZPL_VERSION_SA) {
			featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
		}
	}
#endif

	/* raw sends imply large_block_ok */
	if ((large_block_ok || rawok) &&
	    dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_BLOCKS))
		featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
	if (dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_DNODE))
		featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE;

	/* encrypted datasets will not have embedded blocks */
	if ((embedok || rawok) && !os->os_encrypted &&
	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
		featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
	}

	/* raw send implies compressok */
	if (compressok || rawok)
		featureflags |= DMU_BACKUP_FEATURE_COMPRESSED;

	if (rawok && os->os_encrypted)
		featureflags |= DMU_BACKUP_FEATURE_RAW;

	if ((featureflags &
	    (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED |
	    DMU_BACKUP_FEATURE_RAW)) != 0 &&
	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) {
		featureflags |= DMU_BACKUP_FEATURE_LZ4;
	}

	if (resumeobj != 0 || resumeoff != 0) {
		featureflags |= DMU_BACKUP_FEATURE_RESUMING;
	}

	DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
	    featureflags);

	drr->drr_u.drr_begin.drr_creation_time =
	    dsl_dataset_phys(to_ds)->ds_creation_time;
	drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
	if (is_clone)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
	drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
	if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
	if (zfs_send_set_freerecords_bit)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS;

	drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_SPILL_BLOCK;

	if (ancestor_zb != NULL) {
		drr->drr_u.drr_begin.drr_fromguid =
		    ancestor_zb->zbm_guid;
		fromtxg = ancestor_zb->zbm_creation_txg;
	}
	dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname);
	if (!to_ds->ds_is_snapshot) {
		(void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--",
		    sizeof (drr->drr_u.drr_begin.drr_toname));
	}

	dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);

	dsp->dsa_drr = drr;
	dsp->dsa_vp = vp;
	dsp->dsa_outfd = outfd;
	dsp->dsa_proc = curproc;
	dsp->dsa_os = os;
	dsp->dsa_off = off;
	dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
	dsp->dsa_fromtxg = fromtxg;
	dsp->dsa_pending_op = PENDING_NONE;
	dsp->dsa_featureflags = featureflags;
	dsp->dsa_resume_object = resumeobj;
	dsp->dsa_resume_offset = resumeoff;

	mutex_enter(&to_ds->ds_sendstream_lock);
	list_insert_head(&to_ds->ds_sendstreams, dsp);
	mutex_exit(&to_ds->ds_sendstream_lock);

	dsl_dataset_long_hold(to_ds, FTAG);
	dsl_pool_rele(dp, tag);

	/* handle features that require a DRR_BEGIN payload */
	if (featureflags &
	    (DMU_BACKUP_FEATURE_RESUMING | DMU_BACKUP_FEATURE_RAW)) {
		nvlist_t *keynvl = NULL;
		nvlist_t *nvl = fnvlist_alloc();

		if (featureflags & DMU_BACKUP_FEATURE_RESUMING) {
			dmu_object_info_t to_doi;
			err = dmu_object_info(os, resumeobj, &to_doi);
			if (err != 0) {
				fnvlist_free(nvl);
				goto out;
			}

			SET_BOOKMARK(&to_arg.resume, to_ds->ds_object,
			    resumeobj, 0,
			    resumeoff / to_doi.doi_data_block_size);

			fnvlist_add_uint64(nvl, "resume_object", resumeobj);
			fnvlist_add_uint64(nvl, "resume_offset", resumeoff);
		}

		if (featureflags & DMU_BACKUP_FEATURE_RAW) {
			uint64_t ivset_guid = (ancestor_zb != NULL) ?
			    ancestor_zb->zbm_ivset_guid : 0;

			ASSERT(os->os_encrypted);

			err = dsl_crypto_populate_key_nvlist(to_ds,
			    ivset_guid, &keynvl);
			if (err != 0) {
				fnvlist_free(nvl);
				goto out;
			}

			fnvlist_add_nvlist(nvl, "crypt_keydata", keynvl);
		}

		payload = fnvlist_pack(nvl, &payload_len);
		drr->drr_payloadlen = payload_len;
		fnvlist_free(keynvl);
		fnvlist_free(nvl);
	}

	err = dump_record(dsp, payload, payload_len);
	fnvlist_pack_free(payload, payload_len);
	if (err != 0) {
		err = dsp->dsa_err;
		goto out;
	}

	err = bqueue_init(&to_arg.q,
	    MAX(zfs_send_queue_length, 2 * zfs_max_recordsize),
	    offsetof(struct send_block_record, ln));
	to_arg.error_code = 0;
	to_arg.cancel = B_FALSE;
	to_arg.ds = to_ds;
	to_arg.fromtxg = fromtxg;
	to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH;
	if (rawok)
		to_arg.flags |= TRAVERSE_NO_DECRYPT;
	(void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, curproc,
	    TS_RUN, minclsyspri);

	to_data = bqueue_dequeue(&to_arg.q);

	while (!to_data->eos_marker && err == 0) {
		err = do_dump(dsp, to_data);
		to_data = get_next_record(&to_arg.q, to_data);
		if (issig(JUSTLOOKING) && issig(FORREAL))
			err = EINTR;
	}

	if (err != 0) {
		to_arg.cancel = B_TRUE;
		while (!to_data->eos_marker) {
			to_data = get_next_record(&to_arg.q, to_data);
		}
	}
	kmem_free(to_data, sizeof (*to_data));

	bqueue_destroy(&to_arg.q);

	if (err == 0 && to_arg.error_code != 0)
		err = to_arg.error_code;

	if (err != 0)
		goto out;

	if (dsp->dsa_pending_op != PENDING_NONE)
		if (dump_record(dsp, NULL, 0) != 0)
			err = SET_ERROR(EINTR);

	if (err != 0) {
		if (err == EINTR && dsp->dsa_err != 0)
			err = dsp->dsa_err;
		goto out;
	}

	bzero(drr, sizeof (dmu_replay_record_t));
	drr->drr_type = DRR_END;
	drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
	drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;

	if (dump_record(dsp, NULL, 0) != 0)
		err = dsp->dsa_err;
out:
	mutex_enter(&to_ds->ds_sendstream_lock);
	list_remove(&to_ds->ds_sendstreams, dsp);
	mutex_exit(&to_ds->ds_sendstream_lock);

	VERIFY(err != 0 || (dsp->dsa_sent_begin && dsp->dsa_sent_end));

	kmem_free(drr, sizeof (dmu_replay_record_t));
	kmem_free(dsp, sizeof (dmu_sendarg_t));

	dsl_dataset_long_rele(to_ds, FTAG);

	return (err);
}
int
dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
    boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
    boolean_t rawok, int outfd, vnode_t *vp, offset_t *off)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	dsl_dataset_t *fromds = NULL;
	ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT;
	int err;

	err = dsl_pool_hold(pool, FTAG, &dp);
	if (err != 0)
		return (err);

	err = dsl_dataset_hold_obj_flags(dp, tosnap, dsflags, FTAG, &ds);
	if (err != 0) {
		dsl_pool_rele(dp, FTAG);
		return (err);
	}

	if (fromsnap != 0) {
		zfs_bookmark_phys_t zb = { 0 };
		boolean_t is_clone;

		err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds);
		if (err != 0) {
			dsl_dataset_rele_flags(ds, dsflags, FTAG);
			dsl_pool_rele(dp, FTAG);
			return (err);
		}
		if (!dsl_dataset_is_before(ds, fromds, 0)) {
			err = SET_ERROR(EXDEV);
			dsl_dataset_rele(fromds, FTAG);
			dsl_dataset_rele_flags(ds, dsflags, FTAG);
			dsl_pool_rele(dp, FTAG);
			return (err);
		}

		zb.zbm_creation_time =
		    dsl_dataset_phys(fromds)->ds_creation_time;
		zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg;
		zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;

		if (dsl_dataset_is_zapified(fromds)) {
			(void) zap_lookup(dp->dp_meta_objset,
			    fromds->ds_object, DS_FIELD_IVSET_GUID, 8, 1,
			    &zb.zbm_ivset_guid);
		}

		is_clone = (fromds->ds_dir != ds->ds_dir);
		dsl_dataset_rele(fromds, FTAG);
		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
		    embedok, large_block_ok, compressok, rawok, outfd,
		    0, 0, vp, off);
	} else {
		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
		    embedok, large_block_ok, compressok, rawok, outfd,
		    0, 0, vp, off);
	}
	dsl_dataset_rele_flags(ds, dsflags, FTAG);
	return (err);
}
int
dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
    boolean_t large_block_ok, boolean_t compressok, boolean_t rawok,
    int outfd, uint64_t resumeobj, uint64_t resumeoff, vnode_t *vp,
    offset_t *off)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	int err;
	ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT;
	boolean_t owned = B_FALSE;

	if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL)
		return (SET_ERROR(EINVAL));

	err = dsl_pool_hold(tosnap, FTAG, &dp);
	if (err != 0)
		return (err);

	if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) {
		/*
		 * We are sending a filesystem or volume.  Ensure
		 * that it doesn't change by owning the dataset.
		 */
		err = dsl_dataset_own(dp, tosnap, dsflags, FTAG, &ds);
		owned = B_TRUE;
	} else {
		err = dsl_dataset_hold_flags(dp, tosnap, dsflags, FTAG, &ds);
	}
	if (err != 0) {
		dsl_pool_rele(dp, FTAG);
		return (err);
	}

	if (fromsnap != NULL) {
		zfs_bookmark_phys_t zb = { 0 };
		boolean_t is_clone = B_FALSE;
		int fsnamelen = strchr(tosnap, '@') - tosnap;

		/*
		 * If the fromsnap is in a different filesystem, then
		 * mark the send stream as a clone.
		 */
		if (strncmp(tosnap, fromsnap, fsnamelen) != 0 ||
		    (fromsnap[fsnamelen] != '@' &&
		    fromsnap[fsnamelen] != '#')) {
			is_clone = B_TRUE;
		}

		if (strchr(fromsnap, '@')) {
			dsl_dataset_t *fromds;
			err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds);
			if (err == 0) {
				if (!dsl_dataset_is_before(ds, fromds, 0))
					err = SET_ERROR(EXDEV);
				zb.zbm_creation_time =
				    dsl_dataset_phys(fromds)->ds_creation_time;
				zb.zbm_creation_txg =
				    dsl_dataset_phys(fromds)->ds_creation_txg;
				zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
				is_clone = (ds->ds_dir != fromds->ds_dir);

				if (dsl_dataset_is_zapified(fromds)) {
					(void) zap_lookup(dp->dp_meta_objset,
					    fromds->ds_object,
					    DS_FIELD_IVSET_GUID, 8, 1,
					    &zb.zbm_ivset_guid);
				}
				dsl_dataset_rele(fromds, FTAG);
			}
		} else {
			err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb);
		}
		if (err != 0) {
			if (owned)
				dsl_dataset_disown(ds, dsflags, FTAG);
			else
				dsl_dataset_rele_flags(ds, dsflags, FTAG);

			dsl_pool_rele(dp, FTAG);
			return (err);
		}
		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
		    embedok, large_block_ok, compressok, rawok,
		    outfd, resumeobj, resumeoff, vp, off);
	} else {
		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
		    embedok, large_block_ok, compressok, rawok,
		    outfd, resumeobj, resumeoff, vp, off);
	}
	if (owned)
		dsl_dataset_disown(ds, dsflags, FTAG);
	else
		dsl_dataset_rele_flags(ds, dsflags, FTAG);

	return (err);
}
static int
dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed,
    uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep)
{
	int err = 0;
	uint64_t size;
	objset_t *os;
	/*
	 * Assume that space (both on-disk and in-stream) is dominated by
	 * data.  We will adjust for indirect blocks and the copies property,
	 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
	 */
	uint64_t recordsize;
	uint64_t record_count;

	VERIFY0(dmu_objset_from_ds(ds, &os));

	/* Assume all (uncompressed) blocks are recordsize. */
	if (zfs_override_estimate_recordsize != 0) {
		recordsize = zfs_override_estimate_recordsize;
	} else if (os->os_phys->os_type == DMU_OST_ZVOL) {
		err = dsl_prop_get_int_ds(ds,
		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &recordsize);
	} else {
		err = dsl_prop_get_int_ds(ds,
		    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), &recordsize);
	}
	if (err != 0)
		return (err);
	record_count = uncompressed / recordsize;

	/*
	 * If we're estimating a send size for a compressed stream, use the
	 * compressed data size to estimate the stream size.  Otherwise, use the
	 * uncompressed data size.
	 */
	size = stream_compressed ? compressed : uncompressed;

	/*
	 * Subtract out approximate space used by indirect blocks.
	 * Assume most space is used by data blocks (non-indirect, non-dnode).
	 * Assume no ditto blocks or internal fragmentation.
	 *
	 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
	 * block.
	 */
	size -= record_count * sizeof (blkptr_t);

	/* Add in the space for the record associated with each block. */
	size += record_count * sizeof (dmu_replay_record_t);

	*sizep = size;

	return (0);
}
int
dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds,
    boolean_t stream_compressed, uint64_t *sizep)
{
	int err;
	uint64_t uncomp, comp;

	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));

	/* tosnap must be a snapshot */
	if (!ds->ds_is_snapshot)
		return (SET_ERROR(EINVAL));

	/* fromsnap, if provided, must be a snapshot */
	if (fromds != NULL && !fromds->ds_is_snapshot)
		return (SET_ERROR(EINVAL));

	/*
	 * fromsnap must be an earlier snapshot from the same fs as tosnap,
	 * or the origin's fs.
	 */
	if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0))
		return (SET_ERROR(EXDEV));

	/* Get compressed and uncompressed size estimates of changed data. */
	if (fromds == NULL) {
		uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
		comp = dsl_dataset_phys(ds)->ds_compressed_bytes;
	} else {
		uint64_t used;
		err = dsl_dataset_space_written(fromds, ds,
		    &used, &comp, &uncomp);
		if (err != 0)
			return (err);
	}

	err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp,
	    stream_compressed, sizep);
	/*
	 * Add the size of the BEGIN and END records to the estimate.
	 */
	*sizep += 2 * sizeof (dmu_replay_record_t);

	return (err);
}
struct calculate_send_arg {
	uint64_t uncompressed;
	uint64_t compressed;
};

/*
 * Simple callback used to traverse the blocks of a snapshot and sum their
 * uncompressed and compressed sizes.
 */
static int
dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	struct calculate_send_arg *space = arg;
	if (bp != NULL && !BP_IS_HOLE(bp)) {
		space->uncompressed += BP_GET_UCSIZE(bp);
		space->compressed += BP_GET_PSIZE(bp);
	}
	return (0);
}
/*
 * Given a destination snapshot and a TXG, calculate the approximate size of a
 * send stream sent from that TXG.  from_txg may be zero, indicating that the
 * whole snapshot will be sent.
 */
int
dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg,
    boolean_t stream_compressed, uint64_t *sizep)
{
	int err;
	struct calculate_send_arg size = { 0 };

	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));

	/* tosnap must be a snapshot */
	if (!dsl_dataset_is_snapshot(ds))
		return (SET_ERROR(EINVAL));

	/* verify that from_txg is before the provided snapshot was taken */
	if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) {
		return (SET_ERROR(EXDEV));
	}
	/*
	 * traverse the blocks of the snapshot with birth times after
	 * from_txg, summing their uncompressed size
	 */
	err = traverse_dataset(ds, from_txg,
	    TRAVERSE_POST | TRAVERSE_NO_DECRYPT,
	    dmu_calculate_send_traversal, &size);

	if (err)
		return (err);

	err = dmu_adjust_send_estimate_for_indirects(ds, size.uncompressed,
	    size.compressed, stream_compressed, sizep);
	return (err);
}
#if defined(_KERNEL)
module_param(zfs_override_estimate_recordsize, ulong, 0644);
MODULE_PARM_DESC(zfs_override_estimate_recordsize,
	"Record size calculation override for zfs send estimates");

module_param(zfs_send_corrupt_data, int, 0644);
MODULE_PARM_DESC(zfs_send_corrupt_data, "Allow sending corrupt data");

module_param(zfs_send_queue_length, int, 0644);
MODULE_PARM_DESC(zfs_send_queue_length, "Maximum send queue length");

module_param(zfs_send_unmodified_spill_blocks, int, 0644);
MODULE_PARM_DESC(zfs_send_unmodified_spill_blocks,
	"Send unmodified spill blocks");
#endif