/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
 * Copyright 2016 RackTop Systems.
 * Copyright (c) 2016 Actifio, Inc. All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/spa_impl.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_znode.h>
#include <zfs_fletcher.h>
#include <sys/avl.h>
#include <sys/ddt.h>
#include <sys/zfs_onexit.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
#include <sys/blkptr.h>
#include <sys/dsl_bookmark.h>
#include <sys/zfeature.h>
#include <sys/bqueue.h>
#include <sys/zvol.h>
#include <sys/policy.h>

/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
int zfs_send_corrupt_data = B_FALSE;
int zfs_send_queue_length = SPA_MAXBLOCKSIZE;
/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */
int zfs_send_set_freerecords_bit = B_TRUE;

/*
 * Use this to override the recordsize calculation for fast zfs send estimates.
 */
unsigned long zfs_override_estimate_recordsize = 0;

#define	BP_SPAN(datablkszsec, indblkshift, level) \
	(((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \
	(level) * (indblkshift - SPA_BLKPTRSHIFT)))

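/*
 * Worked example of BP_SPAN (values illustrative, not taken from this file):
 * with 128K data blocks (datablkszsec == 256, since 256 << 9 == 128K) and
 * indblkshift == 17 (128K indirect blocks, each holding
 * 2^(17 - SPA_BLKPTRSHIFT) == 1024 blkptrs):
 *
 *	level 0: 256 << 9        == 128K
 *	level 1: 256 << (9 + 10) == 128M
 *	level 2: 256 << (9 + 20) == 128G
 *
 * i.e., each level up multiplies the logical span covered by one bp by the
 * number of blkptrs that fit in an indirect block.
 */
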
struct send_thread_arg {
	bqueue_t q;
	dsl_dataset_t *ds;	/* Dataset to traverse */
	uint64_t fromtxg;	/* Traverse from this txg */
	int flags;		/* flags to pass to traverse_dataset */
	int error_code;
	boolean_t cancel;
	zbookmark_phys_t resume;
};

struct send_block_record {
	boolean_t eos_marker;	/* Marks the end of the stream */
	blkptr_t bp;
	zbookmark_phys_t zb;
	uint8_t indblkshift;
	uint16_t datablkszsec;
	bqueue_node_t ln;
};

typedef struct dump_bytes_io {
	dmu_sendarg_t	*dbi_dsp;
	void		*dbi_buf;
	int		dbi_len;
} dump_bytes_io_t;

static void
dump_bytes_cb(void *arg)
{
	dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg;
	dmu_sendarg_t *dsp = dbi->dbi_dsp;
	dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os);
	ssize_t resid; /* have to get resid to get detailed errno */

	/*
	 * The code does not rely on len being a multiple of 8. We keep
	 * this assertion because of the corresponding assertion in
	 * receive_read(). Keeping this assertion ensures that we do not
	 * inadvertently break backwards compatibility (causing the assertion
	 * in receive_read() to trigger on old software). Newer feature flags
	 * (such as raw send) may break this assertion since they were
	 * introduced after the requirement was made obsolete.
	 */

	ASSERT(dbi->dbi_len % 8 == 0 ||
	    (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) != 0);

	dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp,
	    (caddr_t)dbi->dbi_buf, dbi->dbi_len,
	    0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);

	mutex_enter(&ds->ds_sendstream_lock);
	*dsp->dsa_off += dbi->dbi_len;
	mutex_exit(&ds->ds_sendstream_lock);
}

static int
dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
{
	dump_bytes_io_t dbi;

	dbi.dbi_dsp = dsp;
	dbi.dbi_buf = buf;
	dbi.dbi_len = len;

#if defined(HAVE_LARGE_STACKS)
	dump_bytes_cb(&dbi);
#else
	/*
	 * The vn_rdwr() call is performed in a taskq to ensure that there is
	 * always enough stack space to write safely to the target filesystem.
	 * The ZIO_TYPE_FREE threads are used because there can be a lot of
	 * them and they are used in vdev_file.c for a similar purpose.
	 */
	spa_taskq_dispatch_sync(dmu_objset_spa(dsp->dsa_os), ZIO_TYPE_FREE,
	    ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP);
#endif /* HAVE_LARGE_STACKS */

	return (dsp->dsa_err);
}

/*
 * For all record types except BEGIN, fill in the checksum (overlaid in
 * drr_u.drr_checksum.drr_checksum).  The checksum verifies everything
 * up to the start of the checksum itself.
 */
static int
dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
{
	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
	(void) fletcher_4_incremental_native(dsp->dsa_drr,
	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
	    &dsp->dsa_zc);
	if (dsp->dsa_drr->drr_type == DRR_BEGIN) {
		dsp->dsa_sent_begin = B_TRUE;
	} else {
		ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u.
		    drr_checksum.drr_checksum));
		dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc;
	}
	if (dsp->dsa_drr->drr_type == DRR_END) {
		dsp->dsa_sent_end = B_TRUE;
	}
	(void) fletcher_4_incremental_native(&dsp->dsa_drr->
	    drr_u.drr_checksum.drr_checksum,
	    sizeof (zio_cksum_t), &dsp->dsa_zc);
	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (SET_ERROR(EINTR));
	if (payload_len != 0) {
		(void) fletcher_4_incremental_native(payload, payload_len,
		    &dsp->dsa_zc);
		if (dump_bytes(dsp, payload, payload_len) != 0)
			return (SET_ERROR(EINTR));
	}
	return (0);
}

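/*
 * Sketch of the wire layout dump_record() produces (offsets illustrative;
 * the only structural fact relied on is the ASSERT3U above, i.e. the
 * checksum is the last field of the record):
 *
 *	+-------------------------------+
 *	| drr_type, drr_payloadlen      |
 *	| drr_u (type-specific body)    |  <- covered by first fletcher pass
 *	+-------------------------------+
 *	| drr_u...drr_checksum (16B)    |  <- filled from dsa_zc (left zero
 *	+-------------------------------+     for BEGIN), then folded in
 *	| optional payload              |
 *	+-------------------------------+
 *
 * Because dsa_zc is never reset, each record's checksum covers the entire
 * stream up to that point, including earlier checksum fields.
 */
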
/*
 * Fill in the drr_free struct, or perform aggregation if the previous record is
 * also a free record, and the two are adjacent.
 *
 * Note that we send free records even for a full send, because we want to be
 * able to receive a full send as a clone, which requires a list of all the free
 * and freeobject records that were generated on the source.
 */
static int
dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
    uint64_t length)
{
	struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);

	/*
	 * When we receive a free record, dbuf_free_range() assumes
	 * that the receiving system doesn't have any dbufs in the range
	 * being freed. This is always true because there is a one-record
	 * constraint: we only send one WRITE record for any given
	 * object,offset.  We know that the one-record constraint is
	 * true because we always send data in increasing order by
	 * object,offset.
	 *
	 * If the increasing-order constraint ever changes, we should find
	 * another way to assert that the one-record constraint is still
	 * satisfied.
	 */
	ASSERT(object > dsp->dsa_last_data_object ||
	    (object == dsp->dsa_last_data_object &&
	    offset > dsp->dsa_last_data_offset));

	/*
	 * If there is a pending op, but it's not PENDING_FREE, push it out,
	 * since free block aggregation can only be done for blocks of the
	 * same type (i.e., DRR_FREE records can only be aggregated with
	 * other DRR_FREE records.  DRR_FREEOBJECTS records can only be
	 * aggregated with other DRR_FREEOBJECTS records.)
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	if (dsp->dsa_pending_op == PENDING_FREE) {
		/*
		 * There should never be a PENDING_FREE if length is
		 * DMU_OBJECT_END (because dump_dnode is the only place where
		 * this function is called with a DMU_OBJECT_END, and only after
		 * flushing any pending record).
		 */
		ASSERT(length != DMU_OBJECT_END);
		/*
		 * Check to see whether this free block can be aggregated
		 * with pending one.
		 */
		if (drrf->drr_object == object && drrf->drr_offset +
		    drrf->drr_length == offset) {
			if (offset + length < offset)
				drrf->drr_length = DMU_OBJECT_END;
			else
				drrf->drr_length += length;
			return (0);
		} else {
			/* not a continuation.  Push out pending record */
			if (dump_record(dsp, NULL, 0) != 0)
				return (SET_ERROR(EINTR));
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}
	/* create a FREE record and make it pending */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREE;
	drrf->drr_object = object;
	drrf->drr_offset = offset;
	if (offset + length < offset)
		drrf->drr_length = DMU_OBJECT_END;
	else
		drrf->drr_length = length;
	drrf->drr_toguid = dsp->dsa_toguid;
	if (length == DMU_OBJECT_END) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
	} else {
		dsp->dsa_pending_op = PENDING_FREE;
	}

	return (0);
}

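/*
 * Aggregation example for dump_free() (values illustrative): with a pending
 * FREE of (object 5, offset 0, length 128K), a call for (object 5,
 * offset 128K, length 128K) extends the pending record to length 256K
 * instead of emitting a second record; a call for (object 6, offset 0,
 * length 128K) flushes the pending record and starts a new one.
 */
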
static int
dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, uint64_t object,
    uint64_t offset, int lsize, int psize, const blkptr_t *bp, void *data)
{
	uint64_t payload_size;
	boolean_t raw = (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW);
	struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);

	/*
	 * We send data in increasing object, offset order.
	 * See comment in dump_free() for details.
	 */
	ASSERT(object > dsp->dsa_last_data_object ||
	    (object == dsp->dsa_last_data_object &&
	    offset > dsp->dsa_last_data_offset));
	dsp->dsa_last_data_object = object;
	dsp->dsa_last_data_offset = offset + lsize - 1;

	/*
	 * If there is any kind of pending aggregation (currently either
	 * a grouping of free objects or free blocks), push it out to
	 * the stream, since aggregation can't be done across operations
	 * of different types.
	 */
	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}
	/* write a WRITE record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_WRITE;
	drrw->drr_object = object;
	drrw->drr_type = type;
	drrw->drr_offset = offset;
	drrw->drr_toguid = dsp->dsa_toguid;
	drrw->drr_logical_size = lsize;

	/* only set the compression fields if the buf is compressed or raw */
	if (raw || lsize != psize) {
		ASSERT(!BP_IS_EMBEDDED(bp));
		ASSERT3S(psize, >, 0);

		if (raw) {
			ASSERT(BP_IS_PROTECTED(bp));

			/*
			 * This is a raw protected block so we need to pass
			 * along everything the receiving side will need to
			 * interpret this block, including the byteswap, salt,
			 * IV, and MAC.
			 */
			if (BP_SHOULD_BYTESWAP(bp))
				drrw->drr_flags |= DRR_RAW_BYTESWAP;
			zio_crypt_decode_params_bp(bp, drrw->drr_salt,
			    drrw->drr_iv);
			zio_crypt_decode_mac_bp(bp, drrw->drr_mac);
		} else {
			/* this is a compressed block */
			ASSERT(dsp->dsa_featureflags &
			    DMU_BACKUP_FEATURE_COMPRESSED);
			ASSERT(!BP_SHOULD_BYTESWAP(bp));
			ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)));
			ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF);
			ASSERT3S(lsize, >=, psize);
		}

		/* set fields common to compressed and raw sends */
		drrw->drr_compressiontype = BP_GET_COMPRESS(bp);
		drrw->drr_compressed_size = psize;
		payload_size = drrw->drr_compressed_size;
	} else {
		payload_size = drrw->drr_logical_size;
	}

	if (bp == NULL || BP_IS_EMBEDDED(bp) || (BP_IS_PROTECTED(bp) && !raw)) {
		/*
		 * There's no pre-computed checksum for partial-block writes,
		 * embedded BP's, or encrypted BP's that are being sent as
		 * plaintext, so (like fletcher4-checksummed blocks) userland
		 * will have to compute a dedup-capable checksum itself.
		 */
		drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
	} else {
		drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
		if (zio_checksum_table[drrw->drr_checksumtype].ci_flags &
		    ZCHECKSUM_FLAG_DEDUP)
			drrw->drr_flags |= DRR_CHECKSUM_DEDUP;
		DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
		DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
		DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
		DDK_SET_CRYPT(&drrw->drr_key, BP_IS_PROTECTED(bp));
		drrw->drr_key.ddk_cksum = bp->blk_cksum;
	}

	if (dump_record(dsp, data, payload_size) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}

static int
dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
    int blksz, const blkptr_t *bp)
{
	char buf[BPE_PAYLOAD_SIZE];
	struct drr_write_embedded *drrw =
	    &(dsp->dsa_drr->drr_u.drr_write_embedded);

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	ASSERT(BP_IS_EMBEDDED(bp));

	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED;
	drrw->drr_object = object;
	drrw->drr_offset = offset;
	drrw->drr_length = blksz;
	drrw->drr_toguid = dsp->dsa_toguid;
	drrw->drr_compression = BP_GET_COMPRESS(bp);
	drrw->drr_etype = BPE_GET_ETYPE(bp);
	drrw->drr_lsize = BPE_GET_LSIZE(bp);
	drrw->drr_psize = BPE_GET_PSIZE(bp);

	decode_embedded_bp_compressed(bp, buf);

	if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}

static int
dump_spill(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object, void *data)
{
	struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);
	uint64_t blksz = BP_GET_LSIZE(bp);
	uint64_t payload_size = blksz;

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write a SPILL record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_SPILL;
	drrs->drr_object = object;
	drrs->drr_length = blksz;
	drrs->drr_toguid = dsp->dsa_toguid;

	/* handle raw send fields */
	if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
		ASSERT(BP_IS_PROTECTED(bp));

		if (BP_SHOULD_BYTESWAP(bp))
			drrs->drr_flags |= DRR_RAW_BYTESWAP;
		drrs->drr_compressiontype = BP_GET_COMPRESS(bp);
		drrs->drr_compressed_size = BP_GET_PSIZE(bp);
		zio_crypt_decode_params_bp(bp, drrs->drr_salt, drrs->drr_iv);
		zio_crypt_decode_mac_bp(bp, drrs->drr_mac);
		payload_size = drrs->drr_compressed_size;
	}

	if (dump_record(dsp, data, payload_size) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}

static int
dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
{
	struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);
	uint64_t maxobj = DNODES_PER_BLOCK *
	    (DMU_META_DNODE(dsp->dsa_os)->dn_maxblkid + 1);

	/*
	 * ZoL < 0.7 does not handle large FREEOBJECTS records correctly,
	 * leading to zfs recv never completing.  To avoid this issue, don't
	 * send FREEOBJECTS records for object IDs which cannot exist on the
	 * receiving side.
	 */
	if (maxobj > 0) {
		if (maxobj < firstobj)
			return (0);

		if (maxobj < firstobj + numobjs)
			numobjs = maxobj - firstobj;
	}

	/*
	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
	 * push it out, since free block aggregation can only be done for
	 * blocks of the same type (i.e., DRR_FREE records can only be
	 * aggregated with other DRR_FREE records.  DRR_FREEOBJECTS records
	 * can only be aggregated with other DRR_FREEOBJECTS records.)
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREEOBJECTS) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}
	if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) {
		/*
		 * See whether this free object array can be aggregated
		 * with pending one
		 */
		if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
			drrfo->drr_numobjs += numobjs;
			return (0);
		} else {
			/* can't be aggregated.  Push out pending record */
			if (dump_record(dsp, NULL, 0) != 0)
				return (SET_ERROR(EINTR));
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}

	/* write a FREEOBJECTS record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREEOBJECTS;
	drrfo->drr_firstobj = firstobj;
	drrfo->drr_numobjs = numobjs;
	drrfo->drr_toguid = dsp->dsa_toguid;

	dsp->dsa_pending_op = PENDING_FREEOBJECTS;

	return (0);
}

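/*
 * Example of the maxobj clamp above (numbers illustrative): if the
 * meta-dnode has dn_maxblkid == 3 and DNODES_PER_BLOCK == 32, then
 * maxobj == 128, so a FREEOBJECTS request for objects 200-299 is dropped
 * entirely and one for objects 100-199 is trimmed to the 28 objects
 * 100-127 that can actually exist on the receiver.
 */
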
static int
dump_dnode(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object,
    dnode_phys_t *dnp)
{
	struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);
	int bonuslen;

	if (object < dsp->dsa_resume_object) {
		/*
		 * Note: when resuming, we will visit all the dnodes in
		 * the block of dnodes that we are resuming from. In
		 * this case it's unnecessary to send the dnodes prior to
		 * the one we are resuming from. We should be at most one
		 * block's worth of dnodes behind the resume point.
		 */
		ASSERT3U(dsp->dsa_resume_object - object, <,
		    1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT));
		return (0);
	}

	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
		return (dump_freeobjects(dsp, object, 1));

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write an OBJECT record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_OBJECT;
	drro->drr_object = object;
	drro->drr_type = dnp->dn_type;
	drro->drr_bonustype = dnp->dn_bonustype;
	drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	drro->drr_bonuslen = dnp->dn_bonuslen;
	drro->drr_dn_slots = dnp->dn_extra_slots + 1;
	drro->drr_checksumtype = dnp->dn_checksum;
	drro->drr_compress = dnp->dn_compress;
	drro->drr_toguid = dsp->dsa_toguid;

	if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
	    drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
		drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;

	bonuslen = P2ROUNDUP(dnp->dn_bonuslen, 8);

	if ((dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW)) {
		ASSERT(BP_IS_ENCRYPTED(bp));

		if (BP_SHOULD_BYTESWAP(bp))
			drro->drr_flags |= DRR_RAW_BYTESWAP;

		/* needed for reconstructing dnp on recv side */
		drro->drr_maxblkid = dnp->dn_maxblkid;
		drro->drr_indblkshift = dnp->dn_indblkshift;
		drro->drr_nlevels = dnp->dn_nlevels;
		drro->drr_nblkptr = dnp->dn_nblkptr;

		/*
		 * Since we encrypt the entire bonus area, the (raw) part
		 * beyond the bonuslen is actually nonzero, so we need
		 * to send it.
		 */
		if (bonuslen != 0) {
			drro->drr_raw_bonuslen = DN_MAX_BONUS_LEN(dnp);
			bonuslen = drro->drr_raw_bonuslen;
		}
	}

	if (dump_record(dsp, DN_BONUS(dnp), bonuslen) != 0)
		return (SET_ERROR(EINTR));

	/* Free anything past the end of the file. */
	if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0)
		return (SET_ERROR(EINTR));
	if (dsp->dsa_err != 0)
		return (SET_ERROR(EINTR));
	return (0);
}

static int
dump_object_range(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t firstobj,
    uint64_t numslots)
{
	struct drr_object_range *drror =
	    &(dsp->dsa_drr->drr_u.drr_object_range);

	/* we only use this record type for raw sends */
	ASSERT(BP_IS_PROTECTED(bp));
	ASSERT(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW);
	ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
	ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_DNODE);
	ASSERT0(BP_GET_LEVEL(bp));

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_OBJECT_RANGE;
	drror->drr_firstobj = firstobj;
	drror->drr_numslots = numslots;
	drror->drr_toguid = dsp->dsa_toguid;
	if (BP_SHOULD_BYTESWAP(bp))
		drror->drr_flags |= DRR_RAW_BYTESWAP;
	zio_crypt_decode_params_bp(bp, drror->drr_salt, drror->drr_iv);
	zio_crypt_decode_mac_bp(bp, drror->drr_mac);

	if (dump_record(dsp, NULL, 0) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}

static boolean_t
backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
{
	if (!BP_IS_EMBEDDED(bp))
		return (B_FALSE);

	/*
	 * Compression function must be legacy, or explicitly enabled.
	 */
	if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
	    !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LZ4)))
		return (B_FALSE);

	/*
	 * Embed type must be explicitly enabled.
	 */
	switch (BPE_GET_ETYPE(bp)) {
	case BP_EMBEDDED_TYPE_DATA:
		if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
			return (B_TRUE);
		break;
	default:
		return (B_FALSE);
	}
	return (B_FALSE);
}

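/*
 * Illustration of the checks above (an assumed example, not an exhaustive
 * table): an lz4-compressed embedded-data bp is sent as a DRR_WRITE_EMBEDDED
 * record only when the stream carries both DMU_BACKUP_FEATURE_EMBED_DATA and
 * DMU_BACKUP_FEATURE_LZ4; otherwise it falls through to the ordinary
 * DRR_WRITE path in do_dump().
 */
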
/*
 * This is the callback function to traverse_dataset that acts as the worker
 * thread for dmu_send_impl.
 */
/*ARGSUSED*/
static int
send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
{
	struct send_thread_arg *sta = arg;
	struct send_block_record *record;
	uint64_t record_size;
	int err = 0;

	ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
	    zb->zb_object >= sta->resume.zb_object);
	ASSERT3P(sta->ds, !=, NULL);

	if (sta->cancel)
		return (SET_ERROR(EINTR));

	if (bp == NULL) {
		ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL);
		return (0);
	} else if (zb->zb_level < 0) {
		return (0);
	}

	record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP);
	record->eos_marker = B_FALSE;
	record->bp = *bp;
	record->zb = *zb;
	record->indblkshift = dnp->dn_indblkshift;
	record->datablkszsec = dnp->dn_datablkszsec;
	record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	bqueue_enqueue(&sta->q, record, record_size);

	return (err);
}

/*
 * This function kicks off the traverse_dataset.  It also handles setting the
 * error code of the thread in case something goes wrong, and pushes the End of
 * Stream record when the traverse_dataset call has finished.  If there is no
 * dataset to traverse, the thread immediately pushes the End of Stream marker.
 */
static void
send_traverse_thread(void *arg)
{
	struct send_thread_arg *st_arg = arg;
	int err;
	struct send_block_record *data;
	fstrans_cookie_t cookie = spl_fstrans_mark();

	if (st_arg->ds != NULL) {
		err = traverse_dataset_resume(st_arg->ds,
		    st_arg->fromtxg, &st_arg->resume,
		    st_arg->flags, send_cb, st_arg);

		if (err != EINTR)
			st_arg->error_code = err;
	}
	data = kmem_zalloc(sizeof (*data), KM_SLEEP);
	data->eos_marker = B_TRUE;
	bqueue_enqueue(&st_arg->q, data, 1);
	spl_fstrans_unmark(cookie);
	thread_exit();
}

/*
 * This function actually handles figuring out what kind of record needs to be
 * dumped, reading the data (which has hopefully been prefetched), and calling
 * the appropriate helper function.
 */
static int
do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
{
	dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os);
	const blkptr_t *bp = &data->bp;
	const zbookmark_phys_t *zb = &data->zb;
	uint8_t indblkshift = data->indblkshift;
	uint16_t dblkszsec = data->datablkszsec;
	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
	int err = 0;

	ASSERT3U(zb->zb_level, >=, 0);

	ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
	    zb->zb_object >= dsa->dsa_resume_object);

	/*
	 * All bps of an encrypted os should have the encryption bit set.
	 * If this is not true it indicates tampering and we report an error.
	 */
	if (dsa->dsa_os->os_encrypted &&
	    !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) {
		spa_log_error(spa, zb);
		zfs_panic_recover("unencrypted block in encrypted "
		    "object set %llu", ds->ds_object);
		return (SET_ERROR(EIO));
	}

	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
	    DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
		return (0);
	} else if (BP_IS_HOLE(bp) &&
	    zb->zb_object == DMU_META_DNODE_OBJECT) {
		uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
		err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT);
	} else if (BP_IS_HOLE(bp)) {
		uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
		uint64_t offset = zb->zb_blkid * span;
		/* Don't dump free records for offsets > DMU_OBJECT_END */
		if (zb->zb_blkid == 0 || span <= DMU_OBJECT_END / zb->zb_blkid)
			err = dump_free(dsa, zb->zb_object, offset, span);
	} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
		return (0);
	} else if (type == DMU_OT_DNODE) {
		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
		arc_flags_t aflags = ARC_FLAG_WAIT;
		arc_buf_t *abuf;
		enum zio_flag zioflags = ZIO_FLAG_CANFAIL;

		if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
			ASSERT(BP_IS_ENCRYPTED(bp));
			ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
			zioflags |= ZIO_FLAG_RAW;
		}

		ASSERT0(zb->zb_level);

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0)
			return (SET_ERROR(EIO));

		dnode_phys_t *blk = abuf->b_data;
		uint64_t dnobj = zb->zb_blkid * epb;

		/*
		 * Raw sends require sending encryption parameters for the
		 * block of dnodes. Regular sends do not need to send this
		 * info.
		 */
		if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
			ASSERT(arc_is_encrypted(abuf));
			err = dump_object_range(dsa, bp, dnobj, epb);
		}

		if (err == 0) {
			for (int i = 0; i < epb;
			    i += blk[i].dn_extra_slots + 1) {
				err = dump_dnode(dsa, bp, dnobj + i, blk + i);
				if (err != 0)
					break;
			}
		}
		arc_buf_destroy(abuf, &abuf);
	} else if (type == DMU_OT_SA) {
		arc_flags_t aflags = ARC_FLAG_WAIT;
		arc_buf_t *abuf;
		enum zio_flag zioflags = ZIO_FLAG_CANFAIL;

		if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
			ASSERT(BP_IS_PROTECTED(bp));
			zioflags |= ZIO_FLAG_RAW;
		}

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0)
			return (SET_ERROR(EIO));

		err = dump_spill(dsa, bp, zb->zb_object, abuf->b_data);
		arc_buf_destroy(abuf, &abuf);
	} else if (backup_do_embed(dsa, bp)) {
		/* it's an embedded level-0 block of a regular object */
		int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
		ASSERT0(zb->zb_level);
		err = dump_write_embedded(dsa, zb->zb_object,
		    zb->zb_blkid * blksz, blksz, bp);
	} else {
		/* it's a level-0 block of a regular object */
		arc_flags_t aflags = ARC_FLAG_WAIT;
		arc_buf_t *abuf;
		int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
		uint64_t offset;

		/*
		 * If we have large blocks stored on disk but the send flags
		 * don't allow us to send large blocks, we split the data from
		 * the arc buf into chunks.
		 */
		boolean_t split_large_blocks = blksz > SPA_OLD_MAXBLOCKSIZE &&
		    !(dsa->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS);

		/*
		 * Raw sends require that we always get raw data as it exists
		 * on disk, so we assert that we are not splitting blocks here.
		 */
		boolean_t request_raw =
		    (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) != 0;

		/*
		 * We should only request compressed data from the ARC if all
		 * the following are true:
		 *  - stream compression was requested
		 *  - we aren't splitting large blocks into smaller chunks
		 *  - the data won't need to be byteswapped before sending
		 *  - this isn't an embedded block
		 *  - this isn't metadata (if receiving on a different endian
		 *    system it can be byteswapped more easily)
		 */
		boolean_t request_compressed =
		    (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED) &&
		    !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) &&
		    !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp));

		IMPLY(request_raw, !split_large_blocks);
		IMPLY(request_raw, BP_IS_PROTECTED(bp));
		ASSERT0(zb->zb_level);
		ASSERT(zb->zb_object > dsa->dsa_resume_object ||
		    (zb->zb_object == dsa->dsa_resume_object &&
		    zb->zb_blkid * blksz >= dsa->dsa_resume_offset));

		ASSERT3U(blksz, ==, BP_GET_LSIZE(bp));

		enum zio_flag zioflags = ZIO_FLAG_CANFAIL;
		if (request_raw)
			zioflags |= ZIO_FLAG_RAW;
		else if (request_compressed)
			zioflags |= ZIO_FLAG_RAW_COMPRESS;

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) {
			if (zfs_send_corrupt_data) {
				/* Send a block filled with 0x"zfs badd bloc" */
				abuf = arc_alloc_buf(spa, &abuf, ARC_BUFC_DATA,
				    blksz);
				uint64_t *ptr;
				for (ptr = abuf->b_data;
				    (char *)ptr < (char *)abuf->b_data + blksz;
				    ptr++)
					*ptr = 0x2f5baddb10cULL;
			} else {
				return (SET_ERROR(EIO));
			}
		}

		offset = zb->zb_blkid * blksz;

		if (split_large_blocks) {
			ASSERT0(arc_is_encrypted(abuf));
			ASSERT3U(arc_get_compression(abuf), ==,
			    ZIO_COMPRESS_OFF);
			char *buf = abuf->b_data;
			while (blksz > 0 && err == 0) {
				int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
				err = dump_write(dsa, type, zb->zb_object,
				    offset, n, n, NULL, buf);
				offset += n;
				buf += n;
				blksz -= n;
			}
		} else {
			err = dump_write(dsa, type, zb->zb_object, offset,
			    blksz, arc_buf_size(abuf), bp, abuf->b_data);
		}
		arc_buf_destroy(abuf, &abuf);
	}

	ASSERT(err == 0 || err == EINTR);
	return (err);
}

/*
 * Pop the new data off the queue, and free the old data.
 */
static struct send_block_record *
get_next_record(bqueue_t *bq, struct send_block_record *data)
{
	struct send_block_record *tmp = bqueue_dequeue(bq);
	kmem_free(data, sizeof (*data));
	return (tmp);
}
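
/*
 * Outline of the send pipeline these helpers implement (a summary only,
 * no additional behavior): send_traverse_thread() walks the dataset and
 * has send_cb() enqueue one send_block_record per blkptr, ending with an
 * eos_marker record; the dmu_send_impl() main loop dequeues records and
 * feeds each one to do_dump() until it sees the eos_marker.  The bqueue
 * provides backpressure: enqueue blocks once roughly zfs_send_queue_length
 * bytes of records are outstanding, so the prefetching traversal cannot
 * run arbitrarily far ahead of the writer.
 */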

/*
 * Actually do the bulk of the work in a zfs send.
 *
 * Note: Releases dp using the specified tag.
 */
static int
dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
    zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone,
    boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
    boolean_t rawok, int outfd, uint64_t resumeobj, uint64_t resumeoff,
    vnode_t *vp, offset_t *off)
{
	objset_t *os;
	dmu_replay_record_t *drr;
	dmu_sendarg_t *dsp;
	int err;
	uint64_t fromtxg = 0;
	uint64_t featureflags = 0;
	struct send_thread_arg to_arg;
	void *payload = NULL;
	size_t payload_len = 0;
	struct send_block_record *to_data;

	err = dmu_objset_from_ds(to_ds, &os);
	if (err != 0) {
		dsl_pool_rele(dp, tag);
		return (err);
	}

	/*
	 * If this is a non-raw send of an encrypted ds, we can ensure that
	 * the objset_phys_t is authenticated. This is safe because this is
	 * either a snapshot or we have owned the dataset, ensuring that
	 * it can't be modified.
	 */
	if (!rawok && os->os_encrypted &&
	    arc_is_unauthenticated(os->os_phys_buf)) {
		zbookmark_phys_t zb;

		SET_BOOKMARK(&zb, to_ds->ds_object, ZB_ROOT_OBJECT,
		    ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
		err = arc_untransform(os->os_phys_buf, os->os_spa,
		    &zb, B_FALSE);
		if (err != 0) {
			dsl_pool_rele(dp, tag);
			return (err);
		}

		ASSERT0(arc_is_unauthenticated(os->os_phys_buf));
	}

	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
	drr->drr_type = DRR_BEGIN;
	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
	DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
	    DMU_SUBSTREAM);

	bzero(&to_arg, sizeof (to_arg));

#ifdef _KERNEL
	if (dmu_objset_type(os) == DMU_OST_ZFS) {
		uint64_t version;
		if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) {
			kmem_free(drr, sizeof (dmu_replay_record_t));
			dsl_pool_rele(dp, tag);
			return (SET_ERROR(EINVAL));
		}
		if (version >= ZPL_VERSION_SA) {
			featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
		}
	}
#endif

	/* raw sends imply large_block_ok */
	if ((large_block_ok || rawok) &&
	    dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_BLOCKS))
		featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
	if (dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_DNODE))
		featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE;

	/* encrypted datasets will not have embedded blocks */
	if ((embedok || rawok) && !os->os_encrypted &&
	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
		featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
	}

	/* raw send implies compressok */
	if (compressok || rawok)
		featureflags |= DMU_BACKUP_FEATURE_COMPRESSED;
	if (rawok && os->os_encrypted)
		featureflags |= DMU_BACKUP_FEATURE_RAW;

	if ((featureflags &
	    (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED |
	    DMU_BACKUP_FEATURE_RAW)) != 0 &&
	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) {
		featureflags |= DMU_BACKUP_FEATURE_LZ4;
	}

	if (resumeobj != 0 || resumeoff != 0) {
		featureflags |= DMU_BACKUP_FEATURE_RESUMING;
	}

	DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
	    featureflags);

	drr->drr_u.drr_begin.drr_creation_time =
	    dsl_dataset_phys(to_ds)->ds_creation_time;
	drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
	if (is_clone)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
	drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
	if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
	if (zfs_send_set_freerecords_bit)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS;

	if (ancestor_zb != NULL) {
		drr->drr_u.drr_begin.drr_fromguid =
		    ancestor_zb->zbm_guid;
		fromtxg = ancestor_zb->zbm_creation_txg;
	}
	dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname);
	if (!to_ds->ds_is_snapshot) {
		(void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--",
		    sizeof (drr->drr_u.drr_begin.drr_toname));
	}

	dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);

	dsp->dsa_drr = drr;
	dsp->dsa_vp = vp;
	dsp->dsa_outfd = outfd;
	dsp->dsa_proc = curproc;
	dsp->dsa_os = os;
	dsp->dsa_off = off;
	dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
	dsp->dsa_pending_op = PENDING_NONE;
	dsp->dsa_featureflags = featureflags;
	dsp->dsa_resume_object = resumeobj;
	dsp->dsa_resume_offset = resumeoff;

	mutex_enter(&to_ds->ds_sendstream_lock);
	list_insert_head(&to_ds->ds_sendstreams, dsp);
	mutex_exit(&to_ds->ds_sendstream_lock);

	dsl_dataset_long_hold(to_ds, FTAG);
	dsl_pool_rele(dp, tag);

	/* handle features that require a DRR_BEGIN payload */
	if (featureflags &
	    (DMU_BACKUP_FEATURE_RESUMING | DMU_BACKUP_FEATURE_RAW)) {
		nvlist_t *keynvl = NULL;
		nvlist_t *nvl = fnvlist_alloc();

		if (featureflags & DMU_BACKUP_FEATURE_RESUMING) {
			dmu_object_info_t to_doi;
			err = dmu_object_info(os, resumeobj, &to_doi);
			if (err != 0) {
				fnvlist_free(nvl);
				goto out;
			}

			SET_BOOKMARK(&to_arg.resume, to_ds->ds_object,
			    resumeobj, 0,
			    resumeoff / to_doi.doi_data_block_size);

			fnvlist_add_uint64(nvl, "resume_object", resumeobj);
			fnvlist_add_uint64(nvl, "resume_offset", resumeoff);
		}

		if (featureflags & DMU_BACKUP_FEATURE_RAW) {
			uint64_t ivset_guid = (ancestor_zb != NULL) ?
			    ancestor_zb->zbm_ivset_guid : 0;

			ASSERT(os->os_encrypted);

			err = dsl_crypto_populate_key_nvlist(to_ds,
			    ivset_guid, &keynvl);
			if (err != 0) {
				fnvlist_free(nvl);
				goto out;
			}

			fnvlist_add_nvlist(nvl, "crypt_keydata", keynvl);
		}

		payload = fnvlist_pack(nvl, &payload_len);
		drr->drr_payloadlen = payload_len;
		fnvlist_free(keynvl);
		fnvlist_free(nvl);
	}

	err = dump_record(dsp, payload, payload_len);
	fnvlist_pack_free(payload, payload_len);
	if (err != 0) {
		err = dsp->dsa_err;
		goto out;
	}

	err = bqueue_init(&to_arg.q,
	    MAX(zfs_send_queue_length, 2 * zfs_max_recordsize),
	    offsetof(struct send_block_record, ln));
	to_arg.error_code = 0;
	to_arg.cancel = B_FALSE;
	to_arg.ds = to_ds;
	to_arg.fromtxg = fromtxg;
	to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH;
	if (rawok)
		to_arg.flags |= TRAVERSE_NO_DECRYPT;
	(void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, curproc,
	    TS_RUN, minclsyspri);

	to_data = bqueue_dequeue(&to_arg.q);

	while (!to_data->eos_marker && err == 0) {
		err = do_dump(dsp, to_data);
		to_data = get_next_record(&to_arg.q, to_data);
		if (issig(JUSTLOOKING) && issig(FORREAL))
			err = EINTR;
	}

	if (err != 0) {
		to_arg.cancel = B_TRUE;
		while (!to_data->eos_marker) {
			to_data = get_next_record(&to_arg.q, to_data);
		}
	}
	kmem_free(to_data, sizeof (*to_data));

	bqueue_destroy(&to_arg.q);

	if (err == 0 && to_arg.error_code != 0)
		err = to_arg.error_code;

	if (err != 0)
		goto out;

	if (dsp->dsa_pending_op != PENDING_NONE)
		if (dump_record(dsp, NULL, 0) != 0)
			err = SET_ERROR(EINTR);

	if (err != 0) {
		if (err == EINTR && dsp->dsa_err != 0)
			err = dsp->dsa_err;
		goto out;
	}

	bzero(drr, sizeof (dmu_replay_record_t));
	drr->drr_type = DRR_END;
	drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
	drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;

	if (dump_record(dsp, NULL, 0) != 0)
		err = dsp->dsa_err;
out:
	mutex_enter(&to_ds->ds_sendstream_lock);
	list_remove(&to_ds->ds_sendstreams, dsp);
	mutex_exit(&to_ds->ds_sendstream_lock);

	VERIFY(err != 0 || (dsp->dsa_sent_begin && dsp->dsa_sent_end));

	kmem_free(drr, sizeof (dmu_replay_record_t));
	kmem_free(dsp, sizeof (dmu_sendarg_t));

	dsl_dataset_long_rele(to_ds, FTAG);

	return (err);
}

int
dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
    boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
    boolean_t rawok, int outfd, vnode_t *vp, offset_t *off)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	dsl_dataset_t *fromds = NULL;
	ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT;
	int err;

	err = dsl_pool_hold(pool, FTAG, &dp);
	if (err != 0)
		return (err);

	err = dsl_dataset_hold_obj_flags(dp, tosnap, dsflags, FTAG, &ds);
	if (err != 0) {
		dsl_pool_rele(dp, FTAG);
		return (err);
	}

	if (fromsnap != 0) {
		zfs_bookmark_phys_t zb = { 0 };
		boolean_t is_clone;

		err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds);
		if (err != 0) {
			dsl_dataset_rele_flags(ds, dsflags, FTAG);
			dsl_pool_rele(dp, FTAG);
			return (err);
		}
		if (!dsl_dataset_is_before(ds, fromds, 0)) {
			err = SET_ERROR(EXDEV);
			dsl_dataset_rele(fromds, FTAG);
			dsl_dataset_rele_flags(ds, dsflags, FTAG);
			dsl_pool_rele(dp, FTAG);
			return (err);
		}

		zb.zbm_creation_time =
		    dsl_dataset_phys(fromds)->ds_creation_time;
		zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg;
		zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;

		if (dsl_dataset_is_zapified(fromds)) {
			(void) zap_lookup(dp->dp_meta_objset,
			    fromds->ds_object, DS_FIELD_IVSET_GUID, 8, 1,
			    &zb.zbm_ivset_guid);
		}

		is_clone = (fromds->ds_dir != ds->ds_dir);
		dsl_dataset_rele(fromds, FTAG);
		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
		    embedok, large_block_ok, compressok, rawok, outfd,
		    0, 0, vp, off);
	} else {
		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
		    embedok, large_block_ok, compressok, rawok, outfd,
		    0, 0, vp, off);
	}
	dsl_dataset_rele_flags(ds, dsflags, FTAG);
	return (err);
}

int
dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
    boolean_t large_block_ok, boolean_t compressok, boolean_t rawok,
    int outfd, uint64_t resumeobj, uint64_t resumeoff, vnode_t *vp,
    offset_t *off)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	int err;
	ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT;
	boolean_t owned = B_FALSE;

	if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL)
		return (SET_ERROR(EINVAL));

	err = dsl_pool_hold(tosnap, FTAG, &dp);
	if (err != 0)
		return (err);
	if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) {
		/*
		 * We are sending a filesystem or volume.  Ensure
		 * that it doesn't change by owning the dataset.
		 */
		err = dsl_dataset_own(dp, tosnap, dsflags, FTAG, &ds);
		owned = B_TRUE;
	} else {
		err = dsl_dataset_hold_flags(dp, tosnap, dsflags, FTAG, &ds);
	}
	if (err != 0) {
		dsl_pool_rele(dp, FTAG);
		return (err);
	}

	if (fromsnap != NULL) {
		zfs_bookmark_phys_t zb = { 0 };
		boolean_t is_clone = B_FALSE;
		int fsnamelen = strchr(tosnap, '@') - tosnap;

		/*
		 * If the fromsnap is in a different filesystem, then
		 * mark the send stream as a clone.
		 */
		if (strncmp(tosnap, fromsnap, fsnamelen) != 0 ||
		    (fromsnap[fsnamelen] != '@' &&
		    fromsnap[fsnamelen] != '#')) {
			is_clone = B_TRUE;
		}

		if (strchr(fromsnap, '@')) {
			dsl_dataset_t *fromds;
			err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds);
			if (err == 0) {
				if (!dsl_dataset_is_before(ds, fromds, 0))
					err = SET_ERROR(EXDEV);
				zb.zbm_creation_time =
				    dsl_dataset_phys(fromds)->ds_creation_time;
				zb.zbm_creation_txg =
				    dsl_dataset_phys(fromds)->ds_creation_txg;
				zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
				is_clone = (ds->ds_dir != fromds->ds_dir);

				if (dsl_dataset_is_zapified(fromds)) {
					(void) zap_lookup(dp->dp_meta_objset,
					    fromds->ds_object,
					    DS_FIELD_IVSET_GUID, 8, 1,
					    &zb.zbm_ivset_guid);
				}
				dsl_dataset_rele(fromds, FTAG);
			}
		} else {
			err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb);
		}
		if (err != 0) {
			if (owned)
				dsl_dataset_disown(ds, dsflags, FTAG);
			else
				dsl_dataset_rele_flags(ds, dsflags, FTAG);

			dsl_pool_rele(dp, FTAG);
			return (err);
		}
		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
		    embedok, large_block_ok, compressok, rawok,
		    outfd, resumeobj, resumeoff, vp, off);
	} else {
		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
		    embedok, large_block_ok, compressok, rawok,
		    outfd, resumeobj, resumeoff, vp, off);
	}
	if (owned)
		dsl_dataset_disown(ds, dsflags, FTAG);
	else
		dsl_dataset_rele_flags(ds, dsflags, FTAG);

	return (err);
}

static int
dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed,
    uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep)
{
	int err = 0;
	uint64_t size;
	/*
	 * Assume that space (both on-disk and in-stream) is dominated by
	 * data.  We will adjust for indirect blocks and the copies property,
	 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
	 */

	uint64_t recordsize;
	uint64_t record_count;
	objset_t *os;
	VERIFY0(dmu_objset_from_ds(ds, &os));

	/* Assume all (uncompressed) blocks are recordsize. */
	if (zfs_override_estimate_recordsize != 0) {
		recordsize = zfs_override_estimate_recordsize;
	} else if (os->os_phys->os_type == DMU_OST_ZVOL) {
		err = dsl_prop_get_int_ds(ds,
		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &recordsize);
	} else {
		err = dsl_prop_get_int_ds(ds,
		    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), &recordsize);
	}
	if (err != 0)
		return (err);
	record_count = uncompressed / recordsize;

	/*
	 * If we're estimating a send size for a compressed stream, use the
	 * compressed data size to estimate the stream size. Otherwise, use the
	 * uncompressed data size.
	 */
	size = stream_compressed ? compressed : uncompressed;

	/*
	 * Subtract out approximate space used by indirect blocks.
	 * Assume most space is used by data blocks (non-indirect, non-dnode).
	 * Assume no ditto blocks or internal fragmentation.
	 *
	 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
	 * block.
	 */
	size -= record_count * sizeof (blkptr_t);

	/* Add in the space for the record associated with each block. */
	size += record_count * sizeof (dmu_replay_record_t);

	*sizep = size;

	return (0);
}

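/*
 * Worked example of the adjustment above (illustrative numbers): for 1 GiB
 * of uncompressed data at recordsize=128K, record_count is 8192; the
 * estimate subtracts 8192 * sizeof (blkptr_t) (128 bytes each, so 1 MiB)
 * for indirect blocks and adds 8192 * sizeof (dmu_replay_record_t) for the
 * per-block stream headers.  dmu_send_estimate() then adds the BEGIN and
 * END records on top of this.
 */
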
int
dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds,
    boolean_t stream_compressed, uint64_t *sizep)
{
	int err;
	uint64_t uncomp, comp;

	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));

	/* tosnap must be a snapshot */
	if (!ds->ds_is_snapshot)
		return (SET_ERROR(EINVAL));

	/* fromsnap, if provided, must be a snapshot */
	if (fromds != NULL && !fromds->ds_is_snapshot)
		return (SET_ERROR(EINVAL));

	/*
	 * fromsnap must be an earlier snapshot from the same fs as tosnap,
	 * or the origin's fs.
	 */
	if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0))
		return (SET_ERROR(EXDEV));

	/* Get compressed and uncompressed size estimates of changed data. */
	if (fromds == NULL) {
		uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
		comp = dsl_dataset_phys(ds)->ds_compressed_bytes;
	} else {
		uint64_t used;
		err = dsl_dataset_space_written(fromds, ds,
		    &used, &comp, &uncomp);
		if (err != 0)
			return (err);
	}

	err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp,
	    stream_compressed, sizep);
	/*
	 * Add the size of the BEGIN and END records to the estimate.
	 */
	*sizep += 2 * sizeof (dmu_replay_record_t);
	return (err);
}

struct calculate_send_arg {
	uint64_t uncompressed;
	uint64_t compressed;
};

/*
 * Simple callback used to traverse the blocks of a snapshot and sum their
 * uncompressed and compressed sizes.
 */
/* ARGSUSED */
static int
dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	struct calculate_send_arg *space = arg;
	if (bp != NULL && !BP_IS_HOLE(bp)) {
		space->uncompressed += BP_GET_UCSIZE(bp);
		space->compressed += BP_GET_PSIZE(bp);
	}
	return (0);
}

/*
 * Given a destination snapshot and a TXG, calculate the approximate size of a
 * send stream sent from that TXG. from_txg may be zero, indicating that the
 * whole snapshot will be sent.
 */
int
dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg,
    boolean_t stream_compressed, uint64_t *sizep)
{
	int err;
	struct calculate_send_arg size = { 0 };

	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));

	/* tosnap must be a snapshot */
	if (!dsl_dataset_is_snapshot(ds))
		return (SET_ERROR(EINVAL));

	/* verify that from_txg is before the provided snapshot was taken */
	if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) {
		return (SET_ERROR(EXDEV));
	}
	/*
	 * traverse the blocks of the snapshot with birth times after
	 * from_txg, summing their uncompressed and compressed sizes
	 */
	err = traverse_dataset(ds, from_txg,
	    TRAVERSE_POST | TRAVERSE_NO_DECRYPT,
	    dmu_calculate_send_traversal, &size);

	if (err)
		return (err);

	err = dmu_adjust_send_estimate_for_indirects(ds, size.uncompressed,
	    size.compressed, stream_compressed, sizep);
	return (err);
}

#if defined(_KERNEL)
/* BEGIN CSTYLED */
module_param(zfs_override_estimate_recordsize, ulong, 0644);
MODULE_PARM_DESC(zfs_override_estimate_recordsize,
	"Record size calculation override for zfs send estimates");
/* END CSTYLED */

module_param(zfs_send_corrupt_data, int, 0644);
MODULE_PARM_DESC(zfs_send_corrupt_data, "Allow sending corrupt data");

module_param(zfs_send_queue_length, int, 0644);
MODULE_PARM_DESC(zfs_send_queue_length, "Maximum send queue length");
#endif