module/zfs/dmu_send.c (blame view at commit "Fix send/recv lost spill block")
34dc7c2f
BB
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
428870ff 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
8d35c149 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
e6d3a843 24 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
788eb90c 25 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
47dfff3b 26 * Copyright 2014 HybridCluster. All rights reserved.
b607405f 27 * Copyright 2016 RackTop Systems.
a0bd735a 28 * Copyright (c) 2016 Actifio, Inc. All rights reserved.
8d35c149 29 */
34dc7c2f 30
34dc7c2f
BB
31#include <sys/dmu.h>
32#include <sys/dmu_impl.h>
33#include <sys/dmu_tx.h>
34#include <sys/dbuf.h>
35#include <sys/dnode.h>
36#include <sys/zfs_context.h>
37#include <sys/dmu_objset.h>
38#include <sys/dmu_traverse.h>
39#include <sys/dsl_dataset.h>
40#include <sys/dsl_dir.h>
428870ff 41#include <sys/dsl_prop.h>
34dc7c2f
BB
42#include <sys/dsl_pool.h>
43#include <sys/dsl_synctask.h>
044baf00 44#include <sys/spa_impl.h>
34dc7c2f
BB
45#include <sys/zfs_ioctl.h>
46#include <sys/zap.h>
47#include <sys/zio_checksum.h>
428870ff
BB
48#include <sys/zfs_znode.h>
49#include <zfs_fletcher.h>
50#include <sys/avl.h>
51#include <sys/ddt.h>
572e2857 52#include <sys/zfs_onexit.h>
13fe0198
MA
53#include <sys/dmu_send.h>
54#include <sys/dsl_destroy.h>
9b67f605 55#include <sys/blkptr.h>
da536844 56#include <sys/dsl_bookmark.h>
9b67f605 57#include <sys/zfeature.h>
fcff0f35 58#include <sys/bqueue.h>
a0bd735a 59#include <sys/zvol.h>
f74b821a 60#include <sys/policy.h>
34dc7c2f 61
330d06f9
MA
62/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
63int zfs_send_corrupt_data = B_FALSE;
3b0d9928 64int zfs_send_queue_length = SPA_MAXBLOCKSIZE;
b607405f
AS
65/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */
66int zfs_send_set_freerecords_bit = B_TRUE;
caf9dd20
BB
67/* Set this tunable to FALSE to disable sending unmodified spill blocks. */
68int zfs_send_unmodified_spill_blocks = B_TRUE;
330d06f9 69
ca0845d5
PD
70/*
71 * Use this to override the recordsize calculation for fast zfs send estimates.
72 */
73unsigned long zfs_override_estimate_recordsize = 0;
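
/*
 * Most of the tunables above are exported as Linux module parameters at
 * the bottom of this file.  As an illustrative (untested) example, on a
 * typical Linux system they can be read and adjusted at runtime through
 * sysfs; the exact path depends on how the zfs module was built and
 * loaded:
 *
 *	cat /sys/module/zfs/parameters/zfs_send_queue_length
 *	echo 33554432 > /sys/module/zfs/parameters/zfs_send_queue_length
 */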
74
fcff0f35
PD
75#define BP_SPAN(datablkszsec, indblkshift, level) \
76 (((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \
77 (level) * (indblkshift - SPA_BLKPTRSHIFT)))
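
/*
 * Illustrative example of the macro above (values assumed, not taken from
 * this file): for 128K data blocks (datablkszsec == 256), 128K indirect
 * blocks (indblkshift == 17), SPA_MINBLOCKSHIFT == 9 and
 * SPA_BLKPTRSHIFT == 7, a level-1 indirect block spans
 * 256 << (9 + 1 * (17 - 7)) == 128M of the object, i.e. the 1024 block
 * pointers it holds times 128K of data each.
 */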
78
79struct send_thread_arg {
80 bqueue_t q;
81 dsl_dataset_t *ds; /* Dataset to traverse */
82 uint64_t fromtxg; /* Traverse from this txg */
83 int flags; /* flags to pass to traverse_dataset */
84 int error_code;
85 boolean_t cancel;
47dfff3b 86 zbookmark_phys_t resume;
fcff0f35
PD
87};
88
89struct send_block_record {
90 boolean_t eos_marker; /* Marks the end of the stream */
91 blkptr_t bp;
92 zbookmark_phys_t zb;
93 uint8_t indblkshift;
94 uint16_t datablkszsec;
95 bqueue_node_t ln;
96};
97
044baf00
BB
98typedef struct dump_bytes_io {
99 dmu_sendarg_t *dbi_dsp;
100 void *dbi_buf;
101 int dbi_len;
102} dump_bytes_io_t;
103
caf9dd20
BB
104static int do_dump(dmu_sendarg_t *dsa, struct send_block_record *data);
105
044baf00 106static void
b58986ee 107dump_bytes_cb(void *arg)
34dc7c2f 108{
044baf00
BB
109 dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg;
110 dmu_sendarg_t *dsp = dbi->dbi_dsp;
47dfff3b 111 dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os);
34dc7c2f 112 ssize_t resid; /* have to get resid to get detailed errno */
f8866f8a
ER
113
114 /*
b5256303 115 * The code does not rely on len being a multiple of 8. We keep
f8866f8a
ER
116 * this assertion because of the corresponding assertion in
117 * receive_read(). Keeping this assertion ensures that we do not
118 * inadvertently break backwards compatibility (causing the assertion
b5256303
TC
119 * in receive_read() to trigger on old software). Newer feature flags
120 * (such as raw send) may break this assertion since they were
121 * introduced after the requirement was made obsolete.
f8866f8a
ER
122 */
123
b5256303
TC
124 ASSERT(dbi->dbi_len % 8 == 0 ||
125 (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) != 0);
34dc7c2f 126
37abac6d 127 dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp,
044baf00 128 (caddr_t)dbi->dbi_buf, dbi->dbi_len,
34dc7c2f 129 0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);
37abac6d
BP
130
131 mutex_enter(&ds->ds_sendstream_lock);
044baf00 132 *dsp->dsa_off += dbi->dbi_len;
37abac6d 133 mutex_exit(&ds->ds_sendstream_lock);
044baf00
BB
134}
135
136static int
137dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
138{
139 dump_bytes_io_t dbi;
140
141 dbi.dbi_dsp = dsp;
142 dbi.dbi_buf = buf;
143 dbi.dbi_len = len;
144
b58986ee
BB
145#if defined(HAVE_LARGE_STACKS)
146 dump_bytes_cb(&dbi);
147#else
044baf00
BB
148 /*
149 * The vn_rdwr() call is performed in a taskq to ensure that there is
150 * always enough stack space to write safely to the target filesystem.
151 * The ZIO_TYPE_FREE threads are used because there can be a lot of
152 * them and they are used in vdev_file.c for a similar purpose.
153 */
154 spa_taskq_dispatch_sync(dmu_objset_spa(dsp->dsa_os), ZIO_TYPE_FREE,
b58986ee
BB
155 ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP);
156#endif /* HAVE_LARGE_STACKS */
37abac6d
BP
157
158 return (dsp->dsa_err);
34dc7c2f
BB
159}
160
37f8a883
MA
161/*
162 * For all record types except BEGIN, fill in the checksum (overlaid in
163 * drr_u.drr_checksum.drr_checksum). The checksum verifies everything
164 * up to the start of the checksum itself.
165 */
166static int
167dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
168{
169 ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
170 ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
a6255b7f 171 (void) fletcher_4_incremental_native(dsp->dsa_drr,
37f8a883
MA
172 offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
173 &dsp->dsa_zc);
51907a31
K
174 if (dsp->dsa_drr->drr_type == DRR_BEGIN) {
175 dsp->dsa_sent_begin = B_TRUE;
176 } else {
37f8a883
MA
177 ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u.
178 drr_checksum.drr_checksum));
179 dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc;
180 }
51907a31
K
181 if (dsp->dsa_drr->drr_type == DRR_END) {
182 dsp->dsa_sent_end = B_TRUE;
183 }
a6255b7f 184 (void) fletcher_4_incremental_native(&dsp->dsa_drr->
37f8a883
MA
185 drr_u.drr_checksum.drr_checksum,
186 sizeof (zio_cksum_t), &dsp->dsa_zc);
187 if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
188 return (SET_ERROR(EINTR));
189 if (payload_len != 0) {
a6255b7f 190 (void) fletcher_4_incremental_native(payload, payload_len,
37f8a883
MA
191 &dsp->dsa_zc);
192 if (dump_bytes(dsp, payload, payload_len) != 0)
193 return (SET_ERROR(EINTR));
194 }
195 return (0);
196}
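
/*
 * Rough sketch of the framing dump_record() emits for each record (an
 * illustrative aid based on the logic above):
 *
 *	+----------------------------+-------------------------------+
 *	| dmu_replay_record_t header | optional payload, payload_len |
 *	+----------------------------+-------------------------------+
 *
 * Both pieces are folded into the running fletcher-4 checksum (dsa_zc),
 * and for every record type except DRR_BEGIN the checksum accumulated up
 * to that point is stored in drr_u.drr_checksum.drr_checksum before the
 * header is written out.
 */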
197
e6d3a843
PD
198/*
199 * Fill in the drr_free struct, or perform aggregation if the previous record is
200 * also a free record, and the two are adjacent.
201 *
202 * Note that we send free records even for a full send, because we want to be
203 * able to receive a full send as a clone, which requires a list of all the free
204 * and freeobject records that were generated on the source.
205 */
34dc7c2f 206static int
37abac6d 207dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
34dc7c2f
BB
208 uint64_t length)
209{
37abac6d 210 struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);
428870ff 211
ea97f8ce
MA
212 /*
213 * When we receive a free record, dbuf_free_range() assumes
214 * that the receiving system doesn't have any dbufs in the range
215 * being freed. This is always true because there is a one-record
216 * constraint: we only send one WRITE record for any given
47dfff3b 217 * object,offset. We know that the one-record constraint is
ea97f8ce
MA
218 * true because we always send data in increasing order by
219 * object,offset.
220 *
221 * If the increasing-order constraint ever changes, we should find
222 * another way to assert that the one-record constraint is still
223 * satisfied.
224 */
225 ASSERT(object > dsp->dsa_last_data_object ||
226 (object == dsp->dsa_last_data_object &&
227 offset > dsp->dsa_last_data_offset));
228
428870ff
BB
229 /*
230 * If there is a pending op, but it's not PENDING_FREE, push it out,
231 * since free block aggregation can only be done for blocks of the
232 * same type (i.e., DRR_FREE records can only be aggregated with
233 * other DRR_FREE records. DRR_FREEOBJECTS records can only be
234 * aggregated with other DRR_FREEOBJECTS records).
235 */
37abac6d
BP
236 if (dsp->dsa_pending_op != PENDING_NONE &&
237 dsp->dsa_pending_op != PENDING_FREE) {
37f8a883 238 if (dump_record(dsp, NULL, 0) != 0)
2e528b49 239 return (SET_ERROR(EINTR));
37abac6d 240 dsp->dsa_pending_op = PENDING_NONE;
428870ff
BB
241 }
242
37abac6d 243 if (dsp->dsa_pending_op == PENDING_FREE) {
428870ff 244 /*
ee45fbd8 245 * There should never be a PENDING_FREE if length is
246 * DMU_OBJECT_END (because dump_dnode is the only place where
247 * this function is called with a DMU_OBJECT_END, and only after
248 * flushing any pending record).
428870ff 249 */
ee45fbd8 250 ASSERT(length != DMU_OBJECT_END);
428870ff
BB
251 /*
252 * Check to see whether this free block can be aggregated
253 * with the pending one.
254 */
255 if (drrf->drr_object == object && drrf->drr_offset +
256 drrf->drr_length == offset) {
ee45fbd8 257 if (offset + length < offset)
258 drrf->drr_length = DMU_OBJECT_END;
259 else
260 drrf->drr_length += length;
428870ff
BB
261 return (0);
262 } else {
263 /* not a continuation. Push out pending record */
37f8a883 264 if (dump_record(dsp, NULL, 0) != 0)
2e528b49 265 return (SET_ERROR(EINTR));
37abac6d 266 dsp->dsa_pending_op = PENDING_NONE;
428870ff
BB
267 }
268 }
269 /* create a FREE record and make it pending */
37abac6d
BP
270 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
271 dsp->dsa_drr->drr_type = DRR_FREE;
428870ff
BB
272 drrf->drr_object = object;
273 drrf->drr_offset = offset;
ee45fbd8 274 if (offset + length < offset)
275 drrf->drr_length = DMU_OBJECT_END;
276 else
277 drrf->drr_length = length;
37abac6d 278 drrf->drr_toguid = dsp->dsa_toguid;
ee45fbd8 279 if (length == DMU_OBJECT_END) {
37f8a883 280 if (dump_record(dsp, NULL, 0) != 0)
2e528b49 281 return (SET_ERROR(EINTR));
428870ff 282 } else {
37abac6d 283 dsp->dsa_pending_op = PENDING_FREE;
428870ff 284 }
34dc7c2f 285
34dc7c2f
BB
286 return (0);
287}
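
/*
 * Aggregation example for dump_free() above (illustrative values): a free
 * of object 5, offset 0, length 128K immediately followed by a free of
 * object 5, offset 128K, length 128K is collapsed into a single pending
 * DRR_FREE record covering offset 0, length 256K.  The pending record is
 * only pushed to the stream when a non-adjacent free, a record of a
 * different type, or the end of the send arrives.
 */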
288
289static int
b5256303
TC
290dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, uint64_t object,
291 uint64_t offset, int lsize, int psize, const blkptr_t *bp, void *data)
34dc7c2f 292{
2aa34383 293 uint64_t payload_size;
b5256303 294 boolean_t raw = (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW);
37abac6d 295 struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);
428870ff 296
ea97f8ce
MA
297 /*
298 * We send data in increasing object, offset order.
299 * See comment in dump_free() for details.
300 */
301 ASSERT(object > dsp->dsa_last_data_object ||
302 (object == dsp->dsa_last_data_object &&
303 offset > dsp->dsa_last_data_offset));
304 dsp->dsa_last_data_object = object;
2aa34383 305 dsp->dsa_last_data_offset = offset + lsize - 1;
428870ff
BB
306
307 /*
308 * If there is any kind of pending aggregation (currently either
309 * a grouping of free objects or free blocks), push it out to
310 * the stream, since aggregation can't be done across operations
311 * of different types.
312 */
37abac6d 313 if (dsp->dsa_pending_op != PENDING_NONE) {
37f8a883 314 if (dump_record(dsp, NULL, 0) != 0)
2e528b49 315 return (SET_ERROR(EINTR));
37abac6d 316 dsp->dsa_pending_op = PENDING_NONE;
428870ff 317 }
37f8a883 318 /* write a WRITE record */
37abac6d
BP
319 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
320 dsp->dsa_drr->drr_type = DRR_WRITE;
428870ff
BB
321 drrw->drr_object = object;
322 drrw->drr_type = type;
323 drrw->drr_offset = offset;
37abac6d 324 drrw->drr_toguid = dsp->dsa_toguid;
2aa34383
DK
325 drrw->drr_logical_size = lsize;
326
b5256303
TC
327 /* only set the compression fields if the buf is compressed or raw */
328 if (raw || lsize != psize) {
2aa34383 329 ASSERT(!BP_IS_EMBEDDED(bp));
2aa34383 330 ASSERT3S(psize, >, 0);
2aa34383 331
b5256303
TC
332 if (raw) {
333 ASSERT(BP_IS_PROTECTED(bp));
334
335 /*
9b840763
TC
336 * This is a raw protected block so we need to pass
337 * along everything the receiving side will need to
338 * interpret this block, including the byteswap, salt,
339 * IV, and MAC.
b5256303 340 */
b5256303
TC
341 if (BP_SHOULD_BYTESWAP(bp))
342 drrw->drr_flags |= DRR_RAW_BYTESWAP;
343 zio_crypt_decode_params_bp(bp, drrw->drr_salt,
344 drrw->drr_iv);
345 zio_crypt_decode_mac_bp(bp, drrw->drr_mac);
346 } else {
347 /* this is a compressed block */
348 ASSERT(dsp->dsa_featureflags &
349 DMU_BACKUP_FEATURE_COMPRESSED);
350 ASSERT(!BP_SHOULD_BYTESWAP(bp));
351 ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)));
352 ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF);
353 ASSERT3S(lsize, >=, psize);
354 }
355
356 /* set fields common to compressed and raw sends */
2aa34383
DK
357 drrw->drr_compressiontype = BP_GET_COMPRESS(bp);
358 drrw->drr_compressed_size = psize;
359 payload_size = drrw->drr_compressed_size;
360 } else {
361 payload_size = drrw->drr_logical_size;
362 }
363
b5256303 364 if (bp == NULL || BP_IS_EMBEDDED(bp) || (BP_IS_PROTECTED(bp) && !raw)) {
9b67f605 365 /*
b5256303
TC
366 * There's no pre-computed checksum for partial-block writes,
367 * embedded BP's, or encrypted BP's that are being sent as
368 * plaintext, so (like fletcher4-checksummed blocks) userland
369 * will have to compute a dedup-capable checksum itself.
9b67f605
MA
370 */
371 drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
372 } else {
373 drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
3c67d83a
TH
374 if (zio_checksum_table[drrw->drr_checksumtype].ci_flags &
375 ZCHECKSUM_FLAG_DEDUP)
b5256303 376 drrw->drr_flags |= DRR_CHECKSUM_DEDUP;
9b67f605
MA
377 DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
378 DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
379 DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
b5256303 380 DDK_SET_CRYPT(&drrw->drr_key, BP_IS_PROTECTED(bp));
9b67f605
MA
381 drrw->drr_key.ddk_cksum = bp->blk_cksum;
382 }
428870ff 383
2aa34383 384 if (dump_record(dsp, data, payload_size) != 0)
2e528b49 385 return (SET_ERROR(EINTR));
428870ff
BB
386 return (0);
387}
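
/*
 * Payload sizing in dump_write() above, summarized for reference: raw and
 * compressed sends ship the physical (psize) bytes together with the
 * compression type (plus salt, IV, and MAC for raw), while ordinary sends
 * ship the logical (lsize) bytes.  A block-pointer checksum is only
 * propagated into drr_key when the whole on-disk block is being sent,
 * i.e. not for split/partial writes, embedded bps, or encrypted bps that
 * are sent as plaintext.
 */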
388
9b67f605
MA
389static int
390dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
391 int blksz, const blkptr_t *bp)
392{
393 char buf[BPE_PAYLOAD_SIZE];
394 struct drr_write_embedded *drrw =
395 &(dsp->dsa_drr->drr_u.drr_write_embedded);
396
397 if (dsp->dsa_pending_op != PENDING_NONE) {
37f8a883 398 if (dump_record(dsp, NULL, 0) != 0)
ecb2b7dc 399 return (SET_ERROR(EINTR));
9b67f605
MA
400 dsp->dsa_pending_op = PENDING_NONE;
401 }
402
403 ASSERT(BP_IS_EMBEDDED(bp));
404
405 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
406 dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED;
407 drrw->drr_object = object;
408 drrw->drr_offset = offset;
409 drrw->drr_length = blksz;
410 drrw->drr_toguid = dsp->dsa_toguid;
411 drrw->drr_compression = BP_GET_COMPRESS(bp);
412 drrw->drr_etype = BPE_GET_ETYPE(bp);
413 drrw->drr_lsize = BPE_GET_LSIZE(bp);
414 drrw->drr_psize = BPE_GET_PSIZE(bp);
415
416 decode_embedded_bp_compressed(bp, buf);
417
37f8a883 418 if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
ecb2b7dc 419 return (SET_ERROR(EINTR));
9b67f605
MA
420 return (0);
421}
422
428870ff 423static int
b5256303 424dump_spill(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object, void *data)
428870ff 425{
37abac6d 426 struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);
b5256303 427 uint64_t blksz = BP_GET_LSIZE(bp);
b0ee5946 428 uint64_t payload_size = blksz;
428870ff 429
37abac6d 430 if (dsp->dsa_pending_op != PENDING_NONE) {
37f8a883 431 if (dump_record(dsp, NULL, 0) != 0)
2e528b49 432 return (SET_ERROR(EINTR));
37abac6d 433 dsp->dsa_pending_op = PENDING_NONE;
428870ff
BB
434 }
435
436 /* write a SPILL record */
37abac6d
BP
437 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
438 dsp->dsa_drr->drr_type = DRR_SPILL;
428870ff
BB
439 drrs->drr_object = object;
440 drrs->drr_length = blksz;
37abac6d 441 drrs->drr_toguid = dsp->dsa_toguid;
34dc7c2f 442
caf9dd20
BB
443 /* See comment in dump_dnode() for full details */
444 if (zfs_send_unmodified_spill_blocks &&
445 (bp->blk_birth <= dsp->dsa_fromtxg)) {
446 drrs->drr_flags |= DRR_SPILL_UNMODIFIED;
447 }
448
b5256303 449 /* handle raw send fields */
9b840763
TC
450 if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
451 ASSERT(BP_IS_PROTECTED(bp));
452
b5256303
TC
453 if (BP_SHOULD_BYTESWAP(bp))
454 drrs->drr_flags |= DRR_RAW_BYTESWAP;
455 drrs->drr_compressiontype = BP_GET_COMPRESS(bp);
456 drrs->drr_compressed_size = BP_GET_PSIZE(bp);
457 zio_crypt_decode_params_bp(bp, drrs->drr_salt, drrs->drr_iv);
458 zio_crypt_decode_mac_bp(bp, drrs->drr_mac);
b0ee5946 459 payload_size = drrs->drr_compressed_size;
b5256303
TC
460 }
461
b0ee5946 462 if (dump_record(dsp, data, payload_size) != 0)
2e528b49 463 return (SET_ERROR(EINTR));
34dc7c2f
BB
464 return (0);
465}
466
467static int
37abac6d 468dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
34dc7c2f 469{
37abac6d 470 struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);
829e95c4
FG
471 uint64_t maxobj = DNODES_PER_BLOCK *
472 (DMU_META_DNODE(dsp->dsa_os)->dn_maxblkid + 1);
473
474 /*
475 * ZoL < 0.7 does not handle large FREEOBJECTS records correctly,
476 * leading to zfs recv never completing. To avoid this issue, don't
477 * send FREEOBJECTS records for object IDs which cannot exist on the
478 * receiving side.
479 */
480 if (maxobj > 0) {
481 if (maxobj < firstobj)
482 return (0);
483
484 if (maxobj < firstobj + numobjs)
485 numobjs = maxobj - firstobj;
486 }
428870ff
BB
487
488 /*
489 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
490 * push it out, since free block aggregation can only be done for
491 * blocks of the same type (i.e., DRR_FREE records can only be
492 * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records
493 * can only be aggregated with other DRR_FREEOBJECTS records).
494 */
37abac6d
BP
495 if (dsp->dsa_pending_op != PENDING_NONE &&
496 dsp->dsa_pending_op != PENDING_FREEOBJECTS) {
37f8a883 497 if (dump_record(dsp, NULL, 0) != 0)
2e528b49 498 return (SET_ERROR(EINTR));
37abac6d 499 dsp->dsa_pending_op = PENDING_NONE;
428870ff 500 }
37abac6d 501 if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) {
428870ff
BB
502 /*
503 * See whether this free object array can be aggregated
504 * with the pending one.
505 */
506 if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
507 drrfo->drr_numobjs += numobjs;
508 return (0);
509 } else {
510 /* can't be aggregated. Push out pending record */
37f8a883 511 if (dump_record(dsp, NULL, 0) != 0)
2e528b49 512 return (SET_ERROR(EINTR));
37abac6d 513 dsp->dsa_pending_op = PENDING_NONE;
428870ff
BB
514 }
515 }
516
34dc7c2f 517 /* write a FREEOBJECTS record */
37abac6d
BP
518 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
519 dsp->dsa_drr->drr_type = DRR_FREEOBJECTS;
428870ff
BB
520 drrfo->drr_firstobj = firstobj;
521 drrfo->drr_numobjs = numobjs;
37abac6d 522 drrfo->drr_toguid = dsp->dsa_toguid;
428870ff 523
37abac6d 524 dsp->dsa_pending_op = PENDING_FREEOBJECTS;
34dc7c2f 525
34dc7c2f
BB
526 return (0);
527}
528
529static int
b5256303
TC
530dump_dnode(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object,
531 dnode_phys_t *dnp)
34dc7c2f 532{
37abac6d 533 struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);
4807c0ba 534 int bonuslen;
428870ff 535
47dfff3b
MA
536 if (object < dsp->dsa_resume_object) {
537 /*
538 * Note: when resuming, we will visit all the dnodes in
539 * the block of dnodes that we are resuming from. In
540 * this case it's unnecessary to send the dnodes prior to
541 * the one we are resuming from. We should be at most one
542 * block's worth of dnodes behind the resume point.
543 */
544 ASSERT3U(dsp->dsa_resume_object - object, <,
545 1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT));
546 return (0);
547 }
548
34dc7c2f 549 if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
37abac6d 550 return (dump_freeobjects(dsp, object, 1));
34dc7c2f 551
37abac6d 552 if (dsp->dsa_pending_op != PENDING_NONE) {
37f8a883 553 if (dump_record(dsp, NULL, 0) != 0)
2e528b49 554 return (SET_ERROR(EINTR));
37abac6d 555 dsp->dsa_pending_op = PENDING_NONE;
428870ff
BB
556 }
557
34dc7c2f 558 /* write an OBJECT record */
37abac6d
BP
559 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
560 dsp->dsa_drr->drr_type = DRR_OBJECT;
428870ff
BB
561 drro->drr_object = object;
562 drro->drr_type = dnp->dn_type;
563 drro->drr_bonustype = dnp->dn_bonustype;
564 drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
565 drro->drr_bonuslen = dnp->dn_bonuslen;
50c957f7 566 drro->drr_dn_slots = dnp->dn_extra_slots + 1;
428870ff
BB
567 drro->drr_checksumtype = dnp->dn_checksum;
568 drro->drr_compress = dnp->dn_compress;
37abac6d 569 drro->drr_toguid = dsp->dsa_toguid;
428870ff 570
f1512ee6
MA
571 if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
572 drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
573 drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;
574
4807c0ba
TC
575 bonuslen = P2ROUNDUP(dnp->dn_bonuslen, 8);
576
9b840763
TC
577 if ((dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW)) {
578 ASSERT(BP_IS_ENCRYPTED(bp));
579
b5256303
TC
580 if (BP_SHOULD_BYTESWAP(bp))
581 drro->drr_flags |= DRR_RAW_BYTESWAP;
582
583 /* needed for reconstructing dnp on recv side */
ae76f45c 584 drro->drr_maxblkid = dnp->dn_maxblkid;
b5256303
TC
585 drro->drr_indblkshift = dnp->dn_indblkshift;
586 drro->drr_nlevels = dnp->dn_nlevels;
587 drro->drr_nblkptr = dnp->dn_nblkptr;
588
589 /*
590 * Since we encrypt the entire bonus area, the (raw) part
4807c0ba 591 * beyond the bonuslen is actually nonzero, so we need
b5256303
TC
592 * to send it.
593 */
594 if (bonuslen != 0) {
595 drro->drr_raw_bonuslen = DN_MAX_BONUS_LEN(dnp);
596 bonuslen = drro->drr_raw_bonuslen;
597 }
37f8a883 598 }
34dc7c2f 599
caf9dd20
BB
600 /*
601 * DRR_OBJECT_SPILL is set for every dnode which references a
602 * spill block. This allows the receiving pool to definitively
603 * determine when a spill block should be kept or freed.
604 */
605 if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
606 drro->drr_flags |= DRR_OBJECT_SPILL;
607
b5256303
TC
608 if (dump_record(dsp, DN_BONUS(dnp), bonuslen) != 0)
609 return (SET_ERROR(EINTR));
610
ea97f8ce 611 /* Free anything past the end of the file. */
37abac6d 612 if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
ee45fbd8 613 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0)
2e528b49 614 return (SET_ERROR(EINTR));
caf9dd20
BB
615
616 /*
617 * Send DRR_SPILL records for unmodified spill blocks. This is useful
618 * because changing certain attributes of the object (e.g. blocksize)
619 * can cause old versions of ZFS to incorrectly remove a spill block.
620 * Including these records in the stream forces an up-to-date version
621 * to always be written, ensuring they're never lost. Current versions
622 * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can
623 * ignore these unmodified spill blocks.
624 */
625 if (zfs_send_unmodified_spill_blocks &&
626 (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
627 (DN_SPILL_BLKPTR(dnp)->blk_birth <= dsp->dsa_fromtxg)) {
628 struct send_block_record record;
629
630 bzero(&record, sizeof (struct send_block_record));
631 record.eos_marker = B_FALSE;
632 record.bp = *DN_SPILL_BLKPTR(dnp);
633 SET_BOOKMARK(&(record.zb), dmu_objset_id(dsp->dsa_os),
634 object, 0, DMU_SPILL_BLKID);
635
636 if (do_dump(dsp, &record) != 0)
637 return (SET_ERROR(EINTR));
638 }
639
13fe0198 640 if (dsp->dsa_err != 0)
2e528b49 641 return (SET_ERROR(EINTR));
caf9dd20 642
34dc7c2f
BB
643 return (0);
644}
645
b5256303
TC
646static int
647dump_object_range(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t firstobj,
648 uint64_t numslots)
649{
650 struct drr_object_range *drror =
651 &(dsp->dsa_drr->drr_u.drr_object_range);
652
653 /* we only use this record type for raw sends */
654 ASSERT(BP_IS_PROTECTED(bp));
655 ASSERT(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW);
656 ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
657 ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_DNODE);
658 ASSERT0(BP_GET_LEVEL(bp));
659
660 if (dsp->dsa_pending_op != PENDING_NONE) {
661 if (dump_record(dsp, NULL, 0) != 0)
662 return (SET_ERROR(EINTR));
663 dsp->dsa_pending_op = PENDING_NONE;
664 }
665
666 bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
667 dsp->dsa_drr->drr_type = DRR_OBJECT_RANGE;
668 drror->drr_firstobj = firstobj;
669 drror->drr_numslots = numslots;
670 drror->drr_toguid = dsp->dsa_toguid;
b5256303
TC
671 if (BP_SHOULD_BYTESWAP(bp))
672 drror->drr_flags |= DRR_RAW_BYTESWAP;
673 zio_crypt_decode_params_bp(bp, drror->drr_salt, drror->drr_iv);
674 zio_crypt_decode_mac_bp(bp, drror->drr_mac);
675
676 if (dump_record(dsp, NULL, 0) != 0)
677 return (SET_ERROR(EINTR));
678 return (0);
679}
680
9b67f605
MA
681static boolean_t
682backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
683{
684 if (!BP_IS_EMBEDDED(bp))
685 return (B_FALSE);
686
687 /*
688 * Compression function must be legacy, or explicitly enabled.
689 */
690 if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
2aa34383 691 !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LZ4)))
9b67f605
MA
692 return (B_FALSE);
693
694 /*
695 * Embed type must be explicitly enabled.
696 */
697 switch (BPE_GET_ETYPE(bp)) {
698 case BP_EMBEDDED_TYPE_DATA:
699 if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
700 return (B_TRUE);
701 break;
702 default:
703 return (B_FALSE);
704 }
705 return (B_FALSE);
706}
707
fcff0f35
PD
708/*
709 * This is the callback function to traverse_dataset that acts as the worker
710 * thread for dmu_send_impl.
711 */
712/*ARGSUSED*/
713static int
714send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
715 const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
716{
717 struct send_thread_arg *sta = arg;
718 struct send_block_record *record;
719 uint64_t record_size;
720 int err = 0;
721
47dfff3b
MA
722 ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
723 zb->zb_object >= sta->resume.zb_object);
b5256303 724 ASSERT3P(sta->ds, !=, NULL);
47dfff3b 725
fcff0f35
PD
726 if (sta->cancel)
727 return (SET_ERROR(EINTR));
34dc7c2f 728
fcff0f35
PD
729 if (bp == NULL) {
730 ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL);
731 return (0);
732 } else if (zb->zb_level < 0) {
733 return (0);
734 }
735
736 record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP);
737 record->eos_marker = B_FALSE;
738 record->bp = *bp;
739 record->zb = *zb;
740 record->indblkshift = dnp->dn_indblkshift;
741 record->datablkszsec = dnp->dn_datablkszsec;
742 record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
743 bqueue_enqueue(&sta->q, record, record_size);
744
745 return (err);
746}
747
748/*
749 * This function kicks off the traverse_dataset. It also handles setting the
750 * error code of the thread in case something goes wrong, and pushes the End of
751 * Stream record when the traverse_dataset call has finished. If there is no
752 * dataset to traverse, the thread immediately pushes the End of Stream marker.
753 */
754static void
755send_traverse_thread(void *arg)
756{
757 struct send_thread_arg *st_arg = arg;
758 int err;
759 struct send_block_record *data;
3e635ac1 760 fstrans_cookie_t cookie = spl_fstrans_mark();
fcff0f35
PD
761
762 if (st_arg->ds != NULL) {
47dfff3b
MA
763 err = traverse_dataset_resume(st_arg->ds,
764 st_arg->fromtxg, &st_arg->resume,
765 st_arg->flags, send_cb, st_arg);
766
fcff0f35
PD
767 if (err != EINTR)
768 st_arg->error_code = err;
769 }
770 data = kmem_zalloc(sizeof (*data), KM_SLEEP);
771 data->eos_marker = B_TRUE;
772 bqueue_enqueue(&st_arg->q, data, 1);
3e635ac1 773 spl_fstrans_unmark(cookie);
34a6b428 774 thread_exit();
fcff0f35
PD
775}
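
/*
 * Together with do_dump() below, this forms a small producer/consumer
 * pipeline: the traversal thread created in dmu_send_impl() enqueues
 * send_block_record entries (terminated by an eos_marker) onto the
 * bqueue, and the sending thread dequeues them one at a time and turns
 * each into the appropriate stream record.
 */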
776
777/*
778 * This function actually handles figuring out what kind of record needs to be
779 * dumped, reading the data (which has hopefully been prefetched), and calling
780 * the appropriate helper function.
781 */
34dc7c2f 782static int
fcff0f35 783do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
34dc7c2f 784{
fcff0f35
PD
785 dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os);
786 const blkptr_t *bp = &data->bp;
787 const zbookmark_phys_t *zb = &data->zb;
788 uint8_t indblkshift = data->indblkshift;
789 uint16_t dblkszsec = data->datablkszsec;
790 spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
34dc7c2f 791 dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
34dc7c2f
BB
792 int err = 0;
793
fcff0f35 794 ASSERT3U(zb->zb_level, >=, 0);
34dc7c2f 795
47dfff3b
MA
796 ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
797 zb->zb_object >= dsa->dsa_resume_object);
798
b5256303
TC
799 /*
800 * All bps of an encrypted os should have the encryption bit set.
801 * If this is not true it indicates tampering and we report an error.
802 */
803 if (dsa->dsa_os->os_encrypted &&
804 !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) {
805 spa_log_error(spa, zb);
806 zfs_panic_recover("unencrypted block in encrypted "
807 "object set %llu", ds->ds_object);
808 return (SET_ERROR(EIO));
809 }
810
428870ff
BB
811 if (zb->zb_object != DMU_META_DNODE_OBJECT &&
812 DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
9babb374 813 return (0);
b0bc7a84
MG
814 } else if (BP_IS_HOLE(bp) &&
815 zb->zb_object == DMU_META_DNODE_OBJECT) {
fcff0f35 816 uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
b128c09f 817 uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
fcff0f35 818 err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT);
b0bc7a84 819 } else if (BP_IS_HOLE(bp)) {
fcff0f35
PD
820 uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
821 uint64_t offset = zb->zb_blkid * span;
ee45fbd8 822 /* Don't dump free records for offsets > DMU_OBJECT_END */
823 if (zb->zb_blkid == 0 || span <= DMU_OBJECT_END / zb->zb_blkid)
824 err = dump_free(dsa, zb->zb_object, offset, span);
b128c09f
BB
825 } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
826 return (0);
827 } else if (type == DMU_OT_DNODE) {
50c957f7 828 int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
2a432414 829 arc_flags_t aflags = ARC_FLAG_WAIT;
b128c09f 830 arc_buf_t *abuf;
b5256303 831 enum zio_flag zioflags = ZIO_FLAG_CANFAIL;
fcff0f35 832
b5256303
TC
833 if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
834 ASSERT(BP_IS_ENCRYPTED(bp));
835 ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
836 zioflags |= ZIO_FLAG_RAW;
837 }
838
fcff0f35 839 ASSERT0(zb->zb_level);
b128c09f 840
294f6806 841 if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
b5256303 842 ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0)
2e528b49 843 return (SET_ERROR(EIO));
34dc7c2f 844
1c27024e
DB
845 dnode_phys_t *blk = abuf->b_data;
846 uint64_t dnobj = zb->zb_blkid * epb;
b5256303
TC
847
848 /*
849 * Raw sends require sending encryption parameters for the
850 * block of dnodes. Regular sends do not need to send this
851 * info.
852 */
853 if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
854 ASSERT(arc_is_encrypted(abuf));
855 err = dump_object_range(dsa, bp, dnobj, epb);
856 }
857
858 if (err == 0) {
1c27024e
DB
859 for (int i = 0; i < epb;
860 i += blk[i].dn_extra_slots + 1) {
b5256303
TC
861 err = dump_dnode(dsa, bp, dnobj + i, blk + i);
862 if (err != 0)
863 break;
864 }
34dc7c2f 865 }
d3c2ae1c 866 arc_buf_destroy(abuf, &abuf);
428870ff 867 } else if (type == DMU_OT_SA) {
2a432414 868 arc_flags_t aflags = ARC_FLAG_WAIT;
b128c09f 869 arc_buf_t *abuf;
b5256303
TC
870 enum zio_flag zioflags = ZIO_FLAG_CANFAIL;
871
872 if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
873 ASSERT(BP_IS_PROTECTED(bp));
874 zioflags |= ZIO_FLAG_RAW;
875 }
b128c09f 876
294f6806 877 if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
b5256303 878 ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0)
2e528b49 879 return (SET_ERROR(EIO));
b128c09f 880
b5256303 881 err = dump_spill(dsa, bp, zb->zb_object, abuf->b_data);
d3c2ae1c 882 arc_buf_destroy(abuf, &abuf);
fcff0f35 883 } else if (backup_do_embed(dsa, bp)) {
9b67f605 884 /* it's an embedded level-0 block of a regular object */
fcff0f35
PD
885 int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
886 ASSERT0(zb->zb_level);
887 err = dump_write_embedded(dsa, zb->zb_object,
9b67f605 888 zb->zb_blkid * blksz, blksz, bp);
fcff0f35
PD
889 } else {
890 /* it's a level-0 block of a regular object */
2a432414 891 arc_flags_t aflags = ARC_FLAG_WAIT;
428870ff 892 arc_buf_t *abuf;
fcff0f35
PD
893 int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
894 uint64_t offset;
2aa34383
DK
895
896 /*
897 * If we have large blocks stored on disk but the send flags
898 * don't allow us to send large blocks, we split the data from
899 * the arc buf into chunks.
900 */
a7004725 901 boolean_t split_large_blocks = blksz > SPA_OLD_MAXBLOCKSIZE &&
2aa34383 902 !(dsa->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS);
b5256303
TC
903
904 /*
905 * Raw sends require that we always get raw data as it exists
906 * on disk, so we assert that we are not splitting blocks here.
907 */
908 boolean_t request_raw =
909 (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) != 0;
910
2aa34383
DK
911 /*
912 * We should only request compressed data from the ARC if all
913 * the following are true:
914 * - stream compression was requested
915 * - we aren't splitting large blocks into smaller chunks
916 * - the data won't need to be byteswapped before sending
917 * - this isn't an embedded block
918 * - this isn't metadata (if receiving on a different endian
919 * system it can be byteswapped more easily)
920 */
921 boolean_t request_compressed =
922 (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED) &&
923 !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) &&
924 !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp));
428870ff 925
b5256303
TC
926 IMPLY(request_raw, !split_large_blocks);
927 IMPLY(request_raw, BP_IS_PROTECTED(bp));
da536844 928 ASSERT0(zb->zb_level);
47dfff3b
MA
929 ASSERT(zb->zb_object > dsa->dsa_resume_object ||
930 (zb->zb_object == dsa->dsa_resume_object &&
931 zb->zb_blkid * blksz >= dsa->dsa_resume_offset));
932
a7004725
DK
933 ASSERT3U(blksz, ==, BP_GET_LSIZE(bp));
934
935 enum zio_flag zioflags = ZIO_FLAG_CANFAIL;
b5256303 936 if (request_raw)
2aa34383 937 zioflags |= ZIO_FLAG_RAW;
b5256303
TC
938 else if (request_compressed)
939 zioflags |= ZIO_FLAG_RAW_COMPRESS;
2aa34383 940
294f6806 941 if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
a7004725 942 ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) {
330d06f9 943 if (zfs_send_corrupt_data) {
330d06f9 944 /* Send a block filled with 0x"zfs badd bloc" */
2aa34383
DK
945 abuf = arc_alloc_buf(spa, &abuf, ARC_BUFC_DATA,
946 blksz);
a7004725 947 uint64_t *ptr;
330d06f9
MA
948 for (ptr = abuf->b_data;
949 (char *)ptr < (char *)abuf->b_data + blksz;
950 ptr++)
dd26aa53 951 *ptr = 0x2f5baddb10cULL;
330d06f9 952 } else {
2e528b49 953 return (SET_ERROR(EIO));
330d06f9
MA
954 }
955 }
428870ff 956
f1512ee6
MA
957 offset = zb->zb_blkid * blksz;
958
2aa34383 959 if (split_large_blocks) {
b5256303 960 ASSERT0(arc_is_encrypted(abuf));
2aa34383
DK
961 ASSERT3U(arc_get_compression(abuf), ==,
962 ZIO_COMPRESS_OFF);
a7004725 963 char *buf = abuf->b_data;
f1512ee6
MA
964 while (blksz > 0 && err == 0) {
965 int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
fcff0f35 966 err = dump_write(dsa, type, zb->zb_object,
2aa34383 967 offset, n, n, NULL, buf);
f1512ee6
MA
968 offset += n;
969 buf += n;
970 blksz -= n;
971 }
972 } else {
2aa34383 973 err = dump_write(dsa, type, zb->zb_object, offset,
b5256303 974 blksz, arc_buf_size(abuf), bp, abuf->b_data);
f1512ee6 975 }
d3c2ae1c 976 arc_buf_destroy(abuf, &abuf);
34dc7c2f
BB
977 }
978
979 ASSERT(err == 0 || err == EINTR);
980 return (err);
981}
982
6f1ffb06 983/*
fcff0f35
PD
984 * Pop the new data off the queue, and free the old data.
985 */
986static struct send_block_record *
987get_next_record(bqueue_t *bq, struct send_block_record *data)
988{
989 struct send_block_record *tmp = bqueue_dequeue(bq);
990 kmem_free(data, sizeof (*data));
991 return (tmp);
992}
993
994/*
995 * Actually do the bulk of the work in a zfs send.
996 *
997 * Note: Releases dp using the specified tag.
6f1ffb06 998 */
13fe0198 999static int
fcff0f35 1000dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
2aa34383
DK
1001 zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone,
1002 boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
b5256303 1003 boolean_t rawok, int outfd, uint64_t resumeobj, uint64_t resumeoff,
47dfff3b 1004 vnode_t *vp, offset_t *off)
34dc7c2f 1005{
13fe0198 1006 objset_t *os;
34dc7c2f 1007 dmu_replay_record_t *drr;
37abac6d 1008 dmu_sendarg_t *dsp;
34dc7c2f
BB
1009 int err;
1010 uint64_t fromtxg = 0;
9b67f605 1011 uint64_t featureflags = 0;
fcff0f35 1012 struct send_thread_arg to_arg;
47dfff3b
MA
1013 void *payload = NULL;
1014 size_t payload_len = 0;
fcff0f35 1015 struct send_block_record *to_data;
34dc7c2f 1016
fcff0f35 1017 err = dmu_objset_from_ds(to_ds, &os);
13fe0198 1018 if (err != 0) {
13fe0198
MA
1019 dsl_pool_rele(dp, tag);
1020 return (err);
1021 }
34dc7c2f 1022
b5256303
TC
1023 /*
1024 * If this is a non-raw send of an encrypted ds, we can ensure that
1025 * the objset_phys_t is authenticated. This is safe because this is
1026 * either a snapshot or we have owned the dataset, ensuring that
1027 * it can't be modified.
1028 */
1029 if (!rawok && os->os_encrypted &&
1030 arc_is_unauthenticated(os->os_phys_buf)) {
a2c2ed1b
TC
1031 zbookmark_phys_t zb;
1032
1033 SET_BOOKMARK(&zb, to_ds->ds_object, ZB_ROOT_OBJECT,
1034 ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
b5256303 1035 err = arc_untransform(os->os_phys_buf, os->os_spa,
a2c2ed1b 1036 &zb, B_FALSE);
b5256303
TC
1037 if (err != 0) {
1038 dsl_pool_rele(dp, tag);
1039 return (err);
1040 }
1041
1042 ASSERT0(arc_is_unauthenticated(os->os_phys_buf));
1043 }
1044
34dc7c2f
BB
1045 drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
1046 drr->drr_type = DRR_BEGIN;
1047 drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
428870ff
BB
1048 DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
1049 DMU_SUBSTREAM);
1050
47dfff3b
MA
1051 bzero(&to_arg, sizeof (to_arg));
1052
428870ff 1053#ifdef _KERNEL
13fe0198 1054 if (dmu_objset_type(os) == DMU_OST_ZFS) {
428870ff 1055 uint64_t version;
13fe0198 1056 if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) {
37abac6d 1057 kmem_free(drr, sizeof (dmu_replay_record_t));
13fe0198 1058 dsl_pool_rele(dp, tag);
2e528b49 1059 return (SET_ERROR(EINVAL));
37abac6d 1060 }
13fe0198 1061 if (version >= ZPL_VERSION_SA) {
9b67f605 1062 featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
428870ff
BB
1063 }
1064 }
1065#endif
1066
b5256303
TC
1067 /* raw sends imply large_block_ok */
1068 if ((large_block_ok || rawok) &&
d52d80b7 1069 dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_BLOCKS))
f1512ee6 1070 featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
d52d80b7 1071 if (dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_DNODE))
50c957f7 1072 featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE;
b5256303
TC
1073
1074 /* encrypted datasets will not have embedded blocks */
1075 if ((embedok || rawok) && !os->os_encrypted &&
9b67f605
MA
1076 spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
1077 featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
2aa34383 1078 }
b5256303
TC
1079
1080 /* raw send implies compressok */
1081 if (compressok || rawok)
2aa34383 1082 featureflags |= DMU_BACKUP_FEATURE_COMPRESSED;
caf9dd20 1083
b5256303
TC
1084 if (rawok && os->os_encrypted)
1085 featureflags |= DMU_BACKUP_FEATURE_RAW;
1086
2aa34383 1087 if ((featureflags &
b5256303
TC
1088 (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED |
1089 DMU_BACKUP_FEATURE_RAW)) != 0 &&
1090 spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) {
2aa34383 1091 featureflags |= DMU_BACKUP_FEATURE_LZ4;
9b67f605
MA
1092 }
1093
47dfff3b
MA
1094 if (resumeobj != 0 || resumeoff != 0) {
1095 featureflags |= DMU_BACKUP_FEATURE_RESUMING;
1096 }
1097
9b67f605
MA
1098 DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
1099 featureflags);
1100
34dc7c2f 1101 drr->drr_u.drr_begin.drr_creation_time =
fcff0f35 1102 dsl_dataset_phys(to_ds)->ds_creation_time;
13fe0198 1103 drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
da536844 1104 if (is_clone)
34dc7c2f 1105 drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
fcff0f35
PD
1106 drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
1107 if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET)
34dc7c2f 1108 drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
b607405f
AS
1109 if (zfs_send_set_freerecords_bit)
1110 drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS;
34dc7c2f 1111
caf9dd20
BB
1112 drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_SPILL_BLOCK;
1113
fcff0f35
PD
1114 if (ancestor_zb != NULL) {
1115 drr->drr_u.drr_begin.drr_fromguid =
1116 ancestor_zb->zbm_guid;
1117 fromtxg = ancestor_zb->zbm_creation_txg;
da536844 1118 }
fcff0f35
PD
1119 dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname);
1120 if (!to_ds->ds_is_snapshot) {
da536844
MA
1121 (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--",
1122 sizeof (drr->drr_u.drr_begin.drr_toname));
13fe0198 1123 }
34dc7c2f 1124
37abac6d
BP
1125 dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);
1126
1127 dsp->dsa_drr = drr;
1128 dsp->dsa_vp = vp;
1129 dsp->dsa_outfd = outfd;
1130 dsp->dsa_proc = curproc;
13fe0198 1131 dsp->dsa_os = os;
37abac6d 1132 dsp->dsa_off = off;
fcff0f35 1133 dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
caf9dd20 1134 dsp->dsa_fromtxg = fromtxg;
37abac6d 1135 dsp->dsa_pending_op = PENDING_NONE;
9b67f605 1136 dsp->dsa_featureflags = featureflags;
47dfff3b
MA
1137 dsp->dsa_resume_object = resumeobj;
1138 dsp->dsa_resume_offset = resumeoff;
37abac6d 1139
fcff0f35
PD
1140 mutex_enter(&to_ds->ds_sendstream_lock);
1141 list_insert_head(&to_ds->ds_sendstreams, dsp);
1142 mutex_exit(&to_ds->ds_sendstream_lock);
37abac6d 1143
fcff0f35 1144 dsl_dataset_long_hold(to_ds, FTAG);
7ec09286
MA
1145 dsl_pool_rele(dp, tag);
1146
b5256303
TC
1147 /* handle features that require a DRR_BEGIN payload */
1148 if (featureflags &
1149 (DMU_BACKUP_FEATURE_RESUMING | DMU_BACKUP_FEATURE_RAW)) {
1150 nvlist_t *keynvl = NULL;
1151 nvlist_t *nvl = fnvlist_alloc();
1152
1153 if (featureflags & DMU_BACKUP_FEATURE_RESUMING) {
1154 dmu_object_info_t to_doi;
1155 err = dmu_object_info(os, resumeobj, &to_doi);
1156 if (err != 0) {
1157 fnvlist_free(nvl);
1158 goto out;
1159 }
1160
1161 SET_BOOKMARK(&to_arg.resume, to_ds->ds_object,
1162 resumeobj, 0,
1163 resumeoff / to_doi.doi_data_block_size);
1164
1165 fnvlist_add_uint64(nvl, "resume_object", resumeobj);
1166 fnvlist_add_uint64(nvl, "resume_offset", resumeoff);
1167 }
1168
1169 if (featureflags & DMU_BACKUP_FEATURE_RAW) {
f00ab3f2
TC
1170 uint64_t ivset_guid = (ancestor_zb != NULL) ?
1171 ancestor_zb->zbm_ivset_guid : 0;
1172
b5256303
TC
1173 ASSERT(os->os_encrypted);
1174
f00ab3f2
TC
1175 err = dsl_crypto_populate_key_nvlist(to_ds,
1176 ivset_guid, &keynvl);
b5256303
TC
1177 if (err != 0) {
1178 fnvlist_free(nvl);
1179 goto out;
1180 }
1181
1182 fnvlist_add_nvlist(nvl, "crypt_keydata", keynvl);
1183 }
47dfff3b 1184
47dfff3b
MA
1185 payload = fnvlist_pack(nvl, &payload_len);
1186 drr->drr_payloadlen = payload_len;
b5256303 1187 fnvlist_free(keynvl);
47dfff3b
MA
1188 fnvlist_free(nvl);
1189 }
1190
1191 err = dump_record(dsp, payload, payload_len);
1192 fnvlist_pack_free(payload, payload_len);
1193 if (err != 0) {
37abac6d
BP
1194 err = dsp->dsa_err;
1195 goto out;
34dc7c2f
BB
1196 }
1197
3b0d9928
BB
1198 err = bqueue_init(&to_arg.q,
1199 MAX(zfs_send_queue_length, 2 * zfs_max_recordsize),
fcff0f35
PD
1200 offsetof(struct send_block_record, ln));
1201 to_arg.error_code = 0;
1202 to_arg.cancel = B_FALSE;
1203 to_arg.ds = to_ds;
1204 to_arg.fromtxg = fromtxg;
1205 to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH;
b5256303
TC
1206 if (rawok)
1207 to_arg.flags |= TRAVERSE_NO_DECRYPT;
fcff0f35
PD
1208 (void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, curproc,
1209 TS_RUN, minclsyspri);
1210
1211 to_data = bqueue_dequeue(&to_arg.q);
1212
1213 while (!to_data->eos_marker && err == 0) {
1214 err = do_dump(dsp, to_data);
1215 to_data = get_next_record(&to_arg.q, to_data);
1216 if (issig(JUSTLOOKING) && issig(FORREAL))
1217 err = EINTR;
1218 }
1219
1220 if (err != 0) {
1221 to_arg.cancel = B_TRUE;
1222 while (!to_data->eos_marker) {
1223 to_data = get_next_record(&to_arg.q, to_data);
1224 }
1225 }
1226 kmem_free(to_data, sizeof (*to_data));
1227
1228 bqueue_destroy(&to_arg.q);
1229
1230 if (err == 0 && to_arg.error_code != 0)
1231 err = to_arg.error_code;
1232
1233 if (err != 0)
1234 goto out;
34dc7c2f 1235
37abac6d 1236 if (dsp->dsa_pending_op != PENDING_NONE)
37f8a883 1237 if (dump_record(dsp, NULL, 0) != 0)
2e528b49 1238 err = SET_ERROR(EINTR);
428870ff 1239
13fe0198
MA
1240 if (err != 0) {
1241 if (err == EINTR && dsp->dsa_err != 0)
37abac6d
BP
1242 err = dsp->dsa_err;
1243 goto out;
34dc7c2f
BB
1244 }
1245
1246 bzero(drr, sizeof (dmu_replay_record_t));
1247 drr->drr_type = DRR_END;
37abac6d
BP
1248 drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
1249 drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;
34dc7c2f 1250
fcff0f35 1251 if (dump_record(dsp, NULL, 0) != 0)
37abac6d 1252 err = dsp->dsa_err;
37abac6d 1253out:
fcff0f35
PD
1254 mutex_enter(&to_ds->ds_sendstream_lock);
1255 list_remove(&to_ds->ds_sendstreams, dsp);
1256 mutex_exit(&to_ds->ds_sendstream_lock);
37abac6d 1257
51907a31
K
1258 VERIFY(err != 0 || (dsp->dsa_sent_begin && dsp->dsa_sent_end));
1259
34dc7c2f 1260 kmem_free(drr, sizeof (dmu_replay_record_t));
37abac6d 1261 kmem_free(dsp, sizeof (dmu_sendarg_t));
34dc7c2f 1262
fcff0f35 1263 dsl_dataset_long_rele(to_ds, FTAG);
13fe0198 1264
37abac6d 1265 return (err);
34dc7c2f
BB
1266}
1267
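/*
 * The two entry points below both funnel into dmu_send_impl():
 * dmu_send_obj() identifies the snapshots by object number, while
 * dmu_send() resolves them by name and also accepts a bookmark ("#")
 * as the incremental source.
 */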
330d06f9 1268int
13fe0198 1269dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
2aa34383 1270 boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
b5256303 1271 boolean_t rawok, int outfd, vnode_t *vp, offset_t *off)
13fe0198
MA
1272{
1273 dsl_pool_t *dp;
1274 dsl_dataset_t *ds;
1275 dsl_dataset_t *fromds = NULL;
b5256303 1276 ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT;
13fe0198
MA
1277 int err;
1278
1279 err = dsl_pool_hold(pool, FTAG, &dp);
1280 if (err != 0)
1281 return (err);
1282
b5256303 1283 err = dsl_dataset_hold_obj_flags(dp, tosnap, dsflags, FTAG, &ds);
13fe0198
MA
1284 if (err != 0) {
1285 dsl_pool_rele(dp, FTAG);
1286 return (err);
1287 }
1288
1289 if (fromsnap != 0) {
f00ab3f2 1290 zfs_bookmark_phys_t zb = { 0 };
da536844
MA
1291 boolean_t is_clone;
1292
13fe0198
MA
1293 err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds);
1294 if (err != 0) {
b5256303 1295 dsl_dataset_rele_flags(ds, dsflags, FTAG);
13fe0198
MA
1296 dsl_pool_rele(dp, FTAG);
1297 return (err);
1298 }
f00ab3f2 1299 if (!dsl_dataset_is_before(ds, fromds, 0)) {
da536844 1300 err = SET_ERROR(EXDEV);
f00ab3f2
TC
1301 dsl_dataset_rele(fromds, FTAG);
1302 dsl_dataset_rele_flags(ds, dsflags, FTAG);
1303 dsl_pool_rele(dp, FTAG);
1304 return (err);
1305 }
1306
d683ddbb
JG
1307 zb.zbm_creation_time =
1308 dsl_dataset_phys(fromds)->ds_creation_time;
1309 zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg;
1310 zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
f00ab3f2
TC
1311
1312 if (dsl_dataset_is_zapified(fromds)) {
1313 (void) zap_lookup(dp->dp_meta_objset,
1314 fromds->ds_object, DS_FIELD_IVSET_GUID, 8, 1,
1315 &zb.zbm_ivset_guid);
1316 }
1317
da536844
MA
1318 is_clone = (fromds->ds_dir != ds->ds_dir);
1319 dsl_dataset_rele(fromds, FTAG);
f1512ee6 1320 err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
b5256303
TC
1321 embedok, large_block_ok, compressok, rawok, outfd,
1322 0, 0, vp, off);
da536844 1323 } else {
f1512ee6 1324 err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
b5256303
TC
1325 embedok, large_block_ok, compressok, rawok, outfd,
1326 0, 0, vp, off);
13fe0198 1327 }
b5256303 1328 dsl_dataset_rele_flags(ds, dsflags, FTAG);
da536844 1329 return (err);
13fe0198
MA
1330}
1331
1332int
47dfff3b 1333dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
b5256303
TC
1334 boolean_t large_block_ok, boolean_t compressok, boolean_t rawok,
1335 int outfd, uint64_t resumeobj, uint64_t resumeoff, vnode_t *vp,
1336 offset_t *off)
13fe0198
MA
1337{
1338 dsl_pool_t *dp;
1339 dsl_dataset_t *ds;
13fe0198 1340 int err;
b5256303 1341 ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT;
da536844 1342 boolean_t owned = B_FALSE;
13fe0198 1343
da536844 1344 if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL)
2e528b49 1345 return (SET_ERROR(EINVAL));
13fe0198
MA
1346
1347 err = dsl_pool_hold(tosnap, FTAG, &dp);
1348 if (err != 0)
1349 return (err);
da536844
MA
1350 if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) {
1351 /*
1352 * We are sending a filesystem or volume. Ensure
1353 * that it doesn't change by owning the dataset.
1354 */
b5256303 1355 err = dsl_dataset_own(dp, tosnap, dsflags, FTAG, &ds);
da536844
MA
1356 owned = B_TRUE;
1357 } else {
b5256303 1358 err = dsl_dataset_hold_flags(dp, tosnap, dsflags, FTAG, &ds);
da536844 1359 }
13fe0198
MA
1360 if (err != 0) {
1361 dsl_pool_rele(dp, FTAG);
1362 return (err);
1363 }
1364
1365 if (fromsnap != NULL) {
f00ab3f2 1366 zfs_bookmark_phys_t zb = { 0 };
da536844
MA
1367 boolean_t is_clone = B_FALSE;
1368 int fsnamelen = strchr(tosnap, '@') - tosnap;
1369
1370 /*
1371 * If the fromsnap is in a different filesystem, then
1372 * mark the send stream as a clone.
1373 */
1374 if (strncmp(tosnap, fromsnap, fsnamelen) != 0 ||
1375 (fromsnap[fsnamelen] != '@' &&
1376 fromsnap[fsnamelen] != '#')) {
1377 is_clone = B_TRUE;
1378 }
1379
1380 if (strchr(fromsnap, '@')) {
1381 dsl_dataset_t *fromds;
1382 err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds);
1383 if (err == 0) {
1384 if (!dsl_dataset_is_before(ds, fromds, 0))
1385 err = SET_ERROR(EXDEV);
1386 zb.zbm_creation_time =
d683ddbb 1387 dsl_dataset_phys(fromds)->ds_creation_time;
da536844 1388 zb.zbm_creation_txg =
d683ddbb
JG
1389 dsl_dataset_phys(fromds)->ds_creation_txg;
1390 zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
da536844 1391 is_clone = (ds->ds_dir != fromds->ds_dir);
f00ab3f2
TC
1392
1393 if (dsl_dataset_is_zapified(fromds)) {
1394 (void) zap_lookup(dp->dp_meta_objset,
1395 fromds->ds_object,
1396 DS_FIELD_IVSET_GUID, 8, 1,
1397 &zb.zbm_ivset_guid);
1398 }
da536844
MA
1399 dsl_dataset_rele(fromds, FTAG);
1400 }
1401 } else {
1402 err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb);
1403 }
13fe0198 1404 if (err != 0) {
b5256303
TC
1405 if (owned)
1406 dsl_dataset_disown(ds, dsflags, FTAG);
1407 else
1408 dsl_dataset_rele_flags(ds, dsflags, FTAG);
1409
13fe0198
MA
1410 dsl_pool_rele(dp, FTAG);
1411 return (err);
1412 }
f1512ee6 1413 err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
b5256303 1414 embedok, large_block_ok, compressok, rawok,
47dfff3b 1415 outfd, resumeobj, resumeoff, vp, off);
da536844 1416 } else {
f1512ee6 1417 err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
b5256303 1418 embedok, large_block_ok, compressok, rawok,
47dfff3b 1419 outfd, resumeobj, resumeoff, vp, off);
13fe0198 1420 }
da536844 1421 if (owned)
b5256303 1422 dsl_dataset_disown(ds, dsflags, FTAG);
da536844 1423 else
b5256303
TC
1424 dsl_dataset_rele_flags(ds, dsflags, FTAG);
1425
da536844 1426 return (err);
13fe0198
MA
1427}
1428
5dc8b736 1429static int
2aa34383
DK
1430dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed,
1431 uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep)
5dc8b736 1432{
ca0845d5 1433 int err = 0;
2aa34383 1434 uint64_t size;
5dc8b736
MG
1435 /*
1436 * Assume that space (both on-disk and in-stream) is dominated by
1437 * data. We will adjust for indirect blocks and the copies property,
1438 * but ignore per-object space used (e.g., dnodes and DRR_OBJECT records).
1439 */
1440
2aa34383
DK
1441 uint64_t recordsize;
1442 uint64_t record_count;
dd429b46
PD
1443 objset_t *os;
1444 VERIFY0(dmu_objset_from_ds(ds, &os));
2aa34383
DK
1445
1446 /* Assume all (uncompressed) blocks are recordsize. */
ca0845d5
PD
1447 if (zfs_override_estimate_recordsize != 0) {
1448 recordsize = zfs_override_estimate_recordsize;
1449 } else if (os->os_phys->os_type == DMU_OST_ZVOL) {
dd429b46
PD
1450 err = dsl_prop_get_int_ds(ds,
1451 zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &recordsize);
1452 } else {
1453 err = dsl_prop_get_int_ds(ds,
1454 zfs_prop_to_name(ZFS_PROP_RECORDSIZE), &recordsize);
1455 }
2aa34383
DK
1456 if (err != 0)
1457 return (err);
1458 record_count = uncompressed / recordsize;
1459
1460 /*
1461 * If we're estimating a send size for a compressed stream, use the
1462 * compressed data size to estimate the stream size. Otherwise, use the
1463 * uncompressed data size.
1464 */
1465 size = stream_compressed ? compressed : uncompressed;
1466
5dc8b736
MG
1467 /*
1468 * Subtract out approximate space used by indirect blocks.
1469 * Assume most space is used by data blocks (non-indirect, non-dnode).
2aa34383 1470 * Assume no ditto blocks or internal fragmentation.
5dc8b736
MG
1471 *
1472 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
2aa34383 1473 * block.
5dc8b736 1474 */
2aa34383 1475 size -= record_count * sizeof (blkptr_t);
5dc8b736
MG
1476
1477 /* Add in the space for the record associated with each block. */
2aa34383 1478 size += record_count * sizeof (dmu_replay_record_t);
5dc8b736
MG
1479
1480 *sizep = size;
1481
1482 return (0);
1483}
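
/*
 * Worked example for the adjustment above (illustrative numbers): with
 * 1G of changed data at a 128K recordsize, record_count is 8192, so the
 * estimate subtracts 8192 * sizeof (blkptr_t) (128 bytes each) for
 * indirect blocks and adds 8192 * sizeof (dmu_replay_record_t) for the
 * per-block stream headers, a net adjustment of only a few megabytes on
 * top of the data size.
 */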
1484
13fe0198 1485int
2aa34383
DK
1486dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds,
1487 boolean_t stream_compressed, uint64_t *sizep)
330d06f9 1488{
330d06f9 1489 int err;
2aa34383 1490 uint64_t uncomp, comp;
13fe0198 1491
fd0fd646 1492 ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
330d06f9
MA
1493
1494 /* tosnap must be a snapshot */
0c66c32d 1495 if (!ds->ds_is_snapshot)
2e528b49 1496 return (SET_ERROR(EINVAL));
330d06f9 1497
71e2fe41
AG
1498 /* fromsnap, if provided, must be a snapshot */
1499 if (fromds != NULL && !fromds->ds_is_snapshot)
1500 return (SET_ERROR(EINVAL));
1501
6f1ffb06
MA
1502 /*
1503 * fromsnap must be an earlier snapshot from the same fs as tosnap,
1504 * or the origin's fs.
1505 */
da536844 1506 if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0))
2e528b49 1507 return (SET_ERROR(EXDEV));
330d06f9 1508
2aa34383 1509 /* Get compressed and uncompressed size estimates of changed data. */
330d06f9 1510 if (fromds == NULL) {
2aa34383
DK
1511 uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
1512 comp = dsl_dataset_phys(ds)->ds_compressed_bytes;
330d06f9 1513 } else {
2aa34383 1514 uint64_t used;
330d06f9 1515 err = dsl_dataset_space_written(fromds, ds,
2aa34383 1516 &used, &comp, &uncomp);
13fe0198 1517 if (err != 0)
330d06f9
MA
1518 return (err);
1519 }
1520
2aa34383
DK
1521 err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp,
1522 stream_compressed, sizep);
dd429b46
PD
1523 /*
1524 * Add the size of the BEGIN and END records to the estimate.
1525 */
1526 *sizep += 2 * sizeof (dmu_replay_record_t);
5dc8b736
MG
1527 return (err);
1528}
330d06f9 1529
2aa34383
DK
1530struct calculate_send_arg {
1531 uint64_t uncompressed;
1532 uint64_t compressed;
1533};
1534
5dc8b736
MG
1535/*
1536 * Simple callback used to traverse the blocks of a snapshot and sum their
2aa34383 1537 * uncompressed and compressed sizes.
5dc8b736
MG
1538 */
1539/* ARGSUSED */
1540static int
1541dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
1542 const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
1543{
2aa34383 1544 struct calculate_send_arg *space = arg;
5dc8b736 1545 if (bp != NULL && !BP_IS_HOLE(bp)) {
2aa34383
DK
1546 space->uncompressed += BP_GET_UCSIZE(bp);
1547 space->compressed += BP_GET_PSIZE(bp);
5dc8b736
MG
1548 }
1549 return (0);
1550}
1551
1552/*
1553 * Given a destination snapshot and a TXG, calculate the approximate size of a
1554 * send stream sent from that TXG. from_txg may be zero, indicating that the
1555 * whole snapshot will be sent.
1556 */
1557int
1558dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg,
2aa34383 1559 boolean_t stream_compressed, uint64_t *sizep)
5dc8b736 1560{
5dc8b736 1561 int err;
2aa34383 1562 struct calculate_send_arg size = { 0 };
5dc8b736 1563
fd0fd646 1564 ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
5dc8b736
MG
1565
1566 /* tosnap must be a snapshot */
1567 if (!dsl_dataset_is_snapshot(ds))
1568 return (SET_ERROR(EINVAL));
1569
1570 /* verify that from_txg is before the provided snapshot was taken */
1571 if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) {
1572 return (SET_ERROR(EXDEV));
1573 }
330d06f9 1574 /*
5dc8b736
MG
1575 * traverse the blocks of the snapshot with birth times after
1576 * from_txg, summing their uncompressed and compressed sizes
330d06f9 1577 */
b5256303
TC
1578 err = traverse_dataset(ds, from_txg,
1579 TRAVERSE_POST | TRAVERSE_NO_DECRYPT,
5dc8b736 1580 dmu_calculate_send_traversal, &size);
2aa34383 1581
5dc8b736 1582 if (err)
330d06f9 1583 return (err);
330d06f9 1584
2aa34383
DK
1585 err = dmu_adjust_send_estimate_for_indirects(ds, size.uncompressed,
1586 size.compressed, stream_compressed, sizep);
5dc8b736 1587 return (err);
330d06f9
MA
1588}
1589
47dfff3b 1590
03916905
PD
1591#if defined(_KERNEL)
1592/* BEGIN CSTYLED */
1593module_param(zfs_override_estimate_recordsize, ulong, 0644);
1594MODULE_PARM_DESC(zfs_override_estimate_recordsize,
1595 "Record size calculation override for zfs send estimates");
1596/* END CSTYLED */
37f8a883 1597
03916905
PD
1598module_param(zfs_send_corrupt_data, int, 0644);
1599MODULE_PARM_DESC(zfs_send_corrupt_data, "Allow sending corrupt data");
3b0d9928
BB
1600
1601module_param(zfs_send_queue_length, int, 0644);
1602MODULE_PARM_DESC(zfs_send_queue_length, "Maximum send queue length");
caf9dd20
BB
1603
1604module_param(zfs_send_unmodified_spill_blocks, int, 0644);
1605MODULE_PARM_DESC(zfs_send_unmodified_spill_blocks,
1606 "Send unmodified spill blocks");
fd8febbd 1607#endif