/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2016 Actifio, Inc. All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/spa_impl.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_znode.h>
#include <zfs_fletcher.h>
#include <sys/avl.h>
#include <sys/ddt.h>
#include <sys/zfs_onexit.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
#include <sys/blkptr.h>
#include <sys/dsl_bookmark.h>
#include <sys/zfeature.h>
#include <sys/bqueue.h>
#include <sys/zvol.h>
#include <sys/policy.h>

/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
int zfs_send_corrupt_data = B_FALSE;
int zfs_send_queue_length = 16 * 1024 * 1024;
int zfs_recv_queue_length = 16 * 1024 * 1024;

static char *dmu_recv_tag = "dmu_recv_tag";
static const char *recv_clone_name = "%recv";

#define	BP_SPAN(datablkszsec, indblkshift, level) \
	(((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \
	(level) * (indblkshift - SPA_BLKPTRSHIFT)))

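/*
 * Worked example (illustrative; assumes the common configuration of
 * 128K data blocks and 128K indirect blocks): with datablkszsec = 256
 * (256 sectors of 512 bytes = 128K) and indblkshift = 17, a level-1
 * block pointer spans
 *
 *	256 << (9 + 1 * (17 - 7)) = 256 << 19 = 128M
 *
 * of object data, i.e. 1024 data blocks, matching the 1024 128-byte
 * blkptr_t entries that fit in one 128K indirect block.
 */
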
struct send_thread_arg {
	bqueue_t q;
	dsl_dataset_t *ds;	/* Dataset to traverse */
	uint64_t fromtxg;	/* Traverse from this txg */
	int flags;		/* flags to pass to traverse_dataset */
	int error_code;
	boolean_t cancel;
};

struct send_block_record {
	boolean_t eos_marker;	/* Marks the end of the stream */
	blkptr_t bp;
	zbookmark_phys_t zb;
	uint8_t indblkshift;
	uint16_t datablkszsec;
	bqueue_node_t ln;
};

typedef struct dump_bytes_io {
	dmu_sendarg_t	*dbi_dsp;
	void		*dbi_buf;
	int		dbi_len;
} dump_bytes_io_t;

static void
dump_bytes_cb(void *arg)
{
	dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg;
	dmu_sendarg_t *dsp = dbi->dbi_dsp;
	dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset;
	ssize_t resid; /* have to get resid to get detailed errno */
	ASSERT0(dbi->dbi_len % 8);

	dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp,
	    (caddr_t)dbi->dbi_buf, dbi->dbi_len,
	    0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);

	mutex_enter(&ds->ds_sendstream_lock);
	*dsp->dsa_off += dbi->dbi_len;
	mutex_exit(&ds->ds_sendstream_lock);
}

static int
dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
{
	dump_bytes_io_t dbi;

	dbi.dbi_dsp = dsp;
	dbi.dbi_buf = buf;
	dbi.dbi_len = len;

#if defined(HAVE_LARGE_STACKS)
	dump_bytes_cb(&dbi);
#else
	/*
	 * The vn_rdwr() call is performed in a taskq to ensure that there is
	 * always enough stack space to write safely to the target filesystem.
	 * The ZIO_TYPE_FREE threads are used because there can be a lot of
	 * them and they are used in vdev_file.c for a similar purpose.
	 */
	spa_taskq_dispatch_sync(dmu_objset_spa(dsp->dsa_os), ZIO_TYPE_FREE,
	    ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP);
#endif /* HAVE_LARGE_STACKS */

	return (dsp->dsa_err);
}

/*
 * For all record types except BEGIN, fill in the checksum (overlaid in
 * drr_u.drr_checksum.drr_checksum).  The checksum verifies everything
 * up to the start of the checksum itself.
 */
static int
dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
{
	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
	fletcher_4_incremental_native(dsp->dsa_drr,
	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
	    &dsp->dsa_zc);
	if (dsp->dsa_drr->drr_type != DRR_BEGIN) {
		ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u.
		    drr_checksum.drr_checksum));
		dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc;
	}
	fletcher_4_incremental_native(&dsp->dsa_drr->
	    drr_u.drr_checksum.drr_checksum,
	    sizeof (zio_cksum_t), &dsp->dsa_zc);
	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (SET_ERROR(EINTR));
	if (payload_len != 0) {
		fletcher_4_incremental_native(payload, payload_len,
		    &dsp->dsa_zc);
		if (dump_bytes(dsp, payload, payload_len) != 0)
			return (SET_ERROR(EINTR));
	}
	return (0);
}

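/*
 * Record layout sketch (illustrative; sizes follow the zfs_ioctl.h
 * definitions, where zio_cksum_t is four uint64_t words):
 *
 *	|<-- covered by this record's checksum -->|
 *	+-----------------------------------------+--------------+
 *	| header and drr_u body                   | drr_checksum |
 *	+-----------------------------------------+--------------+
 *	0	 sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)
 *
 * The running fletcher-4 state is folded in up to the checksum field,
 * stored there (for everything except DRR_BEGIN), and then the stored
 * value itself is folded back in, so each record's checksum verifies
 * the entire stream up to the start of that checksum.
 */
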
static int
dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
    uint64_t length)
{
	struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);

	/*
	 * When we receive a free record, dbuf_free_range() assumes
	 * that the receiving system doesn't have any dbufs in the range
	 * being freed.  This is always true because there is a one-record
	 * constraint: we only send one WRITE record for any given
	 * object+offset.  We know that the one-record constraint is
	 * true because we always send data in increasing order by
	 * object,offset.
	 *
	 * If the increasing-order constraint ever changes, we should find
	 * another way to assert that the one-record constraint is still
	 * satisfied.
	 */
	ASSERT(object > dsp->dsa_last_data_object ||
	    (object == dsp->dsa_last_data_object &&
	    offset > dsp->dsa_last_data_offset));

	/*
	 * If we are doing a non-incremental send, then there can't
	 * be any data in the dataset we're receiving into.  Therefore
	 * a free record would simply be a no-op.  Save space by not
	 * sending it to begin with.
	 */
	if (!dsp->dsa_incremental)
		return (0);

	if (length != -1ULL && offset + length < offset)
		length = -1ULL;

	/*
	 * If there is a pending op, but it's not PENDING_FREE, push it out,
	 * since free block aggregation can only be done for blocks of the
	 * same type (i.e., DRR_FREE records can only be aggregated with
	 * other DRR_FREE records, and DRR_FREEOBJECTS records can only be
	 * aggregated with other DRR_FREEOBJECTS records).
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	if (dsp->dsa_pending_op == PENDING_FREE) {
		/*
		 * There should never be a PENDING_FREE if length is -1
		 * (because dump_dnode is the only place where this
		 * function is called with a -1, and only after flushing
		 * any pending record).
		 */
		ASSERT(length != -1ULL);
		/*
		 * Check to see whether this free block can be aggregated
		 * with the pending one.
		 */
		if (drrf->drr_object == object && drrf->drr_offset +
		    drrf->drr_length == offset) {
			drrf->drr_length += length;
			return (0);
		} else {
			/* not a continuation.  Push out pending record */
			if (dump_record(dsp, NULL, 0) != 0)
				return (SET_ERROR(EINTR));
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}
	/* create a FREE record and make it pending */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREE;
	drrf->drr_object = object;
	drrf->drr_offset = offset;
	drrf->drr_length = length;
	drrf->drr_toguid = dsp->dsa_toguid;
	if (length == -1ULL) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
	} else {
		dsp->dsa_pending_op = PENDING_FREE;
	}

	return (0);
}

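/*
 * Aggregation example (illustrative; assumes an incremental send, so
 * dsa_incremental is set): freeing two adjacent 128K blocks of object 5
 * produces the calls
 *
 *	dump_free(dsp, 5, 0, 131072);		record becomes pending
 *	dump_free(dsp, 5, 131072, 131072);	merged into pending record
 *
 * and a single DRR_FREE record with drr_offset = 0 and
 * drr_length = 262144 is eventually pushed to the stream.
 */
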
static int
dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
    uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
{
	struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);

	/*
	 * We send data in increasing object, offset order.
	 * See comment in dump_free() for details.
	 */
	ASSERT(object > dsp->dsa_last_data_object ||
	    (object == dsp->dsa_last_data_object &&
	    offset > dsp->dsa_last_data_offset));
	dsp->dsa_last_data_object = object;
	dsp->dsa_last_data_offset = offset + blksz - 1;

	/*
	 * If there is any kind of pending aggregation (currently either
	 * a grouping of free objects or free blocks), push it out to
	 * the stream, since aggregation can't be done across operations
	 * of different types.
	 */
	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}
	/* write a WRITE record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_WRITE;
	drrw->drr_object = object;
	drrw->drr_type = type;
	drrw->drr_offset = offset;
	drrw->drr_length = blksz;
	drrw->drr_toguid = dsp->dsa_toguid;
	if (bp == NULL || BP_IS_EMBEDDED(bp)) {
		/*
		 * There's no pre-computed checksum for partial-block
		 * writes or embedded BP's, so (like
		 * fletcher4-checksummed blocks) userland will have to
		 * compute a dedup-capable checksum itself.
		 */
		drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
	} else {
		drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
		if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
			drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
		DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
		DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
		DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
		drrw->drr_key.ddk_cksum = bp->blk_cksum;
	}

	if (dump_record(dsp, data, blksz) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}

static int
dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
    int blksz, const blkptr_t *bp)
{
	char buf[BPE_PAYLOAD_SIZE];
	struct drr_write_embedded *drrw =
	    &(dsp->dsa_drr->drr_u.drr_write_embedded);

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (EINTR);
		dsp->dsa_pending_op = PENDING_NONE;
	}

	ASSERT(BP_IS_EMBEDDED(bp));

	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED;
	drrw->drr_object = object;
	drrw->drr_offset = offset;
	drrw->drr_length = blksz;
	drrw->drr_toguid = dsp->dsa_toguid;
	drrw->drr_compression = BP_GET_COMPRESS(bp);
	drrw->drr_etype = BPE_GET_ETYPE(bp);
	drrw->drr_lsize = BPE_GET_LSIZE(bp);
	drrw->drr_psize = BPE_GET_PSIZE(bp);

	decode_embedded_bp_compressed(bp, buf);

	if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
		return (EINTR);
	return (0);
}

static int
dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
{
	struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write a SPILL record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_SPILL;
	drrs->drr_object = object;
	drrs->drr_length = blksz;
	drrs->drr_toguid = dsp->dsa_toguid;

	if (dump_record(dsp, data, blksz) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}

static int
dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
{
	struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);

	/* See comment in dump_free(). */
	if (!dsp->dsa_incremental)
		return (0);

	/*
	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
	 * push it out, since free block aggregation can only be done for
	 * blocks of the same type (i.e., DRR_FREE records can only be
	 * aggregated with other DRR_FREE records, and DRR_FREEOBJECTS
	 * records can only be aggregated with other DRR_FREEOBJECTS
	 * records).
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREEOBJECTS) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}
	if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) {
		/*
		 * See whether this free object array can be aggregated
		 * with the pending one.
		 */
		if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
			drrfo->drr_numobjs += numobjs;
			return (0);
		} else {
			/* can't be aggregated.  Push out pending record */
			if (dump_record(dsp, NULL, 0) != 0)
				return (SET_ERROR(EINTR));
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}

	/* write a FREEOBJECTS record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREEOBJECTS;
	drrfo->drr_firstobj = firstobj;
	drrfo->drr_numobjs = numobjs;
	drrfo->drr_toguid = dsp->dsa_toguid;

	dsp->dsa_pending_op = PENDING_FREEOBJECTS;

	return (0);
}

static int
dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
{
	struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);

	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
		return (dump_freeobjects(dsp, object, 1));

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write an OBJECT record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_OBJECT;
	drro->drr_object = object;
	drro->drr_type = dnp->dn_type;
	drro->drr_bonustype = dnp->dn_bonustype;
	drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	drro->drr_bonuslen = dnp->dn_bonuslen;
	drro->drr_checksumtype = dnp->dn_checksum;
	drro->drr_compress = dnp->dn_compress;
	drro->drr_toguid = dsp->dsa_toguid;

	if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
	    drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
		drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;

	if (dump_record(dsp, DN_BONUS(dnp),
	    P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) {
		return (SET_ERROR(EINTR));
	}

	/* Free anything past the end of the file. */
	if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0)
		return (SET_ERROR(EINTR));
	if (dsp->dsa_err != 0)
		return (SET_ERROR(EINTR));
	return (0);
}

static boolean_t
backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
{
	if (!BP_IS_EMBEDDED(bp))
		return (B_FALSE);

	/*
	 * Compression function must be legacy, or explicitly enabled.
	 */
	if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
	    !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4)))
		return (B_FALSE);

	/*
	 * Embed type must be explicitly enabled.
	 */
	switch (BPE_GET_ETYPE(bp)) {
	case BP_EMBEDDED_TYPE_DATA:
		if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
			return (B_TRUE);
		break;
	default:
		return (B_FALSE);
	}
	return (B_FALSE);
}

/*
 * This is the callback function to traverse_dataset that acts as the worker
 * thread for dmu_send_impl.
 */
/*ARGSUSED*/
static int
send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
{
	struct send_thread_arg *sta = arg;
	struct send_block_record *record;
	uint64_t record_size;
	int err = 0;

	if (sta->cancel)
		return (SET_ERROR(EINTR));

	if (bp == NULL) {
		ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL);
		return (0);
	} else if (zb->zb_level < 0) {
		return (0);
	}

	record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP);
	record->eos_marker = B_FALSE;
	record->bp = *bp;
	record->zb = *zb;
	record->indblkshift = dnp->dn_indblkshift;
	record->datablkszsec = dnp->dn_datablkszsec;
	record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	bqueue_enqueue(&sta->q, record, record_size);

	return (err);
}

/*
 * This function kicks off the traverse_dataset call.  It also handles setting
 * the error code of the thread in case something goes wrong, and pushes the
 * End of Stream record when the traverse_dataset call has finished.  If there
 * is no dataset to traverse, the thread immediately pushes the End of Stream
 * marker.
 */
static void
send_traverse_thread(void *arg)
{
	struct send_thread_arg *st_arg = arg;
	int err;
	struct send_block_record *data;

	if (st_arg->ds != NULL) {
		err = traverse_dataset(st_arg->ds, st_arg->fromtxg,
		    st_arg->flags, send_cb, arg);
		if (err != EINTR)
			st_arg->error_code = err;
	}
	data = kmem_zalloc(sizeof (*data), KM_SLEEP);
	data->eos_marker = B_TRUE;
	bqueue_enqueue(&st_arg->q, data, 1);
}

/*
 * This function actually handles figuring out what kind of record needs to be
 * dumped, reading the data (which has hopefully been prefetched), and calling
 * the appropriate helper function.
 */
static int
do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
{
	dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os);
	const blkptr_t *bp = &data->bp;
	const zbookmark_phys_t *zb = &data->zb;
	uint8_t indblkshift = data->indblkshift;
	uint16_t dblkszsec = data->datablkszsec;
	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
	int err = 0;
	dnode_phys_t *blk;
	uint64_t dnobj;

	ASSERT3U(zb->zb_level, >=, 0);

	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
	    DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
		return (0);
	} else if (BP_IS_HOLE(bp) &&
	    zb->zb_object == DMU_META_DNODE_OBJECT) {
		uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
		err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT);
	} else if (BP_IS_HOLE(bp)) {
		uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
		uint64_t offset = zb->zb_blkid * span;
		err = dump_free(dsa, zb->zb_object, offset, span);
	} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
		return (0);
	} else if (type == DMU_OT_DNODE) {
		int blksz = BP_GET_LSIZE(bp);
		arc_flags_t aflags = ARC_FLAG_WAIT;
		arc_buf_t *abuf;
		int i;

		ASSERT0(zb->zb_level);

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
		    &aflags, zb) != 0)
			return (SET_ERROR(EIO));

		blk = abuf->b_data;
		dnobj = zb->zb_blkid * (blksz >> DNODE_SHIFT);
		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
			err = dump_dnode(dsa, dnobj + i, blk + i);
			if (err != 0)
				break;
		}
		(void) arc_buf_remove_ref(abuf, &abuf);
	} else if (type == DMU_OT_SA) {
		arc_flags_t aflags = ARC_FLAG_WAIT;
		arc_buf_t *abuf;
		int blksz = BP_GET_LSIZE(bp);

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
		    &aflags, zb) != 0)
			return (SET_ERROR(EIO));

		err = dump_spill(dsa, zb->zb_object, blksz, abuf->b_data);
		(void) arc_buf_remove_ref(abuf, &abuf);
	} else if (backup_do_embed(dsa, bp)) {
		/* it's an embedded level-0 block of a regular object */
		int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
		ASSERT0(zb->zb_level);
		err = dump_write_embedded(dsa, zb->zb_object,
		    zb->zb_blkid * blksz, blksz, bp);
	} else {
		/* it's a level-0 block of a regular object */
		arc_flags_t aflags = ARC_FLAG_WAIT;
		arc_buf_t *abuf;
		int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
		uint64_t offset;

		ASSERT0(zb->zb_level);
		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
		    &aflags, zb) != 0) {
			if (zfs_send_corrupt_data) {
				uint64_t *ptr;
				/* Send a block filled with 0x"zfs badd bloc" */
				abuf = arc_buf_alloc(spa, blksz, &abuf,
				    ARC_BUFC_DATA);
				for (ptr = abuf->b_data;
				    (char *)ptr < (char *)abuf->b_data + blksz;
				    ptr++)
					*ptr = 0x2f5baddb10cULL;
			} else {
				return (SET_ERROR(EIO));
			}
		}

		offset = zb->zb_blkid * blksz;

		if (!(dsa->dsa_featureflags &
		    DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
		    blksz > SPA_OLD_MAXBLOCKSIZE) {
			char *buf = abuf->b_data;
			while (blksz > 0 && err == 0) {
				int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
				err = dump_write(dsa, type, zb->zb_object,
				    offset, n, NULL, buf);
				offset += n;
				buf += n;
				blksz -= n;
			}
		} else {
			err = dump_write(dsa, type, zb->zb_object,
			    offset, blksz, bp, abuf->b_data);
		}
		(void) arc_buf_remove_ref(abuf, &abuf);
	}

	ASSERT(err == 0 || err == EINTR);
	return (err);
}

/*
 * Pop the new data off the queue, and free the old data.
 */
static struct send_block_record *
get_next_record(bqueue_t *bq, struct send_block_record *data)
{
	struct send_block_record *tmp = bqueue_dequeue(bq);
	kmem_free(data, sizeof (*data));
	return (tmp);
}

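/*
 * Send-side pipeline sketch (illustrative): dmu_send_impl() below
 * spawns send_traverse_thread(), which walks the dataset and produces
 * send_block_records; the two sides are decoupled by a blocking queue
 * capped at zfs_send_queue_length bytes for backpressure:
 *
 *	send_traverse_thread() --> to_arg.q --> dmu_send_impl()
 *	   (send_cb enqueues)      (bqueue)     (do_dump drains)
 *
 * The traversal thread always terminates the queue with an eos_marker
 * record, even on error, so the consumer loop can drain and exit.
 */
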
/*
 * Actually do the bulk of the work in a zfs send.
 *
 * Note: Releases dp using the specified tag.
 */
static int
dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
    zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone, boolean_t embedok,
    boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off)
{
	objset_t *os;
	dmu_replay_record_t *drr;
	dmu_sendarg_t *dsp;
	int err;
	uint64_t fromtxg = 0;
	uint64_t featureflags = 0;
	struct send_thread_arg to_arg;
	struct send_block_record *to_data;

	err = dmu_objset_from_ds(to_ds, &os);
	if (err != 0) {
		dsl_pool_rele(dp, tag);
		return (err);
	}

	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
	drr->drr_type = DRR_BEGIN;
	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
	DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
	    DMU_SUBSTREAM);

#ifdef _KERNEL
	if (dmu_objset_type(os) == DMU_OST_ZFS) {
		uint64_t version;
		if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) {
			kmem_free(drr, sizeof (dmu_replay_record_t));
			dsl_pool_rele(dp, tag);
			return (SET_ERROR(EINVAL));
		}
		if (version >= ZPL_VERSION_SA) {
			featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
		}
	}
#endif

	if (large_block_ok && to_ds->ds_feature_inuse[SPA_FEATURE_LARGE_BLOCKS])
		featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
	if (embedok &&
	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
		featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
		if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
			featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4;
	}

	DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
	    featureflags);

	drr->drr_u.drr_begin.drr_creation_time =
	    dsl_dataset_phys(to_ds)->ds_creation_time;
	drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
	if (is_clone)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
	drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
	if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;

	if (ancestor_zb != NULL) {
		drr->drr_u.drr_begin.drr_fromguid =
		    ancestor_zb->zbm_guid;
		fromtxg = ancestor_zb->zbm_creation_txg;
	}
	dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname);
	if (!to_ds->ds_is_snapshot) {
		(void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--",
		    sizeof (drr->drr_u.drr_begin.drr_toname));
	}

	dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);

	dsp->dsa_drr = drr;
	dsp->dsa_vp = vp;
	dsp->dsa_outfd = outfd;
	dsp->dsa_proc = curproc;
	dsp->dsa_os = os;
	dsp->dsa_off = off;
	dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
	dsp->dsa_pending_op = PENDING_NONE;
	dsp->dsa_incremental = (ancestor_zb != NULL);
	dsp->dsa_featureflags = featureflags;

	mutex_enter(&to_ds->ds_sendstream_lock);
	list_insert_head(&to_ds->ds_sendstreams, dsp);
	mutex_exit(&to_ds->ds_sendstream_lock);

	dsl_dataset_long_hold(to_ds, FTAG);
	dsl_pool_rele(dp, tag);

	if (dump_record(dsp, NULL, 0) != 0) {
		err = dsp->dsa_err;
		goto out;
	}

	err = bqueue_init(&to_arg.q, zfs_send_queue_length,
	    offsetof(struct send_block_record, ln));
	to_arg.error_code = 0;
	to_arg.cancel = B_FALSE;
	to_arg.ds = to_ds;
	to_arg.fromtxg = fromtxg;
	to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH;
	(void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, curproc,
	    TS_RUN, minclsyspri);

	to_data = bqueue_dequeue(&to_arg.q);

	while (!to_data->eos_marker && err == 0) {
		err = do_dump(dsp, to_data);
		to_data = get_next_record(&to_arg.q, to_data);
		if (issig(JUSTLOOKING) && issig(FORREAL))
			err = EINTR;
	}

	if (err != 0) {
		to_arg.cancel = B_TRUE;
		while (!to_data->eos_marker) {
			to_data = get_next_record(&to_arg.q, to_data);
		}
	}
	kmem_free(to_data, sizeof (*to_data));

	bqueue_destroy(&to_arg.q);

	if (err == 0 && to_arg.error_code != 0)
		err = to_arg.error_code;

	if (err != 0)
		goto out;

	if (dsp->dsa_pending_op != PENDING_NONE)
		if (dump_record(dsp, NULL, 0) != 0)
			err = SET_ERROR(EINTR);

	if (err != 0) {
		if (err == EINTR && dsp->dsa_err != 0)
			err = dsp->dsa_err;
		goto out;
	}

	bzero(drr, sizeof (dmu_replay_record_t));
	drr->drr_type = DRR_END;
	drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
	drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;

	if (dump_record(dsp, NULL, 0) != 0)
		err = dsp->dsa_err;

out:
	mutex_enter(&to_ds->ds_sendstream_lock);
	list_remove(&to_ds->ds_sendstreams, dsp);
	mutex_exit(&to_ds->ds_sendstream_lock);

	kmem_free(drr, sizeof (dmu_replay_record_t));
	kmem_free(dsp, sizeof (dmu_sendarg_t));

	dsl_dataset_long_rele(to_ds, FTAG);

	return (err);
}

int
dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
    boolean_t embedok, boolean_t large_block_ok,
    int outfd, vnode_t *vp, offset_t *off)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	dsl_dataset_t *fromds = NULL;
	int err;

	err = dsl_pool_hold(pool, FTAG, &dp);
	if (err != 0)
		return (err);

	err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds);
	if (err != 0) {
		dsl_pool_rele(dp, FTAG);
		return (err);
	}

	if (fromsnap != 0) {
		zfs_bookmark_phys_t zb;
		boolean_t is_clone;

		err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds);
		if (err != 0) {
			dsl_dataset_rele(ds, FTAG);
			dsl_pool_rele(dp, FTAG);
			return (err);
		}
		if (!dsl_dataset_is_before(ds, fromds, 0))
			err = SET_ERROR(EXDEV);
		zb.zbm_creation_time =
		    dsl_dataset_phys(fromds)->ds_creation_time;
		zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg;
		zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
		is_clone = (fromds->ds_dir != ds->ds_dir);
		dsl_dataset_rele(fromds, FTAG);
		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
		    embedok, large_block_ok, outfd, vp, off);
	} else {
		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
		    embedok, large_block_ok, outfd, vp, off);
	}
	dsl_dataset_rele(ds, FTAG);
	return (err);
}

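/*
 * Usage sketch (illustrative; names are hypothetical): an incremental
 * send of pool/fs@b relative to snapshot pool/fs@a would arrive here as
 *
 *	err = dmu_send("pool/fs@b", "pool/fs@a", B_FALSE, B_FALSE,
 *	    outfd, vp, &off);
 *
 * A bookmark incremental instead passes fromsnap = "pool/fs#mark",
 * which is resolved below with dsl_bookmark_lookup() rather than a
 * dataset hold; dmu_send_obj() above is the variant used when the
 * caller already has object numbers rather than names.
 */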
int
dmu_send(const char *tosnap, const char *fromsnap,
    boolean_t embedok, boolean_t large_block_ok,
    int outfd, vnode_t *vp, offset_t *off)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	int err;
	boolean_t owned = B_FALSE;

	if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL)
		return (SET_ERROR(EINVAL));

	err = dsl_pool_hold(tosnap, FTAG, &dp);
	if (err != 0)
		return (err);

	if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) {
		/*
		 * We are sending a filesystem or volume.  Ensure
		 * that it doesn't change by owning the dataset.
		 */
		err = dsl_dataset_own(dp, tosnap, FTAG, &ds);
		owned = B_TRUE;
	} else {
		err = dsl_dataset_hold(dp, tosnap, FTAG, &ds);
	}
	if (err != 0) {
		dsl_pool_rele(dp, FTAG);
		return (err);
	}

	if (fromsnap != NULL) {
		zfs_bookmark_phys_t zb;
		boolean_t is_clone = B_FALSE;
		int fsnamelen = strchr(tosnap, '@') - tosnap;

		/*
		 * If the fromsnap is in a different filesystem, then
		 * mark the send stream as a clone.
		 */
		if (strncmp(tosnap, fromsnap, fsnamelen) != 0 ||
		    (fromsnap[fsnamelen] != '@' &&
		    fromsnap[fsnamelen] != '#')) {
			is_clone = B_TRUE;
		}

		if (strchr(fromsnap, '@')) {
			dsl_dataset_t *fromds;
			err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds);
			if (err == 0) {
				if (!dsl_dataset_is_before(ds, fromds, 0))
					err = SET_ERROR(EXDEV);
				zb.zbm_creation_time =
				    dsl_dataset_phys(fromds)->ds_creation_time;
				zb.zbm_creation_txg =
				    dsl_dataset_phys(fromds)->ds_creation_txg;
				zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
				is_clone = (ds->ds_dir != fromds->ds_dir);
				dsl_dataset_rele(fromds, FTAG);
			}
		} else {
			err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb);
		}
		if (err != 0) {
			dsl_dataset_rele(ds, FTAG);
			dsl_pool_rele(dp, FTAG);
			return (err);
		}
		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
		    embedok, large_block_ok, outfd, vp, off);
	} else {
		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
		    embedok, large_block_ok, outfd, vp, off);
	}
	if (owned)
		dsl_dataset_disown(ds, FTAG);
	else
		dsl_dataset_rele(ds, FTAG);
	return (err);
}

static int
dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t size,
    uint64_t *sizep)
{
	int err;
	/*
	 * Assume that space (both on-disk and in-stream) is dominated by
	 * data.  We will adjust for indirect blocks and the copies property,
	 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
	 */

	/*
	 * Subtract out approximate space used by indirect blocks.
	 * Assume most space is used by data blocks (non-indirect, non-dnode).
	 * Assume all blocks are recordsize.  Assume ditto blocks and
	 * internal fragmentation counteract compression.
	 *
	 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
	 * block, which we observe in practice.
	 */
	uint64_t recordsize;
	err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize);
	if (err != 0)
		return (err);
	size -= size / recordsize * sizeof (blkptr_t);

	/* Add in the space for the record associated with each block. */
	size += size / recordsize * sizeof (dmu_replay_record_t);

	*sizep = size;

	return (0);
}
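
/*
 * Worked example (illustrative): for 1 GiB of changed data at the
 * default 128K recordsize, size / recordsize = 8192 blocks, so the
 * adjustment subtracts 8192 * sizeof (blkptr_t) = 8192 * 128 bytes
 * (1 MiB) of indirect-block overhead and then adds one
 * dmu_replay_record_t of stream framing per block.
 */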

int
dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep)
{
	int err;
	uint64_t size;

	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));

	/* tosnap must be a snapshot */
	if (!ds->ds_is_snapshot)
		return (SET_ERROR(EINVAL));

	/* fromsnap, if provided, must be a snapshot */
	if (fromds != NULL && !fromds->ds_is_snapshot)
		return (SET_ERROR(EINVAL));

	/*
	 * fromsnap must be an earlier snapshot from the same fs as tosnap,
	 * or the origin's fs.
	 */
	if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0))
		return (SET_ERROR(EXDEV));

	/* Get uncompressed size estimate of changed data. */
	if (fromds == NULL) {
		size = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
	} else {
		uint64_t used, comp;
		err = dsl_dataset_space_written(fromds, ds,
		    &used, &comp, &size);
		if (err != 0)
			return (err);
	}

	err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep);
	return (err);
}

/*
 * Simple callback used to traverse the blocks of a snapshot and sum their
 * uncompressed size
 */
/* ARGSUSED */
static int
dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	uint64_t *spaceptr = arg;
	if (bp != NULL && !BP_IS_HOLE(bp)) {
		*spaceptr += BP_GET_UCSIZE(bp);
	}
	return (0);
}

/*
 * Given a destination snapshot and a TXG, calculate the approximate size of a
 * send stream sent from that TXG. from_txg may be zero, indicating that the
 * whole snapshot will be sent.
 */
int
dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg,
    uint64_t *sizep)
{
	int err;
	uint64_t size = 0;

	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));

	/* tosnap must be a snapshot */
	if (!dsl_dataset_is_snapshot(ds))
		return (SET_ERROR(EINVAL));

	/* verify that from_txg is before the provided snapshot was taken */
	if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) {
		return (SET_ERROR(EXDEV));
	}
	/*
	 * traverse the blocks of the snapshot with birth times after
	 * from_txg, summing their uncompressed size
	 */
	err = traverse_dataset(ds, from_txg, TRAVERSE_POST,
	    dmu_calculate_send_traversal, &size);
	if (err)
		return (err);

	err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep);
	return (err);
}

typedef struct dmu_recv_begin_arg {
	const char *drba_origin;
	dmu_recv_cookie_t *drba_cookie;
	cred_t *drba_cred;
	uint64_t drba_snapobj;
} dmu_recv_begin_arg_t;

static int
recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
    uint64_t fromguid)
{
	uint64_t val;
	int error;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	/* temporary clone name must not exist */
	error = zap_lookup(dp->dp_meta_objset,
	    dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name,
	    8, 1, &val);
	if (error != ENOENT)
		return (error == 0 ? EBUSY : error);

	/* new snapshot name must not exist */
	error = zap_lookup(dp->dp_meta_objset,
	    dsl_dataset_phys(ds)->ds_snapnames_zapobj,
	    drba->drba_cookie->drc_tosnap, 8, 1, &val);
	if (error != ENOENT)
		return (error == 0 ? EEXIST : error);

	/*
	 * Check snapshot limit before receiving. We'll recheck again at the
	 * end, but might as well abort before receiving if we're already over
	 * the limit.
	 *
	 * Note that we do not check the file system limit with
	 * dsl_dir_fscount_check because the temporary %clones don't count
	 * against that limit.
	 */
	error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT,
	    NULL, drba->drba_cred);
	if (error != 0)
		return (error);

	if (fromguid != 0) {
		dsl_dataset_t *snap;
		uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;

		/* Find snapshot in this dir that matches fromguid. */
		while (obj != 0) {
			error = dsl_dataset_hold_obj(dp, obj, FTAG,
			    &snap);
			if (error != 0)
				return (SET_ERROR(ENODEV));
			if (snap->ds_dir != ds->ds_dir) {
				dsl_dataset_rele(snap, FTAG);
				return (SET_ERROR(ENODEV));
			}
			if (dsl_dataset_phys(snap)->ds_guid == fromguid)
				break;
			obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
			dsl_dataset_rele(snap, FTAG);
		}
		if (obj == 0)
			return (SET_ERROR(ENODEV));

		if (drba->drba_cookie->drc_force) {
			drba->drba_snapobj = obj;
		} else {
			/*
			 * If we are not forcing, there must be no
			 * changes since fromsnap.
			 */
			if (dsl_dataset_modified_since_snap(ds, snap)) {
				dsl_dataset_rele(snap, FTAG);
				return (SET_ERROR(ETXTBSY));
			}
			drba->drba_snapobj = ds->ds_prev->ds_object;
		}

		dsl_dataset_rele(snap, FTAG);
	} else {
		/* if full, then must be forced */
		if (!drba->drba_cookie->drc_force)
			return (SET_ERROR(EEXIST));
		/* start from $ORIGIN@$ORIGIN, if supported */
		drba->drba_snapobj = dp->dp_origin_snap != NULL ?
		    dp->dp_origin_snap->ds_object : 0;
	}

	return (0);
}

static int
dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
{
	dmu_recv_begin_arg_t *drba = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
	uint64_t fromguid = drrb->drr_fromguid;
	int flags = drrb->drr_flags;
	int error;
	uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
	dsl_dataset_t *ds;
	const char *tofs = drba->drba_cookie->drc_tofs;

	/* already checked */
	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);

	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
	    DMU_COMPOUNDSTREAM ||
	    drrb->drr_type >= DMU_OST_NUMTYPES ||
	    ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL))
		return (SET_ERROR(EINVAL));

	/* Verify pool version supports SA if SA_SPILL feature set */
	if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
	    spa_version(dp->dp_spa) < SPA_VERSION_SA)
		return (SET_ERROR(ENOTSUP));

	/*
	 * The receiving code doesn't know how to translate a WRITE_EMBEDDED
	 * record to a plain WRITE record, so the pool must have the
	 * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
	 * records.  Same with WRITE_EMBEDDED records that use LZ4 compression.
	 */
	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
		return (SET_ERROR(ENOTSUP));
	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
		return (SET_ERROR(ENOTSUP));

	/*
	 * The receiving code doesn't know how to translate large blocks
	 * to smaller ones, so the pool must have the LARGE_BLOCKS
	 * feature enabled if the stream has LARGE_BLOCKS.
	 */
	if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
		return (SET_ERROR(ENOTSUP));

	error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
	if (error == 0) {
		/* target fs already exists; recv into temp clone */

		/* Can't recv a clone into an existing fs */
		if (flags & DRR_FLAG_CLONE) {
			dsl_dataset_rele(ds, FTAG);
			return (SET_ERROR(EINVAL));
		}

		error = recv_begin_check_existing_impl(drba, ds, fromguid);
		dsl_dataset_rele(ds, FTAG);
	} else if (error == ENOENT) {
		/* target fs does not exist; must be a full backup or clone */
		char buf[MAXNAMELEN];

		/*
		 * If it's a non-clone incremental, we are missing the
		 * target fs, so fail the recv.
		 */
		if (fromguid != 0 && !(flags & DRR_FLAG_CLONE ||
		    drba->drba_origin))
			return (SET_ERROR(ENOENT));

		/* Open the parent of tofs */
		ASSERT3U(strlen(tofs), <, MAXNAMELEN);
		(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
		error = dsl_dataset_hold(dp, buf, FTAG, &ds);
		if (error != 0)
			return (error);

		/*
		 * Check filesystem and snapshot limits before receiving. We'll
		 * recheck snapshot limits again at the end (we create the
		 * filesystems and increment those counts during begin_sync).
		 */
		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
		    ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred);
		if (error != 0) {
			dsl_dataset_rele(ds, FTAG);
			return (error);
		}

		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
		    ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred);
		if (error != 0) {
			dsl_dataset_rele(ds, FTAG);
			return (error);
		}

		if (drba->drba_origin != NULL) {
			dsl_dataset_t *origin;
			error = dsl_dataset_hold(dp, drba->drba_origin,
			    FTAG, &origin);
			if (error != 0) {
				dsl_dataset_rele(ds, FTAG);
				return (error);
			}
			if (!origin->ds_is_snapshot) {
				dsl_dataset_rele(origin, FTAG);
				dsl_dataset_rele(ds, FTAG);
				return (SET_ERROR(EINVAL));
			}
			if (dsl_dataset_phys(origin)->ds_guid != fromguid) {
				dsl_dataset_rele(origin, FTAG);
				dsl_dataset_rele(ds, FTAG);
				return (SET_ERROR(ENODEV));
			}
			dsl_dataset_rele(origin, FTAG);
		}
		dsl_dataset_rele(ds, FTAG);
		error = 0;
	}
	return (error);
}

static void
dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
{
	dmu_recv_begin_arg_t *drba = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
	const char *tofs = drba->drba_cookie->drc_tofs;
	dsl_dataset_t *ds, *newds;
	uint64_t dsobj;
	int error;
	uint64_t crflags;

	crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ?
	    DS_FLAG_CI_DATASET : 0;

	error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
	if (error == 0) {
		/* create temporary clone */
		dsl_dataset_t *snap = NULL;
		if (drba->drba_snapobj != 0) {
			VERIFY0(dsl_dataset_hold_obj(dp,
			    drba->drba_snapobj, FTAG, &snap));
		}
		dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name,
		    snap, crflags, drba->drba_cred, tx);
		if (drba->drba_snapobj != 0)
			dsl_dataset_rele(snap, FTAG);
		dsl_dataset_rele(ds, FTAG);
	} else {
		dsl_dir_t *dd;
		const char *tail;
		dsl_dataset_t *origin = NULL;

		VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail));

		if (drba->drba_origin != NULL) {
			VERIFY0(dsl_dataset_hold(dp, drba->drba_origin,
			    FTAG, &origin));
		}

		/* Create new dataset. */
		dsobj = dsl_dataset_create_sync(dd,
		    strrchr(tofs, '/') + 1,
		    origin, crflags, drba->drba_cred, tx);
		if (origin != NULL)
			dsl_dataset_rele(origin, FTAG);
		dsl_dir_rele(dd, FTAG);
		drba->drba_cookie->drc_newfs = B_TRUE;
	}
	VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));

	dmu_buf_will_dirty(newds->ds_dbuf, tx);
	dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;

	/*
	 * If we actually created a non-clone, we need to create the
	 * objset in our new dataset.
	 */
	if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) {
		(void) dmu_objset_create_impl(dp->dp_spa,
		    newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx);
	}

	drba->drba_cookie->drc_ds = newds;

	spa_history_log_internal_ds(newds, "receive", tx, "");
}

/*
 * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
 * succeeds; otherwise we will leak the holds on the datasets.
 */
int
dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
    boolean_t force, char *origin, dmu_recv_cookie_t *drc)
{
	dmu_recv_begin_arg_t drba = { 0 };
	dmu_replay_record_t *drr;

	bzero(drc, sizeof (dmu_recv_cookie_t));
	drc->drc_drrb = drrb;
	drc->drc_tosnap = tosnap;
	drc->drc_tofs = tofs;
	drc->drc_force = force;
	drc->drc_cred = CRED();

	if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
		drc->drc_byteswap = B_TRUE;
	else if (drrb->drr_magic != DMU_BACKUP_MAGIC)
		return (SET_ERROR(EINVAL));

	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
	drr->drr_type = DRR_BEGIN;
	drr->drr_u.drr_begin = *drc->drc_drrb;
	if (drc->drc_byteswap) {
		fletcher_4_incremental_byteswap(drr,
		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
	} else {
		fletcher_4_incremental_native(drr,
		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
	}
	kmem_free(drr, sizeof (dmu_replay_record_t));

	if (drc->drc_byteswap) {
		drrb->drr_magic = BSWAP_64(drrb->drr_magic);
		drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
		drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
		drrb->drr_type = BSWAP_32(drrb->drr_type);
		drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
		drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
	}

	drba.drba_origin = origin;
	drba.drba_cookie = drc;
	drba.drba_cred = CRED();

	return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync,
	    &drba, 5, ZFS_SPACE_CHECK_NORMAL));
}

struct receive_record_arg {
	dmu_replay_record_t header;
	void *payload; /* Pointer to a buffer containing the payload */
	/*
	 * If the record is a write, pointer to the arc_buf_t containing the
	 * payload.
	 */
	arc_buf_t *write_buf;
	int payload_size;
	boolean_t eos_marker; /* Marks the end of the stream */
	bqueue_node_t node;
};

struct receive_writer_arg {
	objset_t *os;
	boolean_t byteswap;
	bqueue_t q;
	/*
	 * These three args are used to signal to the main thread that we're
	 * done.
	 */
	kmutex_t mutex;
	kcondvar_t cv;
	boolean_t done;
	int err;
	/* A map from guid to dataset to help handle dedup'd streams. */
	avl_tree_t *guid_to_ds_map;
};

struct receive_arg {
	objset_t *os;
	vnode_t *vp; /* The vnode to read the stream from */
	uint64_t voff; /* The current offset in the stream */
	/*
	 * A record that has had its payload read in, but hasn't yet been handed
	 * off to the worker thread.
	 */
	struct receive_record_arg *rrd;
	/* A record that has had its header read in, but not its payload. */
	struct receive_record_arg *next_rrd;
	zio_cksum_t cksum;
	zio_cksum_t prev_cksum;
	int err;
	boolean_t byteswap;
	/* Sorted list of objects not to issue prefetches for. */
	list_t ignore_obj_list;
};
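
/*
 * Receive-side pipeline sketch (illustrative; mirrors the send side):
 * the reader fills receive_record_arg structures from the vnode
 * (next_rrd holds a record whose header has been read but not its
 * payload, rrd a fully read record) and hands them to the writer
 * thread through receive_writer_arg.q; mutex/cv/done signal completion
 * back to the reader.
 *
 *	reader (vn_rdwr) --> rwa->q --> writer thread
 *	  (ra->rrd/next_rrd)  (bqueue)    (applies records)
 */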

struct receive_ign_obj_node {
	list_node_t node;
	uint64_t object;
};

typedef struct guid_map_entry {
	uint64_t guid;
	dsl_dataset_t *gme_ds;
	avl_node_t avlnode;
} guid_map_entry_t;

static int
guid_compare(const void *arg1, const void *arg2)
{
	const guid_map_entry_t *gmep1 = arg1;
	const guid_map_entry_t *gmep2 = arg2;

	if (gmep1->guid < gmep2->guid)
		return (-1);
	else if (gmep1->guid > gmep2->guid)
		return (1);
	return (0);
}

static void
free_guid_map_onexit(void *arg)
{
	avl_tree_t *ca = arg;
	void *cookie = NULL;
	guid_map_entry_t *gmep;

	while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
		dsl_dataset_long_rele(gmep->gme_ds, gmep);
		dsl_dataset_rele(gmep->gme_ds, gmep);
		kmem_free(gmep, sizeof (guid_map_entry_t));
	}
	avl_destroy(ca);
	kmem_free(ca, sizeof (avl_tree_t));
}

1538
37f8a883 1539static int
fcff0f35 1540receive_read(struct receive_arg *ra, int len, void *buf)
34dc7c2f 1541{
34dc7c2f
BB
1542 int done = 0;
1543
1544 /* some things will require 8-byte alignment, so everything must */
c99c9001 1545 ASSERT0(len % 8);
34dc7c2f
BB
1546
1547 while (done < len) {
1548 ssize_t resid;
1549
1550 ra->err = vn_rdwr(UIO_READ, ra->vp,
37f8a883 1551 (char *)buf + done, len - done,
34dc7c2f
BB
1552 ra->voff, UIO_SYSSPACE, FAPPEND,
1553 RLIM64_INFINITY, CRED(), &resid);
1554
1555 if (resid == len - done)
2e528b49 1556 ra->err = SET_ERROR(EINVAL);
34dc7c2f
BB
1557 ra->voff += len - done - resid;
1558 done = len - resid;
13fe0198 1559 if (ra->err != 0)
37f8a883 1560 return (ra->err);
34dc7c2f
BB
1561 }
1562
1563 ASSERT3U(done, ==, len);
37f8a883 1564 return (0);
34dc7c2f
BB
1565}
1566
60948de1 1567noinline static void
37f8a883 1568byteswap_record(dmu_replay_record_t *drr)
34dc7c2f
BB
1569{
1570#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
1571#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
1572 drr->drr_type = BSWAP_32(drr->drr_type);
1573 drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
37f8a883 1574
34dc7c2f
BB
1575 switch (drr->drr_type) {
1576 case DRR_BEGIN:
1577 DO64(drr_begin.drr_magic);
428870ff 1578 DO64(drr_begin.drr_versioninfo);
34dc7c2f
BB
1579 DO64(drr_begin.drr_creation_time);
1580 DO32(drr_begin.drr_type);
1581 DO32(drr_begin.drr_flags);
1582 DO64(drr_begin.drr_toguid);
1583 DO64(drr_begin.drr_fromguid);
1584 break;
1585 case DRR_OBJECT:
1586 DO64(drr_object.drr_object);
34dc7c2f
BB
1587 DO32(drr_object.drr_type);
1588 DO32(drr_object.drr_bonustype);
1589 DO32(drr_object.drr_blksz);
1590 DO32(drr_object.drr_bonuslen);
428870ff 1591 DO64(drr_object.drr_toguid);
34dc7c2f
BB
1592 break;
1593 case DRR_FREEOBJECTS:
1594 DO64(drr_freeobjects.drr_firstobj);
1595 DO64(drr_freeobjects.drr_numobjs);
428870ff 1596 DO64(drr_freeobjects.drr_toguid);
34dc7c2f
BB
1597 break;
1598 case DRR_WRITE:
1599 DO64(drr_write.drr_object);
1600 DO32(drr_write.drr_type);
1601 DO64(drr_write.drr_offset);
1602 DO64(drr_write.drr_length);
428870ff 1603 DO64(drr_write.drr_toguid);
37f8a883 1604 ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum);
428870ff
BB
1605 DO64(drr_write.drr_key.ddk_prop);
1606 break;
1607 case DRR_WRITE_BYREF:
1608 DO64(drr_write_byref.drr_object);
1609 DO64(drr_write_byref.drr_offset);
1610 DO64(drr_write_byref.drr_length);
1611 DO64(drr_write_byref.drr_toguid);
1612 DO64(drr_write_byref.drr_refguid);
1613 DO64(drr_write_byref.drr_refobject);
1614 DO64(drr_write_byref.drr_refoffset);
37f8a883
MA
1615 ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref.
1616 drr_key.ddk_cksum);
428870ff 1617 DO64(drr_write_byref.drr_key.ddk_prop);
34dc7c2f 1618 break;
9b67f605
MA
1619 case DRR_WRITE_EMBEDDED:
1620 DO64(drr_write_embedded.drr_object);
1621 DO64(drr_write_embedded.drr_offset);
1622 DO64(drr_write_embedded.drr_length);
1623 DO64(drr_write_embedded.drr_toguid);
1624 DO32(drr_write_embedded.drr_lsize);
1625 DO32(drr_write_embedded.drr_psize);
1626 break;
34dc7c2f
BB
1627 case DRR_FREE:
1628 DO64(drr_free.drr_object);
1629 DO64(drr_free.drr_offset);
1630 DO64(drr_free.drr_length);
428870ff
BB
1631 DO64(drr_free.drr_toguid);
1632 break;
1633 case DRR_SPILL:
1634 DO64(drr_spill.drr_object);
1635 DO64(drr_spill.drr_length);
1636 DO64(drr_spill.drr_toguid);
34dc7c2f
BB
1637 break;
1638 case DRR_END:
428870ff 1639 DO64(drr_end.drr_toguid);
37f8a883 1640 ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum);
34dc7c2f 1641 break;
e75c13c3
BB
1642 default:
1643 break;
34dc7c2f 1644 }
37f8a883
MA
1645
1646 if (drr->drr_type != DRR_BEGIN) {
1647 ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum);
1648 }
1649
34dc7c2f
BB
1650#undef DO64
1651#undef DO32
1652}
1653
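byteswap_record() above flips every multi-byte field exactly once when the stream was produced on an opposite-endian host; the DO32/DO64 macros exist so each field is named a single time. A self-contained sketch of the same macro pattern, assuming GCC/Clang __builtin_bswap* and an invented record type:

	#include <stdint.h>

	#define	BSWAP_32(x)	__builtin_bswap32(x)
	#define	BSWAP_64(x)	__builtin_bswap64(x)

	struct example_record {
		uint32_t er_type;
		uint64_t er_offset;
		uint64_t er_length;
	};

	static void
	byteswap_example_record(struct example_record *er)
	{
	#define	DO64(X)	(er->X = BSWAP_64(er->X))
	#define	DO32(X)	(er->X = BSWAP_32(er->X))
		DO32(er_type);
		DO64(er_offset);
		DO64(er_length);
	#undef	DO64
	#undef	DO32
	}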
6c59307a
MA
1654static inline uint8_t
1655deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
1656{
1657 if (bonus_type == DMU_OT_SA) {
1658 return (1);
1659 } else {
1660 return (1 +
1661 ((DN_MAX_BONUSLEN - bonus_size) >> SPA_BLKPTRSHIFT));
1662 }
1663}
1664
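deduce_nblkptr() works because the bonus buffer and the block pointer array share the tail of the dnode: every 128-byte blkptr slot not displaced by bonus space remains usable, and a DMU_OT_SA bonus is pinned to a single blkptr since it may later spill. A sketch with the classic constants, assuming a 512-byte dnode (DN_MAX_BONUSLEN of 320) and 128-byte block pointers (SPA_BLKPTRSHIFT of 7):

	#include <stdio.h>
	#include <stdint.h>

	#define	SPA_BLKPTRSHIFT	7	/* assumed: sizeof (blkptr_t) == 128 */
	#define	DN_MAX_BONUSLEN	320	/* assumed: classic 512-byte dnode layout */

	static uint8_t
	deduce_nblkptr_sketch(int is_sa, uint64_t bonus_size)
	{
		if (is_sa)
			return (1);	/* SA bonus may grow later, keep one blkptr */
		return (1 + ((DN_MAX_BONUSLEN - bonus_size) >> SPA_BLKPTRSHIFT));
	}

	int
	main(void)
	{
		/* 320 -> 1, 192 -> 2, 64 -> 3 (the dnode maximum) */
		printf("%u %u %u\n",
		    (unsigned)deduce_nblkptr_sketch(0, 320),
		    (unsigned)deduce_nblkptr_sketch(0, 192),
		    (unsigned)deduce_nblkptr_sketch(0, 64));
		return (0);
	}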
60948de1 1665noinline static int
fcff0f35
PD
1666receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
1667 void *data)
34dc7c2f 1668{
6c59307a 1669 dmu_object_info_t doi;
34dc7c2f 1670 dmu_tx_t *tx;
6c59307a
MA
1671 uint64_t object;
1672 int err;
34dc7c2f 1673
34dc7c2f 1674 if (drro->drr_type == DMU_OT_NONE ||
9ae529ec
CS
1675 !DMU_OT_IS_VALID(drro->drr_type) ||
1676 !DMU_OT_IS_VALID(drro->drr_bonustype) ||
428870ff 1677 drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
34dc7c2f
BB
1678 drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
1679 P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
1680 drro->drr_blksz < SPA_MINBLOCKSIZE ||
fcff0f35 1681 drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) ||
34dc7c2f 1682 drro->drr_bonuslen > DN_MAX_BONUSLEN) {
2e528b49 1683 return (SET_ERROR(EINVAL));
34dc7c2f
BB
1684 }
1685
fcff0f35 1686 err = dmu_object_info(rwa->os, drro->drr_object, &doi);
9babb374
BB
1687
1688 if (err != 0 && err != ENOENT)
2e528b49 1689 return (SET_ERROR(EINVAL));
6c59307a 1690 object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT;
9babb374 1691
6c59307a
MA
1692 /*
 1693 * If we are losing blkptrs or changing the block size, this must
1694 * be a new file instance. We must clear out the previous file
1695 * contents before we can change this type of metadata in the dnode.
1696 */
1697 if (err == 0) {
1698 int nblkptr;
1699
1700 nblkptr = deduce_nblkptr(drro->drr_bonustype,
1701 drro->drr_bonuslen);
1702
1703 if (drro->drr_blksz != doi.doi_data_block_size ||
1704 nblkptr < doi.doi_nblkptr) {
fcff0f35 1705 err = dmu_free_long_range(rwa->os, drro->drr_object,
6c59307a
MA
1706 0, DMU_OBJECT_END);
1707 if (err != 0)
1708 return (SET_ERROR(EINVAL));
34dc7c2f 1709 }
6c59307a
MA
1710 }
1711
fcff0f35 1712 tx = dmu_tx_create(rwa->os);
6c59307a
MA
1713 dmu_tx_hold_bonus(tx, object);
1714 err = dmu_tx_assign(tx, TXG_WAIT);
1715 if (err != 0) {
1716 dmu_tx_abort(tx);
1717 return (err);
1718 }
1719
1720 if (object == DMU_NEW_OBJECT) {
1721 /* currently free, want to be allocated */
fcff0f35 1722 err = dmu_object_claim(rwa->os, drro->drr_object,
34dc7c2f
BB
1723 drro->drr_type, drro->drr_blksz,
1724 drro->drr_bonustype, drro->drr_bonuslen, tx);
6c59307a
MA
1725 } else if (drro->drr_type != doi.doi_type ||
1726 drro->drr_blksz != doi.doi_data_block_size ||
1727 drro->drr_bonustype != doi.doi_bonus_type ||
1728 drro->drr_bonuslen != doi.doi_bonus_size) {
1729 /* currently allocated, but with different properties */
fcff0f35 1730 err = dmu_object_reclaim(rwa->os, drro->drr_object,
34dc7c2f 1731 drro->drr_type, drro->drr_blksz,
6c59307a 1732 drro->drr_bonustype, drro->drr_bonuslen, tx);
34dc7c2f 1733 }
13fe0198 1734 if (err != 0) {
6c59307a 1735 dmu_tx_commit(tx);
2e528b49 1736 return (SET_ERROR(EINVAL));
428870ff 1737 }
9babb374 1738
fcff0f35 1739 dmu_object_set_checksum(rwa->os, drro->drr_object,
37f8a883 1740 drro->drr_checksumtype, tx);
fcff0f35 1741 dmu_object_set_compress(rwa->os, drro->drr_object,
37f8a883 1742 drro->drr_compress, tx);
34dc7c2f 1743
b128c09f 1744 if (data != NULL) {
34dc7c2f 1745 dmu_buf_t *db;
b128c09f 1746
fcff0f35 1747 VERIFY0(dmu_bonus_hold(rwa->os, drro->drr_object, FTAG, &db));
34dc7c2f
BB
1748 dmu_buf_will_dirty(db, tx);
1749
1750 ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
34dc7c2f 1751 bcopy(data, db->db_data, drro->drr_bonuslen);
fcff0f35 1752 if (rwa->byteswap) {
9ae529ec
CS
1753 dmu_object_byteswap_t byteswap =
1754 DMU_OT_BYTESWAP(drro->drr_bonustype);
1755 dmu_ot_byteswap[byteswap].ob_func(db->db_data,
34dc7c2f
BB
1756 drro->drr_bonuslen);
1757 }
1758 dmu_buf_rele(db, FTAG);
1759 }
1760 dmu_tx_commit(tx);
1761 return (0);
1762}
1763
1764/* ARGSUSED */
60948de1 1765noinline static int
fcff0f35 1766receive_freeobjects(struct receive_writer_arg *rwa,
34dc7c2f
BB
1767 struct drr_freeobjects *drrfo)
1768{
1769 uint64_t obj;
1770
1771 if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
2e528b49 1772 return (SET_ERROR(EINVAL));
34dc7c2f
BB
1773
1774 for (obj = drrfo->drr_firstobj;
1775 obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
fcff0f35 1776 (void) dmu_object_next(rwa->os, &obj, FALSE, 0)) {
34dc7c2f
BB
1777 int err;
1778
fcff0f35 1779 if (dmu_object_info(rwa->os, obj, NULL) != 0)
34dc7c2f
BB
1780 continue;
1781
fcff0f35 1782 err = dmu_free_long_object(rwa->os, obj);
13fe0198 1783 if (err != 0)
34dc7c2f 1784 return (err);
34dc7c2f
BB
1785 }
1786 return (0);
1787}
1788
60948de1 1789noinline static int
fcff0f35
PD
1790receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw,
1791 arc_buf_t *abuf)
34dc7c2f
BB
1792{
1793 dmu_tx_t *tx;
88904bb3 1794 dmu_buf_t *bonus;
34dc7c2f
BB
1795 int err;
1796
1797 if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
9ae529ec 1798 !DMU_OT_IS_VALID(drrw->drr_type))
2e528b49 1799 return (SET_ERROR(EINVAL));
34dc7c2f 1800
fcff0f35 1801 if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0)
88904bb3
MA
1802 return (SET_ERROR(EINVAL));
1803
fcff0f35 1804 tx = dmu_tx_create(rwa->os);
34dc7c2f
BB
1805
1806 dmu_tx_hold_write(tx, drrw->drr_object,
1807 drrw->drr_offset, drrw->drr_length);
1808 err = dmu_tx_assign(tx, TXG_WAIT);
13fe0198 1809 if (err != 0) {
34dc7c2f
BB
1810 dmu_tx_abort(tx);
1811 return (err);
1812 }
fcff0f35 1813 if (rwa->byteswap) {
9ae529ec
CS
1814 dmu_object_byteswap_t byteswap =
1815 DMU_OT_BYTESWAP(drrw->drr_type);
37f8a883
MA
1816 dmu_ot_byteswap[byteswap].ob_func(abuf->b_data,
1817 drrw->drr_length);
9ae529ec 1818 }
37f8a883 1819
fcff0f35 1820 if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0) {
 /* the tx is already assigned; commit the empty tx rather than leak it */
 dmu_tx_commit(tx);
37f8a883 1821 return (SET_ERROR(EINVAL));
 }
88904bb3 1822 dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx);
34dc7c2f 1823 dmu_tx_commit(tx);
88904bb3 1824 dmu_buf_rele(bonus, FTAG);
34dc7c2f
BB
1825 return (0);
1826}
1827
428870ff
BB
1828/*
1829 * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed
1830 * streams to refer to a copy of the data that is already on the
1831 * system because it came in earlier in the stream. This function
1832 * finds the earlier copy of the data, and uses that copy instead of
1833 * data from the stream to fulfill this write.
1834 */
1835static int
fcff0f35
PD
1836receive_write_byref(struct receive_writer_arg *rwa,
1837 struct drr_write_byref *drrwbr)
428870ff
BB
1838{
1839 dmu_tx_t *tx;
1840 int err;
1841 guid_map_entry_t gmesrch;
1842 guid_map_entry_t *gmep;
9b67f605 1843 avl_index_t where;
428870ff
BB
1844 objset_t *ref_os = NULL;
1845 dmu_buf_t *dbp;
1846
1847 if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
2e528b49 1848 return (SET_ERROR(EINVAL));
428870ff
BB
1849
1850 /*
1851 * If the GUID of the referenced dataset is different from the
1852 * GUID of the target dataset, find the referenced dataset.
1853 */
1854 if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
1855 gmesrch.guid = drrwbr->drr_refguid;
fcff0f35 1856 if ((gmep = avl_find(rwa->guid_to_ds_map, &gmesrch,
428870ff 1857 &where)) == NULL) {
2e528b49 1858 return (SET_ERROR(EINVAL));
428870ff
BB
1859 }
1860 if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
2e528b49 1861 return (SET_ERROR(EINVAL));
428870ff 1862 } else {
fcff0f35 1863 ref_os = rwa->os;
428870ff
BB
1864 }
1865
c65aa5b2
BB
1866 err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
1867 drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH);
9b67f605 1868 if (err != 0)
428870ff
BB
1869 return (err);
1870
fcff0f35 1871 tx = dmu_tx_create(rwa->os);
428870ff
BB
1872
1873 dmu_tx_hold_write(tx, drrwbr->drr_object,
1874 drrwbr->drr_offset, drrwbr->drr_length);
1875 err = dmu_tx_assign(tx, TXG_WAIT);
13fe0198 1876 if (err != 0) {
428870ff
BB
1877 dmu_tx_abort(tx);
1878 return (err);
1879 }
fcff0f35 1880 dmu_write(rwa->os, drrwbr->drr_object,
428870ff
BB
1881 drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
1882 dmu_buf_rele(dbp, FTAG);
1883 dmu_tx_commit(tx);
1884 return (0);
1885}
1886
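receive_write_byref() above resolves drr_refguid through rwa->guid_to_ds_map, which add_ds_to_guidmap() (later in this file) populates as received snapshots complete. A minimal sketch of the same lookup idea using a sorted array and bsearch(3) in place of the kernel AVL tree; the types here are invented for illustration:

	#include <stdint.h>
	#include <stdlib.h>

	struct guid_map_entry_sketch {
		uint64_t guid;
		void *dataset;	/* stand-in for the held dsl_dataset_t */
	};

	static int
	guid_compare_sketch(const void *a, const void *b)
	{
		uint64_t ga = ((const struct guid_map_entry_sketch *)a)->guid;
		uint64_t gb = ((const struct guid_map_entry_sketch *)b)->guid;
		return ((ga > gb) - (ga < gb));
	}

	/*
	 * Return the entry for refguid, or NULL; a NULL result corresponds
	 * to the SET_ERROR(EINVAL) path above.
	 */
	static struct guid_map_entry_sketch *
	guid_map_find(struct guid_map_entry_sketch *map, size_t n, uint64_t refguid)
	{
		struct guid_map_entry_sketch key = { .guid = refguid };
		return (bsearch(&key, map, n, sizeof (key), guid_compare_sketch));
	}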
9b67f605 1887static int
fcff0f35 1888receive_write_embedded(struct receive_writer_arg *rwa,
37f8a883 1889 struct drr_write_embedded *drrwnp, void *data)
9b67f605
MA
1890{
1891 dmu_tx_t *tx;
1892 int err;
9b67f605
MA
1893
 1894 if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset)
 1895 return (SET_ERROR(EINVAL));
 1896
 1897 if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE)
 1898 return (SET_ERROR(EINVAL));
 1899
 1900 if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES)
 1901 return (SET_ERROR(EINVAL));
 1902 if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
 1903 return (SET_ERROR(EINVAL));
1904
fcff0f35 1905 tx = dmu_tx_create(rwa->os);
9b67f605
MA
1906
1907 dmu_tx_hold_write(tx, drrwnp->drr_object,
1908 drrwnp->drr_offset, drrwnp->drr_length);
1909 err = dmu_tx_assign(tx, TXG_WAIT);
1910 if (err != 0) {
1911 dmu_tx_abort(tx);
1912 return (err);
1913 }
1914
fcff0f35 1915 dmu_write_embedded(rwa->os, drrwnp->drr_object,
9b67f605
MA
1916 drrwnp->drr_offset, data, drrwnp->drr_etype,
1917 drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize,
fcff0f35 1918 rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx);
9b67f605
MA
1919
1920 dmu_tx_commit(tx);
1921 return (0);
1922}
1923
428870ff 1924static int
fcff0f35
PD
1925receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
1926 void *data)
428870ff
BB
1927{
1928 dmu_tx_t *tx;
428870ff
BB
1929 dmu_buf_t *db, *db_spill;
1930 int err;
1931
1932 if (drrs->drr_length < SPA_MINBLOCKSIZE ||
fcff0f35 1933 drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os)))
2e528b49 1934 return (SET_ERROR(EINVAL));
428870ff 1935
fcff0f35 1936 if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0)
2e528b49 1937 return (SET_ERROR(EINVAL));
428870ff 1938
fcff0f35 1939 VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db));
428870ff
BB
1940 if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
1941 dmu_buf_rele(db, FTAG);
1942 return (err);
1943 }
1944
fcff0f35 1945 tx = dmu_tx_create(rwa->os);
428870ff
BB
1946
1947 dmu_tx_hold_spill(tx, db->db_object);
1948
1949 err = dmu_tx_assign(tx, TXG_WAIT);
13fe0198 1950 if (err != 0) {
428870ff
BB
1951 dmu_buf_rele(db, FTAG);
1952 dmu_buf_rele(db_spill, FTAG);
1953 dmu_tx_abort(tx);
1954 return (err);
1955 }
1956 dmu_buf_will_dirty(db_spill, tx);
1957
1958 if (db_spill->db_size < drrs->drr_length)
1959 VERIFY(0 == dbuf_spill_set_blksz(db_spill,
1960 drrs->drr_length, tx));
1961 bcopy(data, db_spill->db_data, drrs->drr_length);
1962
1963 dmu_buf_rele(db, FTAG);
1964 dmu_buf_rele(db_spill, FTAG);
1965
1966 dmu_tx_commit(tx);
1967 return (0);
1968}
1969
34dc7c2f 1970/* ARGSUSED */
60948de1 1971noinline static int
fcff0f35 1972receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf)
34dc7c2f 1973{
34dc7c2f
BB
1974 int err;
1975
1976 if (drrf->drr_length != -1ULL &&
1977 drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
2e528b49 1978 return (SET_ERROR(EINVAL));
34dc7c2f 1979
fcff0f35 1980 if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0)
2e528b49 1981 return (SET_ERROR(EINVAL));
34dc7c2f 1982
fcff0f35 1983 err = dmu_free_long_range(rwa->os, drrf->drr_object,
34dc7c2f 1984 drrf->drr_offset, drrf->drr_length);
fcff0f35 1985
34dc7c2f
BB
1986 return (err);
1987}
1988
13fe0198
MA
1989/* used to destroy the drc_ds on error */
1990static void
1991dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
1992{
1993 char name[MAXNAMELEN];
1994 dsl_dataset_name(drc->drc_ds, name);
1995 dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
1996 (void) dsl_destroy_head(name);
1997}
1998
37f8a883 1999static void
fcff0f35 2000receive_cksum(struct receive_arg *ra, int len, void *buf)
37f8a883
MA
2001{
2002 if (ra->byteswap) {
2003 fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
2004 } else {
2005 fletcher_4_incremental_native(buf, len, &ra->cksum);
2006 }
2007}
2008
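receive_cksum() folds each buffer into a running fletcher-4 state, so the checksum of everything seen so far never requires revisiting earlier bytes. A userland sketch of an incremental native-order update, assuming 4-byte-aligned input (the real implementation is fletcher_4_incremental_native() in zfs_fletcher.c; the struct below mirrors zio_cksum_t):

	#include <stddef.h>
	#include <stdint.h>

	typedef struct sketch_cksum {
		uint64_t zc_word[4];
	} sketch_cksum_t;

	/*
	 * Fold len bytes (len must be a multiple of 4; the stream guarantees
	 * 8-byte alignment) into a running fletcher-4 checksum.
	 */
	static void
	fletcher4_incremental_sketch(const void *buf, size_t len,
	    sketch_cksum_t *zcp)
	{
		const uint32_t *ip = buf;
		const uint32_t *ipend = ip + (len / sizeof (uint32_t));
		uint64_t a = zcp->zc_word[0], b = zcp->zc_word[1];
		uint64_t c = zcp->zc_word[2], d = zcp->zc_word[3];

		for (; ip < ipend; ip++) {
			a += *ip;
			b += a;
			c += b;
			d += c;
		}
		zcp->zc_word[0] = a; zcp->zc_word[1] = b;
		zcp->zc_word[2] = c; zcp->zc_word[3] = d;
	}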
2009/*
fcff0f35
PD
2010 * Read the payload into a buffer of size len, and update the current record's
2011 * payload field.
2012 * Allocate ra->next_rrd and read the next record's header into
2013 * ra->next_rrd->header.
37f8a883
MA
2014 * Verify checksum of payload and next record.
2015 */
2016static int
fcff0f35 2017receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf)
37f8a883
MA
2018{
2019 int err;
2020 zio_cksum_t cksum_orig;
2021 zio_cksum_t *cksump;
2022
2023 if (len != 0) {
fcff0f35
PD
2024 ASSERT3U(len, <=, SPA_MAXBLOCKSIZE);
2025 ra->rrd->payload = buf;
2026 ra->rrd->payload_size = len;
2027 err = receive_read(ra, len, ra->rrd->payload);
37f8a883
MA
2028 if (err != 0)
2029 return (err);
fcff0f35 2030 receive_cksum(ra, len, ra->rrd->payload);
37f8a883
MA
2031 }
2032
2033 ra->prev_cksum = ra->cksum;
2034
fcff0f35
PD
2035 ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP);
2036 err = receive_read(ra, sizeof (ra->next_rrd->header),
2037 &ra->next_rrd->header);
2038 if (err != 0) {
2039 kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
2040 ra->next_rrd = NULL;
37f8a883 2041 return (err);
fcff0f35
PD
2042 }
2043 if (ra->next_rrd->header.drr_type == DRR_BEGIN) {
2044 kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
2045 ra->next_rrd = NULL;
37f8a883 2046 return (SET_ERROR(EINVAL));
fcff0f35 2047 }
37f8a883
MA
2048
2049 /*
2050 * Note: checksum is of everything up to but not including the
2051 * checksum itself.
2052 */
2053 ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
2054 ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
fcff0f35 2055 receive_cksum(ra,
37f8a883 2056 offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
fcff0f35 2057 &ra->next_rrd->header);
37f8a883 2058
fcff0f35
PD
2059 cksum_orig = ra->next_rrd->header.drr_u.drr_checksum.drr_checksum;
2060 cksump = &ra->next_rrd->header.drr_u.drr_checksum.drr_checksum;
37f8a883
MA
2061
2062 if (ra->byteswap)
fcff0f35 2063 byteswap_record(&ra->next_rrd->header);
37f8a883
MA
2064
2065 if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) &&
fcff0f35
PD
2066 !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) {
2067 kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
2068 ra->next_rrd = NULL;
37f8a883 2069 return (SET_ERROR(ECKSUM));
fcff0f35 2070 }
37f8a883 2071
fcff0f35 2072 receive_cksum(ra, sizeof (cksum_orig), &cksum_orig);
37f8a883
MA
2073
2074 return (0);
2075}
2076
fcff0f35
PD
2077/*
2078 * Issue the prefetch reads for any necessary indirect blocks.
2079 *
2080 * We use the object ignore list to tell us whether or not to issue prefetches
2081 * for a given object. We do this for both correctness (in case the blocksize
2082 * of an object has changed) and performance (if the object doesn't exist, don't
2083 * needlessly try to issue prefetches). We also trim the list as we go through
2084 * the stream to prevent it from growing to an unbounded size.
2085 *
2086 * The object numbers within will always be in sorted order, and any write
2087 * records we see will also be in sorted order, but they're not sorted with
2088 * respect to each other (i.e. we can get several object records before
2089 * receiving each object's write records). As a result, once we've reached a
2090 * given object number, we can safely remove any reference to lower object
2091 * numbers in the ignore list. In practice, we receive up to 32 object records
2092 * before receiving write records, so the list can have up to 32 nodes in it.
2093 */
2094/* ARGSUSED */
2095static void
2096receive_read_prefetch(struct receive_arg *ra,
2097 uint64_t object, uint64_t offset, uint64_t length)
2098{
2099 struct receive_ign_obj_node *node = list_head(&ra->ignore_obj_list);
2100 while (node != NULL && node->object < object) {
2101 VERIFY3P(node, ==, list_remove_head(&ra->ignore_obj_list));
2102 kmem_free(node, sizeof (*node));
2103 node = list_head(&ra->ignore_obj_list);
2104 }
2105 if (node == NULL || node->object > object) {
2106 dmu_prefetch(ra->os, object, 1, offset, length,
2107 ZIO_PRIORITY_SYNC_READ);
2108 }
2109}
2110
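The trim in receive_read_prefetch() is safe only because object numbers arrive sorted: once a record for object N is seen, entries below N can never match again, so they are freed on the spot, which keeps the list at roughly the 32-node bound described above. A contained sketch of the same trim-then-test pass over a singly linked list (types invented for illustration):

	#include <stdint.h>
	#include <stdlib.h>

	struct ign_node {
		uint64_t object;
		struct ign_node *next;	/* list is kept sorted by object */
	};

	/*
	 * Drop stale entries below 'object'; return nonzero if a prefetch
	 * should be issued (object is not on the ignore list).
	 */
	static int
	should_prefetch(struct ign_node **headp, uint64_t object)
	{
		while (*headp != NULL && (*headp)->object < object) {
			struct ign_node *stale = *headp;
			*headp = stale->next;
			free(stale);
		}
		return (*headp == NULL || (*headp)->object > object);
	}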
2111/*
2112 * Read records off the stream, issuing any necessary prefetches.
2113 */
37f8a883 2114static int
fcff0f35 2115receive_read_record(struct receive_arg *ra)
37f8a883
MA
2116{
2117 int err;
2118
fcff0f35 2119 switch (ra->rrd->header.drr_type) {
37f8a883
MA
2120 case DRR_OBJECT:
2121 {
fcff0f35
PD
2122 struct drr_object *drro = &ra->rrd->header.drr_u.drr_object;
2123 uint32_t size = P2ROUNDUP(drro->drr_bonuslen, 8);
2124 void *buf = kmem_zalloc(size, KM_SLEEP);
2125 dmu_object_info_t doi;
2126 err = receive_read_payload_and_next_header(ra, size, buf);
2127 if (err != 0) {
2128 kmem_free(buf, size);
37f8a883 2129 return (err);
fcff0f35
PD
2130 }
2131 err = dmu_object_info(ra->os, drro->drr_object, &doi);
2132 /*
2133 * See receive_read_prefetch for an explanation why we're
2134 * storing this object in the ignore_obj_list.
2135 */
2136 if (err == ENOENT ||
2137 (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
2138 struct receive_ign_obj_node *node =
2139 kmem_zalloc(sizeof (*node),
2140 KM_SLEEP);
2141 node->object = drro->drr_object;
2142#ifdef ZFS_DEBUG
2143 {
2144 struct receive_ign_obj_node *last_object =
2145 list_tail(&ra->ignore_obj_list);
2146 uint64_t last_objnum = (last_object != NULL ?
2147 last_object->object : 0);
2148 ASSERT3U(node->object, >, last_objnum);
2149 }
2150#endif
2151 list_insert_tail(&ra->ignore_obj_list, node);
2152 err = 0;
2153 }
2154 return (err);
37f8a883
MA
2155 }
2156 case DRR_FREEOBJECTS:
2157 {
fcff0f35
PD
2158 err = receive_read_payload_and_next_header(ra, 0, NULL);
2159 return (err);
37f8a883
MA
2160 }
2161 case DRR_WRITE:
2162 {
fcff0f35 2163 struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write;
37f8a883
MA
2164 arc_buf_t *abuf = arc_loan_buf(dmu_objset_spa(ra->os),
2165 drrw->drr_length);
2166
fcff0f35 2167 err = receive_read_payload_and_next_header(ra,
37f8a883 2168 drrw->drr_length, abuf->b_data);
fcff0f35 2169 if (err != 0) {
37f8a883 2170 dmu_return_arcbuf(abuf);
fcff0f35
PD
2171 return (err);
2172 }
2173 ra->rrd->write_buf = abuf;
2174 receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset,
2175 drrw->drr_length);
37f8a883
MA
2176 return (err);
2177 }
2178 case DRR_WRITE_BYREF:
2179 {
fcff0f35
PD
2180 struct drr_write_byref *drrwb =
2181 &ra->rrd->header.drr_u.drr_write_byref;
2182 err = receive_read_payload_and_next_header(ra, 0, NULL);
2183 receive_read_prefetch(ra, drrwb->drr_object, drrwb->drr_offset,
2184 drrwb->drr_length);
2185 return (err);
37f8a883
MA
2186 }
2187 case DRR_WRITE_EMBEDDED:
2188 {
2189 struct drr_write_embedded *drrwe =
fcff0f35
PD
2190 &ra->rrd->header.drr_u.drr_write_embedded;
2191 uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8);
2192 void *buf = kmem_zalloc(size, KM_SLEEP);
2193
2194 err = receive_read_payload_and_next_header(ra, size, buf);
2195 if (err != 0) {
2196 kmem_free(buf, size);
37f8a883 2197 return (err);
fcff0f35
PD
2198 }
2199
2200 receive_read_prefetch(ra, drrwe->drr_object, drrwe->drr_offset,
2201 drrwe->drr_length);
2202 return (err);
37f8a883
MA
2203 }
2204 case DRR_FREE:
2205 {
fcff0f35
PD
2206 /*
2207 * It might be beneficial to prefetch indirect blocks here, but
2208 * we don't really have the data to decide for sure.
2209 */
2210 err = receive_read_payload_and_next_header(ra, 0, NULL);
2211 return (err);
37f8a883
MA
2212 }
2213 case DRR_END:
2214 {
fcff0f35 2215 struct drr_end *drre = &ra->rrd->header.drr_u.drr_end;
37f8a883
MA
2216 if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum))
2217 return (SET_ERROR(EINVAL));
2218 return (0);
2219 }
2220 case DRR_SPILL:
2221 {
fcff0f35
PD
2222 struct drr_spill *drrs = &ra->rrd->header.drr_u.drr_spill;
2223 void *buf = kmem_zalloc(drrs->drr_length, KM_SLEEP);
2224 err = receive_read_payload_and_next_header(ra, drrs->drr_length,
2225 buf);
37f8a883 2226 if (err != 0)
fcff0f35
PD
2227 kmem_free(buf, drrs->drr_length);
2228 return (err);
2229 }
2230 default:
2231 return (SET_ERROR(EINVAL));
2232 }
2233}
2234
2235/*
2236 * Commit the records to the pool.
2237 */
2238static int
2239receive_process_record(struct receive_writer_arg *rwa,
2240 struct receive_record_arg *rrd)
2241{
2242 int err;
2243
2244 switch (rrd->header.drr_type) {
2245 case DRR_OBJECT:
2246 {
2247 struct drr_object *drro = &rrd->header.drr_u.drr_object;
2248 err = receive_object(rwa, drro, rrd->payload);
2249 kmem_free(rrd->payload, rrd->payload_size);
2250 rrd->payload = NULL;
2251 return (err);
2252 }
2253 case DRR_FREEOBJECTS:
2254 {
2255 struct drr_freeobjects *drrfo =
2256 &rrd->header.drr_u.drr_freeobjects;
2257 return (receive_freeobjects(rwa, drrfo));
2258 }
2259 case DRR_WRITE:
2260 {
2261 struct drr_write *drrw = &rrd->header.drr_u.drr_write;
2262 err = receive_write(rwa, drrw, rrd->write_buf);
2263 /* if receive_write() is successful, it consumes the arc_buf */
2264 if (err != 0)
2265 dmu_return_arcbuf(rrd->write_buf);
2266 rrd->write_buf = NULL;
2267 rrd->payload = NULL;
2268 return (err);
2269 }
2270 case DRR_WRITE_BYREF:
2271 {
2272 struct drr_write_byref *drrwbr =
2273 &rrd->header.drr_u.drr_write_byref;
2274 return (receive_write_byref(rwa, drrwbr));
2275 }
2276 case DRR_WRITE_EMBEDDED:
2277 {
2278 struct drr_write_embedded *drrwe =
2279 &rrd->header.drr_u.drr_write_embedded;
2280 err = receive_write_embedded(rwa, drrwe, rrd->payload);
2281 kmem_free(rrd->payload, rrd->payload_size);
2282 rrd->payload = NULL;
2283 return (err);
2284 }
2285 case DRR_FREE:
2286 {
2287 struct drr_free *drrf = &rrd->header.drr_u.drr_free;
2288 return (receive_free(rwa, drrf));
2289 }
2290 case DRR_SPILL:
2291 {
2292 struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
2293 err = receive_spill(rwa, drrs, rrd->payload);
2294 kmem_free(rrd->payload, rrd->payload_size);
2295 rrd->payload = NULL;
2296 return (err);
37f8a883
MA
2297 }
2298 default:
2299 return (SET_ERROR(EINVAL));
2300 }
2301}
2302
34dc7c2f 2303/*
fcff0f35
PD
2304 * dmu_recv_stream's worker thread; pull records off the queue, and then call
 2305 * receive_process_record. When we're done, signal the main thread and exit.
2306 */
2307static void
2308receive_writer_thread(void *arg)
2309{
2310 struct receive_writer_arg *rwa = arg;
2311 struct receive_record_arg *rrd;
2312 for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker;
2313 rrd = bqueue_dequeue(&rwa->q)) {
2314 /*
2315 * If there's an error, the main thread will stop putting things
2316 * on the queue, but we need to clear everything in it before we
2317 * can exit.
2318 */
2319 if (rwa->err == 0) {
2320 rwa->err = receive_process_record(rwa, rrd);
2321 } else if (rrd->write_buf != NULL) {
2322 dmu_return_arcbuf(rrd->write_buf);
2323 rrd->write_buf = NULL;
2324 rrd->payload = NULL;
2325 } else if (rrd->payload != NULL) {
2326 kmem_free(rrd->payload, rrd->payload_size);
2327 rrd->payload = NULL;
2328 }
2329 kmem_free(rrd, sizeof (*rrd));
2330 }
2331 kmem_free(rrd, sizeof (*rrd));
2332 mutex_enter(&rwa->mutex);
2333 rwa->done = B_TRUE;
2334 cv_signal(&rwa->cv);
2335 mutex_exit(&rwa->mutex);
2336}
2337
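The hand-off between the main (reader) thread and receive_writer_thread() is a classic producer/consumer: records flow through a blocking queue, and a final eos_marker record tells the consumer to drain, free, and exit. A pthreads sketch of the same shutdown protocol, assuming an unbounded list-based queue for brevity (the kernel bqueue_t additionally blocks the producer once zfs_recv_queue_length bytes are in flight; q->mtx and q->cv are assumed already initialized):

	#include <pthread.h>
	#include <stdlib.h>

	struct rec {
		int eos_marker;		/* last record: tells the consumer to stop */
		struct rec *next;
	};

	struct queue {
		pthread_mutex_t mtx;
		pthread_cond_t cv;
		struct rec *head, *tail;
	};

	static void
	enqueue(struct queue *q, struct rec *r)
	{
		pthread_mutex_lock(&q->mtx);
		r->next = NULL;
		if (q->tail != NULL)
			q->tail->next = r;
		else
			q->head = r;
		q->tail = r;
		pthread_cond_signal(&q->cv);
		pthread_mutex_unlock(&q->mtx);
	}

	static struct rec *
	dequeue(struct queue *q)
	{
		pthread_mutex_lock(&q->mtx);
		while (q->head == NULL)
			pthread_cond_wait(&q->cv, &q->mtx);
		struct rec *r = q->head;
		q->head = r->next;
		if (q->head == NULL)
			q->tail = NULL;
		pthread_mutex_unlock(&q->mtx);
		return (r);
	}

	/*
	 * Consumer: mirrors receive_writer_thread(); frees every record it
	 * dequeues, including the eos marker, even after an error.
	 */
	static void *
	writer_thread(void *arg)
	{
		struct queue *q = arg;
		struct rec *r;

		while (!(r = dequeue(q))->eos_marker) {
			/* process r here; on error, keep draining and freeing */
			free(r);
		}
		free(r);	/* the eos marker itself */
		return (NULL);
	}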
2338/*
2339 * Read in the stream's records, one by one, and apply them to the pool. There
2340 * are two threads involved; the thread that calls this function will spin up a
2341 * worker thread, read the records off the stream one by one, and issue
2342 * prefetches for any necessary indirect blocks. It will then push the records
2343 * onto an internal blocking queue. The worker thread will pull the records off
2344 * the queue, and actually write the data into the DMU. This way, the worker
2345 * thread doesn't have to wait for reads to complete, since everything it needs
2346 * (the indirect blocks) will be prefetched.
2347 *
34dc7c2f
BB
2348 * NB: callers *must* call dmu_recv_end() if this succeeds.
2349 */
2350int
572e2857
BB
2351dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
2352 int cleanup_fd, uint64_t *action_handlep)
34dc7c2f 2353{
37f8a883 2354 int err = 0;
04bc4610
NB
2355 struct receive_arg *ra;
2356 struct receive_writer_arg *rwa;
428870ff 2357 int featureflags;
fcff0f35 2358 struct receive_ign_obj_node *n;
34dc7c2f 2359
04bc4610
NB
2360 ra = kmem_zalloc(sizeof (*ra), KM_SLEEP);
2361 rwa = kmem_zalloc(sizeof (*rwa), KM_SLEEP);
2362
2363 ra->byteswap = drc->drc_byteswap;
2364 ra->cksum = drc->drc_cksum;
2365 ra->vp = vp;
2366 ra->voff = *voffp;
2367 list_create(&ra->ignore_obj_list, sizeof (struct receive_ign_obj_node),
fcff0f35 2368 offsetof(struct receive_ign_obj_node, node));
34dc7c2f
BB
2369
2370 /* these were verified in dmu_recv_begin */
13fe0198 2371 ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
428870ff 2372 DMU_SUBSTREAM);
13fe0198 2373 ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);
34dc7c2f
BB
2374
2375 /*
2376 * Open the objset we are modifying.
2377 */
04bc4610 2378 VERIFY0(dmu_objset_from_ds(drc->drc_ds, &ra->os));
34dc7c2f 2379
d683ddbb 2380 ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT);
34dc7c2f 2381
428870ff
BB
2382 featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
2383
2384 /* if this stream is dedup'ed, set up the avl tree for guid mapping */
2385 if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
572e2857
BB
2386 minor_t minor;
2387
2388 if (cleanup_fd == -1) {
04bc4610 2389 ra->err = SET_ERROR(EBADF);
572e2857
BB
2390 goto out;
2391 }
04bc4610
NB
2392 ra->err = zfs_onexit_fd_hold(cleanup_fd, &minor);
2393 if (ra->err != 0) {
572e2857
BB
2394 cleanup_fd = -1;
2395 goto out;
2396 }
2397
2398 if (*action_handlep == 0) {
04bc4610 2399 rwa->guid_to_ds_map =
572e2857 2400 kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
04bc4610 2401 avl_create(rwa->guid_to_ds_map, guid_compare,
572e2857
BB
2402 sizeof (guid_map_entry_t),
2403 offsetof(guid_map_entry_t, avlnode));
37f8a883 2404 err = zfs_onexit_add_cb(minor,
04bc4610 2405 free_guid_map_onexit, rwa->guid_to_ds_map,
572e2857 2406 action_handlep);
04bc4610 2407 if (err != 0)
572e2857
BB
2408 goto out;
2409 } else {
37f8a883 2410 err = zfs_onexit_cb_data(minor, *action_handlep,
04bc4610
NB
2411 (void **)&rwa->guid_to_ds_map);
 2412 if (err != 0)
572e2857
BB
2413 goto out;
2414 }
8d35c149 2415
04bc4610 2416 drc->drc_guid_to_ds_map = rwa->guid_to_ds_map;
428870ff
BB
2417 }
2418
04bc4610 2419 err = receive_read_payload_and_next_header(ra, 0, NULL);
fcff0f35 2420 if (err)
37f8a883 2421 goto out;
37f8a883 2422
04bc4610 2423 (void) bqueue_init(&rwa->q, zfs_recv_queue_length,
fcff0f35 2424 offsetof(struct receive_record_arg, node));
04bc4610
NB
2425 cv_init(&rwa->cv, NULL, CV_DEFAULT, NULL);
2426 mutex_init(&rwa->mutex, NULL, MUTEX_DEFAULT, NULL);
2427 rwa->os = ra->os;
2428 rwa->byteswap = drc->drc_byteswap;
fcff0f35 2429
04bc4610 2430 (void) thread_create(NULL, 0, receive_writer_thread, rwa, 0, curproc,
fcff0f35
PD
2431 TS_RUN, minclsyspri);
2432 /*
04bc4610 2433 * We're reading rwa->err without locks, which is safe since we are the
fcff0f35
PD
2434 * only reader, and the worker thread is the only writer. It's ok if we
2435 * miss a write for an iteration or two of the loop, since the writer
2436 * thread will keep freeing records we send it until we send it an eos
2437 * marker.
2438 *
04bc4610 2439 * We can leave this loop in 3 ways: First, if rwa->err is
fcff0f35
PD
2440 * non-zero. In that case, the writer thread will free the rrd we just
2441 * pushed. Second, if we're interrupted; in that case, either it's the
04bc4610 2442 * first loop and ra->rrd was never allocated, or it's later, and ra->rrd
fcff0f35
PD
2443 * has been handed off to the writer thread who will free it. Finally,
2444 * if receive_read_record fails or we're at the end of the stream, then
04bc4610 2445 * we free ra->rrd and exit.
fcff0f35 2446 */
04bc4610 2447 while (rwa->err == 0) {
34dc7c2f 2448 if (issig(JUSTLOOKING) && issig(FORREAL)) {
37f8a883
MA
2449 err = SET_ERROR(EINTR);
2450 break;
34dc7c2f
BB
2451 }
2452
04bc4610
NB
2453 ASSERT3P(ra->rrd, ==, NULL);
2454 ra->rrd = ra->next_rrd;
2455 ra->next_rrd = NULL;
2456 /* Allocates and loads header into ra->next_rrd */
2457 err = receive_read_record(ra);
34dc7c2f 2458
04bc4610
NB
2459 if (ra->rrd->header.drr_type == DRR_END || err != 0) {
2460 kmem_free(ra->rrd, sizeof (*ra->rrd));
2461 ra->rrd = NULL;
428870ff 2462 break;
fcff0f35
PD
2463 }
2464
04bc4610
NB
2465 bqueue_enqueue(&rwa->q, ra->rrd,
2466 sizeof (struct receive_record_arg) + ra->rrd->payload_size);
2467 ra->rrd = NULL;
fcff0f35 2468 }
04bc4610
NB
2469 if (ra->next_rrd == NULL)
2470 ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP);
2471 ra->next_rrd->eos_marker = B_TRUE;
2472 bqueue_enqueue(&rwa->q, ra->next_rrd, 1);
fcff0f35 2473
04bc4610
NB
2474 mutex_enter(&rwa->mutex);
2475 while (!rwa->done) {
2476 cv_wait(&rwa->cv, &rwa->mutex);
34dc7c2f 2477 }
04bc4610 2478 mutex_exit(&rwa->mutex);
fcff0f35 2479
04bc4610
NB
2480 cv_destroy(&rwa->cv);
2481 mutex_destroy(&rwa->mutex);
2482 bqueue_destroy(&rwa->q);
fcff0f35 2483 if (err == 0)
04bc4610 2484 err = rwa->err;
34dc7c2f
BB
2485
2486out:
572e2857
BB
2487 if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
2488 zfs_onexit_fd_rele(cleanup_fd);
2489
37f8a883 2490 if (err != 0) {
34dc7c2f 2491 /*
45d1cae3
BB
2492 * destroy what we created, so we don't leave it in the
2493 * inconsistent restoring state.
34dc7c2f 2494 */
13fe0198 2495 dmu_recv_cleanup_ds(drc);
34dc7c2f
BB
2496 }
2497
04bc4610 2498 *voffp = ra->voff;
fcff0f35 2499
04bc4610
NB
2500 for (n = list_remove_head(&ra->ignore_obj_list); n != NULL;
2501 n = list_remove_head(&ra->ignore_obj_list)) {
fcff0f35
PD
2502 kmem_free(n, sizeof (*n));
2503 }
04bc4610
NB
2504 list_destroy(&ra->ignore_obj_list);
2505 kmem_free(ra, sizeof (*ra));
2506 kmem_free(rwa, sizeof (*rwa));
37f8a883 2507 return (err);
34dc7c2f
BB
2508}
2509
34dc7c2f 2510static int
13fe0198 2511dmu_recv_end_check(void *arg, dmu_tx_t *tx)
34dc7c2f 2512{
13fe0198
MA
2513 dmu_recv_cookie_t *drc = arg;
2514 dsl_pool_t *dp = dmu_tx_pool(tx);
2515 int error;
2516
2517 ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);
2518
2519 if (!drc->drc_newfs) {
2520 dsl_dataset_t *origin_head;
2521
2522 error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
2523 if (error != 0)
2524 return (error);
19580676
MA
2525 if (drc->drc_force) {
2526 /*
2527 * We will destroy any snapshots in tofs (i.e. before
2528 * origin_head) that are after the origin (which is
2529 * the snap before drc_ds, because drc_ds can not
2530 * have any snaps of its own).
2531 */
d683ddbb
JG
2532 uint64_t obj;
2533
2534 obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
2535 while (obj !=
2536 dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
19580676
MA
2537 dsl_dataset_t *snap;
2538 error = dsl_dataset_hold_obj(dp, obj, FTAG,
2539 &snap);
2540 if (error != 0)
b6640117 2541 break;
19580676
MA
2542 if (snap->ds_dir != origin_head->ds_dir)
2543 error = SET_ERROR(EINVAL);
2544 if (error == 0) {
2545 error = dsl_destroy_snapshot_check_impl(
2546 snap, B_FALSE);
2547 }
d683ddbb 2548 obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
19580676
MA
2549 dsl_dataset_rele(snap, FTAG);
2550 if (error != 0)
b6640117
AG
2551 break;
2552 }
2553 if (error != 0) {
2554 dsl_dataset_rele(origin_head, FTAG);
2555 return (error);
19580676
MA
2556 }
2557 }
13fe0198 2558 error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
831baf06 2559 origin_head, drc->drc_force, drc->drc_owner, tx);
13fe0198
MA
2560 if (error != 0) {
2561 dsl_dataset_rele(origin_head, FTAG);
2562 return (error);
2563 }
2564 error = dsl_dataset_snapshot_check_impl(origin_head,
788eb90c 2565 drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
13fe0198
MA
2566 dsl_dataset_rele(origin_head, FTAG);
2567 if (error != 0)
2568 return (error);
34dc7c2f 2569
13fe0198
MA
2570 error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
2571 } else {
2572 error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
788eb90c 2573 drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
13fe0198
MA
2574 }
2575 return (error);
34dc7c2f
BB
2576}
2577
2578static void
13fe0198 2579dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
34dc7c2f 2580{
13fe0198
MA
2581 dmu_recv_cookie_t *drc = arg;
2582 dsl_pool_t *dp = dmu_tx_pool(tx);
2583
2584 spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
2585 tx, "snap=%s", drc->drc_tosnap);
2586
2587 if (!drc->drc_newfs) {
2588 dsl_dataset_t *origin_head;
2589
2590 VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
2591 &origin_head));
19580676
MA
2592
2593 if (drc->drc_force) {
2594 /*
2595 * Destroy any snapshots of drc_tofs (origin_head)
2596 * after the origin (the snap before drc_ds).
2597 */
d683ddbb
JG
2598 uint64_t obj;
2599
2600 obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
2601 while (obj !=
2602 dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
19580676
MA
2603 dsl_dataset_t *snap;
2604 VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG,
2605 &snap));
2606 ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir);
d683ddbb 2607 obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
19580676
MA
2608 dsl_destroy_snapshot_sync_impl(snap,
2609 B_FALSE, tx);
2610 dsl_dataset_rele(snap, FTAG);
2611 }
2612 }
2613 VERIFY3P(drc->drc_ds->ds_prev, ==,
2614 origin_head->ds_prev);
2615
13fe0198
MA
2616 dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
2617 origin_head, tx);
2618 dsl_dataset_snapshot_sync_impl(origin_head,
2619 drc->drc_tosnap, tx);
2620
2621 /* set snapshot's creation time and guid */
2622 dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
d683ddbb 2623 dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time =
13fe0198 2624 drc->drc_drrb->drr_creation_time;
d683ddbb 2625 dsl_dataset_phys(origin_head->ds_prev)->ds_guid =
13fe0198 2626 drc->drc_drrb->drr_toguid;
d683ddbb 2627 dsl_dataset_phys(origin_head->ds_prev)->ds_flags &=
13fe0198
MA
2628 ~DS_FLAG_INCONSISTENT;
2629
2630 dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
d683ddbb
JG
2631 dsl_dataset_phys(origin_head)->ds_flags &=
2632 ~DS_FLAG_INCONSISTENT;
13fe0198
MA
2633
2634 dsl_dataset_rele(origin_head, FTAG);
2635 dsl_destroy_head_sync_impl(drc->drc_ds, tx);
831baf06
KW
2636
2637 if (drc->drc_owner != NULL)
2638 VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner);
13fe0198
MA
2639 } else {
2640 dsl_dataset_t *ds = drc->drc_ds;
34dc7c2f 2641
13fe0198 2642 dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx);
34dc7c2f 2643
13fe0198
MA
2644 /* set snapshot's creation time and guid */
2645 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
d683ddbb 2646 dsl_dataset_phys(ds->ds_prev)->ds_creation_time =
13fe0198 2647 drc->drc_drrb->drr_creation_time;
d683ddbb
JG
2648 dsl_dataset_phys(ds->ds_prev)->ds_guid =
2649 drc->drc_drrb->drr_toguid;
2650 dsl_dataset_phys(ds->ds_prev)->ds_flags &=
2651 ~DS_FLAG_INCONSISTENT;
34dc7c2f 2652
13fe0198 2653 dmu_buf_will_dirty(ds->ds_dbuf, tx);
d683ddbb 2654 dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
13fe0198 2655 }
d683ddbb 2656 drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
a0bd735a 2657 zvol_create_minors(dp->dp_spa, drc->drc_tofs, B_TRUE);
13fe0198
MA
2658 /*
2659 * Release the hold from dmu_recv_begin. This must be done before
2660 * we return to open context, so that when we free the dataset's dnode,
2661 * we can evict its bonus buffer.
2662 */
2663 dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
2664 drc->drc_ds = NULL;
34dc7c2f
BB
2665}
2666
8d35c149 2667static int
13fe0198 2668add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj)
8d35c149 2669{
13fe0198 2670 dsl_pool_t *dp;
8d35c149
AS
2671 dsl_dataset_t *snapds;
2672 guid_map_entry_t *gmep;
2673 int err;
2674
2675 ASSERT(guid_map != NULL);
2676
13fe0198
MA
2677 err = dsl_pool_hold(name, FTAG, &dp);
2678 if (err != 0)
2679 return (err);
7ec09286
MA
2680 gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP);
2681 err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds);
8d35c149 2682 if (err == 0) {
d683ddbb 2683 gmep->guid = dsl_dataset_phys(snapds)->ds_guid;
8d35c149
AS
2684 gmep->gme_ds = snapds;
2685 avl_add(guid_map, gmep);
13fe0198 2686 dsl_dataset_long_hold(snapds, gmep);
7ec09286
MA
2687 } else {
2688 kmem_free(gmep, sizeof (*gmep));
8d35c149
AS
2689 }
2690
13fe0198 2691 dsl_pool_rele(dp, FTAG);
8d35c149
AS
2692 return (err);
2693}
2694
13fe0198
MA
2695static int dmu_recv_end_modified_blocks = 3;
2696
428870ff
BB
2697static int
2698dmu_recv_existing_end(dmu_recv_cookie_t *drc)
34dc7c2f 2699{
13fe0198 2700 int error;
34dc7c2f 2701
13fe0198
MA
2702#ifdef _KERNEL
2703 char *name;
34dc7c2f 2704
13fe0198
MA
2705 /*
2706 * We will be destroying the ds; make sure its origin is unmounted if
2707 * necessary.
2708 */
2709 name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
2710 dsl_dataset_name(drc->drc_ds, name);
2711 zfs_destroy_unmount_origin(name);
2712 kmem_free(name, MAXNAMELEN);
2713#endif
34dc7c2f 2714
13fe0198
MA
2715 error = dsl_sync_task(drc->drc_tofs,
2716 dmu_recv_end_check, dmu_recv_end_sync, drc,
3d45fdd6 2717 dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL);
13fe0198
MA
2718
2719 if (error != 0)
2720 dmu_recv_cleanup_ds(drc);
2721 return (error);
34dc7c2f 2722}
428870ff
BB
2723
2724static int
2725dmu_recv_new_end(dmu_recv_cookie_t *drc)
2726{
13fe0198 2727 int error;
428870ff 2728
13fe0198
MA
2729 error = dsl_sync_task(drc->drc_tofs,
2730 dmu_recv_end_check, dmu_recv_end_sync, drc,
3d45fdd6 2731 dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL);
428870ff 2732
13fe0198
MA
2733 if (error != 0) {
2734 dmu_recv_cleanup_ds(drc);
2735 } else if (drc->drc_guid_to_ds_map != NULL) {
2736 (void) add_ds_to_guidmap(drc->drc_tofs,
2737 drc->drc_guid_to_ds_map,
2738 drc->drc_newsnapobj);
428870ff 2739 }
13fe0198 2740 return (error);
428870ff
BB
2741}
2742
2743int
831baf06 2744dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
428870ff 2745{
831baf06
KW
2746 drc->drc_owner = owner;
2747
13fe0198 2748 if (drc->drc_newfs)
428870ff 2749 return (dmu_recv_new_end(drc));
13fe0198
MA
2750 else
2751 return (dmu_recv_existing_end(drc));
428870ff 2752}
ea97f8ce
MA
2753
2754/*
2755 * Return TRUE if this objset is currently being received into.
2756 */
2757boolean_t
2758dmu_objset_is_receiving(objset_t *os)
2759{
2760 return (os->os_dsl_dataset != NULL &&
2761 os->os_dsl_dataset->ds_owner == dmu_recv_tag);
2762}
fd8febbd
TF
2763
2764#if defined(_KERNEL)
2765module_param(zfs_send_corrupt_data, int, 0644);
2766MODULE_PARM_DESC(zfs_send_corrupt_data, "Allow sending corrupt data");
2767#endif