/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011 by Delphix. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
 * Copyright (c) 2016 Actifio, Inc. All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/spa_impl.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_znode.h>
#include <zfs_fletcher.h>
#include <sys/avl.h>
#include <sys/ddt.h>
#include <sys/zfs_onexit.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
#include <sys/blkptr.h>
#include <sys/dsl_bookmark.h>
#include <sys/zfeature.h>
#include <sys/zvol.h>

/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
int zfs_send_corrupt_data = B_FALSE;

static char *dmu_recv_tag = "dmu_recv_tag";
static const char *recv_clone_name = "%recv";

typedef struct dump_bytes_io {
	dmu_sendarg_t	*dbi_dsp;
	void		*dbi_buf;
	int		dbi_len;
} dump_bytes_io_t;

static void
dump_bytes_cb(void *arg)
{
	dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg;
	dmu_sendarg_t *dsp = dbi->dbi_dsp;
	dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset;
	ssize_t resid; /* have to get resid to get detailed errno */
	ASSERT0(dbi->dbi_len % 8);

	fletcher_4_incremental_native(dbi->dbi_buf, dbi->dbi_len, &dsp->dsa_zc);
	dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp,
	    (caddr_t)dbi->dbi_buf, dbi->dbi_len,
	    0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);

	mutex_enter(&ds->ds_sendstream_lock);
	*dsp->dsa_off += dbi->dbi_len;
	mutex_exit(&ds->ds_sendstream_lock);
}

static int
dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
{
	dump_bytes_io_t dbi;

	dbi.dbi_dsp = dsp;
	dbi.dbi_buf = buf;
	dbi.dbi_len = len;

#if defined(HAVE_LARGE_STACKS)
	dump_bytes_cb(&dbi);
#else
	/*
	 * The vn_rdwr() call is performed in a taskq to ensure that there is
	 * always enough stack space to write safely to the target filesystem.
	 * The ZIO_TYPE_FREE threads are used because there can be a lot of
	 * them and they are used in vdev_file.c for a similar purpose.
	 */
	spa_taskq_dispatch_sync(dmu_objset_spa(dsp->dsa_os), ZIO_TYPE_FREE,
	    ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP);
#endif /* HAVE_LARGE_STACKS */

	return (dsp->dsa_err);
}

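/*
 * Note (added for clarity): every payload funneled through dump_bytes()
 * must be a multiple of 8 bytes -- see the ASSERT0(dbi->dbi_len % 8) in
 * dump_bytes_cb() above -- which is why callers below pad variable-length
 * data such as bonus buffers and embedded-BP payloads with
 * P2ROUNDUP(len, 8).
 */
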
static int
dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
    uint64_t length)
{
	struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);

	/*
	 * When we receive a free record, dbuf_free_range() assumes
	 * that the receiving system doesn't have any dbufs in the range
	 * being freed. This is always true because there is a one-record
	 * constraint: we only send one WRITE record for any given
	 * object+offset. We know that the one-record constraint is
	 * true because we always send data in increasing order by
	 * object,offset.
	 *
	 * If the increasing-order constraint ever changes, we should find
	 * another way to assert that the one-record constraint is still
	 * satisfied.
	 */
	ASSERT(object > dsp->dsa_last_data_object ||
	    (object == dsp->dsa_last_data_object &&
	    offset > dsp->dsa_last_data_offset));

	/*
	 * If we are doing a non-incremental send, then there can't
	 * be any data in the dataset we're receiving into. Therefore
	 * a free record would simply be a no-op. Save space by not
	 * sending it to begin with.
	 */
	if (!dsp->dsa_incremental)
		return (0);

	if (length != -1ULL && offset + length < offset)
		length = -1ULL;

	/*
	 * If there is a pending op, but it's not PENDING_FREE, push it out,
	 * since free block aggregation can only be done for blocks of the
	 * same type (i.e., DRR_FREE records can only be aggregated with
	 * other DRR_FREE records; DRR_FREEOBJECTS records can only be
	 * aggregated with other DRR_FREEOBJECTS records).
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	if (dsp->dsa_pending_op == PENDING_FREE) {
		/*
		 * There should never be a PENDING_FREE if length is -1
		 * (because dump_dnode is the only place where this
		 * function is called with a -1, and only after flushing
		 * any pending record).
		 */
		ASSERT(length != -1ULL);
		/*
		 * Check to see whether this free block can be aggregated
		 * with the pending one.
		 */
		if (drrf->drr_object == object && drrf->drr_offset +
		    drrf->drr_length == offset) {
			drrf->drr_length += length;
			return (0);
		} else {
			/* not a continuation.  Push out pending record */
			if (dump_bytes(dsp, dsp->dsa_drr,
			    sizeof (dmu_replay_record_t)) != 0)
				return (SET_ERROR(EINTR));
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}
	/* create a FREE record and make it pending */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREE;
	drrf->drr_object = object;
	drrf->drr_offset = offset;
	drrf->drr_length = length;
	drrf->drr_toguid = dsp->dsa_toguid;
	if (length == -1ULL) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
	} else {
		dsp->dsa_pending_op = PENDING_FREE;
	}

	return (0);
}

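/*
 * Illustrative example of the aggregation above (not from the original
 * source): three consecutive calls dump_free(dsp, 5, 0, 0x20000),
 * dump_free(dsp, 5, 0x20000, 0x20000) and dump_free(dsp, 5, 0x40000,
 * 0x20000) leave a single pending DRR_FREE record covering object 5,
 * offset 0, length 0x60000; it is only pushed to the stream when a
 * non-contiguous or differently-typed record arrives.
 */
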
static int
dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type,
    uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
{
	struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);

	/*
	 * We send data in increasing object, offset order.
	 * See comment in dump_free() for details.
	 */
	ASSERT(object > dsp->dsa_last_data_object ||
	    (object == dsp->dsa_last_data_object &&
	    offset > dsp->dsa_last_data_offset));
	dsp->dsa_last_data_object = object;
	dsp->dsa_last_data_offset = offset + blksz - 1;

	/*
	 * If there is any kind of pending aggregation (currently either
	 * a grouping of free objects or free blocks), push it out to
	 * the stream, since aggregation can't be done across operations
	 * of different types.
	 */
	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}
	/* write a DATA record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_WRITE;
	drrw->drr_object = object;
	drrw->drr_type = type;
	drrw->drr_offset = offset;
	drrw->drr_length = blksz;
	drrw->drr_toguid = dsp->dsa_toguid;
	if (bp == NULL || BP_IS_EMBEDDED(bp)) {
		/*
		 * There's no pre-computed checksum for partial-block
		 * writes or embedded BP's, so (like
		 * fletcher4-checksummed blocks) userland will have to
		 * compute a dedup-capable checksum itself.
		 */
		drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
	} else {
		drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
		if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
			drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
		DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
		DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
		DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
		drrw->drr_key.ddk_cksum = bp->blk_cksum;
	}

	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (SET_ERROR(EINTR));
	if (dump_bytes(dsp, data, blksz) != 0)
		return (SET_ERROR(EINTR));
	return (0);
}

static int
dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
    int blksz, const blkptr_t *bp)
{
	char buf[BPE_PAYLOAD_SIZE];
	struct drr_write_embedded *drrw =
	    &(dsp->dsa_drr->drr_u.drr_write_embedded);

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		dsp->dsa_pending_op = PENDING_NONE;
	}

	ASSERT(BP_IS_EMBEDDED(bp));

	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED;
	drrw->drr_object = object;
	drrw->drr_offset = offset;
	drrw->drr_length = blksz;
	drrw->drr_toguid = dsp->dsa_toguid;
	drrw->drr_compression = BP_GET_COMPRESS(bp);
	drrw->drr_etype = BPE_GET_ETYPE(bp);
	drrw->drr_lsize = BPE_GET_LSIZE(bp);
	drrw->drr_psize = BPE_GET_PSIZE(bp);

	decode_embedded_bp_compressed(bp, buf);

	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (EINTR);
	if (dump_bytes(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
		return (EINTR);
	return (0);
}

static int
dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
{
	struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write a SPILL record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_SPILL;
	drrs->drr_object = object;
	drrs->drr_length = blksz;
	drrs->drr_toguid = dsp->dsa_toguid;

	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)))
		return (SET_ERROR(EINTR));
	if (dump_bytes(dsp, data, blksz))
		return (SET_ERROR(EINTR));
	return (0);
}

static int
dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
{
	struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);

	/* See comment in dump_free(). */
	if (!dsp->dsa_incremental)
		return (0);

	/*
	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
	 * push it out, since free block aggregation can only be done for
	 * blocks of the same type (i.e., DRR_FREE records can only be
	 * aggregated with other DRR_FREE records; DRR_FREEOBJECTS records
	 * can only be aggregated with other DRR_FREEOBJECTS records).
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREEOBJECTS) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}
	if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) {
		/*
		 * See whether this free object array can be aggregated
		 * with the pending one.
		 */
		if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
			drrfo->drr_numobjs += numobjs;
			return (0);
		} else {
			/* can't be aggregated.  Push out pending record */
			if (dump_bytes(dsp, dsp->dsa_drr,
			    sizeof (dmu_replay_record_t)) != 0)
				return (SET_ERROR(EINTR));
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}

	/* write a FREEOBJECTS record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREEOBJECTS;
	drrfo->drr_firstobj = firstobj;
	drrfo->drr_numobjs = numobjs;
	drrfo->drr_toguid = dsp->dsa_toguid;

	dsp->dsa_pending_op = PENDING_FREEOBJECTS;

	return (0);
}

static int
dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
{
	struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);

	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
		return (dump_freeobjects(dsp, object, 1));

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write an OBJECT record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_OBJECT;
	drro->drr_object = object;
	drro->drr_type = dnp->dn_type;
	drro->drr_bonustype = dnp->dn_bonustype;
	drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	drro->drr_bonuslen = dnp->dn_bonuslen;
	drro->drr_checksumtype = dnp->dn_checksum;
	drro->drr_compress = dnp->dn_compress;
	drro->drr_toguid = dsp->dsa_toguid;

	if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
	    drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
		drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;

	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (SET_ERROR(EINTR));

	if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
		return (SET_ERROR(EINTR));

	/* Free anything past the end of the file. */
	if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0)
		return (SET_ERROR(EINTR));
	if (dsp->dsa_err != 0)
		return (SET_ERROR(EINTR));
	return (0);
}

static boolean_t
backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
{
	if (!BP_IS_EMBEDDED(bp))
		return (B_FALSE);

	/*
	 * Compression function must be legacy, or explicitly enabled.
	 */
	if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
	    !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4)))
		return (B_FALSE);

	/*
	 * Embed type must be explicitly enabled.
	 */
	switch (BPE_GET_ETYPE(bp)) {
	case BP_EMBEDDED_TYPE_DATA:
		if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
			return (B_TRUE);
		break;
	default:
		return (B_FALSE);
	}
	return (B_FALSE);
}

#define	BP_SPAN(dnp, level) \
	(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
	(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))

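/*
 * Worked example (illustrative): with 128K data blocks
 * (dn_datablkszsec == 256) and 16K indirect blocks (dn_indblkshift == 14),
 * each indirect level fans out by 2^(14 - SPA_BLKPTRSHIFT) = 128 block
 * pointers, so BP_SPAN() is 128K at level 0, 16M at level 1, and 2G at
 * level 2.
 */
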
/* ARGSUSED */
static int
backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	dmu_sendarg_t *dsp = arg;
	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
	int err = 0;

	if (issig(JUSTLOOKING) && issig(FORREAL))
		return (SET_ERROR(EINTR));

	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
	    DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
		return (0);
	} else if (zb->zb_level == ZB_ZIL_LEVEL) {
		/*
		 * If we are sending a non-snapshot (which is allowed on
		 * read-only pools), it may have a ZIL, which must be ignored.
		 */
		return (0);
	} else if (BP_IS_HOLE(bp) &&
	    zb->zb_object == DMU_META_DNODE_OBJECT) {
		uint64_t span = BP_SPAN(dnp, zb->zb_level);
		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
		err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT);
	} else if (BP_IS_HOLE(bp)) {
		uint64_t span = BP_SPAN(dnp, zb->zb_level);
		err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span);
	} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
		return (0);
	} else if (type == DMU_OT_DNODE) {
		dnode_phys_t *blk;
		int i;
		int blksz = BP_GET_LSIZE(bp);
		arc_flags_t aflags = ARC_FLAG_WAIT;
		arc_buf_t *abuf;

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
		    &aflags, zb) != 0)
			return (SET_ERROR(EIO));

		blk = abuf->b_data;
		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
			uint64_t dnobj = (zb->zb_blkid <<
			    (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
			err = dump_dnode(dsp, dnobj, blk+i);
			if (err != 0)
				break;
		}
		(void) arc_buf_remove_ref(abuf, &abuf);
	} else if (type == DMU_OT_SA) {
		arc_flags_t aflags = ARC_FLAG_WAIT;
		arc_buf_t *abuf;
		int blksz = BP_GET_LSIZE(bp);

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
		    &aflags, zb) != 0)
			return (SET_ERROR(EIO));

		err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data);
		(void) arc_buf_remove_ref(abuf, &abuf);
	} else if (backup_do_embed(dsp, bp)) {
		/* it's an embedded level-0 block of a regular object */
		int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
		err = dump_write_embedded(dsp, zb->zb_object,
		    zb->zb_blkid * blksz, blksz, bp);
	} else { /* it's a level-0 block of a regular object */
		uint64_t offset;
		arc_flags_t aflags = ARC_FLAG_WAIT;
		arc_buf_t *abuf;
		int blksz = BP_GET_LSIZE(bp);

		ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT);
		ASSERT0(zb->zb_level);
		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
		    &aflags, zb) != 0) {
			if (zfs_send_corrupt_data) {
				uint64_t *ptr;
				/* Send a block filled with 0x"zfs badd bloc" */
				abuf = arc_buf_alloc(spa, blksz, &abuf,
				    ARC_BUFC_DATA);
				for (ptr = abuf->b_data;
				    (char *)ptr < (char *)abuf->b_data + blksz;
				    ptr++)
					*ptr = 0x2f5baddb10cULL;
			} else {
				return (SET_ERROR(EIO));
			}
		}

		offset = zb->zb_blkid * blksz;

		if (!(dsp->dsa_featureflags &
		    DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
		    blksz > SPA_OLD_MAXBLOCKSIZE) {
			char *buf = abuf->b_data;
			while (blksz > 0 && err == 0) {
				int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
				err = dump_write(dsp, type, zb->zb_object,
				    offset, n, NULL, buf);
				offset += n;
				buf += n;
				blksz -= n;
			}
		} else {
			err = dump_write(dsp, type, zb->zb_object,
			    offset, blksz, bp, abuf->b_data);
		}
		(void) arc_buf_remove_ref(abuf, &abuf);
	}

	ASSERT(err == 0 || err == EINTR);
	return (err);
}

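/*
 * Note (added for clarity): in the split-write path above, each
 * SPA_OLD_MAXBLOCKSIZE chunk is passed to dump_write() with bp == NULL,
 * so those records carry ZIO_CHECKSUM_OFF and userland must compute any
 * dedup-capable checksum itself.
 */
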
/*
 * Releases dp using the specified tag.
 */
static int
dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds,
    zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok,
    boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off)
{
	objset_t *os;
	dmu_replay_record_t *drr;
	dmu_sendarg_t *dsp;
	int err;
	uint64_t fromtxg = 0;
	uint64_t featureflags = 0;

	err = dmu_objset_from_ds(ds, &os);
	if (err != 0) {
		dsl_pool_rele(dp, tag);
		return (err);
	}

	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
	drr->drr_type = DRR_BEGIN;
	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
	DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
	    DMU_SUBSTREAM);

#ifdef _KERNEL
	if (dmu_objset_type(os) == DMU_OST_ZFS) {
		uint64_t version;
		if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) {
			kmem_free(drr, sizeof (dmu_replay_record_t));
			dsl_pool_rele(dp, tag);
			return (SET_ERROR(EINVAL));
		}
		if (version >= ZPL_VERSION_SA) {
			featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
		}
	}
#endif

	if (large_block_ok && ds->ds_large_blocks)
		featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
	if (embedok &&
	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
		featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
		if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
			featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4;
	} else {
		embedok = B_FALSE;
	}

	DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
	    featureflags);

	drr->drr_u.drr_begin.drr_creation_time =
	    dsl_dataset_phys(ds)->ds_creation_time;
	drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
	if (is_clone)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
	drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(ds)->ds_guid;
	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;

	if (fromzb != NULL) {
		drr->drr_u.drr_begin.drr_fromguid = fromzb->zbm_guid;
		fromtxg = fromzb->zbm_creation_txg;
	}
	dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
	if (!ds->ds_is_snapshot) {
		(void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--",
		    sizeof (drr->drr_u.drr_begin.drr_toname));
	}

	dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);

	dsp->dsa_drr = drr;
	dsp->dsa_vp = vp;
	dsp->dsa_outfd = outfd;
	dsp->dsa_proc = curproc;
	dsp->dsa_os = os;
	dsp->dsa_off = off;
	dsp->dsa_toguid = dsl_dataset_phys(ds)->ds_guid;
	ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0);
	dsp->dsa_pending_op = PENDING_NONE;
	dsp->dsa_incremental = (fromzb != NULL);
	dsp->dsa_featureflags = featureflags;

	mutex_enter(&ds->ds_sendstream_lock);
	list_insert_head(&ds->ds_sendstreams, dsp);
	mutex_exit(&ds->ds_sendstream_lock);

	dsl_dataset_long_hold(ds, FTAG);
	dsl_pool_rele(dp, tag);

	if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
		err = dsp->dsa_err;
		goto out;
	}

	err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
	    backup_cb, dsp);

	if (dsp->dsa_pending_op != PENDING_NONE)
		if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0)
			err = SET_ERROR(EINTR);

	if (err != 0) {
		if (err == EINTR && dsp->dsa_err != 0)
			err = dsp->dsa_err;
		goto out;
	}

	bzero(drr, sizeof (dmu_replay_record_t));
	drr->drr_type = DRR_END;
	drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
	drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;

	if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
		err = dsp->dsa_err;
		goto out;
	}

out:
	mutex_enter(&ds->ds_sendstream_lock);
	list_remove(&ds->ds_sendstreams, dsp);
	mutex_exit(&ds->ds_sendstream_lock);

	kmem_free(drr, sizeof (dmu_replay_record_t));
	kmem_free(dsp, sizeof (dmu_sendarg_t));

	dsl_dataset_long_rele(ds, FTAG);

	return (err);
}

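/*
 * Taken together, dmu_send_impl() frames the stream as: one DRR_BEGIN
 * record, the records emitted by backup_cb() during the dataset traversal
 * (plus any final pending aggregate record), and one DRR_END record
 * carrying the fletcher-4 checksum accumulated by dump_bytes().
 */
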
int
dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
    boolean_t embedok, boolean_t large_block_ok,
    int outfd, vnode_t *vp, offset_t *off)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	dsl_dataset_t *fromds = NULL;
	int err;

	err = dsl_pool_hold(pool, FTAG, &dp);
	if (err != 0)
		return (err);

	err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds);
	if (err != 0) {
		dsl_pool_rele(dp, FTAG);
		return (err);
	}

	if (fromsnap != 0) {
		zfs_bookmark_phys_t zb;
		boolean_t is_clone;

		err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds);
		if (err != 0) {
			dsl_dataset_rele(ds, FTAG);
			dsl_pool_rele(dp, FTAG);
			return (err);
		}
		if (!dsl_dataset_is_before(ds, fromds, 0))
			err = SET_ERROR(EXDEV);
		zb.zbm_creation_time =
		    dsl_dataset_phys(fromds)->ds_creation_time;
		zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg;
		zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
		is_clone = (fromds->ds_dir != ds->ds_dir);
		dsl_dataset_rele(fromds, FTAG);
		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
		    embedok, large_block_ok, outfd, vp, off);
	} else {
		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
		    embedok, large_block_ok, outfd, vp, off);
	}
	dsl_dataset_rele(ds, FTAG);
	return (err);
}

int
dmu_send(const char *tosnap, const char *fromsnap,
    boolean_t embedok, boolean_t large_block_ok,
    int outfd, vnode_t *vp, offset_t *off)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	int err;
	boolean_t owned = B_FALSE;

	if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL)
		return (SET_ERROR(EINVAL));

	err = dsl_pool_hold(tosnap, FTAG, &dp);
	if (err != 0)
		return (err);

	if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) {
		/*
		 * We are sending a filesystem or volume.  Ensure
		 * that it doesn't change by owning the dataset.
		 */
		err = dsl_dataset_own(dp, tosnap, FTAG, &ds);
		owned = B_TRUE;
	} else {
		err = dsl_dataset_hold(dp, tosnap, FTAG, &ds);
	}
	if (err != 0) {
		dsl_pool_rele(dp, FTAG);
		return (err);
	}

	if (fromsnap != NULL) {
		zfs_bookmark_phys_t zb;
		boolean_t is_clone = B_FALSE;
		int fsnamelen = strchr(tosnap, '@') - tosnap;

		/*
		 * If the fromsnap is in a different filesystem, then
		 * mark the send stream as a clone.
		 */
		if (strncmp(tosnap, fromsnap, fsnamelen) != 0 ||
		    (fromsnap[fsnamelen] != '@' &&
		    fromsnap[fsnamelen] != '#')) {
			is_clone = B_TRUE;
		}

		if (strchr(fromsnap, '@')) {
			dsl_dataset_t *fromds;
			err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds);
			if (err == 0) {
				if (!dsl_dataset_is_before(ds, fromds, 0))
					err = SET_ERROR(EXDEV);
				zb.zbm_creation_time =
				    dsl_dataset_phys(fromds)->ds_creation_time;
				zb.zbm_creation_txg =
				    dsl_dataset_phys(fromds)->ds_creation_txg;
				zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
				is_clone = (ds->ds_dir != fromds->ds_dir);
				dsl_dataset_rele(fromds, FTAG);
			}
		} else {
			err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb);
		}
		if (err != 0) {
			dsl_dataset_rele(ds, FTAG);
			dsl_pool_rele(dp, FTAG);
			return (err);
		}
		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
		    embedok, large_block_ok, outfd, vp, off);
	} else {
		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
		    embedok, large_block_ok, outfd, vp, off);
	}
	if (owned)
		dsl_dataset_disown(ds, FTAG);
	else
		dsl_dataset_rele(ds, FTAG);
	return (err);
}

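/*
 * Note (added for clarity): dmu_send() accepts tosnap either as a
 * snapshot name ("fs@snap") or, on writeable pools, as a filesystem or
 * volume name, in which case the dataset is owned for the duration of the
 * send.  fromsnap, if given, must contain '@' (a snapshot) or '#' (a
 * bookmark, resolved via dsl_bookmark_lookup()).
 */
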
static int
dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t size,
    uint64_t *sizep)
{
	int err;
	/*
	 * Assume that space (both on-disk and in-stream) is dominated by
	 * data.  We will adjust for indirect blocks and the copies property,
	 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
	 */

	/*
	 * Subtract out approximate space used by indirect blocks.
	 * Assume most space is used by data blocks (non-indirect, non-dnode).
	 * Assume all blocks are recordsize.  Assume ditto blocks and
	 * internal fragmentation counter out compression.
	 *
	 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
	 * block, which we observe in practice.
	 */
	uint64_t recordsize;
	err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize);
	if (err != 0)
		return (err);
	size -= size / recordsize * sizeof (blkptr_t);

	/* Add in the space for the record associated with each block. */
	size += size / recordsize * sizeof (dmu_replay_record_t);

	*sizep = size;

	return (0);
}

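/*
 * Illustrative numbers (not from the original source): with the default
 * 128K recordsize, a 1 GiB estimate covers 8192 blocks, so the adjustment
 * subtracts 8192 * sizeof (blkptr_t) = 1 MiB for indirect blocks and adds
 * 8192 * sizeof (dmu_replay_record_t) for the per-block stream headers --
 * a small net correction either way.
 */
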
int
dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep)
{
	int err;
	uint64_t size;

	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));

	/* tosnap must be a snapshot */
	if (!ds->ds_is_snapshot)
		return (SET_ERROR(EINVAL));

	/* fromsnap, if provided, must be a snapshot */
	if (fromds != NULL && !fromds->ds_is_snapshot)
		return (SET_ERROR(EINVAL));

	/*
	 * fromsnap must be an earlier snapshot from the same fs as tosnap,
	 * or the origin's fs.
	 */
	if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0))
		return (SET_ERROR(EXDEV));

	/* Get uncompressed size estimate of changed data. */
	if (fromds == NULL) {
		size = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
	} else {
		uint64_t used, comp;
		err = dsl_dataset_space_written(fromds, ds,
		    &used, &comp, &size);
		if (err != 0)
			return (err);
	}

	err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep);
	return (err);
}

/*
 * Simple callback used to traverse the blocks of a snapshot and sum their
 * uncompressed size.
 */
/* ARGSUSED */
static int
dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	uint64_t *spaceptr = arg;
	if (bp != NULL && !BP_IS_HOLE(bp)) {
		*spaceptr += BP_GET_UCSIZE(bp);
	}
	return (0);
}

/*
 * Given a destination snapshot and a TXG, calculate the approximate size of a
 * send stream sent from that TXG.  from_txg may be zero, indicating that the
 * whole snapshot will be sent.
 */
int
dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg,
    uint64_t *sizep)
{
	int err;
	uint64_t size = 0;

	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));

	/* tosnap must be a snapshot */
	if (!dsl_dataset_is_snapshot(ds))
		return (SET_ERROR(EINVAL));

	/* verify that from_txg is before the provided snapshot was taken */
	if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) {
		return (SET_ERROR(EXDEV));
	}
	/*
	 * traverse the blocks of the snapshot with birth times after
	 * from_txg, summing their uncompressed size
	 */
	err = traverse_dataset(ds, from_txg, TRAVERSE_POST,
	    dmu_calculate_send_traversal, &size);
	if (err)
		return (err);

	err = dmu_adjust_send_estimate_for_indirects(ds, size, sizep);
	return (err);
}

typedef struct dmu_recv_begin_arg {
	const char *drba_origin;
	dmu_recv_cookie_t *drba_cookie;
	cred_t *drba_cred;
	uint64_t drba_snapobj;
} dmu_recv_begin_arg_t;

static int
recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
    uint64_t fromguid)
{
	uint64_t val;
	int error;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	/* temporary clone name must not exist */
	error = zap_lookup(dp->dp_meta_objset,
	    dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name,
	    8, 1, &val);
	if (error != ENOENT)
		return (error == 0 ? EBUSY : error);

	/* new snapshot name must not exist */
	error = zap_lookup(dp->dp_meta_objset,
	    dsl_dataset_phys(ds)->ds_snapnames_zapobj,
	    drba->drba_cookie->drc_tosnap, 8, 1, &val);
	if (error != ENOENT)
		return (error == 0 ? EEXIST : error);

	/*
	 * Check snapshot limit before receiving. We'll recheck again at the
	 * end, but might as well abort before receiving if we're already over
	 * the limit.
	 *
	 * Note that we do not check the file system limit with
	 * dsl_dir_fscount_check because the temporary %clones don't count
	 * against that limit.
	 */
	error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT,
	    NULL, drba->drba_cred);
	if (error != 0)
		return (error);

	if (fromguid != 0) {
		dsl_dataset_t *snap;
		uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;

		/* Find snapshot in this dir that matches fromguid. */
		while (obj != 0) {
			error = dsl_dataset_hold_obj(dp, obj, FTAG,
			    &snap);
			if (error != 0)
				return (SET_ERROR(ENODEV));
			if (snap->ds_dir != ds->ds_dir) {
				dsl_dataset_rele(snap, FTAG);
				return (SET_ERROR(ENODEV));
			}
			if (dsl_dataset_phys(snap)->ds_guid == fromguid)
				break;
			obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
			dsl_dataset_rele(snap, FTAG);
		}
		if (obj == 0)
			return (SET_ERROR(ENODEV));

		if (drba->drba_cookie->drc_force) {
			drba->drba_snapobj = obj;
		} else {
			/*
			 * If we are not forcing, there must be no
			 * changes since fromsnap.
			 */
			if (dsl_dataset_modified_since_snap(ds, snap)) {
				dsl_dataset_rele(snap, FTAG);
				return (SET_ERROR(ETXTBSY));
			}
			drba->drba_snapobj = ds->ds_prev->ds_object;
		}

		dsl_dataset_rele(snap, FTAG);
	} else {
		/* if full, then must be forced */
		if (!drba->drba_cookie->drc_force)
			return (SET_ERROR(EEXIST));
		/* start from $ORIGIN@$ORIGIN, if supported */
		drba->drba_snapobj = dp->dp_origin_snap != NULL ?
		    dp->dp_origin_snap->ds_object : 0;
	}

	return (0);
}

static int
dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
{
	dmu_recv_begin_arg_t *drba = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
	uint64_t fromguid = drrb->drr_fromguid;
	int flags = drrb->drr_flags;
	int error;
	uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
	dsl_dataset_t *ds;
	const char *tofs = drba->drba_cookie->drc_tofs;

	/* already checked */
	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);

	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
	    DMU_COMPOUNDSTREAM ||
	    drrb->drr_type >= DMU_OST_NUMTYPES ||
	    ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL))
		return (SET_ERROR(EINVAL));

	/* Verify pool version supports SA if SA_SPILL feature set */
	if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
	    spa_version(dp->dp_spa) < SPA_VERSION_SA)
		return (SET_ERROR(ENOTSUP));

	/*
	 * The receiving code doesn't know how to translate a WRITE_EMBEDDED
	 * record to a plain WRITE record, so the pool must have the
	 * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
	 * records.  Same with WRITE_EMBEDDED records that use LZ4 compression.
	 */
	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
		return (SET_ERROR(ENOTSUP));
	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
		return (SET_ERROR(ENOTSUP));

	/*
	 * The receiving code doesn't know how to translate large blocks
	 * to smaller ones, so the pool must have the LARGE_BLOCKS
	 * feature enabled if the stream has LARGE_BLOCKS.
	 */
	if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
		return (SET_ERROR(ENOTSUP));

	error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
	if (error == 0) {
		/* target fs already exists; recv into temp clone */

		/* Can't recv a clone into an existing fs */
		if (flags & DRR_FLAG_CLONE) {
			dsl_dataset_rele(ds, FTAG);
			return (SET_ERROR(EINVAL));
		}

		error = recv_begin_check_existing_impl(drba, ds, fromguid);
		dsl_dataset_rele(ds, FTAG);
	} else if (error == ENOENT) {
		/* target fs does not exist; must be a full backup or clone */
		char buf[MAXNAMELEN];

		/*
		 * If it's a non-clone incremental, we are missing the
		 * target fs, so fail the recv.
		 */
		if (fromguid != 0 && !(flags & DRR_FLAG_CLONE))
			return (SET_ERROR(ENOENT));

		/* Open the parent of tofs */
		ASSERT3U(strlen(tofs), <, MAXNAMELEN);
		(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
		error = dsl_dataset_hold(dp, buf, FTAG, &ds);
		if (error != 0)
			return (error);

		/*
		 * Check filesystem and snapshot limits before receiving. We'll
		 * recheck snapshot limits again at the end (we create the
		 * filesystems and increment those counts during begin_sync).
		 */
		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
		    ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred);
		if (error != 0) {
			dsl_dataset_rele(ds, FTAG);
			return (error);
		}

		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
		    ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred);
		if (error != 0) {
			dsl_dataset_rele(ds, FTAG);
			return (error);
		}

		if (drba->drba_origin != NULL) {
			dsl_dataset_t *origin;
			error = dsl_dataset_hold(dp, drba->drba_origin,
			    FTAG, &origin);
			if (error != 0) {
				dsl_dataset_rele(ds, FTAG);
				return (error);
			}
			if (!origin->ds_is_snapshot) {
				dsl_dataset_rele(origin, FTAG);
				dsl_dataset_rele(ds, FTAG);
				return (SET_ERROR(EINVAL));
			}
			if (dsl_dataset_phys(origin)->ds_guid != fromguid) {
				dsl_dataset_rele(origin, FTAG);
				dsl_dataset_rele(ds, FTAG);
				return (SET_ERROR(ENODEV));
			}
			dsl_dataset_rele(origin, FTAG);
		}
		dsl_dataset_rele(ds, FTAG);
		error = 0;
	}
	return (error);
}

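/*
 * Note (added for clarity): when the target filesystem already exists,
 * the stream is received into a temporary clone named by recv_clone_name
 * ("%recv"), created in dmu_recv_begin_sync() below, so the existing
 * filesystem is not modified until the receive completes.
 */
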
static void
dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
{
	dmu_recv_begin_arg_t *drba = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
	const char *tofs = drba->drba_cookie->drc_tofs;
	dsl_dataset_t *ds, *newds;
	uint64_t dsobj;
	int error;
	uint64_t crflags;

	crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ?
	    DS_FLAG_CI_DATASET : 0;

	error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
	if (error == 0) {
		/* create temporary clone */
		dsl_dataset_t *snap = NULL;
		if (drba->drba_snapobj != 0) {
			VERIFY0(dsl_dataset_hold_obj(dp,
			    drba->drba_snapobj, FTAG, &snap));
		}
		dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name,
		    snap, crflags, drba->drba_cred, tx);
		dsl_dataset_rele(snap, FTAG);
		dsl_dataset_rele(ds, FTAG);
	} else {
		dsl_dir_t *dd;
		const char *tail;
		dsl_dataset_t *origin = NULL;

		VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail));

		if (drba->drba_origin != NULL) {
			VERIFY0(dsl_dataset_hold(dp, drba->drba_origin,
			    FTAG, &origin));
		}

		/* Create new dataset. */
		dsobj = dsl_dataset_create_sync(dd,
		    strrchr(tofs, '/') + 1,
		    origin, crflags, drba->drba_cred, tx);
		if (origin != NULL)
			dsl_dataset_rele(origin, FTAG);
		dsl_dir_rele(dd, FTAG);
		drba->drba_cookie->drc_newfs = B_TRUE;
	}
	VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));

	if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
	    DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
	    !newds->ds_large_blocks) {
		dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx);
		newds->ds_large_blocks = B_TRUE;
	}

	dmu_buf_will_dirty(newds->ds_dbuf, tx);
	dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;

	/*
	 * If we actually created a non-clone, we need to create the
	 * objset in our new dataset.
	 */
	if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) {
		(void) dmu_objset_create_impl(dp->dp_spa,
		    newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx);
	}

	drba->drba_cookie->drc_ds = newds;

	spa_history_log_internal_ds(newds, "receive", tx, "");
}

/*
 * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
 * succeeds; otherwise we will leak the holds on the datasets.
 */
int
dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
    boolean_t force, char *origin, dmu_recv_cookie_t *drc)
{
	dmu_recv_begin_arg_t drba = { 0 };
	dmu_replay_record_t *drr;

	bzero(drc, sizeof (dmu_recv_cookie_t));
	drc->drc_drrb = drrb;
	drc->drc_tosnap = tosnap;
	drc->drc_tofs = tofs;
	drc->drc_force = force;
	drc->drc_cred = CRED();

	if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
		drc->drc_byteswap = B_TRUE;
	else if (drrb->drr_magic != DMU_BACKUP_MAGIC)
		return (SET_ERROR(EINVAL));

	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
	drr->drr_type = DRR_BEGIN;
	drr->drr_u.drr_begin = *drc->drc_drrb;
	if (drc->drc_byteswap) {
		fletcher_4_incremental_byteswap(drr,
		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
	} else {
		fletcher_4_incremental_native(drr,
		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
	}
	kmem_free(drr, sizeof (dmu_replay_record_t));

	if (drc->drc_byteswap) {
		drrb->drr_magic = BSWAP_64(drrb->drr_magic);
		drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
		drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
		drrb->drr_type = BSWAP_32(drrb->drr_type);
		drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
		drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
	}

	drba.drba_origin = origin;
	drba.drba_cookie = drc;
	drba.drba_cred = CRED();

	return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync,
	    &drba, 5, ZFS_SPACE_CHECK_NORMAL));
}

struct restorearg {
	int err;
	boolean_t byteswap;
	vnode_t *vp;
	char *buf;
	uint64_t voff;
	int bufsize; /* amount of memory allocated for buf */
	zio_cksum_t cksum;
	avl_tree_t *guid_to_ds_map;
};

typedef struct guid_map_entry {
	uint64_t	guid;
	dsl_dataset_t	*gme_ds;
	avl_node_t	avlnode;
} guid_map_entry_t;

static int
guid_compare(const void *arg1, const void *arg2)
{
	const guid_map_entry_t *gmep1 = arg1;
	const guid_map_entry_t *gmep2 = arg2;

	if (gmep1->guid < gmep2->guid)
		return (-1);
	else if (gmep1->guid > gmep2->guid)
		return (1);
	return (0);
}

static void
free_guid_map_onexit(void *arg)
{
	avl_tree_t *ca = arg;
	void *cookie = NULL;
	guid_map_entry_t *gmep;

	while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
		dsl_dataset_long_rele(gmep->gme_ds, gmep);
		dsl_dataset_rele(gmep->gme_ds, gmep);
		kmem_free(gmep, sizeof (guid_map_entry_t));
	}
	avl_destroy(ca);
	kmem_free(ca, sizeof (avl_tree_t));
}

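/*
 * Note (added for clarity): the guid_to_ds_map AVL tree built from these
 * entries (ordered by guid_compare()) maps a dataset GUID to a held
 * dataset; restore_write_byref() below consults it to locate the earlier
 * copy of dedup'ed data named by a DRR_WRITE_BYREF record.
 */
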
static void *
restore_read(struct restorearg *ra, int len, char *buf)
{
	int done = 0;

	if (buf == NULL)
		buf = ra->buf;

	/* some things will require 8-byte alignment, so everything must */
	ASSERT0(len % 8);
	ASSERT3U(len, <=, ra->bufsize);

	while (done < len) {
		ssize_t resid;

		ra->err = vn_rdwr(UIO_READ, ra->vp,
		    buf + done, len - done,
		    ra->voff, UIO_SYSSPACE, FAPPEND,
		    RLIM64_INFINITY, CRED(), &resid);

		if (resid == len - done)
			ra->err = SET_ERROR(EINVAL);
		ra->voff += len - done - resid;
		done = len - resid;
		if (ra->err != 0)
			return (NULL);
	}

	ASSERT3U(done, ==, len);
	if (ra->byteswap)
		fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
	else
		fletcher_4_incremental_native(buf, len, &ra->cksum);
	return (buf);
}

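/*
 * Note (an assumption about the wider receive path, which lies outside
 * this excerpt): restore_read() folds every byte it consumes into
 * ra->cksum with the same incremental fletcher-4 used by the sender's
 * dump_bytes(), so the receiver can later compare its running checksum
 * against the drr_checksum carried by the DRR_END record.
 */
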
noinline static void
backup_byteswap(dmu_replay_record_t *drr)
{
#define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
#define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
	drr->drr_type = BSWAP_32(drr->drr_type);
	drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
	switch (drr->drr_type) {
	case DRR_BEGIN:
		DO64(drr_begin.drr_magic);
		DO64(drr_begin.drr_versioninfo);
		DO64(drr_begin.drr_creation_time);
		DO32(drr_begin.drr_type);
		DO32(drr_begin.drr_flags);
		DO64(drr_begin.drr_toguid);
		DO64(drr_begin.drr_fromguid);
		break;
	case DRR_OBJECT:
		DO64(drr_object.drr_object);
		DO32(drr_object.drr_type);
		DO32(drr_object.drr_bonustype);
		DO32(drr_object.drr_blksz);
		DO32(drr_object.drr_bonuslen);
		DO64(drr_object.drr_toguid);
		break;
	case DRR_FREEOBJECTS:
		DO64(drr_freeobjects.drr_firstobj);
		DO64(drr_freeobjects.drr_numobjs);
		DO64(drr_freeobjects.drr_toguid);
		break;
	case DRR_WRITE:
		DO64(drr_write.drr_object);
		DO32(drr_write.drr_type);
		DO64(drr_write.drr_offset);
		DO64(drr_write.drr_length);
		DO64(drr_write.drr_toguid);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[0]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[1]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[2]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[3]);
		DO64(drr_write.drr_key.ddk_prop);
		break;
	case DRR_WRITE_BYREF:
		DO64(drr_write_byref.drr_object);
		DO64(drr_write_byref.drr_offset);
		DO64(drr_write_byref.drr_length);
		DO64(drr_write_byref.drr_toguid);
		DO64(drr_write_byref.drr_refguid);
		DO64(drr_write_byref.drr_refobject);
		DO64(drr_write_byref.drr_refoffset);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
		DO64(drr_write_byref.drr_key.ddk_prop);
		break;
	case DRR_WRITE_EMBEDDED:
		DO64(drr_write_embedded.drr_object);
		DO64(drr_write_embedded.drr_offset);
		DO64(drr_write_embedded.drr_length);
		DO64(drr_write_embedded.drr_toguid);
		DO32(drr_write_embedded.drr_lsize);
		DO32(drr_write_embedded.drr_psize);
		break;
	case DRR_FREE:
		DO64(drr_free.drr_object);
		DO64(drr_free.drr_offset);
		DO64(drr_free.drr_length);
		DO64(drr_free.drr_toguid);
		break;
	case DRR_SPILL:
		DO64(drr_spill.drr_object);
		DO64(drr_spill.drr_length);
		DO64(drr_spill.drr_toguid);
		break;
	case DRR_END:
		DO64(drr_end.drr_checksum.zc_word[0]);
		DO64(drr_end.drr_checksum.zc_word[1]);
		DO64(drr_end.drr_checksum.zc_word[2]);
		DO64(drr_end.drr_checksum.zc_word[3]);
		DO64(drr_end.drr_toguid);
		break;
	default:
		break;
	}
#undef DO64
#undef DO32
}

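/*
 * Note (added for clarity): backup_byteswap() is only applied to streams
 * generated on a host of opposite endianness; dmu_recv_begin() detects
 * this case by comparing drr_magic against BSWAP_64(DMU_BACKUP_MAGIC).
 */
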
static inline uint8_t
deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
{
	if (bonus_type == DMU_OT_SA) {
		return (1);
	} else {
		return (1 +
		    ((DN_MAX_BONUSLEN - bonus_size) >> SPA_BLKPTRSHIFT));
	}
}

1492
60948de1 1493noinline static int
34dc7c2f
BB
1494restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
1495{
ea04106b 1496 dmu_object_info_t doi;
34dc7c2f 1497 dmu_tx_t *tx;
b128c09f 1498 void *data = NULL;
ea04106b
AX
1499 uint64_t object;
1500 int err;
34dc7c2f 1501
34dc7c2f 1502 if (drro->drr_type == DMU_OT_NONE ||
9ae529ec
CS
1503 !DMU_OT_IS_VALID(drro->drr_type) ||
1504 !DMU_OT_IS_VALID(drro->drr_bonustype) ||
428870ff 1505 drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
34dc7c2f
BB
1506 drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
1507 P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
1508 drro->drr_blksz < SPA_MINBLOCKSIZE ||
e10b0808 1509 drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(os)) ||
34dc7c2f 1510 drro->drr_bonuslen > DN_MAX_BONUSLEN) {
a08ee875 1511 return (SET_ERROR(EINVAL));
34dc7c2f
BB
1512 }
1513
ea04106b 1514 err = dmu_object_info(os, drro->drr_object, &doi);
9babb374
BB
1515
1516 if (err != 0 && err != ENOENT)
a08ee875 1517 return (SET_ERROR(EINVAL));
ea04106b 1518 object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT;
9babb374 1519
b128c09f 1520 if (drro->drr_bonuslen) {
ea04106b 1521 data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8), NULL);
a08ee875 1522 if (ra->err != 0)
b128c09f
BB
1523 return (ra->err);
1524 }
1525
ea04106b
AX
1526 /*
1527 * If we are losing blkptrs or changing the block size this must
1528 * be a new file instance. We must clear out the previous file
1529 * contents before we can change this type of metadata in the dnode.
1530 */
1531 if (err == 0) {
1532 int nblkptr;
1533
1534 nblkptr = deduce_nblkptr(drro->drr_bonustype,
1535 drro->drr_bonuslen);
1536
1537 if (drro->drr_blksz != doi.doi_data_block_size ||
1538 nblkptr < doi.doi_nblkptr) {
1539 err = dmu_free_long_range(os, drro->drr_object,
1540 0, DMU_OBJECT_END);
1541 if (err != 0)
1542 return (SET_ERROR(EINVAL));
34dc7c2f 1543 }
ea04106b
AX
1544 }
1545
1546 tx = dmu_tx_create(os);
1547 dmu_tx_hold_bonus(tx, object);
1548 err = dmu_tx_assign(tx, TXG_WAIT);
1549 if (err != 0) {
1550 dmu_tx_abort(tx);
1551 return (err);
1552 }
1553
1554 if (object == DMU_NEW_OBJECT) {
1555 /* currently free, want to be allocated */
34dc7c2f
BB
1556 err = dmu_object_claim(os, drro->drr_object,
1557 drro->drr_type, drro->drr_blksz,
1558 drro->drr_bonustype, drro->drr_bonuslen, tx);
ea04106b
AX
1559 } else if (drro->drr_type != doi.doi_type ||
1560 drro->drr_blksz != doi.doi_data_block_size ||
1561 drro->drr_bonustype != doi.doi_bonus_type ||
1562 drro->drr_bonuslen != doi.doi_bonus_size) {
1563 /* currently allocated, but with different properties */
34dc7c2f
BB
1564 err = dmu_object_reclaim(os, drro->drr_object,
1565 drro->drr_type, drro->drr_blksz,
ea04106b 1566 drro->drr_bonustype, drro->drr_bonuslen, tx);
34dc7c2f 1567 }
a08ee875 1568 if (err != 0) {
ea04106b 1569 dmu_tx_commit(tx);
a08ee875 1570 return (SET_ERROR(EINVAL));
428870ff 1571 }
9babb374 1572
428870ff
BB
1573 dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype,
1574 tx);
34dc7c2f
BB
1575 dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);
1576
b128c09f 1577 if (data != NULL) {
34dc7c2f 1578 dmu_buf_t *db;
b128c09f 1579
34dc7c2f
BB
1580 VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
1581 dmu_buf_will_dirty(db, tx);
1582
1583 ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
34dc7c2f
BB
1584 bcopy(data, db->db_data, drro->drr_bonuslen);
1585 if (ra->byteswap) {
9ae529ec
CS
1586 dmu_object_byteswap_t byteswap =
1587 DMU_OT_BYTESWAP(drro->drr_bonustype);
1588 dmu_ot_byteswap[byteswap].ob_func(db->db_data,
34dc7c2f
BB
1589 drro->drr_bonuslen);
1590 }
1591 dmu_buf_rele(db, FTAG);
1592 }
1593 dmu_tx_commit(tx);
1594 return (0);
1595}
1596
/* ARGSUSED */
noinline static int
restore_freeobjects(struct restorearg *ra, objset_t *os,
    struct drr_freeobjects *drrfo)
{
	uint64_t obj;

	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
		return (SET_ERROR(EINVAL));

	for (obj = drrfo->drr_firstobj;
	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
	    (void) dmu_object_next(os, &obj, FALSE, 0)) {
		int err;

		if (dmu_object_info(os, obj, NULL) != 0)
			continue;

		err = dmu_free_long_object(os, obj);
		if (err != 0)
			return (err);
	}
	return (0);
}


noinline static int
restore_write(struct restorearg *ra, objset_t *os,
    struct drr_write *drrw)
{
	dmu_tx_t *tx;
	dmu_buf_t *bonus;
	arc_buf_t *abuf;
	void *data;
	int err;

	if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
	    !DMU_OT_IS_VALID(drrw->drr_type))
		return (SET_ERROR(EINVAL));

	if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
		return (SET_ERROR(EINVAL));

	if (dmu_bonus_hold(os, drrw->drr_object, FTAG, &bonus) != 0)
		return (SET_ERROR(EINVAL));

	abuf = dmu_request_arcbuf(bonus, drrw->drr_length);

	data = restore_read(ra, drrw->drr_length, abuf->b_data);
	if (data == NULL) {
		dmu_return_arcbuf(abuf);
		dmu_buf_rele(bonus, FTAG);
		return (ra->err);
	}

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_return_arcbuf(abuf);
		dmu_buf_rele(bonus, FTAG);
		dmu_tx_abort(tx);
		return (err);
	}
	if (ra->byteswap) {
		dmu_object_byteswap_t byteswap =
		    DMU_OT_BYTESWAP(drrw->drr_type);
		dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length);
	}
	dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx);
	dmu_tx_commit(tx);
	dmu_buf_rele(bonus, FTAG);
	return (0);
}
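
/*
 * restore_write() uses the loaned-ARC-buffer pattern; in outline (a
 * sketch of the calls made above, not additional logic):
 *
 *	abuf = dmu_request_arcbuf(bonus, len);		borrow a buffer
 *	... read len bytes of payload into abuf->b_data ...
 *	dmu_assign_arcbuf(bonus, offset, abuf, tx);	donate it, no copy
 *
 * If anything fails before the assignment, the loan must be repaid
 * with dmu_return_arcbuf(); once assigned, the buffer belongs to the
 * DMU and must not be touched again.
 */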

/*
 * Handle a DRR_WRITE_BYREF record.  This record is used in dedup'ed
 * streams to refer to a copy of the data that is already on the
 * system because it came in earlier in the stream.  This function
 * finds the earlier copy of the data, and uses that copy instead of
 * data from the stream to fulfill this write.
 */
static int
restore_write_byref(struct restorearg *ra, objset_t *os,
    struct drr_write_byref *drrwbr)
{
	dmu_tx_t *tx;
	int err;
	guid_map_entry_t gmesrch;
	guid_map_entry_t *gmep;
	avl_index_t where;
	objset_t *ref_os = NULL;
	dmu_buf_t *dbp;

	if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
		return (SET_ERROR(EINVAL));

	/*
	 * If the GUID of the referenced dataset is different from the
	 * GUID of the target dataset, find the referenced dataset.
	 */
	if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
		gmesrch.guid = drrwbr->drr_refguid;
		if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch,
		    &where)) == NULL) {
			return (SET_ERROR(EINVAL));
		}
		if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
			return (SET_ERROR(EINVAL));
	} else {
		ref_os = os;
	}

	err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
	    drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH);
	if (err != 0)
		return (err);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrwbr->drr_object,
	    drrwbr->drr_offset, drrwbr->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}
	dmu_write(os, drrwbr->drr_object,
	    drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
	dmu_buf_rele(dbp, FTAG);
	dmu_tx_commit(tx);
	return (0);
}
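
/*
 * Concretely: when a deduplicated stream (zfs send -D) carries the
 * same block more than once, only the first copy travels as a
 * DRR_WRITE; later copies become DRR_WRITE_BYREF records that name
 * the dataset GUID, object and offset of the first copy
 * (drr_refguid/drr_refobject/drr_refoffset).  That is why the
 * receive side must maintain guid_to_ds_map for the lifetime of the
 * stream.
 */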

static int
restore_write_embedded(struct restorearg *ra, objset_t *os,
    struct drr_write_embedded *drrwnp)
{
	dmu_tx_t *tx;
	int err;
	void *data;

	if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset)
		return (SET_ERROR(EINVAL));

	if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE)
		return (SET_ERROR(EINVAL));

	if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES)
		return (SET_ERROR(EINVAL));
	if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
		return (SET_ERROR(EINVAL));

	data = restore_read(ra, P2ROUNDUP(drrwnp->drr_psize, 8), NULL);
	if (data == NULL)
		return (ra->err);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrwnp->drr_object,
	    drrwnp->drr_offset, drrwnp->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}

	dmu_write_embedded(os, drrwnp->drr_object,
	    drrwnp->drr_offset, data, drrwnp->drr_etype,
	    drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize,
	    ra->byteswap ^ ZFS_HOST_BYTEORDER, tx);

	dmu_tx_commit(tx);
	return (0);
}
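
/*
 * DRR_WRITE_EMBEDDED records are produced by zfs send -e on pools
 * with the embedded_data feature: a payload small enough to be stored
 * inside the block pointer itself (at most BPE_PAYLOAD_SIZE physical
 * bytes) is carried in the record, already compressed, and is written
 * back with dmu_write_embedded() without allocating a separate data
 * block.
 */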

static int
restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
{
	dmu_tx_t *tx;
	void *data;
	dmu_buf_t *db, *db_spill;
	int err;

	if (drrs->drr_length < SPA_MINBLOCKSIZE ||
	    drrs->drr_length > spa_maxblocksize(dmu_objset_spa(os)))
		return (SET_ERROR(EINVAL));

	data = restore_read(ra, drrs->drr_length, NULL);
	if (data == NULL)
		return (ra->err);

	if (dmu_object_info(os, drrs->drr_object, NULL) != 0)
		return (SET_ERROR(EINVAL));

	VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db));
	if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
		dmu_buf_rele(db, FTAG);
		return (err);
	}

	tx = dmu_tx_create(os);

	dmu_tx_hold_spill(tx, db->db_object);

	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_buf_rele(db, FTAG);
		dmu_buf_rele(db_spill, FTAG);
		dmu_tx_abort(tx);
		return (err);
	}
	dmu_buf_will_dirty(db_spill, tx);

	if (db_spill->db_size < drrs->drr_length)
		VERIFY(0 == dbuf_spill_set_blksz(db_spill,
		    drrs->drr_length, tx));
	bcopy(data, db_spill->db_data, drrs->drr_length);

	dmu_buf_rele(db, FTAG);
	dmu_buf_rele(db_spill, FTAG);

	dmu_tx_commit(tx);
	return (0);
}

/* ARGSUSED */
noinline static int
restore_free(struct restorearg *ra, objset_t *os,
    struct drr_free *drrf)
{
	int err;

	if (drrf->drr_length != -1ULL &&
	    drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
		return (SET_ERROR(EINVAL));

	if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
		return (SET_ERROR(EINVAL));

	err = dmu_free_long_range(os, drrf->drr_object,
	    drrf->drr_offset, drrf->drr_length);
	return (err);
}

/* used to destroy the drc_ds on error */
static void
dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
{
	char name[MAXNAMELEN];

	dsl_dataset_name(drc->drc_ds, name);
	dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
	(void) dsl_destroy_head(name);
}

/*
 * NB: callers *must* call dmu_recv_end() if this succeeds.
 */
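/*
 * In outline, the receive ioctl drives this function as follows (a
 * sketch of the sequence; arguments and error handling elided):
 *
 *	dmu_recv_begin()	create/own the inconsistent dataset, fill drc
 *	dmu_recv_stream()	consume records until DRR_END or error
 *	dmu_recv_end()		commit: snapshot or clone-swap the result
 *
 * On a mid-stream error this function destroys the dataset itself;
 * after success, only dmu_recv_end() may finish or clean up.
 */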
int
dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
    int cleanup_fd, uint64_t *action_handlep)
{
	struct restorearg ra = { 0 };
	dmu_replay_record_t *drr;
	objset_t *os;
	zio_cksum_t pcksum;
	int featureflags;

	ra.byteswap = drc->drc_byteswap;
	ra.cksum = drc->drc_cksum;
	ra.vp = vp;
	ra.voff = *voffp;
	ra.bufsize = SPA_MAXBLOCKSIZE;
	ra.buf = vmem_alloc(ra.bufsize, KM_SLEEP);

	/* these were verified in dmu_recv_begin */
	ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
	    DMU_SUBSTREAM);
	ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);

	/*
	 * Open the objset we are modifying.
	 */
	VERIFY0(dmu_objset_from_ds(drc->drc_ds, &os));

	ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT);

	featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);

	/* if this stream is dedup'ed, set up the avl tree for guid mapping */
	if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
		minor_t minor;

		if (cleanup_fd == -1) {
			ra.err = SET_ERROR(EBADF);
			goto out;
		}
		ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor);
		if (ra.err != 0) {
			cleanup_fd = -1;
			goto out;
		}

		if (*action_handlep == 0) {
			ra.guid_to_ds_map =
			    kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
			avl_create(ra.guid_to_ds_map, guid_compare,
			    sizeof (guid_map_entry_t),
			    offsetof(guid_map_entry_t, avlnode));
			ra.err = zfs_onexit_add_cb(minor,
			    free_guid_map_onexit, ra.guid_to_ds_map,
			    action_handlep);
			if (ra.err != 0)
				goto out;
		} else {
			ra.err = zfs_onexit_cb_data(minor, *action_handlep,
			    (void **)&ra.guid_to_ds_map);
			if (ra.err != 0)
				goto out;
		}

		drc->drc_guid_to_ds_map = ra.guid_to_ds_map;
	}
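
	/*
	 * The guid map has to outlive this call: one "zfs receive" of a
	 * replication package is a sequence of substream receives, each
	 * a separate call here.  So the map is anchored to the
	 * caller-supplied cleanup_fd via the zfs_onexit callbacks: the
	 * first substream allocates it and registers
	 * free_guid_map_onexit(), later substreams find it again
	 * through *action_handlep, and it is destroyed only when
	 * cleanup_fd is finally closed.
	 */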

	/*
	 * Read records and process them.
	 */
	pcksum = ra.cksum;
	while (ra.err == 0 &&
	    NULL != (drr = restore_read(&ra, sizeof (*drr), NULL))) {
		if (issig(JUSTLOOKING) && issig(FORREAL)) {
			ra.err = SET_ERROR(EINTR);
			goto out;
		}

		if (ra.byteswap)
			backup_byteswap(drr);

		switch (drr->drr_type) {
		case DRR_OBJECT:
		{
			/*
			 * We need to make a copy of the record header,
			 * because restore_{object,write} may need to
			 * restore_read(), which will invalidate drr.
			 */
			struct drr_object drro = drr->drr_u.drr_object;
			ra.err = restore_object(&ra, os, &drro);
			break;
		}
		case DRR_FREEOBJECTS:
		{
			struct drr_freeobjects drrfo =
			    drr->drr_u.drr_freeobjects;
			ra.err = restore_freeobjects(&ra, os, &drrfo);
			break;
		}
		case DRR_WRITE:
		{
			struct drr_write drrw = drr->drr_u.drr_write;
			ra.err = restore_write(&ra, os, &drrw);
			break;
		}
		case DRR_WRITE_BYREF:
		{
			struct drr_write_byref drrwbr =
			    drr->drr_u.drr_write_byref;
			ra.err = restore_write_byref(&ra, os, &drrwbr);
			break;
		}
		case DRR_WRITE_EMBEDDED:
		{
			struct drr_write_embedded drrwe =
			    drr->drr_u.drr_write_embedded;
			ra.err = restore_write_embedded(&ra, os, &drrwe);
			break;
		}
		case DRR_FREE:
		{
			struct drr_free drrf = drr->drr_u.drr_free;
			ra.err = restore_free(&ra, os, &drrf);
			break;
		}
		case DRR_END:
		{
			struct drr_end drre = drr->drr_u.drr_end;
			/*
			 * We compare against the *previous* checksum
			 * value, because the stored checksum is of
			 * everything before the DRR_END record.
			 */
			if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum))
				ra.err = SET_ERROR(ECKSUM);
			goto out;
		}
		case DRR_SPILL:
		{
			struct drr_spill drrs = drr->drr_u.drr_spill;
			ra.err = restore_spill(&ra, os, &drrs);
			break;
		}
		default:
			ra.err = SET_ERROR(EINVAL);
			goto out;
		}
		pcksum = ra.cksum;
	}
	ASSERT(ra.err != 0);

out:
	if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
		zfs_onexit_fd_rele(cleanup_fd);

	if (ra.err != 0) {
		/*
		 * destroy what we created, so we don't leave it in the
		 * inconsistent restoring state.
		 */
		dmu_recv_cleanup_ds(drc);
	}

	vmem_free(ra.buf, ra.bufsize);
	*voffp = ra.voff;
	return (ra.err);
}
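
/*
 * Spelling out the checksum handling in the loop above: restore_read()
 * folds every byte it consumes into ra.cksum, so by the time a
 * DRR_END header has been parsed, ra.cksum already covers the DRR_END
 * record itself.  The stream, however, stores the checksum of
 * everything *before* DRR_END, which is why the loop carries pcksum,
 * the value from the previous iteration, for the final comparison.
 */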

static int
dmu_recv_end_check(void *arg, dmu_tx_t *tx)
{
	dmu_recv_cookie_t *drc = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	int error;

	ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);

	if (!drc->drc_newfs) {
		dsl_dataset_t *origin_head;

		error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
		if (error != 0)
			return (error);
		if (drc->drc_force) {
			/*
			 * We will destroy any snapshots in tofs (i.e. before
			 * origin_head) that are after the origin (which is
			 * the snap before drc_ds, because drc_ds cannot
			 * have any snaps of its own).
			 */
			uint64_t obj;

			obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
			while (obj !=
			    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
				dsl_dataset_t *snap;
				error = dsl_dataset_hold_obj(dp, obj, FTAG,
				    &snap);
				if (error != 0)
					break;
				if (snap->ds_dir != origin_head->ds_dir)
					error = SET_ERROR(EINVAL);
				if (error == 0) {
					error = dsl_destroy_snapshot_check_impl(
					    snap, B_FALSE);
				}
				obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
				dsl_dataset_rele(snap, FTAG);
				if (error != 0)
					break;
			}
			if (error != 0) {
				dsl_dataset_rele(origin_head, FTAG);
				return (error);
			}
		}
		error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
		    origin_head, drc->drc_force, drc->drc_owner, tx);
		if (error != 0) {
			dsl_dataset_rele(origin_head, FTAG);
			return (error);
		}
		error = dsl_dataset_snapshot_check_impl(origin_head,
		    drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
		dsl_dataset_rele(origin_head, FTAG);
		if (error != 0)
			return (error);

		error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
	} else {
		error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
		    drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
	}
	return (error);
}

static void
dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
{
	dmu_recv_cookie_t *drc = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);

	spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
	    tx, "snap=%s", drc->drc_tosnap);

	if (!drc->drc_newfs) {
		dsl_dataset_t *origin_head;

		VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
		    &origin_head));

		if (drc->drc_force) {
			/*
			 * Destroy any snapshots of drc_tofs (origin_head)
			 * after the origin (the snap before drc_ds).
			 */
			uint64_t obj;

			obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
			while (obj !=
			    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
				dsl_dataset_t *snap;
				VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG,
				    &snap));
				ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir);
				obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
				dsl_destroy_snapshot_sync_impl(snap,
				    B_FALSE, tx);
				dsl_dataset_rele(snap, FTAG);
			}
		}
		VERIFY3P(drc->drc_ds->ds_prev, ==,
		    origin_head->ds_prev);

		dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
		    origin_head, tx);
		dsl_dataset_snapshot_sync_impl(origin_head,
		    drc->drc_tosnap, tx);

		/* set snapshot's creation time and guid */
		dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
		dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time =
		    drc->drc_drrb->drr_creation_time;
		dsl_dataset_phys(origin_head->ds_prev)->ds_guid =
		    drc->drc_drrb->drr_toguid;
		dsl_dataset_phys(origin_head->ds_prev)->ds_flags &=
		    ~DS_FLAG_INCONSISTENT;

		dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
		dsl_dataset_phys(origin_head)->ds_flags &=
		    ~DS_FLAG_INCONSISTENT;

		dsl_dataset_rele(origin_head, FTAG);
		dsl_destroy_head_sync_impl(drc->drc_ds, tx);

		if (drc->drc_owner != NULL)
			VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner);
	} else {
		dsl_dataset_t *ds = drc->drc_ds;

		dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx);

		/* set snapshot's creation time and guid */
		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
		dsl_dataset_phys(ds->ds_prev)->ds_creation_time =
		    drc->drc_drrb->drr_creation_time;
		dsl_dataset_phys(ds->ds_prev)->ds_guid =
		    drc->drc_drrb->drr_toguid;
		dsl_dataset_phys(ds->ds_prev)->ds_flags &=
		    ~DS_FLAG_INCONSISTENT;

		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
	}
	drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
	zvol_create_minors(dp->dp_spa, drc->drc_tofs, B_TRUE);
	/*
	 * Release the hold from dmu_recv_begin.  This must be done before
	 * we return to open context, so that when we free the dataset's dnode,
	 * we can evict its bonus buffer.
	 */
	dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
	drc->drc_ds = NULL;
}
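
/*
 * To summarize the !drc_newfs path above: an incremental receive is
 * staged in a temporary clone of the target filesystem.  At
 * dmu_recv_end time, the clone and the target's head dataset exchange
 * contents via dsl_dataset_clone_swap_sync_impl(), the swapped-in
 * state is snapshotted as the new tosnap, and the leftover clone head
 * is destroyed, all inside one synctask, so the receive commits
 * atomically or not at all.
 */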

static int
add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj)
{
	dsl_pool_t *dp;
	dsl_dataset_t *snapds;
	guid_map_entry_t *gmep;
	int err;

	ASSERT(guid_map != NULL);

	err = dsl_pool_hold(name, FTAG, &dp);
	if (err != 0)
		return (err);
	gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP);
	err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds);
	if (err == 0) {
		gmep->guid = dsl_dataset_phys(snapds)->ds_guid;
		gmep->gme_ds = snapds;
		avl_add(guid_map, gmep);
		dsl_dataset_long_hold(snapds, gmep);
	} else {
		kmem_free(gmep, sizeof (*gmep));
	}

	dsl_pool_rele(dp, FTAG);
	return (err);
}

static int dmu_recv_end_modified_blocks = 3;

static int
dmu_recv_existing_end(dmu_recv_cookie_t *drc)
{
	int error;

#ifdef _KERNEL
	char *name;

	/*
	 * We will be destroying the ds; make sure its origin is unmounted if
	 * necessary.
	 */
	name = kmem_alloc(MAXNAMELEN, KM_SLEEP);
	dsl_dataset_name(drc->drc_ds, name);
	zfs_destroy_unmount_origin(name);
	kmem_free(name, MAXNAMELEN);
#endif

	error = dsl_sync_task(drc->drc_tofs,
	    dmu_recv_end_check, dmu_recv_end_sync, drc,
	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL);

	if (error != 0)
		dmu_recv_cleanup_ds(drc);
	return (error);
}

static int
dmu_recv_new_end(dmu_recv_cookie_t *drc)
{
	int error;

	error = dsl_sync_task(drc->drc_tofs,
	    dmu_recv_end_check, dmu_recv_end_sync, drc,
	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL);

	if (error != 0) {
		dmu_recv_cleanup_ds(drc);
	} else if (drc->drc_guid_to_ds_map != NULL) {
		(void) add_ds_to_guidmap(drc->drc_tofs,
		    drc->drc_guid_to_ds_map,
		    drc->drc_newsnapobj);
	}
	return (error);
}

int
dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
{
	drc->drc_owner = owner;

	if (drc->drc_newfs)
		return (dmu_recv_new_end(drc));
	else
		return (dmu_recv_existing_end(drc));
}

/*
 * Return TRUE if this objset is currently being received into.
 */
boolean_t
dmu_objset_is_receiving(objset_t *os)
{
	return (os->os_dsl_dataset != NULL &&
	    os->os_dsl_dataset->ds_owner == dmu_recv_tag);
}

#if defined(_KERNEL)
module_param(zfs_send_corrupt_data, int, 0644);
MODULE_PARM_DESC(zfs_send_corrupt_data, "Allow sending corrupt data");
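/*
 * For example (path as on zfs-on-linux builds; illustrative):
 *	echo 1 >/sys/module/zfs/parameters/zfs_send_corrupt_data
 * lets a send proceed past unreadable blocks, substituting a fixed
 * pattern for the data that could not be read.
 */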
#endif