1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 #include <sys/dmu.h>
26 #include <sys/dmu_impl.h>
27 #include <sys/dmu_tx.h>
28 #include <sys/dbuf.h>
29 #include <sys/dnode.h>
30 #include <sys/zfs_context.h>
31 #include <sys/dmu_objset.h>
32 #include <sys/dmu_traverse.h>
33 #include <sys/dsl_dataset.h>
34 #include <sys/dsl_dir.h>
35 #include <sys/dsl_prop.h>
36 #include <sys/dsl_pool.h>
37 #include <sys/dsl_synctask.h>
38 #include <sys/zfs_ioctl.h>
39 #include <sys/zap.h>
40 #include <sys/zio_checksum.h>
41 #include <sys/zfs_znode.h>
42 #include <zfs_fletcher.h>
43 #include <sys/avl.h>
44 #include <sys/ddt.h>
45
46 static char *dmu_recv_tag = "dmu_recv_tag";
47
48 /*
49 * The kinds of data whose inclusion in a send stream can be pending from
50 * one call to backup_cb to another. Multiple calls to dump_free() and
51 * dump_freeobjects() can be aggregated into a single DRR_FREE or
52 * DRR_FREEOBJECTS replay record.
53 */
54 typedef enum {
55 PENDING_NONE,
56 PENDING_FREE,
57 PENDING_FREEOBJECTS
58 } pendop_t;
59
60 struct backuparg {
61 dmu_replay_record_t *drr;
62 vnode_t *vp;
63 offset_t *off;
64 objset_t *os;
65 zio_cksum_t zc;
66 uint64_t toguid;
67 int err;
68 pendop_t pending_op;
69 };
70
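/*
 * Write "len" bytes of "buf" to the send stream's vnode, folding them into
 * the stream's running fletcher-4 checksum and advancing the caller's
 * offset.  "len" must be a multiple of 8 bytes.
 */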
71 static int
72 dump_bytes(struct backuparg *ba, void *buf, int len)
73 {
74 ssize_t resid; /* have to get resid to get detailed errno */
75 ASSERT3U(len % 8, ==, 0);
76
77 fletcher_4_incremental_native(buf, len, &ba->zc);
78 ba->err = vn_rdwr(UIO_WRITE, ba->vp,
79 (caddr_t)buf, len,
80 0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);
81 *ba->off += len;
82 return (ba->err);
83 }
84
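/*
 * Note a free of "length" bytes at "offset" in "object".  Adjacent frees of
 * the same object are aggregated into a single pending DRR_FREE record,
 * which is flushed to the stream when a record of a different kind is
 * emitted (or immediately, for a free-to-end-of-object with length -1).
 */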
85 static int
86 dump_free(struct backuparg *ba, uint64_t object, uint64_t offset,
87 uint64_t length)
88 {
89 struct drr_free *drrf = &(ba->drr->drr_u.drr_free);
90
91 /*
92 * If there is a pending op, but it's not PENDING_FREE, push it out,
93 * since free block aggregation can only be done for blocks of the
94 * same type (i.e., DRR_FREE records can only be aggregated with
95 * other DRR_FREE records; DRR_FREEOBJECTS records can only be
96 * aggregated with other DRR_FREEOBJECTS records).
97 */
98 if (ba->pending_op != PENDING_NONE && ba->pending_op != PENDING_FREE) {
99 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
100 return (EINTR);
101 ba->pending_op = PENDING_NONE;
102 }
103
104 if (ba->pending_op == PENDING_FREE) {
105 /*
106 * There should never be a PENDING_FREE if length is -1
107 * (because dump_dnode is the only place where this
108 * function is called with a -1, and only after flushing
109 * any pending record).
110 */
111 ASSERT(length != -1ULL);
112 /*
113 * Check to see whether this free block can be aggregated
114 * with the pending one.
115 */
116 if (drrf->drr_object == object && drrf->drr_offset +
117 drrf->drr_length == offset) {
118 drrf->drr_length += length;
119 return (0);
120 } else {
121 /* not a continuation. Push out pending record */
122 if (dump_bytes(ba, ba->drr,
123 sizeof (dmu_replay_record_t)) != 0)
124 return (EINTR);
125 ba->pending_op = PENDING_NONE;
126 }
127 }
128 /* create a FREE record and make it pending */
129 bzero(ba->drr, sizeof (dmu_replay_record_t));
130 ba->drr->drr_type = DRR_FREE;
131 drrf->drr_object = object;
132 drrf->drr_offset = offset;
133 drrf->drr_length = length;
134 drrf->drr_toguid = ba->toguid;
135 if (length == -1ULL) {
136 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
137 return (EINTR);
138 } else {
139 ba->pending_op = PENDING_FREE;
140 }
141
142 return (0);
143 }
144
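/*
 * Emit a DRR_WRITE record describing one data block of "object", followed
 * by the block contents.  The block pointer's checksum type and dedup
 * properties are copied into the record for use by dedup'ed streams.
 */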
145 static int
146 dump_data(struct backuparg *ba, dmu_object_type_t type,
147 uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
148 {
149 struct drr_write *drrw = &(ba->drr->drr_u.drr_write);
150
151
152 /*
153 * If there is any kind of pending aggregation (currently either
154 * a grouping of free objects or free blocks), push it out to
155 * the stream, since aggregation can't be done across operations
156 * of different types.
157 */
158 if (ba->pending_op != PENDING_NONE) {
159 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
160 return (EINTR);
161 ba->pending_op = PENDING_NONE;
162 }
163 /* write a DATA record */
164 bzero(ba->drr, sizeof (dmu_replay_record_t));
165 ba->drr->drr_type = DRR_WRITE;
166 drrw->drr_object = object;
167 drrw->drr_type = type;
168 drrw->drr_offset = offset;
169 drrw->drr_length = blksz;
170 drrw->drr_toguid = ba->toguid;
171 drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
172 if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
173 drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
174 DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
175 DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
176 DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
177 drrw->drr_key.ddk_cksum = bp->blk_cksum;
178
179 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
180 return (EINTR);
181 if (dump_bytes(ba, data, blksz) != 0)
182 return (EINTR);
183 return (0);
184 }
185
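/*
 * Emit a DRR_SPILL record for "object"'s spill block, followed by the
 * spill block contents.
 */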
186 static int
187 dump_spill(struct backuparg *ba, uint64_t object, int blksz, void *data)
188 {
189 struct drr_spill *drrs = &(ba->drr->drr_u.drr_spill);
190
191 if (ba->pending_op != PENDING_NONE) {
192 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
193 return (EINTR);
194 ba->pending_op = PENDING_NONE;
195 }
196
197 /* write a SPILL record */
198 bzero(ba->drr, sizeof (dmu_replay_record_t));
199 ba->drr->drr_type = DRR_SPILL;
200 drrs->drr_object = object;
201 drrs->drr_length = blksz;
202 drrs->drr_toguid = ba->toguid;
203
204 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)))
205 return (EINTR);
206 if (dump_bytes(ba, data, blksz))
207 return (EINTR);
208 return (0);
209 }
210
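/*
 * Note that "numobjs" objects starting at "firstobj" have been freed.
 * Consecutive ranges are aggregated into a single pending DRR_FREEOBJECTS
 * record, which is flushed when a record of a different kind is emitted.
 */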
211 static int
212 dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs)
213 {
214 struct drr_freeobjects *drrfo = &(ba->drr->drr_u.drr_freeobjects);
215
216 /*
217 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
218 * push it out, since free block aggregation can only be done for
219 * blocks of the same type (i.e., DRR_FREE records can only be
220 * aggregated with other DRR_FREE records; DRR_FREEOBJECTS records
221 * can only be aggregated with other DRR_FREEOBJECTS records).
222 */
223 if (ba->pending_op != PENDING_NONE &&
224 ba->pending_op != PENDING_FREEOBJECTS) {
225 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
226 return (EINTR);
227 ba->pending_op = PENDING_NONE;
228 }
229 if (ba->pending_op == PENDING_FREEOBJECTS) {
230 /*
231 * See whether this free object array can be aggregated
232 * with the pending one
233 */
234 if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
235 drrfo->drr_numobjs += numobjs;
236 return (0);
237 } else {
238 /* can't be aggregated. Push out pending record */
239 if (dump_bytes(ba, ba->drr,
240 sizeof (dmu_replay_record_t)) != 0)
241 return (EINTR);
242 ba->pending_op = PENDING_NONE;
243 }
244 }
245
246 /* write a FREEOBJECTS record */
247 bzero(ba->drr, sizeof (dmu_replay_record_t));
248 ba->drr->drr_type = DRR_FREEOBJECTS;
249 drrfo->drr_firstobj = firstobj;
250 drrfo->drr_numobjs = numobjs;
251 drrfo->drr_toguid = ba->toguid;
252
253 ba->pending_op = PENDING_FREEOBJECTS;
254
255 return (0);
256 }
257
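/*
 * Emit a DRR_OBJECT record describing "object"'s dnode, followed by its
 * bonus buffer.  A hole or unallocated dnode becomes a DRR_FREEOBJECTS
 * record instead, and any data past the object's last block is freed
 * with a length of -1.
 */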
258 static int
259 dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp)
260 {
261 struct drr_object *drro = &(ba->drr->drr_u.drr_object);
262
263 if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
264 return (dump_freeobjects(ba, object, 1));
265
266 if (ba->pending_op != PENDING_NONE) {
267 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
268 return (EINTR);
269 ba->pending_op = PENDING_NONE;
270 }
271
272 /* write an OBJECT record */
273 bzero(ba->drr, sizeof (dmu_replay_record_t));
274 ba->drr->drr_type = DRR_OBJECT;
275 drro->drr_object = object;
276 drro->drr_type = dnp->dn_type;
277 drro->drr_bonustype = dnp->dn_bonustype;
278 drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
279 drro->drr_bonuslen = dnp->dn_bonuslen;
280 drro->drr_checksumtype = dnp->dn_checksum;
281 drro->drr_compress = dnp->dn_compress;
282 drro->drr_toguid = ba->toguid;
283
284 if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0)
285 return (EINTR);
286
287 if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
288 return (EINTR);
289
290 /* free anything past the end of the file */
291 if (dump_free(ba, object, (dnp->dn_maxblkid + 1) *
292 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
293 return (EINTR);
294 if (ba->err)
295 return (EINTR);
296 return (0);
297 }
298
299 #define BP_SPAN(dnp, level) \
300 (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
301 (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
302
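/*
 * traverse_dataset() callback: translate each block visited into the
 * appropriate send stream record.  Holes become DRR_FREE/DRR_FREEOBJECTS
 * records, dnode blocks are decomposed into per-dnode DRR_OBJECT records,
 * spill blocks become DRR_SPILL records, and ordinary level-0 blocks
 * become DRR_WRITE records.
 */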
303 /* ARGSUSED */
304 static int
305 backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
306 const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
307 {
308 struct backuparg *ba = arg;
309 dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
310 int err = 0;
311
312 if (issig(JUSTLOOKING) && issig(FORREAL))
313 return (EINTR);
314
315 if (zb->zb_object != DMU_META_DNODE_OBJECT &&
316 DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
317 return (0);
318 } else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) {
319 uint64_t span = BP_SPAN(dnp, zb->zb_level);
320 uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
321 err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT);
322 } else if (bp == NULL) {
323 uint64_t span = BP_SPAN(dnp, zb->zb_level);
324 err = dump_free(ba, zb->zb_object, zb->zb_blkid * span, span);
325 } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
326 return (0);
327 } else if (type == DMU_OT_DNODE) {
328 dnode_phys_t *blk;
329 int i;
330 int blksz = BP_GET_LSIZE(bp);
331 uint32_t aflags = ARC_WAIT;
332 arc_buf_t *abuf;
333
334 if (dsl_read(NULL, spa, bp, pbuf,
335 arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
336 ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
337 return (EIO);
338
339 blk = abuf->b_data;
340 for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
341 uint64_t dnobj = (zb->zb_blkid <<
342 (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
343 err = dump_dnode(ba, dnobj, blk+i);
344 if (err)
345 break;
346 }
347 (void) arc_buf_remove_ref(abuf, &abuf);
348 } else if (type == DMU_OT_SA) {
349 uint32_t aflags = ARC_WAIT;
350 arc_buf_t *abuf;
351 int blksz = BP_GET_LSIZE(bp);
352
353 if (arc_read_nolock(NULL, spa, bp,
354 arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
355 ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
356 return (EIO);
357
358 err = dump_spill(ba, zb->zb_object, blksz, abuf->b_data);
359 (void) arc_buf_remove_ref(abuf, &abuf);
360 } else { /* it's a level-0 block of a regular object */
361 uint32_t aflags = ARC_WAIT;
362 arc_buf_t *abuf;
363 int blksz = BP_GET_LSIZE(bp);
364
365 if (dsl_read(NULL, spa, bp, pbuf,
366 arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
367 ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
368 return (EIO);
369
370 err = dump_data(ba, type, zb->zb_object, zb->zb_blkid * blksz,
371 blksz, bp, abuf->b_data);
372 (void) arc_buf_remove_ref(abuf, &abuf);
373 }
374
375 ASSERT(err == 0 || err == EINTR);
376 return (err);
377 }
378
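/*
 * Generate a full or incremental send stream for the snapshot "tosnap"
 * and write it to "vp".  If "fromsnap" is given, the stream is incremental
 * from that snapshot; if "fromorigin" is set, it is incremental from the
 * dataset's origin snapshot.  The stream is framed by DRR_BEGIN and
 * DRR_END records, the latter carrying the fletcher-4 checksum of
 * everything before it.
 */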
379 int
380 dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
381 vnode_t *vp, offset_t *off)
382 {
383 dsl_dataset_t *ds = tosnap->os_dsl_dataset;
384 dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL;
385 dmu_replay_record_t *drr;
386 struct backuparg ba;
387 int err;
388 uint64_t fromtxg = 0;
389
390 /* tosnap must be a snapshot */
391 if (ds->ds_phys->ds_next_snap_obj == 0)
392 return (EINVAL);
393
394 /* fromsnap must be an earlier snapshot from the same fs as tosnap */
395 if (fromds && (ds->ds_dir != fromds->ds_dir ||
396 fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg))
397 return (EXDEV);
398
399 if (fromorigin) {
400 dsl_pool_t *dp = ds->ds_dir->dd_pool;
401
402 if (fromsnap)
403 return (EINVAL);
404
405 if (dsl_dir_is_clone(ds->ds_dir)) {
406 rw_enter(&dp->dp_config_rwlock, RW_READER);
407 err = dsl_dataset_hold_obj(dp,
408 ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds);
409 rw_exit(&dp->dp_config_rwlock);
410 if (err)
411 return (err);
412 } else {
413 fromorigin = B_FALSE;
414 }
415 }
416
417
418 drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
419 drr->drr_type = DRR_BEGIN;
420 drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
421 DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
422 DMU_SUBSTREAM);
423
424 #ifdef _KERNEL
425 if (dmu_objset_type(tosnap) == DMU_OST_ZFS) {
426 uint64_t version;
if (zfs_get_zplprop(tosnap, ZFS_PROP_VERSION, &version) != 0) {
kmem_free(drr, sizeof (dmu_replay_record_t));
return (EINVAL);
}
429 if (version == ZPL_VERSION_SA) {
430 DMU_SET_FEATUREFLAGS(
431 drr->drr_u.drr_begin.drr_versioninfo,
432 DMU_BACKUP_FEATURE_SA_SPILL);
433 }
434 }
435 #endif
436
437 drr->drr_u.drr_begin.drr_creation_time =
438 ds->ds_phys->ds_creation_time;
439 drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type;
440 if (fromorigin)
441 drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
442 drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
443 if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
444 drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
445
446 if (fromds)
447 drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
448 dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
449
450 if (fromds)
451 fromtxg = fromds->ds_phys->ds_creation_txg;
452 if (fromorigin)
453 dsl_dataset_rele(fromds, FTAG);
454
455 ba.drr = drr;
456 ba.vp = vp;
457 ba.os = tosnap;
458 ba.off = off;
459 ba.toguid = ds->ds_phys->ds_guid;
460 ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0);
461 ba.pending_op = PENDING_NONE;
462
463 if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) {
464 kmem_free(drr, sizeof (dmu_replay_record_t));
465 return (ba.err);
466 }
467
468 err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
469 backup_cb, &ba);
470
471 if (ba.pending_op != PENDING_NONE)
472 if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0)
473 err = EINTR;
474
475 if (err) {
476 if (err == EINTR && ba.err)
477 err = ba.err;
478 kmem_free(drr, sizeof (dmu_replay_record_t));
479 return (err);
480 }
481
482 bzero(drr, sizeof (dmu_replay_record_t));
483 drr->drr_type = DRR_END;
484 drr->drr_u.drr_end.drr_checksum = ba.zc;
485 drr->drr_u.drr_end.drr_toguid = ba.toguid;
486
487 if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) {
488 kmem_free(drr, sizeof (dmu_replay_record_t));
489 return (ba.err);
490 }
491
492 kmem_free(drr, sizeof (dmu_replay_record_t));
493
494 return (0);
495 }
496
497 struct recvbeginsyncarg {
498 const char *tofs;
499 const char *tosnap;
500 dsl_dataset_t *origin;
501 uint64_t fromguid;
502 dmu_objset_type_t type;
503 void *tag;
504 boolean_t force;
505 uint64_t dsflags;
506 char clonelastname[MAXNAMELEN];
507 dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */
508 cred_t *cr;
509 };
510
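/*
 * Sync-task check/sync functions for receiving into a dataset that does
 * not yet exist: verify the target name is free (and that any origin
 * snapshot is usable), then create the new dataset marked INCONSISTENT
 * and own it with dmu_recv_tag.
 */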
511 /* ARGSUSED */
512 static int
513 recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx)
514 {
515 dsl_dir_t *dd = arg1;
516 struct recvbeginsyncarg *rbsa = arg2;
517 objset_t *mos = dd->dd_pool->dp_meta_objset;
518 uint64_t val;
519 int err;
520
521 err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
522 strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val);
523
524 if (err != ENOENT)
525 return (err ? err : EEXIST);
526
527 if (rbsa->origin) {
528 /* make sure it's a snap in the same pool */
529 if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool)
530 return (EXDEV);
531 if (!dsl_dataset_is_snapshot(rbsa->origin))
532 return (EINVAL);
533 if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
534 return (ENODEV);
535 }
536
537 return (0);
538 }
539
540 static void
541 recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx)
542 {
543 dsl_dir_t *dd = arg1;
544 struct recvbeginsyncarg *rbsa = arg2;
545 uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
546 uint64_t dsobj;
547
548 /* Create and open new dataset. */
549 dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1,
550 rbsa->origin, flags, rbsa->cr, tx);
551 VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj,
552 B_TRUE, dmu_recv_tag, &rbsa->ds));
553
554 if (rbsa->origin == NULL) {
555 (void) dmu_objset_create_impl(dd->dd_pool->dp_spa,
556 rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx);
557 }
558
559 spa_history_log_internal(LOG_DS_REPLAY_FULL_SYNC,
560 dd->dd_pool->dp_spa, tx, "dataset = %lld", dsobj);
561 }
562
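/*
 * Sync-task check/sync functions for receiving into an existing
 * filesystem: verify that the stream's fromguid matches one of the
 * existing snapshots (and that the temporary clone and new snapshot
 * names are free), then create and own a temporary clone of the most
 * recent snapshot to receive into.
 */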
563 /* ARGSUSED */
564 static int
565 recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
566 {
567 dsl_dataset_t *ds = arg1;
568 struct recvbeginsyncarg *rbsa = arg2;
569 int err;
570 uint64_t val;
571
572 /* must not have any changes since most recent snapshot */
573 if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds))
574 return (ETXTBSY);
575
576 if (rbsa->fromguid) {
577 /* if incremental, most recent snapshot must match fromguid */
578 if (ds->ds_prev == NULL)
579 return (ENODEV);
580
581 /*
582 * most recent snapshot must match fromguid, or there are no
583 * changes since the fromguid one
584 */
585 if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) {
586 uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth;
587 uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj;
588 while (obj != 0) {
589 dsl_dataset_t *snap;
590 err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
591 obj, FTAG, &snap);
592 if (err)
593 return (ENODEV);
594 if (snap->ds_phys->ds_creation_txg < birth) {
595 dsl_dataset_rele(snap, FTAG);
596 return (ENODEV);
597 }
598 if (snap->ds_phys->ds_guid == rbsa->fromguid) {
599 dsl_dataset_rele(snap, FTAG);
600 break; /* it's ok */
601 }
602 obj = snap->ds_phys->ds_prev_snap_obj;
603 dsl_dataset_rele(snap, FTAG);
604 }
605 if (obj == 0)
606 return (ENODEV);
607 }
608 } else {
609 /* if full, most recent snapshot must be $ORIGIN */
610 if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL)
611 return (ENODEV);
612 }
613
614 /* temporary clone name must not exist */
615 err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
616 ds->ds_dir->dd_phys->dd_child_dir_zapobj,
617 rbsa->clonelastname, 8, 1, &val);
618 if (err == 0)
619 return (EEXIST);
620 if (err != ENOENT)
621 return (err);
622
623 /* new snapshot name must not exist */
624 err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
625 ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
626 if (err == 0)
627 return (EEXIST);
628 if (err != ENOENT)
629 return (err);
630 return (0);
631 }
632
633 /* ARGSUSED */
634 static void
635 recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx)
636 {
637 dsl_dataset_t *ohds = arg1;
638 struct recvbeginsyncarg *rbsa = arg2;
639 dsl_pool_t *dp = ohds->ds_dir->dd_pool;
640 dsl_dataset_t *cds;
641 uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
642 uint64_t dsobj;
643
644 /* create and open the temporary clone */
645 dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname,
646 ohds->ds_prev, flags, rbsa->cr, tx);
647 VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds));
648
649 /*
650 * If we actually created a non-clone, we need to create the
651 * objset in our new dataset.
652 */
653 if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) {
654 (void) dmu_objset_create_impl(dp->dp_spa,
655 cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx);
656 }
657
658 rbsa->ds = cds;
659
660 spa_history_log_internal(LOG_DS_REPLAY_INC_SYNC,
661 dp->dp_spa, tx, "dataset = %lld", dsobj);
662 }
663
664
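/*
 * Return B_TRUE if the stream requires a feature (currently only
 * SA_SPILL) that the destination pool's version does not support.
 */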
665 static boolean_t
666 dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb)
667 {
668 int featureflags;
669
670 featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
671
672 /* Verify pool version supports SA if SA_SPILL feature set */
673 return ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
674 (spa_version(dsl_dataset_get_spa(ds)) < SPA_VERSION_SA));
675 }
676
677 /*
678 * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
679 * succeeds; otherwise we will leak the holds on the datasets.
680 */
681 int
682 dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb,
683 boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc)
684 {
685 int err = 0;
686 boolean_t byteswap;
687 struct recvbeginsyncarg rbsa = { 0 };
688 uint64_t versioninfo;
689 int flags;
690 dsl_dataset_t *ds;
691
692 if (drrb->drr_magic == DMU_BACKUP_MAGIC)
693 byteswap = FALSE;
694 else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
695 byteswap = TRUE;
696 else
697 return (EINVAL);
698
699 rbsa.tofs = tofs;
700 rbsa.tosnap = tosnap;
701 rbsa.origin = origin ? origin->os_dsl_dataset : NULL;
702 rbsa.fromguid = drrb->drr_fromguid;
703 rbsa.type = drrb->drr_type;
704 rbsa.tag = FTAG;
705 rbsa.dsflags = 0;
706 rbsa.cr = CRED();
707 versioninfo = drrb->drr_versioninfo;
708 flags = drrb->drr_flags;
709
710 if (byteswap) {
711 rbsa.type = BSWAP_32(rbsa.type);
712 rbsa.fromguid = BSWAP_64(rbsa.fromguid);
713 versioninfo = BSWAP_64(versioninfo);
714 flags = BSWAP_32(flags);
715 }
716
717 if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM ||
718 rbsa.type >= DMU_OST_NUMTYPES ||
719 ((flags & DRR_FLAG_CLONE) && origin == NULL))
720 return (EINVAL);
721
722 if (flags & DRR_FLAG_CI_DATA)
723 rbsa.dsflags = DS_FLAG_CI_DATASET;
724
725 bzero(drc, sizeof (dmu_recv_cookie_t));
726 drc->drc_drrb = drrb;
727 drc->drc_tosnap = tosnap;
728 drc->drc_top_ds = top_ds;
729 drc->drc_force = force;
730
731 /*
732 * Process the begin in syncing context.
733 */
734
735 /* open the dataset we are logically receiving into */
736 err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds);
737 if (err == 0) {
738 if (dmu_recv_verify_features(ds, drrb)) {
739 dsl_dataset_rele(ds, dmu_recv_tag);
740 return (ENOTSUP);
741 }
742 /* target fs already exists; recv into temp clone */
743
744 /* Can't recv a clone into an existing fs */
745 if (flags & DRR_FLAG_CLONE) {
746 dsl_dataset_rele(ds, dmu_recv_tag);
747 return (EINVAL);
748 }
749
750 /* must not have an incremental recv already in progress */
751 if (!mutex_tryenter(&ds->ds_recvlock)) {
752 dsl_dataset_rele(ds, dmu_recv_tag);
753 return (EBUSY);
754 }
755
756 /* tmp clone name is: tofs/%tosnap */
757 (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname),
758 "%%%s", tosnap);
759 rbsa.force = force;
760 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
761 recv_existing_check, recv_existing_sync, ds, &rbsa, 5);
762 if (err) {
763 mutex_exit(&ds->ds_recvlock);
764 dsl_dataset_rele(ds, dmu_recv_tag);
765 return (err);
766 }
767 drc->drc_logical_ds = ds;
768 drc->drc_real_ds = rbsa.ds;
769 } else if (err == ENOENT) {
770 /* target fs does not exist; must be a full backup or clone */
771 char *cp;
772
773 /*
774 * If it's a non-clone incremental, we are missing the
775 * target fs, so fail the recv.
776 */
777 if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE))
778 return (ENOENT);
779
780 /* Open the parent of tofs */
781 cp = strrchr(tofs, '/');
782 *cp = '\0';
783 err = dsl_dataset_hold(tofs, FTAG, &ds);
784 *cp = '/';
785 if (err)
786 return (err);
787
788 if (dmu_recv_verify_features(ds, drrb)) {
789 dsl_dataset_rele(ds, FTAG);
790 return (ENOTSUP);
791 }
792
793 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
794 recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5);
795 dsl_dataset_rele(ds, FTAG);
796 if (err)
797 return (err);
798 drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds;
799 drc->drc_newfs = B_TRUE;
800 }
801
802 return (err);
803 }
804
805 struct restorearg {
806 int err;
807 int byteswap;
808 vnode_t *vp;
809 char *buf;
810 uint64_t voff;
811 int bufsize; /* amount of memory allocated for buf */
812 zio_cksum_t cksum;
813 avl_tree_t guid_to_ds_map;
814 };
815
816 typedef struct guid_map_entry {
817 uint64_t guid;
818 dsl_dataset_t *gme_ds;
819 avl_node_t avlnode;
820 } guid_map_entry_t;
821
822 static int
823 guid_compare(const void *arg1, const void *arg2)
824 {
825 const guid_map_entry_t *gmep1 = arg1;
826 const guid_map_entry_t *gmep2 = arg2;
827
828 if (gmep1->guid < gmep2->guid)
829 return (-1);
830 else if (gmep1->guid > gmep2->guid)
831 return (1);
832 return (0);
833 }
834
835 /*
836 * This function is a callback used by dmu_objset_find() (which
837 * enumerates the object sets) to build an avl tree that maps guids
838 * to datasets. The resulting table is used when processing DRR_WRITE_BYREF
839 * send stream records. These records, which are used in dedup'ed
840 * streams, do not contain data themselves, but refer to a copy
841 * of the data block that has already been written because it was
842 * earlier in the stream. That previous copy is identified by the
843 * guid of the dataset with the referenced data.
844 */
845 int
846 find_ds_by_guid(const char *name, void *arg)
847 {
848 avl_tree_t *guid_map = arg;
849 dsl_dataset_t *ds, *snapds;
850 guid_map_entry_t *gmep;
851 dsl_pool_t *dp;
852 int err;
853 uint64_t lastobj, firstobj;
854
855 if (dsl_dataset_hold(name, FTAG, &ds) != 0)
856 return (0);
857
858 dp = ds->ds_dir->dd_pool;
859 rw_enter(&dp->dp_config_rwlock, RW_READER);
860 firstobj = ds->ds_dir->dd_phys->dd_origin_obj;
861 lastobj = ds->ds_phys->ds_prev_snap_obj;
862
863 while (lastobj != firstobj) {
864 err = dsl_dataset_hold_obj(dp, lastobj, guid_map, &snapds);
865 if (err) {
866 /*
867 * Skip this snapshot and move on. It's not
868 * clear why this would ever happen, but the
869 * remainder of the snapshot stream can be
870 * processed.
871 */
872 rw_exit(&dp->dp_config_rwlock);
873 dsl_dataset_rele(ds, FTAG);
874 return (0);
875 }
876
877 gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP);
878 gmep->guid = snapds->ds_phys->ds_guid;
879 gmep->gme_ds = snapds;
880 avl_add(guid_map, gmep);
881 lastobj = snapds->ds_phys->ds_prev_snap_obj;
882 }
883
884 rw_exit(&dp->dp_config_rwlock);
885 dsl_dataset_rele(ds, FTAG);
886
887 return (0);
888 }
889
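/*
 * Read "len" bytes from the stream into ra->buf, folding them into the
 * running checksum.  Returns a pointer to the buffer, or NULL (with
 * ra->err set) on a short read or error.  "len" must be a multiple of 8.
 */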
890 static void *
891 restore_read(struct restorearg *ra, int len)
892 {
893 void *rv;
894 int done = 0;
895
896 /* some things will require 8-byte alignment, so everything must */
897 ASSERT3U(len % 8, ==, 0);
898
899 while (done < len) {
900 ssize_t resid;
901
902 ra->err = vn_rdwr(UIO_READ, ra->vp,
903 (caddr_t)ra->buf + done, len - done,
904 ra->voff, UIO_SYSSPACE, FAPPEND,
905 RLIM64_INFINITY, CRED(), &resid);
906
907 if (resid == len - done)
908 ra->err = EINVAL;
909 ra->voff += len - done - resid;
910 done = len - resid;
911 if (ra->err)
912 return (NULL);
913 }
914
915 ASSERT3U(done, ==, len);
916 rv = ra->buf;
917 if (ra->byteswap)
918 fletcher_4_incremental_byteswap(rv, len, &ra->cksum);
919 else
920 fletcher_4_incremental_native(rv, len, &ra->cksum);
921 return (rv);
922 }
923
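/*
 * Byteswap a replay record header in place, for streams generated on a
 * host of the opposite endianness.
 */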
924 static void
925 backup_byteswap(dmu_replay_record_t *drr)
926 {
927 #define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
928 #define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
929 drr->drr_type = BSWAP_32(drr->drr_type);
930 drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
931 switch (drr->drr_type) {
932 case DRR_BEGIN:
933 DO64(drr_begin.drr_magic);
934 DO64(drr_begin.drr_versioninfo);
935 DO64(drr_begin.drr_creation_time);
936 DO32(drr_begin.drr_type);
937 DO32(drr_begin.drr_flags);
938 DO64(drr_begin.drr_toguid);
939 DO64(drr_begin.drr_fromguid);
940 break;
941 case DRR_OBJECT:
942 DO64(drr_object.drr_object);
943 /* DO64(drr_object.drr_allocation_txg); */
944 DO32(drr_object.drr_type);
945 DO32(drr_object.drr_bonustype);
946 DO32(drr_object.drr_blksz);
947 DO32(drr_object.drr_bonuslen);
948 DO64(drr_object.drr_toguid);
949 break;
950 case DRR_FREEOBJECTS:
951 DO64(drr_freeobjects.drr_firstobj);
952 DO64(drr_freeobjects.drr_numobjs);
953 DO64(drr_freeobjects.drr_toguid);
954 break;
955 case DRR_WRITE:
956 DO64(drr_write.drr_object);
957 DO32(drr_write.drr_type);
958 DO64(drr_write.drr_offset);
959 DO64(drr_write.drr_length);
960 DO64(drr_write.drr_toguid);
961 DO64(drr_write.drr_key.ddk_cksum.zc_word[0]);
962 DO64(drr_write.drr_key.ddk_cksum.zc_word[1]);
963 DO64(drr_write.drr_key.ddk_cksum.zc_word[2]);
964 DO64(drr_write.drr_key.ddk_cksum.zc_word[3]);
965 DO64(drr_write.drr_key.ddk_prop);
966 break;
967 case DRR_WRITE_BYREF:
968 DO64(drr_write_byref.drr_object);
969 DO64(drr_write_byref.drr_offset);
970 DO64(drr_write_byref.drr_length);
971 DO64(drr_write_byref.drr_toguid);
972 DO64(drr_write_byref.drr_refguid);
973 DO64(drr_write_byref.drr_refobject);
974 DO64(drr_write_byref.drr_refoffset);
975 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]);
976 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]);
977 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]);
978 DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
979 DO64(drr_write_byref.drr_key.ddk_prop);
980 break;
981 case DRR_FREE:
982 DO64(drr_free.drr_object);
983 DO64(drr_free.drr_offset);
984 DO64(drr_free.drr_length);
985 DO64(drr_free.drr_toguid);
986 break;
987 case DRR_SPILL:
988 DO64(drr_spill.drr_object);
989 DO64(drr_spill.drr_length);
990 DO64(drr_spill.drr_toguid);
991 break;
992 case DRR_END:
993 DO64(drr_end.drr_checksum.zc_word[0]);
994 DO64(drr_end.drr_checksum.zc_word[1]);
995 DO64(drr_end.drr_checksum.zc_word[2]);
996 DO64(drr_end.drr_checksum.zc_word[3]);
997 DO64(drr_end.drr_toguid);
998 break;
999 }
1000 #undef DO64
1001 #undef DO32
1002 }
1003
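/*
 * Handle a DRR_OBJECT record: validate it, allocate or reclaim the dnode,
 * apply its checksum/compression properties, and copy in the bonus buffer
 * (byteswapping it if the stream is from the opposite endianness).
 */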
1004 static int
1005 restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
1006 {
1007 int err;
1008 dmu_tx_t *tx;
1009 void *data = NULL;
1010
1011 if (drro->drr_type == DMU_OT_NONE ||
1012 drro->drr_type >= DMU_OT_NUMTYPES ||
1013 drro->drr_bonustype >= DMU_OT_NUMTYPES ||
1014 drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
1015 drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
1016 P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
1017 drro->drr_blksz < SPA_MINBLOCKSIZE ||
1018 drro->drr_blksz > SPA_MAXBLOCKSIZE ||
1019 drro->drr_bonuslen > DN_MAX_BONUSLEN) {
1020 return (EINVAL);
1021 }
1022
1023 err = dmu_object_info(os, drro->drr_object, NULL);
1024
1025 if (err != 0 && err != ENOENT)
1026 return (EINVAL);
1027
1028 if (drro->drr_bonuslen) {
1029 data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
1030 if (ra->err)
1031 return (ra->err);
1032 }
1033
1034 if (err == ENOENT) {
1035 /* currently free, want to be allocated */
1036 tx = dmu_tx_create(os);
1037 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1038 err = dmu_tx_assign(tx, TXG_WAIT);
1039 if (err) {
1040 dmu_tx_abort(tx);
1041 return (err);
1042 }
1043 err = dmu_object_claim(os, drro->drr_object,
1044 drro->drr_type, drro->drr_blksz,
1045 drro->drr_bonustype, drro->drr_bonuslen, tx);
1046 dmu_tx_commit(tx);
1047 } else {
1048 /* currently allocated, want to be allocated */
1049 err = dmu_object_reclaim(os, drro->drr_object,
1050 drro->drr_type, drro->drr_blksz,
1051 drro->drr_bonustype, drro->drr_bonuslen);
1052 }
1053 if (err) {
1054 return (EINVAL);
1055 }
1056
1057 tx = dmu_tx_create(os);
1058 dmu_tx_hold_bonus(tx, drro->drr_object);
1059 err = dmu_tx_assign(tx, TXG_WAIT);
1060 if (err) {
1061 dmu_tx_abort(tx);
1062 return (err);
1063 }
1064
1065 dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype,
1066 tx);
1067 dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);
1068
1069 if (data != NULL) {
1070 dmu_buf_t *db;
1071
1072 VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
1073 dmu_buf_will_dirty(db, tx);
1074
1075 ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
1076 bcopy(data, db->db_data, drro->drr_bonuslen);
1077 if (ra->byteswap) {
1078 dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
1079 drro->drr_bonuslen);
1080 }
1081 dmu_buf_rele(db, FTAG);
1082 }
1083 dmu_tx_commit(tx);
1084 return (0);
1085 }
1086
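/*
 * Handle a DRR_FREEOBJECTS record: free each allocated object in the
 * range [drr_firstobj, drr_firstobj + drr_numobjs).
 */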
1087 /* ARGSUSED */
1088 static int
1089 restore_freeobjects(struct restorearg *ra, objset_t *os,
1090 struct drr_freeobjects *drrfo)
1091 {
1092 uint64_t obj;
1093
1094 if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
1095 return (EINVAL);
1096
1097 for (obj = drrfo->drr_firstobj;
1098 obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
1099 (void) dmu_object_next(os, &obj, FALSE, 0)) {
1100 int err;
1101
1102 if (dmu_object_info(os, obj, NULL) != 0)
1103 continue;
1104
1105 err = dmu_free_object(os, obj);
1106 if (err)
1107 return (err);
1108 }
1109 return (0);
1110 }
1111
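/*
 * Handle a DRR_WRITE record: read the block data from the stream and
 * write it to the object at the given offset.
 */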
1112 static int
1113 restore_write(struct restorearg *ra, objset_t *os,
1114 struct drr_write *drrw)
1115 {
1116 dmu_tx_t *tx;
1117 void *data;
1118 int err;
1119
1120 if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
1121 drrw->drr_type >= DMU_OT_NUMTYPES)
1122 return (EINVAL);
1123
1124 data = restore_read(ra, drrw->drr_length);
1125 if (data == NULL)
1126 return (ra->err);
1127
1128 if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
1129 return (EINVAL);
1130
1131 tx = dmu_tx_create(os);
1132
1133 dmu_tx_hold_write(tx, drrw->drr_object,
1134 drrw->drr_offset, drrw->drr_length);
1135 err = dmu_tx_assign(tx, TXG_WAIT);
1136 if (err) {
1137 dmu_tx_abort(tx);
1138 return (err);
1139 }
1140 if (ra->byteswap)
1141 dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length);
1142 dmu_write(os, drrw->drr_object,
1143 drrw->drr_offset, drrw->drr_length, data, tx);
1144 dmu_tx_commit(tx);
1145 return (0);
1146 }
1147
1148 /*
1149 * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed
1150 * streams to refer to a copy of the data that is already on the
1151 * system because it came in earlier in the stream. This function
1152 * finds the earlier copy of the data, and uses that copy instead of
1153 * data from the stream to fulfill this write.
1154 */
1155 static int
1156 restore_write_byref(struct restorearg *ra, objset_t *os,
1157 struct drr_write_byref *drrwbr)
1158 {
1159 dmu_tx_t *tx;
1160 int err;
1161 guid_map_entry_t gmesrch;
1162 guid_map_entry_t *gmep;
1163 avl_index_t where;
1164 objset_t *ref_os = NULL;
1165 dmu_buf_t *dbp;
1166
1167 if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
1168 return (EINVAL);
1169
1170 /*
1171 * If the GUID of the referenced dataset is different from the
1172 * GUID of the target dataset, find the referenced dataset.
1173 */
1174 if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
1175 gmesrch.guid = drrwbr->drr_refguid;
1176 if ((gmep = avl_find(&ra->guid_to_ds_map, &gmesrch,
1177 &where)) == NULL) {
1178 return (EINVAL);
1179 }
1180 if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
1181 return (EINVAL);
1182 } else {
1183 ref_os = os;
1184 }
1185
1186 if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
1187 drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH))
1188 return (err);
1189
1190 tx = dmu_tx_create(os);
1191
1192 dmu_tx_hold_write(tx, drrwbr->drr_object,
1193 drrwbr->drr_offset, drrwbr->drr_length);
1194 err = dmu_tx_assign(tx, TXG_WAIT);
1195 if (err) {
1196 dmu_tx_abort(tx);
1197 return (err);
1198 }
1199 dmu_write(os, drrwbr->drr_object,
1200 drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
1201 dmu_buf_rele(dbp, FTAG);
1202 dmu_tx_commit(tx);
1203 return (0);
1204 }
1205
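/*
 * Handle a DRR_SPILL record: read the spill block data from the stream,
 * grow the object's spill block if necessary, and copy the data in.
 */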
1206 static int
1207 restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
1208 {
1209 dmu_tx_t *tx;
1210 void *data;
1211 dmu_buf_t *db, *db_spill;
1212 int err;
1213
1214 if (drrs->drr_length < SPA_MINBLOCKSIZE ||
1215 drrs->drr_length > SPA_MAXBLOCKSIZE)
1216 return (EINVAL);
1217
1218 data = restore_read(ra, drrs->drr_length);
1219 if (data == NULL)
1220 return (ra->err);
1221
1222 if (dmu_object_info(os, drrs->drr_object, NULL) != 0)
1223 return (EINVAL);
1224
1225 VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db));
1226 if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
1227 dmu_buf_rele(db, FTAG);
1228 return (err);
1229 }
1230
1231 tx = dmu_tx_create(os);
1232
1233 dmu_tx_hold_spill(tx, db->db_object);
1234
1235 err = dmu_tx_assign(tx, TXG_WAIT);
1236 if (err) {
1237 dmu_buf_rele(db, FTAG);
1238 dmu_buf_rele(db_spill, FTAG);
1239 dmu_tx_abort(tx);
1240 return (err);
1241 }
1242 dmu_buf_will_dirty(db_spill, tx);
1243
1244 if (db_spill->db_size < drrs->drr_length)
1245 VERIFY(0 == dbuf_spill_set_blksz(db_spill,
1246 drrs->drr_length, tx));
1247 bcopy(data, db_spill->db_data, drrs->drr_length);
1248
1249 dmu_buf_rele(db, FTAG);
1250 dmu_buf_rele(db_spill, FTAG);
1251
1252 dmu_tx_commit(tx);
1253 return (0);
1254 }
1255
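/*
 * Handle a DRR_FREE record: free the given range of the object
 * (a length of -1 frees to the end of the object).
 */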
1256 /* ARGSUSED */
1257 static int
1258 restore_free(struct restorearg *ra, objset_t *os,
1259 struct drr_free *drrf)
1260 {
1261 int err;
1262
1263 if (drrf->drr_length != -1ULL &&
1264 drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
1265 return (EINVAL);
1266
1267 if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
1268 return (EINVAL);
1269
1270 err = dmu_free_long_range(os, drrf->drr_object,
1271 drrf->drr_offset, drrf->drr_length);
1272 return (err);
1273 }
1274
1275 /*
1276 * NB: callers *must* call dmu_recv_end() if this succeeds.
1277 */
1278 int
1279 dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp)
1280 {
1281 struct restorearg ra = { 0 };
1282 dmu_replay_record_t *drr;
1283 objset_t *os;
1284 zio_cksum_t pcksum;
1285 guid_map_entry_t *gmep;
1286 int featureflags;
1287
1288 if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
1289 ra.byteswap = TRUE;
1290
1291 {
1292 /* compute checksum of drr_begin record */
1293 dmu_replay_record_t *drr;
1294 drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
1295
1296 drr->drr_type = DRR_BEGIN;
1297 drr->drr_u.drr_begin = *drc->drc_drrb;
1298 if (ra.byteswap) {
1299 fletcher_4_incremental_byteswap(drr,
1300 sizeof (dmu_replay_record_t), &ra.cksum);
1301 } else {
1302 fletcher_4_incremental_native(drr,
1303 sizeof (dmu_replay_record_t), &ra.cksum);
1304 }
1305 kmem_free(drr, sizeof (dmu_replay_record_t));
1306 }
1307
1308 if (ra.byteswap) {
1309 struct drr_begin *drrb = drc->drc_drrb;
1310 drrb->drr_magic = BSWAP_64(drrb->drr_magic);
1311 drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
1312 drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
1313 drrb->drr_type = BSWAP_32(drrb->drr_type);
1314 drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
1315 drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
1316 }
1317
1318 ra.vp = vp;
1319 ra.voff = *voffp;
1320 ra.bufsize = 1<<20;
1321 ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
1322
1323 /* these were verified in dmu_recv_begin */
1324 ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) ==
1325 DMU_SUBSTREAM);
1326 ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES);
1327
1328 /*
1329 * Open the objset we are modifying.
1330 */
1331 VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0);
1332
1333 ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);
1334
1335 featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
1336
1337 /* if this stream is dedup'ed, set up the avl tree for guid mapping */
1338 if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
1339 avl_create(&ra.guid_to_ds_map, guid_compare,
1340 sizeof (guid_map_entry_t),
1341 offsetof(guid_map_entry_t, avlnode));
1342 (void) dmu_objset_find(drc->drc_top_ds, find_ds_by_guid,
1343 (void *)&ra.guid_to_ds_map,
1344 DS_FIND_CHILDREN);
1345 }
1346
1347 /*
1348 * Read records and process them.
1349 */
1350 pcksum = ra.cksum;
1351 while (ra.err == 0 &&
1352 NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
1353 if (issig(JUSTLOOKING) && issig(FORREAL)) {
1354 ra.err = EINTR;
1355 goto out;
1356 }
1357
1358 if (ra.byteswap)
1359 backup_byteswap(drr);
1360
1361 switch (drr->drr_type) {
1362 case DRR_OBJECT:
1363 {
1364 /*
1365 * We need to make a copy of the record header,
1366 * because restore_{object,write} may need to
1367 * restore_read(), which will invalidate drr.
1368 */
1369 struct drr_object drro = drr->drr_u.drr_object;
1370 ra.err = restore_object(&ra, os, &drro);
1371 break;
1372 }
1373 case DRR_FREEOBJECTS:
1374 {
1375 struct drr_freeobjects drrfo =
1376 drr->drr_u.drr_freeobjects;
1377 ra.err = restore_freeobjects(&ra, os, &drrfo);
1378 break;
1379 }
1380 case DRR_WRITE:
1381 {
1382 struct drr_write drrw = drr->drr_u.drr_write;
1383 ra.err = restore_write(&ra, os, &drrw);
1384 break;
1385 }
1386 case DRR_WRITE_BYREF:
1387 {
1388 struct drr_write_byref drrwbr =
1389 drr->drr_u.drr_write_byref;
1390 ra.err = restore_write_byref(&ra, os, &drrwbr);
1391 break;
1392 }
1393 case DRR_FREE:
1394 {
1395 struct drr_free drrf = drr->drr_u.drr_free;
1396 ra.err = restore_free(&ra, os, &drrf);
1397 break;
1398 }
1399 case DRR_END:
1400 {
1401 struct drr_end drre = drr->drr_u.drr_end;
1402 /*
1403 * We compare against the *previous* checksum
1404 * value, because the stored checksum is of
1405 * everything before the DRR_END record.
1406 */
1407 if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum))
1408 ra.err = ECKSUM;
1409 goto out;
1410 }
1411 case DRR_SPILL:
1412 {
1413 struct drr_spill drrs = drr->drr_u.drr_spill;
1414 ra.err = restore_spill(&ra, os, &drrs);
1415 break;
1416 }
1417 default:
1418 ra.err = EINVAL;
1419 goto out;
1420 }
1421 pcksum = ra.cksum;
1422 }
1423 ASSERT(ra.err != 0);
1424
1425 out:
1426 if (ra.err != 0) {
1427 /*
1428 * destroy what we created, so we don't leave it in the
1429 * inconsistent restoring state.
1430 */
1431 txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0);
1432
1433 (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
1434 B_FALSE);
1435 if (drc->drc_real_ds != drc->drc_logical_ds) {
1436 mutex_exit(&drc->drc_logical_ds->ds_recvlock);
1437 dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag);
1438 }
1439 }
1440
1441 if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
1442 void *cookie = NULL;
1443
1444 while (gmep = avl_destroy_nodes(&ra.guid_to_ds_map, &cookie)) {
1445 dsl_dataset_rele(gmep->gme_ds, &ra.guid_to_ds_map);
1446 kmem_free(gmep, sizeof (guid_map_entry_t));
1447 }
1448 avl_destroy(&ra.guid_to_ds_map);
1449 }
1450
1451 kmem_free(ra.buf, ra.bufsize);
1452 *voffp = ra.voff;
1453 return (ra.err);
1454 }
1455
1456 struct recvendsyncarg {
1457 char *tosnap;
1458 uint64_t creation_time;
1459 uint64_t toguid;
1460 };
1461
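/*
 * Sync-task check/sync functions that turn the received contents into the
 * new snapshot: take the snapshot, then stamp it with the stream's
 * creation time and guid and clear the INCONSISTENT flag.
 */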
1462 static int
1463 recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
1464 {
1465 dsl_dataset_t *ds = arg1;
1466 struct recvendsyncarg *resa = arg2;
1467
1468 return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx));
1469 }
1470
1471 static void
1472 recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1473 {
1474 dsl_dataset_t *ds = arg1;
1475 struct recvendsyncarg *resa = arg2;
1476
1477 dsl_dataset_snapshot_sync(ds, resa->tosnap, tx);
1478
1479 /* set snapshot's creation time and guid */
1480 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
1481 ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time;
1482 ds->ds_prev->ds_phys->ds_guid = resa->toguid;
1483 ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
1484
1485 dmu_buf_will_dirty(ds->ds_dbuf, tx);
1486 ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
1487 }
1488
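/*
 * Finish a receive into an existing filesystem: swap the contents of the
 * temporary clone into the target dataset, snapshot the result, and
 * destroy the temporary clone.
 */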
1489 static int
1490 dmu_recv_existing_end(dmu_recv_cookie_t *drc)
1491 {
1492 struct recvendsyncarg resa;
1493 dsl_dataset_t *ds = drc->drc_logical_ds;
1494 int err;
1495
1496 /*
1497 * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
1498 * expects it to have a ds_user_ptr (and zil), but clone_swap()
1499 * can close it.
1500 */
1501 txg_wait_synced(ds->ds_dir->dd_pool, 0);
1502
1503 if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) {
1504 err = dsl_dataset_clone_swap(drc->drc_real_ds, ds,
1505 drc->drc_force);
1506 if (err)
1507 goto out;
1508 } else {
1509 mutex_exit(&ds->ds_recvlock);
1510 dsl_dataset_rele(ds, dmu_recv_tag);
1511 (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
1512 B_FALSE);
1513 return (EBUSY);
1514 }
1515
1516 resa.creation_time = drc->drc_drrb->drr_creation_time;
1517 resa.toguid = drc->drc_drrb->drr_toguid;
1518 resa.tosnap = drc->drc_tosnap;
1519
1520 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
1521 recv_end_check, recv_end_sync, ds, &resa, 3);
1522 if (err) {
1523 /* swap back */
1524 (void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE);
1525 }
1526
1527 out:
1528 mutex_exit(&ds->ds_recvlock);
1529 dsl_dataset_disown(ds, dmu_recv_tag);
1530 (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE);
1531 return (err);
1532 }
1533
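/*
 * Finish a receive into a newly created filesystem: snapshot what was
 * received, or destroy it if taking the snapshot fails.
 */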
1534 static int
1535 dmu_recv_new_end(dmu_recv_cookie_t *drc)
1536 {
1537 struct recvendsyncarg resa;
1538 dsl_dataset_t *ds = drc->drc_logical_ds;
1539 int err;
1540
1541 /*
1542 * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
1543 * expects it to have a ds_user_ptr (and zil), but clone_swap()
1544 * can close it.
1545 */
1546 txg_wait_synced(ds->ds_dir->dd_pool, 0);
1547
1548 resa.creation_time = drc->drc_drrb->drr_creation_time;
1549 resa.toguid = drc->drc_drrb->drr_toguid;
1550 resa.tosnap = drc->drc_tosnap;
1551
1552 err = dsl_sync_task_do(ds->ds_dir->dd_pool,
1553 recv_end_check, recv_end_sync, ds, &resa, 3);
1554 if (err) {
1555 /* clean up the fs we just recv'd into */
1556 (void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE);
1557 } else {
1558 /* release the hold from dmu_recv_begin */
1559 dsl_dataset_disown(ds, dmu_recv_tag);
1560 }
1561 return (err);
1562 }
1563
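/*
 * Complete the receive begun by dmu_recv_begin(), committing the received
 * data as a new snapshot.
 */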
1564 int
1565 dmu_recv_end(dmu_recv_cookie_t *drc)
1566 {
1567 if (drc->drc_logical_ds != drc->drc_real_ds)
1568 return (dmu_recv_existing_end(drc));
1569 else
1570 return (dmu_recv_new_end(drc));
1571 }