git.proxmox.com Git - mirror_zfs.git/blob - module/zfs/dmu_recv.c
Always refuse receiving non-resume stream when resume state exists
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
24 * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
25 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
26 * Copyright 2014 HybridCluster. All rights reserved.
27 * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
28 */
29
30 #include <sys/dmu.h>
31 #include <sys/dmu_impl.h>
32 #include <sys/dmu_send.h>
33 #include <sys/dmu_recv.h>
34 #include <sys/dmu_tx.h>
35 #include <sys/dbuf.h>
36 #include <sys/dnode.h>
37 #include <sys/zfs_context.h>
38 #include <sys/dmu_objset.h>
39 #include <sys/dmu_traverse.h>
40 #include <sys/dsl_dataset.h>
41 #include <sys/dsl_dir.h>
42 #include <sys/dsl_prop.h>
43 #include <sys/dsl_pool.h>
44 #include <sys/dsl_synctask.h>
45 #include <sys/zfs_ioctl.h>
46 #include <sys/zap.h>
47 #include <sys/zvol.h>
48 #include <sys/zio_checksum.h>
49 #include <sys/zfs_znode.h>
50 #include <zfs_fletcher.h>
51 #include <sys/avl.h>
52 #include <sys/ddt.h>
53 #include <sys/zfs_onexit.h>
54 #include <sys/dmu_send.h>
55 #include <sys/dsl_destroy.h>
56 #include <sys/blkptr.h>
57 #include <sys/dsl_bookmark.h>
58 #include <sys/zfeature.h>
59 #include <sys/bqueue.h>
60 #include <sys/objlist.h>
61 #ifdef _KERNEL
62 #include <sys/zfs_vfsops.h>
63 #endif
64
65 int zfs_recv_queue_length = SPA_MAXBLOCKSIZE;
66 int zfs_recv_queue_ff = 20;
67
68 static char *dmu_recv_tag = "dmu_recv_tag";
69 const char *recv_clone_name = "%recv";
70
71 static int receive_read_payload_and_next_header(dmu_recv_cookie_t *ra, int len,
72 void *buf);
73
74 struct receive_record_arg {
75 dmu_replay_record_t header;
76 void *payload; /* Pointer to a buffer containing the payload */
77 /*
78 * If the record is a write, pointer to the arc_buf_t containing the
79 * payload.
80 */
81 arc_buf_t *arc_buf;
82 int payload_size;
83 uint64_t bytes_read; /* bytes read from stream when record created */
84 boolean_t eos_marker; /* Marks the end of the stream */
85 bqueue_node_t node;
86 };
87
88 struct receive_writer_arg {
89 objset_t *os;
90 boolean_t byteswap;
91 bqueue_t q;
92
93 /*
94 * These three args are used to signal to the main thread that we're
95 * done.
96 */
97 kmutex_t mutex;
98 kcondvar_t cv;
99 boolean_t done;
100
101 int err;
102 /* A map from guid to dataset to help handle dedup'd streams. */
103 avl_tree_t *guid_to_ds_map;
104 boolean_t resumable;
105 boolean_t raw; /* DMU_BACKUP_FEATURE_RAW set */
106 boolean_t spill; /* DRR_FLAG_SPILL_BLOCK set */
107 uint64_t last_object;
108 uint64_t last_offset;
109 uint64_t max_object; /* highest object ID referenced in stream */
110 uint64_t bytes_read; /* bytes read when current record created */
111
112 /* Encryption parameters for the last received DRR_OBJECT_RANGE */
113 boolean_t or_crypt_params_present;
114 uint64_t or_firstobj;
115 uint64_t or_numslots;
116 uint8_t or_salt[ZIO_DATA_SALT_LEN];
117 uint8_t or_iv[ZIO_DATA_IV_LEN];
118 uint8_t or_mac[ZIO_DATA_MAC_LEN];
119 boolean_t or_byteorder;
120 };
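/*
 * The fields above implement a simple producer/consumer hand-off: the
 * stream-reading side enqueues receive_record_arg entries on q, and a
 * separate writer thread (defined later in this file, outside this
 * excerpt) dequeues and applies them, reporting completion through the
 * mutex/cv/done trio.
 */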
121
122 typedef struct guid_map_entry {
123 uint64_t guid;
124 boolean_t raw;
125 dsl_dataset_t *gme_ds;
126 avl_node_t avlnode;
127 } guid_map_entry_t;
128
129 typedef struct dmu_recv_begin_arg {
130 const char *drba_origin;
131 dmu_recv_cookie_t *drba_cookie;
132 cred_t *drba_cred;
133 dsl_crypto_params_t *drba_dcp;
134 } dmu_recv_begin_arg_t;
135
136 static void
137 byteswap_record(dmu_replay_record_t *drr)
138 {
139 #define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
140 #define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
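/*
 * For example, DO64(drr_begin.drr_magic) expands to
 * drr->drr_u.drr_begin.drr_magic =
 *     BSWAP_64(drr->drr_u.drr_begin.drr_magic);
 * i.e. each macro byte-swaps one drr_u member in place.
 */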
141 drr->drr_type = BSWAP_32(drr->drr_type);
142 drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
143
144 switch (drr->drr_type) {
145 case DRR_BEGIN:
146 DO64(drr_begin.drr_magic);
147 DO64(drr_begin.drr_versioninfo);
148 DO64(drr_begin.drr_creation_time);
149 DO32(drr_begin.drr_type);
150 DO32(drr_begin.drr_flags);
151 DO64(drr_begin.drr_toguid);
152 DO64(drr_begin.drr_fromguid);
153 break;
154 case DRR_OBJECT:
155 DO64(drr_object.drr_object);
156 DO32(drr_object.drr_type);
157 DO32(drr_object.drr_bonustype);
158 DO32(drr_object.drr_blksz);
159 DO32(drr_object.drr_bonuslen);
160 DO32(drr_object.drr_raw_bonuslen);
161 DO64(drr_object.drr_toguid);
162 DO64(drr_object.drr_maxblkid);
163 break;
164 case DRR_FREEOBJECTS:
165 DO64(drr_freeobjects.drr_firstobj);
166 DO64(drr_freeobjects.drr_numobjs);
167 DO64(drr_freeobjects.drr_toguid);
168 break;
169 case DRR_WRITE:
170 DO64(drr_write.drr_object);
171 DO32(drr_write.drr_type);
172 DO64(drr_write.drr_offset);
173 DO64(drr_write.drr_logical_size);
174 DO64(drr_write.drr_toguid);
175 ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum);
176 DO64(drr_write.drr_key.ddk_prop);
177 DO64(drr_write.drr_compressed_size);
178 break;
179 case DRR_WRITE_BYREF:
180 DO64(drr_write_byref.drr_object);
181 DO64(drr_write_byref.drr_offset);
182 DO64(drr_write_byref.drr_length);
183 DO64(drr_write_byref.drr_toguid);
184 DO64(drr_write_byref.drr_refguid);
185 DO64(drr_write_byref.drr_refobject);
186 DO64(drr_write_byref.drr_refoffset);
187 ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref.
188 drr_key.ddk_cksum);
189 DO64(drr_write_byref.drr_key.ddk_prop);
190 break;
191 case DRR_WRITE_EMBEDDED:
192 DO64(drr_write_embedded.drr_object);
193 DO64(drr_write_embedded.drr_offset);
194 DO64(drr_write_embedded.drr_length);
195 DO64(drr_write_embedded.drr_toguid);
196 DO32(drr_write_embedded.drr_lsize);
197 DO32(drr_write_embedded.drr_psize);
198 break;
199 case DRR_FREE:
200 DO64(drr_free.drr_object);
201 DO64(drr_free.drr_offset);
202 DO64(drr_free.drr_length);
203 DO64(drr_free.drr_toguid);
204 break;
205 case DRR_SPILL:
206 DO64(drr_spill.drr_object);
207 DO64(drr_spill.drr_length);
208 DO64(drr_spill.drr_toguid);
209 DO64(drr_spill.drr_compressed_size);
210 DO32(drr_spill.drr_type);
211 break;
212 case DRR_OBJECT_RANGE:
213 DO64(drr_object_range.drr_firstobj);
214 DO64(drr_object_range.drr_numslots);
215 DO64(drr_object_range.drr_toguid);
216 break;
217 case DRR_REDACT:
218 DO64(drr_redact.drr_object);
219 DO64(drr_redact.drr_offset);
220 DO64(drr_redact.drr_length);
221 DO64(drr_redact.drr_toguid);
222 break;
223 case DRR_END:
224 DO64(drr_end.drr_toguid);
225 ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum);
226 break;
227 default:
228 break;
229 }
230
231 if (drr->drr_type != DRR_BEGIN) {
232 ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum);
233 }
234
235 #undef DO64
236 #undef DO32
237 }
238
239 static boolean_t
240 redact_snaps_contains(uint64_t *snaps, uint64_t num_snaps, uint64_t guid)
241 {
242 for (int i = 0; i < num_snaps; i++) {
243 if (snaps[i] == guid)
244 return (B_TRUE);
245 }
246 return (B_FALSE);
247 }
248
249 /*
250 * Check that the new stream we're trying to receive is redacted with respect to
251 * a subset of the snapshots that the origin was redacted with respect to. For
252 * the reasons behind this, see the man page on redacted zfs sends and receives.
253 */
254 static boolean_t
255 compatible_redact_snaps(uint64_t *origin_snaps, uint64_t origin_num_snaps,
256 uint64_t *redact_snaps, uint64_t num_redact_snaps)
257 {
258 /*
259 * Short circuit the comparison; if we are redacted with respect to
260 * more snapshots than the origin, we can't be redacted with respect
261 * to a subset.
262 */
263 if (num_redact_snaps > origin_num_snaps) {
264 return (B_FALSE);
265 }
266
267 for (int i = 0; i < num_redact_snaps; i++) {
268 if (!redact_snaps_contains(origin_snaps, origin_num_snaps,
269 redact_snaps[i])) {
270 return (B_FALSE);
271 }
272 }
273 return (B_TRUE);
274 }
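/*
 * For example, if the origin was redacted with respect to snapshot guids
 * {A, B, C}, a stream redacted with respect to {A, C} passes the subset
 * check above, while one redacted with respect to {A, D} (or to any
 * superset such as {A, B, C, D}) does not.
 */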
275
276 static boolean_t
277 redact_check(dmu_recv_begin_arg_t *drba, dsl_dataset_t *origin)
278 {
279 uint64_t *origin_snaps;
280 uint64_t origin_num_snaps;
281 dmu_recv_cookie_t *drc = drba->drba_cookie;
282 struct drr_begin *drrb = drc->drc_drrb;
283 int featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
284 int err = 0;
285 boolean_t ret = B_TRUE;
286 uint64_t *redact_snaps;
287 uint_t numredactsnaps;
288
289 /*
290 * If this is a full send stream, we're safe no matter what.
291 */
292 if (drrb->drr_fromguid == 0)
293 return (ret);
294
295 VERIFY(dsl_dataset_get_uint64_array_feature(origin,
296 SPA_FEATURE_REDACTED_DATASETS, &origin_num_snaps, &origin_snaps));
297
298 if (nvlist_lookup_uint64_array(drc->drc_begin_nvl,
299 BEGINNV_REDACT_FROM_SNAPS, &redact_snaps, &numredactsnaps) ==
300 0) {
301 /*
302 * If the send stream was sent from the redaction bookmark or
303 * the redacted version of the dataset, then we're safe. Verify
304 * that this is from a compatible redaction bookmark or
305 * redacted dataset.
306 */
307 if (!compatible_redact_snaps(origin_snaps, origin_num_snaps,
308 redact_snaps, numredactsnaps)) {
309 err = EINVAL;
310 }
311 } else if (featureflags & DMU_BACKUP_FEATURE_REDACTED) {
312 /*
313 * If the stream is redacted, it must be redacted with respect
314 * to a subset of what the origin is redacted with respect to.
315 * See case number 2 in the zfs man page section on redacted zfs
316 * send.
317 */
318 err = nvlist_lookup_uint64_array(drc->drc_begin_nvl,
319 BEGINNV_REDACT_SNAPS, &redact_snaps, &numredactsnaps);
320
321 if (err != 0 || !compatible_redact_snaps(origin_snaps,
322 origin_num_snaps, redact_snaps, numredactsnaps)) {
323 err = EINVAL;
324 }
325 } else if (!redact_snaps_contains(origin_snaps, origin_num_snaps,
326 drrb->drr_toguid)) {
327 /*
328 * If the stream isn't redacted but the origin is, this must be
329 * one of the snapshots the origin is redacted with respect to.
330 * See case number 1 in the zfs man page section on redacted zfs
331 * send.
332 */
333 err = EINVAL;
334 }
335
336 if (err != 0)
337 ret = B_FALSE;
338 return (ret);
339 }
340
341 static int
342 recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
343 uint64_t fromguid, uint64_t featureflags)
344 {
345 uint64_t val;
346 uint64_t children;
347 int error;
348 dsl_pool_t *dp = ds->ds_dir->dd_pool;
349 boolean_t encrypted = ds->ds_dir->dd_crypto_obj != 0;
350 boolean_t raw = (featureflags & DMU_BACKUP_FEATURE_RAW) != 0;
351 boolean_t embed = (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) != 0;
352
353 /* Temporary clone name must not exist. */
354 error = zap_lookup(dp->dp_meta_objset,
355 dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name,
356 8, 1, &val);
357 if (error != ENOENT)
358 return (error == 0 ? SET_ERROR(EBUSY) : error);
359
360 /* Resume state must not be set. */
361 if (dsl_dataset_has_resume_receive_state(ds))
362 return (SET_ERROR(EBUSY));
363
364 /* New snapshot name must not exist. */
365 error = zap_lookup(dp->dp_meta_objset,
366 dsl_dataset_phys(ds)->ds_snapnames_zapobj,
367 drba->drba_cookie->drc_tosnap, 8, 1, &val);
368 if (error != ENOENT)
369 return (error == 0 ? SET_ERROR(EEXIST) : error);
370
371 /* Must not have children if receiving a ZVOL. */
372 error = zap_count(dp->dp_meta_objset,
373 dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &children);
374 if (error != 0)
375 return (error);
376 if (drba->drba_cookie->drc_drrb->drr_type != DMU_OST_ZFS &&
377 children > 0)
378 return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
379
380 /*
381 * Check snapshot limit before receiving. We'll recheck again at the
382 * end, but might as well abort before receiving if we're already over
383 * the limit.
384 *
385 * Note that we do not check the file system limit with
386 * dsl_dir_fscount_check because the temporary %clones don't count
387 * against that limit.
388 */
389 error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT,
390 NULL, drba->drba_cred);
391 if (error != 0)
392 return (error);
393
394 if (fromguid != 0) {
395 dsl_dataset_t *snap;
396 uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
397
398 /* Can't perform a raw receive on top of a non-raw receive */
399 if (!encrypted && raw)
400 return (SET_ERROR(EINVAL));
401
402 /* Encryption is incompatible with embedded data */
403 if (encrypted && embed)
404 return (SET_ERROR(EINVAL));
405
406 /* Find snapshot in this dir that matches fromguid. */
407 while (obj != 0) {
408 error = dsl_dataset_hold_obj(dp, obj, FTAG,
409 &snap);
410 if (error != 0)
411 return (SET_ERROR(ENODEV));
412 if (snap->ds_dir != ds->ds_dir) {
413 dsl_dataset_rele(snap, FTAG);
414 return (SET_ERROR(ENODEV));
415 }
416 if (dsl_dataset_phys(snap)->ds_guid == fromguid)
417 break;
418 obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
419 dsl_dataset_rele(snap, FTAG);
420 }
421 if (obj == 0)
422 return (SET_ERROR(ENODEV));
423
424 if (drba->drba_cookie->drc_force) {
425 drba->drba_cookie->drc_fromsnapobj = obj;
426 } else {
427 /*
428 * If we are not forcing, there must be no
429 * changes since fromsnap. Raw sends have an
430 * additional constraint that requires that
431 * no "noop" snapshots exist between fromsnap
432 * and tosnap for the IVset checking code to
433 * work properly.
434 */
435 if (dsl_dataset_modified_since_snap(ds, snap) ||
436 (raw &&
437 dsl_dataset_phys(ds)->ds_prev_snap_obj !=
438 snap->ds_object)) {
439 dsl_dataset_rele(snap, FTAG);
440 return (SET_ERROR(ETXTBSY));
441 }
442 drba->drba_cookie->drc_fromsnapobj =
443 ds->ds_prev->ds_object;
444 }
445
446 if (dsl_dataset_feature_is_active(snap,
447 SPA_FEATURE_REDACTED_DATASETS) && !redact_check(drba,
448 snap)) {
449 dsl_dataset_rele(snap, FTAG);
450 return (SET_ERROR(EINVAL));
451 }
452
453 dsl_dataset_rele(snap, FTAG);
454 } else {
455 /* if full, then must be forced */
456 if (!drba->drba_cookie->drc_force)
457 return (SET_ERROR(EEXIST));
458
459 /*
460 * We don't support using zfs recv -F to blow away
461 * encrypted filesystems. This would require the
462 * dsl dir to point to the old encryption key and
463 * the new one at the same time during the receive.
464 */
465 if ((!encrypted && raw) || encrypted)
466 return (SET_ERROR(EINVAL));
467
468 /*
469 * Perform the same encryption checks we would if
470 * we were creating a new dataset from scratch.
471 */
472 if (!raw) {
473 boolean_t will_encrypt;
474
475 error = dmu_objset_create_crypt_check(
476 ds->ds_dir->dd_parent, drba->drba_dcp,
477 &will_encrypt);
478 if (error != 0)
479 return (error);
480
481 if (will_encrypt && embed)
482 return (SET_ERROR(EINVAL));
483 }
484 }
485
486 return (0);
487
488 }
489
490 /*
491 * Check that any feature flags used in the data stream we're receiving are
492 * supported by the pool we are receiving into.
493 *
494 * Note that some of the features we explicitly check here have additional
495 * (implicit) features they depend on, but those dependencies are enforced
496 * through the zfeature_register() calls declaring the features that we
497 * explicitly check.
498 */
499 static int
500 recv_begin_check_feature_flags_impl(uint64_t featureflags, spa_t *spa)
501 {
502 /*
503 * Check if there are any unsupported feature flags.
504 */
505 if (!DMU_STREAM_SUPPORTED(featureflags)) {
506 return (SET_ERROR(ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE));
507 }
508
509 /* Verify pool version supports SA if SA_SPILL feature set */
510 if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
511 spa_version(spa) < SPA_VERSION_SA)
512 return (SET_ERROR(ENOTSUP));
513
514 /*
515 * LZ4 compressed, embedded, mooched, large blocks, and large_dnodes
516 * in the stream can only be used if those pool features are enabled
517 * because we don't attempt to decompress / un-embed / un-mooch /
518 * split up the blocks / dnodes during the receive process.
519 */
520 if ((featureflags & DMU_BACKUP_FEATURE_LZ4) &&
521 !spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS))
522 return (SET_ERROR(ENOTSUP));
523 if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
524 !spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA))
525 return (SET_ERROR(ENOTSUP));
526 if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
527 !spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
528 return (SET_ERROR(ENOTSUP));
529 if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) &&
530 !spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE))
531 return (SET_ERROR(ENOTSUP));
532
533 /*
534 * Receiving redacted streams requires that redacted datasets are
535 * enabled.
536 */
537 if ((featureflags & DMU_BACKUP_FEATURE_REDACTED) &&
538 !spa_feature_is_enabled(spa, SPA_FEATURE_REDACTED_DATASETS))
539 return (SET_ERROR(ENOTSUP));
540
541 return (0);
542 }
543
544 static int
545 dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
546 {
547 dmu_recv_begin_arg_t *drba = arg;
548 dsl_pool_t *dp = dmu_tx_pool(tx);
549 struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
550 uint64_t fromguid = drrb->drr_fromguid;
551 int flags = drrb->drr_flags;
552 ds_hold_flags_t dsflags = 0;
553 int error;
554 uint64_t featureflags = drba->drba_cookie->drc_featureflags;
555 dsl_dataset_t *ds;
556 const char *tofs = drba->drba_cookie->drc_tofs;
557
558 /* already checked */
559 ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
560 ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING));
561
562 if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
563 DMU_COMPOUNDSTREAM ||
564 drrb->drr_type >= DMU_OST_NUMTYPES ||
565 ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL))
566 return (SET_ERROR(EINVAL));
567
568 error = recv_begin_check_feature_flags_impl(featureflags, dp->dp_spa);
569 if (error != 0)
570 return (error);
571
572 /* Resumable receives require extensible datasets */
573 if (drba->drba_cookie->drc_resumable &&
574 !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET))
575 return (SET_ERROR(ENOTSUP));
576
577 if (featureflags & DMU_BACKUP_FEATURE_RAW) {
578 /* raw receives require the encryption feature */
579 if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION))
580 return (SET_ERROR(ENOTSUP));
581
582 /* embedded data is incompatible with encryption and raw recv */
583 if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
584 return (SET_ERROR(EINVAL));
585
586 /* raw receives require spill block allocation flag */
587 if (!(flags & DRR_FLAG_SPILL_BLOCK))
588 return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING));
589 } else {
590 dsflags |= DS_HOLD_FLAG_DECRYPT;
591 }
592
593 error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
594 if (error == 0) {
595 /* target fs already exists; recv into temp clone */
596
597 /* Can't recv a clone into an existing fs */
598 if (flags & DRR_FLAG_CLONE || drba->drba_origin) {
599 dsl_dataset_rele_flags(ds, dsflags, FTAG);
600 return (SET_ERROR(EINVAL));
601 }
602
603 error = recv_begin_check_existing_impl(drba, ds, fromguid,
604 featureflags);
605 dsl_dataset_rele_flags(ds, dsflags, FTAG);
606 } else if (error == ENOENT) {
607 /* target fs does not exist; must be a full backup or clone */
608 char buf[ZFS_MAX_DATASET_NAME_LEN];
609 objset_t *os;
610
611 /*
612 * If it's a non-clone incremental, we are missing the
613 * target fs, so fail the recv.
614 */
615 if (fromguid != 0 && !((flags & DRR_FLAG_CLONE) ||
616 drba->drba_origin))
617 return (SET_ERROR(ENOENT));
618
619 /*
620 * If we're receiving a full send as a clone, and it doesn't
621 * contain all the necessary free records and freeobject
622 * records, reject it.
623 */
624 if (fromguid == 0 && drba->drba_origin != NULL &&
625 !(flags & DRR_FLAG_FREERECORDS))
626 return (SET_ERROR(EINVAL));
627
628 /* Open the parent of tofs */
629 ASSERT3U(strlen(tofs), <, sizeof (buf));
630 (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
631 error = dsl_dataset_hold(dp, buf, FTAG, &ds);
632 if (error != 0)
633 return (error);
634
635 if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0 &&
636 drba->drba_origin == NULL) {
637 boolean_t will_encrypt;
638
639 /*
640 * Check that we aren't breaking any encryption rules
641 * and that we have all the parameters we need to
642 * create an encrypted dataset if necessary. If we are
643 * making an encrypted dataset the stream can't have
644 * embedded data.
645 */
646 error = dmu_objset_create_crypt_check(ds->ds_dir,
647 drba->drba_dcp, &will_encrypt);
648 if (error != 0) {
649 dsl_dataset_rele(ds, FTAG);
650 return (error);
651 }
652
653 if (will_encrypt &&
654 (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) {
655 dsl_dataset_rele(ds, FTAG);
656 return (SET_ERROR(EINVAL));
657 }
658 }
659
660 /*
661 * Check filesystem and snapshot limits before receiving. We'll
662 * recheck snapshot limits again at the end (we create the
663 * filesystems and increment those counts during begin_sync).
664 */
665 error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
666 ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred);
667 if (error != 0) {
668 dsl_dataset_rele(ds, FTAG);
669 return (error);
670 }
671
672 error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
673 ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred);
674 if (error != 0) {
675 dsl_dataset_rele(ds, FTAG);
676 return (error);
677 }
678
679 /* can't recv below anything but filesystems (e.g. no ZVOLs) */
680 error = dmu_objset_from_ds(ds, &os);
681 if (error != 0) {
682 dsl_dataset_rele(ds, FTAG);
683 return (error);
684 }
685 if (dmu_objset_type(os) != DMU_OST_ZFS) {
686 dsl_dataset_rele(ds, FTAG);
687 return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
688 }
689
690 if (drba->drba_origin != NULL) {
691 dsl_dataset_t *origin;
692 error = dsl_dataset_hold_flags(dp, drba->drba_origin,
693 dsflags, FTAG, &origin);
694 if (error != 0) {
695 dsl_dataset_rele(ds, FTAG);
696 return (error);
697 }
698 if (!origin->ds_is_snapshot) {
699 dsl_dataset_rele_flags(origin, dsflags, FTAG);
700 dsl_dataset_rele(ds, FTAG);
701 return (SET_ERROR(EINVAL));
702 }
703 if (dsl_dataset_phys(origin)->ds_guid != fromguid &&
704 fromguid != 0) {
705 dsl_dataset_rele_flags(origin, dsflags, FTAG);
706 dsl_dataset_rele(ds, FTAG);
707 return (SET_ERROR(ENODEV));
708 }
709
710 if (origin->ds_dir->dd_crypto_obj != 0 &&
711 (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) {
712 dsl_dataset_rele_flags(origin, dsflags, FTAG);
713 dsl_dataset_rele(ds, FTAG);
714 return (SET_ERROR(EINVAL));
715 }
716
717 /*
718 * If the origin is redacted we need to verify that this
719 * send stream can safely be received on top of the
720 * origin.
721 */
722 if (dsl_dataset_feature_is_active(origin,
723 SPA_FEATURE_REDACTED_DATASETS)) {
724 if (!redact_check(drba, origin)) {
725 dsl_dataset_rele_flags(origin, dsflags,
726 FTAG);
727 dsl_dataset_rele_flags(ds, dsflags,
728 FTAG);
729 return (SET_ERROR(EINVAL));
730 }
731 }
732
733 dsl_dataset_rele_flags(origin, dsflags, FTAG);
734 }
735
736 dsl_dataset_rele(ds, FTAG);
737 error = 0;
738 }
739 return (error);
740 }
741
742 static void
743 dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
744 {
745 dmu_recv_begin_arg_t *drba = arg;
746 dsl_pool_t *dp = dmu_tx_pool(tx);
747 objset_t *mos = dp->dp_meta_objset;
748 dmu_recv_cookie_t *drc = drba->drba_cookie;
749 struct drr_begin *drrb = drc->drc_drrb;
750 const char *tofs = drc->drc_tofs;
751 uint64_t featureflags = drc->drc_featureflags;
752 dsl_dataset_t *ds, *newds;
753 objset_t *os;
754 uint64_t dsobj;
755 ds_hold_flags_t dsflags = 0;
756 int error;
757 uint64_t crflags = 0;
758 dsl_crypto_params_t dummy_dcp = { 0 };
759 dsl_crypto_params_t *dcp = drba->drba_dcp;
760
761 if (drrb->drr_flags & DRR_FLAG_CI_DATA)
762 crflags |= DS_FLAG_CI_DATASET;
763
764 if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0)
765 dsflags |= DS_HOLD_FLAG_DECRYPT;
766
767 /*
768 * Raw, non-incremental recvs always use a dummy dcp with
769 * the raw cmd set. Raw incremental recvs do not use a dcp
770 * since the encryption parameters are already set in stone.
771 */
772 if (dcp == NULL && drrb->drr_fromguid == 0 &&
773 drba->drba_origin == NULL) {
774 ASSERT3P(dcp, ==, NULL);
775 dcp = &dummy_dcp;
776
777 if (featureflags & DMU_BACKUP_FEATURE_RAW)
778 dcp->cp_cmd = DCP_CMD_RAW_RECV;
779 }
780
781 error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
782 if (error == 0) {
783 /* create temporary clone */
784 dsl_dataset_t *snap = NULL;
785
786 if (drba->drba_cookie->drc_fromsnapobj != 0) {
787 VERIFY0(dsl_dataset_hold_obj(dp,
788 drba->drba_cookie->drc_fromsnapobj, FTAG, &snap));
789 ASSERT3P(dcp, ==, NULL);
790 }
791 dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name,
792 snap, crflags, drba->drba_cred, dcp, tx);
793 if (drba->drba_cookie->drc_fromsnapobj != 0)
794 dsl_dataset_rele(snap, FTAG);
795 dsl_dataset_rele_flags(ds, dsflags, FTAG);
796 } else {
797 dsl_dir_t *dd;
798 const char *tail;
799 dsl_dataset_t *origin = NULL;
800
801 VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail));
802
803 if (drba->drba_origin != NULL) {
804 VERIFY0(dsl_dataset_hold(dp, drba->drba_origin,
805 FTAG, &origin));
806 ASSERT3P(dcp, ==, NULL);
807 }
808
809 /* Create new dataset. */
810 dsobj = dsl_dataset_create_sync(dd, strrchr(tofs, '/') + 1,
811 origin, crflags, drba->drba_cred, dcp, tx);
812 if (origin != NULL)
813 dsl_dataset_rele(origin, FTAG);
814 dsl_dir_rele(dd, FTAG);
815 drc->drc_newfs = B_TRUE;
816 }
817 VERIFY0(dsl_dataset_own_obj_force(dp, dsobj, dsflags, dmu_recv_tag,
818 &newds));
819 if (dsl_dataset_feature_is_active(newds,
820 SPA_FEATURE_REDACTED_DATASETS)) {
821 /*
822 * If the origin dataset is redacted, the child will be redacted
823 * when we create it. We clear the new dataset's
824 * redaction info; if it should be redacted, we'll fill
825 * in its information later.
826 */
827 dsl_dataset_deactivate_feature(newds,
828 SPA_FEATURE_REDACTED_DATASETS, tx);
829 }
830 VERIFY0(dmu_objset_from_ds(newds, &os));
831
832 if (drc->drc_resumable) {
833 dsl_dataset_zapify(newds, tx);
834 if (drrb->drr_fromguid != 0) {
835 VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID,
836 8, 1, &drrb->drr_fromguid, tx));
837 }
838 VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID,
839 8, 1, &drrb->drr_toguid, tx));
840 VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME,
841 1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx));
842 uint64_t one = 1;
843 uint64_t zero = 0;
844 VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT,
845 8, 1, &one, tx));
846 VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET,
847 8, 1, &zero, tx));
848 VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES,
849 8, 1, &zero, tx));
850 if (featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) {
851 VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_LARGEBLOCK,
852 8, 1, &one, tx));
853 }
854 if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) {
855 VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK,
856 8, 1, &one, tx));
857 }
858 if (featureflags & DMU_BACKUP_FEATURE_COMPRESSED) {
859 VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_COMPRESSOK,
860 8, 1, &one, tx));
861 }
862 if (featureflags & DMU_BACKUP_FEATURE_RAW) {
863 VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_RAWOK,
864 8, 1, &one, tx));
865 }
866
867 uint64_t *redact_snaps;
868 uint_t numredactsnaps;
869 if (nvlist_lookup_uint64_array(drc->drc_begin_nvl,
870 BEGINNV_REDACT_FROM_SNAPS, &redact_snaps,
871 &numredactsnaps) == 0) {
872 VERIFY0(zap_add(mos, dsobj,
873 DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS,
874 sizeof (*redact_snaps), numredactsnaps,
875 redact_snaps, tx));
876 }
877 }
878
879 /*
880 * Usually the os->os_encrypted value is tied to the presence of a
881 * DSL Crypto Key object in the dd. However, that will not be received
882 * until dmu_recv_stream(), so we set the value manually for now.
883 */
884 if (featureflags & DMU_BACKUP_FEATURE_RAW) {
885 os->os_encrypted = B_TRUE;
886 drba->drba_cookie->drc_raw = B_TRUE;
887 }
888
889
890 if (featureflags & DMU_BACKUP_FEATURE_REDACTED) {
891 uint64_t *redact_snaps;
892 uint_t numredactsnaps;
893 VERIFY0(nvlist_lookup_uint64_array(drc->drc_begin_nvl,
894 BEGINNV_REDACT_SNAPS, &redact_snaps, &numredactsnaps));
895 dsl_dataset_activate_redaction(newds, redact_snaps,
896 numredactsnaps, tx);
897 }
898
899 dmu_buf_will_dirty(newds->ds_dbuf, tx);
900 dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;
901
902 /*
903 * If we actually created a non-clone, we need to create the objset
904 * in our new dataset. If this is a raw send we postpone this until
905 * dmu_recv_stream() so that we can allocate the metadnode with the
906 * properties from the DRR_BEGIN payload.
907 */
908 rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG);
909 if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds)) &&
910 (featureflags & DMU_BACKUP_FEATURE_RAW) == 0) {
911 (void) dmu_objset_create_impl(dp->dp_spa,
912 newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx);
913 }
914 rrw_exit(&newds->ds_bp_rwlock, FTAG);
915
916 drba->drba_cookie->drc_ds = newds;
917
918 spa_history_log_internal_ds(newds, "receive", tx, "");
919 }
920
921 static int
922 dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
923 {
924 dmu_recv_begin_arg_t *drba = arg;
925 dmu_recv_cookie_t *drc = drba->drba_cookie;
926 dsl_pool_t *dp = dmu_tx_pool(tx);
927 struct drr_begin *drrb = drc->drc_drrb;
928 int error;
929 ds_hold_flags_t dsflags = 0;
930 dsl_dataset_t *ds;
931 const char *tofs = drc->drc_tofs;
932
933 /* already checked */
934 ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
935 ASSERT(drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING);
936
937 if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
938 DMU_COMPOUNDSTREAM ||
939 drrb->drr_type >= DMU_OST_NUMTYPES)
940 return (SET_ERROR(EINVAL));
941
942 /*
943 * This is mostly a sanity check since we should have already done these
944 * checks during a previous attempt to receive the data.
945 */
946 error = recv_begin_check_feature_flags_impl(drc->drc_featureflags,
947 dp->dp_spa);
948 if (error != 0)
949 return (error);
950
951 /* 6 extra bytes for /%recv */
952 char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
953
954 (void) snprintf(recvname, sizeof (recvname), "%s/%s",
955 tofs, recv_clone_name);
956
957 if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) {
958 /* raw receives require spill block allocation flag */
959 if (!(drrb->drr_flags & DRR_FLAG_SPILL_BLOCK))
960 return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING));
961 } else {
962 dsflags |= DS_HOLD_FLAG_DECRYPT;
963 }
964
965 if (dsl_dataset_hold_flags(dp, recvname, dsflags, FTAG, &ds) != 0) {
966 /* %recv does not exist; continue in tofs */
967 error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
968 if (error != 0)
969 return (error);
970 }
971
972 /* check that ds is marked inconsistent */
973 if (!DS_IS_INCONSISTENT(ds)) {
974 dsl_dataset_rele_flags(ds, dsflags, FTAG);
975 return (SET_ERROR(EINVAL));
976 }
977
978 /* check that there is resuming data, and that the toguid matches */
979 if (!dsl_dataset_is_zapified(ds)) {
980 dsl_dataset_rele_flags(ds, dsflags, FTAG);
981 return (SET_ERROR(EINVAL));
982 }
983 uint64_t val;
984 error = zap_lookup(dp->dp_meta_objset, ds->ds_object,
985 DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val);
986 if (error != 0 || drrb->drr_toguid != val) {
987 dsl_dataset_rele_flags(ds, dsflags, FTAG);
988 return (SET_ERROR(EINVAL));
989 }
990
991 /*
992 * Check if the receive is still running. If so, it will be owned.
993 * Note that nothing else can own the dataset (e.g. after the receive
994 * fails) because it will be marked inconsistent.
995 */
996 if (dsl_dataset_has_owner(ds)) {
997 dsl_dataset_rele_flags(ds, dsflags, FTAG);
998 return (SET_ERROR(EBUSY));
999 }
1000
1001 /* There should not be any snapshots of this fs yet. */
1002 if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) {
1003 dsl_dataset_rele_flags(ds, dsflags, FTAG);
1004 return (SET_ERROR(EINVAL));
1005 }
1006
1007 /*
1008 * Note: resume point will be checked when we process the first WRITE
1009 * record.
1010 */
1011
1012 /* check that the origin matches */
1013 val = 0;
1014 (void) zap_lookup(dp->dp_meta_objset, ds->ds_object,
1015 DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val);
1016 if (drrb->drr_fromguid != val) {
1017 dsl_dataset_rele_flags(ds, dsflags, FTAG);
1018 return (SET_ERROR(EINVAL));
1019 }
1020
1021 /*
1022 * If we're resuming, and the send is redacted, then the original send
1023 * must have been redacted, and must have been redacted with respect to
1024 * the same snapshots.
1025 */
1026 if (drc->drc_featureflags & DMU_BACKUP_FEATURE_REDACTED) {
1027 uint64_t num_ds_redact_snaps;
1028 uint64_t *ds_redact_snaps;
1029
1030 uint_t num_stream_redact_snaps;
1031 uint64_t *stream_redact_snaps;
1032
1033 if (nvlist_lookup_uint64_array(drc->drc_begin_nvl,
1034 BEGINNV_REDACT_SNAPS, &stream_redact_snaps,
1035 &num_stream_redact_snaps) != 0) {
1036 dsl_dataset_rele_flags(ds, dsflags, FTAG);
1037 return (SET_ERROR(EINVAL));
1038 }
1039
1040 if (!dsl_dataset_get_uint64_array_feature(ds,
1041 SPA_FEATURE_REDACTED_DATASETS, &num_ds_redact_snaps,
1042 &ds_redact_snaps)) {
1043 dsl_dataset_rele_flags(ds, dsflags, FTAG);
1044 return (SET_ERROR(EINVAL));
1045 }
1046
1047 for (int i = 0; i < num_ds_redact_snaps; i++) {
1048 if (!redact_snaps_contains(ds_redact_snaps,
1049 num_ds_redact_snaps, stream_redact_snaps[i])) {
1050 dsl_dataset_rele_flags(ds, dsflags, FTAG);
1051 return (SET_ERROR(EINVAL));
1052 }
1053 }
1054 }
1055 dsl_dataset_rele_flags(ds, dsflags, FTAG);
1056 return (0);
1057 }
1058
1059 static void
1060 dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx)
1061 {
1062 dmu_recv_begin_arg_t *drba = arg;
1063 dsl_pool_t *dp = dmu_tx_pool(tx);
1064 const char *tofs = drba->drba_cookie->drc_tofs;
1065 uint64_t featureflags = drba->drba_cookie->drc_featureflags;
1066 dsl_dataset_t *ds;
1067 ds_hold_flags_t dsflags = 0;
1068 /* 6 extra bytes for /%recv */
1069 char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
1070
1071 (void) snprintf(recvname, sizeof (recvname), "%s/%s", tofs,
1072 recv_clone_name);
1073
1074 if (featureflags & DMU_BACKUP_FEATURE_RAW) {
1075 drba->drba_cookie->drc_raw = B_TRUE;
1076 } else {
1077 dsflags |= DS_HOLD_FLAG_DECRYPT;
1078 }
1079
1080 if (dsl_dataset_own_force(dp, recvname, dsflags, dmu_recv_tag, &ds)
1081 != 0) {
1082 /* %recv does not exist; continue in tofs */
1083 VERIFY0(dsl_dataset_own_force(dp, tofs, dsflags, dmu_recv_tag,
1084 &ds));
1085 drba->drba_cookie->drc_newfs = B_TRUE;
1086 }
1087
1088 ASSERT(DS_IS_INCONSISTENT(ds));
1089 rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
1090 ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)) ||
1091 drba->drba_cookie->drc_raw);
1092 rrw_exit(&ds->ds_bp_rwlock, FTAG);
1093
1094 drba->drba_cookie->drc_ds = ds;
1095
1096 spa_history_log_internal_ds(ds, "resume receive", tx, "");
1097 }
1098
1099 /*
1100 * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
1101 * succeeds; otherwise we will leak the holds on the datasets.
1102 */
1103 int
1104 dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
1105 boolean_t force, boolean_t resumable, nvlist_t *localprops,
1106 nvlist_t *hidden_args, char *origin, dmu_recv_cookie_t *drc, vnode_t *vp,
1107 offset_t *voffp)
1108 {
1109 dmu_recv_begin_arg_t drba = { 0 };
1110 int err;
1111
1112 bzero(drc, sizeof (dmu_recv_cookie_t));
1113 drc->drc_drr_begin = drr_begin;
1114 drc->drc_drrb = &drr_begin->drr_u.drr_begin;
1115 drc->drc_tosnap = tosnap;
1116 drc->drc_tofs = tofs;
1117 drc->drc_force = force;
1118 drc->drc_resumable = resumable;
1119 drc->drc_cred = CRED();
1120 drc->drc_clone = (origin != NULL);
1121
1122 if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
1123 drc->drc_byteswap = B_TRUE;
1124 (void) fletcher_4_incremental_byteswap(drr_begin,
1125 sizeof (dmu_replay_record_t), &drc->drc_cksum);
1126 byteswap_record(drr_begin);
1127 } else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) {
1128 (void) fletcher_4_incremental_native(drr_begin,
1129 sizeof (dmu_replay_record_t), &drc->drc_cksum);
1130 } else {
1131 return (SET_ERROR(EINVAL));
1132 }
1133
1134 drc->drc_vp = vp;
1135 drc->drc_voff = *voffp;
1136 drc->drc_featureflags =
1137 DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
1138
1139 uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen;
1140 void *payload = NULL;
1141 if (payloadlen != 0)
1142 payload = kmem_alloc(payloadlen, KM_SLEEP);
1143
1144 err = receive_read_payload_and_next_header(drc, payloadlen,
1145 payload);
1146 if (err != 0) {
1147 kmem_free(payload, payloadlen);
1148 return (err);
1149 }
1150 if (payloadlen != 0) {
1151 err = nvlist_unpack(payload, payloadlen, &drc->drc_begin_nvl,
1152 KM_SLEEP);
1153 kmem_free(payload, payloadlen);
1154 if (err != 0) {
1155 kmem_free(drc->drc_next_rrd,
1156 sizeof (*drc->drc_next_rrd));
1157 return (err);
1158 }
1159 }
1160
1161 if (drc->drc_drrb->drr_flags & DRR_FLAG_SPILL_BLOCK)
1162 drc->drc_spill = B_TRUE;
1163
1164 drba.drba_origin = origin;
1165 drba.drba_cookie = drc;
1166 drba.drba_cred = CRED();
1167
1168 if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING) {
1169 err = dsl_sync_task(tofs,
1170 dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync,
1171 &drba, 5, ZFS_SPACE_CHECK_NORMAL);
1172 } else {
1173
1174 /*
1175 * For non-raw, non-incremental, non-resuming receives the
1176 * user can specify encryption parameters on the command line
1177 * with "zfs recv -o". For these receives we create a dcp and
1178 * pass it to the sync task. Creating the dcp will implicitly
1179 * remove the encryption params from the localprops nvlist,
1180 * which avoids errors when trying to set these normally
1181 * read-only properties. Any other kind of receive that
1182 * attempts to set these properties will fail as a result.
1183 */
1184 if ((DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) &
1185 DMU_BACKUP_FEATURE_RAW) == 0 &&
1186 origin == NULL && drc->drc_drrb->drr_fromguid == 0) {
1187 err = dsl_crypto_params_create_nvlist(DCP_CMD_NONE,
1188 localprops, hidden_args, &drba.drba_dcp);
1189 }
1190
1191 if (err == 0) {
1192 err = dsl_sync_task(tofs,
1193 dmu_recv_begin_check, dmu_recv_begin_sync,
1194 &drba, 5, ZFS_SPACE_CHECK_NORMAL);
1195 dsl_crypto_params_free(drba.drba_dcp, !!err);
1196 }
1197 }
1198
1199 if (err != 0) {
1200 kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
1201 nvlist_free(drc->drc_begin_nvl);
1202 }
1203 return (err);
1204 }
1205
1206 static int
1207 guid_compare(const void *arg1, const void *arg2)
1208 {
1209 const guid_map_entry_t *gmep1 = (const guid_map_entry_t *)arg1;
1210 const guid_map_entry_t *gmep2 = (const guid_map_entry_t *)arg2;
1211
1212 return (AVL_CMP(gmep1->guid, gmep2->guid));
1213 }
1214
1215 static void
1216 free_guid_map_onexit(void *arg)
1217 {
1218 avl_tree_t *ca = arg;
1219 void *cookie = NULL;
1220 guid_map_entry_t *gmep;
1221
1222 while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
1223 ds_hold_flags_t dsflags = DS_HOLD_FLAG_DECRYPT;
1224
1225 if (gmep->raw) {
1226 gmep->gme_ds->ds_objset->os_raw_receive = B_FALSE;
1227 dsflags &= ~DS_HOLD_FLAG_DECRYPT;
1228 }
1229
1230 dsl_dataset_disown(gmep->gme_ds, dsflags, gmep);
1231 kmem_free(gmep, sizeof (guid_map_entry_t));
1232 }
1233 avl_destroy(ca);
1234 kmem_free(ca, sizeof (avl_tree_t));
1235 }
1236
1237 static int
1238 receive_read(dmu_recv_cookie_t *drc, int len, void *buf)
1239 {
1240 int done = 0;
1241
1242 /*
1243 * The code doesn't rely on this (lengths being multiples of 8). See
1244 * comment in dump_bytes.
1245 */
1246 ASSERT(len % 8 == 0 ||
1247 (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) != 0);
1248
1249 while (done < len) {
1250 ssize_t resid;
1251
1252 drc->drc_err = vn_rdwr(UIO_READ, drc->drc_vp,
1253 (char *)buf + done, len - done,
1254 drc->drc_voff, UIO_SYSSPACE, FAPPEND,
1255 RLIM64_INFINITY, CRED(), &resid);
1256
1257 if (resid == len - done) {
1258 /*
1259 * Note: ECKSUM indicates that the receive
1260 * was interrupted and can potentially be resumed.
1261 */
1262 drc->drc_err = SET_ERROR(ECKSUM);
1263 }
1264 drc->drc_voff += len - done - resid;
1265 done = len - resid;
1266 if (drc->drc_err != 0)
1267 return (drc->drc_err);
1268 }
1269
1270 drc->drc_bytes_read += len;
1271
1272 ASSERT3U(done, ==, len);
1273 return (0);
1274 }
1275
1276 static inline uint8_t
1277 deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
1278 {
1279 if (bonus_type == DMU_OT_SA) {
1280 return (1);
1281 } else {
1282 return (1 +
1283 ((DN_OLD_MAX_BONUSLEN -
1284 MIN(DN_OLD_MAX_BONUSLEN, bonus_size)) >> SPA_BLKPTRSHIFT));
1285 }
1286 }
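/*
 * Worked example, assuming the traditional 512-byte dnode layout
 * (DN_OLD_MAX_BONUSLEN == 320 and 128-byte block pointers): a non-SA
 * bonus of 0 bytes deduces 1 + (320 >> 7) == 3 block pointers, while a
 * bonus filling all 320 bytes leaves only the single mandatory block
 * pointer. DMU_OT_SA bonuses always deduce 1, since SA overflow lives in
 * a spill block rather than in extra blkptrs.
 */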
1287
1288 static void
1289 save_resume_state(struct receive_writer_arg *rwa,
1290 uint64_t object, uint64_t offset, dmu_tx_t *tx)
1291 {
1292 int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
1293
1294 if (!rwa->resumable)
1295 return;
1296
1297 /*
1298 * We use ds_resume_bytes[] != 0 to indicate that we need to
1299 * update this on disk, so it must not be 0.
1300 */
1301 ASSERT(rwa->bytes_read != 0);
1302
1303 /*
1304 * We only resume from write records, which have a valid
1305 * (non-meta-dnode) object number.
1306 */
1307 ASSERT(object != 0);
1308
1309 /*
1310 * For resuming to work correctly, we must receive records in order,
1311 * sorted by object,offset. This is checked by the callers, but
1312 * assert it here for good measure.
1313 */
1314 ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]);
1315 ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] ||
1316 offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]);
1317 ASSERT3U(rwa->bytes_read, >=,
1318 rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]);
1319
1320 rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object;
1321 rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset;
1322 rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read;
1323 }
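/*
 * The ds_resume_{object,offset,bytes} values recorded above are in-core
 * only; they are expected to be flushed to the corresponding
 * DS_FIELD_RESUME_* ZAP entries during dataset sync, which is where the
 * receive_resume_token consumed by a later "zfs send -t" resume gets its
 * values.
 */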
1324
1325 noinline static int
1326 receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
1327 void *data)
1328 {
1329 dmu_object_info_t doi;
1330 dmu_tx_t *tx;
1331 uint64_t object;
1332 int err;
1333 uint8_t dn_slots = drro->drr_dn_slots != 0 ?
1334 drro->drr_dn_slots : DNODE_MIN_SLOTS;
1335
1336 if (drro->drr_type == DMU_OT_NONE ||
1337 !DMU_OT_IS_VALID(drro->drr_type) ||
1338 !DMU_OT_IS_VALID(drro->drr_bonustype) ||
1339 drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
1340 drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
1341 P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
1342 drro->drr_blksz < SPA_MINBLOCKSIZE ||
1343 drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) ||
1344 drro->drr_bonuslen >
1345 DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os))) ||
1346 dn_slots >
1347 (spa_maxdnodesize(dmu_objset_spa(rwa->os)) >> DNODE_SHIFT)) {
1348 return (SET_ERROR(EINVAL));
1349 }
1350
1351 if (rwa->raw) {
1352 /*
1353 * We should have received a DRR_OBJECT_RANGE record
1354 * containing this block and stored it in rwa.
1355 */
1356 if (drro->drr_object < rwa->or_firstobj ||
1357 drro->drr_object >= rwa->or_firstobj + rwa->or_numslots ||
1358 drro->drr_raw_bonuslen < drro->drr_bonuslen ||
1359 drro->drr_indblkshift > SPA_MAXBLOCKSHIFT ||
1360 drro->drr_nlevels > DN_MAX_LEVELS ||
1361 drro->drr_nblkptr > DN_MAX_NBLKPTR ||
1362 DN_SLOTS_TO_BONUSLEN(dn_slots) <
1363 drro->drr_raw_bonuslen)
1364 return (SET_ERROR(EINVAL));
1365 } else {
1366 /*
1367 * The DRR_OBJECT_SPILL flag is valid when the DRR_BEGIN
1368 * record indicates this by setting DRR_FLAG_SPILL_BLOCK.
1369 */
1370 if (((drro->drr_flags & ~(DRR_OBJECT_SPILL))) ||
1371 (!rwa->spill && DRR_OBJECT_HAS_SPILL(drro->drr_flags))) {
1372 return (SET_ERROR(EINVAL));
1373 }
1374
1375 if (drro->drr_raw_bonuslen != 0 || drro->drr_nblkptr != 0 ||
1376 drro->drr_indblkshift != 0 || drro->drr_nlevels != 0) {
1377 return (SET_ERROR(EINVAL));
1378 }
1379 }
1380
1381 err = dmu_object_info(rwa->os, drro->drr_object, &doi);
1382
1383 if (err != 0 && err != ENOENT && err != EEXIST)
1384 return (SET_ERROR(EINVAL));
1385
1386 if (drro->drr_object > rwa->max_object)
1387 rwa->max_object = drro->drr_object;
1388
1389 /*
1390 * If we are losing blkptrs or changing the block size this must
1391 * be a new file instance. We must clear out the previous file
1392 * contents before we can change this type of metadata in the dnode.
1393 * Raw receives will also check that the indirect structure of the
1394 * dnode hasn't changed.
1395 */
1396 if (err == 0) {
1397 uint32_t indblksz = drro->drr_indblkshift ?
1398 1ULL << drro->drr_indblkshift : 0;
1399 int nblkptr = deduce_nblkptr(drro->drr_bonustype,
1400 drro->drr_bonuslen);
1401 boolean_t did_free = B_FALSE;
1402
1403 object = drro->drr_object;
1404
1405 /* nblkptr should be bounded by the bonus size and type */
1406 if (rwa->raw && nblkptr != drro->drr_nblkptr)
1407 return (SET_ERROR(EINVAL));
1408
1409 /*
1410 * Check for indicators that the object was freed and
1411 * reallocated. For all sends, these indicators are:
1412 * - A changed block size
1413 * - A smaller nblkptr
1414 * - A changed dnode size
1415 * For raw sends we also check a few other fields to
1416 * ensure we are preserving the objset structure exactly
1417 * as it was on the send side:
1418 * - A changed indirect block size
1419 * - A smaller nlevels
1420 */
1421 if (drro->drr_blksz != doi.doi_data_block_size ||
1422 nblkptr < doi.doi_nblkptr ||
1423 dn_slots != doi.doi_dnodesize >> DNODE_SHIFT ||
1424 (rwa->raw &&
1425 (indblksz != doi.doi_metadata_block_size ||
1426 drro->drr_nlevels < doi.doi_indirection))) {
1427 err = dmu_free_long_range(rwa->os, drro->drr_object,
1428 0, DMU_OBJECT_END);
1429 if (err != 0)
1430 return (SET_ERROR(EINVAL));
1431 else
1432 did_free = B_TRUE;
1433 }
1434
1435 /*
1436 * The dmu does not currently support decreasing nlevels
1437 * or changing the number of dnode slots on an object. For
1438 * non-raw sends, this does not matter and the new object
1439 * can just use the previous one's nlevels. For raw sends,
1440 * however, the structure of the received dnode (including
1441 * nlevels and dnode slots) must match that of the send
1442 * side. Therefore, instead of using dmu_object_reclaim(),
1443 * we must free the object completely and call
1444 * dmu_object_claim_dnsize() instead.
1445 */
1446 if ((rwa->raw && drro->drr_nlevels < doi.doi_indirection) ||
1447 dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) {
1448 err = dmu_free_long_object(rwa->os, drro->drr_object);
1449 if (err != 0)
1450 return (SET_ERROR(EINVAL));
1451
1452 txg_wait_synced(dmu_objset_pool(rwa->os), 0);
1453 object = DMU_NEW_OBJECT;
1454 }
1455
1456 /*
1457 * For raw receives, free everything beyond the new incoming
1458 * maxblkid. Normally this would be done with a DRR_FREE
1459 * record that would come after this DRR_OBJECT record is
1460 * processed. However, for raw receives we manually set the
1461 * maxblkid from the drr_maxblkid and so we must first free
1462 * everything above that blkid to ensure the DMU is always
1463 * consistent with itself. We will never free the first block
1464 * of the object here because a maxblkid of 0 could indicate
1465 * an object with a single block or one with no blocks. This
1466 * free may be skipped when dmu_free_long_range() was called
1467 * above since it covers the entire object's contents.
1468 */
1469 if (rwa->raw && object != DMU_NEW_OBJECT && !did_free) {
1470 err = dmu_free_long_range(rwa->os, drro->drr_object,
1471 (drro->drr_maxblkid + 1) * doi.doi_data_block_size,
1472 DMU_OBJECT_END);
1473 if (err != 0)
1474 return (SET_ERROR(EINVAL));
1475 }
1476 } else if (err == EEXIST) {
1477 /*
1478 * The object requested is currently an interior slot of a
1479 * multi-slot dnode. This will be resolved when the next txg
1480 * is synced out, since the send stream will have told us
1481 * to free this slot when we freed the associated dnode
1482 * earlier in the stream.
1483 */
1484 txg_wait_synced(dmu_objset_pool(rwa->os), 0);
1485
1486 if (dmu_object_info(rwa->os, drro->drr_object, NULL) != ENOENT)
1487 return (SET_ERROR(EINVAL));
1488
1489 /* object was freed and we are about to allocate a new one */
1490 object = DMU_NEW_OBJECT;
1491 } else {
1492 /* object is free and we are about to allocate a new one */
1493 object = DMU_NEW_OBJECT;
1494 }
1495
1496 /*
1497 * If this is a multi-slot dnode there is a chance that this
1498 * object will expand into a slot that is already used by
1499 * another object from the previous snapshot. We must free
1500 * these objects before we attempt to allocate the new dnode.
1501 */
1502 if (dn_slots > 1) {
1503 boolean_t need_sync = B_FALSE;
1504
1505 for (uint64_t slot = drro->drr_object + 1;
1506 slot < drro->drr_object + dn_slots;
1507 slot++) {
1508 dmu_object_info_t slot_doi;
1509
1510 err = dmu_object_info(rwa->os, slot, &slot_doi);
1511 if (err == ENOENT || err == EEXIST)
1512 continue;
1513 else if (err != 0)
1514 return (err);
1515
1516 err = dmu_free_long_object(rwa->os, slot);
1517 if (err != 0)
1518 return (err);
1519
1520 need_sync = B_TRUE;
1521 }
1522
1523 if (need_sync)
1524 txg_wait_synced(dmu_objset_pool(rwa->os), 0);
1525 }
1526
1527 tx = dmu_tx_create(rwa->os);
1528 dmu_tx_hold_bonus(tx, object);
1529 dmu_tx_hold_write(tx, object, 0, 0);
1530 err = dmu_tx_assign(tx, TXG_WAIT);
1531 if (err != 0) {
1532 dmu_tx_abort(tx);
1533 return (err);
1534 }
1535
1536 if (object == DMU_NEW_OBJECT) {
1537 /* Currently free, wants to be allocated */
1538 err = dmu_object_claim_dnsize(rwa->os, drro->drr_object,
1539 drro->drr_type, drro->drr_blksz,
1540 drro->drr_bonustype, drro->drr_bonuslen,
1541 dn_slots << DNODE_SHIFT, tx);
1542 } else if (drro->drr_type != doi.doi_type ||
1543 drro->drr_blksz != doi.doi_data_block_size ||
1544 drro->drr_bonustype != doi.doi_bonus_type ||
1545 drro->drr_bonuslen != doi.doi_bonus_size) {
1546 /* Currently allocated, but with different properties */
1547 err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object,
1548 drro->drr_type, drro->drr_blksz,
1549 drro->drr_bonustype, drro->drr_bonuslen,
1550 dn_slots << DNODE_SHIFT, rwa->spill ?
1551 DRR_OBJECT_HAS_SPILL(drro->drr_flags) : B_FALSE, tx);
1552 } else if (rwa->spill && !DRR_OBJECT_HAS_SPILL(drro->drr_flags)) {
1553 /*
1554 * Currently allocated, the existing version of this object
1555 * may reference a spill block that is no longer allocated
1556 * at the source and needs to be freed.
1557 */
1558 err = dmu_object_rm_spill(rwa->os, drro->drr_object, tx);
1559 }
1560
1561 if (err != 0) {
1562 dmu_tx_commit(tx);
1563 return (SET_ERROR(EINVAL));
1564 }
1565
1566 if (rwa->or_crypt_params_present) {
1567 /*
1568 * Set the crypt params for the buffer associated with this
1569 * range of dnodes. This causes the blkptr_t to have the
1570 * same crypt params (byteorder, salt, iv, mac) as on the
1571 * sending side.
1572 *
1573 * Since we are committing this tx now, it is possible for
1574 * the dnode block to end up on-disk with the incorrect MAC,
1575 * if subsequent objects in this block are received in a
1576 * different txg. However, since the dataset is marked as
1577 * inconsistent, no code paths will do a non-raw read (or
1578 * decrypt the block / verify the MAC). The receive code and
1579 * scrub code can safely do raw reads and verify the
1580 * checksum. They don't need to verify the MAC.
1581 */
1582 dmu_buf_t *db = NULL;
1583 uint64_t offset = rwa->or_firstobj * DNODE_MIN_SIZE;
1584
1585 err = dmu_buf_hold_by_dnode(DMU_META_DNODE(rwa->os),
1586 offset, FTAG, &db, DMU_READ_PREFETCH | DMU_READ_NO_DECRYPT);
1587 if (err != 0) {
1588 dmu_tx_commit(tx);
1589 return (SET_ERROR(EINVAL));
1590 }
1591
1592 dmu_buf_set_crypt_params(db, rwa->or_byteorder,
1593 rwa->or_salt, rwa->or_iv, rwa->or_mac, tx);
1594
1595 dmu_buf_rele(db, FTAG);
1596
1597 rwa->or_crypt_params_present = B_FALSE;
1598 }
1599
1600 dmu_object_set_checksum(rwa->os, drro->drr_object,
1601 drro->drr_checksumtype, tx);
1602 dmu_object_set_compress(rwa->os, drro->drr_object,
1603 drro->drr_compress, tx);
1604
1605 /* handle more restrictive dnode structuring for raw recvs */
1606 if (rwa->raw) {
1607 /*
1608 * Set the indirect block size, block shift, nlevels.
1609 * This will not fail because we ensured all of the
1610 * blocks were freed earlier if this is a new object.
1611 * For non-new objects block size and indirect block
1612 * shift cannot change and nlevels can only increase.
1613 */
1614 VERIFY0(dmu_object_set_blocksize(rwa->os, drro->drr_object,
1615 drro->drr_blksz, drro->drr_indblkshift, tx));
1616 VERIFY0(dmu_object_set_nlevels(rwa->os, drro->drr_object,
1617 drro->drr_nlevels, tx));
1618
1619 /*
1620 * Set the maxblkid. This will always succeed because
1621 * we freed all blocks beyond the new maxblkid above.
1622 */
1623 VERIFY0(dmu_object_set_maxblkid(rwa->os, drro->drr_object,
1624 drro->drr_maxblkid, tx));
1625 }
1626
1627 if (data != NULL) {
1628 dmu_buf_t *db;
1629 dnode_t *dn;
1630 uint32_t flags = DMU_READ_NO_PREFETCH;
1631
1632 if (rwa->raw)
1633 flags |= DMU_READ_NO_DECRYPT;
1634
1635 VERIFY0(dnode_hold(rwa->os, drro->drr_object, FTAG, &dn));
1636 VERIFY0(dmu_bonus_hold_by_dnode(dn, FTAG, &db, flags));
1637
1638 dmu_buf_will_dirty(db, tx);
1639
1640 ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
1641 bcopy(data, db->db_data, DRR_OBJECT_PAYLOAD_SIZE(drro));
1642
1643 /*
1644 * Raw bonus buffers have their byteorder determined by the
1645 * DRR_OBJECT_RANGE record.
1646 */
1647 if (rwa->byteswap && !rwa->raw) {
1648 dmu_object_byteswap_t byteswap =
1649 DMU_OT_BYTESWAP(drro->drr_bonustype);
1650 dmu_ot_byteswap[byteswap].ob_func(db->db_data,
1651 DRR_OBJECT_PAYLOAD_SIZE(drro));
1652 }
1653 dmu_buf_rele(db, FTAG);
1654 dnode_rele(dn, FTAG);
1655 }
1656 dmu_tx_commit(tx);
1657
1658 return (0);
1659 }
1660
1661 /* ARGSUSED */
1662 noinline static int
1663 receive_freeobjects(struct receive_writer_arg *rwa,
1664 struct drr_freeobjects *drrfo)
1665 {
1666 uint64_t obj;
1667 int next_err = 0;
1668
1669 if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
1670 return (SET_ERROR(EINVAL));
1671
1672 for (obj = drrfo->drr_firstobj == 0 ? 1 : drrfo->drr_firstobj;
1673 obj < drrfo->drr_firstobj + drrfo->drr_numobjs &&
1674 obj < DN_MAX_OBJECT && next_err == 0;
1675 next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) {
1676 dmu_object_info_t doi;
1677 int err;
1678
1679 err = dmu_object_info(rwa->os, obj, &doi);
1680 if (err == ENOENT)
1681 continue;
1682 else if (err != 0)
1683 return (err);
1684
1685 err = dmu_free_long_object(rwa->os, obj);
1686
1687 if (err != 0)
1688 return (err);
1689 }
1690 if (next_err != ESRCH)
1691 return (next_err);
1692 return (0);
1693 }
1694
1695 noinline static int
1696 receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw,
1697 arc_buf_t *abuf)
1698 {
1699 int err;
1700 dmu_tx_t *tx;
1701 dnode_t *dn;
1702
1703 if (drrw->drr_offset + drrw->drr_logical_size < drrw->drr_offset ||
1704 !DMU_OT_IS_VALID(drrw->drr_type))
1705 return (SET_ERROR(EINVAL));
1706
1707 /*
1708 * For resuming to work, records must be in increasing order
1709 * by (object, offset).
1710 */
1711 if (drrw->drr_object < rwa->last_object ||
1712 (drrw->drr_object == rwa->last_object &&
1713 drrw->drr_offset < rwa->last_offset)) {
1714 return (SET_ERROR(EINVAL));
1715 }
1716 rwa->last_object = drrw->drr_object;
1717 rwa->last_offset = drrw->drr_offset;
1718
1719 if (rwa->last_object > rwa->max_object)
1720 rwa->max_object = rwa->last_object;
1721
1722 if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0)
1723 return (SET_ERROR(EINVAL));
1724
1725 tx = dmu_tx_create(rwa->os);
1726 dmu_tx_hold_write(tx, drrw->drr_object,
1727 drrw->drr_offset, drrw->drr_logical_size);
1728 err = dmu_tx_assign(tx, TXG_WAIT);
1729 if (err != 0) {
1730 dmu_tx_abort(tx);
1731 return (err);
1732 }
1733
1734 if (rwa->byteswap && !arc_is_encrypted(abuf) &&
1735 arc_get_compression(abuf) == ZIO_COMPRESS_OFF) {
1736 dmu_object_byteswap_t byteswap =
1737 DMU_OT_BYTESWAP(drrw->drr_type);
1738 dmu_ot_byteswap[byteswap].ob_func(abuf->b_data,
1739 DRR_WRITE_PAYLOAD_SIZE(drrw));
1740 }
1741
1742 /* hold the dnode so we can pass it to dmu_assign_arcbuf_by_dnode */
1743 VERIFY0(dnode_hold(rwa->os, drrw->drr_object, FTAG, &dn));
1744 err = dmu_assign_arcbuf_by_dnode(dn, drrw->drr_offset, abuf, tx);
1745 if (err != 0) {
1746 dnode_rele(dn, FTAG);
1747 dmu_tx_commit(tx);
1748 return (err);
1749 }
1750 dnode_rele(dn, FTAG);
1751
1752 /*
1753 * Note: If the receive fails, we want the resume stream to start
1754 * with the same record that we last successfully received (as opposed
1755 * to the next record), so that we can verify that we are
1756 * resuming from the correct location.
1757 */
1758 save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx);
1759 dmu_tx_commit(tx);
1760
1761 return (0);
1762 }
1763
1764 /*
1765 * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed
1766 * streams to refer to a copy of the data that is already on the
1767 * system because it came in earlier in the stream. This function
1768 * finds the earlier copy of the data, and uses that copy instead of
1769 * data from the stream to fulfill this write.
1770 */
1771 noinline static int
1772 receive_write_byref(struct receive_writer_arg *rwa,
1773 struct drr_write_byref *drrwbr)
1774 {
1775 dmu_tx_t *tx;
1776 int err;
1777 guid_map_entry_t gmesrch;
1778 guid_map_entry_t *gmep;
1779 avl_index_t where;
1780 objset_t *ref_os = NULL;
1781 int flags = DMU_READ_PREFETCH;
1782 dmu_buf_t *dbp;
1783
1784 if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
1785 return (SET_ERROR(EINVAL));
1786
1787 /*
1788 * If the GUID of the referenced dataset is different from the
1789 * GUID of the target dataset, find the referenced dataset.
1790 */
1791 if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
1792 gmesrch.guid = drrwbr->drr_refguid;
1793 if ((gmep = avl_find(rwa->guid_to_ds_map, &gmesrch,
1794 &where)) == NULL) {
1795 return (SET_ERROR(EINVAL));
1796 }
1797 if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
1798 return (SET_ERROR(EINVAL));
1799 } else {
1800 ref_os = rwa->os;
1801 }
1802
1803 if (drrwbr->drr_object > rwa->max_object)
1804 rwa->max_object = drrwbr->drr_object;
1805
1806 if (rwa->raw)
1807 flags |= DMU_READ_NO_DECRYPT;
1808
1809 /* may return either a regular db or an encrypted one */
1810 err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
1811 drrwbr->drr_refoffset, FTAG, &dbp, flags);
1812 if (err != 0)
1813 return (err);
1814
1815 tx = dmu_tx_create(rwa->os);
1816
1817 dmu_tx_hold_write(tx, drrwbr->drr_object,
1818 drrwbr->drr_offset, drrwbr->drr_length);
1819 err = dmu_tx_assign(tx, TXG_WAIT);
1820 if (err != 0) {
1821 dmu_tx_abort(tx);
1822 return (err);
1823 }
1824
1825 if (rwa->raw) {
1826 dmu_copy_from_buf(rwa->os, drrwbr->drr_object,
1827 drrwbr->drr_offset, dbp, tx);
1828 } else {
1829 dmu_write(rwa->os, drrwbr->drr_object,
1830 drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
1831 }
1832 dmu_buf_rele(dbp, FTAG);
1833
1834 /* See comment in receive_write(). */
1835 save_resume_state(rwa, drrwbr->drr_object, drrwbr->drr_offset, tx);
1836 dmu_tx_commit(tx);
1837 return (0);
1838 }
1839
1840 static int
1841 receive_write_embedded(struct receive_writer_arg *rwa,
1842 struct drr_write_embedded *drrwe, void *data)
1843 {
1844 dmu_tx_t *tx;
1845 int err;
1846
1847 if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset)
1848 return (SET_ERROR(EINVAL));
1849
1850 if (drrwe->drr_psize > BPE_PAYLOAD_SIZE)
1851 return (SET_ERROR(EINVAL));
1852
1853 if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES)
1854 return (SET_ERROR(EINVAL));
1855 if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
1856 return (SET_ERROR(EINVAL));
1857 if (rwa->raw)
1858 return (SET_ERROR(EINVAL));
1859
1860 if (drrwe->drr_object > rwa->max_object)
1861 rwa->max_object = drrwe->drr_object;
1862
1863 tx = dmu_tx_create(rwa->os);
1864
1865 dmu_tx_hold_write(tx, drrwe->drr_object,
1866 drrwe->drr_offset, drrwe->drr_length);
1867 err = dmu_tx_assign(tx, TXG_WAIT);
1868 if (err != 0) {
1869 dmu_tx_abort(tx);
1870 return (err);
1871 }
1872
1873 dmu_write_embedded(rwa->os, drrwe->drr_object,
1874 drrwe->drr_offset, data, drrwe->drr_etype,
1875 drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize,
1876 rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx);
1877
1878 /* See comment in receive_write(). */
1879 save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx);
1880 dmu_tx_commit(tx);
1881 return (0);
1882 }
1883
1884 static int
1885 receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
1886 arc_buf_t *abuf)
1887 {
1888 dmu_tx_t *tx;
1889 dmu_buf_t *db, *db_spill;
1890 int err;
1891
1892 if (drrs->drr_length < SPA_MINBLOCKSIZE ||
1893 drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os)))
1894 return (SET_ERROR(EINVAL));
1895
1896 /*
1897 * This is an unmodified spill block which was added to the stream
1898 * to resolve an issue with incorrectly removing spill blocks. It
1899 * should be ignored by current versions of the code, which support
1900 * the DRR_FLAG_SPILL_BLOCK flag.
1901 */
1902 if (rwa->spill && DRR_SPILL_IS_UNMODIFIED(drrs->drr_flags)) {
1903 dmu_return_arcbuf(abuf);
1904 return (0);
1905 }
1906
1907 if (rwa->raw) {
1908 if (!DMU_OT_IS_VALID(drrs->drr_type) ||
1909 drrs->drr_compressiontype >= ZIO_COMPRESS_FUNCTIONS ||
1910 drrs->drr_compressed_size == 0)
1911 return (SET_ERROR(EINVAL));
1912 }
1913
1914 if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0)
1915 return (SET_ERROR(EINVAL));
1916
1917 if (drrs->drr_object > rwa->max_object)
1918 rwa->max_object = drrs->drr_object;
1919
1920 VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db));
1921 if ((err = dmu_spill_hold_by_bonus(db, DMU_READ_NO_DECRYPT, FTAG,
1922 &db_spill)) != 0) {
1923 dmu_buf_rele(db, FTAG);
1924 return (err);
1925 }
1926
1927 tx = dmu_tx_create(rwa->os);
1928
1929 dmu_tx_hold_spill(tx, db->db_object);
1930
1931 err = dmu_tx_assign(tx, TXG_WAIT);
1932 if (err != 0) {
1933 dmu_buf_rele(db, FTAG);
1934 dmu_buf_rele(db_spill, FTAG);
1935 dmu_tx_abort(tx);
1936 return (err);
1937 }
1938
1939 /*
1940 * Spill blocks may both grow and shrink. When a change in size
1941 * occurs any existing dbuf must be updated to match the logical
1942 * size of the provided arc_buf_t.
1943 */
1944 if (db_spill->db_size != drrs->drr_length) {
1945 dmu_buf_will_fill(db_spill, tx);
1946 VERIFY0(dbuf_spill_set_blksz(db_spill,
1947 drrs->drr_length, tx));
1948 }
1949
1950 if (rwa->byteswap && !arc_is_encrypted(abuf) &&
1951 arc_get_compression(abuf) == ZIO_COMPRESS_OFF) {
1952 dmu_object_byteswap_t byteswap =
1953 DMU_OT_BYTESWAP(drrs->drr_type);
1954 dmu_ot_byteswap[byteswap].ob_func(abuf->b_data,
1955 DRR_SPILL_PAYLOAD_SIZE(drrs));
1956 }
1957
1958 dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx);
1959
1960 dmu_buf_rele(db, FTAG);
1961 dmu_buf_rele(db_spill, FTAG);
1962
1963 dmu_tx_commit(tx);
1964 return (0);
1965 }
1966
1967 /* ARGSUSED */
1968 noinline static int
1969 receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf)
1970 {
1971 int err;
1972
1973 if (drrf->drr_length != -1ULL &&
1974 drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
1975 return (SET_ERROR(EINVAL));
1976
1977 if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0)
1978 return (SET_ERROR(EINVAL));
1979
1980 if (drrf->drr_object > rwa->max_object)
1981 rwa->max_object = drrf->drr_object;
1982
1983 err = dmu_free_long_range(rwa->os, drrf->drr_object,
1984 drrf->drr_offset, drrf->drr_length);
1985
1986 return (err);
1987 }
1988
1989 static int
1990 receive_object_range(struct receive_writer_arg *rwa,
1991 struct drr_object_range *drror)
1992 {
1993 /*
1994 * By default, we assume this block is in our native format
1995 * (ZFS_HOST_BYTEORDER). We then take into account whether
1996 * the send stream is byteswapped (rwa->byteswap). Finally,
1997 * we need to byteswap again if this particular block was
1998 * in non-native format on the send side.
1999 */
2000 boolean_t byteorder = ZFS_HOST_BYTEORDER ^ rwa->byteswap ^
2001 !!DRR_IS_RAW_BYTESWAPPED(drror->drr_flags);
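
/*
 * Worked example (illustrative): if the stream as a whole required
 * byteswapping on this host (rwa->byteswap set) but the block was written
 * in the sender's native order (DRR_IS_RAW_BYTESWAPPED clear), the result
 * flips relative to ZFS_HOST_BYTEORDER, marking the block as non-native
 * here.  If the block had also been byteswapped on the send side, the two
 * flips cancel and the block is in our native order.
 */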
2002
2003 /*
2004 * Since dnode block sizes are constant, we should not need to worry
2005 * about making sure that the dnode block size is the same on the
2006 * sending and receiving sides for the time being. For non-raw sends,
2007 * this does not matter (and in fact we do not send a DRR_OBJECT_RANGE
2008 * record at all). Raw sends require this record type because the
2009 * encryption parameters are used to protect an entire block of bonus
2010 * buffers. If the size of dnode blocks ever becomes variable,
2011 * handling will need to be added to ensure that dnode block sizes
2012 * match on the sending and receiving side.
2013 */
2014 if (drror->drr_numslots != DNODES_PER_BLOCK ||
2015 P2PHASE(drror->drr_firstobj, DNODES_PER_BLOCK) != 0 ||
2016 !rwa->raw)
2017 return (SET_ERROR(EINVAL));
2018
2019 if (drror->drr_firstobj > rwa->max_object)
2020 rwa->max_object = drror->drr_firstobj;
2021
2022 /*
2023 * The DRR_OBJECT_RANGE handling must be deferred to receive_object()
2024 * so that the block of dnodes is not written out while it is still
2025 * empty and thereby converted to a HOLE BP.
2026 */
2027 rwa->or_crypt_params_present = B_TRUE;
2028 rwa->or_firstobj = drror->drr_firstobj;
2029 rwa->or_numslots = drror->drr_numslots;
2030 bcopy(drror->drr_salt, rwa->or_salt, ZIO_DATA_SALT_LEN);
2031 bcopy(drror->drr_iv, rwa->or_iv, ZIO_DATA_IV_LEN);
2032 bcopy(drror->drr_mac, rwa->or_mac, ZIO_DATA_MAC_LEN);
2033 rwa->or_byteorder = byteorder;
2034
2035 return (0);
2036 }
2037
2038 /*
2039 * Until we have the ability to redact large ranges of data efficiently, we
2040 * process these records as frees, so the redacted ranges read back as holes.
2041 */
2042 /* ARGSUSED */
2043 noinline static int
2044 receive_redact(struct receive_writer_arg *rwa, struct drr_redact *drrr)
2045 {
2046 struct drr_free drrf = {0};
2047 drrf.drr_length = drrr->drr_length;
2048 drrf.drr_object = drrr->drr_object;
2049 drrf.drr_offset = drrr->drr_offset;
2050 drrf.drr_toguid = drrr->drr_toguid;
2051 return (receive_free(rwa, &drrf));
2052 }
2053
2054 /* used to destroy the drc_ds on error */
2055 static void
2056 dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
2057 {
2058 dsl_dataset_t *ds = drc->drc_ds;
2059 ds_hold_flags_t dsflags = (drc->drc_raw) ? 0 : DS_HOLD_FLAG_DECRYPT;
2060
2061 /*
2062 * Wait for the txg sync before cleaning up the receive. For
2063 * resumable receives, this ensures that our resume state has
2064 * been written out to disk. For raw receives, this ensures
2065 * that the user accounting code will not attempt to do anything
2066 * after we stopped receiving the dataset.
2067 */
2068 txg_wait_synced(ds->ds_dir->dd_pool, 0);
2069 ds->ds_objset->os_raw_receive = B_FALSE;
2070
2071 rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
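/*
 * If the receive is resumable and some data has already been written
 * (the dataset's block pointer is no longer a hole), keep the partially
 * received state on disk for a later resume; otherwise destroy the
 * dataset we created for this receive.
 */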
2072 if (drc->drc_resumable && !BP_IS_HOLE(dsl_dataset_get_blkptr(ds))) {
2073 rrw_exit(&ds->ds_bp_rwlock, FTAG);
2074 dsl_dataset_disown(ds, dsflags, dmu_recv_tag);
2075 } else {
2076 char name[ZFS_MAX_DATASET_NAME_LEN];
2077 rrw_exit(&ds->ds_bp_rwlock, FTAG);
2078 dsl_dataset_name(ds, name);
2079 dsl_dataset_disown(ds, dsflags, dmu_recv_tag);
2080 (void) dsl_destroy_head(name);
2081 }
2082 }
2083
2084 static void
2085 receive_cksum(dmu_recv_cookie_t *drc, int len, void *buf)
2086 {
2087 if (drc->drc_byteswap) {
2088 (void) fletcher_4_incremental_byteswap(buf, len,
2089 &drc->drc_cksum);
2090 } else {
2091 (void) fletcher_4_incremental_native(buf, len, &drc->drc_cksum);
2092 }
2093 }
2094
2095 /*
2096 * Read the payload into a buffer of size len, and update the current record's
2097 * payload field.
2098 * Allocate drc->drc_next_rrd and read the next record's header into
2099 * drc->drc_next_rrd->header.
2100 * Verify the stream checksum over the payload and the next record's header.
2101 */
2102 static int
2103 receive_read_payload_and_next_header(dmu_recv_cookie_t *drc, int len, void *buf)
2104 {
2105 int err;
2106
2107 if (len != 0) {
2108 ASSERT3U(len, <=, SPA_MAXBLOCKSIZE);
2109 err = receive_read(drc, len, buf);
2110 if (err != 0)
2111 return (err);
2112 receive_cksum(drc, len, buf);
2113
2114 /* note: rrd is NULL when reading the begin record's payload */
2115 if (drc->drc_rrd != NULL) {
2116 drc->drc_rrd->payload = buf;
2117 drc->drc_rrd->payload_size = len;
2118 drc->drc_rrd->bytes_read = drc->drc_bytes_read;
2119 }
2120 } else {
2121 ASSERT3P(buf, ==, NULL);
2122 }
2123
2124 drc->drc_prev_cksum = drc->drc_cksum;
2125
2126 drc->drc_next_rrd = kmem_zalloc(sizeof (*drc->drc_next_rrd), KM_SLEEP);
2127 err = receive_read(drc, sizeof (drc->drc_next_rrd->header),
2128 &drc->drc_next_rrd->header);
2129 drc->drc_next_rrd->bytes_read = drc->drc_bytes_read;
2130
2131 if (err != 0) {
2132 kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
2133 drc->drc_next_rrd = NULL;
2134 return (err);
2135 }
2136 if (drc->drc_next_rrd->header.drr_type == DRR_BEGIN) {
2137 kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
2138 drc->drc_next_rrd = NULL;
2139 return (SET_ERROR(EINVAL));
2140 }
2141
2142 /*
2143 * Note: checksum is of everything up to but not including the
2144 * checksum itself.
2145 */
2146 ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
2147 ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
2148 receive_cksum(drc,
2149 offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
2150 &drc->drc_next_rrd->header);
2151
2152 zio_cksum_t cksum_orig =
2153 drc->drc_next_rrd->header.drr_u.drr_checksum.drr_checksum;
2154 zio_cksum_t *cksump =
2155 &drc->drc_next_rrd->header.drr_u.drr_checksum.drr_checksum;
2156
2157 if (drc->drc_byteswap)
2158 byteswap_record(&drc->drc_next_rrd->header);
2159
2160 if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) &&
2161 !ZIO_CHECKSUM_EQUAL(drc->drc_cksum, *cksump)) {
2162 kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
2163 drc->drc_next_rrd = NULL;
2164 return (SET_ERROR(ECKSUM));
2165 }
2166
2167 receive_cksum(drc, sizeof (cksum_orig), &cksum_orig);
2168
2169 return (0);
2170 }
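
/*
 * Illustrative framing of the stream as consumed above (a sketch, not a
 * formal definition): the current record's payload is always followed by
 * the next record's fixed-size header,
 *
 *	... | header N | payload N | header N+1 | payload N+1 | ...
 *
 * and the checksum embedded in each header covers every byte of the
 * stream before the checksum field itself.
 */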
2171
2172 /*
2173 * Issue the prefetch reads for any necessary indirect blocks.
2174 *
2175 * We use the object ignore list to tell us whether or not to issue prefetches
2176 * for a given object. We do this for both correctness (in case the blocksize
2177 * of an object has changed) and performance (if the object doesn't exist, don't
2178 * needlessly try to issue prefetches). We also trim the list as we go through
2179 * the stream to prevent it from growing to an unbounded size.
2180 *
2181 * The object numbers within will always be in sorted order, and any write
2182 * records we see will also be in sorted order, but they're not sorted with
2183 * respect to each other (i.e. we can get several object records before
2184 * receiving each object's write records). As a result, once we've reached a
2185 * given object number, we can safely remove any reference to lower object
2186 * numbers in the ignore list. In practice, we receive up to 32 object records
2187 * before receiving write records, so the list can have up to 32 nodes in it.
2188 */
2189 /* ARGSUSED */
2190 static void
2191 receive_read_prefetch(dmu_recv_cookie_t *drc, uint64_t object, uint64_t offset,
2192 uint64_t length)
2193 {
2194 if (!objlist_exists(drc->drc_ignore_objlist, object)) {
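/*
 * Prefetch at level 1 so the indirect blocks covering
 * [offset, offset + length) are cached before the writer thread
 * needs them.
 */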
2195 dmu_prefetch(drc->drc_os, object, 1, offset, length,
2196 ZIO_PRIORITY_SYNC_READ);
2197 }
2198 }
2199
2200 /*
2201 * Read records off the stream, issuing any necessary prefetches.
2202 */
2203 static int
2204 receive_read_record(dmu_recv_cookie_t *drc)
2205 {
2206 int err;
2207
2208 switch (drc->drc_rrd->header.drr_type) {
2209 case DRR_OBJECT:
2210 {
2211 struct drr_object *drro =
2212 &drc->drc_rrd->header.drr_u.drr_object;
2213 uint32_t size = DRR_OBJECT_PAYLOAD_SIZE(drro);
2214 void *buf = NULL;
2215 dmu_object_info_t doi;
2216
2217 if (size != 0)
2218 buf = kmem_zalloc(size, KM_SLEEP);
2219
2220 err = receive_read_payload_and_next_header(drc, size, buf);
2221 if (err != 0) {
2222 kmem_free(buf, size);
2223 return (err);
2224 }
2225 err = dmu_object_info(drc->drc_os, drro->drr_object, &doi);
2226 /*
2227 * See receive_read_prefetch for an explanation why we're
2228 * storing this object in the ignore_obj_list.
2229 */
2230 if (err == ENOENT || err == EEXIST ||
2231 (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
2232 objlist_insert(drc->drc_ignore_objlist,
2233 drro->drr_object);
2234 err = 0;
2235 }
2236 return (err);
2237 }
2238 case DRR_FREEOBJECTS:
2239 {
2240 err = receive_read_payload_and_next_header(drc, 0, NULL);
2241 return (err);
2242 }
2243 case DRR_WRITE:
2244 {
2245 struct drr_write *drrw = &drc->drc_rrd->header.drr_u.drr_write;
2246 arc_buf_t *abuf;
2247 boolean_t is_meta = DMU_OT_IS_METADATA(drrw->drr_type);
2248
2249 if (drc->drc_raw) {
2250 boolean_t byteorder = ZFS_HOST_BYTEORDER ^
2251 !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^
2252 drc->drc_byteswap;
2253
2254 abuf = arc_loan_raw_buf(dmu_objset_spa(drc->drc_os),
2255 drrw->drr_object, byteorder, drrw->drr_salt,
2256 drrw->drr_iv, drrw->drr_mac, drrw->drr_type,
2257 drrw->drr_compressed_size, drrw->drr_logical_size,
2258 drrw->drr_compressiontype);
2259 } else if (DRR_WRITE_COMPRESSED(drrw)) {
2260 ASSERT3U(drrw->drr_compressed_size, >, 0);
2261 ASSERT3U(drrw->drr_logical_size, >=,
2262 drrw->drr_compressed_size);
2263 ASSERT(!is_meta);
2264 abuf = arc_loan_compressed_buf(
2265 dmu_objset_spa(drc->drc_os),
2266 drrw->drr_compressed_size, drrw->drr_logical_size,
2267 drrw->drr_compressiontype);
2268 } else {
2269 abuf = arc_loan_buf(dmu_objset_spa(drc->drc_os),
2270 is_meta, drrw->drr_logical_size);
2271 }
2272
2273 err = receive_read_payload_and_next_header(drc,
2274 DRR_WRITE_PAYLOAD_SIZE(drrw), abuf->b_data);
2275 if (err != 0) {
2276 dmu_return_arcbuf(abuf);
2277 return (err);
2278 }
2279 drc->drc_rrd->arc_buf = abuf;
2280 receive_read_prefetch(drc, drrw->drr_object, drrw->drr_offset,
2281 drrw->drr_logical_size);
2282 return (err);
2283 }
2284 case DRR_WRITE_BYREF:
2285 {
2286 struct drr_write_byref *drrwb =
2287 &drc->drc_rrd->header.drr_u.drr_write_byref;
2288 err = receive_read_payload_and_next_header(drc, 0, NULL);
2289 receive_read_prefetch(drc, drrwb->drr_object, drrwb->drr_offset,
2290 drrwb->drr_length);
2291 return (err);
2292 }
2293 case DRR_WRITE_EMBEDDED:
2294 {
2295 struct drr_write_embedded *drrwe =
2296 &drc->drc_rrd->header.drr_u.drr_write_embedded;
2297 uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8);
2298 void *buf = kmem_zalloc(size, KM_SLEEP);
2299
2300 err = receive_read_payload_and_next_header(drc, size, buf);
2301 if (err != 0) {
2302 kmem_free(buf, size);
2303 return (err);
2304 }
2305
2306 receive_read_prefetch(drc, drrwe->drr_object, drrwe->drr_offset,
2307 drrwe->drr_length);
2308 return (err);
2309 }
2310 case DRR_FREE:
2311 case DRR_REDACT:
2312 {
2313 /*
2314 * It might be beneficial to prefetch indirect blocks here, but
2315 * we don't really have the data to decide for sure.
2316 */
2317 err = receive_read_payload_and_next_header(drc, 0, NULL);
2318 return (err);
2319 }
2320 case DRR_END:
2321 {
2322 struct drr_end *drre = &drc->drc_rrd->header.drr_u.drr_end;
2323 if (!ZIO_CHECKSUM_EQUAL(drc->drc_prev_cksum,
2324 drre->drr_checksum))
2325 return (SET_ERROR(ECKSUM));
2326 return (0);
2327 }
2328 case DRR_SPILL:
2329 {
2330 struct drr_spill *drrs = &drc->drc_rrd->header.drr_u.drr_spill;
2331 arc_buf_t *abuf;
2332 /* DRR_SPILL records are either raw or uncompressed */
2333 if (drc->drc_raw) {
2334 boolean_t byteorder = ZFS_HOST_BYTEORDER ^
2335 !!DRR_IS_RAW_BYTESWAPPED(drrs->drr_flags) ^
2336 drc->drc_byteswap;
2337
2338 abuf = arc_loan_raw_buf(dmu_objset_spa(drc->drc_os),
2339 drrs->drr_object, byteorder, drrs->drr_salt,
2340 drrs->drr_iv, drrs->drr_mac, drrs->drr_type,
2341 drrs->drr_compressed_size, drrs->drr_length,
2342 drrs->drr_compressiontype);
2343 } else {
2344 abuf = arc_loan_buf(dmu_objset_spa(drc->drc_os),
2345 DMU_OT_IS_METADATA(drrs->drr_type),
2346 drrs->drr_length);
2347 }
2348 err = receive_read_payload_and_next_header(drc,
2349 DRR_SPILL_PAYLOAD_SIZE(drrs), abuf->b_data);
2350 if (err != 0)
2351 dmu_return_arcbuf(abuf);
2352 else
2353 drc->drc_rrd->arc_buf = abuf;
2354 return (err);
2355 }
2356 case DRR_OBJECT_RANGE:
2357 {
2358 err = receive_read_payload_and_next_header(drc, 0, NULL);
2359 return (err);
2361 }
2362 default:
2363 return (SET_ERROR(EINVAL));
2364 }
2365 }
2366
2369 static void
2370 dprintf_drr(struct receive_record_arg *rrd, int err)
2371 {
2372 #ifdef ZFS_DEBUG
2373 switch (rrd->header.drr_type) {
2374 case DRR_OBJECT:
2375 {
2376 struct drr_object *drro = &rrd->header.drr_u.drr_object;
2377 dprintf("drr_type = OBJECT obj = %llu type = %u "
2378 "bonustype = %u blksz = %u bonuslen = %u cksumtype = %u "
2379 "compress = %u dn_slots = %u err = %d\n",
2380 drro->drr_object, drro->drr_type, drro->drr_bonustype,
2381 drro->drr_blksz, drro->drr_bonuslen,
2382 drro->drr_checksumtype, drro->drr_compress,
2383 drro->drr_dn_slots, err);
2384 break;
2385 }
2386 case DRR_FREEOBJECTS:
2387 {
2388 struct drr_freeobjects *drrfo =
2389 &rrd->header.drr_u.drr_freeobjects;
2390 dprintf("drr_type = FREEOBJECTS firstobj = %llu "
2391 "numobjs = %llu err = %d\n",
2392 drrfo->drr_firstobj, drrfo->drr_numobjs, err);
2393 break;
2394 }
2395 case DRR_WRITE:
2396 {
2397 struct drr_write *drrw = &rrd->header.drr_u.drr_write;
2398 dprintf("drr_type = WRITE obj = %llu type = %u offset = %llu "
2399 "lsize = %llu cksumtype = %u flags = %u "
2400 "compress = %u psize = %llu err = %d\n",
2401 drrw->drr_object, drrw->drr_type, drrw->drr_offset,
2402 drrw->drr_logical_size, drrw->drr_checksumtype,
2403 drrw->drr_flags, drrw->drr_compressiontype,
2404 drrw->drr_compressed_size, err);
2405 break;
2406 }
2407 case DRR_WRITE_BYREF:
2408 {
2409 struct drr_write_byref *drrwbr =
2410 &rrd->header.drr_u.drr_write_byref;
2411 dprintf("drr_type = WRITE_BYREF obj = %llu offset = %llu "
2412 "length = %llu toguid = %llx refguid = %llx "
2413 "refobject = %llu refoffset = %llu cksumtype = %u "
2414 "flags = %u err = %d\n",
2415 drrwbr->drr_object, drrwbr->drr_offset,
2416 drrwbr->drr_length, drrwbr->drr_toguid,
2417 drrwbr->drr_refguid, drrwbr->drr_refobject,
2418 drrwbr->drr_refoffset, drrwbr->drr_checksumtype,
2419 drrwbr->drr_flags, err);
2420 break;
2421 }
2422 case DRR_WRITE_EMBEDDED:
2423 {
2424 struct drr_write_embedded *drrwe =
2425 &rrd->header.drr_u.drr_write_embedded;
2426 dprintf("drr_type = WRITE_EMBEDDED obj = %llu offset = %llu "
2427 "length = %llu compress = %u etype = %u lsize = %u "
2428 "psize = %u err = %d\n",
2429 drrwe->drr_object, drrwe->drr_offset, drrwe->drr_length,
2430 drrwe->drr_compression, drrwe->drr_etype,
2431 drrwe->drr_lsize, drrwe->drr_psize, err);
2432 break;
2433 }
2434 case DRR_FREE:
2435 {
2436 struct drr_free *drrf = &rrd->header.drr_u.drr_free;
2437 dprintf("drr_type = FREE obj = %llu offset = %llu "
2438 "length = %lld err = %d\n",
2439 drrf->drr_object, drrf->drr_offset, drrf->drr_length,
2440 err);
2441 break;
2442 }
2443 case DRR_SPILL:
2444 {
2445 struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
2446 dprintf("drr_type = SPILL obj = %llu length = %llu "
2447 "err = %d\n", drrs->drr_object, drrs->drr_length, err);
2448 break;
2449 }
2450 case DRR_OBJECT_RANGE:
2451 {
2452 struct drr_object_range *drror =
2453 &rrd->header.drr_u.drr_object_range;
2454 dprintf("drr_type = OBJECT_RANGE firstobj = %llu "
2455 "numslots = %llu flags = %u err = %d\n",
2456 drror->drr_firstobj, drror->drr_numslots,
2457 drror->drr_flags, err);
2458 break;
2459 }
2460 default:
2461 return;
2462 }
2463 #endif
2464 }
2465
2466 /*
2467 * Commit the records to the pool.
2468 */
2469 static int
2470 receive_process_record(struct receive_writer_arg *rwa,
2471 struct receive_record_arg *rrd)
2472 {
2473 int err;
2474
2475 /* Processing in order, therefore bytes_read should be increasing. */
2476 ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read);
2477 rwa->bytes_read = rrd->bytes_read;
2478
2479 switch (rrd->header.drr_type) {
2480 case DRR_OBJECT:
2481 {
2482 struct drr_object *drro = &rrd->header.drr_u.drr_object;
2483 err = receive_object(rwa, drro, rrd->payload);
2484 kmem_free(rrd->payload, rrd->payload_size);
2485 rrd->payload = NULL;
2486 break;
2487 }
2488 case DRR_FREEOBJECTS:
2489 {
2490 struct drr_freeobjects *drrfo =
2491 &rrd->header.drr_u.drr_freeobjects;
2492 err = receive_freeobjects(rwa, drrfo);
2493 break;
2494 }
2495 case DRR_WRITE:
2496 {
2497 struct drr_write *drrw = &rrd->header.drr_u.drr_write;
2498 err = receive_write(rwa, drrw, rrd->arc_buf);
2499 /* if receive_write() is successful, it consumes the arc_buf */
2500 if (err != 0)
2501 dmu_return_arcbuf(rrd->arc_buf);
2502 rrd->arc_buf = NULL;
2503 rrd->payload = NULL;
2504 break;
2505 }
2506 case DRR_WRITE_BYREF:
2507 {
2508 struct drr_write_byref *drrwbr =
2509 &rrd->header.drr_u.drr_write_byref;
2510 err = receive_write_byref(rwa, drrwbr);
2511 break;
2512 }
2513 case DRR_WRITE_EMBEDDED:
2514 {
2515 struct drr_write_embedded *drrwe =
2516 &rrd->header.drr_u.drr_write_embedded;
2517 err = receive_write_embedded(rwa, drrwe, rrd->payload);
2518 kmem_free(rrd->payload, rrd->payload_size);
2519 rrd->payload = NULL;
2520 break;
2521 }
2522 case DRR_FREE:
2523 {
2524 struct drr_free *drrf = &rrd->header.drr_u.drr_free;
2525 err = receive_free(rwa, drrf);
2526 break;
2527 }
2528 case DRR_SPILL:
2529 {
2530 struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
2531 err = receive_spill(rwa, drrs, rrd->arc_buf);
2532 if (err != 0)
2533 dmu_return_arcbuf(rrd->arc_buf);
2534 rrd->arc_buf = NULL;
2535 rrd->payload = NULL;
2536 break;
2537 }
2538 case DRR_OBJECT_RANGE:
2539 {
2540 struct drr_object_range *drror =
2541 &rrd->header.drr_u.drr_object_range;
2542 err = receive_object_range(rwa, drror);
2543 break;
2544 }
2545 case DRR_REDACT:
2546 {
2547 struct drr_redact *drrr = &rrd->header.drr_u.drr_redact;
2548 err = receive_redact(rwa, drrr);
2549 break;
2550 }
2551 default:
2552 err = (SET_ERROR(EINVAL));
2553 }
2554
2555 if (err != 0)
2556 dprintf_drr(rrd, err);
2557
2558 return (err);
2559 }
2560
2561 /*
2562 * dmu_recv_stream's worker thread; pulls records off the queue and calls
2563 * receive_process_record().  When we're done, signal the main thread and exit.
2564 */
2565 static void
2566 receive_writer_thread(void *arg)
2567 {
2568 struct receive_writer_arg *rwa = arg;
2569 struct receive_record_arg *rrd;
2570 fstrans_cookie_t cookie = spl_fstrans_mark();
2571
2572 for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker;
2573 rrd = bqueue_dequeue(&rwa->q)) {
2574 /*
2575 * If there's an error, the main thread will stop putting things
2576 * on the queue, but we need to clear everything in it before we
2577 * can exit.
2578 */
2579 if (rwa->err == 0) {
2580 rwa->err = receive_process_record(rwa, rrd);
2581 } else if (rrd->arc_buf != NULL) {
2582 dmu_return_arcbuf(rrd->arc_buf);
2583 rrd->arc_buf = NULL;
2584 rrd->payload = NULL;
2585 } else if (rrd->payload != NULL) {
2586 kmem_free(rrd->payload, rrd->payload_size);
2587 rrd->payload = NULL;
2588 }
2589 kmem_free(rrd, sizeof (*rrd));
2590 }
2591 kmem_free(rrd, sizeof (*rrd));
2592 mutex_enter(&rwa->mutex);
2593 rwa->done = B_TRUE;
2594 cv_signal(&rwa->cv);
2595 mutex_exit(&rwa->mutex);
2596 spl_fstrans_unmark(cookie);
2597 thread_exit();
2598 }
2599
2600 static int
2601 resume_check(dmu_recv_cookie_t *drc, nvlist_t *begin_nvl)
2602 {
2603 uint64_t val;
2604 objset_t *mos = dmu_objset_pool(drc->drc_os)->dp_meta_objset;
2605 uint64_t dsobj = dmu_objset_id(drc->drc_os);
2606 uint64_t resume_obj, resume_off;
2607
2608 if (nvlist_lookup_uint64(begin_nvl,
2609 "resume_object", &resume_obj) != 0 ||
2610 nvlist_lookup_uint64(begin_nvl,
2611 "resume_offset", &resume_off) != 0) {
2612 return (SET_ERROR(EINVAL));
2613 }
2614 VERIFY0(zap_lookup(mos, dsobj,
2615 DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val));
2616 if (resume_obj != val)
2617 return (SET_ERROR(EINVAL));
2618 VERIFY0(zap_lookup(mos, dsobj,
2619 DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val));
2620 if (resume_off != val)
2621 return (SET_ERROR(EINVAL));
2622
2623 return (0);
2624 }
2625
2626 /*
2627 * Read in the stream's records, one by one, and apply them to the pool. There
2628 * are two threads involved; the thread that calls this function will spin up a
2629 * worker thread, read the records off the stream one by one, and issue
2630 * prefetches for any necessary indirect blocks. It will then push the records
2631 * onto an internal blocking queue. The worker thread will pull the records off
2632 * the queue, and actually write the data into the DMU. This way, the worker
2633 * thread doesn't have to wait for reads to complete, since everything it needs
2634 * (the indirect blocks) will be prefetched.
2635 *
2636 * NB: callers *must* call dmu_recv_end() if this succeeds.
2637 */
2638 int
2639 dmu_recv_stream(dmu_recv_cookie_t *drc, int cleanup_fd,
2640 uint64_t *action_handlep, offset_t *voffp)
2641 {
2642 int err = 0;
2643 struct receive_writer_arg *rwa = kmem_zalloc(sizeof (*rwa), KM_SLEEP);
2644
2645 if (dsl_dataset_is_zapified(drc->drc_ds)) {
2646 uint64_t bytes;
2647 (void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset,
2648 drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES,
2649 sizeof (bytes), 1, &bytes);
2650 drc->drc_bytes_read += bytes;
2651 }
2652
2653 drc->drc_ignore_objlist = objlist_create();
2654
2655 /* these were verified in dmu_recv_begin */
2656 ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
2657 DMU_SUBSTREAM);
2658 ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);
2659
2660 /*
2661 * Open the objset we are modifying.
2662 */
2663 VERIFY0(dmu_objset_from_ds(drc->drc_ds, &drc->drc_os));
2664 ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT);
2665 ASSERT0(drc->drc_os->os_encrypted &&
2666 (drc->drc_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA));
2667
2668 /* if this stream is dedup'ed, set up the avl tree for guid mapping */
2669 if (drc->drc_featureflags & DMU_BACKUP_FEATURE_DEDUP) {
2670 minor_t minor;
2671
2672 if (cleanup_fd == -1) {
2673 err = SET_ERROR(EBADF);
2674 goto out;
2675 }
2676 err = zfs_onexit_fd_hold(cleanup_fd, &minor);
2677 if (err != 0) {
2678 cleanup_fd = -1;
2679 goto out;
2680 }
2681
2682 if (*action_handlep == 0) {
2683 rwa->guid_to_ds_map =
2684 kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
2685 avl_create(rwa->guid_to_ds_map, guid_compare,
2686 sizeof (guid_map_entry_t),
2687 offsetof(guid_map_entry_t, avlnode));
2688 err = zfs_onexit_add_cb(minor,
2689 free_guid_map_onexit, rwa->guid_to_ds_map,
2690 action_handlep);
2691 if (err != 0)
2692 goto out;
2693 } else {
2694 err = zfs_onexit_cb_data(minor, *action_handlep,
2695 (void **)&rwa->guid_to_ds_map);
2696 if (err != 0)
2697 goto out;
2698 }
2699
2700 drc->drc_guid_to_ds_map = rwa->guid_to_ds_map;
2701 }
2702
2703 /* handle DSL encryption key payload */
2704 if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) {
2705 nvlist_t *keynvl = NULL;
2706
2707 ASSERT(drc->drc_os->os_encrypted);
2708 ASSERT(drc->drc_raw);
2709
2710 err = nvlist_lookup_nvlist(drc->drc_begin_nvl, "crypt_keydata",
2711 &keynvl);
2712 if (err != 0)
2713 goto out;
2714
2715 /*
2716 * If this is a new dataset we set the key immediately.
2717 * Otherwise we don't want to change the key until we
2718 * are sure the rest of the receive succeeded, so we stash
2719 * the keynvl away until then.
2720 */
2721 err = dsl_crypto_recv_raw(spa_name(drc->drc_os->os_spa),
2722 drc->drc_ds->ds_object, drc->drc_fromsnapobj,
2723 drc->drc_drrb->drr_type, keynvl, drc->drc_newfs);
2724 if (err != 0)
2725 goto out;
2726
2727 /* see comment in dmu_recv_end_sync() */
2728 drc->drc_ivset_guid = 0;
2729 (void) nvlist_lookup_uint64(keynvl, "to_ivset_guid",
2730 &drc->drc_ivset_guid);
2731
2732 if (!drc->drc_newfs)
2733 drc->drc_keynvl = fnvlist_dup(keynvl);
2734 }
2735
2736 if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING) {
2737 err = resume_check(drc, drc->drc_begin_nvl);
2738 if (err != 0)
2739 goto out;
2740 }
2741
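/*
 * The queue is sized in bytes and must be able to hold at least two
 * maximum-sized records, since an individual enqueued write record can
 * approach zfs_max_recordsize; zfs_recv_queue_ff is the bqueue fill
 * fraction used to batch wakeups between the two threads.
 */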
2742 (void) bqueue_init(&rwa->q, zfs_recv_queue_ff,
2743 MAX(zfs_recv_queue_length, 2 * zfs_max_recordsize),
2744 offsetof(struct receive_record_arg, node));
2745 cv_init(&rwa->cv, NULL, CV_DEFAULT, NULL);
2746 mutex_init(&rwa->mutex, NULL, MUTEX_DEFAULT, NULL);
2747 rwa->os = drc->drc_os;
2748 rwa->byteswap = drc->drc_byteswap;
2749 rwa->resumable = drc->drc_resumable;
2750 rwa->raw = drc->drc_raw;
2751 rwa->spill = drc->drc_spill;
2752 rwa->os->os_raw_receive = drc->drc_raw;
2753
2754 (void) thread_create(NULL, 0, receive_writer_thread, rwa, 0, curproc,
2755 TS_RUN, minclsyspri);
2756 /*
2757 * We're reading rwa->err without locks, which is safe since we are the
2758 * only reader, and the worker thread is the only writer. It's ok if we
2759 * miss a write for an iteration or two of the loop, since the writer
2760 * thread will keep freeing records we send it until we send it an eos
2761 * marker.
2762 *
2763 * We can leave this loop in 3 ways: First, if rwa->err is
2764 * non-zero. In that case, the writer thread will free the rrd we just
2765 * pushed. Second, if we're interrupted; in that case, either it's the
2766 * first loop and drc->drc_rrd was never allocated, or it's later, and
2767 * drc->drc_rrd has been handed off to the writer thread who will free
2768 * it. Finally, if receive_read_record fails or we're at the end of the
2769 * stream, then we free drc->drc_rrd and exit.
2770 */
2771 while (rwa->err == 0) {
2772 if (issig(JUSTLOOKING) && issig(FORREAL)) {
2773 err = SET_ERROR(EINTR);
2774 break;
2775 }
2776
2777 ASSERT3P(drc->drc_rrd, ==, NULL);
2778 drc->drc_rrd = drc->drc_next_rrd;
2779 drc->drc_next_rrd = NULL;
2780 /* Allocates and loads header into drc->drc_next_rrd */
2781 err = receive_read_record(drc);
2782
2783 if (drc->drc_rrd->header.drr_type == DRR_END || err != 0) {
2784 kmem_free(drc->drc_rrd, sizeof (*drc->drc_rrd));
2785 drc->drc_rrd = NULL;
2786 break;
2787 }
2788
2789 bqueue_enqueue(&rwa->q, drc->drc_rrd,
2790 sizeof (struct receive_record_arg) +
2791 drc->drc_rrd->payload_size);
2792 drc->drc_rrd = NULL;
2793 }
2794
2795 ASSERT3P(drc->drc_rrd, ==, NULL);
2796 drc->drc_rrd = kmem_zalloc(sizeof (*drc->drc_rrd), KM_SLEEP);
2797 drc->drc_rrd->eos_marker = B_TRUE;
2798 bqueue_enqueue_flush(&rwa->q, drc->drc_rrd, 1);
2799
2800 mutex_enter(&rwa->mutex);
2801 while (!rwa->done) {
2802 /*
2803 * We need to use cv_wait_sig() so that any process that may
2804 * be sleeping here can still fork.
2805 */
2806 (void) cv_wait_sig(&rwa->cv, &rwa->mutex);
2807 }
2808 mutex_exit(&rwa->mutex);
2809
2810 /*
2811 * If we are receiving a full stream as a clone, all object IDs which
2812 * are greater than the maximum ID referenced in the stream are
2813 * by definition unused and must be freed.
2814 */
2815 if (drc->drc_clone && drc->drc_drrb->drr_fromguid == 0) {
2816 uint64_t obj = rwa->max_object + 1;
2817 int free_err = 0;
2818 int next_err = 0;
2819
2820 while (next_err == 0) {
2821 free_err = dmu_free_long_object(rwa->os, obj);
2822 if (free_err != 0 && free_err != ENOENT)
2823 break;
2824
2825 next_err = dmu_object_next(rwa->os, &obj, FALSE, 0);
2826 }
2827
2828 if (err == 0) {
2829 if (free_err != 0 && free_err != ENOENT)
2830 err = free_err;
2831 else if (next_err != ESRCH)
2832 err = next_err;
2833 }
2834 }
2835
2836 cv_destroy(&rwa->cv);
2837 mutex_destroy(&rwa->mutex);
2838 bqueue_destroy(&rwa->q);
2839 if (err == 0)
2840 err = rwa->err;
2841
2842 out:
2843 /*
2844 * If we hit an error before we started the receive_writer_thread,
2845 * we need to clean up the next_rrd we created while processing the
2846 * DRR_BEGIN record.
2847 */
2848 if (drc->drc_next_rrd != NULL)
2849 kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd));
2850
2851 kmem_free(rwa, sizeof (*rwa));
2852 nvlist_free(drc->drc_begin_nvl);
2853 if ((drc->drc_featureflags & DMU_BACKUP_FEATURE_DEDUP) &&
2854 (cleanup_fd != -1))
2855 zfs_onexit_fd_rele(cleanup_fd);
2856
2857 if (err != 0) {
2858 /*
2859 * Clean up references. If the receive is not resumable,
2860 * destroy what we created, so we don't leave it in
2861 * an inconsistent state.
2862 */
2863 dmu_recv_cleanup_ds(drc);
2864 nvlist_free(drc->drc_keynvl);
2865 }
2866
2867 objlist_destroy(drc->drc_ignore_objlist);
2868 drc->drc_ignore_objlist = NULL;
2869 *voffp = drc->drc_voff;
2870 return (err);
2871 }
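
/*
 * Illustrative call sequence (a sketch of how a caller such as the receive
 * ioctl path is expected to drive this code; dmu_recv_begin()'s signature
 * is not shown in this file):
 *
 *	err = dmu_recv_begin(...);
 *	if (err == 0)
 *		err = dmu_recv_stream(&drc, cleanup_fd, &action_handle, &off);
 *	if (err == 0)
 *		err = dmu_recv_end(&drc, owner);
 *
 * On dmu_recv_stream() failure the dataset is already cleaned up via
 * dmu_recv_cleanup_ds(), so dmu_recv_end() must only be called when
 * dmu_recv_stream() succeeds.
 */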
2872
2873 static int
2874 dmu_recv_end_check(void *arg, dmu_tx_t *tx)
2875 {
2876 dmu_recv_cookie_t *drc = arg;
2877 dsl_pool_t *dp = dmu_tx_pool(tx);
2878 int error;
2879
2880 ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);
2881
2882 if (!drc->drc_newfs) {
2883 dsl_dataset_t *origin_head;
2884
2885 error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
2886 if (error != 0)
2887 return (error);
2888 if (drc->drc_force) {
2889 /*
2890 * We will destroy any snapshots in tofs (i.e. before
2891 * origin_head) that are after the origin (which is
2892 * the snap before drc_ds, because drc_ds cannot
2893 * have any snaps of its own).
2894 */
2895 uint64_t obj;
2896
2897 obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
2898 while (obj !=
2899 dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
2900 dsl_dataset_t *snap;
2901 error = dsl_dataset_hold_obj(dp, obj, FTAG,
2902 &snap);
2903 if (error != 0)
2904 break;
2905 if (snap->ds_dir != origin_head->ds_dir)
2906 error = SET_ERROR(EINVAL);
2907 if (error == 0) {
2908 error = dsl_destroy_snapshot_check_impl(
2909 snap, B_FALSE);
2910 }
2911 obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
2912 dsl_dataset_rele(snap, FTAG);
2913 if (error != 0)
2914 break;
2915 }
2916 if (error != 0) {
2917 dsl_dataset_rele(origin_head, FTAG);
2918 return (error);
2919 }
2920 }
2921 if (drc->drc_keynvl != NULL) {
2922 error = dsl_crypto_recv_raw_key_check(drc->drc_ds,
2923 drc->drc_keynvl, tx);
2924 if (error != 0) {
2925 dsl_dataset_rele(origin_head, FTAG);
2926 return (error);
2927 }
2928 }
2929
2930 error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
2931 origin_head, drc->drc_force, drc->drc_owner, tx);
2932 if (error != 0) {
2933 dsl_dataset_rele(origin_head, FTAG);
2934 return (error);
2935 }
2936 error = dsl_dataset_snapshot_check_impl(origin_head,
2937 drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
2938 dsl_dataset_rele(origin_head, FTAG);
2939 if (error != 0)
2940 return (error);
2941
2942 error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
2943 } else {
2944 error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
2945 drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
2946 }
2947 return (error);
2948 }
2949
2950 static void
2951 dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
2952 {
2953 dmu_recv_cookie_t *drc = arg;
2954 dsl_pool_t *dp = dmu_tx_pool(tx);
2955 boolean_t encrypted = drc->drc_ds->ds_dir->dd_crypto_obj != 0;
2956
2957 spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
2958 tx, "snap=%s", drc->drc_tosnap);
2959 drc->drc_ds->ds_objset->os_raw_receive = B_FALSE;
2960
2961 if (!drc->drc_newfs) {
2962 dsl_dataset_t *origin_head;
2963
2964 VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
2965 &origin_head));
2966
2967 if (drc->drc_force) {
2968 /*
2969 * Destroy any snapshots of drc_tofs (origin_head)
2970 * after the origin (the snap before drc_ds).
2971 */
2972 uint64_t obj;
2973
2974 obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
2975 while (obj !=
2976 dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
2977 dsl_dataset_t *snap;
2978 VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG,
2979 &snap));
2980 ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir);
2981 obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
2982 dsl_destroy_snapshot_sync_impl(snap,
2983 B_FALSE, tx);
2984 dsl_dataset_rele(snap, FTAG);
2985 }
2986 }
2987 if (drc->drc_keynvl != NULL) {
2988 dsl_crypto_recv_raw_key_sync(drc->drc_ds,
2989 drc->drc_keynvl, tx);
2990 nvlist_free(drc->drc_keynvl);
2991 drc->drc_keynvl = NULL;
2992 }
2993
2994 VERIFY3P(drc->drc_ds->ds_prev, ==,
2995 origin_head->ds_prev);
2996
2997 dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
2998 origin_head, tx);
2999 dsl_dataset_snapshot_sync_impl(origin_head,
3000 drc->drc_tosnap, tx);
3001
3002 /* set snapshot's creation time and guid */
3003 dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
3004 dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time =
3005 drc->drc_drrb->drr_creation_time;
3006 dsl_dataset_phys(origin_head->ds_prev)->ds_guid =
3007 drc->drc_drrb->drr_toguid;
3008 dsl_dataset_phys(origin_head->ds_prev)->ds_flags &=
3009 ~DS_FLAG_INCONSISTENT;
3010
3011 dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
3012 dsl_dataset_phys(origin_head)->ds_flags &=
3013 ~DS_FLAG_INCONSISTENT;
3014
3015 drc->drc_newsnapobj =
3016 dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
3017
3018 dsl_dataset_rele(origin_head, FTAG);
3019 dsl_destroy_head_sync_impl(drc->drc_ds, tx);
3020
3021 if (drc->drc_owner != NULL)
3022 VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner);
3023 } else {
3024 dsl_dataset_t *ds = drc->drc_ds;
3025
3026 dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx);
3027
3028 /* set snapshot's creation time and guid */
3029 dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
3030 dsl_dataset_phys(ds->ds_prev)->ds_creation_time =
3031 drc->drc_drrb->drr_creation_time;
3032 dsl_dataset_phys(ds->ds_prev)->ds_guid =
3033 drc->drc_drrb->drr_toguid;
3034 dsl_dataset_phys(ds->ds_prev)->ds_flags &=
3035 ~DS_FLAG_INCONSISTENT;
3036
3037 dmu_buf_will_dirty(ds->ds_dbuf, tx);
3038 dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
3039 if (dsl_dataset_has_resume_receive_state(ds)) {
3040 (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
3041 DS_FIELD_RESUME_FROMGUID, tx);
3042 (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
3043 DS_FIELD_RESUME_OBJECT, tx);
3044 (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
3045 DS_FIELD_RESUME_OFFSET, tx);
3046 (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
3047 DS_FIELD_RESUME_BYTES, tx);
3048 (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
3049 DS_FIELD_RESUME_TOGUID, tx);
3050 (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
3051 DS_FIELD_RESUME_TONAME, tx);
3052 (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
3053 DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, tx);
3054 }
3055 drc->drc_newsnapobj =
3056 dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
3057 }
3058
3059 /*
3060 * If this is a raw receive, the crypt_keydata nvlist will include
3061 * a to_ivset_guid for us to set on the new snapshot. This value
3062 * will override the value generated by the snapshot code. However,
3063 * this value may not be present, because older implementations of
3064 * the raw send code did not include it. Such streams may still be
3065 * received if the zfs_disable_ivset_guid_check tunable is set, in
3066 * which case we keep the newly-generated value.
3068 */
3069 if (drc->drc_raw && drc->drc_ivset_guid != 0) {
3070 dmu_object_zapify(dp->dp_meta_objset, drc->drc_newsnapobj,
3071 DMU_OT_DSL_DATASET, tx);
3072 VERIFY0(zap_update(dp->dp_meta_objset, drc->drc_newsnapobj,
3073 DS_FIELD_IVSET_GUID, sizeof (uint64_t), 1,
3074 &drc->drc_ivset_guid, tx));
3075 }
3076
3077 zvol_create_minors(dp->dp_spa, drc->drc_tofs, B_TRUE);
3078
3079 /*
3080 * Release the hold from dmu_recv_begin. This must be done before
3081 * we return to open context, so that when we free the dataset's dnode
3082 * we can evict its bonus buffer. Since the dataset may be destroyed
3083 * at this point (and therefore won't have a valid pointer to the spa)
3084 * we release the key mapping manually here while we do have a valid
3085 * pointer, if it exists.
3086 */
3087 if (!drc->drc_raw && encrypted) {
3088 (void) spa_keystore_remove_mapping(dmu_tx_pool(tx)->dp_spa,
3089 drc->drc_ds->ds_object, drc->drc_ds);
3090 }
3091 dsl_dataset_disown(drc->drc_ds, 0, dmu_recv_tag);
3092 drc->drc_ds = NULL;
3093 }
3094
3095 static int
3096 add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj,
3097 boolean_t raw)
3098 {
3099 dsl_pool_t *dp;
3100 dsl_dataset_t *snapds;
3101 guid_map_entry_t *gmep;
3102 objset_t *os;
3103 ds_hold_flags_t dsflags = (raw) ? 0 : DS_HOLD_FLAG_DECRYPT;
3104 int err;
3105
3106 ASSERT(guid_map != NULL);
3107
3108 err = dsl_pool_hold(name, FTAG, &dp);
3109 if (err != 0)
3110 return (err);
3111 gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP);
3112 err = dsl_dataset_own_obj(dp, snapobj, dsflags, gmep, &snapds);
3113
3114 if (err == 0) {
3115 /*
3116 * If this is a deduplicated raw send stream, we need
3117 * to make sure that we can still read raw blocks from
3118 * earlier datasets in the stream, so we set the
3119 * os_raw_receive flag now.
3120 */
3121 if (raw) {
3122 err = dmu_objset_from_ds(snapds, &os);
3123 if (err != 0) {
3124 dsl_dataset_disown(snapds, dsflags, FTAG);
3125 dsl_pool_rele(dp, FTAG);
3126 kmem_free(gmep, sizeof (*gmep));
3127 return (err);
3128 }
3129 os->os_raw_receive = B_TRUE;
3130 }
3131
3132 gmep->raw = raw;
3133 gmep->guid = dsl_dataset_phys(snapds)->ds_guid;
3134 gmep->gme_ds = snapds;
3135 avl_add(guid_map, gmep);
3136 } else {
3137 kmem_free(gmep, sizeof (*gmep));
3138 }
3139
3140 dsl_pool_rele(dp, FTAG);
3141 return (err);
3142 }
3143
3144 static int dmu_recv_end_modified_blocks = 3;
3145
3146 static int
3147 dmu_recv_existing_end(dmu_recv_cookie_t *drc)
3148 {
3149 #ifdef _KERNEL
3150 /*
3151 * We will be destroying the ds; make sure its origin is unmounted if
3152 * necessary.
3153 */
3154 char name[ZFS_MAX_DATASET_NAME_LEN];
3155 dsl_dataset_name(drc->drc_ds, name);
3156 zfs_destroy_unmount_origin(name);
3157 #endif
3158
3159 return (dsl_sync_task(drc->drc_tofs,
3160 dmu_recv_end_check, dmu_recv_end_sync, drc,
3161 dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
3162 }
3163
3164 static int
3165 dmu_recv_new_end(dmu_recv_cookie_t *drc)
3166 {
3167 return (dsl_sync_task(drc->drc_tofs,
3168 dmu_recv_end_check, dmu_recv_end_sync, drc,
3169 dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
3170 }
3171
3172 int
3173 dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
3174 {
3175 int error;
3176
3177 drc->drc_owner = owner;
3178
3179 if (drc->drc_newfs)
3180 error = dmu_recv_new_end(drc);
3181 else
3182 error = dmu_recv_existing_end(drc);
3183
3184 if (error != 0) {
3185 dmu_recv_cleanup_ds(drc);
3186 nvlist_free(drc->drc_keynvl);
3187 } else if (drc->drc_guid_to_ds_map != NULL) {
3188 (void) add_ds_to_guidmap(drc->drc_tofs, drc->drc_guid_to_ds_map,
3189 drc->drc_newsnapobj, drc->drc_raw);
3190 }
3191 return (error);
3192 }
3193
3194 /*
3195 * Return TRUE if this objset is currently being received into.
3196 */
3197 boolean_t
3198 dmu_objset_is_receiving(objset_t *os)
3199 {
3200 return (os->os_dsl_dataset != NULL &&
3201 os->os_dsl_dataset->ds_owner == dmu_recv_tag);
3202 }
3203
3204 #if defined(_KERNEL)
3205 module_param(zfs_recv_queue_length, int, 0644);
3206 MODULE_PARM_DESC(zfs_recv_queue_length, "Maximum receive queue length");
3207
3208 module_param(zfs_recv_queue_ff, int, 0644);
3209 MODULE_PARM_DESC(zfs_recv_queue_ff, "Receive queue fill fraction");
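
/*
 * Example (illustrative): with mode 0644 these tunables can be adjusted at
 * runtime through the module parameter interface, e.g.
 *
 *	echo 33554432 > /sys/module/zfs/parameters/zfs_recv_queue_length
 */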
3210 #endif