/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
 * Copyright 2016 RackTop Systems.
 * Copyright (c) 2016 Actifio, Inc. All rights reserved.
 * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
 */
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/spa_impl.h>
#include <sys/zfs_ioctl.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_znode.h>
#include <zfs_fletcher.h>
#include <sys/zfs_onexit.h>
#include <sys/dmu_recv.h>
#include <sys/dsl_destroy.h>
#include <sys/blkptr.h>
#include <sys/dsl_bookmark.h>
#include <sys/zfeature.h>
#include <sys/bqueue.h>
#include <sys/policy.h>
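/*
 * Tunable; used to bound, in bytes, the queue of records that have been
 * read off the stream but not yet applied by the receive writer thread
 * (a description inferred from its use elsewhere in the receive path).
 */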
int zfs_recv_queue_length = SPA_MAXBLOCKSIZE;

static char *dmu_recv_tag = "dmu_recv_tag";
const char *recv_clone_name = "%recv";

static void byteswap_record(dmu_replay_record_t *drr);
typedef struct dmu_recv_begin_arg {
	const char *drba_origin;
	dmu_recv_cookie_t *drba_cookie;
	cred_t *drba_cred;
	dsl_crypto_params_t *drba_dcp;
	uint64_t drba_snapobj;
} dmu_recv_begin_arg_t;
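/*
 * The *_check/*_sync function pairs below are executed as DSL sync tasks:
 * the check callback validates the request and may fail gracefully, while
 * the sync callback runs in syncing context and must not fail.
 */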
static int
recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds,
    uint64_t fromguid, uint64_t featureflags)
{
	uint64_t val;
	uint64_t children;
	int error;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	boolean_t encrypted = ds->ds_dir->dd_crypto_obj != 0;
	boolean_t raw = (featureflags & DMU_BACKUP_FEATURE_RAW) != 0;
	boolean_t embed = (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) != 0;

	/* temporary clone name must not exist */
	error = zap_lookup(dp->dp_meta_objset,
	    dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name,
	    8, 1, &val);
	if (error != ENOENT)
		return (error == 0 ? EBUSY : error);

	/* new snapshot name must not exist */
	error = zap_lookup(dp->dp_meta_objset,
	    dsl_dataset_phys(ds)->ds_snapnames_zapobj,
	    drba->drba_cookie->drc_tosnap, 8, 1, &val);
	if (error != ENOENT)
		return (error == 0 ? EEXIST : error);

	/* must not have children if receiving a ZVOL */
	error = zap_count(dp->dp_meta_objset,
	    dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &children);
	if (error != 0)
		return (error);
	if (drba->drba_cookie->drc_drrb->drr_type != DMU_OST_ZFS &&
	    children > 0)
		return (SET_ERROR(ZFS_ERR_WRONG_PARENT));

	/*
	 * Check snapshot limit before receiving. We'll recheck again at the
	 * end, but might as well abort before receiving if we're already over
	 * the limit.
	 *
	 * Note that we do not check the file system limit with
	 * dsl_dir_fscount_check because the temporary %clones don't count
	 * against that limit.
	 */
	error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT,
	    NULL, drba->drba_cred);
	if (error != 0)
		return (error);

	if (fromguid != 0) {
		dsl_dataset_t *snap;
		uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;

		/* Can't perform a raw receive on top of a non-raw receive */
		if (!encrypted && raw)
			return (SET_ERROR(EINVAL));

		/* Encryption is incompatible with embedded data */
		if (encrypted && embed)
			return (SET_ERROR(EINVAL));

		/* Find snapshot in this dir that matches fromguid. */
		while (obj != 0) {
			error = dsl_dataset_hold_obj(dp, obj, FTAG,
			    &snap);
			if (error != 0)
				return (SET_ERROR(ENODEV));
			if (snap->ds_dir != ds->ds_dir) {
				dsl_dataset_rele(snap, FTAG);
				return (SET_ERROR(ENODEV));
			}
			if (dsl_dataset_phys(snap)->ds_guid == fromguid)
				break;
			obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
			dsl_dataset_rele(snap, FTAG);
		}
		if (obj == 0)
			return (SET_ERROR(ENODEV));

		if (drba->drba_cookie->drc_force) {
			drba->drba_snapobj = obj;
		} else {
			/*
			 * If we are not forcing, there must be no
			 * changes since fromsnap.
			 */
			if (dsl_dataset_modified_since_snap(ds, snap)) {
				dsl_dataset_rele(snap, FTAG);
				return (SET_ERROR(ETXTBSY));
			}
			drba->drba_snapobj = ds->ds_prev->ds_object;
		}

		dsl_dataset_rele(snap, FTAG);
	} else {
		/* if full, then must be forced */
		if (!drba->drba_cookie->drc_force)
			return (SET_ERROR(EEXIST));

		/*
		 * We don't support using zfs recv -F to blow away
		 * encrypted filesystems. This would require the
		 * dsl dir to point to the old encryption key and
		 * the new one at the same time during the receive.
		 */
		if ((!encrypted && raw) || encrypted)
			return (SET_ERROR(EINVAL));

		/*
		 * Perform the same encryption checks we would if
		 * we were creating a new dataset from scratch.
		 */
		if (!raw) {
			boolean_t will_encrypt;

			error = dmu_objset_create_crypt_check(
			    ds->ds_dir->dd_parent, drba->drba_dcp,
			    &will_encrypt);
			if (error != 0)
				return (error);

			if (will_encrypt && embed)
				return (SET_ERROR(EINVAL));
		}

		drba->drba_snapobj = 0;
	}

	return (0);
}
static int
dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
{
	dmu_recv_begin_arg_t *drba = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
	uint64_t fromguid = drrb->drr_fromguid;
	int flags = drrb->drr_flags;
	ds_hold_flags_t dsflags = 0;
	int error;
	uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
	dsl_dataset_t *ds;
	const char *tofs = drba->drba_cookie->drc_tofs;

	/* already checked */
	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
	ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING));

	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
	    DMU_COMPOUNDSTREAM ||
	    drrb->drr_type >= DMU_OST_NUMTYPES ||
	    ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL))
		return (SET_ERROR(EINVAL));

	/* Verify pool version supports SA if SA_SPILL feature set */
	if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
	    spa_version(dp->dp_spa) < SPA_VERSION_SA)
		return (SET_ERROR(ENOTSUP));

	if (drba->drba_cookie->drc_resumable &&
	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET))
		return (SET_ERROR(ENOTSUP));

	/*
	 * The receiving code doesn't know how to translate a WRITE_EMBEDDED
	 * record to a plain WRITE record, so the pool must have the
	 * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
	 * records. Same with WRITE_EMBEDDED records that use LZ4 compression.
	 */
	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
		return (SET_ERROR(ENOTSUP));
	if ((featureflags & DMU_BACKUP_FEATURE_LZ4) &&
	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
		return (SET_ERROR(ENOTSUP));

	/*
	 * The receiving code doesn't know how to translate large blocks
	 * to smaller ones, so the pool must have the LARGE_BLOCKS
	 * feature enabled if the stream has LARGE_BLOCKS. Same with
	 * large dnodes.
	 */
	if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
		return (SET_ERROR(ENOTSUP));
	if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) &&
	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE))
		return (SET_ERROR(ENOTSUP));

	if (featureflags & DMU_BACKUP_FEATURE_RAW) {
		/* raw receives require the encryption feature */
		if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION))
			return (SET_ERROR(ENOTSUP));

		/* embedded data is incompatible with encryption and raw recv */
		if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
			return (SET_ERROR(EINVAL));
	} else {
		dsflags |= DS_HOLD_FLAG_DECRYPT;
	}

	error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
	if (error == 0) {
		/* target fs already exists; recv into temp clone */

		/* Can't recv a clone into an existing fs */
		if (flags & DRR_FLAG_CLONE || drba->drba_origin) {
			dsl_dataset_rele_flags(ds, dsflags, FTAG);
			return (SET_ERROR(EINVAL));
		}

		error = recv_begin_check_existing_impl(drba, ds, fromguid,
		    featureflags);
		dsl_dataset_rele_flags(ds, dsflags, FTAG);
	} else if (error == ENOENT) {
		/* target fs does not exist; must be a full backup or clone */
		char buf[ZFS_MAX_DATASET_NAME_LEN];
		objset_t *os;

		/*
		 * If it's a non-clone incremental, we are missing the
		 * target fs, so fail the recv.
		 */
		if (fromguid != 0 && !(flags & DRR_FLAG_CLONE ||
		    drba->drba_origin))
			return (SET_ERROR(ENOENT));

		/*
		 * If we're receiving a full send as a clone, and it doesn't
		 * contain all the necessary free records and freeobject
		 * records, reject it.
		 */
		if (fromguid == 0 && drba->drba_origin &&
		    !(flags & DRR_FLAG_FREERECORDS))
			return (SET_ERROR(EINVAL));

		/* Open the parent of tofs */
		ASSERT3U(strlen(tofs), <, sizeof (buf));
		(void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1);
		error = dsl_dataset_hold_flags(dp, buf, dsflags, FTAG, &ds);
		if (error != 0)
			return (error);

		if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0 &&
		    drba->drba_origin == NULL) {
			boolean_t will_encrypt;

			/*
			 * Check that we aren't breaking any encryption rules
			 * and that we have all the parameters we need to
			 * create an encrypted dataset if necessary. If we are
			 * making an encrypted dataset the stream can't have
			 * embedded data.
			 */
			error = dmu_objset_create_crypt_check(ds->ds_dir,
			    drba->drba_dcp, &will_encrypt);
			if (error != 0) {
				dsl_dataset_rele_flags(ds, dsflags, FTAG);
				return (error);
			}

			if (will_encrypt &&
			    (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) {
				dsl_dataset_rele_flags(ds, dsflags, FTAG);
				return (SET_ERROR(EINVAL));
			}
		}

		/*
		 * Check filesystem and snapshot limits before receiving. We'll
		 * recheck snapshot limits again at the end (we create the
		 * filesystems and increment those counts during begin_sync).
		 */
		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
		    ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred);
		if (error != 0) {
			dsl_dataset_rele_flags(ds, dsflags, FTAG);
			return (error);
		}

		error = dsl_fs_ss_limit_check(ds->ds_dir, 1,
		    ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred);
		if (error != 0) {
			dsl_dataset_rele_flags(ds, dsflags, FTAG);
			return (error);
		}

		/* can't recv below anything but filesystems (eg. no ZVOLs) */
		error = dmu_objset_from_ds(ds, &os);
		if (error != 0) {
			dsl_dataset_rele_flags(ds, dsflags, FTAG);
			return (error);
		}
		if (dmu_objset_type(os) != DMU_OST_ZFS) {
			dsl_dataset_rele_flags(ds, dsflags, FTAG);
			return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
		}

		if (drba->drba_origin != NULL) {
			dsl_dataset_t *origin;

			error = dsl_dataset_hold_flags(dp, drba->drba_origin,
			    dsflags, FTAG, &origin);
			if (error != 0) {
				dsl_dataset_rele_flags(ds, dsflags, FTAG);
				return (error);
			}
			if (!origin->ds_is_snapshot) {
				dsl_dataset_rele_flags(origin, dsflags, FTAG);
				dsl_dataset_rele_flags(ds, dsflags, FTAG);
				return (SET_ERROR(EINVAL));
			}
			if (dsl_dataset_phys(origin)->ds_guid != fromguid &&
			    fromguid != 0) {
				dsl_dataset_rele_flags(origin, dsflags, FTAG);
				dsl_dataset_rele_flags(ds, dsflags, FTAG);
				return (SET_ERROR(ENODEV));
			}
			if (origin->ds_dir->dd_crypto_obj != 0 &&
			    (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) {
				dsl_dataset_rele_flags(origin, dsflags, FTAG);
				dsl_dataset_rele_flags(ds, dsflags, FTAG);
				return (SET_ERROR(EINVAL));
			}
			dsl_dataset_rele_flags(origin,
			    dsflags, FTAG);
		}

		dsl_dataset_rele_flags(ds, dsflags, FTAG);
		error = 0;
	}
	return (error);
}
static void
dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
{
	dmu_recv_begin_arg_t *drba = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	objset_t *mos = dp->dp_meta_objset;
	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
	const char *tofs = drba->drba_cookie->drc_tofs;
	uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
	dsl_dataset_t *ds, *newds;
	objset_t *os;
	uint64_t dsobj = 0;
	int error;
	ds_hold_flags_t dsflags = 0;
	uint64_t crflags = 0;
	dsl_crypto_params_t dummy_dcp = { 0 };
	dsl_crypto_params_t *dcp = drba->drba_dcp;

	if (drrb->drr_flags & DRR_FLAG_CI_DATA)
		crflags |= DS_FLAG_CI_DATASET;

	if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0)
		dsflags |= DS_HOLD_FLAG_DECRYPT;

	/*
	 * Raw, non-incremental recvs always use a dummy dcp with
	 * the raw cmd set. Raw incremental recvs do not use a dcp
	 * since the encryption parameters are already set in stone.
	 */
	if (dcp == NULL && drba->drba_snapobj == 0 &&
	    drba->drba_origin == NULL) {
		ASSERT3P(dcp, ==, NULL);
		dcp = &dummy_dcp;

		if (featureflags & DMU_BACKUP_FEATURE_RAW)
			dcp->cp_cmd = DCP_CMD_RAW_RECV;
	}

	error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
	if (error == 0) {
		/* create temporary clone */
		dsl_dataset_t *snap = NULL;

		if (drba->drba_snapobj != 0) {
			VERIFY0(dsl_dataset_hold_obj(dp,
			    drba->drba_snapobj, FTAG, &snap));
			ASSERT3P(dcp, ==, NULL);
		}

		dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name,
		    snap, crflags, drba->drba_cred, dcp, tx);
		if (drba->drba_snapobj != 0)
			dsl_dataset_rele(snap, FTAG);
		dsl_dataset_rele_flags(ds, dsflags, FTAG);
	} else {
		dsl_dir_t *dd;
		const char *tail;
		dsl_dataset_t *origin = NULL;

		VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail));

		if (drba->drba_origin != NULL) {
			VERIFY0(dsl_dataset_hold(dp, drba->drba_origin,
			    FTAG, &origin));
			ASSERT3P(dcp, ==, NULL);
		}

		/* Create new dataset. */
		dsobj = dsl_dataset_create_sync(dd, strrchr(tofs, '/') + 1,
		    origin, crflags, drba->drba_cred, dcp, tx);
		if (origin != NULL)
			dsl_dataset_rele(origin, FTAG);
		dsl_dir_rele(dd, FTAG);
		drba->drba_cookie->drc_newfs = B_TRUE;
	}

	VERIFY0(dsl_dataset_own_obj(dp, dsobj, dsflags, dmu_recv_tag, &newds));
	VERIFY0(dmu_objset_from_ds(newds, &os));

	if (drba->drba_cookie->drc_resumable) {
		uint64_t one = 1;
		uint64_t zero = 0;

		dsl_dataset_zapify(newds, tx);
		if (drrb->drr_fromguid != 0) {
			VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID,
			    8, 1, &drrb->drr_fromguid, tx));
		}
		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID,
		    8, 1, &drrb->drr_toguid, tx));
		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME,
		    1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx));
		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT,
		    8, 1, &one, tx));
		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET,
		    8, 1, &zero, tx));
		VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES,
		    8, 1, &zero, tx));
		if (featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) {
			VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_LARGEBLOCK,
			    8, 1, &one, tx));
		}
		if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) {
			VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK,
			    8, 1, &one, tx));
		}
		if (featureflags & DMU_BACKUP_FEATURE_COMPRESSED) {
			VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_COMPRESSOK,
			    8, 1, &one, tx));
		}
		if (featureflags & DMU_BACKUP_FEATURE_RAW) {
			VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_RAWOK,
			    8, 1, &one, tx));
		}
	}

	/*
	 * Usually the os->os_encrypted value is tied to the presence of a
	 * DSL Crypto Key object in the dd. However, that will not be received
	 * until dmu_recv_stream(), so we set the value manually for now.
	 */
	if (featureflags & DMU_BACKUP_FEATURE_RAW) {
		os->os_encrypted = B_TRUE;
		drba->drba_cookie->drc_raw = B_TRUE;
	}

	dmu_buf_will_dirty(newds->ds_dbuf, tx);
	dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;

	/*
	 * If we actually created a non-clone, we need to create the objset
	 * in our new dataset. If this is a raw send we postpone this until
	 * dmu_recv_stream() so that we can allocate the metadnode with the
	 * properties from the DRR_BEGIN payload.
	 */
	rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG);
	if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds)) &&
	    (featureflags & DMU_BACKUP_FEATURE_RAW) == 0) {
		(void) dmu_objset_create_impl(dp->dp_spa,
		    newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx);
	}
	rrw_exit(&newds->ds_bp_rwlock, FTAG);

	drba->drba_cookie->drc_ds = newds;

	spa_history_log_internal_ds(newds, "receive", tx, "");
}
static int
dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
{
	dmu_recv_begin_arg_t *drba = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
	int error;
	ds_hold_flags_t dsflags = 0;
	uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
	dsl_dataset_t *ds;
	const char *tofs = drba->drba_cookie->drc_tofs;

	/* already checked */
	ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
	ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING);

	if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
	    DMU_COMPOUNDSTREAM ||
	    drrb->drr_type >= DMU_OST_NUMTYPES)
		return (SET_ERROR(EINVAL));

	/* Verify pool version supports SA if SA_SPILL feature set */
	if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
	    spa_version(dp->dp_spa) < SPA_VERSION_SA)
		return (SET_ERROR(ENOTSUP));

	/*
	 * The receiving code doesn't know how to translate a WRITE_EMBEDDED
	 * record to a plain WRITE record, so the pool must have the
	 * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
	 * records. Same with WRITE_EMBEDDED records that use LZ4 compression.
	 */
	if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
		return (SET_ERROR(ENOTSUP));
	if ((featureflags & DMU_BACKUP_FEATURE_LZ4) &&
	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
		return (SET_ERROR(ENOTSUP));

	/*
	 * The receiving code doesn't know how to translate large blocks
	 * to smaller ones, so the pool must have the LARGE_BLOCKS
	 * feature enabled if the stream has LARGE_BLOCKS. Same with
	 * large dnodes.
	 */
	if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS))
		return (SET_ERROR(ENOTSUP));
	if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) &&
	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_DNODE))
		return (SET_ERROR(ENOTSUP));

	/* 6 extra bytes for /%recv */
	char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];
	(void) snprintf(recvname, sizeof (recvname), "%s/%s",
	    tofs, recv_clone_name);

	if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0)
		dsflags |= DS_HOLD_FLAG_DECRYPT;

	if (dsl_dataset_hold_flags(dp, recvname, dsflags, FTAG, &ds) != 0) {
		/* %recv does not exist; continue in tofs */
		error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds);
		if (error != 0)
			return (error);
	}

	/* check that ds is marked inconsistent */
	if (!DS_IS_INCONSISTENT(ds)) {
		dsl_dataset_rele_flags(ds, dsflags, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/* check that there is resuming data, and that the toguid matches */
	if (!dsl_dataset_is_zapified(ds)) {
		dsl_dataset_rele_flags(ds, dsflags, FTAG);
		return (SET_ERROR(EINVAL));
	}
	uint64_t val;
	error = zap_lookup(dp->dp_meta_objset, ds->ds_object,
	    DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val);
	if (error != 0 || drrb->drr_toguid != val) {
		dsl_dataset_rele_flags(ds, dsflags, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Check if the receive is still running. If so, it will be owned.
	 * Note that nothing else can own the dataset (e.g. after the receive
	 * fails) because it will be marked inconsistent.
	 */
	if (dsl_dataset_has_owner(ds)) {
		dsl_dataset_rele_flags(ds, dsflags, FTAG);
		return (SET_ERROR(EBUSY));
	}

	/* There should not be any snapshots of this fs yet. */
	if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) {
		dsl_dataset_rele_flags(ds, dsflags, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Note: resume point will be checked when we process the first WRITE
	 * record.
	 */

	/* check that the origin matches */
	val = 0;
	(void) zap_lookup(dp->dp_meta_objset, ds->ds_object,
	    DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val);
	if (drrb->drr_fromguid != val) {
		dsl_dataset_rele_flags(ds, dsflags, FTAG);
		return (SET_ERROR(EINVAL));
	}

	dsl_dataset_rele_flags(ds, dsflags, FTAG);
	return (0);
}
static void
dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx)
{
	dmu_recv_begin_arg_t *drba = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	const char *tofs = drba->drba_cookie->drc_tofs;
	struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
	uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
	dsl_dataset_t *ds;
	objset_t *os;
	ds_hold_flags_t dsflags = 0;
	uint64_t dsobj;

	/* 6 extra bytes for /%recv */
	char recvname[ZFS_MAX_DATASET_NAME_LEN + 6];

	(void) snprintf(recvname, sizeof (recvname), "%s/%s",
	    tofs, recv_clone_name);

	if (featureflags & DMU_BACKUP_FEATURE_RAW) {
		drba->drba_cookie->drc_raw = B_TRUE;
	} else {
		dsflags |= DS_HOLD_FLAG_DECRYPT;
	}

	if (dsl_dataset_hold_flags(dp, recvname, dsflags, FTAG, &ds) != 0) {
		/* %recv does not exist; continue in tofs */
		VERIFY0(dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds));
		drba->drba_cookie->drc_newfs = B_TRUE;
	}

	/* clear the inconsistent flag so that we can own it */
	ASSERT(DS_IS_INCONSISTENT(ds));
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
	dsobj = ds->ds_object;
	dsl_dataset_rele_flags(ds, dsflags, FTAG);

	VERIFY0(dsl_dataset_own_obj(dp, dsobj, dsflags, dmu_recv_tag, &ds));
	VERIFY0(dmu_objset_from_ds(ds, &os));

	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT;

	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
	ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)) ||
	    drba->drba_cookie->drc_raw);
	rrw_exit(&ds->ds_bp_rwlock, FTAG);

	drba->drba_cookie->drc_ds = ds;

	spa_history_log_internal_ds(ds, "resume receive", tx, "");
}
/*
 * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
 * succeeds; otherwise we will leak the holds on the datasets.
 */
int
dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
    boolean_t force, boolean_t resumable, nvlist_t *localprops,
    nvlist_t *hidden_args, char *origin, dmu_recv_cookie_t *drc)
{
	dmu_recv_begin_arg_t drba = { 0 };
	int err;

	bzero(drc, sizeof (dmu_recv_cookie_t));
	drc->drc_drr_begin = drr_begin;
	drc->drc_drrb = &drr_begin->drr_u.drr_begin;
	drc->drc_tosnap = tosnap;
	drc->drc_tofs = tofs;
	drc->drc_force = force;
	drc->drc_resumable = resumable;
	drc->drc_cred = CRED();
	drc->drc_clone = (origin != NULL);

	if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
		drc->drc_byteswap = B_TRUE;
		(void) fletcher_4_incremental_byteswap(drr_begin,
		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
		byteswap_record(drr_begin);
	} else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) {
		(void) fletcher_4_incremental_native(drr_begin,
		    sizeof (dmu_replay_record_t), &drc->drc_cksum);
	} else {
		return (SET_ERROR(EINVAL));
	}

	drba.drba_origin = origin;
	drba.drba_cookie = drc;
	drba.drba_cred = CRED();

	if (DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) &
	    DMU_BACKUP_FEATURE_RESUMING) {
		return (dsl_sync_task(tofs,
		    dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync,
		    &drba, 5, ZFS_SPACE_CHECK_NORMAL));
	}

	/*
	 * For non-raw, non-incremental, non-resuming receives the
	 * user can specify encryption parameters on the command line
	 * with "zfs recv -o". For these receives we create a dcp and
	 * pass it to the sync task. Creating the dcp will implicitly
	 * remove the encryption params from the localprops nvlist,
	 * which avoids errors when trying to set these normally
	 * read-only properties. Any other kind of receive that
	 * attempts to set these properties will fail as a result.
	 */
	if ((DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) &
	    DMU_BACKUP_FEATURE_RAW) == 0 &&
	    origin == NULL && drc->drc_drrb->drr_fromguid == 0) {
		err = dsl_crypto_params_create_nvlist(DCP_CMD_NONE,
		    localprops, hidden_args, &drba.drba_dcp);
		if (err != 0)
			return (err);
	}

	err = dsl_sync_task(tofs,
	    dmu_recv_begin_check, dmu_recv_begin_sync,
	    &drba, 5, ZFS_SPACE_CHECK_NORMAL);
	dsl_crypto_params_free(drba.drba_dcp, !!err);

	return (err);
}
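/*
 * As the NB comment above says, callers (e.g. the zfs receive ioctl path)
 * are expected to follow a successful dmu_recv_begin() with
 * dmu_recv_stream() to consume the record stream, and then dmu_recv_end().
 */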
struct receive_record_arg {
	dmu_replay_record_t header;
	void *payload; /* Pointer to a buffer containing the payload */
	/*
	 * If the record is a write, pointer to the arc_buf_t containing the
	 * payload.
	 */
	arc_buf_t *arc_buf;
	int payload_size;
	uint64_t bytes_read; /* bytes read from stream when record created */
	boolean_t eos_marker; /* Marks the end of the stream */
	bqueue_node_t node;
};

struct receive_writer_arg {
	objset_t *os;
	boolean_t byteswap;
	bqueue_t q;

	/*
	 * These three args are used to signal to the main thread that we're
	 * done.
	 */
	kmutex_t mutex;
	kcondvar_t cv;
	boolean_t done;

	int err;
	/* A map from guid to dataset to help handle dedup'd streams. */
	avl_tree_t *guid_to_ds_map;
	boolean_t resumable;
	boolean_t raw;
	uint64_t last_object;
	uint64_t last_offset;
	uint64_t max_object; /* highest object ID referenced in stream */
	uint64_t bytes_read; /* bytes read when current record created */

	/* Encryption parameters for the last received DRR_OBJECT_RANGE */
	boolean_t or_crypt_params_present;
	uint64_t or_firstobj;
	uint64_t or_numslots;
	uint8_t or_salt[ZIO_DATA_SALT_LEN];
	uint8_t or_iv[ZIO_DATA_IV_LEN];
	uint8_t or_mac[ZIO_DATA_MAC_LEN];
	boolean_t or_byteorder;
};

struct objlist {
	list_t list; /* List of struct receive_objnode. */
	/*
	 * Last object looked up. Used to assert that objects are being looked
	 * up in ascending order.
	 */
	uint64_t last_lookup;
};

struct receive_objnode {
	list_node_t node;
	uint64_t object;
};

struct receive_arg {
	objset_t *os;
	vnode_t *vp; /* The vnode to read the stream from */
	uint64_t voff; /* The current offset in the stream */
	uint64_t bytes_read;
	/*
	 * A record that has had its payload read in, but hasn't yet been handed
	 * off to the worker thread.
	 */
	struct receive_record_arg *rrd;
	/* A record that has had its header read in, but not its payload. */
	struct receive_record_arg *next_rrd;
	zio_cksum_t cksum;
	zio_cksum_t prev_cksum;
	int err;
	boolean_t byteswap;
	boolean_t raw;
	uint64_t featureflags;
	/* Sorted list of objects not to issue prefetches for. */
	struct objlist ignore_objlist;
};
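/*
 * The structures above split the receive path in two: a reader
 * (receive_arg) that pulls records off the stream and a writer thread
 * (receive_writer_arg) that applies them, with receive_record_arg entries
 * passed between the two on the bqueue.
 */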
typedef struct guid_map_entry {
	uint64_t guid;
	boolean_t raw;
	dsl_dataset_t *gme_ds;
	avl_node_t avlnode;
} guid_map_entry_t;

static int
guid_compare(const void *arg1, const void *arg2)
{
	const guid_map_entry_t *gmep1 = (const guid_map_entry_t *)arg1;
	const guid_map_entry_t *gmep2 = (const guid_map_entry_t *)arg2;

	return (AVL_CMP(gmep1->guid, gmep2->guid));
}

static void
free_guid_map_onexit(void *arg)
{
	avl_tree_t *ca = arg;
	void *cookie = NULL;
	guid_map_entry_t *gmep;

	while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
		ds_hold_flags_t dsflags = DS_HOLD_FLAG_DECRYPT;

		if (gmep->raw) {
			gmep->gme_ds->ds_objset->os_raw_receive = B_FALSE;
			dsflags &= ~DS_HOLD_FLAG_DECRYPT;
		}

		dsl_dataset_disown(gmep->gme_ds, dsflags, gmep);
		kmem_free(gmep, sizeof (guid_map_entry_t));
	}
	avl_destroy(ca);
	kmem_free(ca, sizeof (avl_tree_t));
}
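/*
 * As its name suggests, free_guid_map_onexit() is registered as an onexit
 * callback, so the guid map built up for a dedup'd receive is torn down
 * (datasets disowned, memory freed) when the associated cleanup fd closes.
 */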
static int
receive_read(struct receive_arg *ra, int len, void *buf)
{
	int done = 0;

	/*
	 * The code doesn't rely on this (lengths being multiples of 8). See
	 * comment in dump_bytes.
	 */
	ASSERT(len % 8 == 0 ||
	    (ra->featureflags & DMU_BACKUP_FEATURE_RAW) != 0);

	while (done < len) {
		ssize_t resid;

		ra->err = vn_rdwr(UIO_READ, ra->vp,
		    (char *)buf + done, len - done,
		    ra->voff, UIO_SYSSPACE, FAPPEND,
		    RLIM64_INFINITY, CRED(), &resid);
		if (resid == len - done) {
			/*
			 * Note: ECKSUM indicates that the receive
			 * was interrupted and can potentially be resumed.
			 */
			ra->err = SET_ERROR(ECKSUM);
		}
		ra->voff += len - done - resid;
		done = len - resid;
		if (ra->err != 0)
			return (ra->err);
	}

	ra->bytes_read += len;

	ASSERT3U(done, ==, len);
	return (0);
}
static void
byteswap_record(dmu_replay_record_t *drr)
{
#define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
#define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
	drr->drr_type = BSWAP_32(drr->drr_type);
	drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);

	switch (drr->drr_type) {
	case DRR_BEGIN:
		DO64(drr_begin.drr_magic);
		DO64(drr_begin.drr_versioninfo);
		DO64(drr_begin.drr_creation_time);
		DO32(drr_begin.drr_type);
		DO32(drr_begin.drr_flags);
		DO64(drr_begin.drr_toguid);
		DO64(drr_begin.drr_fromguid);
		break;
	case DRR_OBJECT:
		DO64(drr_object.drr_object);
		DO32(drr_object.drr_type);
		DO32(drr_object.drr_bonustype);
		DO32(drr_object.drr_blksz);
		DO32(drr_object.drr_bonuslen);
		DO32(drr_object.drr_raw_bonuslen);
		DO64(drr_object.drr_toguid);
		DO64(drr_object.drr_maxblkid);
		break;
	case DRR_FREEOBJECTS:
		DO64(drr_freeobjects.drr_firstobj);
		DO64(drr_freeobjects.drr_numobjs);
		DO64(drr_freeobjects.drr_toguid);
		break;
	case DRR_WRITE:
		DO64(drr_write.drr_object);
		DO32(drr_write.drr_type);
		DO64(drr_write.drr_offset);
		DO64(drr_write.drr_logical_size);
		DO64(drr_write.drr_toguid);
		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum);
		DO64(drr_write.drr_key.ddk_prop);
		DO64(drr_write.drr_compressed_size);
		break;
	case DRR_WRITE_BYREF:
		DO64(drr_write_byref.drr_object);
		DO64(drr_write_byref.drr_offset);
		DO64(drr_write_byref.drr_length);
		DO64(drr_write_byref.drr_toguid);
		DO64(drr_write_byref.drr_refguid);
		DO64(drr_write_byref.drr_refobject);
		DO64(drr_write_byref.drr_refoffset);
		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref.
		    drr_key.ddk_cksum);
		DO64(drr_write_byref.drr_key.ddk_prop);
		break;
	case DRR_WRITE_EMBEDDED:
		DO64(drr_write_embedded.drr_object);
		DO64(drr_write_embedded.drr_offset);
		DO64(drr_write_embedded.drr_length);
		DO64(drr_write_embedded.drr_toguid);
		DO32(drr_write_embedded.drr_lsize);
		DO32(drr_write_embedded.drr_psize);
		break;
	case DRR_FREE:
		DO64(drr_free.drr_object);
		DO64(drr_free.drr_offset);
		DO64(drr_free.drr_length);
		DO64(drr_free.drr_toguid);
		break;
	case DRR_SPILL:
		DO64(drr_spill.drr_object);
		DO64(drr_spill.drr_length);
		DO64(drr_spill.drr_toguid);
		DO64(drr_spill.drr_compressed_size);
		DO32(drr_spill.drr_type);
		break;
	case DRR_OBJECT_RANGE:
		DO64(drr_object_range.drr_firstobj);
		DO64(drr_object_range.drr_numslots);
		DO64(drr_object_range.drr_toguid);
		break;
	case DRR_END:
		DO64(drr_end.drr_toguid);
		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum);
		break;
	default:
		break;
	}

	if (drr->drr_type != DRR_BEGIN) {
		ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum);
	}

#undef DO64
#undef DO32
}
static inline uint8_t
deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
{
	if (bonus_type == DMU_OT_SA) {
		return (1);
	} else {
		return (1 +
		    ((DN_OLD_MAX_BONUSLEN -
		    MIN(DN_OLD_MAX_BONUSLEN, bonus_size)) >> SPA_BLKPTRSHIFT));
	}
}
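/*
 * Worked example: with DN_OLD_MAX_BONUSLEN of 320 bytes and 128-byte block
 * pointers (SPA_BLKPTRSHIFT of 7), a 64-byte non-SA bonus yields
 * 1 + ((320 - 64) >> 7) = 3 block pointers, while a full 320-byte bonus
 * leaves room for only 1.
 */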
static void
save_resume_state(struct receive_writer_arg *rwa,
    uint64_t object, uint64_t offset, dmu_tx_t *tx)
{
	int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;

	if (!rwa->resumable)
		return;

	/*
	 * We use ds_resume_bytes[] != 0 to indicate that we need to
	 * update this on disk, so it must not be 0.
	 */
	ASSERT(rwa->bytes_read != 0);

	/*
	 * We only resume from write records, which have a valid
	 * (non-meta-dnode) object number.
	 */
	ASSERT(object != 0);

	/*
	 * For resuming to work correctly, we must receive records in order,
	 * sorted by object,offset. This is checked by the callers, but
	 * assert it here for good measure.
	 */
	ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]);
	ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] ||
	    offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]);
	ASSERT3U(rwa->bytes_read, >=,
	    rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]);

	rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object;
	rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset;
	rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read;
}
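/*
 * The TXG_MASK indexing above keeps one (object, offset, bytes) resume
 * triple per open txg, so the values that reach disk always describe the
 * txg actually being synced.
 */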
static int
receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
    void *data)
{
	dmu_object_info_t doi;
	dmu_tx_t *tx;
	uint64_t object;
	int err;
	uint8_t dn_slots = drro->drr_dn_slots != 0 ?
	    drro->drr_dn_slots : DNODE_MIN_SLOTS;

	if (drro->drr_type == DMU_OT_NONE ||
	    !DMU_OT_IS_VALID(drro->drr_type) ||
	    !DMU_OT_IS_VALID(drro->drr_bonustype) ||
	    drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
	    drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) ||
	    drro->drr_bonuslen >
	    DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os))) ||
	    dn_slots >
	    (spa_maxdnodesize(dmu_objset_spa(rwa->os)) >> DNODE_SHIFT)) {
		return (SET_ERROR(EINVAL));
	}

	if (rwa->raw) {
		/*
		 * We should have received a DRR_OBJECT_RANGE record
		 * containing this block and stored it in rwa.
		 */
		if (drro->drr_object < rwa->or_firstobj ||
		    drro->drr_object >= rwa->or_firstobj + rwa->or_numslots ||
		    drro->drr_raw_bonuslen < drro->drr_bonuslen ||
		    drro->drr_indblkshift > SPA_MAXBLOCKSHIFT ||
		    drro->drr_nlevels > DN_MAX_LEVELS ||
		    drro->drr_nblkptr > DN_MAX_NBLKPTR ||
		    DN_SLOTS_TO_BONUSLEN(dn_slots) <
		    drro->drr_raw_bonuslen)
			return (SET_ERROR(EINVAL));
	} else {
		if (drro->drr_flags != 0 || drro->drr_raw_bonuslen != 0 ||
		    drro->drr_indblkshift != 0 || drro->drr_nlevels != 0 ||
		    drro->drr_nblkptr != 0)
			return (SET_ERROR(EINVAL));
	}

	err = dmu_object_info(rwa->os, drro->drr_object, &doi);
	if (err != 0 && err != ENOENT && err != EEXIST)
		return (SET_ERROR(EINVAL));

	if (drro->drr_object > rwa->max_object)
		rwa->max_object = drro->drr_object;

	/*
	 * If we are losing blkptrs or changing the block size this must
	 * be a new file instance. We must clear out the previous file
	 * contents before we can change this type of metadata in the dnode.
	 * Raw receives will also check that the indirect structure of the
	 * dnode hasn't changed.
	 */
	if (err == 0) {
		uint32_t indblksz = drro->drr_indblkshift ?
		    1ULL << drro->drr_indblkshift : 0;
		int nblkptr = deduce_nblkptr(drro->drr_bonustype,
		    drro->drr_bonuslen);

		object = drro->drr_object;

		/* nblkptr will be bounded by the bonus size and type */
		if (rwa->raw && nblkptr != drro->drr_nblkptr)
			return (SET_ERROR(EINVAL));

		if (drro->drr_blksz != doi.doi_data_block_size ||
		    nblkptr < doi.doi_nblkptr ||
		    dn_slots != doi.doi_dnodesize >> DNODE_SHIFT ||
		    (rwa->raw &&
		    (indblksz != doi.doi_metadata_block_size ||
		    drro->drr_nlevels < doi.doi_indirection))) {
			err = dmu_free_long_range(rwa->os,
			    drro->drr_object, 0, DMU_OBJECT_END);
			if (err != 0)
				return (SET_ERROR(EINVAL));
		}

		/*
		 * The dmu does not currently support decreasing nlevels
		 * on an object. For non-raw sends, this does not matter
		 * and the new object can just use the previous one's nlevels.
		 * For raw sends, however, the structure of the received dnode
		 * (including nlevels) must match that of the send side.
		 * Therefore, instead of using dmu_object_reclaim(), we must
		 * free the object completely and call dmu_object_claim_dnsize()
		 * instead.
		 */
		if ((rwa->raw && drro->drr_nlevels < doi.doi_indirection) ||
		    dn_slots != doi.doi_dnodesize >> DNODE_SHIFT) {
			err = dmu_free_long_object(rwa->os, drro->drr_object);
			if (err != 0)
				return (SET_ERROR(EINVAL));

			txg_wait_synced(dmu_objset_pool(rwa->os), 0);
			object = DMU_NEW_OBJECT;
		}
	} else if (err == EEXIST) {
		/*
		 * The object requested is currently an interior slot of a
		 * multi-slot dnode. This will be resolved when the next txg
		 * is synced out, since the send stream will have told us
		 * to free this slot when we freed the associated dnode
		 * earlier in the stream.
		 */
		txg_wait_synced(dmu_objset_pool(rwa->os), 0);
		object = drro->drr_object;
	} else {
		/* object is free and we are about to allocate a new one */
		object = DMU_NEW_OBJECT;
	}

	/*
	 * If this is a multi-slot dnode there is a chance that this
	 * object will expand into a slot that is already used by
	 * another object from the previous snapshot. We must free
	 * these objects before we attempt to allocate the new dnode.
	 */
	if (dn_slots > 1) {
		boolean_t need_sync = B_FALSE;

		for (uint64_t slot = drro->drr_object + 1;
		    slot < drro->drr_object + dn_slots;
		    slot++) {
			dmu_object_info_t slot_doi;

			err = dmu_object_info(rwa->os, slot, &slot_doi);
			if (err == ENOENT || err == EEXIST)
				continue;
			else if (err != 0)
				return (err);

			err = dmu_free_long_object(rwa->os, slot);
			if (err != 0)
				return (err);

			need_sync = B_TRUE;
		}

		if (need_sync)
			txg_wait_synced(dmu_objset_pool(rwa->os), 0);
	}

	tx = dmu_tx_create(rwa->os);
	dmu_tx_hold_bonus(tx, object);
	dmu_tx_hold_write(tx, object, 0, 0);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}

	if (object == DMU_NEW_OBJECT) {
		/* currently free, want to be allocated */
		err = dmu_object_claim_dnsize(rwa->os, drro->drr_object,
		    drro->drr_type, drro->drr_blksz,
		    drro->drr_bonustype, drro->drr_bonuslen,
		    dn_slots << DNODE_SHIFT, tx);
	} else if (drro->drr_type != doi.doi_type ||
	    drro->drr_blksz != doi.doi_data_block_size ||
	    drro->drr_bonustype != doi.doi_bonus_type ||
	    drro->drr_bonuslen != doi.doi_bonus_size) {
		/* currently allocated, but with different properties */
		err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object,
		    drro->drr_type, drro->drr_blksz,
		    drro->drr_bonustype, drro->drr_bonuslen,
		    dn_slots << DNODE_SHIFT, tx);
	}
	if (err != 0) {
		dmu_tx_commit(tx);
		return (SET_ERROR(EINVAL));
	}

	if (rwa->or_crypt_params_present) {
		/*
		 * Set the crypt params for the buffer associated with this
		 * range of dnodes. This causes the blkptr_t to have the
		 * same crypt params (byteorder, salt, iv, mac) as on the
		 * sending side.
		 *
		 * Since we are committing this tx now, it is possible for
		 * the dnode block to end up on-disk with the incorrect MAC,
		 * if subsequent objects in this block are received in a
		 * different txg. However, since the dataset is marked as
		 * inconsistent, no code paths will do a non-raw read (or
		 * decrypt the block / verify the MAC). The receive code and
		 * scrub code can safely do raw reads and verify the
		 * checksum. They don't need to verify the MAC.
		 */
		dmu_buf_t *db = NULL;
		uint64_t offset = rwa->or_firstobj * DNODE_MIN_SIZE;

		err = dmu_buf_hold_by_dnode(DMU_META_DNODE(rwa->os),
		    offset, FTAG, &db, DMU_READ_PREFETCH | DMU_READ_NO_DECRYPT);
		if (err != 0) {
			dmu_tx_commit(tx);
			return (SET_ERROR(EINVAL));
		}

		dmu_buf_set_crypt_params(db, rwa->or_byteorder,
		    rwa->or_salt, rwa->or_iv, rwa->or_mac, tx);

		dmu_buf_rele(db, FTAG);

		rwa->or_crypt_params_present = B_FALSE;
	}

	dmu_object_set_checksum(rwa->os, drro->drr_object,
	    drro->drr_checksumtype, tx);
	dmu_object_set_compress(rwa->os, drro->drr_object,
	    drro->drr_compress, tx);

	/* handle more restrictive dnode structuring for raw recvs */
	if (rwa->raw) {
		/*
		 * Set the indirect block shift and nlevels. This will not fail
		 * because we ensured all of the blocks were free earlier if
		 * this is a new object.
		 */
		VERIFY0(dmu_object_set_blocksize(rwa->os, drro->drr_object,
		    drro->drr_blksz, drro->drr_indblkshift, tx));
		VERIFY0(dmu_object_set_nlevels(rwa->os, drro->drr_object,
		    drro->drr_nlevels, tx));
		VERIFY0(dmu_object_set_maxblkid(rwa->os, drro->drr_object,
		    drro->drr_maxblkid, tx));
	}

	if (data != NULL) {
		dmu_buf_t *db;
		dnode_t *dn;
		uint32_t flags = DMU_READ_NO_PREFETCH;

		if (rwa->raw)
			flags |= DMU_READ_NO_DECRYPT;

		VERIFY0(dnode_hold(rwa->os, drro->drr_object, FTAG, &dn));
		VERIFY0(dmu_bonus_hold_by_dnode(dn, FTAG, &db, flags));

		dmu_buf_will_dirty(db, tx);

		ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
		bcopy(data, db->db_data, DRR_OBJECT_PAYLOAD_SIZE(drro));

		/*
		 * Raw bonus buffers have their byteorder determined by the
		 * DRR_OBJECT_RANGE record.
		 */
		if (rwa->byteswap && !rwa->raw) {
			dmu_object_byteswap_t byteswap =
			    DMU_OT_BYTESWAP(drro->drr_bonustype);
			dmu_ot_byteswap[byteswap].ob_func(db->db_data,
			    DRR_OBJECT_PAYLOAD_SIZE(drro));
		}

		dmu_buf_rele(db, FTAG);
		dnode_rele(dn, FTAG);
	}
	dmu_tx_commit(tx);

	return (0);
}
static int
receive_freeobjects(struct receive_writer_arg *rwa,
    struct drr_freeobjects *drrfo)
{
	uint64_t obj;
	int next_err = 0;

	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
		return (SET_ERROR(EINVAL));

	for (obj = drrfo->drr_firstobj == 0 ? 1 : drrfo->drr_firstobj;
	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs && next_err == 0;
	    next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) {
		dmu_object_info_t doi;
		int err;

		err = dmu_object_info(rwa->os, obj, &doi);
		if (err == ENOENT)
			continue;
		else if (err != 0)
			return (err);

		err = dmu_free_long_object(rwa->os, obj);
		if (err != 0)
			return (err);

		if (obj > rwa->max_object)
			rwa->max_object = obj;
	}
	if (next_err != ESRCH)
		return (next_err);
	return (0);
}
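/*
 * In receive_freeobjects() above, ESRCH from dmu_object_next() means no
 * objects remain past obj; it is the normal loop-exit condition rather
 * than an error.
 */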
static int
receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw,
    arc_buf_t *abuf)
{
	int err;
	dmu_tx_t *tx;
	dnode_t *dn;

	if (drrw->drr_offset + drrw->drr_logical_size < drrw->drr_offset ||
	    !DMU_OT_IS_VALID(drrw->drr_type))
		return (SET_ERROR(EINVAL));

	/*
	 * For resuming to work, records must be in increasing order
	 * by (object, offset).
	 */
	if (drrw->drr_object < rwa->last_object ||
	    (drrw->drr_object == rwa->last_object &&
	    drrw->drr_offset < rwa->last_offset)) {
		return (SET_ERROR(EINVAL));
	}
	rwa->last_object = drrw->drr_object;
	rwa->last_offset = drrw->drr_offset;

	if (rwa->last_object > rwa->max_object)
		rwa->max_object = rwa->last_object;

	if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0)
		return (SET_ERROR(EINVAL));

	tx = dmu_tx_create(rwa->os);
	dmu_tx_hold_write(tx, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_logical_size);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}

	if (rwa->byteswap && !arc_is_encrypted(abuf) &&
	    arc_get_compression(abuf) == ZIO_COMPRESS_OFF) {
		dmu_object_byteswap_t byteswap =
		    DMU_OT_BYTESWAP(drrw->drr_type);
		dmu_ot_byteswap[byteswap].ob_func(abuf->b_data,
		    DRR_WRITE_PAYLOAD_SIZE(drrw));
	}

	VERIFY0(dnode_hold(rwa->os, drrw->drr_object, FTAG, &dn));
	err = dmu_assign_arcbuf_by_dnode(dn, drrw->drr_offset, abuf, tx);
	if (err != 0) {
		dnode_rele(dn, FTAG);
		dmu_tx_commit(tx);
		return (err);
	}
	dnode_rele(dn, FTAG);

	/*
	 * Note: If the receive fails, we want the resume stream to start
	 * with the same record that we last successfully received (as opposed
	 * to the next record), so that we can verify that we are
	 * resuming from the correct location.
	 */
	save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx);
	dmu_tx_commit(tx);

	return (0);
}
/*
 * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed
 * streams to refer to a copy of the data that is already on the
 * system because it came in earlier in the stream. This function
 * finds the earlier copy of the data, and uses that copy instead of
 * data from the stream to fulfill this write.
 */
static int
receive_write_byref(struct receive_writer_arg *rwa,
    struct drr_write_byref *drrwbr)
{
	dmu_tx_t *tx;
	int err;
	guid_map_entry_t gmesrch;
	guid_map_entry_t *gmep;
	avl_index_t where;
	objset_t *ref_os = NULL;
	int flags = DMU_READ_PREFETCH;
	dmu_buf_t *dbp;

	if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
		return (SET_ERROR(EINVAL));

	/*
	 * If the GUID of the referenced dataset is different from the
	 * GUID of the target dataset, find the referenced dataset.
	 */
	if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
		gmesrch.guid = drrwbr->drr_refguid;
		if ((gmep = avl_find(rwa->guid_to_ds_map, &gmesrch,
		    &where)) == NULL) {
			return (SET_ERROR(EINVAL));
		}
		if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
			return (SET_ERROR(EINVAL));
	} else {
		ref_os = rwa->os;
	}

	if (drrwbr->drr_object > rwa->max_object)
		rwa->max_object = drrwbr->drr_object;

	if (rwa->raw)
		flags |= DMU_READ_NO_DECRYPT;

	/* may return either a regular db or an encrypted one */
	err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
	    drrwbr->drr_refoffset, FTAG, &dbp, flags);
	if (err != 0)
		return (err);

	tx = dmu_tx_create(rwa->os);

	dmu_tx_hold_write(tx, drrwbr->drr_object,
	    drrwbr->drr_offset, drrwbr->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}

	if (rwa->raw) {
		dmu_copy_from_buf(rwa->os, drrwbr->drr_object,
		    drrwbr->drr_offset, dbp, tx);
	} else {
		dmu_write(rwa->os, drrwbr->drr_object,
		    drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
	}
	dmu_buf_rele(dbp, FTAG);

	/* See comment in restore_write. */
	save_resume_state(rwa, drrwbr->drr_object, drrwbr->drr_offset, tx);
	dmu_tx_commit(tx);
	return (0);
}
static int
receive_write_embedded(struct receive_writer_arg *rwa,
    struct drr_write_embedded *drrwe, void *data)
{
	dmu_tx_t *tx;
	int err;

	if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset)
		return (SET_ERROR(EINVAL));

	if (drrwe->drr_psize > BPE_PAYLOAD_SIZE)
		return (SET_ERROR(EINVAL));

	if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES)
		return (SET_ERROR(EINVAL));
	if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
		return (SET_ERROR(EINVAL));
	if (rwa->raw)
		return (SET_ERROR(EINVAL));

	if (drrwe->drr_object > rwa->max_object)
		rwa->max_object = drrwe->drr_object;

	tx = dmu_tx_create(rwa->os);

	dmu_tx_hold_write(tx, drrwe->drr_object,
	    drrwe->drr_offset, drrwe->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		return (err);
	}

	dmu_write_embedded(rwa->os, drrwe->drr_object,
	    drrwe->drr_offset, data, drrwe->drr_etype,
	    drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize,
	    rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx);

	/* See comment in restore_write. */
	save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx);
	dmu_tx_commit(tx);
	return (0);
}
static int
receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
    arc_buf_t *abuf)
{
	dmu_tx_t *tx;
	dmu_buf_t *db, *db_spill;
	int err;
	uint32_t flags = 0;

	if (drrs->drr_length < SPA_MINBLOCKSIZE ||
	    drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os)))
		return (SET_ERROR(EINVAL));

	if (rwa->raw) {
		if (!DMU_OT_IS_VALID(drrs->drr_type) ||
		    drrs->drr_compressiontype >= ZIO_COMPRESS_FUNCTIONS ||
		    drrs->drr_compressed_size == 0)
			return (SET_ERROR(EINVAL));

		flags |= DMU_READ_NO_DECRYPT;
	}

	if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0)
		return (SET_ERROR(EINVAL));

	if (drrs->drr_object > rwa->max_object)
		rwa->max_object = drrs->drr_object;

	VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db));
	if ((err = dmu_spill_hold_by_bonus(db, DMU_READ_NO_DECRYPT, FTAG,
	    &db_spill)) != 0) {
		dmu_buf_rele(db, FTAG);
		return (err);
	}

	tx = dmu_tx_create(rwa->os);

	dmu_tx_hold_spill(tx, db->db_object);

	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_buf_rele(db, FTAG);
		dmu_buf_rele(db_spill, FTAG);
		dmu_tx_abort(tx);
		return (err);
	}

	if (db_spill->db_size < drrs->drr_length)
		VERIFY(0 == dbuf_spill_set_blksz(db_spill,
		    drrs->drr_length, tx));

	if (rwa->byteswap && !arc_is_encrypted(abuf) &&
	    arc_get_compression(abuf) == ZIO_COMPRESS_OFF) {
		dmu_object_byteswap_t byteswap =
		    DMU_OT_BYTESWAP(drrs->drr_type);
		dmu_ot_byteswap[byteswap].ob_func(abuf->b_data,
		    DRR_SPILL_PAYLOAD_SIZE(drrs));
	}

	dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx);

	dmu_buf_rele(db, FTAG);
	dmu_buf_rele(db_spill, FTAG);

	dmu_tx_commit(tx);
	return (0);
}
static int
receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf)
{
	int err;

	if (drrf->drr_length != DMU_OBJECT_END &&
	    drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
		return (SET_ERROR(EINVAL));

	if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0)
		return (SET_ERROR(EINVAL));

	if (drrf->drr_object > rwa->max_object)
		rwa->max_object = drrf->drr_object;

	err = dmu_free_long_range(rwa->os, drrf->drr_object,
	    drrf->drr_offset, drrf->drr_length);

	return (err);
}
static int
receive_object_range(struct receive_writer_arg *rwa,
    struct drr_object_range *drror)
{
	/*
	 * By default, we assume this block is in our native format
	 * (ZFS_HOST_BYTEORDER). We then take into account whether
	 * the send stream is byteswapped (rwa->byteswap). Finally,
	 * we need to byteswap again if this particular block was
	 * in non-native format on the send side.
	 */
	boolean_t byteorder = ZFS_HOST_BYTEORDER ^ rwa->byteswap ^
	    !!DRR_IS_RAW_BYTESWAPPED(drror->drr_flags);

	/*
	 * Since dnode block sizes are constant, we should not need to worry
	 * about making sure that the dnode block size is the same on the
	 * sending and receiving sides for the time being. For non-raw sends,
	 * this does not matter (and in fact we do not send a DRR_OBJECT_RANGE
	 * record at all). Raw sends require this record type because the
	 * encryption parameters are used to protect an entire block of bonus
	 * buffers. If the size of dnode blocks ever becomes variable,
	 * handling will need to be added to ensure that dnode block sizes
	 * match on the sending and receiving side.
	 */
	if (drror->drr_numslots != DNODES_PER_BLOCK ||
	    P2PHASE(drror->drr_firstobj, DNODES_PER_BLOCK) != 0 ||
	    !rwa->raw)
		return (SET_ERROR(EINVAL));

	if (drror->drr_firstobj > rwa->max_object)
		rwa->max_object = drror->drr_firstobj;

	/*
	 * The DRR_OBJECT_RANGE handling must be deferred to receive_object()
	 * so that the block of dnodes is not written out when it's empty,
	 * and converted to a HOLE BP.
	 */
	rwa->or_crypt_params_present = B_TRUE;
	rwa->or_firstobj = drror->drr_firstobj;
	rwa->or_numslots = drror->drr_numslots;
	bcopy(drror->drr_salt, rwa->or_salt, ZIO_DATA_SALT_LEN);
	bcopy(drror->drr_iv, rwa->or_iv, ZIO_DATA_IV_LEN);
	bcopy(drror->drr_mac, rwa->or_mac, ZIO_DATA_MAC_LEN);
	rwa->or_byteorder = byteorder;

	return (0);
}
/* used to destroy the drc_ds on error */
static void
dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
{
	dsl_dataset_t *ds = drc->drc_ds;
	ds_hold_flags_t dsflags = (drc->drc_raw) ? 0 : DS_HOLD_FLAG_DECRYPT;

	/*
	 * Wait for the txg sync before cleaning up the receive. For
	 * resumable receives, this ensures that our resume state has
	 * been written out to disk. For raw receives, this ensures
	 * that the user accounting code will not attempt to do anything
	 * after we stopped receiving the dataset.
	 */
	txg_wait_synced(ds->ds_dir->dd_pool, 0);
	ds->ds_objset->os_raw_receive = B_FALSE;

	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
	if (drc->drc_resumable && !BP_IS_HOLE(dsl_dataset_get_blkptr(ds))) {
		rrw_exit(&ds->ds_bp_rwlock, FTAG);
		dsl_dataset_disown(ds, dsflags, dmu_recv_tag);
	} else {
		char name[ZFS_MAX_DATASET_NAME_LEN];
		rrw_exit(&ds->ds_bp_rwlock, FTAG);
		dsl_dataset_name(ds, name);
		dsl_dataset_disown(ds, dsflags, dmu_recv_tag);
		(void) dsl_destroy_head(name);
	}
}
static void
receive_cksum(struct receive_arg *ra, int len, void *buf)
{
	if (ra->byteswap) {
		(void) fletcher_4_incremental_byteswap(buf, len, &ra->cksum);
	} else {
		(void) fletcher_4_incremental_native(buf, len, &ra->cksum);
	}
}
/*
 * Read the payload into a buffer of size len, and update the current record's
 * payload field.
 * Allocate ra->next_rrd and read the next record's header into
 * ra->next_rrd->header.
 * Verify checksum of payload and next record.
 */
static int
receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf)
{
	int err;
	zio_cksum_t cksum_orig;
	zio_cksum_t *cksump;

	if (len != 0) {
		ASSERT3U(len, <=, SPA_MAXBLOCKSIZE);
		err = receive_read(ra, len, buf);
		if (err != 0)
			return (err);
		receive_cksum(ra, len, buf);

		/* note: rrd is NULL when reading the begin record's payload */
		if (ra->rrd != NULL) {
			ra->rrd->payload = buf;
			ra->rrd->payload_size = len;
			ra->rrd->bytes_read = ra->bytes_read;
		}
	} else {
		ASSERT3P(buf, ==, NULL);
	}

	ra->prev_cksum = ra->cksum;

	ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP);
	err = receive_read(ra, sizeof (ra->next_rrd->header),
	    &ra->next_rrd->header);
	ra->next_rrd->bytes_read = ra->bytes_read;

	if (err != 0) {
		kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
		ra->next_rrd = NULL;
		return (err);
	}
	if (ra->next_rrd->header.drr_type == DRR_BEGIN) {
		kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
		ra->next_rrd = NULL;
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Note: checksum is of everything up to but not including the
	 * checksum itself.
	 */
	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
	receive_cksum(ra,
	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
	    &ra->next_rrd->header);

	cksum_orig = ra->next_rrd->header.drr_u.drr_checksum.drr_checksum;
	cksump = &ra->next_rrd->header.drr_u.drr_checksum.drr_checksum;

	if (ra->byteswap)
		byteswap_record(&ra->next_rrd->header);

	if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) &&
	    !ZIO_CHECKSUM_EQUAL(ra->cksum, *cksump)) {
		kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
		ra->next_rrd = NULL;
		return (SET_ERROR(ECKSUM));
	}

	receive_cksum(ra, sizeof (cksum_orig), &cksum_orig);

	return (0);
}
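/*
 * Note the checksum ordering in the function above: cksum_orig is captured
 * before any byteswap, compared against the running checksum, and then
 * folded back in, so the running checksum covers every byte of the stream
 * including each record's own checksum field.
 */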
static void
objlist_create(struct objlist *list)
{
	list_create(&list->list, sizeof (struct receive_objnode),
	    offsetof(struct receive_objnode, node));
	list->last_lookup = 0;
}

static void
objlist_destroy(struct objlist *list)
{
	for (struct receive_objnode *n = list_remove_head(&list->list);
	    n != NULL; n = list_remove_head(&list->list)) {
		kmem_free(n, sizeof (*n));
	}
	list_destroy(&list->list);
}

/*
 * This function looks through the objlist to see if the specified object number
 * is contained in the objlist. In the process, it will remove all object
 * numbers in the list that are smaller than the specified object number. Thus,
 * any lookup of an object number smaller than a previously looked up object
 * number will always return false; therefore, all lookups should be done in
 * ascending order.
 */
static boolean_t
objlist_exists(struct objlist *list, uint64_t object)
{
	struct receive_objnode *node = list_head(&list->list);
	ASSERT3U(object, >=, list->last_lookup);
	list->last_lookup = object;
	while (node != NULL && node->object < object) {
		VERIFY3P(node, ==, list_remove_head(&list->list));
		kmem_free(node, sizeof (*node));
		node = list_head(&list->list);
	}
	return (node != NULL && node->object == object);
}

/*
 * The objlist is a list of object numbers stored in ascending order. However,
 * the insertion of new object numbers does not seek out the correct location to
 * store a new object number; instead, it appends it to the list for simplicity.
 * Thus, any users must take care to only insert new object numbers in ascending
 * order.
 */
static void
objlist_insert(struct objlist *list, uint64_t object)
{
	struct receive_objnode *node = kmem_zalloc(sizeof (*node), KM_SLEEP);
	node->object = object;
#ifdef ZFS_DEBUG
	struct receive_objnode *last_object = list_tail(&list->list);
	uint64_t last_objnum = (last_object != NULL ? last_object->object : 0);
	ASSERT3U(node->object, >, last_objnum);
#endif
	list_insert_tail(&list->list, node);
}
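/*
 * Typical usage of the objlist above: receive_read_record() calls
 * objlist_insert() when a DRR_OBJECT record shows that prefetching the
 * object would be useless, and receive_read_prefetch() consults
 * objlist_exists() before issuing prefetches.
 */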
/*
 * Issue the prefetch reads for any necessary indirect blocks.
 *
 * We use the object ignore list to tell us whether or not to issue prefetches
 * for a given object. We do this for both correctness (in case the blocksize
 * of an object has changed) and performance (if the object doesn't exist, don't
 * needlessly try to issue prefetches). We also trim the list as we go through
 * the stream to prevent it from growing to an unbounded size.
 *
 * The object numbers within will always be in sorted order, and any write
 * records we see will also be in sorted order, but they're not sorted with
 * respect to each other (i.e. we can get several object records before
 * receiving each object's write records). As a result, once we've reached a
 * given object number, we can safely remove any reference to lower object
 * numbers in the ignore list. In practice, we receive up to 32 object records
 * before receiving write records, so the list can have up to 32 nodes in it.
 */
static void
receive_read_prefetch(struct receive_arg *ra,
    uint64_t object, uint64_t offset, uint64_t length)
{
	if (!objlist_exists(&ra->ignore_objlist, object)) {
		dmu_prefetch(ra->os, object, 1, offset, length,
		    ZIO_PRIORITY_SYNC_READ);
	}
}
/*
 * Read records off the stream, issuing any necessary prefetches.
 */
static int
receive_read_record(struct receive_arg *ra)
{
	int err;

	switch (ra->rrd->header.drr_type) {
	case DRR_OBJECT:
	{
		struct drr_object *drro = &ra->rrd->header.drr_u.drr_object;
		uint32_t size = DRR_OBJECT_PAYLOAD_SIZE(drro);
		void *buf = NULL;
		dmu_object_info_t doi;

		if (size != 0)
			buf = kmem_zalloc(size, KM_SLEEP);

		err = receive_read_payload_and_next_header(ra, size, buf);
		if (err != 0) {
			kmem_free(buf, size);
			return (err);
		}
		err = dmu_object_info(ra->os, drro->drr_object, &doi);
		/*
		 * See receive_read_prefetch for an explanation why we're
		 * storing this object in the ignore_objlist.
		 */
		if (err == ENOENT || err == EEXIST ||
		    (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) {
			objlist_insert(&ra->ignore_objlist, drro->drr_object);
			err = 0;
		}
		return (err);
	}
	case DRR_FREEOBJECTS:
	{
		err = receive_read_payload_and_next_header(ra, 0, NULL);
		return (err);
	}
	case DRR_WRITE:
	{
		struct drr_write *drrw = &ra->rrd->header.drr_u.drr_write;
		arc_buf_t *abuf;
		boolean_t is_meta = DMU_OT_IS_METADATA(drrw->drr_type);

		if (ra->raw) {
			boolean_t byteorder = ZFS_HOST_BYTEORDER ^
			    !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^
			    ra->byteswap;

			abuf = arc_loan_raw_buf(dmu_objset_spa(ra->os),
			    drrw->drr_object, byteorder, drrw->drr_salt,
			    drrw->drr_iv, drrw->drr_mac, drrw->drr_type,
			    drrw->drr_compressed_size, drrw->drr_logical_size,
			    drrw->drr_compressiontype);
		} else if (DRR_WRITE_COMPRESSED(drrw)) {
			ASSERT3U(drrw->drr_compressed_size, >, 0);
			ASSERT3U(drrw->drr_logical_size, >=,
			    drrw->drr_compressed_size);
			abuf = arc_loan_compressed_buf(
			    dmu_objset_spa(ra->os),
			    drrw->drr_compressed_size, drrw->drr_logical_size,
			    drrw->drr_compressiontype);
		} else {
			abuf = arc_loan_buf(dmu_objset_spa(ra->os),
			    is_meta, drrw->drr_logical_size);
		}

		err = receive_read_payload_and_next_header(ra,
		    DRR_WRITE_PAYLOAD_SIZE(drrw), abuf->b_data);
		if (err != 0) {
			dmu_return_arcbuf(abuf);
			return (err);
		}
		ra->rrd->arc_buf = abuf;
		receive_read_prefetch(ra, drrw->drr_object, drrw->drr_offset,
		    drrw->drr_logical_size);
		return (err);
	}
	case DRR_WRITE_BYREF:
	{
		struct drr_write_byref *drrwb =
		    &ra->rrd->header.drr_u.drr_write_byref;
		err = receive_read_payload_and_next_header(ra, 0, NULL);
		receive_read_prefetch(ra, drrwb->drr_object, drrwb->drr_offset,
		    drrwb->drr_length);
		return (err);
	}
	case DRR_WRITE_EMBEDDED:
	{
		struct drr_write_embedded *drrwe =
		    &ra->rrd->header.drr_u.drr_write_embedded;
		uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8);
		void *buf = kmem_zalloc(size, KM_SLEEP);

		err = receive_read_payload_and_next_header(ra, size, buf);
		if (err != 0) {
			kmem_free(buf, size);
			return (err);
		}

		ra->rrd->payload = buf;
		ra->rrd->payload_size = size;
		receive_read_prefetch(ra, drrwe->drr_object, drrwe->drr_offset,
		    drrwe->drr_length);
		return (err);
	}
	case DRR_FREE:
	{
		/*
		 * It might be beneficial to prefetch indirect blocks here, but
		 * we don't really have the data to decide for sure.
		 */
		err = receive_read_payload_and_next_header(ra, 0, NULL);
		return (err);
	}
	case DRR_END:
	{
		struct drr_end *drre = &ra->rrd->header.drr_u.drr_end;
		if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum))
			return (SET_ERROR(ECKSUM));
		return (0);
	}
	case DRR_SPILL:
	{
		struct drr_spill *drrs = &ra->rrd->header.drr_u.drr_spill;
		arc_buf_t *abuf;
		int len = DRR_SPILL_PAYLOAD_SIZE(drrs);

		/* DRR_SPILL records are either raw or uncompressed */
		if (ra->raw) {
			boolean_t byteorder = ZFS_HOST_BYTEORDER ^
			    !!DRR_IS_RAW_BYTESWAPPED(drrs->drr_flags) ^
			    ra->byteswap;

			abuf = arc_loan_raw_buf(dmu_objset_spa(ra->os),
			    dmu_objset_id(ra->os), byteorder, drrs->drr_salt,
			    drrs->drr_iv, drrs->drr_mac, drrs->drr_type,
			    drrs->drr_compressed_size, drrs->drr_length,
			    drrs->drr_compressiontype);
		} else {
			abuf = arc_loan_buf(dmu_objset_spa(ra->os),
			    DMU_OT_IS_METADATA(drrs->drr_type),
			    drrs->drr_length);
		}

		err = receive_read_payload_and_next_header(ra, len,
		    abuf->b_data);
		if (err != 0) {
			dmu_return_arcbuf(abuf);
			return (err);
		}
		ra->rrd->arc_buf = abuf;
		return (err);
	}
	case DRR_OBJECT_RANGE:
	{
		err = receive_read_payload_and_next_header(ra, 0, NULL);
		return (err);
	}
	default:
		return (SET_ERROR(EINVAL));
	}
}
static void
dprintf_drr(struct receive_record_arg *rrd, int err)
{
#ifdef ZFS_DEBUG
	switch (rrd->header.drr_type) {
	case DRR_OBJECT:
	{
		struct drr_object *drro = &rrd->header.drr_u.drr_object;
		dprintf("drr_type = OBJECT obj = %llu type = %u "
		    "bonustype = %u blksz = %u bonuslen = %u cksumtype = %u "
		    "compress = %u dn_slots = %u err = %d\n",
		    drro->drr_object, drro->drr_type, drro->drr_bonustype,
		    drro->drr_blksz, drro->drr_bonuslen,
		    drro->drr_checksumtype, drro->drr_compress,
		    drro->drr_dn_slots, err);
		break;
	}
	case DRR_FREEOBJECTS:
	{
		struct drr_freeobjects *drrfo =
		    &rrd->header.drr_u.drr_freeobjects;
		dprintf("drr_type = FREEOBJECTS firstobj = %llu "
		    "numobjs = %llu err = %d\n",
		    drrfo->drr_firstobj, drrfo->drr_numobjs, err);
		break;
	}
	case DRR_WRITE:
	{
		struct drr_write *drrw = &rrd->header.drr_u.drr_write;
		dprintf("drr_type = WRITE obj = %llu type = %u offset = %llu "
		    "lsize = %llu cksumtype = %u cksumflags = %u "
		    "compress = %u psize = %llu err = %d\n",
		    drrw->drr_object, drrw->drr_type, drrw->drr_offset,
		    drrw->drr_logical_size, drrw->drr_checksumtype,
		    drrw->drr_flags, drrw->drr_compressiontype,
		    drrw->drr_compressed_size, err);
		break;
	}
	case DRR_WRITE_BYREF:
	{
		struct drr_write_byref *drrwbr =
		    &rrd->header.drr_u.drr_write_byref;
		dprintf("drr_type = WRITE_BYREF obj = %llu offset = %llu "
		    "length = %llu toguid = %llx refguid = %llx "
		    "refobject = %llu refoffset = %llu cksumtype = %u "
		    "cksumflags = %u err = %d\n",
		    drrwbr->drr_object, drrwbr->drr_offset,
		    drrwbr->drr_length, drrwbr->drr_toguid,
		    drrwbr->drr_refguid, drrwbr->drr_refobject,
		    drrwbr->drr_refoffset, drrwbr->drr_checksumtype,
		    drrwbr->drr_flags, err);
		break;
	}
	case DRR_WRITE_EMBEDDED:
	{
		struct drr_write_embedded *drrwe =
		    &rrd->header.drr_u.drr_write_embedded;
		dprintf("drr_type = WRITE_EMBEDDED obj = %llu offset = %llu "
		    "length = %llu compress = %u etype = %u lsize = %u "
		    "psize = %u err = %d\n",
		    drrwe->drr_object, drrwe->drr_offset, drrwe->drr_length,
		    drrwe->drr_compression, drrwe->drr_etype,
		    drrwe->drr_lsize, drrwe->drr_psize, err);
		break;
	}
	case DRR_FREE:
	{
		struct drr_free *drrf = &rrd->header.drr_u.drr_free;
		dprintf("drr_type = FREE obj = %llu offset = %llu "
		    "length = %lld err = %d\n",
		    drrf->drr_object, drrf->drr_offset, drrf->drr_length,
		    err);
		break;
	}
	case DRR_SPILL:
	{
		struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
		dprintf("drr_type = SPILL obj = %llu length = %llu "
		    "err = %d\n", drrs->drr_object, drrs->drr_length, err);
		break;
	}
	default:
		return;
	}
#endif
}
/*
 * Commit the records to the pool.
 */
static int
receive_process_record(struct receive_writer_arg *rwa,
    struct receive_record_arg *rrd)
{
	int err;

	/* Processing in order, therefore bytes_read should be increasing. */
	ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read);
	rwa->bytes_read = rrd->bytes_read;

	switch (rrd->header.drr_type) {
	case DRR_OBJECT:
	{
		struct drr_object *drro = &rrd->header.drr_u.drr_object;
		err = receive_object(rwa, drro, rrd->payload);
		kmem_free(rrd->payload, rrd->payload_size);
		rrd->payload = NULL;
		break;
	}
	case DRR_FREEOBJECTS:
	{
		struct drr_freeobjects *drrfo =
		    &rrd->header.drr_u.drr_freeobjects;
		err = receive_freeobjects(rwa, drrfo);
		break;
	}
	case DRR_WRITE:
	{
		struct drr_write *drrw = &rrd->header.drr_u.drr_write;
		err = receive_write(rwa, drrw, rrd->arc_buf);
		/* if receive_write() is successful, it consumes the arc_buf */
		if (err != 0)
			dmu_return_arcbuf(rrd->arc_buf);
		rrd->arc_buf = NULL;
		rrd->payload = NULL;
		break;
	}
	case DRR_WRITE_BYREF:
	{
		struct drr_write_byref *drrwbr =
		    &rrd->header.drr_u.drr_write_byref;
		err = receive_write_byref(rwa, drrwbr);
		break;
	}
	case DRR_WRITE_EMBEDDED:
	{
		struct drr_write_embedded *drrwe =
		    &rrd->header.drr_u.drr_write_embedded;
		err = receive_write_embedded(rwa, drrwe, rrd->payload);
		kmem_free(rrd->payload, rrd->payload_size);
		rrd->payload = NULL;
		break;
	}
	case DRR_FREE:
	{
		struct drr_free *drrf = &rrd->header.drr_u.drr_free;
		err = receive_free(rwa, drrf);
		break;
	}
	case DRR_SPILL:
	{
		struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
		err = receive_spill(rwa, drrs, rrd->arc_buf);
		/* if receive_spill() is successful, it consumes the arc_buf */
		if (err != 0)
			dmu_return_arcbuf(rrd->arc_buf);
		rrd->arc_buf = NULL;
		rrd->payload = NULL;
		break;
	}
	case DRR_OBJECT_RANGE:
	{
		struct drr_object_range *drror =
		    &rrd->header.drr_u.drr_object_range;
		return (receive_object_range(rwa, drror));
	}
	default:
		return (SET_ERROR(EINVAL));
	}

	if (err != 0)
		dprintf_drr(rrd, err);

	return (err);
}
/*
 * dmu_recv_stream's worker thread; pull records off the queue, and then call
 * receive_process_record.  When we're done, signal the main thread and exit.
 */
static void
receive_writer_thread(void *arg)
{
	struct receive_writer_arg *rwa = arg;
	struct receive_record_arg *rrd;
	fstrans_cookie_t cookie = spl_fstrans_mark();

	for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker;
	    rrd = bqueue_dequeue(&rwa->q)) {
		/*
		 * If there's an error, the main thread will stop putting things
		 * on the queue, but we need to clear everything in it before we
		 * can exit.
		 */
		if (rwa->err == 0) {
			rwa->err = receive_process_record(rwa, rrd);
		} else if (rrd->arc_buf != NULL) {
			dmu_return_arcbuf(rrd->arc_buf);
			rrd->arc_buf = NULL;
			rrd->payload = NULL;
		} else if (rrd->payload != NULL) {
			kmem_free(rrd->payload, rrd->payload_size);
			rrd->payload = NULL;
		}
		kmem_free(rrd, sizeof (*rrd));
	}
	kmem_free(rrd, sizeof (*rrd));
	mutex_enter(&rwa->mutex);
	rwa->done = B_TRUE;
	cv_signal(&rwa->cv);
	mutex_exit(&rwa->mutex);
	spl_fstrans_unmark(cookie);
	thread_exit();
}
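
/*
 * The shutdown handshake above is the consumer half of a standard bounded
 * queue pattern.  A simplified sketch of the producer half (adapted from
 * dmu_recv_stream() below; error handling omitted):
 *
 *	(void) bqueue_init(&rwa->q, ...);
 *	(void) thread_create(..., receive_writer_thread, rwa, ...);
 *	while (have records && rwa->err == 0)
 *		bqueue_enqueue(&rwa->q, rrd, weight);
 *	eos_rrd->eos_marker = B_TRUE;		sentinel ends the loop above
 *	bqueue_enqueue(&rwa->q, eos_rrd, 1);
 *	mutex_enter(&rwa->mutex);
 *	while (!rwa->done)
 *		cv_wait(&rwa->cv, &rwa->mutex);
 *	mutex_exit(&rwa->mutex);
 */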
static int
resume_check(struct receive_arg *ra, nvlist_t *begin_nvl)
{
	uint64_t val;
	objset_t *mos = dmu_objset_pool(ra->os)->dp_meta_objset;
	uint64_t dsobj = dmu_objset_id(ra->os);
	uint64_t resume_obj, resume_off;

	if (nvlist_lookup_uint64(begin_nvl,
	    "resume_object", &resume_obj) != 0 ||
	    nvlist_lookup_uint64(begin_nvl,
	    "resume_offset", &resume_off) != 0) {
		return (SET_ERROR(EINVAL));
	}
	VERIFY0(zap_lookup(mos, dsobj,
	    DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val));
	if (resume_obj != val)
		return (SET_ERROR(EINVAL));
	VERIFY0(zap_lookup(mos, dsobj,
	    DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val));
	if (resume_off != val)
		return (SET_ERROR(EINVAL));

	return (0);
}
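
/*
 * For reference, the BEGIN record of a resumed send carries a payload
 * nvlist holding the sender's restart coordinates, schematically:
 *
 *	resume_object = <object where the interrupted receive stopped>
 *	resume_offset = <offset within that object>
 *
 * resume_check() above simply verifies that these match the
 * DS_FIELD_RESUME_OBJECT/DS_FIELD_RESUME_OFFSET values recorded in this
 * dataset when the earlier receive was interrupted.
 */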
/*
 * Read in the stream's records, one by one, and apply them to the pool.  There
 * are two threads involved; the thread that calls this function will spin up a
 * worker thread, read the records off the stream one by one, and issue
 * prefetches for any necessary indirect blocks.  It will then push the records
 * onto an internal blocking queue.  The worker thread will pull the records off
 * the queue, and actually write the data into the DMU.  This way, the worker
 * thread doesn't have to wait for reads to complete, since everything it needs
 * (the indirect blocks) will be prefetched.
 *
 * NB: callers *must* call dmu_recv_end() if this succeeds.
 */
int
dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
    int cleanup_fd, uint64_t *action_handlep)
{
	int err = 0;
	struct receive_arg *ra;
	struct receive_writer_arg *rwa;
	int featureflags;
	uint32_t payloadlen;
	void *payload;
	nvlist_t *begin_nvl = NULL;

	ra = kmem_zalloc(sizeof (*ra), KM_SLEEP);
	rwa = kmem_zalloc(sizeof (*rwa), KM_SLEEP);

	ra->byteswap = drc->drc_byteswap;
	ra->raw = drc->drc_raw;
	ra->cksum = drc->drc_cksum;
	ra->vp = vp;
	ra->voff = *voffp;

	if (dsl_dataset_is_zapified(drc->drc_ds)) {
		(void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset,
		    drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES,
		    sizeof (ra->bytes_read), 1, &ra->bytes_read);
	}

	objlist_create(&ra->ignore_objlist);

	/* these were verified in dmu_recv_begin */
	ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==,
	    DMU_SUBSTREAM);
	ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES);

	/*
	 * Open the objset we are modifying.
	 */
	VERIFY0(dmu_objset_from_ds(drc->drc_ds, &ra->os));

	ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT);

	featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
	ra->featureflags = featureflags;

	ASSERT0(ra->os->os_encrypted &&
	    (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA));

	/* if this stream is dedup'ed, set up the avl tree for guid mapping */
	if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
		minor_t minor;

		if (cleanup_fd == -1) {
			err = SET_ERROR(EBADF);
			goto out;
		}
		err = zfs_onexit_fd_hold(cleanup_fd, &minor);
		if (err != 0) {
			cleanup_fd = -1;
			goto out;
		}

		if (*action_handlep == 0) {
			rwa->guid_to_ds_map =
			    kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
			avl_create(rwa->guid_to_ds_map, guid_compare,
			    sizeof (guid_map_entry_t),
			    offsetof(guid_map_entry_t, avlnode));
			err = zfs_onexit_add_cb(minor,
			    free_guid_map_onexit, rwa->guid_to_ds_map,
			    action_handlep);
			if (err != 0)
				goto out;
		} else {
			err = zfs_onexit_cb_data(minor, *action_handlep,
			    (void **)&rwa->guid_to_ds_map);
			if (err != 0)
				goto out;
		}

		drc->drc_guid_to_ds_map = rwa->guid_to_ds_map;
	}

	payloadlen = drc->drc_drr_begin->drr_payloadlen;
	payload = NULL;
	if (payloadlen != 0)
		payload = kmem_alloc(payloadlen, KM_SLEEP);

	err = receive_read_payload_and_next_header(ra, payloadlen, payload);
	if (err != 0) {
		if (payloadlen != 0)
			kmem_free(payload, payloadlen);
		goto out;
	}
	if (payloadlen != 0) {
		err = nvlist_unpack(payload, payloadlen, &begin_nvl, KM_SLEEP);
		kmem_free(payload, payloadlen);
		if (err != 0)
			goto out;
	}

	/* handle DSL encryption key payload */
	if (featureflags & DMU_BACKUP_FEATURE_RAW) {
		nvlist_t *keynvl = NULL;

		ASSERT(ra->os->os_encrypted);
		ASSERT(drc->drc_raw);

		err = nvlist_lookup_nvlist(begin_nvl, "crypt_keydata", &keynvl);
		if (err != 0)
			goto out;

		/*
		 * If this is a new dataset we set the key immediately.
		 * Otherwise we don't want to change the key until we
		 * are sure the rest of the receive succeeded so we stash
		 * the keynvl away until then.
		 */
		err = dsl_crypto_recv_raw(spa_name(ra->os->os_spa),
		    drc->drc_ds->ds_object, drc->drc_drrb->drr_type,
		    keynvl, drc->drc_newfs);
		if (err != 0)
			goto out;

		if (!drc->drc_newfs)
			drc->drc_keynvl = fnvlist_dup(keynvl);
	}

	if (featureflags & DMU_BACKUP_FEATURE_RESUMING) {
		err = resume_check(ra, begin_nvl);
		if (err != 0)
			goto out;
	}

	(void) bqueue_init(&rwa->q,
	    MAX(zfs_recv_queue_length, 2 * zfs_max_recordsize),
	    offsetof(struct receive_record_arg, node));
	cv_init(&rwa->cv, NULL, CV_DEFAULT, NULL);
	mutex_init(&rwa->mutex, NULL, MUTEX_DEFAULT, NULL);
	rwa->os = ra->os;
	rwa->byteswap = drc->drc_byteswap;
	rwa->resumable = drc->drc_resumable;
	rwa->raw = drc->drc_raw;
	rwa->os->os_raw_receive = drc->drc_raw;

	(void) thread_create(NULL, 0, receive_writer_thread, rwa, 0, curproc,
	    TS_RUN, minclsyspri);
	/*
	 * We're reading rwa->err without locks, which is safe since we are the
	 * only reader, and the worker thread is the only writer.  It's ok if we
	 * miss a write for an iteration or two of the loop, since the writer
	 * thread will keep freeing records we send it until we send it an eos
	 * marker.
	 *
	 * We can leave this loop in 3 ways:  First, if rwa->err is
	 * non-zero.  In that case, the writer thread will free the rrd we just
	 * pushed.  Second, if we're interrupted; in that case, either it's the
	 * first loop and ra->rrd was never allocated, or it's later and ra->rrd
	 * has been handed off to the writer thread who will free it.  Finally,
	 * if receive_read_record fails or we're at the end of the stream, then
	 * we free ra->rrd and exit.
	 */
	while (rwa->err == 0) {
		if (issig(JUSTLOOKING) && issig(FORREAL)) {
			err = SET_ERROR(EINTR);
			break;
		}

		ASSERT3P(ra->rrd, ==, NULL);
		ra->rrd = ra->next_rrd;
		ra->next_rrd = NULL;
		/* Allocates and loads header into ra->next_rrd */
		err = receive_read_record(ra);

		if (ra->rrd->header.drr_type == DRR_END || err != 0) {
			kmem_free(ra->rrd, sizeof (*ra->rrd));
			ra->rrd = NULL;
			break;
		}

		bqueue_enqueue(&rwa->q, ra->rrd,
		    sizeof (struct receive_record_arg) + ra->rrd->payload_size);
		ra->rrd = NULL;
	}
	if (ra->next_rrd == NULL)
		ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP);
	ra->next_rrd->eos_marker = B_TRUE;
	bqueue_enqueue(&rwa->q, ra->next_rrd, 1);

	mutex_enter(&rwa->mutex);
	while (!rwa->done) {
		cv_wait(&rwa->cv, &rwa->mutex);
	}
	mutex_exit(&rwa->mutex);

	/*
	 * If we are receiving a full stream as a clone, all object IDs which
	 * are greater than the maximum ID referenced in the stream are
	 * by definition unused and must be freed.
	 */
	if (drc->drc_clone && drc->drc_drrb->drr_fromguid == 0) {
		uint64_t obj = rwa->max_object + 1;
		int free_err = 0;
		int next_err = 0;

		while (next_err == 0) {
			free_err = dmu_free_long_object(rwa->os, obj);
			if (free_err != 0 && free_err != ENOENT)
				break;

			next_err = dmu_object_next(rwa->os, &obj, FALSE, 0);
		}

		if (err == 0) {
			if (free_err != 0 && free_err != ENOENT)
				err = free_err;
			else if (next_err != ESRCH)
				err = next_err;
		}
	}

	cv_destroy(&rwa->cv);
	mutex_destroy(&rwa->mutex);
	bqueue_destroy(&rwa->q);
	if (err == 0)
		err = rwa->err;

out:
	nvlist_free(begin_nvl);
	if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
		zfs_onexit_fd_rele(cleanup_fd);

	if (err != 0) {
		/*
		 * Clean up references.  If receive is not resumable,
		 * destroy what we created, so we don't leave it in
		 * the inconsistent state.
		 */
		dmu_recv_cleanup_ds(drc);
		nvlist_free(drc->drc_keynvl);
	}

	*voffp = ra->voff;
	objlist_destroy(&ra->ignore_objlist);
	kmem_free(ra, sizeof (*ra));
	kmem_free(rwa, sizeof (*rwa));
	return (err);
}
static int
dmu_recv_end_check(void *arg, dmu_tx_t *tx)
{
	dmu_recv_cookie_t *drc = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	int error;

	ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag);

	if (!drc->drc_newfs) {
		dsl_dataset_t *origin_head;

		error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head);
		if (error != 0)
			return (error);
		if (drc->drc_force) {
			/*
			 * We will destroy any snapshots in tofs (i.e. before
			 * origin_head) that are after the origin (which is
			 * the snap before drc_ds, because drc_ds can not
			 * have any snaps of its own).
			 */
			uint64_t obj;

			obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
			while (obj !=
			    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
				dsl_dataset_t *snap;
				error = dsl_dataset_hold_obj(dp, obj, FTAG,
				    &snap);
				if (error != 0)
					break;
				if (snap->ds_dir != origin_head->ds_dir)
					error = SET_ERROR(EINVAL);
				if (error == 0) {
					error = dsl_destroy_snapshot_check_impl(
					    snap, B_FALSE);
				}
				obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
				dsl_dataset_rele(snap, FTAG);
				if (error != 0)
					break;
			}
			if (error != 0) {
				dsl_dataset_rele(origin_head, FTAG);
				return (error);
			}
		}
		if (drc->drc_keynvl != NULL) {
			error = dsl_crypto_recv_raw_key_check(drc->drc_ds,
			    drc->drc_keynvl, tx);
			if (error != 0) {
				dsl_dataset_rele(origin_head, FTAG);
				return (error);
			}
		}

		error = dsl_dataset_clone_swap_check_impl(drc->drc_ds,
		    origin_head, drc->drc_force, drc->drc_owner, tx);
		if (error != 0) {
			dsl_dataset_rele(origin_head, FTAG);
			return (error);
		}
		error = dsl_dataset_snapshot_check_impl(origin_head,
		    drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
		dsl_dataset_rele(origin_head, FTAG);
		if (error != 0)
			return (error);

		error = dsl_destroy_head_check_impl(drc->drc_ds, 1);
	} else {
		error = dsl_dataset_snapshot_check_impl(drc->drc_ds,
		    drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred);
	}
	return (error);
}
static void
dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
{
	dmu_recv_cookie_t *drc = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	boolean_t encrypted = drc->drc_ds->ds_dir->dd_crypto_obj != 0;

	spa_history_log_internal_ds(drc->drc_ds, "finish receiving",
	    tx, "snap=%s", drc->drc_tosnap);
	drc->drc_ds->ds_objset->os_raw_receive = B_FALSE;

	if (!drc->drc_newfs) {
		dsl_dataset_t *origin_head;

		VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG,
		    &origin_head));

		if (drc->drc_force) {
			/*
			 * Destroy any snapshots of drc_tofs (origin_head)
			 * after the origin (the snap before drc_ds).
			 */
			uint64_t obj;

			obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj;
			while (obj !=
			    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) {
				dsl_dataset_t *snap;
				VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG,
				    &snap));
				ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir);
				obj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
				dsl_destroy_snapshot_sync_impl(snap,
				    B_FALSE, tx);
				dsl_dataset_rele(snap, FTAG);
			}
		}
		if (drc->drc_keynvl != NULL) {
			dsl_crypto_recv_raw_key_sync(drc->drc_ds,
			    drc->drc_keynvl, tx);
			nvlist_free(drc->drc_keynvl);
			drc->drc_keynvl = NULL;
		}

		VERIFY3P(drc->drc_ds->ds_prev, ==, origin_head->ds_prev);

		dsl_dataset_clone_swap_sync_impl(drc->drc_ds,
		    origin_head, tx);
		dsl_dataset_snapshot_sync_impl(origin_head,
		    drc->drc_tosnap, tx);

		/* set snapshot's creation time and guid */
		dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx);
		dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time =
		    drc->drc_drrb->drr_creation_time;
		dsl_dataset_phys(origin_head->ds_prev)->ds_guid =
		    drc->drc_drrb->drr_toguid;
		dsl_dataset_phys(origin_head->ds_prev)->ds_flags &=
		    ~DS_FLAG_INCONSISTENT;

		dmu_buf_will_dirty(origin_head->ds_dbuf, tx);
		dsl_dataset_phys(origin_head)->ds_flags &=
		    ~DS_FLAG_INCONSISTENT;

		drc->drc_newsnapobj =
		    dsl_dataset_phys(origin_head)->ds_prev_snap_obj;

		dsl_dataset_rele(origin_head, FTAG);
		dsl_destroy_head_sync_impl(drc->drc_ds, tx);

		if (drc->drc_owner != NULL)
			VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner);
	} else {
		dsl_dataset_t *ds = drc->drc_ds;

		dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx);

		/* set snapshot's creation time and guid */
		dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
		dsl_dataset_phys(ds->ds_prev)->ds_creation_time =
		    drc->drc_drrb->drr_creation_time;
		dsl_dataset_phys(ds->ds_prev)->ds_guid =
		    drc->drc_drrb->drr_toguid;
		dsl_dataset_phys(ds->ds_prev)->ds_flags &=
		    ~DS_FLAG_INCONSISTENT;

		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
		if (dsl_dataset_has_resume_receive_state(ds)) {
			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
			    DS_FIELD_RESUME_FROMGUID, tx);
			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
			    DS_FIELD_RESUME_OBJECT, tx);
			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
			    DS_FIELD_RESUME_OFFSET, tx);
			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
			    DS_FIELD_RESUME_BYTES, tx);
			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
			    DS_FIELD_RESUME_TOGUID, tx);
			(void) zap_remove(dp->dp_meta_objset, ds->ds_object,
			    DS_FIELD_RESUME_TONAME, tx);
		}
		drc->drc_newsnapobj =
		    dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
	}
	zvol_create_minors(dp->dp_spa, drc->drc_tofs, B_TRUE);

	/*
	 * Release the hold from dmu_recv_begin.  This must be done before
	 * we return to open context, so that when we free the dataset's dnode
	 * we can evict its bonus buffer.  Since the dataset may be destroyed
	 * at this point (and therefore won't have a valid pointer to the spa)
	 * we release the key mapping manually here while we do have a valid
	 * pointer, if it exists.
	 */
	if (!drc->drc_raw && encrypted) {
		(void) spa_keystore_remove_mapping(dmu_tx_pool(tx)->dp_spa,
		    drc->drc_ds->ds_object, drc->drc_ds);
	}
	dsl_dataset_disown(drc->drc_ds, 0, dmu_recv_tag);
	drc->drc_ds = NULL;
}
static int
add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj,
    boolean_t raw)
{
	dsl_pool_t *dp;
	dsl_dataset_t *snapds;
	guid_map_entry_t *gmep;
	objset_t *os;
	ds_hold_flags_t dsflags = (raw) ? 0 : DS_HOLD_FLAG_DECRYPT;
	int err;

	ASSERT(guid_map != NULL);

	err = dsl_pool_hold(name, FTAG, &dp);
	if (err != 0)
		return (err);
	gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP);
	err = dsl_dataset_own_obj(dp, snapobj, dsflags, gmep, &snapds);
	if (err == 0) {
		/*
		 * If this is a deduplicated raw send stream, we need
		 * to make sure that we can still read raw blocks from
		 * earlier datasets in the stream, so we set the
		 * os_raw_receive flag now.
		 */
		if (raw) {
			err = dmu_objset_from_ds(snapds, &os);
			if (err != 0) {
				dsl_dataset_disown(snapds, dsflags, FTAG);
				dsl_pool_rele(dp, FTAG);
				kmem_free(gmep, sizeof (*gmep));
				return (err);
			}
			os->os_raw_receive = B_TRUE;
		}

		gmep->raw = raw;
		gmep->guid = dsl_dataset_phys(snapds)->ds_guid;
		gmep->gme_ds = snapds;
		avl_add(guid_map, gmep);
	} else {
		kmem_free(gmep, sizeof (*gmep));
	}

	dsl_pool_rele(dp, FTAG);
	return (err);
}
static int dmu_recv_end_modified_blocks = 3;

static int
dmu_recv_existing_end(dmu_recv_cookie_t *drc)
{
#ifdef _KERNEL
	/*
	 * We will be destroying the ds; make sure its origin is unmounted if
	 * necessary.
	 */
	char name[ZFS_MAX_DATASET_NAME_LEN];
	dsl_dataset_name(drc->drc_ds, name);
	zfs_destroy_unmount_origin(name);
#endif

	return (dsl_sync_task(drc->drc_tofs,
	    dmu_recv_end_check, dmu_recv_end_sync, drc,
	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
}

static int
dmu_recv_new_end(dmu_recv_cookie_t *drc)
{
	return (dsl_sync_task(drc->drc_tofs,
	    dmu_recv_end_check, dmu_recv_end_sync, drc,
	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
}
int
dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
{
	int error;

	drc->drc_owner = owner;

	if (drc->drc_newfs)
		error = dmu_recv_new_end(drc);
	else
		error = dmu_recv_existing_end(drc);

	if (error != 0) {
		dmu_recv_cleanup_ds(drc);
		nvlist_free(drc->drc_keynvl);
	} else if (drc->drc_guid_to_ds_map != NULL) {
		(void) add_ds_to_guidmap(drc->drc_tofs, drc->drc_guid_to_ds_map,
		    drc->drc_newsnapobj, drc->drc_raw);
	}
	return (error);
}
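
/*
 * For reference, a minimal sketch of the receive lifecycle as driven by
 * callers of this file (schematic only; argument lists abbreviated and
 * error handling omitted):
 *
 *	dmu_recv_cookie_t drc;
 *	err = dmu_recv_begin(tofs, tosnap, drr_begin, ..., &drc);
 *	err = dmu_recv_stream(&drc, vp, &voff, cleanup_fd, &action_handle);
 *	err = dmu_recv_end(&drc, owner);
 *
 * dmu_recv_end() runs dmu_recv_end_check()/dmu_recv_end_sync() as a
 * dsl_sync_task, which is what finally makes the received snapshot visible
 * and clears DS_FLAG_INCONSISTENT.
 */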
/*
 * Return TRUE if this objset is currently being received into.
 */
boolean_t
dmu_objset_is_receiving(objset_t *os)
{
	return (os->os_dsl_dataset != NULL &&
	    os->os_dsl_dataset->ds_owner == dmu_recv_tag);
}

#if defined(_KERNEL)
module_param(zfs_recv_queue_length, int, 0644);
MODULE_PARM_DESC(zfs_recv_queue_length, "Maximum receive queue length");
#endif /* _KERNEL */