/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
 * Copyright (c) 2014, Joyent, Inc. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
 * Copyright 2016 RackTop Systems.
 * Copyright (c) 2016 Actifio, Inc. All rights reserved.
 */
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/spa_impl.h>
#include <sys/zfs_ioctl.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_znode.h>
#include <zfs_fletcher.h>
#include <sys/zfs_onexit.h>
#include <sys/dmu_send.h>
#include <sys/dsl_destroy.h>
#include <sys/blkptr.h>
#include <sys/dsl_bookmark.h>
#include <sys/zfeature.h>
#include <sys/bqueue.h>
#include <sys/policy.h>
/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
int zfs_send_corrupt_data = B_FALSE;
int zfs_send_queue_length = SPA_MAXBLOCKSIZE;
/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */
int zfs_send_set_freerecords_bit = B_TRUE;
/* Set this tunable to FALSE to disable sending unmodified spill blocks. */
int zfs_send_unmodified_spill_blocks = B_TRUE;

/*
 * Use this to override the recordsize calculation for fast zfs send estimates.
 */
unsigned long zfs_override_estimate_recordsize = 0;
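/*
 * On Linux these tunables are exposed as module parameters (see the
 * module_param() declarations at the bottom of this file), so they can
 * be adjusted at runtime, e.g.:
 *   echo 16777216 > /sys/module/zfs/parameters/zfs_send_queue_length
 */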
#define	BP_SPAN(datablkszsec, indblkshift, level) \
	(((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \
	(level) * (indblkshift - SPA_BLKPTRSHIFT)))
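/*
 * BP_SPAN() computes how many bytes of an object a single block pointer
 * at the given level covers.  Worked example: with 128K data blocks
 * (datablkszsec = 256) and a 128K indirect block size (indblkshift = 17),
 * each indirect block holds 2^(17 - SPA_BLKPTRSHIFT) = 1024 block
 * pointers, so a level-1 bp spans 256 << (9 + 1 * (17 - 7)) = 128M.
 */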
struct send_thread_arg {
	bqueue_t	q;
	dsl_dataset_t	*ds;		/* Dataset to traverse */
	uint64_t	fromtxg;	/* Traverse from this txg */
	int		flags;		/* flags to pass to traverse_dataset */
	int		error_code;
	boolean_t	cancel;
	zbookmark_phys_t resume;
};
struct send_block_record {
	boolean_t		eos_marker; /* Marks the end of the stream */
	blkptr_t		bp;
	zbookmark_phys_t	zb;
	uint8_t			indblkshift;
	uint16_t		datablkszsec;
	bqueue_node_t		ln;
};
typedef struct dump_bytes_io {
	dmu_sendarg_t	*dbi_dsp;
	void		*dbi_buf;
	int		dbi_len;
} dump_bytes_io_t;
static int do_dump(dmu_sendarg_t *dsa, struct send_block_record *data);
static void
dump_bytes_cb(void *arg)
{
	dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg;
	dmu_sendarg_t *dsp = dbi->dbi_dsp;
	dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os);
	ssize_t resid; /* have to get resid to get detailed errno */

	/*
	 * The code does not rely on len being a multiple of 8.  We keep
	 * this assertion because of the corresponding assertion in
	 * receive_read().  Keeping this assertion ensures that we do not
	 * inadvertently break backwards compatibility (causing the assertion
	 * in receive_read() to trigger on old software).  Newer feature flags
	 * (such as raw send) may break this assertion since they were
	 * introduced after the requirement was made obsolete.
	 */
	ASSERT(dbi->dbi_len % 8 == 0 ||
	    (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) != 0);

	dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp,
	    (caddr_t)dbi->dbi_buf, dbi->dbi_len,
	    0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);

	mutex_enter(&ds->ds_sendstream_lock);
	*dsp->dsa_off += dbi->dbi_len;
	mutex_exit(&ds->ds_sendstream_lock);
}
static int
dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
{
	dump_bytes_io_t dbi;

	dbi.dbi_dsp = dsp;
	dbi.dbi_buf = buf;
	dbi.dbi_len = len;

#if defined(HAVE_LARGE_STACKS)
	dump_bytes_cb(&dbi);
#else
	/*
	 * The vn_rdwr() call is performed in a taskq to ensure that there is
	 * always enough stack space to write safely to the target filesystem.
	 * The ZIO_TYPE_FREE threads are used because there can be a lot of
	 * them and they are used in vdev_file.c for a similar purpose.
	 */
	spa_taskq_dispatch_sync(dmu_objset_spa(dsp->dsa_os), ZIO_TYPE_FREE,
	    ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP);
#endif /* HAVE_LARGE_STACKS */

	return (dsp->dsa_err);
}
/*
 * For all record types except BEGIN, fill in the checksum (overlaid in
 * drr_u.drr_checksum.drr_checksum).  The checksum verifies everything
 * up to the start of the checksum itself.
 */
static int
dump_record(dmu_sendarg_t *dsp, void *payload, int payload_len)
{
	ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
	    ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t));
	(void) fletcher_4_incremental_native(dsp->dsa_drr,
	    offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum),
	    &dsp->dsa_zc);
	if (dsp->dsa_drr->drr_type == DRR_BEGIN) {
		dsp->dsa_sent_begin = B_TRUE;
	} else {
		ASSERT(ZIO_CHECKSUM_IS_ZERO(&dsp->dsa_drr->drr_u.
		    drr_checksum.drr_checksum));
		dsp->dsa_drr->drr_u.drr_checksum.drr_checksum = dsp->dsa_zc;
	}
	if (dsp->dsa_drr->drr_type == DRR_END) {
		dsp->dsa_sent_end = B_TRUE;
	}
	(void) fletcher_4_incremental_native(&dsp->dsa_drr->
	    drr_u.drr_checksum.drr_checksum,
	    sizeof (zio_cksum_t), &dsp->dsa_zc);
	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (SET_ERROR(EINTR));
	if (payload_len != 0) {
		(void) fletcher_4_incremental_native(payload, payload_len,
		    &dsp->dsa_zc);
		if (dump_bytes(dsp, payload, payload_len) != 0)
			return (SET_ERROR(EINTR));
	}

	return (0);
}
/*
 * Fill in the drr_free struct, or perform aggregation if the previous record is
 * also a free record, and the two are adjacent.
 *
 * Note that we send free records even for a full send, because we want to be
 * able to receive a full send as a clone, which requires a list of all the free
 * and freeobject records that were generated on the source.
 */
static int
dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
    uint64_t length)
{
	struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);

	/*
	 * When we receive a free record, dbuf_free_range() assumes
	 * that the receiving system doesn't have any dbufs in the range
	 * being freed.  This is always true because there is a one-record
	 * constraint: we only send one WRITE record for any given
	 * object,offset.  We know that the one-record constraint is
	 * true because we always send data in increasing order by
	 * object,offset.
	 *
	 * If the increasing-order constraint ever changes, we should find
	 * another way to assert that the one-record constraint is still
	 * satisfied.
	 */
	ASSERT(object > dsp->dsa_last_data_object ||
	    (object == dsp->dsa_last_data_object &&
	    offset > dsp->dsa_last_data_offset));

	/*
	 * If there is a pending op, but it's not PENDING_FREE, push it out,
	 * since free block aggregation can only be done for blocks of the
	 * same type (i.e., DRR_FREE records can only be aggregated with
	 * other DRR_FREE records.  DRR_FREEOBJECTS records can only be
	 * aggregated with other DRR_FREEOBJECTS records).
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	if (dsp->dsa_pending_op == PENDING_FREE) {
		/*
		 * There should never be a PENDING_FREE if length is
		 * DMU_OBJECT_END (because dump_dnode is the only place where
		 * this function is called with a DMU_OBJECT_END, and only after
		 * flushing any pending record).
		 */
		ASSERT(length != DMU_OBJECT_END);
		/*
		 * Check to see whether this free block can be aggregated
		 * with the pending one.
		 */
		if (drrf->drr_object == object && drrf->drr_offset +
		    drrf->drr_length == offset) {
			if (offset + length < offset)
				drrf->drr_length = DMU_OBJECT_END;
			else
				drrf->drr_length += length;
			return (0);
		} else {
			/* not a continuation.  Push out pending record */
			if (dump_record(dsp, NULL, 0) != 0)
				return (SET_ERROR(EINTR));
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}
	/* create a FREE record and make it pending */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREE;
	drrf->drr_object = object;
	drrf->drr_offset = offset;
	if (offset + length < offset)
		drrf->drr_length = DMU_OBJECT_END;
	else
		drrf->drr_length = length;
	drrf->drr_toguid = dsp->dsa_toguid;
	if (length == DMU_OBJECT_END) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
	} else {
		dsp->dsa_pending_op = PENDING_FREE;
	}

	return (0);
}
static int
dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, uint64_t object,
    uint64_t offset, int lsize, int psize, const blkptr_t *bp, void *data)
{
	uint64_t payload_size;
	boolean_t raw = (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW);
	struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);

	/*
	 * We send data in increasing object, offset order.
	 * See comment in dump_free() for details.
	 */
	ASSERT(object > dsp->dsa_last_data_object ||
	    (object == dsp->dsa_last_data_object &&
	    offset > dsp->dsa_last_data_offset));
	dsp->dsa_last_data_object = object;
	dsp->dsa_last_data_offset = offset + lsize - 1;

	/*
	 * If there is any kind of pending aggregation (currently either
	 * a grouping of free objects or free blocks), push it out to
	 * the stream, since aggregation can't be done across operations
	 * of different types.
	 */
	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}
	/* write a WRITE record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_WRITE;
	drrw->drr_object = object;
	drrw->drr_type = type;
	drrw->drr_offset = offset;
	drrw->drr_toguid = dsp->dsa_toguid;
	drrw->drr_logical_size = lsize;

	/* only set the compression fields if the buf is compressed or raw */
	if (raw || lsize != psize) {
		ASSERT(!BP_IS_EMBEDDED(bp));
		ASSERT3S(psize, >, 0);

		if (raw) {
			ASSERT(BP_IS_PROTECTED(bp));

			/*
			 * This is a raw protected block so we need to pass
			 * along everything the receiving side will need to
			 * interpret this block, including the byteswap, salt,
			 * IV, and MAC.
			 */
			if (BP_SHOULD_BYTESWAP(bp))
				drrw->drr_flags |= DRR_RAW_BYTESWAP;
			zio_crypt_decode_params_bp(bp, drrw->drr_salt,
			    drrw->drr_iv);
			zio_crypt_decode_mac_bp(bp, drrw->drr_mac);
		} else {
			/* this is a compressed block */
			ASSERT(dsp->dsa_featureflags &
			    DMU_BACKUP_FEATURE_COMPRESSED);
			ASSERT(!BP_SHOULD_BYTESWAP(bp));
			ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)));
			ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF);
			ASSERT3S(lsize, >=, psize);
		}

		/* set fields common to compressed and raw sends */
		drrw->drr_compressiontype = BP_GET_COMPRESS(bp);
		drrw->drr_compressed_size = psize;
		payload_size = drrw->drr_compressed_size;
	} else {
		payload_size = drrw->drr_logical_size;
	}

	if (bp == NULL || BP_IS_EMBEDDED(bp) || (BP_IS_PROTECTED(bp) && !raw)) {
		/*
		 * There's no pre-computed checksum for partial-block writes,
		 * embedded BP's, or encrypted BP's that are being sent as
		 * plaintext, so (like fletcher4-checksummed blocks) userland
		 * will have to compute a dedup-capable checksum itself.
		 */
		drrw->drr_checksumtype = ZIO_CHECKSUM_OFF;
	} else {
		drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
		if (zio_checksum_table[drrw->drr_checksumtype].ci_flags &
		    ZCHECKSUM_FLAG_DEDUP)
			drrw->drr_flags |= DRR_CHECKSUM_DEDUP;
		DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
		DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
		DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
		DDK_SET_CRYPT(&drrw->drr_key, BP_IS_PROTECTED(bp));
		drrw->drr_key.ddk_cksum = bp->blk_cksum;
	}

	if (dump_record(dsp, data, payload_size) != 0)
		return (SET_ERROR(EINTR));

	return (0);
}
static int
dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
    int blksz, const blkptr_t *bp)
{
	char buf[BPE_PAYLOAD_SIZE];
	struct drr_write_embedded *drrw =
	    &(dsp->dsa_drr->drr_u.drr_write_embedded);

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	ASSERT(BP_IS_EMBEDDED(bp));

	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED;
	drrw->drr_object = object;
	drrw->drr_offset = offset;
	drrw->drr_length = blksz;
	drrw->drr_toguid = dsp->dsa_toguid;
	drrw->drr_compression = BP_GET_COMPRESS(bp);
	drrw->drr_etype = BPE_GET_ETYPE(bp);
	drrw->drr_lsize = BPE_GET_LSIZE(bp);
	drrw->drr_psize = BPE_GET_PSIZE(bp);

	decode_embedded_bp_compressed(bp, buf);

	if (dump_record(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0)
		return (SET_ERROR(EINTR));

	return (0);
}
static int
dump_spill(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object, void *data)
{
	struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);
	uint64_t blksz = BP_GET_LSIZE(bp);
	uint64_t payload_size = blksz;

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write a SPILL record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_SPILL;
	drrs->drr_object = object;
	drrs->drr_length = blksz;
	drrs->drr_toguid = dsp->dsa_toguid;

	/* See comment in dump_dnode() for full details */
	if (zfs_send_unmodified_spill_blocks &&
	    (bp->blk_birth <= dsp->dsa_fromtxg)) {
		drrs->drr_flags |= DRR_SPILL_UNMODIFIED;
	}

	/* handle raw send fields */
	if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
		ASSERT(BP_IS_PROTECTED(bp));

		if (BP_SHOULD_BYTESWAP(bp))
			drrs->drr_flags |= DRR_RAW_BYTESWAP;
		drrs->drr_compressiontype = BP_GET_COMPRESS(bp);
		drrs->drr_compressed_size = BP_GET_PSIZE(bp);
		zio_crypt_decode_params_bp(bp, drrs->drr_salt, drrs->drr_iv);
		zio_crypt_decode_mac_bp(bp, drrs->drr_mac);
		payload_size = drrs->drr_compressed_size;
	}

	if (dump_record(dsp, data, payload_size) != 0)
		return (SET_ERROR(EINTR));

	return (0);
}
static int
dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
{
	struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);
	uint64_t maxobj = DNODES_PER_BLOCK *
	    (DMU_META_DNODE(dsp->dsa_os)->dn_maxblkid + 1);

	/*
	 * ZoL < 0.7 does not handle large FREEOBJECTS records correctly,
	 * leading to zfs recv never completing.  To avoid this issue, don't
	 * send FREEOBJECTS records for object IDs which cannot exist on the
	 * receiving side.
	 */
	if (maxobj > 0) {
		if (maxobj < firstobj)
			return (0);

		if (maxobj < firstobj + numobjs)
			numobjs = maxobj - firstobj;
	}

	/*
	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
	 * push it out, since free block aggregation can only be done for
	 * blocks of the same type (i.e., DRR_FREE records can only be
	 * aggregated with other DRR_FREE records.  DRR_FREEOBJECTS records
	 * can only be aggregated with other DRR_FREEOBJECTS records).
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREEOBJECTS) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}
	if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) {
		/*
		 * See whether this free object array can be aggregated
		 * with the pending one.
		 */
		if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
			drrfo->drr_numobjs += numobjs;
			return (0);
		} else {
			/* can't be aggregated.  Push out pending record */
			if (dump_record(dsp, NULL, 0) != 0)
				return (SET_ERROR(EINTR));
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}

	/* write a FREEOBJECTS record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREEOBJECTS;
	drrfo->drr_firstobj = firstobj;
	drrfo->drr_numobjs = numobjs;
	drrfo->drr_toguid = dsp->dsa_toguid;

	dsp->dsa_pending_op = PENDING_FREEOBJECTS;

	return (0);
}
static int
dump_dnode(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object,
    dnode_phys_t *dnp)
{
	struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);
	int bonuslen;

	if (object < dsp->dsa_resume_object) {
		/*
		 * Note: when resuming, we will visit all the dnodes in
		 * the block of dnodes that we are resuming from.  In
		 * this case it's unnecessary to send the dnodes prior to
		 * the one we are resuming from.  We should be at most one
		 * block's worth of dnodes behind the resume point.
		 */
		ASSERT3U(dsp->dsa_resume_object - object, <,
		    1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT));
		return (0);
	}

	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
		return (dump_freeobjects(dsp, object, 1));

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write an OBJECT record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_OBJECT;
	drro->drr_object = object;
	drro->drr_type = dnp->dn_type;
	drro->drr_bonustype = dnp->dn_bonustype;
	drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	drro->drr_bonuslen = dnp->dn_bonuslen;
	drro->drr_dn_slots = dnp->dn_extra_slots + 1;
	drro->drr_checksumtype = dnp->dn_checksum;
	drro->drr_compress = dnp->dn_compress;
	drro->drr_toguid = dsp->dsa_toguid;

	if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) &&
	    drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE)
		drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE;

	bonuslen = P2ROUNDUP(dnp->dn_bonuslen, 8);

	if ((dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW)) {
		ASSERT(BP_IS_ENCRYPTED(bp));

		if (BP_SHOULD_BYTESWAP(bp))
			drro->drr_flags |= DRR_RAW_BYTESWAP;

		/* needed for reconstructing dnp on recv side */
		drro->drr_maxblkid = dnp->dn_maxblkid;
		drro->drr_indblkshift = dnp->dn_indblkshift;
		drro->drr_nlevels = dnp->dn_nlevels;
		drro->drr_nblkptr = dnp->dn_nblkptr;

		/*
		 * Since we encrypt the entire bonus area, the (raw) part
		 * beyond the bonuslen is actually nonzero, so we need
		 * to send it.
		 */
		if (bonuslen != 0) {
			drro->drr_raw_bonuslen = DN_MAX_BONUS_LEN(dnp);
			bonuslen = drro->drr_raw_bonuslen;
		}
	}

	/*
	 * DRR_OBJECT_SPILL is set for every dnode which references a
	 * spill block.  This allows the receiving pool to definitively
	 * determine when a spill block should be kept or freed.
	 */
	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
		drro->drr_flags |= DRR_OBJECT_SPILL;

	if (dump_record(dsp, DN_BONUS(dnp), bonuslen) != 0)
		return (SET_ERROR(EINTR));

	/* Free anything past the end of the file. */
	if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0)
		return (SET_ERROR(EINTR));

	/*
	 * Send DRR_SPILL records for unmodified spill blocks.  This is useful
	 * because changing certain attributes of the object (e.g. blocksize)
	 * can cause old versions of ZFS to incorrectly remove a spill block.
	 * Including these records in the stream forces an up to date version
	 * to always be written ensuring they're never lost.  Current versions
	 * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can
	 * ignore these unmodified spill blocks.
	 */
	if (zfs_send_unmodified_spill_blocks &&
	    (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
	    (DN_SPILL_BLKPTR(dnp)->blk_birth <= dsp->dsa_fromtxg)) {
		struct send_block_record record;

		bzero(&record, sizeof (struct send_block_record));
		record.eos_marker = B_FALSE;
		record.bp = *DN_SPILL_BLKPTR(dnp);
		SET_BOOKMARK(&(record.zb), dmu_objset_id(dsp->dsa_os),
		    object, 0, DMU_SPILL_BLKID);

		if (do_dump(dsp, &record) != 0)
			return (SET_ERROR(EINTR));
	}

	if (dsp->dsa_err != 0)
		return (SET_ERROR(EINTR));

	return (0);
}
static int
dump_object_range(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t firstobj,
    uint64_t numslots)
{
	struct drr_object_range *drror =
	    &(dsp->dsa_drr->drr_u.drr_object_range);

	/* we only use this record type for raw sends */
	ASSERT(BP_IS_PROTECTED(bp));
	ASSERT(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW);
	ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
	ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_DNODE);
	ASSERT0(BP_GET_LEVEL(bp));

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_record(dsp, NULL, 0) != 0)
			return (SET_ERROR(EINTR));
		dsp->dsa_pending_op = PENDING_NONE;
	}

	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_OBJECT_RANGE;
	drror->drr_firstobj = firstobj;
	drror->drr_numslots = numslots;
	drror->drr_toguid = dsp->dsa_toguid;
	if (BP_SHOULD_BYTESWAP(bp))
		drror->drr_flags |= DRR_RAW_BYTESWAP;
	zio_crypt_decode_params_bp(bp, drror->drr_salt, drror->drr_iv);
	zio_crypt_decode_mac_bp(bp, drror->drr_mac);

	if (dump_record(dsp, NULL, 0) != 0)
		return (SET_ERROR(EINTR));

	return (0);
}
static boolean_t
backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp)
{
	if (!BP_IS_EMBEDDED(bp))
		return (B_FALSE);

	/*
	 * Compression function must be legacy, or explicitly enabled.
	 */
	if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS &&
	    !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LZ4)))
		return (B_FALSE);

	/*
	 * Embed type must be explicitly enabled.
	 */
	switch (BPE_GET_ETYPE(bp)) {
	case BP_EMBEDDED_TYPE_DATA:
		if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
			return (B_TRUE);
		break;
	default:
		return (B_FALSE);
	}
	return (B_FALSE);
}
/*
 * This is the callback function to traverse_dataset that acts as the worker
 * thread for dmu_send_impl.
 */
static int
send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg)
{
	struct send_thread_arg *sta = arg;
	struct send_block_record *record;
	uint64_t record_size;

	ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
	    zb->zb_object >= sta->resume.zb_object);
	ASSERT3P(sta->ds, !=, NULL);

	if (sta->cancel)
		return (SET_ERROR(EINTR));

	if (bp == NULL) {
		ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL);
		return (0);
	} else if (zb->zb_level < 0) {
		return (0);
	}

	record = kmem_zalloc(sizeof (struct send_block_record), KM_SLEEP);
	record->eos_marker = B_FALSE;
	record->bp = *bp;
	record->zb = *zb;
	record->indblkshift = dnp->dn_indblkshift;
	record->datablkszsec = dnp->dn_datablkszsec;
	record_size = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	bqueue_enqueue(&sta->q, record, record_size);

	return (0);
}
/*
 * This function kicks off the traverse_dataset.  It also handles setting the
 * error code of the thread in case something goes wrong, and pushes the End of
 * Stream record when the traverse_dataset call has finished.  If there is no
 * dataset to traverse, the thread immediately pushes the End of Stream marker.
 */
static void
send_traverse_thread(void *arg)
{
	struct send_thread_arg *st_arg = arg;
	int err = 0;
	struct send_block_record *data;
	fstrans_cookie_t cookie = spl_fstrans_mark();

	if (st_arg->ds != NULL) {
		err = traverse_dataset_resume(st_arg->ds,
		    st_arg->fromtxg, &st_arg->resume,
		    st_arg->flags, send_cb, st_arg);

		if (err != EINTR)
			st_arg->error_code = err;
	}
	data = kmem_zalloc(sizeof (*data), KM_SLEEP);
	data->eos_marker = B_TRUE;
	bqueue_enqueue(&st_arg->q, data, 1);
	spl_fstrans_unmark(cookie);
	thread_exit();
}
/*
 * This function actually handles figuring out what kind of record needs to be
 * dumped, reading the data (which has hopefully been prefetched), and calling
 * the appropriate helper function.
 */
static int
do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
{
	dsl_dataset_t *ds = dmu_objset_ds(dsa->dsa_os);
	const blkptr_t *bp = &data->bp;
	const zbookmark_phys_t *zb = &data->zb;
	uint8_t indblkshift = data->indblkshift;
	uint16_t dblkszsec = data->datablkszsec;
	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
	int err = 0;

	ASSERT3U(zb->zb_level, >=, 0);

	ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
	    zb->zb_object >= dsa->dsa_resume_object);

	/*
	 * All bps of an encrypted os should have the encryption bit set.
	 * If this is not true it indicates tampering and we report an error.
	 */
	if (dsa->dsa_os->os_encrypted &&
	    !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) {
		spa_log_error(spa, zb);
		zfs_panic_recover("unencrypted block in encrypted "
		    "object set %llu", ds->ds_object);
		return (SET_ERROR(EIO));
	}

	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
	    DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
		return (0);
	} else if (BP_IS_HOLE(bp) &&
	    zb->zb_object == DMU_META_DNODE_OBJECT) {
		uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
		err = dump_freeobjects(dsa, dnobj, span >> DNODE_SHIFT);
	} else if (BP_IS_HOLE(bp)) {
		uint64_t span = BP_SPAN(dblkszsec, indblkshift, zb->zb_level);
		uint64_t offset = zb->zb_blkid * span;
		/* Don't dump free records for offsets > DMU_OBJECT_END */
		if (zb->zb_blkid == 0 || span <= DMU_OBJECT_END / zb->zb_blkid)
			err = dump_free(dsa, zb->zb_object, offset, span);
	} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
		return (0);
	} else if (type == DMU_OT_DNODE) {
		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
		arc_flags_t aflags = ARC_FLAG_WAIT;
		arc_buf_t *abuf;
		enum zio_flag zioflags = ZIO_FLAG_CANFAIL;

		if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
			ASSERT(BP_IS_ENCRYPTED(bp));
			ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
			zioflags |= ZIO_FLAG_RAW;
		}

		ASSERT0(zb->zb_level);

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0)
			return (SET_ERROR(EIO));

		dnode_phys_t *blk = abuf->b_data;
		uint64_t dnobj = zb->zb_blkid * epb;

		/*
		 * Raw sends require sending encryption parameters for the
		 * block of dnodes.  Regular sends do not need to send this
		 * info.
		 */
		if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
			ASSERT(arc_is_encrypted(abuf));
			err = dump_object_range(dsa, bp, dnobj, epb);
		}

		if (err == 0) {
			for (int i = 0; i < epb;
			    i += blk[i].dn_extra_slots + 1) {
				err = dump_dnode(dsa, bp, dnobj + i, blk + i);
				if (err != 0)
					break;
			}
		}
		arc_buf_destroy(abuf, &abuf);
	} else if (type == DMU_OT_SA) {
		arc_flags_t aflags = ARC_FLAG_WAIT;
		arc_buf_t *abuf;
		enum zio_flag zioflags = ZIO_FLAG_CANFAIL;

		if (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
			ASSERT(BP_IS_PROTECTED(bp));
			zioflags |= ZIO_FLAG_RAW;
		}

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0)
			return (SET_ERROR(EIO));

		err = dump_spill(dsa, bp, zb->zb_object, abuf->b_data);
		arc_buf_destroy(abuf, &abuf);
	} else if (backup_do_embed(dsa, bp)) {
		/* it's an embedded level-0 block of a regular object */
		int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
		ASSERT0(zb->zb_level);
		err = dump_write_embedded(dsa, zb->zb_object,
		    zb->zb_blkid * blksz, blksz, bp);
	} else {
		/* it's a level-0 block of a regular object */
		arc_flags_t aflags = ARC_FLAG_WAIT;
		arc_buf_t *abuf;
		int blksz = dblkszsec << SPA_MINBLOCKSHIFT;
		uint64_t offset;

		/*
		 * If we have large blocks stored on disk but the send flags
		 * don't allow us to send large blocks, we split the data from
		 * the arc buf into chunks.
		 */
		boolean_t split_large_blocks = blksz > SPA_OLD_MAXBLOCKSIZE &&
		    !(dsa->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS);

		/*
		 * Raw sends require that we always get raw data as it exists
		 * on disk, so we assert that we are not splitting blocks here.
		 */
		boolean_t request_raw =
		    (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) != 0;

		/*
		 * We should only request compressed data from the ARC if all
		 * the following are true:
		 *  - stream compression was requested
		 *  - we aren't splitting large blocks into smaller chunks
		 *  - the data won't need to be byteswapped before sending
		 *  - this isn't an embedded block
		 *  - this isn't metadata (if receiving on a different endian
		 *    system it can be byteswapped more easily)
		 */
		boolean_t request_compressed =
		    (dsa->dsa_featureflags & DMU_BACKUP_FEATURE_COMPRESSED) &&
		    !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) &&
		    !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp));

		IMPLY(request_raw, !split_large_blocks);
		IMPLY(request_raw, BP_IS_PROTECTED(bp));
		ASSERT0(zb->zb_level);
		ASSERT(zb->zb_object > dsa->dsa_resume_object ||
		    (zb->zb_object == dsa->dsa_resume_object &&
		    zb->zb_blkid * blksz >= dsa->dsa_resume_offset));

		ASSERT3U(blksz, ==, BP_GET_LSIZE(bp));

		enum zio_flag zioflags = ZIO_FLAG_CANFAIL;
		if (request_raw)
			zioflags |= ZIO_FLAG_RAW;
		else if (request_compressed)
			zioflags |= ZIO_FLAG_RAW_COMPRESS;

		if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
		    ZIO_PRIORITY_ASYNC_READ, zioflags, &aflags, zb) != 0) {
			if (zfs_send_corrupt_data) {
				uint64_t *ptr;
				/* Send a block filled with 0x"zfs badd bloc" */
				abuf = arc_alloc_buf(spa, &abuf, ARC_BUFC_DATA,
				    blksz);
				for (ptr = abuf->b_data;
				    (char *)ptr < (char *)abuf->b_data + blksz;
				    ptr++)
					*ptr = 0x2f5baddb10cULL;
			} else {
				return (SET_ERROR(EIO));
			}
		}

		offset = zb->zb_blkid * blksz;

		if (split_large_blocks) {
			ASSERT0(arc_is_encrypted(abuf));
			ASSERT3U(arc_get_compression(abuf), ==,
			    ZIO_COMPRESS_OFF);
			char *buf = abuf->b_data;
			while (blksz > 0 && err == 0) {
				int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE);
				err = dump_write(dsa, type, zb->zb_object,
				    offset, n, n, NULL, buf);
				offset += n;
				buf += n;
				blksz -= n;
			}
		} else {
			err = dump_write(dsa, type, zb->zb_object, offset,
			    blksz, arc_buf_size(abuf), bp, abuf->b_data);
		}
		arc_buf_destroy(abuf, &abuf);
	}

	ASSERT(err == 0 || err == EINTR);
	return (err);
}
/*
 * Pop the new data off the queue, and free the old data.
 */
static struct send_block_record *
get_next_record(bqueue_t *bq, struct send_block_record *data)
{
	struct send_block_record *tmp = bqueue_dequeue(bq);
	kmem_free(data, sizeof (*data));
	return (tmp);
}
/*
 * Actually do the bulk of the work in a zfs send.
 *
 * Note: Releases dp using the specified tag.
 */
static int
dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
    zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone,
    boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
    boolean_t rawok, int outfd, uint64_t resumeobj, uint64_t resumeoff,
    vnode_t *vp, offset_t *off)
{
	objset_t *os;
	dmu_replay_record_t *drr;
	dmu_sendarg_t *dsp;
	int err;
	uint64_t fromtxg = 0;
	uint64_t featureflags = 0;
	struct send_thread_arg to_arg;
	void *payload = NULL;
	size_t payload_len = 0;
	struct send_block_record *to_data;

	err = dmu_objset_from_ds(to_ds, &os);
	if (err != 0) {
		dsl_pool_rele(dp, tag);
		return (err);
	}

	/*
	 * If this is a non-raw send of an encrypted ds, we can ensure that
	 * the objset_phys_t is authenticated.  This is safe because this is
	 * either a snapshot or we have owned the dataset, ensuring that
	 * it can't be modified.
	 */
	if (!rawok && os->os_encrypted &&
	    arc_is_unauthenticated(os->os_phys_buf)) {
		zbookmark_phys_t zb;

		SET_BOOKMARK(&zb, to_ds->ds_object, ZB_ROOT_OBJECT,
		    ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
		err = arc_untransform(os->os_phys_buf, os->os_spa,
		    &zb, B_FALSE);
		if (err != 0) {
			dsl_pool_rele(dp, tag);
			return (err);
		}

		ASSERT0(arc_is_unauthenticated(os->os_phys_buf));
	}

	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
	drr->drr_type = DRR_BEGIN;
	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
	DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
	    DMU_SUBSTREAM);

	bzero(&to_arg, sizeof (to_arg));

#ifdef _KERNEL
	if (dmu_objset_type(os) == DMU_OST_ZFS) {
		uint64_t version;
		if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) {
			kmem_free(drr, sizeof (dmu_replay_record_t));
			dsl_pool_rele(dp, tag);
			return (SET_ERROR(EINVAL));
		}
		if (version >= ZPL_VERSION_SA) {
			featureflags |= DMU_BACKUP_FEATURE_SA_SPILL;
		}
	}
#endif

	/* raw sends imply large_block_ok */
	if ((large_block_ok || rawok) &&
	    dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_BLOCKS))
		featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS;
	if (dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_DNODE))
		featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE;

	/* encrypted datasets will not have embedded blocks */
	if ((embedok || rawok) && !os->os_encrypted &&
	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) {
		featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA;
	}

	/* raw send implies compressok */
	if (compressok || rawok)
		featureflags |= DMU_BACKUP_FEATURE_COMPRESSED;

	if (rawok && os->os_encrypted)
		featureflags |= DMU_BACKUP_FEATURE_RAW;

	if ((featureflags &
	    (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED |
	    DMU_BACKUP_FEATURE_RAW)) != 0 &&
	    spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) {
		featureflags |= DMU_BACKUP_FEATURE_LZ4;
	}

	if (resumeobj != 0 || resumeoff != 0) {
		featureflags |= DMU_BACKUP_FEATURE_RESUMING;
	}

	DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
	    featureflags);

	drr->drr_u.drr_begin.drr_creation_time =
	    dsl_dataset_phys(to_ds)->ds_creation_time;
	drr->drr_u.drr_begin.drr_type = dmu_objset_type(os);
	if (is_clone)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
	drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(to_ds)->ds_guid;
	if (dsl_dataset_phys(to_ds)->ds_flags & DS_FLAG_CI_DATASET)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
	if (zfs_send_set_freerecords_bit)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS;

	drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_SPILL_BLOCK;

	if (ancestor_zb != NULL) {
		drr->drr_u.drr_begin.drr_fromguid =
		    ancestor_zb->zbm_guid;
		fromtxg = ancestor_zb->zbm_creation_txg;
	}
	dsl_dataset_name(to_ds, drr->drr_u.drr_begin.drr_toname);
	if (!to_ds->ds_is_snapshot) {
		(void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--",
		    sizeof (drr->drr_u.drr_begin.drr_toname));
	}

	dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);

	dsp->dsa_drr = drr;
	dsp->dsa_vp = vp;
	dsp->dsa_outfd = outfd;
	dsp->dsa_proc = curproc;
	dsp->dsa_os = os;
	dsp->dsa_off = off;
	dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
	dsp->dsa_fromtxg = fromtxg;
	dsp->dsa_pending_op = PENDING_NONE;
	dsp->dsa_featureflags = featureflags;
	dsp->dsa_resume_object = resumeobj;
	dsp->dsa_resume_offset = resumeoff;

	mutex_enter(&to_ds->ds_sendstream_lock);
	list_insert_head(&to_ds->ds_sendstreams, dsp);
	mutex_exit(&to_ds->ds_sendstream_lock);

	dsl_dataset_long_hold(to_ds, FTAG);
	dsl_pool_rele(dp, tag);

	/* handle features that require a DRR_BEGIN payload */
	if (featureflags &
	    (DMU_BACKUP_FEATURE_RESUMING | DMU_BACKUP_FEATURE_RAW)) {
		nvlist_t *keynvl = NULL;
		nvlist_t *nvl = fnvlist_alloc();

		if (featureflags & DMU_BACKUP_FEATURE_RESUMING) {
			dmu_object_info_t to_doi;
			err = dmu_object_info(os, resumeobj, &to_doi);
			if (err != 0) {
				fnvlist_free(nvl);
				goto out;
			}

			SET_BOOKMARK(&to_arg.resume, to_ds->ds_object,
			    resumeobj, 0,
			    resumeoff / to_doi.doi_data_block_size);

			fnvlist_add_uint64(nvl, "resume_object", resumeobj);
			fnvlist_add_uint64(nvl, "resume_offset", resumeoff);
		}

		if (featureflags & DMU_BACKUP_FEATURE_RAW) {
			uint64_t ivset_guid = (ancestor_zb != NULL) ?
			    ancestor_zb->zbm_ivset_guid : 0;

			ASSERT(os->os_encrypted);

			err = dsl_crypto_populate_key_nvlist(to_ds,
			    ivset_guid, &keynvl);
			if (err != 0) {
				fnvlist_free(nvl);
				goto out;
			}

			fnvlist_add_nvlist(nvl, "crypt_keydata", keynvl);
		}

		payload = fnvlist_pack(nvl, &payload_len);
		drr->drr_payloadlen = payload_len;
		fnvlist_free(keynvl);
		fnvlist_free(nvl);
	}

	err = dump_record(dsp, payload, payload_len);
	fnvlist_pack_free(payload, payload_len);
	if (err != 0) {
		err = dsp->dsa_err;
		goto out;
	}

	err = bqueue_init(&to_arg.q,
	    MAX(zfs_send_queue_length, 2 * zfs_max_recordsize),
	    offsetof(struct send_block_record, ln));
	to_arg.error_code = 0;
	to_arg.cancel = B_FALSE;
	to_arg.ds = to_ds;
	to_arg.fromtxg = fromtxg;
	to_arg.flags = TRAVERSE_PRE | TRAVERSE_PREFETCH;
	if (rawok)
		to_arg.flags |= TRAVERSE_NO_DECRYPT;
	(void) thread_create(NULL, 0, send_traverse_thread, &to_arg, 0, curproc,
	    TS_RUN, minclsyspri);

	to_data = bqueue_dequeue(&to_arg.q);

	while (!to_data->eos_marker && err == 0) {
		err = do_dump(dsp, to_data);
		to_data = get_next_record(&to_arg.q, to_data);
		if (issig(JUSTLOOKING) && issig(FORREAL))
			err = EINTR;
	}

	if (err != 0) {
		to_arg.cancel = B_TRUE;
		while (!to_data->eos_marker) {
			to_data = get_next_record(&to_arg.q, to_data);
		}
	}
	kmem_free(to_data, sizeof (*to_data));

	bqueue_destroy(&to_arg.q);

	if (err == 0 && to_arg.error_code != 0)
		err = to_arg.error_code;

	if (err != 0)
		goto out;

	if (dsp->dsa_pending_op != PENDING_NONE)
		if (dump_record(dsp, NULL, 0) != 0)
			err = SET_ERROR(EINTR);

	if (err != 0) {
		if (err == EINTR && dsp->dsa_err != 0)
			err = dsp->dsa_err;
		goto out;
	}

	bzero(drr, sizeof (dmu_replay_record_t));
	drr->drr_type = DRR_END;
	drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
	drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;

	if (dump_record(dsp, NULL, 0) != 0)
		err = dsp->dsa_err;
out:
	mutex_enter(&to_ds->ds_sendstream_lock);
	list_remove(&to_ds->ds_sendstreams, dsp);
	mutex_exit(&to_ds->ds_sendstream_lock);

	VERIFY(err != 0 || (dsp->dsa_sent_begin && dsp->dsa_sent_end));

	kmem_free(drr, sizeof (dmu_replay_record_t));
	kmem_free(dsp, sizeof (dmu_sendarg_t));

	dsl_dataset_long_rele(to_ds, FTAG);

	return (err);
}
int
dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
    boolean_t embedok, boolean_t large_block_ok, boolean_t compressok,
    boolean_t rawok, int outfd, vnode_t *vp, offset_t *off)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	dsl_dataset_t *fromds = NULL;
	ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT;
	int err;

	err = dsl_pool_hold(pool, FTAG, &dp);
	if (err != 0)
		return (err);

	err = dsl_dataset_hold_obj_flags(dp, tosnap, dsflags, FTAG, &ds);
	if (err != 0) {
		dsl_pool_rele(dp, FTAG);
		return (err);
	}

	if (fromsnap != 0) {
		zfs_bookmark_phys_t zb = { 0 };
		boolean_t is_clone;

		err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds);
		if (err != 0) {
			dsl_dataset_rele_flags(ds, dsflags, FTAG);
			dsl_pool_rele(dp, FTAG);
			return (err);
		}
		if (!dsl_dataset_is_before(ds, fromds, 0)) {
			err = SET_ERROR(EXDEV);
			dsl_dataset_rele(fromds, FTAG);
			dsl_dataset_rele_flags(ds, dsflags, FTAG);
			dsl_pool_rele(dp, FTAG);
			return (err);
		}

		zb.zbm_creation_time =
		    dsl_dataset_phys(fromds)->ds_creation_time;
		zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg;
		zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;

		if (dsl_dataset_is_zapified(fromds)) {
			(void) zap_lookup(dp->dp_meta_objset,
			    fromds->ds_object, DS_FIELD_IVSET_GUID, 8, 1,
			    &zb.zbm_ivset_guid);
		}

		is_clone = (fromds->ds_dir != ds->ds_dir);
		dsl_dataset_rele(fromds, FTAG);
		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
		    embedok, large_block_ok, compressok, rawok, outfd,
		    0, 0, vp, off);
	} else {
		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
		    embedok, large_block_ok, compressok, rawok, outfd,
		    0, 0, vp, off);
	}
	dsl_dataset_rele_flags(ds, dsflags, FTAG);
	return (err);
}
int
dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
    boolean_t large_block_ok, boolean_t compressok, boolean_t rawok,
    int outfd, uint64_t resumeobj, uint64_t resumeoff, vnode_t *vp,
    offset_t *off)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	int err;
	ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT;
	boolean_t owned = B_FALSE;

	if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL)
		return (SET_ERROR(EINVAL));

	err = dsl_pool_hold(tosnap, FTAG, &dp);
	if (err != 0)
		return (err);

	if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) {
		/*
		 * We are sending a filesystem or volume.  Ensure
		 * that it doesn't change by owning the dataset.
		 */
		err = dsl_dataset_own(dp, tosnap, dsflags, FTAG, &ds);
		owned = B_TRUE;
	} else {
		err = dsl_dataset_hold_flags(dp, tosnap, dsflags, FTAG, &ds);
	}
	if (err != 0) {
		dsl_pool_rele(dp, FTAG);
		return (err);
	}

	if (fromsnap != NULL) {
		zfs_bookmark_phys_t zb = { 0 };
		boolean_t is_clone = B_FALSE;
		int fsnamelen = strchr(tosnap, '@') - tosnap;

		/*
		 * If the fromsnap is in a different filesystem, then
		 * mark the send stream as a clone.
		 */
		if (strncmp(tosnap, fromsnap, fsnamelen) != 0 ||
		    (fromsnap[fsnamelen] != '@' &&
		    fromsnap[fsnamelen] != '#')) {
			is_clone = B_TRUE;
		}

		if (strchr(fromsnap, '@')) {
			dsl_dataset_t *fromds;
			err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds);
			if (err == 0) {
				if (!dsl_dataset_is_before(ds, fromds, 0))
					err = SET_ERROR(EXDEV);
				zb.zbm_creation_time =
				    dsl_dataset_phys(fromds)->ds_creation_time;
				zb.zbm_creation_txg =
				    dsl_dataset_phys(fromds)->ds_creation_txg;
				zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid;
				is_clone = (ds->ds_dir != fromds->ds_dir);

				if (dsl_dataset_is_zapified(fromds)) {
					(void) zap_lookup(dp->dp_meta_objset,
					    fromds->ds_object,
					    DS_FIELD_IVSET_GUID, 8, 1,
					    &zb.zbm_ivset_guid);
				}
				dsl_dataset_rele(fromds, FTAG);
			}
		} else {
			err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb);
		}
		if (err != 0) {
			if (owned)
				dsl_dataset_disown(ds, dsflags, FTAG);
			else
				dsl_dataset_rele_flags(ds, dsflags, FTAG);

			dsl_pool_rele(dp, FTAG);
			return (err);
		}
		err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
		    embedok, large_block_ok, compressok, rawok,
		    outfd, resumeobj, resumeoff, vp, off);
	} else {
		err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
		    embedok, large_block_ok, compressok, rawok,
		    outfd, resumeobj, resumeoff, vp, off);
	}
	if (owned)
		dsl_dataset_disown(ds, dsflags, FTAG);
	else
		dsl_dataset_rele_flags(ds, dsflags, FTAG);

	return (err);
}
static int
dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed,
    uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep)
{
	int err = 0;
	uint64_t size;
	objset_t *os;
	/*
	 * Assume that space (both on-disk and in-stream) is dominated by
	 * data.  We will adjust for indirect blocks and the copies property,
	 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
	 */
	uint64_t recordsize;
	uint64_t record_count;

	VERIFY0(dmu_objset_from_ds(ds, &os));

	/* Assume all (uncompressed) blocks are recordsize. */
	if (zfs_override_estimate_recordsize != 0) {
		recordsize = zfs_override_estimate_recordsize;
	} else if (os->os_phys->os_type == DMU_OST_ZVOL) {
		err = dsl_prop_get_int_ds(ds,
		    zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &recordsize);
	} else {
		err = dsl_prop_get_int_ds(ds,
		    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), &recordsize);
	}
	if (err != 0)
		return (err);
	record_count = uncompressed / recordsize;

	/*
	 * If we're estimating a send size for a compressed stream, use the
	 * compressed data size to estimate the stream size.  Otherwise, use the
	 * uncompressed data size.
	 */
	size = stream_compressed ? compressed : uncompressed;

	/*
	 * Subtract out approximate space used by indirect blocks.
	 * Assume most space is used by data blocks (non-indirect, non-dnode).
	 * Assume no ditto blocks or internal fragmentation.
	 *
	 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
	 * block.
	 */
	size -= record_count * sizeof (blkptr_t);

	/* Add in the space for the record associated with each block. */
	size += record_count * sizeof (dmu_replay_record_t);

	*sizep = size;

	return (0);
}
int
dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds,
    boolean_t stream_compressed, uint64_t *sizep)
{
	int err;
	uint64_t uncomp, comp;

	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));

	/* tosnap must be a snapshot */
	if (!ds->ds_is_snapshot)
		return (SET_ERROR(EINVAL));

	/* fromsnap, if provided, must be a snapshot */
	if (fromds != NULL && !fromds->ds_is_snapshot)
		return (SET_ERROR(EINVAL));

	/*
	 * fromsnap must be an earlier snapshot from the same fs as tosnap,
	 * or the origin's fs.
	 */
	if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0))
		return (SET_ERROR(EXDEV));

	/* Get compressed and uncompressed size estimates of changed data. */
	if (fromds == NULL) {
		uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes;
		comp = dsl_dataset_phys(ds)->ds_compressed_bytes;
	} else {
		uint64_t used;
		err = dsl_dataset_space_written(fromds, ds,
		    &used, &comp, &uncomp);
		if (err != 0)
			return (err);
	}

	err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp,
	    stream_compressed, sizep);
	/*
	 * Add the size of the BEGIN and END records to the estimate.
	 */
	*sizep += 2 * sizeof (dmu_replay_record_t);

	return (err);
}
struct calculate_send_arg {
	uint64_t uncompressed;
	uint64_t compressed;
};

/*
 * Simple callback used to traverse the blocks of a snapshot and sum their
 * uncompressed and compressed sizes.
 */
static int
dmu_calculate_send_traversal(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
{
	struct calculate_send_arg *space = arg;
	if (bp != NULL && !BP_IS_HOLE(bp)) {
		space->uncompressed += BP_GET_UCSIZE(bp);
		space->compressed += BP_GET_PSIZE(bp);
	}
	return (0);
}
/*
 * Given a destination snapshot and a TXG, calculate the approximate size of a
 * send stream sent from that TXG.  from_txg may be zero, indicating that the
 * whole snapshot will be sent.
 */
int
dmu_send_estimate_from_txg(dsl_dataset_t *ds, uint64_t from_txg,
    boolean_t stream_compressed, uint64_t *sizep)
{
	int err;
	struct calculate_send_arg size = { 0 };

	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));

	/* tosnap must be a snapshot */
	if (!dsl_dataset_is_snapshot(ds))
		return (SET_ERROR(EINVAL));

	/* verify that from_txg is before the provided snapshot was taken */
	if (from_txg >= dsl_dataset_phys(ds)->ds_creation_txg) {
		return (SET_ERROR(EXDEV));
	}
	/*
	 * traverse the blocks of the snapshot with birth times after
	 * from_txg, summing their uncompressed size
	 */
	err = traverse_dataset(ds, from_txg,
	    TRAVERSE_POST | TRAVERSE_NO_DECRYPT,
	    dmu_calculate_send_traversal, &size);

	if (err)
		return (err);

	err = dmu_adjust_send_estimate_for_indirects(ds, size.uncompressed,
	    size.compressed, stream_compressed, sizep);
	return (err);
}
#if defined(_KERNEL)
module_param(zfs_override_estimate_recordsize, ulong, 0644);
MODULE_PARM_DESC(zfs_override_estimate_recordsize,
	"Record size calculation override for zfs send estimates");

module_param(zfs_send_corrupt_data, int, 0644);
MODULE_PARM_DESC(zfs_send_corrupt_data, "Allow sending corrupt data");

module_param(zfs_send_queue_length, int, 0644);
MODULE_PARM_DESC(zfs_send_queue_length, "Maximum send queue length");

module_param(zfs_send_unmodified_spill_blocks, int, 0644);
MODULE_PARM_DESC(zfs_send_unmodified_spill_blocks,
	"Send unmodified spill blocks");
#endif