OpenZFS 2605, 6980, 6902

author Matthew Ahrens <mahrens@delphix.com>
Wed, 6 Jan 2016 21:22:48 +0000 (22:22 +0100)

committer Brian Behlendorf <behlendorf1@llnl.gov>
Tue, 28 Jun 2016 20:47:02 +0000 (13:47 -0700)
diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c

index 8bcff2dba7856fc3b383d72f29392d01dd439d60..3d20973a200e78975324a9e8e470f00835f77f4b 100644 (file)
--- a/cmd/zfs/zfs_main.c
+++ b/cmd/zfs/zfs_main.c
@@ -248,10 +248,11 @@ get_usage(zfs_help_t idx)
         case HELP_PROMOTE:
                 return (gettext("\tpromote <clone-filesystem>\n"));
         case HELP_RECEIVE:
-               return (gettext("\treceive [-vnFu] <filesystem|volume|"
+               return (gettext("\treceive [-vnsFu] <filesystem|volume|"
                     "snapshot>\n"
-                   "\treceive [-vnFu] [-o origin=<snapshot>] [-d | -e] "
-                   "<filesystem>\n"));
+                   "\treceive [-vnsFu] [-o origin=<snapshot>] [-d | -e] "
+                   "<filesystem>\n"
+                   "\treceive -A <filesystem|volume>\n"));
         case HELP_RENAME:
                 return (gettext("\trename [-f] <filesystem|volume|snapshot> "
                     "<filesystem|volume|snapshot>\n"
@@ -263,7 +264,8 @@ get_usage(zfs_help_t idx)
                 return (gettext("\tsend [-DnPpRvLe] [-[iI] snapshot] "
                     "<snapshot>\n"
                     "\tsend [-Le] [-i snapshot|bookmark] "
-                   "<filesystem|volume|snapshot>\n"));
+                   "<filesystem|volume|snapshot>\n"
+                   "\tsend [-nvPe] -t <receive_resume_token>\n"));
         case HELP_SET:
                 return (gettext("\tset <property=value> ... "
                     "<filesystem|volume|snapshot> ...\n"));
@@ -3707,6 +3709,7 @@ zfs_do_send(int argc, char **argv)
  {
         char *fromname = NULL;
         char *toname = NULL;
+       char *resume_token = NULL;
         char *cp;
         zfs_handle_t *zhp;
         sendflags_t flags = { 0 };
@@ -3715,7 +3718,7 @@ zfs_do_send(int argc, char **argv)
         boolean_t extraverbose = B_FALSE;
  
         /* check options */
-       while ((c = getopt(argc, argv, ":i:I:RDpvnPLe")) != -1) {
+       while ((c = getopt(argc, argv, ":i:I:RDpvnPLet:")) != -1) {
                 switch (c) {
                 case 'i':
                         if (fromname)
@@ -3756,6 +3759,9 @@ zfs_do_send(int argc, char **argv)
                 case 'e':
                         flags.embed_data = B_TRUE;
                         break;
+               case 't':
+                       resume_token = optarg;
+                       break;
                 case ':':
                         (void) fprintf(stderr, gettext("missing argument for "
                             "'%c' option\n"), optopt);
@@ -3771,14 +3777,28 @@ zfs_do_send(int argc, char **argv)
         argc -= optind;
         argv += optind;
  
-       /* check number of arguments */
-       if (argc < 1) {
-               (void) fprintf(stderr, gettext("missing snapshot argument\n"));
-               usage(B_FALSE);
-       }
-       if (argc > 1) {
-               (void) fprintf(stderr, gettext("too many arguments\n"));
-               usage(B_FALSE);
+       if (resume_token != NULL) {
+               if (fromname != NULL || flags.replicate || flags.props ||
+                   flags.dedup) {
+                       (void) fprintf(stderr,
+                           gettext("invalid flags combined with -t\n"));
+                       usage(B_FALSE);
+               }
+               if (argc != 0) {
+                       (void) fprintf(stderr, gettext("no additional "
+                           "arguments are permitted with -t\n"));
+                       usage(B_FALSE);
+               }
+       } else {
+               if (argc < 1) {
+                       (void) fprintf(stderr,
+                           gettext("missing snapshot argument\n"));
+                       usage(B_FALSE);
+               }
+               if (argc > 1) {
+                       (void) fprintf(stderr, gettext("too many arguments\n"));
+                       usage(B_FALSE);
+               }
         }
  
         if (!flags.dryrun && isatty(STDOUT_FILENO)) {
@@ -3788,6 +3808,11 @@ zfs_do_send(int argc, char **argv)
                 return (1);
         }
  
+       if (resume_token != NULL) {
+               return (zfs_send_resume(g_zfs, &flags, STDOUT_FILENO,
+                   resume_token));
+       }
+
         /*
          * Special case sending a filesystem, or from a bookmark.
          */
@@ -3893,8 +3918,6 @@ zfs_do_send(int argc, char **argv)
  }
  
  /*
- * zfs receive [-vnFu] [-d | -e] <fs@snap>
- *
   * Restore a backup stream from stdin.
   */
  static int
@@ -3902,6 +3925,8 @@ zfs_do_receive(int argc, char **argv)
  {
         int c, err;
         recvflags_t flags = { 0 };
+       boolean_t abort_resumable = B_FALSE;
+
         nvlist_t *props;
         nvpair_t *nvp = NULL;
  
@@ -3909,7 +3934,7 @@ zfs_do_receive(int argc, char **argv)
                 nomem();
  
         /* check options */
-       while ((c = getopt(argc, argv, ":o:denuvF")) != -1) {
+       while ((c = getopt(argc, argv, ":o:denuvFsA")) != -1) {
                 switch (c) {
                 case 'o':
                         if (parseprop(props, optarg) != 0)
@@ -3931,9 +3956,15 @@ zfs_do_receive(int argc, char **argv)
                 case 'v':
                         flags.verbose = B_TRUE;
                         break;
+               case 's':
+                       flags.resumable = B_TRUE;
+                       break;
                 case 'F':
                         flags.force = B_TRUE;
                         break;
+               case 'A':
+                       abort_resumable = B_TRUE;
+                       break;
                 case ':':
                         (void) fprintf(stderr, gettext("missing argument for "
                             "'%c' option\n"), optopt);
@@ -3966,6 +3997,44 @@ zfs_do_receive(int argc, char **argv)
                 }
         }
  
+       if (abort_resumable) {
+               if (flags.isprefix || flags.istail || flags.dryrun ||
+                   flags.resumable || flags.nomount) {
+                       (void) fprintf(stderr, gettext("invalid option"));
+                       usage(B_FALSE);
+               }
+
+               char namebuf[ZFS_MAXNAMELEN];
+               (void) snprintf(namebuf, sizeof (namebuf),
+                   "%s/%%recv", argv[0]);
+
+               if (zfs_dataset_exists(g_zfs, namebuf,
+                   ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) {
+                       zfs_handle_t *zhp = zfs_open(g_zfs,
+                           namebuf, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+                       if (zhp == NULL)
+                               return (1);
+                       err = zfs_destroy(zhp, B_FALSE);
+               } else {
+                       zfs_handle_t *zhp = zfs_open(g_zfs,
+                           argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+                       if (zhp == NULL)
+                               usage(B_FALSE);
+                       if (!zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) ||
+                           zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN,
+                           NULL, 0, NULL, NULL, 0, B_TRUE) == -1) {
+                               (void) fprintf(stderr,
+                                   gettext("'%s' does not have any "
+                                   "resumable receive state to abort\n"),
+                                   argv[0]);
+                               return (1);
+                       }
+                       err = zfs_destroy(zhp, B_FALSE);
+               }
+
+               return (err != 0);
+       }
+
         if (isatty(STDIN_FILENO)) {
                 (void) fprintf(stderr,
                     gettext("Error: Backup stream can not be read "
@@ -3973,7 +4042,6 @@ zfs_do_receive(int argc, char **argv)
                     "You must redirect standard input.\n"));
                 return (1);
         }
-
         err = zfs_receive(g_zfs, argv[0], props, &flags, STDIN_FILENO, NULL);
  
         return (err != 0);
@@ -5803,6 +5871,24 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol,
                 return (0);
         }
  
+       /*
+        * If this filesystem is inconsistent and has a receive resume
+        * token, we can not mount it.
+        */
+       if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) &&
+           zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN,
+           NULL, 0, NULL, NULL, 0, B_TRUE) == 0) {
+               if (!explicit)
+                       return (0);
+
+               (void) fprintf(stderr, gettext("cannot %s '%s': "
+                   "Contains partially-completed state from "
+                   "\"zfs receive -r\", which can be resumed with "
+                   "\"zfs send -t\"\n"),
+                   cmdname, zfs_get_name(zhp));
+               return (1);
+       }
+
         /*
          * At this point, we have verified that the mountpoint and/or
          * shareopts are appropriate for auto management. If the
diff --git a/cmd/zstreamdump/zstreamdump.c b/cmd/zstreamdump/zstreamdump.c

index f288d148e574b6211ec560b5d502378a98df8cf9..08d52bb37a834e3d72bad87d9a6cbd7a0099f449 100644 (file)
--- a/cmd/zstreamdump/zstreamdump.c
+++ b/cmd/zstreamdump/zstreamdump.c
@@ -127,7 +127,7 @@ read_hdr(dmu_replay_record_t *drr, zio_cksum_t *cksum)
                     (longlong_t)saved_cksum.zc_word[1],
                     (longlong_t)saved_cksum.zc_word[2],
                     (longlong_t)saved_cksum.zc_word[3]);
-               exit(1);
+               return (0);
         }
         return (sizeof (*drr));
  }
@@ -347,8 +347,7 @@ main(int argc, char *argv[])
                         if (verbose)
                                 (void) printf("\n");
  
-                       if ((DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
-                           DMU_COMPOUNDSTREAM) && drr->drr_payloadlen != 0) {
+                       if (drr->drr_payloadlen != 0) {
                                 nvlist_t *nv;
                                 int sz = drr->drr_payloadlen;
  
diff --git a/config/user-commands.m4 b/config/user-commands.m4

index 655b992418cce42126a71f09fc344ad58eace8d2..bda2b8652b5366f1993b4f87b81bf6e7cc1ec925 100644 (file)
--- a/config/user-commands.m4
+++ b/config/user-commands.m4
@@ -67,6 +67,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_COMMANDS_COMMON], [
         AC_PATH_TOOL(SHUF, shuf, "")
         AC_PATH_TOOL(SLEEP, sleep, "")
         AC_PATH_TOOL(SORT, sort, "")
+       AC_PATH_TOOL(STAT, stat, "")
         AC_PATH_TOOL(STRINGS, strings, "")
         AC_PATH_TOOL(SU, su, "")
         AC_PATH_TOOL(SUM, sum, "")
@@ -75,6 +76,7 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_COMMANDS_COMMON], [
         AC_PATH_TOOL(TAR, tar, "")
         AC_PATH_TOOL(TOUCH, touch, "")
         AC_PATH_TOOL(TR, tr, "")
+       AC_PATH_TOOL(TRUNCATE, truncate, "")
         AC_PATH_TOOL(TRUE, true, "")
         AC_PATH_TOOL(UMASK, umask, "")
         AC_PATH_TOOL(UMOUNT, umount, "")
@@ -103,7 +105,6 @@ AC_DEFUN([ZFS_AC_CONFIG_USER_COMMANDS_LINUX], [
         AC_PATH_TOOL(SHARE, exportfs, "")
         AC_PATH_TOOL(SWAP, swapon, "")
         AC_PATH_TOOL(SWAPADD, swapon, "")
-       AC_PATH_TOOL(TRUNCATE, truncate, "")
         AC_PATH_TOOL(UDEVADM, udevadm, "")
         AC_PATH_TOOL(UFSDUMP, dump, "")
         AC_PATH_TOOL(UFSRESTORE, restore, "")
diff --git a/include/libzfs.h b/include/libzfs.h

index 654b932843182eef5837359fac3d1dc7cc49eb28..40eb20781fa88504b97cc906c450f917ac003517 100644 (file)
--- a/include/libzfs.h
+++ b/include/libzfs.h
@@ -640,6 +640,10 @@ typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *);
  extern int zfs_send(zfs_handle_t *, const char *, const char *,
      sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **);
  extern int zfs_send_one(zfs_handle_t *, const char *, int, enum lzc_send_flags);
+extern int zfs_send_resume(libzfs_handle_t *, sendflags_t *, int outfd,
+    const char *);
+extern nvlist_t *zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl,
+    const char *token);
  
  extern int zfs_promote(zfs_handle_t *);
  extern int zfs_hold(zfs_handle_t *, const char *, const char *,
@@ -680,6 +684,12 @@ typedef struct recvflags {
         /* set "canmount=off" on all modified filesystems */
         boolean_t canmountoff;
  
+       /*
+        * Mark the file systems as "resumable" and do not destroy them if the
+        * receive is interrupted
+        */
+       boolean_t resumable;
+
         /* byteswap flag is used internally; callers need not specify */
         boolean_t byteswap;
  
diff --git a/include/libzfs_core.h b/include/libzfs_core.h

index bdd6c951ee496dc1e21a297e7a69b1342aecf79b..5d3a6fda7dcb9f96801d7f23647f410eb51f45bf 100644 (file)
--- a/include/libzfs_core.h
+++ b/include/libzfs_core.h
@@ -20,7 +20,7 @@
   */
  
  /*
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
   */
  
  #ifndef        _LIBZFS_CORE_H
@@ -58,7 +58,11 @@ enum lzc_send_flags {
  };
  
  int lzc_send(const char *, const char *, int, enum lzc_send_flags);
+int lzc_send_resume(const char *, const char *, int,
+    enum lzc_send_flags, uint64_t, uint64_t);
  int lzc_receive(const char *, nvlist_t *, const char *, boolean_t, int);
+int lzc_receive_resumable(const char *, nvlist_t *, const char *,
+    boolean_t, int);
  int lzc_send_space(const char *, const char *, uint64_t *);
  
  boolean_t lzc_exists(const char *);
diff --git a/include/sys/dmu_impl.h b/include/sys/dmu_impl.h

index 75d094f0812e9769bd3afe3202d4a6a17026f1fb..d700d1d17ed3654dfd922b25b5fe48f349d7d5a6 100644 (file)
--- a/include/sys/dmu_impl.h
+++ b/include/sys/dmu_impl.h
@@ -24,7 +24,7 @@
   */
  /*
   * Copyright (c) 2012, Joyent, Inc. All rights reserved.
- * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
   */
  
  #ifndef _SYS_DMU_IMPL_H
@@ -272,6 +272,8 @@ typedef struct dmu_sendarg {
         uint64_t dsa_featureflags;
         uint64_t dsa_last_data_object;
         uint64_t dsa_last_data_offset;
+       uint64_t dsa_resume_object;
+       uint64_t dsa_resume_offset;
  } dmu_sendarg_t;
  
  void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *);
diff --git a/include/sys/dmu_send.h b/include/sys/dmu_send.h

index 2442a1f8aab10b86a5eff4d89e02f1f3db4bb4b9..871f5625460ec96a989c4ecdfef34c9a321df983 100644 (file)
--- a/include/sys/dmu_send.h
+++ b/include/sys/dmu_send.h
@@ -36,10 +36,13 @@ struct vnode;
  struct dsl_dataset;
  struct drr_begin;
  struct avl_tree;
+struct dmu_replay_record;
  
-int dmu_send(const char *tosnap, const char *fromsnap,
-    boolean_t embedok, boolean_t large_block_ok,
-    int outfd, struct vnode *vp, offset_t *off);
+extern const char *recv_clone_name;
+
+int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+    boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff,
+    struct vnode *vp, offset_t *off);
  int dmu_send_estimate(struct dsl_dataset *ds, struct dsl_dataset *fromds,
      uint64_t *sizep);
  int dmu_send_estimate_from_txg(struct dsl_dataset *ds, uint64_t fromtxg,
@@ -50,12 +53,14 @@ int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
  
  typedef struct dmu_recv_cookie {
         struct dsl_dataset *drc_ds;
+       struct dmu_replay_record *drc_drr_begin;
         struct drr_begin *drc_drrb;
         const char *drc_tofs;
         const char *drc_tosnap;
         boolean_t drc_newfs;
         boolean_t drc_byteswap;
         boolean_t drc_force;
+       boolean_t drc_resumable;
         struct avl_tree *drc_guid_to_ds_map;
         zio_cksum_t drc_cksum;
         uint64_t drc_newsnapobj;
@@ -63,8 +68,9 @@ typedef struct dmu_recv_cookie {
         cred_t *drc_cred;
  } dmu_recv_cookie_t;
  
-int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
-    boolean_t force, char *origin, dmu_recv_cookie_t *drc);
+int dmu_recv_begin(char *tofs, char *tosnap,
+    struct dmu_replay_record *drr_begin,
+    boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc);
  int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp,
      int cleanup_fd, uint64_t *action_handlep);
  int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner);
diff --git a/include/sys/dmu_traverse.h b/include/sys/dmu_traverse.h

index 544b721e46129a06f5bc3d86ec8e9c3e6d685be1..c010edd440d95f974ba584be09365c672394ecbd 100644 (file)
--- a/include/sys/dmu_traverse.h
+++ b/include/sys/dmu_traverse.h
@@ -54,6 +54,8 @@ typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
  
  int traverse_dataset(struct dsl_dataset *ds,
      uint64_t txg_start, int flags, blkptr_cb_t func, void *arg);
+int traverse_dataset_resume(struct dsl_dataset *ds, uint64_t txg_start,
+    zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg);
  int traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr,
      uint64_t txg_start, zbookmark_phys_t *resume, int flags,
      blkptr_cb_t func, void *arg);
diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h

index f3e64a77236e729debf26695c25758e38db7fb9b..195a271c92acb318d14ad47a2aa0b25b0b15c796 100644 (file)
--- a/include/sys/dsl_dataset.h
+++ b/include/sys/dsl_dataset.h
@@ -98,6 +98,18 @@ struct dsl_pool;
   */
  #define        DS_FIELD_LARGE_DNODE "org.zfsonlinux:large_dnode"
  
+/*
+ * These fields are set on datasets that are in the middle of a resumable
+ * receive, and allow the sender to resume the send if it is interrupted.
+ */
+#define        DS_FIELD_RESUME_FROMGUID "com.delphix:resume_fromguid"
+#define        DS_FIELD_RESUME_TONAME "com.delphix:resume_toname"
+#define        DS_FIELD_RESUME_TOGUID "com.delphix:resume_toguid"
+#define        DS_FIELD_RESUME_OBJECT "com.delphix:resume_object"
+#define        DS_FIELD_RESUME_OFFSET "com.delphix:resume_offset"
+#define        DS_FIELD_RESUME_BYTES "com.delphix:resume_bytes"
+#define        DS_FIELD_RESUME_EMBEDOK "com.delphix:resume_embedok"
+
  /*
   * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose
   * name lookups should be performed case-insensitively.
@@ -191,6 +203,14 @@ typedef struct dsl_dataset {
         kmutex_t ds_sendstream_lock;
         list_t ds_sendstreams;
  
+       /*
+        * When in the middle of a resumable receive, tracks how much
+        * progress we have made.
+        */
+       uint64_t ds_resume_object[TXG_SIZE];
+       uint64_t ds_resume_offset[TXG_SIZE];
+       uint64_t ds_resume_bytes[TXG_SIZE];
+
         /* Protected by our dsl_dir's dd_lock */
         list_t ds_prop_cbs;
  
@@ -242,6 +262,7 @@ int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj,
  void dsl_dataset_disown(dsl_dataset_t *ds, void *tag);
  void dsl_dataset_name(dsl_dataset_t *ds, char *name);
  boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag);
+boolean_t dsl_dataset_has_owner(dsl_dataset_t *ds);
  uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname,
      dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *);
  uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin,
@@ -322,6 +343,8 @@ int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
  void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
      zprop_source_t source, uint64_t value, dmu_tx_t *tx);
  void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx);
+boolean_t dsl_dataset_is_zapified(dsl_dataset_t *ds);
+boolean_t dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds);
  int dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result);
  
  void dsl_dataset_deactivate_feature(uint64_t dsobj,
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h

index 0935dca77a4125377e8f246493b0664f0e019ddf..8a581eee5d1c58c52652be5dcbda941f38443a3b 100644 (file)
--- a/include/sys/fs/zfs.h
+++ b/include/sys/fs/zfs.h
@@ -158,6 +158,7 @@ typedef enum {
         ZFS_PROP_REDUNDANT_METADATA,
         ZFS_PROP_OVERLAY,
         ZFS_PROP_PREV_SNAP,
+       ZFS_PROP_RECEIVE_RESUME_TOKEN,
         ZFS_NUM_PROPS
  } zfs_prop_t;
  
diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h

index a978524e2400ce5d4fb06f4c392aab9eeace5bd8..ee6b9f6500a249fdc1b5532aecd5275a055609cb 100644 (file)
--- a/include/sys/zfs_ioctl.h
+++ b/include/sys/zfs_ioctl.h
@@ -90,16 +90,16 @@ typedef enum drr_headertype {
   * Feature flags for zfs send streams (flags in drr_versioninfo)
   */
  
-#define        DMU_BACKUP_FEATURE_DEDUP                (1<<0)
-#define        DMU_BACKUP_FEATURE_DEDUPPROPS           (1<<1)
-#define        DMU_BACKUP_FEATURE_SA_SPILL             (1<<2)
+#define        DMU_BACKUP_FEATURE_DEDUP                (1 << 0)
+#define        DMU_BACKUP_FEATURE_DEDUPPROPS           (1 << 1)
+#define        DMU_BACKUP_FEATURE_SA_SPILL             (1 << 2)
  /* flags #3 - #15 are reserved for incompatible closed-source implementations */
-#define        DMU_BACKUP_FEATURE_EMBED_DATA           (1<<16)
-#define        DMU_BACKUP_FEATURE_EMBED_DATA_LZ4       (1<<17)
+#define        DMU_BACKUP_FEATURE_EMBED_DATA           (1 << 16)
+#define        DMU_BACKUP_FEATURE_EMBED_DATA_LZ4       (1 << 17)
  /* flag #18 is reserved for a Delphix feature */
-#define        DMU_BACKUP_FEATURE_LARGE_BLOCKS         (1<<19)
-/* flag #20 is reserved for resumable streams */
-#define        DMU_BACKUP_FEATURE_LARGE_DNODE          (1<<21)
+#define        DMU_BACKUP_FEATURE_LARGE_BLOCKS         (1 << 19)
+#define        DMU_BACKUP_FEATURE_RESUMING             (1 << 20)
+#define        DMU_BACKUP_FEATURE_LARGE_DNODE          (1 << 21)
  
  /*
   * Mask of all supported backup features
@@ -107,11 +107,16 @@ typedef enum drr_headertype {
  #define        DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \
      DMU_BACKUP_FEATURE_DEDUPPROPS | DMU_BACKUP_FEATURE_SA_SPILL | \
      DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_EMBED_DATA_LZ4 | \
-    DMU_BACKUP_FEATURE_LARGE_BLOCKS | DMU_BACKUP_FEATURE_LARGE_DNODE)
+    DMU_BACKUP_FEATURE_RESUMING | DMU_BACKUP_FEATURE_LARGE_BLOCKS | \
+    DMU_BACKUP_FEATURE_LARGE_DNODE)
  
  /* Are all features in the given flag word currently supported? */
  #define        DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK))
  
+typedef enum dmu_send_resume_token_version {
+       ZFS_SEND_RESUME_TOKEN_VERSION = 1
+} dmu_send_resume_token_version_t;
+
  /*
   * The drr_versioninfo field of the dmu_replay_record has the
   * following layout:
@@ -335,6 +340,12 @@ typedef enum zfs_case {
         ZFS_CASE_MIXED
  } zfs_case_t;
  
+/*
+ * Note: this struct must have the same layout in 32-bit and 64-bit, so
+ * that 32-bit processes (like /sbin/zfs) can pass it to the 64-bit
+ * kernel.  Therefore, we add padding to it so that no "hidden" padding
+ * is automatically added on 64-bit (but not on 32-bit).
+ */
  typedef struct zfs_cmd {
         char            zc_name[MAXPATHLEN];    /* name of pool or dataset */
         uint64_t        zc_nvlist_src;          /* really (char *) */
@@ -363,14 +374,15 @@ typedef struct zfs_cmd {
         uint64_t        zc_iflags;              /* internal to zfs(7fs) */
         zfs_share_t     zc_share;
         dmu_objset_stats_t zc_objset_stats;
-       struct drr_begin zc_begin_record;
+       dmu_replay_record_t zc_begin_record;
         zinject_record_t zc_inject_record;
         uint32_t        zc_defer_destroy;
         uint32_t        zc_flags;
         uint64_t        zc_action_handle;
         int             zc_cleanup_fd;
         uint8_t         zc_simple;
-       uint8_t         zc_pad[3];              /* alignment */
+       boolean_t       zc_resumable;
+       uint8_t         zc_pad[2];              /* alignment */
         uint64_t        zc_sendobj;
         uint64_t        zc_fromobj;
         uint64_t        zc_createtxg;
diff --git a/include/sys/zvol.h b/include/sys/zvol.h

index c3e386f0b79e2e222c2944be53d895055737993b..00ed220d3b1776527833aef75aa65e6d30c06c17 100644 (file)
--- a/include/sys/zvol.h
+++ b/include/sys/zvol.h
@@ -32,6 +32,8 @@
  #define        ZVOL_OBJ                1ULL
  #define        ZVOL_ZAP_OBJ            2ULL
  
+extern void *zvol_tag;
+
  extern void zvol_create_minors(spa_t *spa, const char *name, boolean_t async);
  extern void zvol_remove_minors(spa_t *spa, const char *name, boolean_t async);
  extern void zvol_rename_minors(spa_t *spa, const char *oldname,
diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c

index 07f2e75a620423268583e0726a2e7faef96a9b7e..80f75e2564860f5d724434ec901707ca6374f39f 100644 (file)
--- a/lib/libzfs/libzfs_dataset.c
+++ b/lib/libzfs/libzfs_dataset.c
@@ -1865,22 +1865,21 @@ getprop_uint64(zfs_handle_t *zhp, zfs_prop_t prop, char **source)
         return (value);
  }
  
-static char *
+static const char *
  getprop_string(zfs_handle_t *zhp, zfs_prop_t prop, char **source)
  {
         nvlist_t *nv;
-       char *value;
+       const char *value;
  
         *source = NULL;
         if (nvlist_lookup_nvlist(zhp->zfs_props,
             zfs_prop_to_name(prop), &nv) == 0) {
-               verify(nvlist_lookup_string(nv, ZPROP_VALUE, &value) == 0);
+               value = fnvlist_lookup_string(nv, ZPROP_VALUE);
                 (void) nvlist_lookup_string(nv, ZPROP_SOURCE, source);
         } else {
                 verify(!zhp->zfs_props_table ||
                     zhp->zfs_props_table[prop] == B_TRUE);
-               if ((value = (char *)zfs_prop_default_string(prop)) == NULL)
-                       value = "";
+               value = zfs_prop_default_string(prop);
                 *source = "";
         }
  
@@ -2301,7 +2300,7 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
  {
         char *source = NULL;
         uint64_t val;
-       char *str;
+       const char *str;
         const char *strval;
         boolean_t received = zfs_is_recvd_props_mode(zhp);
  
@@ -2407,14 +2406,10 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
                 break;
  
         case ZFS_PROP_ORIGIN:
-               (void) strlcpy(propbuf, getprop_string(zhp, prop, &source),
-                   proplen);
-               /*
-                * If there is no parent at all, return failure to indicate that
-                * it doesn't apply to this dataset.
-                */
-               if (propbuf[0] == '\0')
+               str = getprop_string(zhp, prop, &source);
+               if (str == NULL)
                         return (-1);
+               (void) strlcpy(propbuf, str, proplen);
                 break;
  
         case ZFS_PROP_CLONES:
@@ -2596,8 +2591,10 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen,
                         break;
  
                 case PROP_TYPE_STRING:
-                       (void) strlcpy(propbuf,
-                           getprop_string(zhp, prop, &source), proplen);
+                       str = getprop_string(zhp, prop, &source);
+                       if (str == NULL)
+                               return (-1);
+                       (void) strlcpy(propbuf, str, proplen);
                         break;
  
                 case PROP_TYPE_INDEX:
diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c

index 29907dc8bfac441027386c9268a0c16f47083d5f..83351c716b159c55d964d0b950a9cca0800d33c8 100644 (file)
--- a/lib/libzfs/libzfs_mount.c
+++ b/lib/libzfs/libzfs_mount.c
@@ -1051,6 +1051,17 @@ mount_cb(zfs_handle_t *zhp, void *data)
                 return (0);
         }
  
+       /*
+        * If this filesystem is inconsistent and has a receive resume
+        * token, we can not mount it.
+        */
+       if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) &&
+           zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN,
+           NULL, 0, NULL, NULL, 0, B_TRUE) == 0) {
+               zfs_close(zhp);
+               return (0);
+       }
+
         libzfs_add_handle(cbp, zhp);
         if (zfs_iter_filesystems(zhp, mount_cb, cbp) != 0) {
                 zfs_close(zhp);
diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c

index 46640f62324388b43112c8072773d23de9a0cba1..1bf8bf49aeda5e896f6d7e65b41a1ea267c61851 100644 (file)
--- a/lib/libzfs/libzfs_sendrecv.c
+++ b/lib/libzfs/libzfs_sendrecv.c
@@ -21,7 +21,7 @@
  
  /*
   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
- * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
   * Copyright (c) 2012, Joyent, Inc. All rights reserved.
   * Copyright (c) 2012 Pawel Jakub Dawidek <pawel@dawidek.net>.
   * All rights reserved
@@ -56,6 +56,7 @@
  #include "zfs_prop.h"
  #include "zfs_fletcher.h"
  #include "libzfs_impl.h"
+#include <zlib.h>
  #include <sys/zio_checksum.h>
  #include <sys/ddt.h>
  #include <sys/socket.h>
@@ -66,6 +67,8 @@ extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *);
  static int zfs_receive_impl(libzfs_handle_t *, const char *, const char *,
      recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, int,
      uint64_t *);
+static int guid_to_name(libzfs_handle_t *, const char *,
+    uint64_t, boolean_t, char *);
  
  static const zio_cksum_t zero_cksum = { { 0 } };
  
@@ -234,7 +237,7 @@ cksummer(void *arg)
  {
         dedup_arg_t *dda = arg;
         char *buf = zfs_alloc(dda->dedup_hdl, SPA_MAXBLOCKSIZE);
-       dmu_replay_record_t thedrr;
+       dmu_replay_record_t thedrr = { 0 };
         dmu_replay_record_t *drr = &thedrr;
         FILE *ofp;
         int outfd;
@@ -283,8 +286,7 @@ cksummer(void *arg)
                             DMU_BACKUP_FEATURE_DEDUPPROPS);
                         DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags);
  
-                       if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
-                           DMU_COMPOUNDSTREAM && drr->drr_payloadlen != 0) {
+                       if (drr->drr_payloadlen != 0) {
                                 sz = drr->drr_payloadlen;
  
                                 if (sz > SPA_MAXBLOCKSIZE) {
@@ -1013,17 +1015,14 @@ static void *
  send_progress_thread(void *arg)
  {
         progress_arg_t *pa = arg;
-
         zfs_cmd_t zc = {"\0"};
         zfs_handle_t *zhp = pa->pa_zhp;
         libzfs_handle_t *hdl = zhp->zfs_hdl;
         unsigned long long bytes;
         char buf[16];
-
         time_t t;
         struct tm *tm;
  
-       assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT);
         (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name));
  
         if (!pa->pa_parsable)
@@ -1056,6 +1055,51 @@ send_progress_thread(void *arg)
         }
  }
  
+/*
+ * Print a one-line description of a send operation to 'fout'.
+ *
+ *   tosnap    name of the dataset being sent
+ *   fromsnap  incremental source, or NULL for a full send
+ *   size      estimated stream size in bytes; a value of 0 suppresses
+ *             the size field entirely
+ *   parsable  if set, emit untranslated tab-separated fields; otherwise
+ *             emit a translated, human-readable sentence
+ *
+ * The line is always terminated with a newline.
+ */
+static void
+send_print_verbose(FILE *fout, const char *tosnap, const char *fromsnap,
+    uint64_t size, boolean_t parsable)
+{
+       if (parsable) {
+               if (fromsnap != NULL) {
+                       (void) fprintf(fout, "incremental\t%s\t%s",
+                           fromsnap, tosnap);
+               } else {
+                       (void) fprintf(fout, "full\t%s",
+                           tosnap);
+               }
+       } else {
+               if (fromsnap != NULL) {
+                       /*
+                        * A source with neither '@' nor '#' is a bare
+                        * snapshot name, so show it with a leading '@'.
+                        */
+                       if (strchr(fromsnap, '@') == NULL &&
+                           strchr(fromsnap, '#') == NULL) {
+                               (void) fprintf(fout, dgettext(TEXT_DOMAIN,
+                                   "send from @%s to %s"),
+                                   fromsnap, tosnap);
+                       } else {
+                               (void) fprintf(fout, dgettext(TEXT_DOMAIN,
+                                   "send from %s to %s"),
+                                   fromsnap, tosnap);
+                       }
+               } else {
+                       (void) fprintf(fout, dgettext(TEXT_DOMAIN,
+                           "full send of %s"),
+                           tosnap);
+               }
+       }
+
+       if (size != 0) {
+               if (parsable) {
+                       /* %llu requires an unsigned argument */
+                       (void) fprintf(fout, "\t%llu",
+                           (unsigned long long)size);
+               } else {
+                       char buf[16];
+                       zfs_nicenum(size, buf, sizeof (buf));
+                       (void) fprintf(fout, dgettext(TEXT_DOMAIN,
+                           " estimated size is %s"), buf);
+               }
+       }
+       (void) fprintf(fout, "\n");
+}
+
  static int
  dump_snapshot(zfs_handle_t *zhp, void *arg)
  {
@@ -1135,37 +1179,14 @@ dump_snapshot(zfs_handle_t *zhp, void *arg)
             (sdd->fromorigin || sdd->replicate);
  
         if (sdd->verbose) {
-               uint64_t size;
-               err = estimate_ioctl(zhp, sdd->prevsnap_obj,
+               uint64_t size = 0;
+               (void) estimate_ioctl(zhp, sdd->prevsnap_obj,
                     fromorigin, &size);
  
-               if (sdd->parsable) {
-                       if (sdd->prevsnap[0] != '\0') {
-                               (void) fprintf(fout, "incremental\t%s\t%s",
-                                   sdd->prevsnap, zhp->zfs_name);
-                       } else {
-                               (void) fprintf(fout, "full\t%s",
-                                   zhp->zfs_name);
-                       }
-               } else {
-                       (void) fprintf(fout, dgettext(TEXT_DOMAIN,
-                           "send from @%s to %s"),
-                           sdd->prevsnap, zhp->zfs_name);
-               }
-               if (err == 0) {
-                       if (sdd->parsable) {
-                               (void) fprintf(fout, "\t%llu\n",
-                                   (longlong_t)size);
-                       } else {
-                               char buf[16];
-                               zfs_nicenum(size, buf, sizeof (buf));
-                               (void) fprintf(fout, dgettext(TEXT_DOMAIN,
-                                   " estimated size is %s\n"), buf);
-                       }
-                       sdd->size += size;
-               } else {
-                       (void) fprintf(fout, "\n");
-               }
+               send_print_verbose(fout, zhp->zfs_name,
+                   sdd->prevsnap[0] ? sdd->prevsnap : NULL,
+                   size, sdd->parsable);
+               sdd->size += size;
         }
  
         if (!sdd->dryrun) {
@@ -1376,6 +1397,231 @@ again:
         return (0);
  }
  
+/*
+ * Decode the textual resume token 'token' into the nvlist it encodes.
+ *
+ * The token consists of a header (decimal version, hexadecimal checksum,
+ * hexadecimal uncompressed length, each followed by '-') and a hex-encoded
+ * payload.  The payload is converted to binary, checked against the
+ * header checksum, uncompressed with zlib, and unpacked as an nvlist.
+ *
+ * Returns the unpacked nvlist on success.  On any failure, describes the
+ * problem with zfs_error_aux() and returns NULL; reporting the error is
+ * left to the caller.
+ *
+ * NOTE(review): the returned nvlist is presumably owned by the caller and
+ * released with nvlist_free() -- confirm against the callers.
+ */
+nvlist_t *
+zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, const char *token)
+{
+       unsigned int version;
+       int nread, i;
+       unsigned long long checksum, packed_len;
+
+       /*
+        * Decode token header, which is:
+        *   <token version>-<checksum of payload>-<uncompressed payload length>
+        * Note that the only supported token version is 1.
+        */
+       nread = sscanf(token, "%u-%llx-%llx-",
+           &version, &checksum, &packed_len);
+       if (nread != 3) {
+               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                   "resume token is corrupt (invalid format)"));
+               return (NULL);
+       }
+
+       if (version != ZFS_SEND_RESUME_TOKEN_VERSION) {
+               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                   "resume token is corrupt (invalid version %u)"),
+                   version);
+               return (NULL);
+       }
+
+       /* convert hexadecimal representation to binary */
+       /*
+        * The payload is whatever follows the last '-'.  Three successful
+        * conversions above imply at least two '-' in the token, so
+        * strrchr() cannot return NULL here.  Two hex digits make one
+        * byte; a trailing odd digit is ignored by the integer division.
+        */
+       token = strrchr(token, '-') + 1;
+       int len = strlen(token) / 2;
+       unsigned char *compressed = zfs_alloc(hdl, len);
+       for (i = 0; i < len; i++) {
+               nread = sscanf(token + i * 2, "%2hhx", compressed + i);
+               if (nread != 1) {
+                       free(compressed);
+                       zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                           "resume token is corrupt "
+                           "(payload is not hex-encoded)"));
+                       return (NULL);
+               }
+       }
+
+       /* verify checksum */
+       /*
+        * Only the first word of the fletcher-4 checksum of the compressed
+        * bytes is compared with the value from the header.
+        */
+       zio_cksum_t cksum;
+       fletcher_4_native(compressed, len, &cksum);
+       if (cksum.zc_word[0] != checksum) {
+               free(compressed);
+               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                   "resume token is corrupt (incorrect checksum)"));
+               return (NULL);
+       }
+
+       /* uncompress */
+       /*
+        * The output buffer is sized from the header's length field, and
+        * the decompressed size must match that field exactly.
+        * NOTE(review): packed_len comes straight from the token and is
+        * not bounded before being used as an allocation size -- confirm
+        * that this is acceptable.
+        */
+       void *packed = zfs_alloc(hdl, packed_len);
+       uLongf packed_len_long = packed_len;
+       if (uncompress(packed, &packed_len_long, compressed, len) != Z_OK ||
+           packed_len_long != packed_len) {
+               free(packed);
+               free(compressed);
+               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                   "resume token is corrupt (decompression failed)"));
+               return (NULL);
+       }
+
+       /* unpack nvlist */
+       /* Both scratch buffers are released whether or not unpacking works. */
+       nvlist_t *nv;
+       int error = nvlist_unpack(packed, packed_len, &nv, KM_SLEEP);
+       free(packed);
+       free(compressed);
+       if (error != 0) {
+               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                   "resume token is corrupt (nvlist_unpack failed)"));
+               return (NULL);
+       }
+       return (nv);
+}
+
+/*
+ * Generate a send stream that resumes a previously interrupted receive.
+ *
+ * 'resume_token' is decoded with zfs_send_resume_token_to_nvlist().  The
+ * token must supply "toname", "toguid", "object", "offset" and "bytes";
+ * "fromguid" is optional and selects an incremental send.  The datasets
+ * to send to/from are located by guid with guid_to_name() (bookmarks are
+ * accepted for the incremental source only).
+ *
+ * flags->verbose   print the decoded token and a one-line summary with
+ *                  the estimated remaining size to stderr
+ * flags->dryrun    do everything except write the stream to 'outfd'
+ * flags->progress  run send_progress_thread() for the duration of the send
+ * flags->embed_data (or "embedok" in the token) requests
+ *                  LZC_SEND_FLAG_EMBED_DATA
+ *
+ * Returns 0 on success.  On failure returns the value of zfs_error() /
+ * zfs_standard_error(), or the raw error from pthread_create() or (for a
+ * verbose dry run) lzc_send_space().
+ */
+int
+zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd,
+    const char *resume_token)
+{
+       char errbuf[1024];
+       char *toname;
+       char *fromname = NULL;
+       uint64_t resumeobj, resumeoff, toguid, fromguid, bytes;
+       zfs_handle_t *zhp = NULL;
+       int error = 0;
+       char name[ZFS_MAXNAMELEN];
+       enum lzc_send_flags lzc_flags = 0;
+       progress_arg_t pa = { 0 };
+       pthread_t tid;
+
+       (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
+           "cannot resume send"));
+
+       nvlist_t *resume_nvl =
+           zfs_send_resume_token_to_nvlist(hdl, resume_token);
+       if (resume_nvl == NULL) {
+               /*
+                * zfs_error_aux has already been set by
+                * zfs_send_resume_token_to_nvlist
+                */
+               return (zfs_error(hdl, EZFS_FAULT, errbuf));
+       }
+       if (flags->verbose) {
+               (void) fprintf(stderr, dgettext(TEXT_DOMAIN,
+                   "resume token contents:\n"));
+               nvlist_print(stderr, resume_nvl);
+       }
+
+       /*
+        * 'toname' points into resume_nvl, so the nvlist must stay alive
+        * until the last use of 'toname'; it is freed at 'out'.
+        */
+       if (nvlist_lookup_string(resume_nvl, "toname", &toname) != 0 ||
+           nvlist_lookup_uint64(resume_nvl, "object", &resumeobj) != 0 ||
+           nvlist_lookup_uint64(resume_nvl, "offset", &resumeoff) != 0 ||
+           nvlist_lookup_uint64(resume_nvl, "bytes", &bytes) != 0 ||
+           nvlist_lookup_uint64(resume_nvl, "toguid", &toguid) != 0) {
+               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                   "resume token is corrupt"));
+               error = zfs_error(hdl, EZFS_FAULT, errbuf);
+               goto out;
+       }
+       fromguid = 0;
+       (void) nvlist_lookup_uint64(resume_nvl, "fromguid", &fromguid);
+
+       if (flags->embed_data || nvlist_exists(resume_nvl, "embedok"))
+               lzc_flags |= LZC_SEND_FLAG_EMBED_DATA;
+
+       if (guid_to_name(hdl, toname, toguid, B_FALSE, name) != 0) {
+               if (zfs_dataset_exists(hdl, toname, ZFS_TYPE_DATASET)) {
+                       zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                           "'%s' is no longer the same snapshot used in "
+                           "the initial send"), toname);
+               } else {
+                       zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                           "'%s' used in the initial send no longer exists"),
+                           toname);
+               }
+               error = zfs_error(hdl, EZFS_BADPATH, errbuf);
+               goto out;
+       }
+       zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET);
+       if (zhp == NULL) {
+               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                   "unable to access '%s'"), name);
+               error = zfs_error(hdl, EZFS_BADPATH, errbuf);
+               goto out;
+       }
+
+       /*
+        * 'name' is reused for the incremental source; the target name
+        * remains available as zhp->zfs_name.
+        */
+       if (fromguid != 0) {
+               if (guid_to_name(hdl, toname, fromguid, B_TRUE, name) != 0) {
+                       zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                           "incremental source %#llx no longer exists"),
+                           (longlong_t)fromguid);
+                       error = zfs_error(hdl, EZFS_BADPATH, errbuf);
+                       goto out;
+               }
+               fromname = name;
+       }
+
+       if (flags->verbose) {
+               uint64_t size = 0;
+               error = lzc_send_space(zhp->zfs_name, fromname, &size);
+               /* report only what remains after the bytes already sent */
+               if (error == 0)
+                       size = MAX(0, (int64_t)(size - bytes));
+               send_print_verbose(stderr, zhp->zfs_name, fromname,
+                   size, flags->parsable);
+       }
+
+       if (flags->dryrun)
+               goto out;
+
+       /*
+        * If progress reporting is requested, spawn a new thread to
+        * poll ZFS_IOC_SEND_PROGRESS at a regular interval.
+        */
+       if (flags->progress) {
+               pa.pa_zhp = zhp;
+               pa.pa_fd = outfd;
+               pa.pa_parsable = flags->parsable;
+
+               error = pthread_create(&tid, NULL,
+                   send_progress_thread, &pa);
+               if (error != 0)
+                       goto out;
+       }
+
+       error = lzc_send_resume(zhp->zfs_name, fromname, outfd,
+           lzc_flags, resumeobj, resumeoff);
+
+       if (flags->progress) {
+               (void) pthread_cancel(tid);
+               (void) pthread_join(tid, NULL);
+       }
+
+       if (error == 0)
+               goto out;
+
+       (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
+           "warning: cannot send '%s'"), zhp->zfs_name);
+
+       /*
+        * Report the value returned by lzc_send_resume() rather than the
+        * global errno, which intervening calls may have overwritten.
+        */
+       switch (error) {
+       case EXDEV:
+       case ENOENT:
+       case EDQUOT:
+       case EFBIG:
+       case EIO:
+       case ENOLINK:
+       case ENOSPC:
+       case ENOSTR:
+       case ENXIO:
+       case EPIPE:
+       case ERANGE:
+       case EFAULT:
+       case EROFS:
+               zfs_error_aux(hdl, "%s", strerror(error));
+               error = zfs_error(hdl, EZFS_BADBACKUP, errbuf);
+               break;
+
+       default:
+               error = zfs_standard_error(hdl, error, errbuf);
+               break;
+       }
+
+out:
+       if (zhp != NULL)
+               zfs_close(zhp);
+       nvlist_free(resume_nvl);
+       return (error);
+}
+
  /*
   * Generate a send stream for the dataset identified by the argument zhp.
   *
@@ -1451,7 +1697,9 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap,
                 dmu_replay_record_t drr = { 0 };
                 char *packbuf = NULL;
                 size_t buflen = 0;
-               zio_cksum_t zc = { { 0 } };
+               zio_cksum_t zc;
+
+               ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0);
  
                 if (flags->replicate || flags->props) {
                         nvlist_t *hdrnv;
@@ -1913,6 +2161,7 @@ recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen,
  
  typedef struct guid_to_name_data {
         uint64_t guid;
+       boolean_t bookmark_ok;
         char *name;
         char *skip;
  } guid_to_name_data_t;
@@ -1921,20 +2170,25 @@ static int
  guid_to_name_cb(zfs_handle_t *zhp, void *arg)
  {
         guid_to_name_data_t *gtnd = arg;
+       const char *slash;
         int err;
  
         if (gtnd->skip != NULL &&
-           strcmp(zhp->zfs_name, gtnd->skip) == 0) {
+           (slash = strrchr(zhp->zfs_name, '/')) != NULL &&
+           strcmp(slash + 1, gtnd->skip) == 0) {
+               zfs_close(zhp);
                 return (0);
         }
  
-       if (zhp->zfs_dmustats.dds_guid == gtnd->guid) {
+       if (zfs_prop_get_int(zhp, ZFS_PROP_GUID) == gtnd->guid) {
                 (void) strcpy(gtnd->name, zhp->zfs_name);
                 zfs_close(zhp);
                 return (EEXIST);
         }
  
         err = zfs_iter_children(zhp, guid_to_name_cb, gtnd);
+       if (err != EEXIST && gtnd->bookmark_ok)
+               err = zfs_iter_bookmarks(zhp, guid_to_name_cb, gtnd);
         zfs_close(zhp);
         return (err);
  }
@@ -1948,45 +2202,48 @@ guid_to_name_cb(zfs_handle_t *zhp, void *arg)
   */
  static int
  guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid,
-    char *name)
+    boolean_t bookmark_ok, char *name)
  {
-       /* exhaustive search all local snapshots */
         char pname[ZFS_MAXNAMELEN];
         guid_to_name_data_t gtnd;
-       int err = 0;
-       zfs_handle_t *zhp;
-       char *cp;
  
         gtnd.guid = guid;
+       gtnd.bookmark_ok = bookmark_ok;
         gtnd.name = name;
         gtnd.skip = NULL;
  
-       (void) strlcpy(pname, parent, sizeof (pname));
-
         /*
-        * Search progressively larger portions of the hierarchy.  This will
+        * Search progressively larger portions of the hierarchy, starting
+        * with the filesystem specified by 'parent'.  This will
          * select the "most local" version of the origin snapshot in the case
          * that there are multiple matching snapshots in the system.
          */
-       while ((cp = strrchr(pname, '/')) != NULL) {
-
+       (void) strlcpy(pname, parent, sizeof (pname));
+       char *cp = strrchr(pname, '@');
+       if (cp == NULL)
+               cp = strchr(pname, '\0');
+       for (; cp != NULL; cp = strrchr(pname, '/')) {
                 /* Chop off the last component and open the parent */
                 *cp = '\0';
-               zhp = make_dataset_handle(hdl, pname);
+               zfs_handle_t *zhp = make_dataset_handle(hdl, pname);
  
                 if (zhp == NULL)
                         continue;
-
-               err = zfs_iter_children(zhp, guid_to_name_cb, &gtnd);
+               int err = guid_to_name_cb(zfs_handle_dup(zhp), &gtnd);
+               if (err != EEXIST)
+                       err = zfs_iter_children(zhp, guid_to_name_cb, &gtnd);
+               if (err != EEXIST && bookmark_ok)
+                       err = zfs_iter_bookmarks(zhp, guid_to_name_cb, &gtnd);
                 zfs_close(zhp);
                 if (err == EEXIST)
                         return (0);
  
                 /*
-                * Remember the dataset that we already searched, so we
-                * skip it next time through.
+                * Remember the last portion of the dataset so we skip it next
+                * time through (as we've already searched that portion of the
+                * hierarchy).
                  */
-               gtnd.skip = pname;
+               gtnd.skip = strrchr(pname, '/') + 1;
         }
  
         return (ENOENT);
@@ -2586,11 +2843,9 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
  
                 switch (drr->drr_type) {
                 case DRR_BEGIN:
-                       /* NB: not to be used on v2 stream packages */
                         if (drr->drr_payloadlen != 0) {
-                               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-                                   "invalid substream header"));
-                               return (zfs_error(hdl, EZFS_BADSTREAM, errbuf));
+                               (void) recv_read(hdl, fd, buf,
+                                   drr->drr_payloadlen, B_FALSE, NULL);
                         }
                         break;
  
@@ -2651,6 +2906,40 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap)
         return (-1);
  }
  
+/*
+ * Set the auxiliary error message for a receive that failed with ECKSUM.
+ *
+ * A generic "checksum mismatch or incomplete stream" message is always
+ * set.  If 'resumable' is set and the filesystem or volume containing
+ * 'target_snap' has a receive_resume_token property, the message is
+ * replaced with one that also shows the "zfs send -t <token>" command
+ * needed to resume the transfer.
+ *
+ * 'target_snap' is expected to be a snapshot name (containing '@'); if it
+ * is not, only the generic message is set.
+ */
+static void
+recv_ecksum_set_aux(libzfs_handle_t *hdl, const char *target_snap,
+    boolean_t resumable)
+{
+       char target_fs[ZFS_MAXNAMELEN];
+
+       zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+           "checksum mismatch or incomplete stream"));
+
+       if (!resumable)
+               return;
+
+       /* Strip the snapshot component to get the containing dataset. */
+       (void) strlcpy(target_fs, target_snap, sizeof (target_fs));
+       char *at = strchr(target_fs, '@');
+       if (at == NULL)
+               return;
+       *at = '\0';
+
+       zfs_handle_t *zhp = zfs_open(hdl, target_fs,
+           ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+       if (zhp == NULL)
+               return;
+
+       char token_buf[ZFS_MAXPROPLEN];
+       int error = zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN,
+           token_buf, sizeof (token_buf),
+           NULL, NULL, 0, B_TRUE);
+       if (error == 0) {
+               zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                   "checksum mismatch or incomplete stream.\n"
+                   "Partially received snapshot is saved.\n"
+                   "A resuming stream can be generated on the sending "
+                   "system by running:\n"
+                   "    zfs send -t %s"),
+                   token_buf);
+       }
+       zfs_close(zhp);
+}
+
  /*
   * Restores a backup of tosnap from the file descriptor specified by infd.
   */
@@ -2799,7 +3088,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
          */
         if (drrb->drr_flags & DRR_FLAG_CLONE) {
                 if (guid_to_name(hdl, zc.zc_value,
-                   drrb->drr_fromguid, zc.zc_string) != 0) {
+                   drrb->drr_fromguid, B_FALSE, zc.zc_string) != 0) {
                         zcmd_free_nvlists(&zc);
                         zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
                             "local origin for clone %s does not exist"),
@@ -2815,8 +3104,10 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
                             zc.zc_string);
         }
  
+       boolean_t resuming = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
+           DMU_BACKUP_FEATURE_RESUMING;
         stream_wantsnewfs = (drrb->drr_fromguid == 0 ||
-           (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap);
+           (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap) && !resuming;
  
         if (stream_wantsnewfs) {
                 /*
@@ -2835,7 +3126,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
                         char suffix[ZFS_MAXNAMELEN];
                         (void) strcpy(suffix, strrchr(zc.zc_value, '/'));
                         if (guid_to_name(hdl, zc.zc_name, parent_snapguid,
-                           zc.zc_value) == 0) {
+                           B_FALSE, zc.zc_value) == 0) {
                                 *strchr(zc.zc_value, '@') = '\0';
                                 (void) strcat(zc.zc_value, suffix);
                         }
@@ -2862,7 +3153,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
                         char snap[ZFS_MAXNAMELEN];
                         (void) strcpy(snap, strchr(zc.zc_value, '@'));
                         if (guid_to_name(hdl, zc.zc_name, drrb->drr_fromguid,
-                           zc.zc_value) == 0) {
+                           B_FALSE, zc.zc_value) == 0) {
                                 *strchr(zc.zc_value, '@') = '\0';
                                 (void) strcat(zc.zc_value, snap);
                         }
@@ -2876,11 +3167,12 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
                 zfs_handle_t *zhp;
  
                 /*
-                * Destination fs exists.  Therefore this should either
-                * be an incremental, or the stream specifies a new fs
-                * (full stream or clone) and they want us to blow it
-                * away (and have therefore specified -F and removed any
-                * snapshots).
+                * Destination fs exists.  It must be one of these cases:
+                *  - an incremental send stream
+                *  - the stream specifies a new fs (full stream or clone)
+                *    and they want us to blow away the existing fs (and
+                *    have therefore specified -F and removed any snapshots)
+                *  - we are resuming a failed receive.
                  */
                 if (stream_wantsnewfs) {
                         if (!flags->force) {
@@ -2935,6 +3227,18 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
                                 return (-1);
                         }
                 }
+
+               /*
+                * If we are resuming a newfs, set newfs here so that we will
+                * mount it if the recv succeeds this time.  We can tell
+                * that it was a newfs on the first recv because the fs
+                * itself will be inconsistent (if the fs existed when we
+                * did the first recv, we would have received it into
+                * .../%recv).
+                */
+               if (resuming && zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT))
+                       newfs = B_TRUE;
+
                 zfs_close(zhp);
         } else {
                 /*
@@ -2967,9 +3271,10 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
                 newfs = B_TRUE;
         }
  
-       zc.zc_begin_record = drr_noswap->drr_u.drr_begin;
+       zc.zc_begin_record = *drr_noswap;
         zc.zc_cookie = infd;
         zc.zc_guid = flags->force;
+       zc.zc_resumable = flags->resumable;
         if (flags->verbose) {
                 (void) printf("%s %s stream of %s into %s\n",
                     flags->dryrun ? "would receive" : "receiving",
@@ -3106,8 +3411,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
                         (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf);
                         break;
                 case ECKSUM:
-                       zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
-                           "invalid stream (checksum mismatch)"));
+                       recv_ecksum_set_aux(hdl, zc.zc_value, flags->resumable);
                         (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf);
                         break;
                 case ENOTSUP:
@@ -3309,7 +3613,8 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap,
   * Restores a backup of tosnap from the file descriptor specified by infd.
   * Return 0 on total success, -2 if some things couldn't be
   * destroyed/renamed/promoted, -1 if some things couldn't be received.
- * (-1 will override -2).
+ * (-1 will override -2.  If -1 is returned and the resumable flag was
+ * specified, the transfer can be resumed if the sending side supports it.)
   */
  int
  zfs_receive(libzfs_handle_t *hdl, const char *tosnap, nvlist_t *props,
diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c

index b706e6f6be88c106e6630f5a929fbcbe3f1fef00..220792300b8cac1c22d4be62bb98e18aea352f98 100644 (file)
--- a/lib/libzfs_core/libzfs_core.c
+++ b/lib/libzfs_core/libzfs_core.c
@@ -467,6 +467,13 @@ lzc_get_holds(const char *snapname, nvlist_t **holdsp)
  int
  lzc_send(const char *snapname, const char *from, int fd,
      enum lzc_send_flags flags)
+{
+       return (lzc_send_resume(snapname, from, fd, flags, 0, 0));
+}
+
+int
+lzc_send_resume(const char *snapname, const char *from, int fd,
+    enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff)
  {
         nvlist_t *args;
         int err;
@@ -479,6 +486,10 @@ lzc_send(const char *snapname, const char *from, int fd,
                 fnvlist_add_boolean(args, "largeblockok");
         if (flags & LZC_SEND_FLAG_EMBED_DATA)
                 fnvlist_add_boolean(args, "embedok");
+       if (resumeobj != 0 || resumeoff != 0) {
+               fnvlist_add_uint64(args, "resume_object", resumeobj);
+               fnvlist_add_uint64(args, "resume_offset", resumeoff);
+       }
         err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL);
         nvlist_free(args);
         return (err);
@@ -536,22 +547,9 @@ recv_read(int fd, void *buf, int ilen)
         return (0);
  }
  
-/*
- * The simplest receive case: receive from the specified fd, creating the
- * specified snapshot.  Apply the specified properties a "received" properties
- * (which can be overridden by locally-set properties).  If the stream is a
- * clone, its origin snapshot must be specified by 'origin'.  The 'force'
- * flag will cause the target filesystem to be rolled back or destroyed if
- * necessary to receive.
- *
- * Return 0 on success or an errno on failure.
- *
- * Note: this interface does not work on dedup'd streams
- * (those with DMU_BACKUP_FEATURE_DEDUP).
- */
-int
-lzc_receive(const char *snapname, nvlist_t *props, const char *origin,
-    boolean_t force, int fd)
+static int
+lzc_receive_impl(const char *snapname, nvlist_t *props, const char *origin,
+    boolean_t force, boolean_t resumable, int fd)
  {
         /*
          * The receive ioctl is still legacy, so we need to construct our own
@@ -561,7 +559,6 @@ lzc_receive(const char *snapname, nvlist_t *props, const char *origin,
         char *atp;
         char *packed = NULL;
         size_t size;
-       dmu_replay_record_t drr;
         int error;
  
         ASSERT3S(g_refcount, >, 0);
@@ -597,10 +594,9 @@ lzc_receive(const char *snapname, nvlist_t *props, const char *origin,
                 (void) strlcpy(zc.zc_string, origin, sizeof (zc.zc_string));
  
         /* zc_begin_record is non-byteswapped BEGIN record */
-       error = recv_read(fd, &drr, sizeof (drr));
+       error = recv_read(fd, &zc.zc_begin_record, sizeof (zc.zc_begin_record));
         if (error != 0)
                 goto out;
-       zc.zc_begin_record = drr.drr_u.drr_begin;
  
         /* zc_cookie is fd to read from */
         zc.zc_cookie = fd;
@@ -608,6 +604,8 @@ lzc_receive(const char *snapname, nvlist_t *props, const char *origin,
         /* zc guid is force flag */
         zc.zc_guid = force;
  
+       zc.zc_resumable = resumable;
+
         /* zc_cleanup_fd is unused */
         zc.zc_cleanup_fd = -1;
  
@@ -622,6 +620,39 @@ out:
         return (error);
  }
  
+/*
+ * Basic (non-resumable) receive: read a stream from 'fd' and create the
+ * snapshot named by 'snapname' from it.
+ *
+ * 'props' are applied as "received" properties, which locally-set
+ * properties can override.  When the stream is a clone, 'origin' must name
+ * its origin snapshot.  With 'force' set, the target filesystem is rolled
+ * back or destroyed if that is necessary for the receive.
+ *
+ * Returns 0 on success or an errno on failure.
+ *
+ * Note: dedup'd streams (those with DMU_BACKUP_FEATURE_DEDUP) do not work
+ * with this interface.
+ */
+int
+lzc_receive(const char *snapname, nvlist_t *props, const char *origin,
+    boolean_t force, int fd)
+{
+       boolean_t resumable = B_FALSE;
+
+       return (lzc_receive_impl(snapname, props, origin, force, resumable,
+           fd));
+}
+
+/*
+ * Resumable variant of lzc_receive().  If the stream ends prematurely and
+ * the receive fails, the intermediate state is kept on disk and ECKSUM is
+ * returned; the receive can then be continued later using a resuming send
+ * stream produced by lzc_send_resume().
+ */
+int
+lzc_receive_resumable(const char *snapname, nvlist_t *props, const char *origin,
+    boolean_t force, int fd)
+{
+       boolean_t resumable = B_TRUE;
+
+       return (lzc_receive_impl(snapname, props, origin, force, resumable,
+           fd));
+}
+
  /*
   * Roll back this filesystem or volume to its most recent snapshot.
   * If snapnamebuf is not NULL, it will be filled in with the name
diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c

index 800cdd7d7c5589c03801092303c812ed0258c025..93bfb0403717adcf43892efb7a79a9a0f922de74 100644 (file)
--- a/lib/libzpool/kernel.c
+++ b/lib/libzpool/kernel.c
@@ -1408,6 +1408,8 @@ spl_fstrans_check(void)
         return (0);
  }
  
+void *zvol_tag = "zvol_tag";
+
  void
  zvol_create_minors(spa_t *spa, const char *name, boolean_t async)
  {
diff --git a/man/man8/zfs.8 b/man/man8/zfs.8

index 2aca58b76cf04c5217d611f1f03b4fadd355b627..79c053fcc0520591cf1c5a40b7175c08707a33e8 100644 (file)
--- a/man/man8/zfs.8
+++ b/man/man8/zfs.8
@@ -180,17 +180,27 @@ zfs \- configures ZFS file systems
  
  .LP
  .nf
-\fBzfs\fR \fBsend\fR [\fB-eL\fR] [\fB-i \fIsnapshot\fR|\fIbookmark\fR]\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
+\fBzfs\fR \fBsend\fR [\fB-Le\fR] [\fB-i \fIsnapshot\fR|\fIbookmark\fR]\fR \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
  .fi
  
  .LP
  .nf
-\fBzfs\fR \fBreceive\fR [\fB-vnFu\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
+\fBzfs\fR \fBsend\fR [\fB-Penv\fR] \fB-t\fR \fIreceive_resume_token\fR
  .fi
  
  .LP
  .nf
-\fBzfs\fR \fBreceive\fR [\fB-vnFu\fR] [\fB-d\fR|\fB-e\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR
+\fBzfs\fR \fBreceive\fR [\fB-Fnsuv\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBreceive\fR [\fB-Fnsuv\fR] [\fB-d\fR|\fB-e\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR
+.fi
+
+.LP
+.nf
+\fBzfs\fR \fBreceive\fR \fB-A\fR \fIfilesystem\fR|\fIvolume\fR
  .fi
  
  .LP
@@ -532,6 +542,17 @@ For cloned file systems or volumes, the snapshot from which the clone was create
  .sp
  .ne 2
  .na
+\fB\fBreceive_resume_token\fR\fR
+.ad
+.sp .6
+.RS 4n
+For filesystems or volumes which have saved partially-completed state from \fBzfs receive -s\fR, this opaque token can be provided to \fBzfs send -t\fR to resume and complete the \fBzfs receive\fR.
+.RE
+
+.sp
+.ne 2
+.mk
+.na
  \fB\fBreferenced\fR\fR
  .ad
  .sp .6
@@ -2799,7 +2820,7 @@ The format of the stream is committed. You will be able to receive your streams
  .sp
  .ne 2
  .na
-\fBzfs send\fR [\fB-eL\fR] [\fB-i\fR \fIsnapshot\fR|\fIbookmark\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
+\fBzfs send\fR [\fB-Le\fR] [\fB-i\fR \fIsnapshot\fR|\fIbookmark\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR
  .ad
  .sp .6
  .RS 4n
@@ -2809,24 +2830,6 @@ the pool must be read-only, or the filesystem must not be mounted.  When the
  stream generated from a filesystem or volume is received, the default snapshot
  name will be "--head--".
  
-.sp
-.ne 2
-.na
-\fB-i\fR \fIsnapshot\fR|\fIbookmark\fR
-.ad
-.sp .6
-.RS 4n
-Generate an incremental send stream.  The incremental source must be an earlier
-snapshot in the destination's history. It will commonly be an earlier
-snapshot in the destination's filesystem, in which case it can be
-specified as the last component of the name (the \fB#\fR or \fB@\fR character
-and following).
-.sp
-If the incremental target is a clone, the incremental source can
-be the origin snapshot, or an earlier snapshot in the origin's filesystem,
-or the origin's origin, etc.
-.RE
-
  .sp
  .ne 2
  .na
@@ -2859,15 +2862,39 @@ then the receiving system must have that feature enabled as well. See
  \fBembedded_data\fR feature.
  .RE
  
+.sp
+.ne 2
+.na
+\fB-i\fR \fIsnapshot\fR|\fIbookmark\fR
+.ad
+.sp .6
+.RS 4n
+Generate an incremental send stream.  The incremental source must be an earlier snapshot in the destination's history. It will commonly be an earlier snapshot in the destination's filesystem, in which case it can be specified as the last component of the name (the \fB#\fR or \fB@\fR character and following).
+.sp
+If the incremental target is a clone, the incremental source can be the origin snapshot, or an earlier snapshot in the origin's filesystem, or the origin's origin, etc.
+.RE
+
+.RE
+.sp
+.ne 2
+.na
+\fB\fBzfs send\fR [\fB-Penv\fR] \fB-t\fR \fIreceive_resume_token\fR\fR
+.ad
+.sp .6
+.RS 4n
+Creates a send stream which resumes an interrupted receive. The \fIreceive_resume_token\fR is the value of the \fBreceive_resume_token\fR property on the filesystem or volume that was being received into. See the documentation for \fBzfs receive -s\fR for more details.
+
+.RE
+
  .RE
  .sp
  .ne 2
  .na
-\fB\fBzfs receive\fR [\fB-vnFu\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR
+\fB\fBzfs receive\fR [\fB-Fnsuv\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR|\fIvolume\fR|\fIsnapshot\fR\fR
  .ad
  .br
  .na
-\fB\fBzfs receive\fR [\fB-vnFu\fR] [\fB-d\fR|\fB-e\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR\fR
+\fB\fBzfs receive\fR [\fB-Fnsuv\fR] [\fB-d\fR|\fB-e\fR] [\fB-o origin\fR=\fIsnapshot\fR] \fIfilesystem\fR\fR
  .ad
  .sp .6
  .RS 4n
@@ -2885,21 +2912,36 @@ The \fB-d\fR and \fB-e\fR options cause the file system name of the target snaps
  .sp
  .ne 2
  .na
-\fB\fB-d\fR\fR
+\fB\fB-F\fR\fR
  .ad
  .sp .6
  .RS 4n
-Discard the first element of the sent snapshot's file system name, using the remaining elements to determine the name of the target file system for the new snapshot as described in the paragraph above.
+Force a rollback of the file system to the most recent snapshot before performing the receive operation. If receiving an incremental replication stream (for example, one generated by \fBzfs send -R -[iI]\fR), destroy snapshots and file systems that do not exist on the sending side.
  .RE
  
  .sp
  .ne 2
+.mk
  .na
-\fB\fB-e\fR\fR
+\fB\fB-n\fR\fR
  .ad
  .sp .6
  .RS 4n
-Discard all but the last element of the sent snapshot's file system name, using that element to determine the name of the target file system for the new snapshot as described in the paragraph above.
+Do not actually receive the stream. This can be useful in conjunction with the \fB-v\fR option to verify the name the receive operation would use.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fB-s\fR\fR
+.ad
+.sp .6
+.RS 4n
+If the receive is interrupted, save the partially received state, rather than deleting it. Interruption may be due to premature termination of the stream (e.g. due to network failure or failure of the remote system if the stream is being read over a network connection), a checksum error in the stream, termination of the \fBzfs receive\fR process, or unclean shutdown of the system.
+.sp
+The receive can be resumed with a stream generated by \fBzfs send -t\fR \fItoken\fR, where the \fItoken\fR is the value of the \fBreceive_resume_token\fR property of the filesystem or volume which is received into.
+.sp
+To use this flag, the storage pool must have the \fBextensible_dataset\fR feature enabled.  See \fBzpool-features\fR(5) for details on ZFS feature flags.
  .RE
  
  .sp
@@ -2925,11 +2967,21 @@ Print verbose information about the stream and the time required to perform the
  .sp
  .ne 2
  .na
-\fB\fB-n\fR\fR
+\fB\fB-d\fR\fR
  .ad
  .sp .6
  .RS 4n
-Do not actually receive the stream. This can be useful in conjunction with the \fB-v\fR option to verify the name the receive operation would use.
+Discard the first element of the sent snapshot's file system name, using the remaining elements to determine the name of the target file system for the new snapshot as described in the paragraph above.
+.RE
+
+.sp
+.ne 2
+.na
+\fB\fB-e\fR\fR
+.ad
+.sp .6
+.RS 4n
+Discard all but the last element of the sent snapshot's file system name, using that element to determine the name of the target file system for the new snapshot as described in the paragraph above.
  .RE
  
  .sp
@@ -2942,15 +2994,16 @@ Do not actually receive the stream. This can be useful in conjunction with the \
  Forces the stream to be received as a clone of the given snapshot. This is only valid if the stream is an incremental stream whose source is the same as the provided origin.
  .RE
  
+.RE
+
  .sp
  .ne 2
  .na
-\fB\fB-F\fR\fR
+\fB\fBzfs receive\fR \fB-A\fR \fIfilesystem\fR|\fIvolume\fR\fR
  .ad
  .sp .6
  .RS 4n
-Force a rollback of the file system to the most recent snapshot before performing the receive operation. If receiving an incremental replication stream (for example, one generated by \fBzfs send -R -[iI]\fR), destroy snapshots and file systems that do not exist on the sending side.
-.RE
+Abort an interrupted \fBzfs receive -s\fR, deleting its saved partially received state.
  
  .RE
  
diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c

index 1dbeab084601437c10478eb197cc033327c9442f..1d68ca29e6d6b369225c3c2e3a096ad8c3ce372b 100644 (file)
--- a/module/zcommon/zfs_prop.c
+++ b/module/zcommon/zfs_prop.c
@@ -375,6 +375,10 @@ zfs_prop_init(void)
         zprop_register_string(ZFS_PROP_SELINUX_ROOTCONTEXT, "rootcontext",
             "none", PROP_DEFAULT, ZFS_TYPE_DATASET, "<selinux rootcontext>",
             "ROOTCONTEXT");
+       zprop_register_string(ZFS_PROP_RECEIVE_RESUME_TOKEN,
+           "receive_resume_token",
+           NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
+           "<string token>", "RESUMETOK");
  
         /* readonly number properties */
         zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY,
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c

index cdc897726ee7a53578c95789d87d29437add2f14..a9991aabb0aac4536c6f1f3929631da1b455a558 100644 (file)
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -405,6 +405,17 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
          * checksum/compression/copies.
          */
         if (ds != NULL) {
+               boolean_t needlock = B_FALSE;
+
+               /*
+                * Note: it's valid to open the objset if the dataset is
+                * long-held, in which case the pool_config lock will not
+                * be held.
+                */
+               if (!dsl_pool_config_held(dmu_objset_pool(os))) {
+                       needlock = B_TRUE;
+                       dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
+               }
                 err = dsl_prop_register(ds,
                     zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
                     primary_cache_changed_cb, os);
@@ -461,6 +472,8 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
                                     dnodesize_changed_cb, os);
                         }
                 }
+               if (needlock)
+                       dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
                 if (err != 0) {
                         VERIFY(arc_buf_remove_ref(os->os_phys_buf,
                             &os->os_phys_buf));
@@ -520,6 +533,13 @@ dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
  {
         int err = 0;
  
+       /*
+        * We shouldn't be doing anything with dsl_dataset_t's unless the
+        * pool_config lock is held, or the dataset is long-held.
+        */
+       ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool) ||
+           dsl_dataset_long_held(ds));
+
         mutex_enter(&ds->ds_opening_lock);
         if (ds->ds_objset == NULL) {
                 objset_t *os;
diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c

index 901386a5a4d06f4e7e83e398c33b966db433d6c0..1f903c7f3954a8ad3f7ef68b6cb86a16325bb274 100644 (file)
--- a/module/zfs/dmu_send.c
+++ b/module/zfs/dmu_send.c
@@ -23,6 +23,7 @@
   * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
   * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
   * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright 2014 HybridCluster. All rights reserved.
   * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
   * Copyright (c) 2016 Actifio, Inc. All rights reserved.
   */
@@ -64,12 +65,14 @@ int zfs_send_queue_length = 16 * 1024 * 1024;
  int zfs_recv_queue_length = 16 * 1024 * 1024;
  
  static char *dmu_recv_tag = "dmu_recv_tag";
-static const char *recv_clone_name = "%recv";
+const char *recv_clone_name = "%recv";
  
  #define        BP_SPAN(datablkszsec, indblkshift, level) \
         (((uint64_t)datablkszsec) << (SPA_MINBLOCKSHIFT + \
         (level) * (indblkshift - SPA_BLKPTRSHIFT)))
  
+static void byteswap_record(dmu_replay_record_t *drr);
+
  struct send_thread_arg {
         bqueue_t        q;
         dsl_dataset_t   *ds;            /* Dataset to traverse */
@@ -77,6 +80,7 @@ struct send_thread_arg {
         int             flags;          /* flags to pass to traverse_dataset */
         int             error_code;
         boolean_t       cancel;
+       zbookmark_phys_t resume;
  };
  
  struct send_block_record {
@@ -99,7 +103,7 @@ dump_bytes_cb(void *arg)
  {
         dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg;
         dmu_sendarg_t *dsp = dbi->dbi_dsp;
-       dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset;
+       dsl_dataset_t *ds = dmu_objset_ds(dsp->dsa_os);
         ssize_t resid; /* have to get resid to get detailed errno */
         ASSERT0(dbi->dbi_len % 8);
  
@@ -180,7 +184,7 @@ dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
          * that the receiving system doesn't have any dbufs in the range
          * being freed.  This is always true because there is a one-record
          * constraint: we only send one WRITE record for any given
-        * object+offset.  We know that the one-record constraint is
+        * object,offset.  We know that the one-record constraint is
          * true because we always send data in increasing order by
          * object,offset.
          *
@@ -428,6 +432,19 @@ dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
  {
         struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);
  
+       if (object < dsp->dsa_resume_object) {
+               /*
+                * Note: when resuming, we will visit all the dnodes in
+                * the block of dnodes that we are resuming from.  In
+                * this case it's unnecessary to send the dnodes prior to
+                * the one we are resuming from.  We should be at most one
+                * block's worth of dnodes behind the resume point.
+                */
+               ASSERT3U(dsp->dsa_resume_object - object, <,
+                   1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT));
+               return (0);
+       }
+
         if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
                 return (dump_freeobjects(dsp, object, 1));
  
@@ -509,6 +526,9 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
         uint64_t record_size;
         int err = 0;
  
+       ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
+           zb->zb_object >= sta->resume.zb_object);
+
         if (sta->cancel)
                 return (SET_ERROR(EINTR));
  
@@ -545,8 +565,10 @@ send_traverse_thread(void *arg)
         struct send_block_record *data;
  
         if (st_arg->ds != NULL) {
-               err = traverse_dataset(st_arg->ds, st_arg->fromtxg,
-                   st_arg->flags, send_cb, arg);
+               err = traverse_dataset_resume(st_arg->ds,
+                   st_arg->fromtxg, &st_arg->resume,
+                   st_arg->flags, send_cb, st_arg);
+
                 if (err != EINTR)
                         st_arg->error_code = err;
         }
@@ -575,6 +597,9 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
  
         ASSERT3U(zb->zb_level, >=, 0);
  
+       ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT ||
+           zb->zb_object >= dsa->dsa_resume_object);
+
         if (zb->zb_object != DMU_META_DNODE_OBJECT &&
             DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
                 return (0);
@@ -637,6 +662,10 @@ do_dump(dmu_sendarg_t *dsa, struct send_block_record *data)
                 uint64_t offset;
  
                 ASSERT0(zb->zb_level);
+               ASSERT(zb->zb_object > dsa->dsa_resume_object ||
+                   (zb->zb_object == dsa->dsa_resume_object &&
+                   zb->zb_blkid * blksz >= dsa->dsa_resume_offset));
+
                 if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
                     ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL,
                     &aflags, zb) != 0) {
@@ -697,8 +726,10 @@ get_next_record(bqueue_t *bq, struct send_block_record *data)
   */
  static int
  dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
-    zfs_bookmark_phys_t *ancestor_zb, boolean_t is_clone, boolean_t embedok,
-    boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off)
+    zfs_bookmark_phys_t *ancestor_zb,
+    boolean_t is_clone, boolean_t embedok, boolean_t large_block_ok, int outfd,
+    uint64_t resumeobj, uint64_t resumeoff,
+    vnode_t *vp, offset_t *off)
  {
         objset_t *os;
         dmu_replay_record_t *drr;
@@ -707,6 +738,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
         uint64_t fromtxg = 0;
         uint64_t featureflags = 0;
         struct send_thread_arg to_arg;
+       void *payload = NULL;
+       size_t payload_len = 0;
         struct send_block_record *to_data;
  
         err = dmu_objset_from_ds(to_ds, &os);
@@ -721,6 +754,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
         DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
             DMU_SUBSTREAM);
  
+       bzero(&to_arg, sizeof (to_arg));
+
  #ifdef _KERNEL
         if (dmu_objset_type(os) == DMU_OST_ZFS) {
                 uint64_t version;
@@ -746,6 +781,10 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
                         featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4;
         }
  
+       if (resumeobj != 0 || resumeoff != 0) {
+               featureflags |= DMU_BACKUP_FEATURE_RESUMING;
+       }
+
         DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo,
             featureflags);
  
@@ -781,6 +820,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
         dsp->dsa_pending_op = PENDING_NONE;
         dsp->dsa_incremental = (ancestor_zb != NULL);
         dsp->dsa_featureflags = featureflags;
+       dsp->dsa_resume_object = resumeobj;
+       dsp->dsa_resume_offset = resumeoff;
  
         mutex_enter(&to_ds->ds_sendstream_lock);
         list_insert_head(&to_ds->ds_sendstreams, dsp);
@@ -789,7 +830,26 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
         dsl_dataset_long_hold(to_ds, FTAG);
         dsl_pool_rele(dp, tag);
  
-       if (dump_record(dsp, NULL, 0) != 0) {
+       if (resumeobj != 0 || resumeoff != 0) {
+               dmu_object_info_t to_doi;
+               nvlist_t *nvl;
+               err = dmu_object_info(os, resumeobj, &to_doi);
+               if (err != 0)
+                       goto out;
+               SET_BOOKMARK(&to_arg.resume, to_ds->ds_object, resumeobj, 0,
+                   resumeoff / to_doi.doi_data_block_size);
+
+               nvl = fnvlist_alloc();
+               fnvlist_add_uint64(nvl, "resume_object", resumeobj);
+               fnvlist_add_uint64(nvl, "resume_offset", resumeoff);
+               payload = fnvlist_pack(nvl, &payload_len);
+               drr->drr_payloadlen = payload_len;
+               fnvlist_free(nvl);
+       }
+
+       err = dump_record(dsp, payload, payload_len);
+       fnvlist_pack_free(payload, payload_len);
+       if (err != 0) {
                 err = dsp->dsa_err;
                 goto out;
         }
@@ -899,19 +959,19 @@ dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap,
                 is_clone = (fromds->ds_dir != ds->ds_dir);
                 dsl_dataset_rele(fromds, FTAG);
                 err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
-                   embedok, large_block_ok, outfd, vp, off);
+                   embedok, large_block_ok, outfd, 0, 0, vp, off);
         } else {
                 err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
-                   embedok, large_block_ok, outfd, vp, off);
+                   embedok, large_block_ok, outfd, 0, 0, vp, off);
         }
         dsl_dataset_rele(ds, FTAG);
         return (err);
  }
  
  int
-dmu_send(const char *tosnap, const char *fromsnap,
-    boolean_t embedok, boolean_t large_block_ok,
-    int outfd, vnode_t *vp, offset_t *off)
+dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok,
+    boolean_t large_block_ok, int outfd, uint64_t resumeobj, uint64_t resumeoff,
+    vnode_t *vp, offset_t *off)
  {
         dsl_pool_t *dp;
         dsl_dataset_t *ds;
@@ -978,10 +1038,12 @@ dmu_send(const char *tosnap, const char *fromsnap,
                         return (err);
                 }
                 err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone,
-                   embedok, large_block_ok, outfd, vp, off);
+                   embedok, large_block_ok,
+                   outfd, resumeobj, resumeoff, vp, off);
         } else {
                 err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE,
-                   embedok, large_block_ok, outfd, vp, off);
+                   embedok, large_block_ok,
+                   outfd, resumeobj, resumeoff, vp, off);
         }
         if (owned)
                 dsl_dataset_disown(ds, FTAG);
@@ -1221,6 +1283,7 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
  
         /* already checked */
         ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
+       ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING));
  
         if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
             DMU_COMPOUNDSTREAM ||
@@ -1233,6 +1296,10 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
             spa_version(dp->dp_spa) < SPA_VERSION_SA)
                 return (SET_ERROR(ENOTSUP));
  
+       if (drba->drba_cookie->drc_resumable &&
+           !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET))
+               return (SET_ERROR(ENOTSUP));
+
         /*
          * The receiving code doesn't know how to translate a WRITE_EMBEDDED
          * record to a plan WRITE record, so the pool must have the
@@ -1345,15 +1412,16 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
  {
         dmu_recv_begin_arg_t *drba = arg;
         dsl_pool_t *dp = dmu_tx_pool(tx);
+       objset_t *mos = dp->dp_meta_objset;
         struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
         const char *tofs = drba->drba_cookie->drc_tofs;
         dsl_dataset_t *ds, *newds;
         uint64_t dsobj;
         int error;
-       uint64_t crflags;
+       uint64_t crflags = 0;
  
-       crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ?
-           DS_FLAG_CI_DATASET : 0;
+       if (drrb->drr_flags & DRR_FLAG_CI_DATA)
+               crflags |= DS_FLAG_CI_DATASET;
  
         error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
         if (error == 0) {
@@ -1391,6 +1459,32 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
         }
         VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds));
  
+       if (drba->drba_cookie->drc_resumable) {
+               uint64_t one = 1;
+               uint64_t zero = 0;
+
+               dsl_dataset_zapify(newds, tx);
+               if (drrb->drr_fromguid != 0) {
+                       VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID,
+                           8, 1, &drrb->drr_fromguid, tx));
+               }
+               VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID,
+                   8, 1, &drrb->drr_toguid, tx));
+               VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME,
+                   1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx));
+               VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT,
+                   8, 1, &one, tx));
+               VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET,
+                   8, 1, &zero, tx));
+               VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES,
+                   8, 1, &zero, tx));
+               if (DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) &
+                   DMU_BACKUP_FEATURE_EMBED_DATA) {
+                       VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK,
+                           8, 1, &one, tx));
+               }
+       }
+
         dmu_buf_will_dirty(newds->ds_dbuf, tx);
         dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT;
  
@@ -1408,56 +1502,191 @@ dmu_recv_begin_sync(void *arg, dmu_tx_t *tx)
         spa_history_log_internal_ds(newds, "receive", tx, "");
  }
  
+static int
+dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
+{
+       dmu_recv_begin_arg_t *drba = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       struct drr_begin *drrb = drba->drba_cookie->drc_drrb;
+       int error;
+       uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
+       dsl_dataset_t *ds;
+       const char *tofs = drba->drba_cookie->drc_tofs;
+       char recvname[ZFS_MAXNAMELEN];
+       uint64_t val;
+
+       /* already checked */
+       ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC);
+       ASSERT(featureflags & DMU_BACKUP_FEATURE_RESUMING);
+
+       if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) ==
+           DMU_COMPOUNDSTREAM ||
+           drrb->drr_type >= DMU_OST_NUMTYPES)
+               return (SET_ERROR(EINVAL));
+
+       /* Verify pool version supports SA if SA_SPILL feature set */
+       if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
+           spa_version(dp->dp_spa) < SPA_VERSION_SA)
+               return (SET_ERROR(ENOTSUP));
+
+       /*
+        * The receiving code doesn't know how to translate a WRITE_EMBEDDED
+        * record to a plain WRITE record, so the pool must have the
+        * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED
+        * records.  Same with WRITE_EMBEDDED records that use LZ4 compression.
+        */
+       if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) &&
+           !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA))
+               return (SET_ERROR(ENOTSUP));
+       if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) &&
+           !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS))
+               return (SET_ERROR(ENOTSUP));
+
+       (void) snprintf(recvname, sizeof (recvname), "%s/%s",
+           tofs, recv_clone_name);
+
+       if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) {
+               /* %recv does not exist; continue in tofs */
+               error = dsl_dataset_hold(dp, tofs, FTAG, &ds);
+               if (error != 0)
+                       return (error);
+       }
+
+       /* check that ds is marked inconsistent */
+       if (!DS_IS_INCONSISTENT(ds)) {
+               dsl_dataset_rele(ds, FTAG);
+               return (SET_ERROR(EINVAL));
+       }
+
+       /* check that there is resuming data, and that the toguid matches */
+       if (!dsl_dataset_is_zapified(ds)) {
+               dsl_dataset_rele(ds, FTAG);
+               return (SET_ERROR(EINVAL));
+       }
+       error = zap_lookup(dp->dp_meta_objset, ds->ds_object,
+           DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val);
+       if (error != 0 || drrb->drr_toguid != val) {
+               dsl_dataset_rele(ds, FTAG);
+               return (SET_ERROR(EINVAL));
+       }
+
+       /*
+        * Check if the receive is still running.  If so, it will be owned.
+        * Note that nothing else can own the dataset (e.g. after the receive
+        * fails) because it will be marked inconsistent.
+        */
+       if (dsl_dataset_has_owner(ds)) {
+               dsl_dataset_rele(ds, FTAG);
+               return (SET_ERROR(EBUSY));
+       }
+
+       /* There should not be any snapshots of this fs yet. */
+       if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) {
+               dsl_dataset_rele(ds, FTAG);
+               return (SET_ERROR(EINVAL));
+       }
+
+       /*
+        * Note: resume point will be checked when we process the first WRITE
+        * record.
+        */
+
+       /* check that the origin matches */
+       val = 0;
+       (void) zap_lookup(dp->dp_meta_objset, ds->ds_object,
+           DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val);
+       if (drrb->drr_fromguid != val) {
+               dsl_dataset_rele(ds, FTAG);
+               return (SET_ERROR(EINVAL));
+       }
+
+       dsl_dataset_rele(ds, FTAG);
+       return (0);
+}
+
+static void
+dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx)
+{
+       dmu_recv_begin_arg_t *drba = arg;
+       dsl_pool_t *dp = dmu_tx_pool(tx);
+       const char *tofs = drba->drba_cookie->drc_tofs;
+       dsl_dataset_t *ds;
+       uint64_t dsobj;
+       char recvname[ZFS_MAXNAMELEN];
+
+       (void) snprintf(recvname, sizeof (recvname), "%s/%s",
+           tofs, recv_clone_name);
+
+       if (dsl_dataset_hold(dp, recvname, FTAG, &ds) != 0) {
+               /* %recv does not exist; continue in tofs */
+               VERIFY0(dsl_dataset_hold(dp, tofs, FTAG, &ds));
+               drba->drba_cookie->drc_newfs = B_TRUE;
+       }
+
+       /* clear the inconsistent flag so that we can own it */
+       ASSERT(DS_IS_INCONSISTENT(ds));
+       dmu_buf_will_dirty(ds->ds_dbuf, tx);
+       dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
+       dsobj = ds->ds_object;
+       dsl_dataset_rele(ds, FTAG);
+
+       VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &ds));
+
+       dmu_buf_will_dirty(ds->ds_dbuf, tx);
+       dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT;
+
+       ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)));
+
+       drba->drba_cookie->drc_ds = ds;
+
+       spa_history_log_internal_ds(ds, "resume receive", tx, "");
+}
+
  /*
   * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
   * succeeds; otherwise we will leak the holds on the datasets.
   */
  int
-dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb,
-    boolean_t force, char *origin, dmu_recv_cookie_t *drc)
+dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
+    boolean_t force, boolean_t resumable, char *origin, dmu_recv_cookie_t *drc)
  {
         dmu_recv_begin_arg_t drba = { 0 };
-       dmu_replay_record_t *drr;
  
         bzero(drc, sizeof (dmu_recv_cookie_t));
-       drc->drc_drrb = drrb;
+       drc->drc_drr_begin = drr_begin;
+       drc->drc_drrb = &drr_begin->drr_u.drr_begin;
         drc->drc_tosnap = tosnap;
         drc->drc_tofs = tofs;
         drc->drc_force = force;
+       drc->drc_resumable = resumable;
         drc->drc_cred = CRED();
  
-       if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
+       if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) {
                 drc->drc_byteswap = B_TRUE;
-       else if (drrb->drr_magic != DMU_BACKUP_MAGIC)
-               return (SET_ERROR(EINVAL));
-
-       drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
-       drr->drr_type = DRR_BEGIN;
-       drr->drr_u.drr_begin = *drc->drc_drrb;
-       if (drc->drc_byteswap) {
-               fletcher_4_incremental_byteswap(drr,
+               fletcher_4_incremental_byteswap(drr_begin,
                     sizeof (dmu_replay_record_t), &drc->drc_cksum);
-       } else {
-               fletcher_4_incremental_native(drr,
+               byteswap_record(drr_begin);
+       } else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) {
+               fletcher_4_incremental_native(drr_begin,
                     sizeof (dmu_replay_record_t), &drc->drc_cksum);
-       }
-       kmem_free(drr, sizeof (dmu_replay_record_t));
-
-       if (drc->drc_byteswap) {
-               drrb->drr_magic = BSWAP_64(drrb->drr_magic);
-               drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
-               drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
-               drrb->drr_type = BSWAP_32(drrb->drr_type);
-               drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
-               drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
+       } else {
+               return (SET_ERROR(EINVAL));
         }
  
         drba.drba_origin = origin;
         drba.drba_cookie = drc;
         drba.drba_cred = CRED();
  
-       return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync,
-           &drba, 5, ZFS_SPACE_CHECK_NORMAL));
+       if (DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) &
+           DMU_BACKUP_FEATURE_RESUMING) {
+               return (dsl_sync_task(tofs,
+                   dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync,
+                   &drba, 5, ZFS_SPACE_CHECK_NORMAL));
+       } else  {
+               return (dsl_sync_task(tofs,
+                   dmu_recv_begin_check, dmu_recv_begin_sync,
+                   &drba, 5, ZFS_SPACE_CHECK_NORMAL));
+       }
  }
  
  struct receive_record_arg {
@@ -1469,6 +1698,7 @@ struct receive_record_arg {
          */
         arc_buf_t *write_buf;
         int payload_size;
+       uint64_t bytes_read; /* bytes read from stream when record created */
         boolean_t eos_marker; /* Marks the end of the stream */
         bqueue_node_t node;
  };
@@ -1477,6 +1707,7 @@ struct receive_writer_arg {
         objset_t *os;
         boolean_t byteswap;
         bqueue_t q;
+
         /*
          * These three args are used to signal to the main thread that we're
          * done.
@@ -1484,15 +1715,20 @@ struct receive_writer_arg {
         kmutex_t mutex;
         kcondvar_t cv;
         boolean_t done;
+
         int err;
         /* A map from guid to dataset to help handle dedup'd streams. */
         avl_tree_t *guid_to_ds_map;
+       boolean_t resumable;
+       uint64_t last_object, last_offset;
+       uint64_t bytes_read; /* bytes read when current record created */
  };
  
  struct receive_arg  {
         objset_t *os;
         vnode_t *vp; /* The vnode to read the stream from */
         uint64_t voff; /* The current offset in the stream */
+       uint64_t bytes_read;
         /*
          * A record that has had its payload read in, but hasn't yet been handed
          * off to the worker thread.
@@ -1564,14 +1800,21 @@ receive_read(struct receive_arg *ra, int len, void *buf)
                     ra->voff, UIO_SYSSPACE, FAPPEND,
                     RLIM64_INFINITY, CRED(), &resid);
  
-               if (resid == len - done)
-                       ra->err = SET_ERROR(EINVAL);
+               if (resid == len - done) {
+                       /*
+                        * Note: ECKSUM indicates that the receive
+                        * was interrupted and can potentially be resumed.
+                        */
+                       ra->err = SET_ERROR(ECKSUM);
+               }
                 ra->voff += len - done - resid;
                 done = len - resid;
                 if (ra->err != 0)
                         return (ra->err);
         }
  
+       ra->bytes_read += len;
+
         ASSERT3U(done, ==, len);
         return (0);
  }
@@ -1675,6 +1918,43 @@ deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size)
         }
  }
  
+static void
+save_resume_state(struct receive_writer_arg *rwa,
+    uint64_t object, uint64_t offset, dmu_tx_t *tx)
+{
+       int txgoff = dmu_tx_get_txg(tx) & TXG_MASK;
+
+       if (!rwa->resumable)
+               return;
+
+       /*
+        * We use ds_resume_bytes[] != 0 to indicate that we need to
+        * update this on disk, so it must not be 0.
+        */
+       ASSERT(rwa->bytes_read != 0);
+
+       /*
+        * We only resume from write records, which have a valid
+        * (non-meta-dnode) object number.
+        */
+       ASSERT(object != 0);
+
+       /*
+        * For resuming to work correctly, we must receive records in order,
+        * sorted by object,offset.  This is checked by the callers, but
+        * assert it here for good measure.
+        */
+       ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]);
+       ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] ||
+           offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]);
+       ASSERT3U(rwa->bytes_read, >=,
+           rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]);
+
+       rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object;
+       rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset;
+       rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read;
+}
+
  noinline static int
  receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
         void *data)
@@ -1773,6 +2053,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
                 dmu_buf_rele(db, FTAG);
         }
         dmu_tx_commit(tx);
+
         return (0);
  }
  
@@ -1820,6 +2101,18 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw,
             !DMU_OT_IS_VALID(drrw->drr_type))
                 return (SET_ERROR(EINVAL));
  
+       /*
+        * For resuming to work, records must be in increasing order
+        * by (object, offset).
+        */
+       if (drrw->drr_object < rwa->last_object ||
+           (drrw->drr_object == rwa->last_object &&
+           drrw->drr_offset < rwa->last_offset)) {
+               return (SET_ERROR(EINVAL));
+       }
+       rwa->last_object = drrw->drr_object;
+       rwa->last_offset = drrw->drr_offset;
+
         if (dmu_object_info(rwa->os, drrw->drr_object, NULL) != 0)
                 return (SET_ERROR(EINVAL));
  
@@ -1842,8 +2135,17 @@ receive_write(struct receive_writer_arg *rwa, struct drr_write *drrw,
         if (dmu_bonus_hold(rwa->os, drrw->drr_object, FTAG, &bonus) != 0)
                 return (SET_ERROR(EINVAL));
         dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx);
+
+       /*
+        * Note: If the receive fails, we want the resume stream to start
+        * with the same record that we last successfully received (as opposed
+        * to the next record), so that we can verify that we are
+        * resuming from the correct location.
+        */
+       save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx);
         dmu_tx_commit(tx);
         dmu_buf_rele(bonus, FTAG);
+
         return (0);
  }
  
@@ -1902,43 +2204,48 @@ receive_write_byref(struct receive_writer_arg *rwa,
         dmu_write(rwa->os, drrwbr->drr_object,
             drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
         dmu_buf_rele(dbp, FTAG);
+
+       /* See comment in receive_write. */
+       save_resume_state(rwa, drrwbr->drr_object, drrwbr->drr_offset, tx);
         dmu_tx_commit(tx);
         return (0);
  }
  
  static int
  receive_write_embedded(struct receive_writer_arg *rwa,
-    struct drr_write_embedded *drrwnp, void *data)
+    struct drr_write_embedded *drrwe, void *data)
  {
         dmu_tx_t *tx;
         int err;
  
-       if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset)
+       if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset)
                 return (EINVAL);
  
-       if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE)
+       if (drrwe->drr_psize > BPE_PAYLOAD_SIZE)
                 return (EINVAL);
  
-       if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES)
+       if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES)
                 return (EINVAL);
-       if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
+       if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS)
                 return (EINVAL);
  
         tx = dmu_tx_create(rwa->os);
  
-       dmu_tx_hold_write(tx, drrwnp->drr_object,
-           drrwnp->drr_offset, drrwnp->drr_length);
+       dmu_tx_hold_write(tx, drrwe->drr_object,
+           drrwe->drr_offset, drrwe->drr_length);
         err = dmu_tx_assign(tx, TXG_WAIT);
         if (err != 0) {
                 dmu_tx_abort(tx);
                 return (err);
         }
  
-       dmu_write_embedded(rwa->os, drrwnp->drr_object,
-           drrwnp->drr_offset, data, drrwnp->drr_etype,
-           drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize,
+       dmu_write_embedded(rwa->os, drrwe->drr_object,
+           drrwe->drr_offset, data, drrwe->drr_etype,
+           drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize,
             rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx);
  
+       /* See comment in receive_write. */
+       save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx);
         dmu_tx_commit(tx);
         return (0);
  }
@@ -2012,10 +2319,16 @@ receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf)
  static void
  dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc)
  {
-       char name[MAXNAMELEN];
-       dsl_dataset_name(drc->drc_ds, name);
-       dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
-       (void) dsl_destroy_head(name);
+       if (drc->drc_resumable) {
+               /* wait for our resume state to be written to disk */
+               txg_wait_synced(drc->drc_ds->ds_dir->dd_pool, 0);
+               dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
+       } else {
+               char name[MAXNAMELEN];
+               dsl_dataset_name(drc->drc_ds, name);
+               dsl_dataset_disown(drc->drc_ds, dmu_recv_tag);
+               (void) dsl_destroy_head(name);
+       }
  }
  
  static void
@@ -2044,12 +2357,17 @@ receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf)
  
         if (len != 0) {
                 ASSERT3U(len, <=, SPA_MAXBLOCKSIZE);
-               ra->rrd->payload = buf;
-               ra->rrd->payload_size = len;
-               err = receive_read(ra, len, ra->rrd->payload);
+               err = receive_read(ra, len, buf);
                 if (err != 0)
                         return (err);
-               receive_cksum(ra, len, ra->rrd->payload);
+               receive_cksum(ra, len, buf);
+
+               /* note: rrd is NULL when reading the begin record's payload */
+               if (ra->rrd != NULL) {
+                       ra->rrd->payload = buf;
+                       ra->rrd->payload_size = len;
+                       ra->rrd->bytes_read = ra->bytes_read;
+               }
         }
  
         ra->prev_cksum = ra->cksum;
@@ -2057,6 +2375,7 @@ receive_read_payload_and_next_header(struct receive_arg *ra, int len, void *buf)
         ra->next_rrd = kmem_zalloc(sizeof (*ra->next_rrd), KM_SLEEP);
         err = receive_read(ra, sizeof (ra->next_rrd->header),
             &ra->next_rrd->header);
+       ra->next_rrd->bytes_read = ra->bytes_read;
         if (err != 0) {
                 kmem_free(ra->next_rrd, sizeof (*ra->next_rrd));
                 ra->next_rrd = NULL;
@@ -2236,7 +2555,7 @@ receive_read_record(struct receive_arg *ra)
         {
                 struct drr_end *drre = &ra->rrd->header.drr_u.drr_end;
                 if (!ZIO_CHECKSUM_EQUAL(ra->prev_cksum, drre->drr_checksum))
-                       return (SET_ERROR(EINVAL));
+                       return (SET_ERROR(ECKSUM));
                 return (0);
         }
         case DRR_SPILL:
@@ -2263,6 +2582,10 @@ receive_process_record(struct receive_writer_arg *rwa,
  {
         int err;
  
+       /* Processing in order, therefore bytes_read should be increasing. */
+       ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read);
+       rwa->bytes_read = rrd->bytes_read;
+
         switch (rrd->header.drr_type) {
         case DRR_OBJECT:
         {
@@ -2357,6 +2680,33 @@ receive_writer_thread(void *arg)
         mutex_exit(&rwa->mutex);
  }
  
+static int
+resume_check(struct receive_arg *ra, nvlist_t *begin_nvl)
+{
+       uint64_t val;
+       objset_t *mos = dmu_objset_pool(ra->os)->dp_meta_objset;
+       uint64_t dsobj = dmu_objset_id(ra->os);
+       uint64_t resume_obj, resume_off;
+
+       if (nvlist_lookup_uint64(begin_nvl,
+           "resume_object", &resume_obj) != 0 ||
+           nvlist_lookup_uint64(begin_nvl,
+           "resume_offset", &resume_off) != 0) {
+               return (SET_ERROR(EINVAL));
+       }
+       VERIFY0(zap_lookup(mos, dsobj,
+           DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val));
+       if (resume_obj != val)
+               return (SET_ERROR(EINVAL));
+       VERIFY0(zap_lookup(mos, dsobj,
+           DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val));
+       if (resume_off != val)
+               return (SET_ERROR(EINVAL));
+
+       return (0);
+}
+
+
  /*
   * Read in the stream's records, one by one, and apply them to the pool.  There
   * are two threads involved; the thread that calls this function will spin up a
@@ -2378,6 +2728,9 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
         struct receive_writer_arg *rwa;
         int featureflags;
         struct receive_ign_obj_node *n;
+       uint32_t payloadlen;
+       void *payload;
+       nvlist_t *begin_nvl = NULL;
  
         ra = kmem_zalloc(sizeof (*ra), KM_SLEEP);
         rwa = kmem_zalloc(sizeof (*rwa), KM_SLEEP);
@@ -2386,6 +2739,13 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
         ra->cksum = drc->drc_cksum;
         ra->vp = vp;
         ra->voff = *voffp;
+
+       if (dsl_dataset_is_zapified(drc->drc_ds)) {
+               (void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset,
+                   drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES,
+                   sizeof (ra->bytes_read), 1, &ra->bytes_read);
+       }
+
         list_create(&ra->ignore_obj_list, sizeof (struct receive_ign_obj_node),
                 offsetof(struct receive_ign_obj_node, node));
  
@@ -2438,9 +2798,29 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
                 drc->drc_guid_to_ds_map = rwa->guid_to_ds_map;
         }
  
-       err = receive_read_payload_and_next_header(ra, 0, NULL);
-       if (err)
+       payloadlen = drc->drc_drr_begin->drr_payloadlen;
+       payload = NULL;
+       if (payloadlen != 0)
+               payload = kmem_alloc(payloadlen, KM_SLEEP);
+
+       err = receive_read_payload_and_next_header(ra, payloadlen, payload);
+       if (err != 0) {
+               if (payloadlen != 0)
+                       kmem_free(payload, payloadlen);
                 goto out;
+       }
+       if (payloadlen != 0) {
+               err = nvlist_unpack(payload, payloadlen, &begin_nvl, KM_SLEEP);
+               kmem_free(payload, payloadlen);
+               if (err != 0)
+                       goto out;
+       }
+
+       if (featureflags & DMU_BACKUP_FEATURE_RESUMING) {
+               err = resume_check(ra, begin_nvl);
+               if (err != 0)
+                       goto out;
+       }
  
         (void) bqueue_init(&rwa->q, zfs_recv_queue_length,
             offsetof(struct receive_record_arg, node));
@@ -2448,6 +2828,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
         mutex_init(&rwa->mutex, NULL, MUTEX_DEFAULT, NULL);
         rwa->os = ra->os;
         rwa->byteswap = drc->drc_byteswap;
+       rwa->resumable = drc->drc_resumable;
  
         (void) thread_create(NULL, 0, receive_writer_thread, rwa, 0, curproc,
             TS_RUN, minclsyspri);
@@ -2461,7 +2842,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
          * We can leave this loop in 3 ways:  First, if rwa->err is
          * non-zero.  In that case, the writer thread will free the rrd we just
          * pushed.  Second, if  we're interrupted; in that case, either it's the
-        * first loop and ra->rrd was never allocated, or it's later, and ra.rrd
+        * first loop and ra->rrd was never allocated, or it's later and ra->rrd
          * has been handed off to the writer thread who will free it.  Finally,
          * if receive_read_record fails or we're at the end of the stream, then
          * we free ra->rrd and exit.
@@ -2506,13 +2887,15 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
                 err = rwa->err;
  
  out:
+       nvlist_free(begin_nvl);
         if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
                 zfs_onexit_fd_rele(cleanup_fd);
  
         if (err != 0) {
                 /*
-                * destroy what we created, so we don't leave it in the
-                * inconsistent restoring state.
+                * Clean up references. If receive is not resumable,
+                * destroy what we created, so we don't leave it in
+                * the inconsistent state.
                  */
                 dmu_recv_cleanup_ds(drc);
         }
@@ -2674,6 +3057,20 @@ dmu_recv_end_sync(void *arg, dmu_tx_t *tx)
  
                 dmu_buf_will_dirty(ds->ds_dbuf, tx);
                 dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT;
+               if (dsl_dataset_has_resume_receive_state(ds)) {
+                       (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+                           DS_FIELD_RESUME_FROMGUID, tx);
+                       (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+                           DS_FIELD_RESUME_OBJECT, tx);
+                       (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+                           DS_FIELD_RESUME_OFFSET, tx);
+                       (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+                           DS_FIELD_RESUME_BYTES, tx);
+                       (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+                           DS_FIELD_RESUME_TOGUID, tx);
+                       (void) zap_remove(dp->dp_meta_objset, ds->ds_object,
+                           DS_FIELD_RESUME_TONAME, tx);
+               }
         }
         drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj;
         zvol_create_minors(dp->dp_spa, drc->drc_tofs, B_TRUE);
diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c

index 44ba74181c46ae7ff1faf374351ea81a19357586..0df12fac8c36777c21d27239e9cfbde4012a7e6c 100644 (file)
--- a/module/zfs/dmu_traverse.c
+++ b/module/zfs/dmu_traverse.c
@@ -47,6 +47,7 @@ typedef struct prefetch_data {
         int pd_flags;
         boolean_t pd_cancel;
         boolean_t pd_exited;
+       zbookmark_phys_t pd_resume;
  } prefetch_data_t;
  
  typedef struct traverse_data {
@@ -323,30 +324,29 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
                 uint32_t flags = ARC_FLAG_WAIT;
                 int32_t i;
                 int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;
-               dnode_phys_t *cdnp;
+               dnode_phys_t *child_dnp;
  
                 err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
                     ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
                 if (err != 0)
                         goto post;
-               cdnp = buf->b_data;
+               child_dnp = buf->b_data;
  
-               for (i = 0; i < epb; i += cdnp[i].dn_extra_slots + 1) {
-                       prefetch_dnode_metadata(td, &cdnp[i], zb->zb_objset,
-                           zb->zb_blkid * epb + i);
+               for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
+                       prefetch_dnode_metadata(td, &child_dnp[i],
+                           zb->zb_objset, zb->zb_blkid * epb + i);
                 }
  
                 /* recursively visitbp() blocks below this */
-               for (i = 0; i < epb; i += cdnp[i].dn_extra_slots + 1) {
-                       err = traverse_dnode(td, &cdnp[i], zb->zb_objset,
-                           zb->zb_blkid * epb + i);
+               for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) {
+                       err = traverse_dnode(td, &child_dnp[i],
+                           zb->zb_objset, zb->zb_blkid * epb + i);
                         if (err != 0)
                                 break;
                 }
         } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
                 arc_flags_t flags = ARC_FLAG_WAIT;
                 objset_phys_t *osp;
-               dnode_phys_t *mdnp, *gdnp, *udnp;
  
                 err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf,
                     ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
@@ -354,11 +354,7 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
                         goto post;
  
                 osp = buf->b_data;
-               mdnp = &osp->os_meta_dnode;
-               gdnp = &osp->os_groupused_dnode;
-               udnp = &osp->os_userused_dnode;
-
-               prefetch_dnode_metadata(td, mdnp, zb->zb_objset,
+               prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset,
                     DMU_META_DNODE_OBJECT);
                 /*
                  * See the block comment above for the goal of this variable.
@@ -370,21 +366,21 @@ traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
                         td->td_realloc_possible = B_FALSE;
  
                 if (arc_buf_size(buf) >= sizeof (objset_phys_t)) {
-                       prefetch_dnode_metadata(td, gdnp, zb->zb_objset,
-                           DMU_GROUPUSED_OBJECT);
-                       prefetch_dnode_metadata(td, udnp, zb->zb_objset,
-                           DMU_USERUSED_OBJECT);
+                       prefetch_dnode_metadata(td, &osp->os_groupused_dnode,
+                           zb->zb_objset, DMU_GROUPUSED_OBJECT);
+                       prefetch_dnode_metadata(td, &osp->os_userused_dnode,
+                           zb->zb_objset, DMU_USERUSED_OBJECT);
                 }
  
-               err = traverse_dnode(td, mdnp, zb->zb_objset,
+               err = traverse_dnode(td, &osp->os_meta_dnode, zb->zb_objset,
                     DMU_META_DNODE_OBJECT);
                 if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
-                       err = traverse_dnode(td, gdnp, zb->zb_objset,
-                           DMU_GROUPUSED_OBJECT);
+                       err = traverse_dnode(td, &osp->os_groupused_dnode,
+                           zb->zb_objset, DMU_GROUPUSED_OBJECT);
                 }
                 if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
-                       err = traverse_dnode(td, udnp, zb->zb_objset,
-                           DMU_USERUSED_OBJECT);
+                       err = traverse_dnode(td, &osp->os_userused_dnode,
+                           zb->zb_objset, DMU_USERUSED_OBJECT);
                 }
         }
  
@@ -416,9 +412,15 @@ post:
                  * Set the bookmark to the first level-0 block that we need
                  * to visit.  This way, the resuming code does not need to
                  * deal with resuming from indirect blocks.
+                *
+                * Note, if zb_level <= 0, dnp may be NULL, so we don't want
+                * to dereference it.
                  */
-               td->td_resume->zb_blkid = zb->zb_blkid <<
-                   (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
+               td->td_resume->zb_blkid = zb->zb_blkid;
+               if (zb->zb_level > 0) {
+                       td->td_resume->zb_blkid <<= zb->zb_level *
+                           (dnp->dn_indblkshift - SPA_BLKPTRSHIFT);
+               }
                 td->td_paused = B_TRUE;
         }
  
@@ -450,6 +452,10 @@ traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
         int j, err = 0;
         zbookmark_phys_t czb;
  
+       if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL &&
+           object < td->td_resume->zb_object)
+               return (0);
+
         if (td->td_flags & TRAVERSE_PRE) {
                 SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL,
                     ZB_DNODE_BLKID);
@@ -527,6 +533,7 @@ traverse_prefetch_thread(void *arg)
         td.td_func = traverse_prefetcher;
         td.td_arg = td_main->td_pfd;
         td.td_pfd = NULL;
+       td.td_resume = &td_main->td_pfd->pd_resume;
  
         SET_BOOKMARK(&czb, td.td_objset,
             ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
@@ -556,12 +563,6 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
         ASSERT(ds == NULL || objset == ds->ds_object);
         ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST));
  
-       /*
-        * The data prefetching mechanism (the prefetch thread) is incompatible
-        * with resuming from a bookmark.
-        */
-       ASSERT(resume == NULL || !(flags & TRAVERSE_PREFETCH_DATA));
-
         td = kmem_alloc(sizeof (traverse_data_t), KM_SLEEP);
         pd = kmem_zalloc(sizeof (prefetch_data_t), KM_SLEEP);
         czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP);
@@ -586,6 +587,8 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
         }
  
         pd->pd_flags = flags;
+       if (resume != NULL)
+               pd->pd_resume = *resume;
         mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL);
         cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL);
  
@@ -638,11 +641,19 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
   * in syncing context).
   */
  int
-traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
-    blkptr_cb_t func, void *arg)
+traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start,
+    zbookmark_phys_t *resume,
+    int flags, blkptr_cb_t func, void *arg)
  {
         return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object,
-           &dsl_dataset_phys(ds)->ds_bp, txg_start, NULL, flags, func, arg));
+           &dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg));
+}
+
+int
+traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start,
+    int flags, blkptr_cb_t func, void *arg)
+{
+       return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg));
  }
  
  int
@@ -675,7 +686,7 @@ traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
  
         /* visit each dataset */
         for (obj = 1; err == 0;
-           err = dmu_object_next(mos, &obj, FALSE, txg_start)) {
+           err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) {
                 dmu_object_info_t doi;
  
                 err = dmu_object_info(mos, obj, &doi);
diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c

index 9c275b234ca8ddb78b44b6099a86ab7bd37b6603..e86e6cc731eceb7683f2ce4c2080bef90279d598 100644 (file)
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@ -52,6 +52,9 @@
  #include <sys/dsl_userhold.h>
  #include <sys/dsl_bookmark.h>
  #include <sys/policy.h>
+#include <sys/dmu_send.h>
+#include <sys/zio_compress.h>
+#include <zfs_fletcher.h>
  
  /*
   * The SPA supports block sizes up to 16MB.  However, very large blocks
@@ -704,6 +707,7 @@ dsl_dataset_tryown(dsl_dataset_t *ds, void *tag)
  {
         boolean_t gotit = FALSE;
  
+       ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
         mutex_enter(&ds->ds_lock);
         if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) {
                 ds->ds_owner = tag;
@@ -714,6 +718,16 @@ dsl_dataset_tryown(dsl_dataset_t *ds, void *tag)
         return (gotit);
  }
  
+boolean_t
+dsl_dataset_has_owner(dsl_dataset_t *ds)
+{
+       boolean_t rv;
+       mutex_enter(&ds->ds_lock);
+       rv = (ds->ds_owner != NULL);
+       mutex_exit(&ds->ds_lock);
+       return (rv);
+}
+
  static void
  dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, dmu_tx_t *tx)
  {
@@ -1615,6 +1629,21 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx)
         dmu_buf_will_dirty(ds->ds_dbuf, tx);
         dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid;
  
+       if (ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] != 0) {
+               VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+                   ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1,
+                   &ds->ds_resume_object[tx->tx_txg & TXG_MASK], tx));
+               VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+                   ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1,
+                   &ds->ds_resume_offset[tx->tx_txg & TXG_MASK], tx));
+               VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
+                   ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1,
+                   &ds->ds_resume_bytes[tx->tx_txg & TXG_MASK], tx));
+               ds->ds_resume_object[tx->tx_txg & TXG_MASK] = 0;
+               ds->ds_resume_offset[tx->tx_txg & TXG_MASK] = 0;
+               ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0;
+       }
+
         dmu_objset_sync(ds->ds_objset, zio, tx);
  
         for (f = 0; f < SPA_FEATURES; f++) {
@@ -1670,6 +1699,78 @@ fail:
         nvlist_free(propval);
  }
  
+static void
+get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv)
+{
+       dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+       if (dsl_dataset_has_resume_receive_state(ds)) {
+               char *str;
+               void *packed;
+               uint8_t *compressed;
+               uint64_t val;
+               nvlist_t *token_nv = fnvlist_alloc();
+               size_t packed_size, compressed_size;
+               zio_cksum_t cksum;
+               char *propval;
+               char buf[MAXNAMELEN];
+               int i;
+
+               if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+                   DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) {
+                       fnvlist_add_uint64(token_nv, "fromguid", val);
+               }
+               if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+                   DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) {
+                       fnvlist_add_uint64(token_nv, "object", val);
+               }
+               if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+                   DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) {
+                       fnvlist_add_uint64(token_nv, "offset", val);
+               }
+               if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+                   DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) {
+                       fnvlist_add_uint64(token_nv, "bytes", val);
+               }
+               if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+                   DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) {
+                       fnvlist_add_uint64(token_nv, "toguid", val);
+               }
+               if (zap_lookup(dp->dp_meta_objset, ds->ds_object,
+                   DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) {
+                       fnvlist_add_string(token_nv, "toname", buf);
+               }
+               if (zap_contains(dp->dp_meta_objset, ds->ds_object,
+                   DS_FIELD_RESUME_EMBEDOK) == 0) {
+                       fnvlist_add_boolean(token_nv, "embedok");
+               }
+               packed = fnvlist_pack(token_nv, &packed_size);
+               fnvlist_free(token_nv);
+               compressed = kmem_alloc(packed_size, KM_SLEEP);
+
+               compressed_size = gzip_compress(packed, compressed,
+                   packed_size, packed_size, 6);
+
+               fletcher_4_native(compressed, compressed_size, &cksum);
+
+               str = kmem_alloc(compressed_size * 2 + 1, KM_SLEEP);
+               for (i = 0; i < compressed_size; i++) {
+                       (void) sprintf(str + i * 2, "%02x", compressed[i]);
+               }
+               str[compressed_size * 2] = '\0';
+               propval = kmem_asprintf("%u-%llx-%llx-%s",
+                   ZFS_SEND_RESUME_TOKEN_VERSION,
+                   (longlong_t)cksum.zc_word[0],
+                   (longlong_t)packed_size, str);
+               dsl_prop_nvlist_add_string(nv,
+                   ZFS_PROP_RECEIVE_RESUME_TOKEN, propval);
+               kmem_free(packed, packed_size);
+               kmem_free(str, compressed_size * 2 + 1);
+               kmem_free(compressed, packed_size);
+               strfree(propval);
+       }
+}
+
  void
  dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
  {
@@ -1743,6 +1844,29 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
                 }
         }
  
+       if (!dsl_dataset_is_snapshot(ds)) {
+               dsl_dataset_t *recv_ds;
+               char recvname[ZFS_MAXNAMELEN];
+
+               /*
+                * A failed "newfs" (e.g. full) resumable receive leaves
+                * the stats set on this dataset.  Check here for the prop.
+                */
+               get_receive_resume_stats(ds, nv);
+
+               /*
+                * A failed incremental resumable receive leaves the
+                * stats set on our child named "%recv".  Check the child
+                * for the prop.
+                */
+               dsl_dataset_name(ds, recvname);
+               (void) strcat(recvname, "/");
+               (void) strcat(recvname, recv_clone_name);
+               if (dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) {
+                       get_receive_resume_stats(recv_ds, nv);
+                       dsl_dataset_rele(recv_ds, FTAG);
+               }
+       }
  }
  
  void
@@ -1970,7 +2094,8 @@ dsl_dataset_rename_snapshot(const char *fsname,
   * only one long hold on the dataset.  We're not allowed to change anything here
   * so we don't permanently release the long hold or regular hold here.  We want
   * to do this only when syncing to avoid the dataset unexpectedly going away
- * when we release the long hold.
+ * when we release the long hold.  Allow a long hold to exist for volumes; this
+ * may occur when asynchronously registering the minor with the kernel.
   */
  static int
  dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
@@ -1985,7 +2110,7 @@ dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
                 dsl_dataset_long_rele(ds, owner);
         }
  
-       held = dsl_dataset_long_held(ds);
+       held = (dsl_dataset_long_held(ds) && (ds->ds_owner != zvol_tag));
  
         if (owner != NULL)
                 dsl_dataset_long_hold(ds, owner);
@@ -3391,6 +3516,23 @@ dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx)
         dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx);
  }
  
+boolean_t
+dsl_dataset_is_zapified(dsl_dataset_t *ds)
+{
+       dmu_object_info_t doi;
+
+       dmu_object_info_from_db(ds->ds_dbuf, &doi);
+       return (doi.doi_type == DMU_OTN_ZAP_METADATA);
+}
+
+boolean_t
+dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds)
+{
+       return (dsl_dataset_is_zapified(ds) &&
+           zap_contains(ds->ds_dir->dd_pool->dp_meta_objset,
+           ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0);
+}
+
  #if defined(_KERNEL) && defined(HAVE_SPL)
  #if defined(_LP64)
  module_param(zfs_max_recordsize, int, 0644);
diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c

index d7c34c9a403e12b4c90325f3e2b1d3e3693b6927..716081ba3ac333febbcc897cca7c34603cd059b5 100644 (file)
--- a/module/zfs/dsl_destroy.c
+++ b/module/zfs/dsl_destroy.c
@@ -978,9 +978,17 @@ dsl_destroy_inconsistent(const char *dsname, void *arg)
         objset_t *os;
  
         if (dmu_objset_hold(dsname, FTAG, &os) == 0) {
-               boolean_t inconsistent = DS_IS_INCONSISTENT(dmu_objset_ds(os));
+               boolean_t need_destroy = DS_IS_INCONSISTENT(dmu_objset_ds(os));
+
+               /*
+                * If the dataset is inconsistent because a resumable receive
+                * has failed, then do not destroy it.
+                */
+               if (dsl_dataset_has_resume_receive_state(dmu_objset_ds(os)))
+                       need_destroy = B_FALSE;
+
                 dmu_objset_rele(os, FTAG);
-               if (inconsistent)
+               if (need_destroy)
                         (void) dsl_destroy_head(dsname);
         }
         return (0);
diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c

index 30338ac148e0540cf94b3ecb59b0315c94640cf8..825e838470e68df1894b916820264a8a7ddea3a2 100644 (file)
--- a/module/zfs/zfs_ioctl.c
+++ b/module/zfs/zfs_ioctl.c
@@ -4005,6 +4005,7 @@ static boolean_t zfs_ioc_recv_inject_err;
   * zc_guid             force flag
   * zc_cleanup_fd       cleanup-on-exit file descriptor
   * zc_action_handle    handle for this guid/ds mapping (or zero on first call)
+ * zc_resumable                if data is incomplete assume sender will resume
   *
   * outputs:
   * zc_cookie           number of bytes read
@@ -4051,13 +4052,13 @@ zfs_ioc_recv(zfs_cmd_t *zc)
                 return (SET_ERROR(EBADF));
         }
  
-       VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+       errors = fnvlist_alloc();
  
         if (zc->zc_string[0])
                 origin = zc->zc_string;
  
         error = dmu_recv_begin(tofs, tosnap,
-           &zc->zc_begin_record, force, origin, &drc);
+           &zc->zc_begin_record, force, zc->zc_resumable, origin, &drc);
         if (error != 0)
                 goto out;
  
@@ -5182,6 +5183,8 @@ zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
   *         indicates that blocks > 128KB are permitted
   *     (optional) "embedok" -> (value ignored)
   *         presence indicates DRR_WRITE_EMBEDDED records are permitted
+ *     (optional) "resume_object" and "resume_offset" -> (uint64)
+ *         if present, resume send stream from specified object and offset.
   * }
   *
   * outnvl is unused
@@ -5197,6 +5200,8 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
         file_t *fp;
         boolean_t largeblockok;
         boolean_t embedok;
+       uint64_t resumeobj = 0;
+       uint64_t resumeoff = 0;
  
         error = nvlist_lookup_int32(innvl, "fd", &fd);
         if (error != 0)
@@ -5207,12 +5212,15 @@ zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl)
         largeblockok = nvlist_exists(innvl, "largeblockok");
         embedok = nvlist_exists(innvl, "embedok");
  
+       (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj);
+       (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff);
+
         if ((fp = getf(fd)) == NULL)
                 return (SET_ERROR(EBADF));
  
         off = fp->f_offset;
-       error = dmu_send(snapname, fromname, embedok, largeblockok,
-           fd, fp->f_vnode, &off);
+       error = dmu_send(snapname, fromname, embedok, largeblockok, fd,
+           resumeobj, resumeoff, fp->f_vnode, &off);
  
         if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0)
                 fp->f_offset = off;
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c

index 9c89493edfcf2d92cb87eb25afe70f8e88cb5a63..73277901ff16f57b77442645fea20b8535e8acee 100644 (file)
--- a/module/zfs/zvol.c
+++ b/module/zfs/zvol.c
@@ -61,7 +61,7 @@ unsigned long zvol_max_discard_blocks = 16384;
  
  static kmutex_t zvol_state_lock;
  static list_t zvol_state_list;
-static char *zvol_tag = "zvol_tag";
+void *zvol_tag = "zvol_tag";
  
  /*
   * The in-core state of each volume.
diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run

index 003c513dd711640ff662fb5cc44b76df04ec4dd6..e5594cd84cddf4ff85438c7450e0317c6746f7c3 100644 (file)
--- a/tests/runfiles/linux.run
+++ b/tests/runfiles/linux.run
@@ -146,14 +146,13 @@ tests = ['zfs_promote_001_pos', 'zfs_promote_002_pos', 'zfs_promote_003_pos',
  tests = []
  
  # DISABLED:
-# zfs_receive_003_pos - needs investigation
-# zfs_receive_010_pos - needs investigation
-# zfs_receive_011_pos - needs investigation
-# zfs_receive_012_pos - needs investigation
+# zfs_receive_004_neg - Fails for OpenZFS on illumos
+# zfs_receive_011_pos - Requires port of OpenZFS 6562
+# zfs_receive_012_pos - Requires port of OpenZFS 6562
  [tests/functional/cli_root/zfs_receive]
-tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_005_neg',
-    'zfs_receive_006_pos', 'zfs_receive_007_neg', 'zfs_receive_008_pos',
-    'zfs_receive_009_neg']
+tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_003_pos',
+    'zfs_receive_005_neg', 'zfs_receive_006_pos',
+    'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg']
  
  # DISABLED:
  # zfs_rename_002_pos - needs investigation
@@ -175,11 +174,10 @@ tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos']
  [tests/functional/cli_root/zfs_rollback]
  tests = ['zfs_rollback_003_neg', 'zfs_rollback_004_neg']
  
-# DISABLED:
-# zfs_send_007_pos - needs investigation
  [tests/functional/cli_root/zfs_send]
  tests = ['zfs_send_001_pos', 'zfs_send_002_pos', 'zfs_send_003_pos',
-    'zfs_send_004_neg', 'zfs_send_005_pos', 'zfs_send_006_pos']
+    'zfs_send_004_neg', 'zfs_send_005_pos', 'zfs_send_006_pos',
+    'zfs_send_007_pos']
  
  # DISABLED:
  # mountpoint_003_pos - needs investigation
@@ -207,10 +205,11 @@ tests = ['cache_001_pos', 'cache_002_neg', 'canmount_001_pos',
  
  # DISABLED:
  # zfs_snapshot_008_neg - nested pools
+# zfs_snapshot_009_pos - Fails for OpenZFS on illumos
  [tests/functional/cli_root/zfs_snapshot]
  tests = ['zfs_snapshot_001_neg', 'zfs_snapshot_002_neg',
      'zfs_snapshot_003_neg', 'zfs_snapshot_004_neg', 'zfs_snapshot_005_neg',
-    'zfs_snapshot_006_pos', 'zfs_snapshot_007_neg', 'zfs_snapshot_009_pos']
+    'zfs_snapshot_006_pos', 'zfs_snapshot_007_neg']
  
  # DISABLED:
  # zfs_unmount_005_pos - needs investigation
@@ -565,12 +564,17 @@ tests = ['reservation_001_pos', 'reservation_002_pos', 'reservation_003_pos',
  #[tests/functional/rootpool]
  #tests = ['rootpool_002_neg', 'rootpool_003_neg', 'rootpool_007_neg']
  
-# DISABLED: Hangs on I/O for unclear reason.
-#[tests/functional/rsend]
-#tests = ['rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos',
-#    'rsend_005_pos', 'rsend_006_pos', 'rsend_007_pos', 'rsend_008_pos',
-#    'rsend_009_pos', 'rsend_010_pos', 'rsend_011_pos', 'rsend_012_pos',
-#    'rsend_013_pos']
+# DISABLED:
+# rsend_008_pos - Fails for OpenZFS on illumos
+# rsend_009_pos - Fails for OpenZFS on illumos
+# rsend_020_pos - ASSERTs in dump_record()
+[tests/functional/rsend]
+tests = ['rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos',
+    'rsend_005_pos', 'rsend_006_pos', 'rsend_007_pos',
+    'rsend_010_pos', 'rsend_011_pos', 'rsend_012_pos',
+    'rsend_013_pos', 'rsend_014_pos',
+    'rsend_019_pos',
+    'rsend_021_pos', 'rsend_022_pos', 'rsend_024_pos']
  
  [tests/functional/scrub_mirror]
  tests = ['scrub_mirror_001_pos', 'scrub_mirror_002_pos',
@@ -586,17 +590,17 @@ tests = ['slog_001_pos', 'slog_002_pos', 'slog_003_pos', 'slog_004_pos',
      'slog_009_neg', 'slog_010_neg', 'slog_011_neg']
  
  # DISABLED:
+# clone_001_pos - nested pools
  # rollback_003_pos - Hangs in unmount and spins.
-# snapshot_013_pos - Hangs on I/O for unclear reason.
-# snapshot_016_pos - .zfs mv/rmdir/mkdir disabled by default.
-#[tests/functional/snapshot]
-#tests = ['clone_001_pos', 'rollback_001_pos', 'rollback_002_pos',
-#    'snapshot_001_pos', 'snapshot_002_pos',
-#    'snapshot_003_pos', 'snapshot_004_pos', 'snapshot_005_pos',
-#    'snapshot_006_pos', 'snapshot_007_pos', 'snapshot_008_pos',
-#    'snapshot_009_pos', 'snapshot_010_pos', 'snapshot_011_pos',
-#    'snapshot_012_pos', 'snapshot_014_pos',
-#    'snapshot_015_pos', 'snapshot_017_pos']
+# snapshot_016_pos - Problem with automount
+[tests/functional/snapshot]
+tests = ['rollback_001_pos', 'rollback_002_pos',
+    'snapshot_001_pos', 'snapshot_002_pos',
+    'snapshot_003_pos', 'snapshot_004_pos', 'snapshot_005_pos',
+    'snapshot_006_pos', 'snapshot_007_pos', 'snapshot_008_pos',
+    'snapshot_009_pos', 'snapshot_010_pos', 'snapshot_011_pos',
+    'snapshot_012_pos', 'snapshot_013_pos', 'snapshot_014_pos',
+    'snapshot_015_pos', 'snapshot_017_pos']
  [tests/functional/snapused]
  tests = ['snapused_001_pos', 'snapused_002_pos', 'snapused_003_pos',
      'snapused_004_pos', 'snapused_005_pos']
diff --git a/tests/zfs-tests/cmd/mktree/mktree.c b/tests/zfs-tests/cmd/mktree/mktree.c

index 8f9b38578c11bc28aadb8405054f10f14c43d28c..95d31a6527d12e879a54edab7f8da7276299c226 100644 (file)
--- a/tests/zfs-tests/cmd/mktree/mktree.c
+++ b/tests/zfs-tests/cmd/mktree/mktree.c
@@ -172,7 +172,7 @@ crtfile(char *pname)
                 exit(errno);
         }
  
-       if (fsetxattr(fd, "xattr", pbuf, 1024, 0) < 0) {
+       if (fsetxattr(fd, "user.xattr", pbuf, 1024, 0) < 0) {
                 (void) fprintf(stderr, "fsetxattr(fd, \"xattr\", pbuf, "
                     "1024, 0) failed.\n[%d]: %s.\n", errno, strerror(errno));
                 exit(errno);
diff --git a/tests/zfs-tests/include/commands.cfg.in b/tests/zfs-tests/include/commands.cfg.in

index 823ee9679e7a86ecccb10ff446f9800e1b764294..bea8df62931cc1f7c56e3b0a081f8f0966fed543 100644 (file)
--- a/tests/zfs-tests/include/commands.cfg.in
+++ b/tests/zfs-tests/include/commands.cfg.in
@@ -85,6 +85,7 @@ export SHARE="@SHARE@"
  export SHUF="@SHUF@"
  export SLEEP="@SLEEP@"
  export SORT="@SORT@"
+export STAT="@STAT@"
  export STRINGS="@STRINGS@"
  export SU="@SU@"
  export SUM="@SUM@"
@@ -97,8 +98,8 @@ export TAIL="@TAIL@"
  export TAR="@TAR@"
  export TOUCH="@TOUCH@"
  export TR="@TR@"
-export TRUE="@TRUE@"
  export TRUNCATE="@TRUNCATE@"
+export TRUE="@TRUE@"
  export UDEVADM="@UDEVADM@"
  export UFSDUMP="@UFSDUMP@"
  export UFSRESTORE="@UFSRESTORE@"
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/Makefile.am b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/Makefile.am

index 533acc695bd4ebb9a559eaae4b68de6b2974b7f6..f5857f4a483931978e3b6546a8d49ae5166bb9f3 100644 (file)
--- a/tests/zfs-tests/tests/functional/cli_root/zfs_receive/Makefile.am
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_receive/Makefile.am
@@ -10,4 +10,7 @@ dist_pkgdata_SCRIPTS = \
         zfs_receive_006_pos.ksh \
         zfs_receive_007_neg.ksh \
         zfs_receive_008_pos.ksh \
-       zfs_receive_009_neg.ksh
+       zfs_receive_009_neg.ksh \
+       zfs_receive_010_pos.ksh \
+       zfs_receive_011_pos.ksh \
+       zfs_receive_012_pos.ksh
diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_007_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_007_pos.ksh

index 13ae4f024810ecd29ca8262ecc268cb204f0bc52..308903d9700e756e14441a6639f668f74d5b38a1 100755 (executable)
--- a/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_007_pos.ksh
+++ b/tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_007_pos.ksh
@@ -61,7 +61,7 @@ test_pool ()
         first_object=$(ls -i $mntpnt | awk '{print $1}')
         log_must $ZFS snapshot $POOL/fs@a
         while true; do
-               log_must $FIND $mntpnt -delete
+               log_must $FIND $mntpnt/* -delete
                 sync
                 log_must $MKFILES "$mntpnt/" 4000
                 FILE=$(ls -i $mntpnt | awk \
diff --git a/tests/zfs-tests/tests/functional/rsend/rsend.kshlib b/tests/zfs-tests/tests/functional/rsend/rsend.kshlib

index 91779cc7862edbd7d627a9d5c1cc17e420639b67..4cdf6b7310b7364c768adb5d55dea38cca033f47 100644 (file)
--- a/tests/zfs-tests/tests/functional/rsend/rsend.kshlib
+++ b/tests/zfs-tests/tests/functional/rsend/rsend.kshlib
@@ -74,9 +74,11 @@ function setup_test_model
         if is_global_zone ; then
                 log_must $ZFS create -V 16M $pool/vol
                 log_must $ZFS create -V 16M $pool/$FS/vol
+               block_device_wait
  
                 log_must $ZFS snapshot $pool/$FS/vol@vsnap
                 log_must $ZFS clone $pool/$FS/vol@vsnap $pool/$FS/vclone
+               block_device_wait
         fi
  
         log_must snapshot_tree $pool/$FS/fs1/fs2@fsnap
@@ -199,10 +201,10 @@ function cmp_ds_prop
         typeset dtst1=$1
         typeset dtst2=$2
  
-       for item in "type" "origin" "volblocksize" "aclinherit" "aclmode" \
+       for item in "type" "origin" "volblocksize" "aclinherit" "acltype" \
             "atime" "canmount" "checksum" "compression" "copies" "devices" \
             "dnodesize" "exec" "quota" "readonly" "recordsize" "reservation" \
-           "setuid" "sharenfs" "snapdir" "version" "volsize" "xattr" "zoned" \
+           "setuid" "snapdir" "version" "volsize" "xattr" "zoned" \
             "mountpoint";
         do
                 $ZFS get -H -o property,value,source $item $dtst1 >> \
@@ -393,7 +395,7 @@ function mk_files
         for ((i=0; i<$nfiles; i=i+1)); do
                 $DD if=/dev/urandom \
                     of=/$fs/file-$maxsize-$((i+$file_id_offset)) \
-                   bs=$(($RANDOM * $RANDOM % $maxsize)) \
+                   bs=$((($RANDOM * $RANDOM % ($maxsize - 1)) + 1)) \
                     count=1 >/dev/null 2>&1 || log_fail \
                     "Failed to create /$fs/file-$maxsize-$((i+$file_id_offset))"
         done
@@ -438,7 +440,7 @@ function mess_file
                 # write the same value that's already there.
                 #
                 log_must eval "$DD if=/dev/urandom of=$file conv=notrunc " \
-                   "bs=1 count=2 oseek=$offset >/dev/null 2>&1"
+                   "bs=1 count=2 seek=$offset >/dev/null 2>&1"
         else
                 log_must $TRUNCATE -s $offset $file
         fi
@@ -523,20 +525,20 @@ function test_fs_setup
                 mk_files 100 1048576 0 $sendfs &
                 mk_files 10 10485760 0 $sendfs &
                 mk_files 1 104857600 0 $sendfs &
-               log_must $WAIT
+               wait
                 log_must $ZFS snapshot $sendfs@a
  
                 rm_files 200 256 0 $sendfs &
                 rm_files 200 131072 0 $sendfs &
                 rm_files 20 1048576 0 $sendfs &
                 rm_files 2 10485760 0 $sendfs &
-               log_must $WAIT
+               wait
  
                 mk_files 400 256 0 $sendfs &
                 mk_files 400 131072 0 $sendfs &
                 mk_files 40 1048576 0 $sendfs &
                 mk_files 4 10485760 0 $sendfs &
-               log_must $WAIT
+               wait
  
                 log_must $ZFS snapshot $sendfs@b
                 log_must eval "$ZFS send -v $sendfs@a >/$sendpool/initial.zsend"
diff --git a/tests/zfs-tests/tests/functional/rsend/rsend_012_pos.ksh b/tests/zfs-tests/tests/functional/rsend/rsend_012_pos.ksh

index 91cdd6e34da0c0d749389bc7b255569d187e9629..a7698da691458a51be2dbe57eeb460fce0d9941b 100755 (executable)
--- a/tests/zfs-tests/tests/functional/rsend/rsend_012_pos.ksh
+++ b/tests/zfs-tests/tests/functional/rsend/rsend_012_pos.ksh
@@ -110,9 +110,6 @@ function cleanup
                 log_must $ZFS inherit $prop $POOL2
         done
  
-       #if is_shared $POOL; then
-       #       log_must $ZFS set sharenfs=off $POOL
-       #fi
         log_must setup_test_model $POOL
  
         if [[ -d $TESTDIR ]]; then
@@ -131,7 +128,7 @@ for fs in "$POOL" "$POOL/pclone" "$POOL/$FS" "$POOL/$FS/fs1" \
         "$POOL/$FS/fs1/fs2" "$POOL/$FS/fs1/fclone" ; do
         rand_set_prop $fs aclinherit "discard" "noallow" "secure" "passthrough"
         rand_set_prop $fs checksum "on" "off" "fletcher2" "fletcher4" "sha256"
-       rand_set_prop $fs aclmode "discard" "groupmask" "passthrough"
+       rand_set_prop $fs acltype "off" "noacl" "posixacl"
         rand_set_prop $fs atime "on" "off"
         rand_set_prop $fs checksum "on" "off" "fletcher2" "fletcher4" "sha256"
         rand_set_prop $fs compression "on" "off" "lzjb" "gzip" \
@@ -161,7 +158,8 @@ done
  
  
  # Verify inherited property can be received
-rand_set_prop $POOL sharenfs "on" "off" "rw"
+rand_set_prop $POOL redundant_metadata "all" "most"
+rand_set_prop $POOL sync "standard" "always" "disabled"
  
  #
  # Duplicate POOL2 for testing
diff --git a/tests/zfs-tests/tests/functional/rsend/rsend_014_pos.ksh b/tests/zfs-tests/tests/functional/rsend/rsend_014_pos.ksh

old mode 100644 (file)

new mode 100755 (executable)

index b6cbb1c..6857681
--- a/tests/zfs-tests/tests/functional/rsend/rsend_014_pos.ksh
+++ b/tests/zfs-tests/tests/functional/rsend/rsend_014_pos.ksh
@@ -46,6 +46,7 @@ log_must cleanup_pool $POOL2
  
  log_must eval "$ZFS send -R $POOL/$FS@final > $BACKDIR/fs-final-R"
  log_must eval "$ZFS receive -d $POOL2 < $BACKDIR/fs-final-R"
+block_device_wait
  log_must eval "$ZPOOL export $POOL"
  log_must eval "$ZPOOL import $POOL"
  
diff --git a/tests/zfs-tests/tests/functional/rsend/rsend_019_pos.ksh b/tests/zfs-tests/tests/functional/rsend/rsend_019_pos.ksh

old mode 100644 (file)

new mode 100755 (executable)
diff --git a/tests/zfs-tests/tests/functional/rsend/rsend_020_pos.ksh b/tests/zfs-tests/tests/functional/rsend/rsend_020_pos.ksh

old mode 100644 (file)

new mode 100755 (executable)
diff --git a/tests/zfs-tests/tests/functional/rsend/rsend_021_pos.ksh b/tests/zfs-tests/tests/functional/rsend/rsend_021_pos.ksh

old mode 100644 (file)

new mode 100755 (executable)
diff --git a/tests/zfs-tests/tests/functional/rsend/rsend_022_pos.ksh b/tests/zfs-tests/tests/functional/rsend/rsend_022_pos.ksh

old mode 100644 (file)

new mode 100755 (executable)
diff --git a/tests/zfs-tests/tests/functional/rsend/rsend_024_pos.ksh b/tests/zfs-tests/tests/functional/rsend/rsend_024_pos.ksh

old mode 100644 (file)

new mode 100755 (executable)
diff --git a/zfs-script-config.sh.in b/zfs-script-config.sh.in

index e22cfd595a3147f229b7262364d6995b4d7730d6..0a85c5fef6d488df4f1f7dab28c0eae916b363d8 100644 (file)
--- a/zfs-script-config.sh.in
+++ b/zfs-script-config.sh.in
@@ -50,7 +50,7 @@ export FILE_WRITE=${TESTSDIR}/zfs-tests/cmd/file_write/file_write
  export LARGEST_FILE=${TESTSDIR}/zfs-tests/cmd/largest_file/largest_file
  export MKBUSY=${TESTSDIR}/zfs-tests/cmd/mkbusy/mkbusy
  export MKFILE=${TESTSDIR}/zfs-tests/cmd/mkfile/mkfile
-export MKFILES=${TESTSDIR}/zfs-tests/cmd/mkfile/mkfiles
+export MKFILES=${TESTSDIR}/zfs-tests/cmd/mkfiles/mkfiles
  export MKTREE=${TESTSDIR}/zfs-tests/cmd/mktree/mktree
  export MMAP_EXEC=${TESTSDIR}/zfs-tests/cmd/mmap_exec/mmap_exec
  export MMAPWRITE=${TESTSDIR}/zfs-tests/cmd/mmapwrite/mmapwrite
author	Matthew Ahrens <mahrens@delphix.com>
	Wed, 6 Jan 2016 21:22:48 +0000 (22:22 +0100)
committer	Brian Behlendorf <behlendorf1@llnl.gov>
	Tue, 28 Jun 2016 20:47:02 +0000 (13:47 -0700)
cmd/zfs/zfs_main.c		patch \| blob \| blame \| history
cmd/zstreamdump/zstreamdump.c		patch \| blob \| blame \| history
config/user-commands.m4		patch \| blob \| blame \| history
include/libzfs.h		patch \| blob \| blame \| history
include/libzfs_core.h		patch \| blob \| blame \| history
include/sys/dmu_impl.h		patch \| blob \| blame \| history
include/sys/dmu_send.h		patch \| blob \| blame \| history
include/sys/dmu_traverse.h		patch \| blob \| blame \| history
include/sys/dsl_dataset.h		patch \| blob \| blame \| history
include/sys/fs/zfs.h		patch \| blob \| blame \| history
include/sys/zfs_ioctl.h		patch \| blob \| blame \| history
include/sys/zvol.h		patch \| blob \| blame \| history
lib/libzfs/libzfs_dataset.c		patch \| blob \| blame \| history
lib/libzfs/libzfs_mount.c		patch \| blob \| blame \| history
lib/libzfs/libzfs_sendrecv.c		patch \| blob \| blame \| history
lib/libzfs_core/libzfs_core.c		patch \| blob \| blame \| history
lib/libzpool/kernel.c		patch \| blob \| blame \| history
man/man8/zfs.8		patch \| blob \| blame \| history
module/zcommon/zfs_prop.c		patch \| blob \| blame \| history
module/zfs/dmu_objset.c		patch \| blob \| blame \| history
module/zfs/dmu_send.c		patch \| blob \| blame \| history
module/zfs/dmu_traverse.c		patch \| blob \| blame \| history
module/zfs/dsl_dataset.c		patch \| blob \| blame \| history
module/zfs/dsl_destroy.c		patch \| blob \| blame \| history
module/zfs/zfs_ioctl.c		patch \| blob \| blame \| history
module/zfs/zvol.c		patch \| blob \| blame \| history
tests/runfiles/linux.run		patch \| blob \| blame \| history
tests/zfs-tests/cmd/mktree/mktree.c		patch \| blob \| blame \| history
tests/zfs-tests/include/commands.cfg.in		patch \| blob \| blame \| history
tests/zfs-tests/tests/functional/cli_root/zfs_receive/Makefile.am		patch \| blob \| blame \| history
tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_007_pos.ksh		patch \| blob \| blame \| history
tests/zfs-tests/tests/functional/rsend/rsend.kshlib		patch \| blob \| blame \| history
tests/zfs-tests/tests/functional/rsend/rsend_012_pos.ksh		patch \| blob \| blame \| history
tests/zfs-tests/tests/functional/rsend/rsend_014_pos.ksh	[changed mode: 0644->0755]	patch \| blob \| blame \| history
tests/zfs-tests/tests/functional/rsend/rsend_019_pos.ksh	[changed mode: 0644->0755]	patch \| blob \| blame \| history
tests/zfs-tests/tests/functional/rsend/rsend_020_pos.ksh	[changed mode: 0644->0755]	patch \| blob \| blame \| history
tests/zfs-tests/tests/functional/rsend/rsend_021_pos.ksh	[changed mode: 0644->0755]	patch \| blob \| blame \| history
tests/zfs-tests/tests/functional/rsend/rsend_022_pos.ksh	[changed mode: 0644->0755]	patch \| blob \| blame \| history
tests/zfs-tests/tests/functional/rsend/rsend_024_pos.ksh	[changed mode: 0644->0755]	patch \| blob \| blame \| history
zfs-script-config.sh.in		patch \| blob \| blame \| history