]> git.proxmox.com Git - mirror_zfs.git/commitdiff
Fix send/recv lost spill block
author Brian Behlendorf <behlendorf1@llnl.gov>
Tue, 7 May 2019 22:18:44 +0000 (15:18 -0700)
committer GitHub <noreply@github.com>
Tue, 7 May 2019 22:18:44 +0000 (15:18 -0700)
When receiving a DRR_OBJECT record the receive_object() function
needs to determine how to handle a spill block associated with the
object.  It may need to be removed or kept depending on how the
object was modified at the source.

This determination is currently accomplished using a heuristic which
takes into account the DRR_OBJECT record and the existing object
properties.  This is a problem because there isn't quite enough
information available to do the right thing under all circumstances.
For example, when only the block size changes the spill block is
removed when it should be kept.

What's needed to resolve this is an additional flag in the DRR_OBJECT
which indicates if the object being received references a spill block.
The DRR_OBJECT_SPILL flag was added for this purpose.  When set then
the object references a spill block and it must be kept.  Either
it is up to date, or it will be replaced by a subsequent DRR_SPILL
record.  Conversely, if the object being received doesn't reference
a spill block then any existing spill block should always be removed.

Since previous versions of ZFS do not understand this new flag,
additional DRR_SPILL records will be inserted into the stream.
This has the advantage of being fully backward compatible.  Existing
ZFS systems receiving this stream will recreate the spill block if
it was incorrectly removed.  Updated ZFS versions will correctly
ignore the additional spill blocks which can be identified by
checking for the DRR_SPILL_UNMODIFIED flag.

The small downside to this approach is that it may increase the size
of the stream and of the received snapshot on previous versions of
ZFS.  Additionally, when receiving streams generated by previous
unpatched versions of ZFS, spill blocks may still be lost.

OpenZFS-issue: https://www.illumos.org/issues/9952
FreeBSD-issue: https://bugs.freebsd.org/bugzilla/show_bug.cgi?id=233277

Reviewed-by: Paul Dagnelie <pcd@delphix.com>
Reviewed-by: Matt Ahrens <mahrens@delphix.com>
Reviewed-by: Tom Caputi <tcaputi@datto.com>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
Closes #8668

19 files changed:
include/sys/dmu.h
include/sys/dmu_impl.h
include/sys/dmu_recv.h
include/sys/dnode.h
include/sys/fs/zfs.h
include/sys/zfs_ioctl.h
lib/libzfs/libzfs_sendrecv.c
man/man5/zfs-module-parameters.5
module/zfs/dbuf.c
module/zfs/dmu_object.c
module/zfs/dmu_recv.c
module/zfs/dmu_send.c
module/zfs/dnode.c
tests/runfiles/linux.run
tests/zfs-tests/tests/functional/rsend/Makefile.am
tests/zfs-tests/tests/functional/rsend/rsend.kshlib
tests/zfs-tests/tests/functional/rsend/send_realloc_encrypted_files.ksh
tests/zfs-tests/tests/functional/rsend/send_realloc_files.ksh
tests/zfs-tests/tests/functional/rsend/send_spill_block.ksh [new file with mode: 0755]

index 93d05aac42e16fe995b8544b77339d36c479de95..88c83617178d16634d220bea629d17f6c8a2f7b3 100644 (file)
@@ -420,7 +420,8 @@ int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp);
 int dmu_object_reclaim_dnsize(objset_t *os, uint64_t object,
     dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype,
-    int bonuslen, int dnodesize, dmu_tx_t *txp);
+    int bonuslen, int dnodesize, boolean_t keep_spill, dmu_tx_t *tx);
+int dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx);
 
 /*
  * Free an object from this objset.
index 03a63077f10139f3f50c4bf36ee29060e60657cf..5e1901da4ac2f53302ec5b86d965813a735b710f 100644 (file)
@@ -265,6 +265,7 @@ typedef struct dmu_sendarg {
        objset_t *dsa_os;
        zio_cksum_t dsa_zc;
        uint64_t dsa_toguid;
+       uint64_t dsa_fromtxg;
        int dsa_err;
        dmu_pendop_t dsa_pending_op;
        uint64_t dsa_featureflags;
index 90002026bec95bb8039ad68647a6c28a46aad9e5..ffa89249d311be710795ded093f53ac21848315b 100644 (file)
@@ -48,6 +48,7 @@ typedef struct dmu_recv_cookie {
        boolean_t drc_resumable;
        boolean_t drc_raw;
        boolean_t drc_clone;
+       boolean_t drc_spill;
        struct avl_tree *drc_guid_to_ds_map;
        nvlist_t *drc_keynvl;
        zio_cksum_t drc_cksum;
index accbe6945e27c2e3bdf2bf8ae682ef538b99b584..c60258bbc768dd124c1dc6fed441b6c140f81575 100644 (file)
@@ -267,8 +267,8 @@ typedef struct dnode_phys {
        };
 } dnode_phys_t;
 
-#define        DN_SPILL_BLKPTR(dnp)    (blkptr_t *)((char *)(dnp) + \
-       (((dnp)->dn_extra_slots + 1) << DNODE_SHIFT) - (1 << SPA_BLKPTRSHIFT))
+#define        DN_SPILL_BLKPTR(dnp)    ((blkptr_t *)((char *)(dnp) + \
+       (((dnp)->dn_extra_slots + 1) << DNODE_SHIFT) - (1 << SPA_BLKPTRSHIFT)))
 
 struct dnode {
        /*
@@ -420,7 +420,8 @@ void dnode_sync(dnode_t *dn, dmu_tx_t *tx);
 void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
     dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx);
 void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
-    dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx);
+    dmu_object_type_t bonustype, int bonuslen, int dn_slots,
+    boolean_t keep_spill, dmu_tx_t *tx);
 void dnode_free(dnode_t *dn, dmu_tx_t *tx);
 void dnode_byteswap(dnode_phys_t *dnp);
 void dnode_buf_byteswap(void *buf, size_t size);
index 8a532ec7ed3c3b3b65684014db925e6ac7e1c886..3bcefdbfd775cacf52d0abdcd7070f206cce186c 100644 (file)
@@ -1317,6 +1317,7 @@ typedef enum {
        ZFS_ERR_WRONG_PARENT,
        ZFS_ERR_FROM_IVSET_GUID_MISSING,
        ZFS_ERR_FROM_IVSET_GUID_MISMATCH,
+       ZFS_ERR_SPILL_BLOCK_FLAG_MISSING,
 } zfs_errno_t;
 
 /*
index bb5b48c9170bd78da64cb612c28a0bc0bcff72d5..a883c335857c3b97c70dbcc0037349ef2e1a1ceb 100644 (file)
@@ -101,7 +101,7 @@ typedef enum drr_headertype {
 /* flag #18 is reserved for a Delphix feature */
 #define        DMU_BACKUP_FEATURE_LARGE_BLOCKS         (1 << 19)
 #define        DMU_BACKUP_FEATURE_RESUMING             (1 << 20)
-/* flag #21 is reserved for a Delphix feature */
+/* flag #21 is reserved for the redacted send/receive feature */
 #define        DMU_BACKUP_FEATURE_COMPRESSED           (1 << 22)
 #define        DMU_BACKUP_FEATURE_LARGE_DNODE          (1 << 23)
 #define        DMU_BACKUP_FEATURE_RAW                  (1 << 24)
@@ -131,7 +131,7 @@ typedef enum dmu_send_resume_token_version {
  *
  *     64      56      48      40      32      24      16      8       0
  *     +-------+-------+-------+-------+-------+-------+-------+-------+
- *     |               reserved        |        feature-flags      |C|S|
+ *     |               reserved        |        feature-flags      |C|S|
  *     +-------+-------+-------+-------+-------+-------+-------+-------+
  *
  * The low order two bits indicate the header type: SUBSTREAM (0x1)
@@ -160,16 +160,38 @@ typedef enum dmu_send_resume_token_version {
  * cannot necessarily be received as a clone correctly.
  */
 #define        DRR_FLAG_FREERECORDS    (1<<2)
+/*
+ * When DRR_FLAG_SPILL_BLOCK is set it indicates the DRR_OBJECT_SPILL
+ * and DRR_SPILL_UNMODIFIED flags are meaningful in the send stream.
+ *
+ * When DRR_FLAG_SPILL_BLOCK is set, DRR_OBJECT records will have
+ * DRR_OBJECT_SPILL set if and only if they should have a spill block
+ * (either an existing one, or a new one in the send stream).  When clear
+ * the object does not have a spill block and any existing spill block
+ * should be freed.
+ *
+ * Similarly, when DRR_FLAG_SPILL_BLOCK is set, DRR_SPILL records will
+ * have DRR_SPILL_UNMODIFIED set if and only if they were included for
+ * backward compatibility purposes, and can be safely ignored by new versions
+ * of zfs receive.  Previous versions of ZFS which do not understand the
+ * DRR_FLAG_SPILL_BLOCK will process this record and recreate any missing
+ * spill blocks.
+ */
+#define        DRR_FLAG_SPILL_BLOCK    (1<<3)
 
 /*
  * flags in the drr_flags field in the DRR_WRITE, DRR_SPILL, DRR_OBJECT,
  * DRR_WRITE_BYREF, and DRR_OBJECT_RANGE blocks
  */
-#define        DRR_CHECKSUM_DEDUP      (1<<0) /* not used for DRR_SPILL blocks */
+#define        DRR_CHECKSUM_DEDUP      (1<<0) /* not used for SPILL records */
 #define        DRR_RAW_BYTESWAP        (1<<1)
+#define        DRR_OBJECT_SPILL        (1<<2) /* OBJECT record has a spill block */
+#define        DRR_SPILL_UNMODIFIED    (1<<2) /* SPILL record for unmodified block */
 
 #define        DRR_IS_DEDUP_CAPABLE(flags)     ((flags) & DRR_CHECKSUM_DEDUP)
 #define        DRR_IS_RAW_BYTESWAPPED(flags)   ((flags) & DRR_RAW_BYTESWAP)
+#define        DRR_OBJECT_HAS_SPILL(flags)     ((flags) & DRR_OBJECT_SPILL)
+#define        DRR_SPILL_IS_UNMODIFIED(flags)  ((flags) & DRR_SPILL_UNMODIFIED)
 
 /* deal with compressed drr_write replay records */
 #define        DRR_WRITE_COMPRESSED(drrw)      ((drrw)->drr_compressiontype != 0)
index 2c2eca8db7cc4db3cf202c768c539320572912bf..f69a46430bbea85cfffef6cc6820c70ea63f9b4e 100644 (file)
@@ -4466,6 +4466,13 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
                            "of raw encrypted send streams."));
                        (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf);
                        break;
+               case ZFS_ERR_SPILL_BLOCK_FLAG_MISSING:
+                       zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+                           "Spill block flag missing for raw send.\n"
+                           "The zfs software on the sending system must "
+                           "be updated."));
+                       (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf);
+                       break;
                case EBUSY:
                        if (hastoken) {
                                zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
index ad6cd4e94b687d0f5cac9bf7d973877a3a95039d..5c49670f155a23dab73f1b98a02cb1054079b959 100644 (file)
@@ -2337,6 +2337,21 @@ Allow sending of corrupt data (ignore read/checksum errors when sending data)
 Use \fB1\fR for yes and \fB0\fR for no (default).
 .RE
 
+.sp
+.ne 2
+.na
+\fBzfs_send_unmodified_spill_blocks\fR (int)
+.ad
+.RS 12n
+Include unmodified spill blocks in the send stream. Under certain circumstances
+previous versions of ZFS could incorrectly remove the spill block from an
+existing object.  Including unmodified copies of the spill blocks creates a
+backwards compatible stream which will recreate a spill block if it was
+incorrectly removed.
+.sp
+Use \fB1\fR for yes (default) and \fB0\fR for no.
+.RE
+
 .sp
 .ne 2
 .na
@@ -2355,7 +2370,6 @@ Default value: \fB16,777,216\fR.
 \fBzfs_recv_queue_length\fR (int)
 .ad
 .RS 12n
-.sp
 The maximum number of bytes allowed in the \fBzfs receive\fR queue. This value
 must be at least twice the maximum block size in use.
 .sp
index d52a520fae2fc4b3e1bb67775533a63a21430976..07e616f6f0de614e58422d45acbb94f592d57342 100644 (file)
@@ -2466,7 +2466,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
        ASSERT(db->db_level == 0);
        ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf));
        ASSERT(buf != NULL);
-       ASSERT(arc_buf_lsize(buf) == db->db.db_size);
+       ASSERT3U(arc_buf_lsize(buf), ==, db->db.db_size);
        ASSERT(tx->tx_txg != 0);
 
        arc_return_buf(buf, db);
index e77ebeca54f20cb27fe7238e1184b4e771fe1723..ec78ebbdcb46efc4367ecc3e820a0ec706759019 100644 (file)
@@ -24,6 +24,7 @@
  * Copyright 2014 HybridCluster. All rights reserved.
  */
 
+#include <sys/dbuf.h>
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_tx.h>
@@ -304,13 +305,13 @@ dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 {
        return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype,
-           bonuslen, DNODE_MIN_SIZE, tx));
+           bonuslen, DNODE_MIN_SIZE, B_FALSE, tx));
 }
 
 int
 dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize,
-    dmu_tx_t *tx)
+    boolean_t keep_spill, dmu_tx_t *tx)
 {
        dnode_t *dn;
        int dn_slots = dnodesize >> DNODE_SHIFT;
@@ -327,7 +328,30 @@ dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
        if (err)
                return (err);
 
-       dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, tx);
+       dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots,
+           keep_spill, tx);
+
+       dnode_rele(dn, FTAG);
+       return (err);
+}
+
+int
+dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+       dnode_t *dn;
+       int err;
+
+       err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0,
+           FTAG, &dn);
+       if (err)
+               return (err);
+
+       rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+       if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+               dbuf_rm_spill(dn, tx);
+               dnode_rm_spill(dn, tx);
+       }
+       rw_exit(&dn->dn_struct_rwlock);
 
        dnode_rele(dn, FTAG);
        return (err);
@@ -489,6 +513,7 @@ EXPORT_SYMBOL(dmu_object_claim);
 EXPORT_SYMBOL(dmu_object_claim_dnsize);
 EXPORT_SYMBOL(dmu_object_reclaim);
 EXPORT_SYMBOL(dmu_object_reclaim_dnsize);
+EXPORT_SYMBOL(dmu_object_rm_spill);
 EXPORT_SYMBOL(dmu_object_free);
 EXPORT_SYMBOL(dmu_object_next);
 EXPORT_SYMBOL(dmu_object_zapify);
index fc5d47f5febbdd2942c62996d02ff4b18fc48d1e..976b1bd464207247172b3e1e3ab4eba2deac7216 100644 (file)
@@ -274,6 +274,10 @@ dmu_recv_begin_check(void *arg, dmu_tx_t *tx)
                /* embedded data is incompatible with encryption and raw recv */
                if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)
                        return (SET_ERROR(EINVAL));
+
+               /* raw receives require spill block allocation flag */
+               if (!(flags & DRR_FLAG_SPILL_BLOCK))
+                       return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING));
        } else {
                dsflags |= DS_HOLD_FLAG_DECRYPT;
        }
@@ -615,8 +619,13 @@ dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx)
        (void) snprintf(recvname, sizeof (recvname), "%s/%s",
            tofs, recv_clone_name);
 
-       if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0)
+       if (featureflags & DMU_BACKUP_FEATURE_RAW) {
+               /* raw receives require spill block allocation flag */
+               if (!(drrb->drr_flags & DRR_FLAG_SPILL_BLOCK))
+                       return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING));
+       } else {
                dsflags |= DS_HOLD_FLAG_DECRYPT;
+       }
 
        if (dsl_dataset_hold_flags(dp, recvname, dsflags, FTAG, &ds) != 0) {
                /* %recv does not exist; continue in tofs */
@@ -764,6 +773,9 @@ dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin,
                return (SET_ERROR(EINVAL));
        }
 
+       if (drc->drc_drrb->drr_flags & DRR_FLAG_SPILL_BLOCK)
+               drc->drc_spill = B_TRUE;
+
        drba.drba_origin = origin;
        drba.drba_cookie = drc;
        drba.drba_cred = CRED();
@@ -835,7 +847,8 @@ struct receive_writer_arg {
        /* A map from guid to dataset to help handle dedup'd streams. */
        avl_tree_t *guid_to_ds_map;
        boolean_t resumable;
-       boolean_t raw;
+       boolean_t raw;   /* DMU_BACKUP_FEATURE_RAW set */
+       boolean_t spill; /* DRR_FLAG_SPILL_BLOCK set */
        uint64_t last_object;
        uint64_t last_offset;
        uint64_t max_object; /* highest object ID referenced in stream */
@@ -1151,10 +1164,19 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
                    drro->drr_raw_bonuslen)
                        return (SET_ERROR(EINVAL));
        } else {
-               if (drro->drr_flags != 0 || drro->drr_raw_bonuslen != 0 ||
-                   drro->drr_indblkshift != 0 || drro->drr_nlevels != 0 ||
-                   drro->drr_nblkptr != 0)
+               /*
+                * The DRR_OBJECT_SPILL flag is valid when the DRR_BEGIN
+                * record indicates this by setting DRR_FLAG_SPILL_BLOCK.
+                */
+               if (((drro->drr_flags & ~(DRR_OBJECT_SPILL))) ||
+                   (!rwa->spill && DRR_OBJECT_HAS_SPILL(drro->drr_flags))) {
+                       return (SET_ERROR(EINVAL));
+               }
+
+               if (drro->drr_raw_bonuslen != 0 || drro->drr_nblkptr != 0 ||
+                   drro->drr_indblkshift != 0 || drro->drr_nlevels != 0) {
                        return (SET_ERROR(EINVAL));
+               }
        }
 
        err = dmu_object_info(rwa->os, drro->drr_object, &doi);
@@ -1312,7 +1334,7 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
        }
 
        if (object == DMU_NEW_OBJECT) {
-               /* currently free, want to be allocated */
+               /* Currently free, wants to be allocated */
                err = dmu_object_claim_dnsize(rwa->os, drro->drr_object,
                    drro->drr_type, drro->drr_blksz,
                    drro->drr_bonustype, drro->drr_bonuslen,
@@ -1321,11 +1343,19 @@ receive_object(struct receive_writer_arg *rwa, struct drr_object *drro,
            drro->drr_blksz != doi.doi_data_block_size ||
            drro->drr_bonustype != doi.doi_bonus_type ||
            drro->drr_bonuslen != doi.doi_bonus_size) {
-               /* currently allocated, but with different properties */
+               /* Currently allocated, but with different properties */
                err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object,
                    drro->drr_type, drro->drr_blksz,
                    drro->drr_bonustype, drro->drr_bonuslen,
-                   dn_slots << DNODE_SHIFT, tx);
+                   dn_slots << DNODE_SHIFT, rwa->spill ?
+                   DRR_OBJECT_HAS_SPILL(drro->drr_flags) : B_FALSE, tx);
+       } else if (rwa->spill && !DRR_OBJECT_HAS_SPILL(drro->drr_flags)) {
+               /*
+                * Currently allocated, the existing version of this object
+                * may reference a spill block that is no longer allocated
+                * at the source and needs to be freed.
+                */
+               err = dmu_object_rm_spill(rwa->os, drro->drr_object, tx);
        }
 
        if (err != 0) {
@@ -1665,6 +1695,17 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
            drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os)))
                return (SET_ERROR(EINVAL));
 
+       /*
+        * This is an unmodified spill block which was added to the stream
+        * to resolve an issue with incorrectly removing spill blocks.  It
+        * should be ignored by current versions of the code which support
+        * the DRR_FLAG_SPILL_BLOCK flag.
+        */
+       if (rwa->spill && DRR_SPILL_IS_UNMODIFIED(drrs->drr_flags)) {
+               dmu_return_arcbuf(abuf);
+               return (0);
+       }
+
        if (rwa->raw) {
                if (!DMU_OT_IS_VALID(drrs->drr_type) ||
                    drrs->drr_compressiontype >= ZIO_COMPRESS_FUNCTIONS ||
@@ -1699,9 +1740,16 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
                return (err);
        }
 
-       if (db_spill->db_size < drrs->drr_length)
+       /*
+        * Spill blocks may both grow and shrink.  When a change in size
+        * occurs any existing dbuf must be updated to match the logical
+        * size of the provided arc_buf_t.
+        */
+       if (db_spill->db_size != drrs->drr_length) {
+               dmu_buf_will_fill(db_spill, tx);
                VERIFY(0 == dbuf_spill_set_blksz(db_spill,
                    drrs->drr_length, tx));
+       }
 
        if (rwa->byteswap && !arc_is_encrypted(abuf) &&
            arc_get_compression(abuf) == ZIO_COMPRESS_OFF) {
@@ -2575,6 +2623,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
        rwa->byteswap = drc->drc_byteswap;
        rwa->resumable = drc->drc_resumable;
        rwa->raw = drc->drc_raw;
+       rwa->spill = drc->drc_spill;
        rwa->os->os_raw_receive = drc->drc_raw;
 
        (void) thread_create(NULL, 0, receive_writer_thread, rwa, 0, curproc,
index ad64d666bee75177c0d1128114a96f81f2780141..a6ff5ce3e4465524dcb131a60885cef8a419c69b 100644 (file)
@@ -64,6 +64,8 @@ int zfs_send_corrupt_data = B_FALSE;
 int zfs_send_queue_length = SPA_MAXBLOCKSIZE;
 /* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */
 int zfs_send_set_freerecords_bit = B_TRUE;
+/* Set this tunable to FALSE to disable sending unmodified spill blocks. */
+int zfs_send_unmodified_spill_blocks = B_TRUE;
 
 /*
  * Use this to override the recordsize calculation for fast zfs send estimates.
@@ -99,6 +101,8 @@ typedef struct dump_bytes_io {
        int             dbi_len;
 } dump_bytes_io_t;
 
+static int do_dump(dmu_sendarg_t *dsa, struct send_block_record *data);
+
 static void
 dump_bytes_cb(void *arg)
 {
@@ -436,6 +440,12 @@ dump_spill(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object, void *data)
        drrs->drr_length = blksz;
        drrs->drr_toguid = dsp->dsa_toguid;
 
+       /* See comment in dump_dnode() for full details */
+       if (zfs_send_unmodified_spill_blocks &&
+           (bp->blk_birth <= dsp->dsa_fromtxg)) {
+               drrs->drr_flags |= DRR_SPILL_UNMODIFIED;
+       }
+
        /* handle raw send fields */
        if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_RAW) {
                ASSERT(BP_IS_PROTECTED(bp));
@@ -587,6 +597,14 @@ dump_dnode(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object,
                }
        }
 
+       /*
+        * DRR_OBJECT_SPILL is set for every dnode which references a
+        * spill block.  This allows the receiving pool to definitively
+        * determine when a spill block should be kept or freed.
+        */
+       if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
+               drro->drr_flags |= DRR_OBJECT_SPILL;
+
        if (dump_record(dsp, DN_BONUS(dnp), bonuslen) != 0)
                return (SET_ERROR(EINTR));
 
@@ -594,8 +612,34 @@ dump_dnode(dmu_sendarg_t *dsp, const blkptr_t *bp, uint64_t object,
        if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
            (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0)
                return (SET_ERROR(EINTR));
+
+       /*
+        * Send DRR_SPILL records for unmodified spill blocks.  This is useful
+        * because changing certain attributes of the object (e.g. blocksize)
+        * can cause old versions of ZFS to incorrectly remove a spill block.
+        * Including these records in the stream forces an up to date version
+        * to always be written ensuring they're never lost.  Current versions
+        * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can
+        * ignore these unmodified spill blocks.
+        */
+       if (zfs_send_unmodified_spill_blocks &&
+           (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) &&
+           (DN_SPILL_BLKPTR(dnp)->blk_birth <= dsp->dsa_fromtxg)) {
+               struct send_block_record record;
+
+               bzero(&record, sizeof (struct send_block_record));
+               record.eos_marker = B_FALSE;
+               record.bp = *DN_SPILL_BLKPTR(dnp);
+               SET_BOOKMARK(&(record.zb), dmu_objset_id(dsp->dsa_os),
+                   object, 0, DMU_SPILL_BLKID);
+
+               if (do_dump(dsp, &record) != 0)
+                       return (SET_ERROR(EINTR));
+       }
+
        if (dsp->dsa_err != 0)
                return (SET_ERROR(EINTR));
+
        return (0);
 }
 
@@ -1036,6 +1080,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
        /* raw send implies compressok */
        if (compressok || rawok)
                featureflags |= DMU_BACKUP_FEATURE_COMPRESSED;
+
        if (rawok && os->os_encrypted)
                featureflags |= DMU_BACKUP_FEATURE_RAW;
 
@@ -1064,6 +1109,8 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
        if (zfs_send_set_freerecords_bit)
                drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_FREERECORDS;
 
+       drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_SPILL_BLOCK;
+
        if (ancestor_zb != NULL) {
                drr->drr_u.drr_begin.drr_fromguid =
                    ancestor_zb->zbm_guid;
@@ -1084,6 +1131,7 @@ dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *to_ds,
        dsp->dsa_os = os;
        dsp->dsa_off = off;
        dsp->dsa_toguid = dsl_dataset_phys(to_ds)->ds_guid;
+       dsp->dsa_fromtxg = fromtxg;
        dsp->dsa_pending_op = PENDING_NONE;
        dsp->dsa_featureflags = featureflags;
        dsp->dsa_resume_object = resumeobj;
@@ -1552,4 +1600,8 @@ MODULE_PARM_DESC(zfs_send_corrupt_data, "Allow sending corrupt data");
 
 module_param(zfs_send_queue_length, int, 0644);
 MODULE_PARM_DESC(zfs_send_queue_length, "Maximum send queue length");
+
+module_param(zfs_send_unmodified_spill_blocks, int, 0644);
+MODULE_PARM_DESC(zfs_send_unmodified_spill_blocks,
+       "Send unmodified spill blocks");
 #endif
index 78a90f68fbd3bb4768b4477e487bf60484b2644b..38ec646bacda45379fb7d73f4626ad5dd42f4ecf 100644 (file)
@@ -660,7 +660,8 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
 
 void
 dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
-    dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx)
+    dmu_object_type_t bonustype, int bonuslen, int dn_slots,
+    boolean_t keep_spill, dmu_tx_t *tx)
 {
        int nblkptr;
 
@@ -710,7 +711,7 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize,
                dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = bonustype;
        if (dn->dn_nblkptr != nblkptr)
                dn->dn_next_nblkptr[tx->tx_txg & TXG_MASK] = nblkptr;
-       if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
+       if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR && !keep_spill) {
                dbuf_rm_spill(dn, tx);
                dnode_rm_spill(dn, tx);
        }
index 746d42a22df4f0236bb8236e7dde35be9889cbc6..8219cf42b101c767115463f47f6d3c0af0bdd3ba 100644 (file)
@@ -807,8 +807,8 @@ tests = ['rsend_001_pos', 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos',
     'send-c_recv_dedup', 'send_encrypted_files', 'send_encrypted_hierarchy',
     'send_encrypted_props', 'send_encrypted_truncated_files',
     'send_freeobjects', 'send_realloc_dnode_size', 'send_realloc_files',
-    'send_realloc_encrypted_files', 'send_holds', 'send_hole_birth',
-    'send_mixed_raw', 'send-wDR_encrypted_zvol']
+    'send_realloc_encrypted_files', 'send_spill_block', 'send_holds',
+    'send_hole_birth', 'send_mixed_raw', 'send-wDR_encrypted_zvol']
 tags = ['functional', 'rsend']
 
 [tests/functional/scrub_mirror]
index 8669a51fb885c3123843fa574234b90eeceae6e0..585018ac25037bbb6ecaae7a0ab79528393170f5 100644 (file)
@@ -44,6 +44,7 @@ dist_pkgdata_SCRIPTS = \
        send_realloc_dnode_size.ksh \
        send_realloc_files.ksh \
        send_realloc_encrypted_files.ksh \
+       send_spill_block.ksh \
        send_holds.ksh \
        send_hole_birth.ksh \
        send_mixed_raw.ksh \
index 2ef6775e6b393c6e580a3d200b3346b1c157436b..521a1c7eb63c45700f8821ccd186e8d121090d4d 100644 (file)
@@ -30,6 +30,7 @@
 
 . $STF_SUITE/include/libtest.shlib
 . $STF_SUITE/include/math.shlib
+. $STF_SUITE/tests/functional/cli_root/zfs_set/zfs_set_common.kshlib
 . $STF_SUITE/tests/functional/rsend/rsend.cfg
 
 #
@@ -518,9 +519,13 @@ function churn_files
                        value=$((RANDOM % 5))
                        if [ $value -eq 0 -a $xattrs -ne 0 ]; then
                                attrname="testattr$((RANDOM % 3))"
+                               attrlen="$(((RANDOM % 1000) + 1))"
+                               attrvalue="$(random_string VALID_NAME_CHAR \
+                                   $attrlen)"
                                attr -qr $attrname $file_name || \
                                    log_fail "Failed to remove $attrname"
-                               attr -qs $attrname -V TestValue $file_name || \
+                               attr -qs $attrname \
+                                   -V "$attrvalue" $file_name || \
                                    log_fail "Failed to set $attrname"
                        elif [ $value -eq 1 ]; then
                                dd if=/dev/urandom of=$file_name \
@@ -548,9 +553,12 @@ function churn_files
                        if [ $xattrs -ne 0 ]; then
                                for j in {0..2}; do
                                        attrname="testattr$j"
-                                       attr -qs $attrname -V TestValue \
-                                           $file_name || log_fail \
-                                           "Failed to set $attrname"
+                                       attrlen="$(((RANDOM % 1000) + 1))"
+                                       attrvalue="$(random_string \
+                                           VALID_NAME_CHAR $attrlen)"
+                                       attr -qs $attrname \
+                                           -V "$attrvalue" $file_name || \
+                                           log_fail "Failed to set $attrname"
                                done
                        fi
                fi
@@ -791,10 +799,11 @@ function rand_set_prop
        log_must eval "zfs set $prop='$value' $dtst"
 }
 
-# Generate a recursive checksum of a filesystems contents.  Only file
-# data is included in the checksum (no meta data, or xattrs).
+# Generate a recursive checksum of a filesystem which includes the file
+# contents and any associated xattrs.
 function recursive_cksum
 {
-       find $1 -type f -exec sha256sum {} \; | \
+       find $1 -type f -exec sh -c 'sha256sum {}; getfattr \
+           --absolute-names --only-values -d {} | sha256sum' \; | \
            sort -k 2 | awk '{ print $1 }' | sha256sum
 }
index 0649beaa352b84d3c71aa4d93703e29479b0e044..3c3de86d91c6093fc8e8e7c921ad8a9bf7064ea7 100755 (executable)
@@ -65,7 +65,16 @@ log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs@snap${last_snap}"
 # Set atime=off to prevent the recursive_cksum from modifying newfs.
 log_must zfs set atime=off $POOL/newfs
 
-for i in {1..5}; do
+# Due to reduced performance on debug kernels use fewer files by default.
+if is_kmemleak; then
+       nr_files=100
+       passes=2
+else
+       nr_files=1000
+       passes=3
+fi
+
+for i in {1..$passes}; do
        # Randomly modify several dataset properties in order to generate
        # more interesting incremental send streams.
        rand_set_prop $POOL/fs checksum "off" "fletcher4" "sha256"
@@ -76,12 +85,8 @@ for i in {1..5}; do
 
        # Churn the filesystem in such a way that we're likely to be both
        # allocating and reallocating objects in the incremental stream.
-       #
-       # Disable xattrs until the following spill block issue is resolved:
-       # https://github.com/openzfs/openzfs/pull/705
-       #
-       log_must churn_files 1000 524288 $POOL/fs 0
-       expected_cksum=$(recursive_cksum /$fs)
+       log_must churn_files $nr_files 524288 $POOL/fs
+       expected_cksum=$(recursive_cksum /$POOL/fs)
 
        # Create a snapshot and use it to send an incremental stream.
        this_snap=$((last_snap + 1))
index 80464e05e9422cf025601c437d6eaa5c147def8d..4b89a73d8081130036c89091a3f6116ff2e65523 100755 (executable)
@@ -35,6 +35,8 @@
 #   e) Destroy the incremental stream and old snapshot.
 #
 
+verify_runnable "both"
+
 log_assert "Verify incremental receive handles reallocation"
 
 function cleanup
@@ -56,7 +58,16 @@ log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs@snap${last_snap}"
 # Set atime=off to prevent the recursive_cksum from modifying newfs.
 log_must zfs set atime=off $POOL/newfs
 
-for i in {1..5}; do
+# Due to reduced performance on debug kernels use fewer files by default.
+if is_kmemleak; then
+       nr_files=100
+       passes=2
+else
+       nr_files=1000
+       passes=3
+fi
+
+for i in {1..$passes}; do
        # Randomly modify several dataset properties in order to generate
        # more interesting incremental send streams.
        rand_set_prop $POOL/fs checksum "off" "fletcher4" "sha256"
@@ -67,8 +78,8 @@ for i in {1..5}; do
 
        # Churn the filesystem in such a way that we're likely to be both
        # allocating and reallocating objects in the incremental stream.
-       log_must churn_files 1000 524288 $POOL/fs
-       expected_cksum=$(recursive_cksum /$fs)
+       log_must churn_files $nr_files 524288 $POOL/fs
+       expected_cksum=$(recursive_cksum /$POOL/fs)
 
        # Create a snapshot and use it to send an incremental stream.
        this_snap=$((last_snap + 1))
diff --git a/tests/zfs-tests/tests/functional/rsend/send_spill_block.ksh b/tests/zfs-tests/tests/functional/rsend/send_spill_block.ksh
new file mode 100755 (executable)
index 0000000..9de732e
--- /dev/null
@@ -0,0 +1,155 @@
+#!/bin/ksh
+
+#
+# This file and its contents are supplied under the terms of the
+# Common Development and Distribution License ("CDDL"), version 1.0.
+# You may only use this file in accordance with the terms of version
+# 1.0 of the CDDL.
+#
+# A full copy of the text of the CDDL should have accompanied this
+# source.  A copy of the CDDL is also available via the Internet at
+# http://www.illumos.org/license/CDDL.
+#
+
+#
+# Copyright (c) 2019 by Lawrence Livermore National Security, LLC.
+#
+
+. $STF_SUITE/include/libtest.shlib
+. $STF_SUITE/tests/functional/rsend/rsend.kshlib
+
+#
+# Description:
+# Verify spill blocks are correctly preserved.
+#
+# Strategy:
+# 1) Create a set of files each containing some file data.
+# 2) Add enough xattrs to the file to require a spill block.
+# 3) Snapshot and send these files to a new dataset.
+# 4) Modify the files and spill blocks in a variety of ways.
+# 5) Send the changes using an incremental send stream.
+# 6) Verify that all the xattrs (and thus the spill block) were
+#    preserved when receiving the incremental stream.
+#
+
+verify_runnable "both"
+
+log_assert "Verify spill blocks are correctly preserved"
+
+function cleanup
+{
+       rm -f $BACKDIR/fs@*
+       destroy_dataset $POOL/fs "-rR"
+       destroy_dataset $POOL/newfs "-rR"
+}
+
+attrvalue="abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz"
+
+log_onexit cleanup
+
+log_must zfs create $POOL/fs
+log_must zfs set xattr=sa $POOL/fs
+log_must zfs set dnodesize=legacy $POOL/fs
+log_must zfs set recordsize=128k $POOL/fs
+
+# Create 40 files each with a spill block containing xattrs.  Each file
+# will be modified in a different way to validate the incremental receive.
+for i in {1..40}; do
+       file="/$POOL/fs/file$i"
+
+       log_must mkfile 16384 $file
+       for j in {1..20}; do
+               log_must attr -qs "testattr$j" -V "$attrvalue" $file
+       done
+done
+
+# Snapshot the source filesystem and send it to the new dataset.
+log_must zfs snapshot $POOL/fs@snap1
+log_must eval "zfs send -e $POOL/fs@snap1 >$BACKDIR/fs@snap1"
+log_must eval "zfs recv $POOL/newfs < $BACKDIR/fs@snap1"
+
+#
+# Modify file[1-6]'s contents but not the spill blocks.
+#
+# file1 - Increase record size; single block
+# file2 - Increase record size; multiple blocks
+# file3 - Truncate file to zero size; single block
+# file4 - Truncate file to smaller size; single block
+# file5 - Truncate file to much larger size; add holes
+# file6 - Truncate file to embedded size; embedded data
+#
+log_must mkfile 32768 /$POOL/fs/file1
+log_must mkfile 1048576 /$POOL/fs/file2
+log_must truncate -s 0 /$POOL/fs/file3
+log_must truncate -s 8192 /$POOL/fs/file4
+log_must truncate -s 1073741824 /$POOL/fs/file5
+log_must truncate -s 50 /$POOL/fs/file6
+
+#
+# Modify file[11-16]'s contents and file[11-20]'s spill blocks.
+#
+# file11 - Increase record size; single block
+# file12 - Increase record size; multiple blocks
+# file13 - Truncate file to zero size; single block
+# file14 - Truncate file to smaller size; single block
+# file15 - Truncate file to much larger size; add holes
+# file16 - Truncate file to embedded size; embedded data
+#
+log_must mkfile 32768 /$POOL/fs/file11
+log_must mkfile 1048576 /$POOL/fs/file12
+log_must truncate -s 0 /$POOL/fs/file13
+log_must truncate -s 8192 /$POOL/fs/file14
+log_must truncate -s 1073741824 /$POOL/fs/file15
+log_must truncate -s 50 /$POOL/fs/file16
+
+for i in {11..20}; do
+       log_must attr -qr testattr1 /$POOL/fs/file$i
+done
+
+#
+# Modify file[21-26]'s contents and remove file[21-30]'s spill blocks.
+#
+# file21 - Increase record size; single block
+# file22 - Increase record size; multiple blocks
+# file23 - Truncate file to zero size; single block
+# file24 - Truncate file to smaller size; single block
+# file25 - Truncate file to much larger size; add holes
+# file26 - Truncate file to embedded size; embedded data
+#
+log_must mkfile 32768 /$POOL/fs/file21
+log_must mkfile 1048576 /$POOL/fs/file22
+log_must truncate -s 0 /$POOL/fs/file23
+log_must truncate -s 8192 /$POOL/fs/file24
+log_must truncate -s 1073741824 /$POOL/fs/file25
+log_must truncate -s 50 /$POOL/fs/file26
+
+for i in {21..30}; do
+       for j in {1..20}; do
+               log_must attr -qr testattr$j /$POOL/fs/file$i
+       done
+done
+
+#
+# Modify file[31-40]'s spill blocks but not the file contents.
+#
+for i in {31..40}; do
+       file="/$POOL/fs/file$i"
+       log_must attr -qr testattr$(((RANDOM % 20) + 1)) $file
+       log_must attr -qs testattr$(((RANDOM % 20) + 1)) -V "$attrvalue" $file
+done
+
+# Calculate the expected recursive checksum for the source.
+expected_cksum=$(recursive_cksum /$POOL/fs)
+
+# Snapshot the source filesystem again and send the incremental stream.
+log_must zfs snapshot $POOL/fs@snap2
+log_must eval "zfs send -e -i $POOL/fs@snap1 $POOL/fs@snap2 >$BACKDIR/fs@snap2"
+log_must eval "zfs recv -F $POOL/newfs < $BACKDIR/fs@snap2"
+
+# Verify the received copy's recursive checksum matches the source's.
+actual_cksum=$(recursive_cksum /$POOL/newfs)
+if [[ "$expected_cksum" != "$actual_cksum" ]]; then
+       log_fail "Checksums differ ($expected_cksum != $actual_cksum)"
+fi
+
+log_pass "Verify spill blocks are correctly preserved"