git.proxmox.com Git - mirror_zfs.git/commitdiff
dmu: Allow buffer fills to fail
authorAlexander Motin <mav@FreeBSD.org>
Fri, 15 Dec 2023 17:51:41 +0000 (12:51 -0500)
committerGitHub <noreply@github.com>
Fri, 15 Dec 2023 17:51:41 +0000 (09:51 -0800)
When ZFS overwrites a whole block, it does not bother to read the
old content from disk. It is a good optimization, but if the buffer
fill fails due to page fault or something else, the buffer ends up
corrupted, neither keeping old content, nor getting the new one.

On FreeBSD this is additionally complicated by page faults being
blocked by VFS layer, always returning EFAULT on attempt to write
from mmap()'ed but not yet cached address range.  Normally it is
not a big problem, since after original failure VFS will retry the
write after reading the required data.  The problem becomes worse
in specific case when somebody tries to write into a file its own
mmap()'ed content from the same location.  In that situation the
only copy of the data is getting corrupted on the page fault and
the following retries only cement the status quo.  Block cloning
makes this issue easier to reproduce, since it does not read the
old data, unlike traditional file copy, that may work by chance.

This patch provides the fill status to dmu_buf_fill_done(), that
in case of error can destroy the corrupted buffer as if no write
happened.  One more complication in case of block cloning is that
if error is possible during fill, dmu_buf_will_fill() must read
the data via fall-back to dmu_buf_will_dirty().  It is required
to allow, in case of error, restoring the buffer to its state after
the cloning, not before it, which is what would happen if we just
called dbuf_undirty().

Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Rob Norris <robn@despairlabs.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Closes #15665

include/os/freebsd/spl/sys/uio.h
include/os/linux/spl/sys/uio.h
include/sys/dbuf.h
lib/libspl/include/sys/uio.h
module/os/freebsd/zfs/dmu_os.c
module/zfs/dbuf.c
module/zfs/dmu.c
module/zfs/dmu_recv.c
module/zfs/dsl_bookmark.c

index b71f2f2e56251d94396534d9e622c71c9f887f22..b9d41903ea638fbf8d5a46ca389f7ce32612c21d 100644 (file)
@@ -62,7 +62,7 @@ zfs_uio_setoffset(zfs_uio_t *uio, offset_t off)
 }
 
 static inline void
-zfs_uio_advance(zfs_uio_t *uio, size_t size)
+zfs_uio_advance(zfs_uio_t *uio, ssize_t size)
 {
        zfs_uio_resid(uio) -= size;
        zfs_uio_offset(uio) += size;
index a4b600004c9f4f1412dca30af01a7829795eda09..5e6ea8d3c22124a0f13ce472a8097474e39127b7 100644 (file)
@@ -95,7 +95,7 @@ zfs_uio_setoffset(zfs_uio_t *uio, offset_t off)
 }
 
 static inline void
-zfs_uio_advance(zfs_uio_t *uio, size_t size)
+zfs_uio_advance(zfs_uio_t *uio, ssize_t size)
 {
        uio->uio_resid -= size;
        uio->uio_loffset += size;
index 2ff0bc72b2701e5895569d4c3bad1d0c1719226d..3808a04cba8011f517437baec1e83b66c3542f27 100644 (file)
@@ -380,8 +380,8 @@ dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
 int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
 void dmu_buf_will_clone(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
-void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
-void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
+void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx, boolean_t canfail);
+boolean_t dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx, boolean_t failed);
 void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
 dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid,
index e9e21819d4f89d44d2b8147f01225f9e847116e4..665bfc42301b29cd0e9ddf09c46b60902b84b3f7 100644 (file)
@@ -90,7 +90,7 @@ zfs_uio_iov_at_index(zfs_uio_t *uio, uint_t idx, void **base, uint64_t *len)
 }
 
 static inline void
-zfs_uio_advance(zfs_uio_t *uio, size_t size)
+zfs_uio_advance(zfs_uio_t *uio, ssize_t size)
 {
        uio->uio_resid -= size;
        uio->uio_loffset += size;
index ee6fb2dc657b579557dbb2426e8858693a081b5e..48ea37cbad598c47b54663b5be71b0171d62615d 100644 (file)
@@ -107,7 +107,7 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
                ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
 
                if (tocpy == db->db_size)
-                       dmu_buf_will_fill(db, tx);
+                       dmu_buf_will_fill(db, tx, B_FALSE);
                else
                        dmu_buf_will_dirty(db, tx);
 
@@ -123,7 +123,7 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
                }
 
                if (tocpy == db->db_size)
-                       dmu_buf_fill_done(db, tx);
+                       dmu_buf_fill_done(db, tx, B_FALSE);
 
                offset += tocpy;
                size -= tocpy;
index 03c97941d6d307aa1f26a9f4e0c7022b68f657a0..e9d5abca3324286e4145577aef21c818b76eb6c9 100644 (file)
@@ -2751,7 +2751,7 @@ dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
 }
 
 void
-dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
+dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail)
 {
        dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
@@ -2769,8 +2769,14 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
                 * Block cloning: We will be completely overwriting a block
                 * cloned in this transaction group, so let's undirty the
                 * pending clone and mark the block as uncached. This will be
-                * as if the clone was never done.
+                * as if the clone was never done.  But if the fill can fail
+                * we should have a way to return back to the cloned data.
                 */
+               if (canfail && dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) {
+                       mutex_exit(&db->db_mtx);
+                       dmu_buf_will_dirty(db_fake, tx);
+                       return;
+               }
                VERIFY(!dbuf_undirty(db, tx));
                db->db_state = DB_UNCACHED;
        }
@@ -2831,32 +2837,41 @@ dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx)
        dl->dr_overridden_by.blk_birth = dr->dr_txg;
 }
 
-void
-dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx)
+boolean_t
+dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed)
 {
        (void) tx;
        dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
-       dbuf_states_t old_state;
        mutex_enter(&db->db_mtx);
        DBUF_VERIFY(db);
 
-       old_state = db->db_state;
-       db->db_state = DB_CACHED;
-       if (old_state == DB_FILL) {
+       if (db->db_state == DB_FILL) {
                if (db->db_level == 0 && db->db_freed_in_flight) {
                        ASSERT(db->db_blkid != DMU_BONUS_BLKID);
                        /* we were freed while filling */
                        /* XXX dbuf_undirty? */
                        memset(db->db.db_data, 0, db->db.db_size);
                        db->db_freed_in_flight = FALSE;
+                       db->db_state = DB_CACHED;
                        DTRACE_SET_STATE(db,
                            "fill done handling freed in flight");
+                       failed = B_FALSE;
+               } else if (failed) {
+                       VERIFY(!dbuf_undirty(db, tx));
+                       db->db_buf = NULL;
+                       dbuf_clear_data(db);
+                       DTRACE_SET_STATE(db, "fill failed");
                } else {
+                       db->db_state = DB_CACHED;
                        DTRACE_SET_STATE(db, "fill done");
                }
                cv_broadcast(&db->db_changed);
+       } else {
+               db->db_state = DB_CACHED;
+               failed = B_FALSE;
        }
        mutex_exit(&db->db_mtx);
+       return (failed);
 }
 
 void
@@ -3001,7 +3016,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
        DTRACE_SET_STATE(db, "filling assigned arcbuf");
        mutex_exit(&db->db_mtx);
        (void) dbuf_dirty(db, tx);
-       dmu_buf_fill_done(&db->db, tx);
+       dmu_buf_fill_done(&db->db, tx, B_FALSE);
 }
 
 void
index f5a5d0fc437f59faec99c40f1521c660449a4edd..d82211e6d4c74b8de955fbb506c4d96a7b2f5b9a 100644 (file)
@@ -1134,14 +1134,14 @@ dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
                ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
 
                if (tocpy == db->db_size)
-                       dmu_buf_will_fill(db, tx);
+                       dmu_buf_will_fill(db, tx, B_FALSE);
                else
                        dmu_buf_will_dirty(db, tx);
 
                (void) memcpy((char *)db->db_data + bufoff, buf, tocpy);
 
                if (tocpy == db->db_size)
-                       dmu_buf_fill_done(db, tx);
+                       dmu_buf_fill_done(db, tx, B_FALSE);
 
                offset += tocpy;
                size -= tocpy;
@@ -1349,27 +1349,24 @@ dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx)
 
                ASSERT(size > 0);
 
-               bufoff = zfs_uio_offset(uio) - db->db_offset;
+               offset_t off = zfs_uio_offset(uio);
+               bufoff = off - db->db_offset;
                tocpy = MIN(db->db_size - bufoff, size);
 
                ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
 
                if (tocpy == db->db_size)
-                       dmu_buf_will_fill(db, tx);
+                       dmu_buf_will_fill(db, tx, B_TRUE);
                else
                        dmu_buf_will_dirty(db, tx);
 
-               /*
-                * XXX zfs_uiomove could block forever (eg.nfs-backed
-                * pages).  There needs to be a uiolockdown() function
-                * to lock the pages in memory, so that zfs_uiomove won't
-                * block.
-                */
                err = zfs_uio_fault_move((char *)db->db_data + bufoff,
                    tocpy, UIO_WRITE, uio);
 
-               if (tocpy == db->db_size)
-                       dmu_buf_fill_done(db, tx);
+               if (tocpy == db->db_size && dmu_buf_fill_done(db, tx, err)) {
+                       /* The fill was reverted.  Undo any uio progress. */
+                       zfs_uio_advance(uio, off - zfs_uio_offset(uio));
+               }
 
                if (err)
                        break;
index 05ca91717c2fd195f888a3ae63fb83bca01c3941..54aa60259ea1f9f318744dfe5f7996b5b6ae06d9 100644 (file)
@@ -2532,7 +2532,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs,
         * size of the provided arc_buf_t.
         */
        if (db_spill->db_size != drrs->drr_length) {
-               dmu_buf_will_fill(db_spill, tx);
+               dmu_buf_will_fill(db_spill, tx, B_FALSE);
                VERIFY0(dbuf_spill_set_blksz(db_spill,
                    drrs->drr_length, tx));
        }
index 03d9420dbdb9d54132bb4ff869addb07d368bd4b..4faefecbadbb2088a2aaf5e1e52b418e7fd52849 100644 (file)
@@ -490,7 +490,7 @@ dsl_bookmark_create_sync_impl_snap(const char *bookmark, const char *snapshot,
                        dmu_buf_t *db;
                        VERIFY0(dmu_spill_hold_by_bonus(local_rl->rl_bonus,
                            DB_RF_MUST_SUCCEED, FTAG, &db));
-                       dmu_buf_will_fill(db, tx);
+                       dmu_buf_will_fill(db, tx, B_FALSE);
                        VERIFY0(dbuf_spill_set_blksz(db, P2ROUNDUP(bonuslen,
                            SPA_MINBLOCKSIZE), tx));
                        local_rl->rl_phys = db->db_data;