]> git.proxmox.com Git - mirror_zfs.git/commitdiff
Fix read errors race after block cloning
authorAlexander Motin <mav@FreeBSD.org>
Mon, 8 Apr 2024 19:03:18 +0000 (15:03 -0400)
committerGitHub <noreply@github.com>
Mon, 8 Apr 2024 19:03:18 +0000 (12:03 -0700)
Investigating read errors triggering panic fixed in #16042 I've
found that we have a race in a sync process between the moment
dirty record for cloned block is removed and the moment dbuf is
destroyed.  If dmu_buf_hold_array_by_dnode() take a hold on a
cloned dbuf before it is synced/destroyed, then dbuf_read_impl()
may see it still in DB_NOFILL state, but without the dirty record.
Such case is not an error, but equivalent to DB_UNCACHED, since
the dbuf block pointer is already updated by dbuf_write_ready().
Unfortunately it is impossible to safely change the dbuf state
to DB_UNCACHED there, since there may already be another cloning
in progress, that dropped dbuf lock before creating a new dirty
record, protected only by the range lock.

Reviewed-by: Rob Norris <robn@despairlabs.com>
Reviewed-by: Robert Evans <evansr@google.com>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Closes #16052

module/zfs/dbuf.c

index d43f84e84725fac6d12f639962c8fca9bb7bf216..8c42b116d7e1d6f4fad4e7fc92bcdf082121f19a 100644 (file)
@@ -1563,7 +1563,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
        zbookmark_phys_t zb;
        uint32_t aflags = ARC_FLAG_NOWAIT;
        int err, zio_flags;
-       blkptr_t bp, *bpp;
+       blkptr_t bp, *bpp = NULL;
 
        ASSERT(!zfs_refcount_is_zero(&db->db_holds));
        ASSERT(MUTEX_HELD(&db->db_mtx));
@@ -1577,29 +1577,28 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
                goto early_unlock;
        }
 
-       if (db->db_state == DB_UNCACHED) {
-               if (db->db_blkptr == NULL) {
-                       bpp = NULL;
-               } else {
-                       bp = *db->db_blkptr;
+       /*
+        * If we have a pending block clone, we don't want to read the
+        * underlying block, but the content of the block being cloned,
+        * pointed by the dirty record, so we have the most recent data.
+        * If there is no dirty record, then we hit a race in a sync
+        * process when the dirty record is already removed, while the
+        * dbuf is not yet destroyed. Such case is equivalent to uncached.
+        */
+       if (db->db_state == DB_NOFILL) {
+               dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records);
+               if (dr != NULL) {
+                       if (!dr->dt.dl.dr_brtwrite) {
+                               err = EIO;
+                               goto early_unlock;
+                       }
+                       bp = dr->dt.dl.dr_overridden_by;
                        bpp = &bp;
                }
-       } else {
-               dbuf_dirty_record_t *dr;
-
-               ASSERT3S(db->db_state, ==, DB_NOFILL);
+       }
 
-               /*
-                * Block cloning: If we have a pending block clone,
-                * we don't want to read the underlying block, but the content
-                * of the block being cloned, so we have the most recent data.
-                */
-               dr = list_head(&db->db_dirty_records);
-               if (dr == NULL || !dr->dt.dl.dr_brtwrite) {
-                       err = EIO;
-                       goto early_unlock;
-               }
-               bp = dr->dt.dl.dr_overridden_by;
+       if (bpp == NULL && db->db_blkptr != NULL) {
+               bp = *db->db_blkptr;
                bpp = &bp;
        }