git.proxmox.com Git - mirror_zfs.git/commitdiff
Verify block pointers before writing them out
author: Matthew Ahrens <mahrens@delphix.com>
Mon, 8 May 2023 18:20:23 +0000 (11:20 -0700)
committer: GitHub <noreply@github.com>
Mon, 8 May 2023 18:20:23 +0000 (11:20 -0700)
If a block pointer is corrupted (but the block containing it checksums
correctly, e.g. due to a bug that overwrites random memory), we can
often detect it before the block is read, with the `zfs_blkptr_verify()`
function, which is used in `arc_read()`, `zio_free()`, etc.

However, such corruption is not typically recoverable.  To recover from
it we would need to detect the memory error before the block pointer is
written to disk.

This PR verifies BPs that are contained in indirect blocks and dnodes
before they are written to disk, in `dbuf_write_ready()`. This way,
we'll get a panic before the on-disk data is corrupted. This will help
us to diagnose what's causing the corruption, and the resulting state is
much easier to recover from.

To minimize performance impact, only checks that can be done without
holding the spa_config_lock are performed.

Additionally, when corruption is detected, the raw words of the block
pointer are logged.  (Note that `dprintf_bp()` is a no-op by default,
but if enabled it is not safe to use with invalid block pointers.)

Reviewed-by: Rich Ercolani <rincebrain@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Reviewed-by: Paul Zuchowski <pzuchowski@datto.com>
Reviewed-by: Alexander Motin <mav@FreeBSD.org>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #14817

cmd/zdb/zdb.c
include/sys/zio.h
module/zfs/arc.c
module/zfs/dbuf.c
module/zfs/dsl_scan.c
module/zfs/spa.c
module/zfs/zio.c

index cea80b6908411d224508a3eddcf654cbf2af06c5..5ab13b470dc0be1dfac2ba6d32904d5ab725ebe6 100644 (file)
@@ -8499,8 +8499,8 @@ zdb_read_block(char *thing, spa_t *spa)
            !(flags & ZDB_FLAG_DECOMPRESS)) {
                const blkptr_t *b = (const blkptr_t *)(void *)
                    ((uintptr_t)buf + (uintptr_t)blkptr_offset);
-               if (zfs_blkptr_verify(spa, b, B_FALSE, BLK_VERIFY_ONLY) ==
-                   B_FALSE) {
+               if (zfs_blkptr_verify(spa, b,
+                   BLK_CONFIG_NEEDED, BLK_VERIFY_ONLY) == B_FALSE) {
                        abd_return_buf_copy(pabd, buf, lsize);
                        borrowed = B_FALSE;
                        buf = lbuf;
@@ -8508,8 +8508,8 @@ zdb_read_block(char *thing, spa_t *spa)
                            lbuf, lsize, psize, flags);
                        b = (const blkptr_t *)(void *)
                            ((uintptr_t)buf + (uintptr_t)blkptr_offset);
-                       if (failed || zfs_blkptr_verify(spa, b, B_FALSE,
-                           BLK_VERIFY_LOG) == B_FALSE) {
+                       if (failed || zfs_blkptr_verify(spa, b,
+                           BLK_CONFIG_NEEDED, BLK_VERIFY_LOG) == B_FALSE) {
                                printf("invalid block pointer at this DVA\n");
                                goto out;
                        }
index 3463682a10654411f413f71b1136c4242d6fb1b9..695bc09e6cb71112f44071f433d4c912093fc78f 100644 (file)
@@ -531,6 +531,12 @@ enum blk_verify_flag {
        BLK_VERIFY_HALT
 };
 
+enum blk_config_flag {
+       BLK_CONFIG_HELD,   // SCL_VDEV held for writer
+       BLK_CONFIG_NEEDED, // SCL_VDEV should be obtained for reader
+       BLK_CONFIG_SKIP,   // skip checks which require SCL_VDEV
+};
+
 extern int zio_bookmark_compare(const void *, const void *);
 
 extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd,
@@ -646,7 +652,7 @@ extern int zio_resume(spa_t *spa);
 extern void zio_resume_wait(spa_t *spa);
 
 extern boolean_t zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
-    boolean_t config_held, enum blk_verify_flag blk_verify);
+    enum blk_config_flag blk_config, enum blk_verify_flag blk_verify);
 
 /*
  * Initial setup and teardown.
index c50228a2682f648a3c1a77358ae584c80b5d5dde..bf8d99f94c399e4eb56964fb844d0c4acda218b0 100644 (file)
@@ -5696,8 +5696,8 @@ top:
         * and treat it as a checksum error.  This allows an alternate blkptr
         * to be tried when one is available (e.g. ditto blocks).
         */
-       if (!zfs_blkptr_verify(spa, bp, zio_flags & ZIO_FLAG_CONFIG_WRITER,
-           BLK_VERIFY_LOG)) {
+       if (!zfs_blkptr_verify(spa, bp, (zio_flags & ZIO_FLAG_CONFIG_WRITER) ?
+           BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
                rc = SET_ERROR(ECKSUM);
                goto done;
        }
index 8193fb2440791031a610057805184b22df74e959..6a50f1927adda616f756e82dcc8a80772db1fea8 100644 (file)
@@ -4636,6 +4636,20 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
                                i += DNODE_MIN_SIZE;
                                if (dnp->dn_type != DMU_OT_NONE) {
                                        fill++;
+                                       for (int j = 0; j < dnp->dn_nblkptr;
+                                           j++) {
+                                               (void) zfs_blkptr_verify(spa,
+                                                   &dnp->dn_blkptr[j],
+                                                   BLK_CONFIG_SKIP,
+                                                   BLK_VERIFY_HALT);
+                                       }
+                                       if (dnp->dn_flags &
+                                           DNODE_FLAG_SPILL_BLKPTR) {
+                                               (void) zfs_blkptr_verify(spa,
+                                                   DN_SPILL_BLKPTR(dnp),
+                                                   BLK_CONFIG_SKIP,
+                                                   BLK_VERIFY_HALT);
+                                       }
                                        i += dnp->dn_extra_slots *
                                            DNODE_MIN_SIZE;
                                }
@@ -4653,6 +4667,8 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
                for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
                        if (BP_IS_HOLE(ibp))
                                continue;
+                       (void) zfs_blkptr_verify(spa, ibp,
+                           BLK_CONFIG_SKIP, BLK_VERIFY_HALT);
                        fill += BP_GET_FILL(ibp);
                }
        }
index d6a9365df120ef9b1ca5a242a7575fe5ab06712b..d398b6705551575918be7f6f9759047fe66422ad 100644 (file)
@@ -1970,7 +1970,8 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype,
                            DMU_USERUSED_OBJECT, tx);
                }
                arc_buf_destroy(buf, &buf);
-       } else if (!zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_LOG)) {
+       } else if (!zfs_blkptr_verify(spa, bp,
+           BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
                /*
                 * Sanity check the block pointer contents, this is handled
                 * by arc_read() for the cases above.
index c2a67fbc7c55476c518efa35cf0f1f53421e6023..16396170273cb675a9fe2ca518479b7a3474e897 100644 (file)
@@ -2387,7 +2387,7 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
         * When damaged consider it to be a metadata error since we cannot
         * trust the BP_GET_TYPE and BP_GET_LEVEL values.
         */
-       if (!zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_LOG)) {
+       if (!zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
                atomic_inc_64(&sle->sle_meta_count);
                return (0);
        }
index 0924fb6f40bc8e3573c9ea6197b15805dffe26cd..365d34832c3ac17a60f408305c15b338ecb1b863 100644 (file)
@@ -935,9 +935,35 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp,
        (void) vsnprintf(buf, sizeof (buf), fmt, adx);
        va_end(adx);
 
+       zfs_dbgmsg("bad blkptr at %px: "
+           "DVA[0]=%#llx/%#llx "
+           "DVA[1]=%#llx/%#llx "
+           "DVA[2]=%#llx/%#llx "
+           "prop=%#llx "
+           "pad=%#llx,%#llx "
+           "phys_birth=%#llx "
+           "birth=%#llx "
+           "fill=%#llx "
+           "cksum=%#llx/%#llx/%#llx/%#llx",
+           bp,
+           (long long)bp->blk_dva[0].dva_word[0],
+           (long long)bp->blk_dva[0].dva_word[1],
+           (long long)bp->blk_dva[1].dva_word[0],
+           (long long)bp->blk_dva[1].dva_word[1],
+           (long long)bp->blk_dva[2].dva_word[0],
+           (long long)bp->blk_dva[2].dva_word[1],
+           (long long)bp->blk_prop,
+           (long long)bp->blk_pad[0],
+           (long long)bp->blk_pad[1],
+           (long long)bp->blk_phys_birth,
+           (long long)bp->blk_birth,
+           (long long)bp->blk_fill,
+           (long long)bp->blk_cksum.zc_word[0],
+           (long long)bp->blk_cksum.zc_word[1],
+           (long long)bp->blk_cksum.zc_word[2],
+           (long long)bp->blk_cksum.zc_word[3]);
        switch (blk_verify) {
        case BLK_VERIFY_HALT:
-               dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp);
                zfs_panic_recover("%s: %s", spa_name(spa), buf);
                break;
        case BLK_VERIFY_LOG:
@@ -958,47 +984,54 @@ zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp,
  * If everything checks out B_TRUE is returned.  The zfs_blkptr_verify
  * argument controls the behavior when an invalid field is detected.
  *
- * Modes for zfs_blkptr_verify:
- *   1) BLK_VERIFY_ONLY (evaluate the block)
- *   2) BLK_VERIFY_LOG (evaluate the block and log problems)
- *   3) BLK_VERIFY_HALT (call zfs_panic_recover on error)
+ * Values for blk_verify_flag:
+ *   BLK_VERIFY_ONLY: evaluate the block
+ *   BLK_VERIFY_LOG: evaluate the block and log problems
+ *   BLK_VERIFY_HALT: call zfs_panic_recover on error
+ *
+ * Values for blk_config_flag:
+ *   BLK_CONFIG_HELD: caller holds SCL_VDEV for writer
+ *   BLK_CONFIG_NEEDED: caller holds no config lock, SCL_VDEV will be
+ *   obtained for reader
+ *   BLK_CONFIG_SKIP: skip checks which require SCL_VDEV, for better
+ *   performance
  */
 boolean_t
-zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held,
-    enum blk_verify_flag blk_verify)
+zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp,
+    enum blk_config_flag blk_config, enum blk_verify_flag blk_verify)
 {
        int errors = 0;
 
        if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
                errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
-                   "blkptr at %p has invalid TYPE %llu",
+                   "blkptr at %px has invalid TYPE %llu",
                    bp, (longlong_t)BP_GET_TYPE(bp));
        }
        if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS) {
                errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
-                   "blkptr at %p has invalid CHECKSUM %llu",
+                   "blkptr at %px has invalid CHECKSUM %llu",
                    bp, (longlong_t)BP_GET_CHECKSUM(bp));
        }
        if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS) {
                errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
-                   "blkptr at %p has invalid COMPRESS %llu",
+                   "blkptr at %px has invalid COMPRESS %llu",
                    bp, (longlong_t)BP_GET_COMPRESS(bp));
        }
        if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
                errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
-                   "blkptr at %p has invalid LSIZE %llu",
+                   "blkptr at %px has invalid LSIZE %llu",
                    bp, (longlong_t)BP_GET_LSIZE(bp));
        }
        if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
                errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
-                   "blkptr at %p has invalid PSIZE %llu",
+                   "blkptr at %px has invalid PSIZE %llu",
                    bp, (longlong_t)BP_GET_PSIZE(bp));
        }
 
        if (BP_IS_EMBEDDED(bp)) {
                if (BPE_GET_ETYPE(bp) >= NUM_BP_EMBEDDED_TYPES) {
                        errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
-                           "blkptr at %p has invalid ETYPE %llu",
+                           "blkptr at %px has invalid ETYPE %llu",
                            bp, (longlong_t)BPE_GET_ETYPE(bp));
                }
        }
@@ -1010,10 +1043,19 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held,
        if (!spa->spa_trust_config)
                return (errors == 0);
 
-       if (!config_held)
-               spa_config_enter(spa, SCL_VDEV, bp, RW_READER);
-       else
+       switch (blk_config) {
+       case BLK_CONFIG_HELD:
                ASSERT(spa_config_held(spa, SCL_VDEV, RW_WRITER));
+               break;
+       case BLK_CONFIG_NEEDED:
+               spa_config_enter(spa, SCL_VDEV, bp, RW_READER);
+               break;
+       case BLK_CONFIG_SKIP:
+               return (errors == 0);
+       default:
+               panic("invalid blk_config %u", blk_config);
+       }
+
        /*
         * Pool-specific checks.
         *
@@ -1028,20 +1070,20 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held,
 
                if (vdevid >= spa->spa_root_vdev->vdev_children) {
                        errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
-                           "blkptr at %p DVA %u has invalid VDEV %llu",
+                           "blkptr at %px DVA %u has invalid VDEV %llu",
                            bp, i, (longlong_t)vdevid);
                        continue;
                }
                vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
                if (vd == NULL) {
                        errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
-                           "blkptr at %p DVA %u has invalid VDEV %llu",
+                           "blkptr at %px DVA %u has invalid VDEV %llu",
                            bp, i, (longlong_t)vdevid);
                        continue;
                }
                if (vd->vdev_ops == &vdev_hole_ops) {
                        errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
-                           "blkptr at %p DVA %u has hole VDEV %llu",
+                           "blkptr at %px DVA %u has hole VDEV %llu",
                            bp, i, (longlong_t)vdevid);
                        continue;
                }
@@ -1059,13 +1101,11 @@ zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held,
                        asize = vdev_gang_header_asize(vd);
                if (offset + asize > vd->vdev_asize) {
                        errors += zfs_blkptr_verify_log(spa, bp, blk_verify,
-                           "blkptr at %p DVA %u has invalid OFFSET %llu",
+                           "blkptr at %px DVA %u has invalid OFFSET %llu",
                            bp, i, (longlong_t)offset);
                }
        }
-       if (errors > 0)
-               dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp);
-       if (!config_held)
+       if (blk_config == BLK_CONFIG_NEEDED)
                spa_config_exit(spa, SCL_VDEV, bp);
 
        return (errors == 0);
@@ -1203,7 +1243,7 @@ void
 zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
 {
 
-       (void) zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_HALT);
+       (void) zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_HALT);
 
        /*
         * The check for EMBEDDED is a performance optimization.  We
@@ -1282,8 +1322,8 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 {
        zio_t *zio;
 
-       (void) zfs_blkptr_verify(spa, bp, flags & ZIO_FLAG_CONFIG_WRITER,
-           BLK_VERIFY_HALT);
+       (void) zfs_blkptr_verify(spa, bp, (flags & ZIO_FLAG_CONFIG_WRITER) ?
+           BLK_CONFIG_HELD : BLK_CONFIG_NEEDED, BLK_VERIFY_HALT);
 
        if (BP_IS_EMBEDDED(bp))
                return (zio_null(pio, spa, NULL, NULL, NULL, 0));