]> git.proxmox.com Git - mirror_zfs.git/commitdiff
Refactor dbuf_read() for safer decryption
authorAlexander Motin <mav@FreeBSD.org>
Mon, 22 Apr 2024 18:41:03 +0000 (14:41 -0400)
committerGitHub <noreply@github.com>
Mon, 22 Apr 2024 18:41:03 +0000 (11:41 -0700)
In dbuf_read_verify_dnode_crypt():
 - We don't need original dbuf locked there. Instead take a lock
on a dnode dbuf, that is actually manipulated.
 - Block decryption for a dnode dbuf if it is currently being
written.  ARC hash lock does not protect anonymous buffers, so
arc_untransform() is unsafe when used on buffers being written,
that may happen in case of encrypted dnode buffers, since they
are not copied by dbuf_dirty()/dbuf_hold_copy().

In dbuf_read():
 - If the buffer is in flight, recheck its compression/encryption
status after it is cached, since it may need arc_untransform().

Tested-by: Rich Ercolani <rincebrain@gmail.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Alexander Motin <mav@FreeBSD.org>
Sponsored by: iXsystems, Inc.
Closes #16104

module/zfs/dbuf.c

index 5f3643f573f7c36454bb32cca1d7291a5481c047..bb913f5563742bee47cbdee4a7c525d90e17df8d 100644 (file)
@@ -161,13 +161,13 @@ struct {
 } dbuf_sums;
 
 #define        DBUF_STAT_INCR(stat, val)       \
-       wmsum_add(&dbuf_sums.stat, val);
+       wmsum_add(&dbuf_sums.stat, val)
 #define        DBUF_STAT_DECR(stat, val)       \
-       DBUF_STAT_INCR(stat, -(val));
+       DBUF_STAT_INCR(stat, -(val))
 #define        DBUF_STAT_BUMP(stat)            \
-       DBUF_STAT_INCR(stat, 1);
+       DBUF_STAT_INCR(stat, 1)
 #define        DBUF_STAT_BUMPDOWN(stat)        \
-       DBUF_STAT_INCR(stat, -1);
+       DBUF_STAT_INCR(stat, -1)
 #define        DBUF_STAT_MAX(stat, v) {                                        \
        uint64_t _m;                                                    \
        while ((v) > (_m = dbuf_stats.stat.value.ui64) &&               \
@@ -177,7 +177,6 @@ struct {
 
 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
 static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr);
-static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags);
 
 /*
  * Global data structures and functions for the dbuf cache.
@@ -1418,13 +1417,9 @@ dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp,
  * a decrypted block. Otherwise success.
  */
 static int
-dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
+dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn)
 {
-       int bonuslen, max_bonuslen, err;
-
-       err = dbuf_read_verify_dnode_crypt(db, flags);
-       if (err)
-               return (err);
+       int bonuslen, max_bonuslen;
 
        bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
        max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
@@ -1509,32 +1504,46 @@ dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, blkptr_t *bp)
  * decrypt / authenticate them when we need to read an encrypted bonus buffer.
  */
 static int
-dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
+dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags)
 {
-       int err = 0;
        objset_t *os = db->db_objset;
-       arc_buf_t *dnode_abuf;
-       dnode_t *dn;
+       dmu_buf_impl_t *dndb;
+       arc_buf_t *dnbuf;
        zbookmark_phys_t zb;
-
-       ASSERT(MUTEX_HELD(&db->db_mtx));
+       int err;
 
        if ((flags & DB_RF_NO_DECRYPT) != 0 ||
-           !os->os_encrypted || os->os_raw_receive)
+           !os->os_encrypted || os->os_raw_receive ||
+           (dndb = dn->dn_dbuf) == NULL)
                return (0);
 
-       DB_DNODE_ENTER(db);
-       dn = DB_DNODE(db);
-       dnode_abuf = (dn->dn_dbuf != NULL) ? dn->dn_dbuf->db_buf : NULL;
-
-       if (dnode_abuf == NULL || !arc_is_encrypted(dnode_abuf)) {
-               DB_DNODE_EXIT(db);
+       dnbuf = dndb->db_buf;
+       if (!arc_is_encrypted(dnbuf))
                return (0);
-       }
+
+       mutex_enter(&dndb->db_mtx);
+
+       /*
+        * Since dnode buffer is modified by sync process, there can be only
+        * one copy of it.  It means we can not modify (decrypt) it while it
+        * is being written.  I don't see how this may happen now, since
+        * encrypted dnode writes by receive should be completed before any
+        * plain-text reads due to txg wait, but better be safe than sorry.
+        */
+       while (1) {
+               if (!arc_is_encrypted(dnbuf)) {
+                       mutex_exit(&dndb->db_mtx);
+                       return (0);
+               }
+               dbuf_dirty_record_t *dr = dndb->db_data_pending;
+               if (dr == NULL || dr->dt.dl.dr_data != dnbuf)
+                       break;
+               cv_wait(&dndb->db_changed, &dndb->db_mtx);
+       };
 
        SET_BOOKMARK(&zb, dmu_objset_id(os),
-           DMU_META_DNODE_OBJECT, 0, dn->dn_dbuf->db_blkid);
-       err = arc_untransform(dnode_abuf, os->os_spa, &zb, B_TRUE);
+           DMU_META_DNODE_OBJECT, 0, dndb->db_blkid);
+       err = arc_untransform(dnbuf, os->os_spa, &zb, B_TRUE);
 
        /*
         * An error code of EACCES tells us that the key is still not
@@ -1547,7 +1556,7 @@ dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags)
            !DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))))
                err = 0;
 
-       DB_DNODE_EXIT(db);
+       mutex_exit(&dndb->db_mtx);
 
        return (err);
 }
@@ -1573,7 +1582,7 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
            RW_LOCK_HELD(&db->db_parent->db_rwlock));
 
        if (db->db_blkid == DMU_BONUS_BLKID) {
-               err = dbuf_read_bonus(db, dn, flags);
+               err = dbuf_read_bonus(db, dn);
                goto early_unlock;
        }
 
@@ -1635,10 +1644,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, dnode_t *dn, zio_t *zio, uint32_t flags,
                goto early_unlock;
        }
 
-       err = dbuf_read_verify_dnode_crypt(db, flags);
-       if (err != 0)
-               goto early_unlock;
-
        db->db_state = DB_READ;
        DTRACE_SET_STATE(db, "read issued");
        mutex_exit(&db->db_mtx);
@@ -1754,19 +1759,23 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 int
 dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
 {
-       int err = 0;
-       boolean_t prefetch;
        dnode_t *dn;
+       boolean_t miss = B_TRUE, need_wait = B_FALSE, prefetch;
+       int err;
 
-       /*
-        * We don't have to hold the mutex to check db_state because it
-        * can't be freed while we have a hold on the buffer.
-        */
        ASSERT(!zfs_refcount_is_zero(&db->db_holds));
 
        DB_DNODE_ENTER(db);
        dn = DB_DNODE(db);
 
+       /*
+        * Ensure that this block's dnode has been decrypted if the caller
+        * has requested decrypted data.
+        */
+       err = dbuf_read_verify_dnode_crypt(db, dn, flags);
+       if (err != 0)
+               goto done;
+
        prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
            (flags & DB_RF_NOPREFETCH) == 0;
 
@@ -1775,13 +1784,38 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
                db->db_partial_read = B_TRUE;
        else if (!(flags & DB_RF_PARTIAL_MORE))
                db->db_partial_read = B_FALSE;
-       if (db->db_state == DB_CACHED) {
+       miss = (db->db_state != DB_CACHED);
+
+       if (db->db_state == DB_READ || db->db_state == DB_FILL) {
                /*
-                * Ensure that this block's dnode has been decrypted if
-                * the caller has requested decrypted data.
+                * Another reader came in while the dbuf was in flight between
+                * UNCACHED and CACHED.  Either a writer will finish filling
+                * the buffer, sending the dbuf to CACHED, or the first reader's
+                * request will reach the read_done callback and send the dbuf
+                * to CACHED.  Otherwise, a failure occurred and the dbuf will
+                * be sent to UNCACHED.
                 */
-               err = dbuf_read_verify_dnode_crypt(db, flags);
+               if (flags & DB_RF_NEVERWAIT) {
+                       mutex_exit(&db->db_mtx);
+                       DB_DNODE_EXIT(db);
+                       goto done;
+               }
+               do {
+                       ASSERT(db->db_state == DB_READ ||
+                           (flags & DB_RF_HAVESTRUCT) == 0);
+                       DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db,
+                           zio_t *, pio);
+                       cv_wait(&db->db_changed, &db->db_mtx);
+               } while (db->db_state == DB_READ || db->db_state == DB_FILL);
+               if (db->db_state == DB_UNCACHED) {
+                       err = SET_ERROR(EIO);
+                       mutex_exit(&db->db_mtx);
+                       DB_DNODE_EXIT(db);
+                       goto done;
+               }
+       }
 
+       if (db->db_state == DB_CACHED) {
                /*
                 * If the arc buf is compressed or encrypted and the caller
                 * requested uncompressed data, we need to untransform it
@@ -1789,8 +1823,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
                 * unauthenticated blocks, which will verify their MAC if
                 * the key is now available.
                 */
-               if (err == 0 && db->db_buf != NULL &&
-                   (flags & DB_RF_NO_DECRYPT) == 0 &&
+               if ((flags & DB_RF_NO_DECRYPT) == 0 && db->db_buf != NULL &&
                    (arc_is_encrypted(db->db_buf) ||
                    arc_is_unauthenticated(db->db_buf) ||
                    arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) {
@@ -1804,17 +1837,10 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
                        dbuf_set_data(db, db->db_buf);
                }
                mutex_exit(&db->db_mtx);
-               if (err == 0 && prefetch) {
-                       dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
-                           B_FALSE, flags & DB_RF_HAVESTRUCT);
-               }
-               DB_DNODE_EXIT(db);
-               DBUF_STAT_BUMP(hash_hits);
-       } else if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL) {
-               boolean_t need_wait = B_FALSE;
-
+       } else {
+               ASSERT(db->db_state == DB_UNCACHED ||
+                   db->db_state == DB_NOFILL);
                db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
-
                if (pio == NULL && (db->db_state == DB_NOFILL ||
                    (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)))) {
                        spa_t *spa = dn->dn_objset->os_spa;
@@ -1822,65 +1848,33 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *pio, uint32_t flags)
                        need_wait = B_TRUE;
                }
                err = dbuf_read_impl(db, dn, pio, flags, dblt, FTAG);
-               /*
-                * dbuf_read_impl has dropped db_mtx and our parent's rwlock
-                * for us
-                */
-               if (!err && prefetch) {
-                       dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
-                           db->db_state != DB_CACHED,
-                           flags & DB_RF_HAVESTRUCT);
-               }
-
-               DB_DNODE_EXIT(db);
-               DBUF_STAT_BUMP(hash_misses);
+               /* dbuf_read_impl drops db_mtx and parent's rwlock. */
+               miss = (db->db_state != DB_CACHED);
+       }
 
-               /*
-                * If we created a zio_root we must execute it to avoid
-                * leaking it, even if it isn't attached to any work due
-                * to an error in dbuf_read_impl().
-                */
-               if (need_wait) {
-                       if (err == 0)
-                               err = zio_wait(pio);
-                       else
-                               (void) zio_wait(pio);
-                       pio = NULL;
-               }
-       } else {
-               /*
-                * Another reader came in while the dbuf was in flight
-                * between UNCACHED and CACHED.  Either a writer will finish
-                * writing the buffer (sending the dbuf to CACHED) or the
-                * first reader's request will reach the read_done callback
-                * and send the dbuf to CACHED.  Otherwise, a failure
-                * occurred and the dbuf went to UNCACHED.
-                */
-               mutex_exit(&db->db_mtx);
-               if (prefetch) {
-                       dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE,
-                           B_TRUE, flags & DB_RF_HAVESTRUCT);
-               }
-               DB_DNODE_EXIT(db);
-               DBUF_STAT_BUMP(hash_misses);
+       if (err == 0 && prefetch) {
+               dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, miss,
+                   flags & DB_RF_HAVESTRUCT);
+       }
+       DB_DNODE_EXIT(db);
 
-               /* Skip the wait per the caller's request. */
-               if ((flags & DB_RF_NEVERWAIT) == 0) {
-                       mutex_enter(&db->db_mtx);
-                       while (db->db_state == DB_READ ||
-                           db->db_state == DB_FILL) {
-                               ASSERT(db->db_state == DB_READ ||
-                                   (flags & DB_RF_HAVESTRUCT) == 0);
-                               DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
-                                   db, zio_t *, pio);
-                               cv_wait(&db->db_changed, &db->db_mtx);
-                       }
-                       if (db->db_state == DB_UNCACHED)
-                               err = SET_ERROR(EIO);
-                       mutex_exit(&db->db_mtx);
-               }
+       /*
+        * If we created a zio we must execute it to avoid leaking it, even if
+        * it isn't attached to any work due to an error in dbuf_read_impl().
+        */
+       if (need_wait) {
+               if (err == 0)
+                       err = zio_wait(pio);
+               else
+                       (void) zio_wait(pio);
+               pio = NULL;
        }
 
+done:
+       if (miss)
+               DBUF_STAT_BUMP(hash_misses);
+       else
+               DBUF_STAT_BUMP(hash_hits);
        if (pio && err != 0) {
                zio_t *zio = zio_null(pio, pio->io_spa, NULL, NULL, NULL,
                    ZIO_FLAG_CANFAIL);