]> git.proxmox.com Git - mirror_zfs.git/commitdiff
deadlock between spa_errlog_lock and dp_config_rwlock
authorMatthew Ahrens <mahrens@delphix.com>
Thu, 22 Dec 2022 19:48:49 +0000 (11:48 -0800)
committerGitHub <noreply@github.com>
Thu, 22 Dec 2022 19:48:49 +0000 (11:48 -0800)
There is a lock order inversion deadlock between `spa_errlog_lock` and
`dp_config_rwlock`:

A thread in `spa_delete_dataset_errlog()` is running from a sync task.
It is holding the `dp_config_rwlock` for writer (see
`dsl_sync_task_sync()`), and waiting for the `spa_errlog_lock`.

A thread in `dsl_pool_config_enter()` is holding the `spa_errlog_lock`
(see `spa_get_errlog_size()`) and waiting for the `dp_config_rwlock` (as
reader).

Note that this was introduced by #12812.

This commit address this by defining the lock ordering to be
dp_config_rwlock first, then spa_errlog_lock / spa_errlist_lock.
spa_get_errlog() and spa_get_errlog_size() can acquire the locks in this
order, and then process_error_block() and get_head_and_birth_txg() can
verify that the dp_config_rwlock is already held.

Additionally, a buffer overrun in `spa_get_errlog()` is corrected.  Many
code paths didn't check if `*count` got to zero, instead continuing to
overwrite past the beginning of the userspace buffer at `uaddr`.

Tested by having some errors in the pool (via `zinject -t data
/path/to/file`), one thread running `zpool iostat 0.001`, and another
thread runs `zfs destroy` (in a loop, although it hits the first time).
This reproduces the problem easily without the fix, and works with the
fix.

Reviewed-by: Mark Maybee <mark.maybee@delphix.com>
Reviewed-by: George Amanakis <gamanakis@gmail.com>
Reviewed-by: George Wilson <gwilson@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: Matthew Ahrens <mahrens@delphix.com>
Closes #14239
Closes #14289

cmd/zpool/zpool_main.c
cmd/ztest.c
include/sys/spa.h
lib/libzfs/libzfs_pool.c
lib/libzfs/libzfs_status.c
module/zfs/dsl_scan.c
module/zfs/spa.c
module/zfs/spa_errlog.c
module/zfs/zfs_ioctl.c

index af76d20f2d52a3c4be7e9171ad3a5958182c1d4f..04e80efbb32c54141aec1a092fdedf47dc8db8dd 100644 (file)
@@ -8599,37 +8599,17 @@ status_callback(zpool_handle_t *zhp, void *data)
 
                if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
                    &nerr) == 0) {
-                       nvlist_t *nverrlist = NULL;
-
-                       /*
-                        * If the approximate error count is small, get a
-                        * precise count by fetching the entire log and
-                        * uniquifying the results.
-                        */
-                       if (nerr > 0 && nerr < 100 && !cbp->cb_verbose &&
-                           zpool_get_errlog(zhp, &nverrlist) == 0) {
-                               nvpair_t *elem;
-
-                               elem = NULL;
-                               nerr = 0;
-                               while ((elem = nvlist_next_nvpair(nverrlist,
-                                   elem)) != NULL) {
-                                       nerr++;
-                               }
-                       }
-                       nvlist_free(nverrlist);
-
                        (void) printf("\n");
-
-                       if (nerr == 0)
-                               (void) printf(gettext("errors: No known data "
-                                   "errors\n"));
-                       else if (!cbp->cb_verbose)
+                       if (nerr == 0) {
+                               (void) printf(gettext(
+                                   "errors: No known data errors\n"));
+                       } else if (!cbp->cb_verbose) {
                                (void) printf(gettext("errors: %llu data "
                                    "errors, use '-v' for a list\n"),
                                    (u_longlong_t)nerr);
-                       else
+                       } else {
                                print_error_log(zhp);
+                       }
                }
 
                if (cbp->cb_dedup_stats)
index 605ce5cea3b64c0e31ec4b2e6643c8ddf1715382..4a031dac210c60ec5f920ef7dddab3f17eebf0cf 100644 (file)
@@ -6313,7 +6313,7 @@ ztest_scrub_impl(spa_t *spa)
        while (dsl_scan_scrubbing(spa_get_dsl(spa)))
                txg_wait_synced(spa_get_dsl(spa), 0);
 
-       if (spa_get_errlog_size(spa) > 0)
+       if (spa_approx_errlog_size(spa) > 0)
                return (ECKSUM);
 
        ztest_pool_scrubbed = B_TRUE;
index b260dd32820e5decf450f8d852d50271381cc950..500eb3491a9927dd77d38cff8b9d452d3ffc8dfc 100644 (file)
@@ -1146,7 +1146,7 @@ extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type,
 extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
 extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate);
 extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd);
-extern uint64_t spa_get_errlog_size(spa_t *spa);
+extern uint64_t spa_approx_errlog_size(spa_t *spa);
 extern int spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count);
 extern void spa_errlog_rotate(spa_t *spa);
 extern void spa_errlog_drain(spa_t *spa);
index 7f7e19a090bc5d32d39614d7dc0b5a42f7582a7c..2977986e1e6f4001722ff2961283a610119d334f 100644 (file)
@@ -4133,33 +4133,28 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
 {
        zfs_cmd_t zc = {"\0"};
        libzfs_handle_t *hdl = zhp->zpool_hdl;
-       uint64_t count;
-       zbookmark_phys_t *zb = NULL;
-       int i;
+       zbookmark_phys_t *buf;
+       uint64_t buflen = 10000; /* approx. 1MB of RAM */
+
+       if (fnvlist_lookup_uint64(zhp->zpool_config,
+           ZPOOL_CONFIG_ERRCOUNT) == 0)
+               return (0);
 
        /*
-        * Retrieve the raw error list from the kernel.  If the number of errors
-        * has increased, allocate more space and continue until we get the
-        * entire list.
+        * Retrieve the raw error list from the kernel.  If it doesn't fit,
+        * allocate a larger buffer and retry.
         */
-       count = fnvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_ERRCOUNT);
-       if (count == 0)
-               return (0);
-       zc.zc_nvlist_dst = (uintptr_t)zfs_alloc(zhp->zpool_hdl,
-           count * sizeof (zbookmark_phys_t));
-       zc.zc_nvlist_dst_size = count;
        (void) strcpy(zc.zc_name, zhp->zpool_name);
        for (;;) {
+               buf = zfs_alloc(zhp->zpool_hdl,
+                   buflen * sizeof (zbookmark_phys_t));
+               zc.zc_nvlist_dst = (uintptr_t)buf;
+               zc.zc_nvlist_dst_size = buflen;
                if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_ERROR_LOG,
                    &zc) != 0) {
-                       free((void *)(uintptr_t)zc.zc_nvlist_dst);
+                       free(buf);
                        if (errno == ENOMEM) {
-                               void *dst;
-
-                               count = zc.zc_nvlist_dst_size;
-                               dst = zfs_alloc(zhp->zpool_hdl, count *
-                                   sizeof (zbookmark_phys_t));
-                               zc.zc_nvlist_dst = (uintptr_t)dst;
+                               buflen *= 2;
                        } else {
                                return (zpool_standard_error_fmt(hdl, errno,
                                    dgettext(TEXT_DOMAIN, "errors: List of "
@@ -4177,18 +4172,17 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
         * _not_ copied as part of the process.  So we point the start of our
         * array appropriate and decrement the total number of elements.
         */
-       zb = ((zbookmark_phys_t *)(uintptr_t)zc.zc_nvlist_dst) +
-           zc.zc_nvlist_dst_size;
-       count -= zc.zc_nvlist_dst_size;
+       zbookmark_phys_t *zb = buf + zc.zc_nvlist_dst_size;
+       uint64_t zblen = buflen - zc.zc_nvlist_dst_size;
 
-       qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_mem_compare);
+       qsort(zb, zblen, sizeof (zbookmark_phys_t), zbookmark_mem_compare);
 
        verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0);
 
        /*
         * Fill in the nverrlistp with nvlist's of dataset and object numbers.
         */
-       for (i = 0; i < count; i++) {
+       for (uint64_t i = 0; i < zblen; i++) {
                nvlist_t *nv;
 
                /* ignoring zb_blkid and zb_level for now */
@@ -4215,11 +4209,11 @@ zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
                nvlist_free(nv);
        }
 
-       free((void *)(uintptr_t)zc.zc_nvlist_dst);
+       free(buf);
        return (0);
 
 nomem:
-       free((void *)(uintptr_t)zc.zc_nvlist_dst);
+       free(buf);
        return (no_memory(zhp->zpool_hdl));
 }
 
index 6999d9afc5cd9337968cdcf6a9c384a4b72b48ef..27bb4476d706cb511b9c8acac145d4a5806a9fe7 100644 (file)
@@ -222,7 +222,6 @@ check_status(nvlist_t *config, boolean_t isimport,
 {
        pool_scan_stat_t *ps = NULL;
        uint_t vsc, psc;
-       uint64_t nerr;
        uint64_t suspended;
        uint64_t hostid = 0;
        uint64_t errata = 0;
@@ -392,6 +391,7 @@ check_status(nvlist_t *config, boolean_t isimport,
         * Persistent data errors.
         */
        if (!isimport) {
+               uint64_t nerr;
                if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT,
                    &nerr) == 0 && nerr != 0)
                        return (ZPOOL_STATUS_CORRUPT_DATA);
index 03c2aa313af080775c80afbb221736fe42f1f5c8..c4b90991a4452fb6f04485ee0a74aa2b637e0bab 100644 (file)
@@ -944,13 +944,13 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
 
        if (dsl_scan_restarting(scn, tx))
                spa_history_log_internal(spa, "scan aborted, restarting", tx,
-                   "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));
+                   "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
        else if (!complete)
                spa_history_log_internal(spa, "scan cancelled", tx,
-                   "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));
+                   "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
        else
                spa_history_log_internal(spa, "scan done", tx,
-                   "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa));
+                   "errors=%llu", (u_longlong_t)spa_approx_errlog_size(spa));
 
        if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) {
                spa->spa_scrub_active = B_FALSE;
@@ -1013,7 +1013,7 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx)
                    vdev_clear_resilver_deferred(spa->spa_root_vdev, tx)) {
                        spa_history_log_internal(spa,
                            "starting deferred resilver", tx, "errors=%llu",
-                           (u_longlong_t)spa_get_errlog_size(spa));
+                           (u_longlong_t)spa_approx_errlog_size(spa));
                        spa_async_request(spa, SPA_ASYNC_RESILVER);
                }
 
index fe7051db273717433efd6d8da1419738da1bb062..67b3a03a951a978d0a638f6e70954c9ac35c46ae 100644 (file)
@@ -5543,7 +5543,7 @@ spa_get_stats(const char *name, nvlist_t **config,
 
                        fnvlist_add_uint64(*config,
                            ZPOOL_CONFIG_ERRCOUNT,
-                           spa_get_errlog_size(spa));
+                           spa_approx_errlog_size(spa));
 
                        if (spa_suspended(spa)) {
                                fnvlist_add_uint64(*config,
index 30e1249dd3b0cc262b9f2c828c7afa8e5503afc5..c6d97eed28926d0566fda125606d32f5cc387cc0 100644 (file)
@@ -160,10 +160,8 @@ get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj,
        dsl_dataset_t *ds;
        objset_t *os;
 
-       dsl_pool_config_enter(dp, FTAG);
        int error = dsl_dataset_hold_obj(dp, ds_obj, FTAG, &ds);
        if (error != 0) {
-               dsl_pool_config_exit(dp, FTAG);
                return (error);
        }
        ASSERT(head_dataset_id);
@@ -172,7 +170,6 @@ get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj,
        error = dmu_objset_from_ds(ds, &os);
        if (error != 0) {
                dsl_dataset_rele(ds, FTAG);
-               dsl_pool_config_exit(dp, FTAG);
                return (error);
        }
 
@@ -189,7 +186,6 @@ get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj,
            ZFS_KEYSTATUS_UNAVAILABLE) {
                zep->zb_birth = 0;
                dsl_dataset_rele(ds, FTAG);
-               dsl_pool_config_exit(dp, FTAG);
                return (0);
        }
 
@@ -199,7 +195,6 @@ get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj,
        error = dnode_hold(os, zep->zb_object, FTAG, &dn);
        if (error != 0) {
                dsl_dataset_rele(ds, FTAG);
-               dsl_pool_config_exit(dp, FTAG);
                return (error);
        }
 
@@ -225,7 +220,6 @@ get_head_and_birth_txg(spa_t *spa, zbookmark_err_phys_t *zep, uint64_t ds_obj,
        rw_exit(&dn->dn_struct_rwlock);
        dnode_rele(dn, FTAG);
        dsl_dataset_rele(ds, FTAG);
-       dsl_pool_config_exit(dp, FTAG);
        return (error);
 }
 
@@ -303,17 +297,31 @@ find_birth_txg(dsl_dataset_t *ds, zbookmark_err_phys_t *zep,
 }
 
 /*
- * This function serves a double role. If only_count is true, it returns
- * (in *count) how many times an error block belonging to this filesystem is
- * referenced by snapshots or clones. If only_count is false, each time the
- * error block is referenced by a snapshot or clone, it fills the userspace
- * array at uaddr with the bookmarks of the error blocks. The array is filled
- * from the back and *count is modified to be the number of unused entries at
- * the beginning of the array.
+ * Copy the bookmark to the end of the user-space buffer which starts at
+ * uaddr and has *count unused entries, and decrement *count by 1.
+ */
+static int
+copyout_entry(const zbookmark_phys_t *zb, void *uaddr, uint64_t *count)
+{
+       if (*count == 0)
+               return (SET_ERROR(ENOMEM));
+
+       *count -= 1;
+       if (copyout(zb, (char *)uaddr + (*count) * sizeof (zbookmark_phys_t),
+           sizeof (zbookmark_phys_t)) != 0)
+               return (SET_ERROR(EFAULT));
+       return (0);
+}
+
+/*
+ * Each time the error block is referenced by a snapshot or clone, add a
+ * zbookmark_phys_t entry to the userspace array at uaddr. The array is
+ * filled from the back and the in-out parameter *count is modified to be the
+ * number of unused entries at the beginning of the array.
  */
 static int
 check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
-    uint64_t *count, void *uaddr, boolean_t only_count)
+    void *uaddr, uint64_t *count)
 {
        dsl_dataset_t *ds;
        dsl_pool_t *dp = spa->spa_dsl_pool;
@@ -343,18 +351,12 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
        }
        if (zep->zb_birth == latest_txg) {
                /* Block neither free nor rewritten. */
-               if (!only_count) {
-                       zbookmark_phys_t zb;
-                       zep_to_zb(head_ds, zep, &zb);
-                       if (copyout(&zb, (char *)uaddr + (*count - 1)
-                           * sizeof (zbookmark_phys_t),
-                           sizeof (zbookmark_phys_t)) != 0) {
-                               dsl_dataset_rele(ds, FTAG);
-                               return (SET_ERROR(EFAULT));
-                       }
-                       (*count)--;
-               } else {
-                       (*count)++;
+               zbookmark_phys_t zb;
+               zep_to_zb(head_ds, zep, &zb);
+               error = copyout_entry(&zb, uaddr, count);
+               if (error != 0) {
+                       dsl_dataset_rele(ds, FTAG);
+                       return (error);
                }
                check_snapshot = B_FALSE;
        } else {
@@ -407,19 +409,12 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
                        snap_obj_array[aff_snap_count] = snap_obj;
                        aff_snap_count++;
 
-                       if (!only_count) {
-                               zbookmark_phys_t zb;
-                               zep_to_zb(snap_obj, zep, &zb);
-                               if (copyout(&zb, (char *)uaddr + (*count - 1) *
-                                   sizeof (zbookmark_phys_t),
-                                   sizeof (zbookmark_phys_t)) != 0) {
-                                       dsl_dataset_rele(ds, FTAG);
-                                       error = SET_ERROR(EFAULT);
-                                       goto out;
-                               }
-                               (*count)--;
-                       } else {
-                               (*count)++;
+                       zbookmark_phys_t zb;
+                       zep_to_zb(snap_obj, zep, &zb);
+                       error = copyout_entry(&zb, uaddr, count);
+                       if (error != 0) {
+                               dsl_dataset_rele(ds, FTAG);
+                               goto out;
                        }
 
                        /*
@@ -433,8 +428,7 @@ check_filesystem(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
                            zap_cursor_retrieve(&zc, &za) == 0;
                            zap_cursor_advance(&zc)) {
                                error = check_filesystem(spa,
-                                   za.za_first_integer, zep,
-                                   count, uaddr, only_count);
+                                   za.za_first_integer, zep, uaddr, count);
 
                                if (error != 0) {
                                        zap_cursor_fini(&zc);
@@ -477,11 +471,8 @@ find_top_affected_fs(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
 
 static int
 process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
-    uint64_t *count, void *uaddr, boolean_t only_count)
+    void *uaddr, uint64_t *count)
 {
-       dsl_pool_t *dp = spa->spa_dsl_pool;
-       uint64_t top_affected_fs;
-
        /*
         * If the zb_birth is 0 it means we failed to retrieve the birth txg
         * of the block pointer. This happens when an encrypted filesystem is
@@ -489,94 +480,23 @@ process_error_block(spa_t *spa, uint64_t head_ds, zbookmark_err_phys_t *zep,
         * check_filesystem(), instead do the accounting here.
         */
        if (zep->zb_birth == 0) {
-               if (!only_count) {
-                       zbookmark_phys_t zb;
-                       zep_to_zb(head_ds, zep, &zb);
-                       if (copyout(&zb, (char *)uaddr + (*count - 1)
-                           * sizeof (zbookmark_phys_t),
-                           sizeof (zbookmark_phys_t)) != 0) {
-                               return (SET_ERROR(EFAULT));
-                       }
-                       (*count)--;
-               } else {
-                       (*count)++;
+               zbookmark_phys_t zb;
+               zep_to_zb(head_ds, zep, &zb);
+               int error = copyout_entry(&zb, uaddr, count);
+               if (error != 0) {
+                       return (error);
                }
                return (0);
        }
 
-       dsl_pool_config_enter(dp, FTAG);
+       uint64_t top_affected_fs;
        int error = find_top_affected_fs(spa, head_ds, zep, &top_affected_fs);
-       if (error == 0)
-               error = check_filesystem(spa, top_affected_fs, zep, count,
-                   uaddr, only_count);
-
-       dsl_pool_config_exit(dp, FTAG);
-       return (error);
-}
-
-static uint64_t
-get_errlog_size(spa_t *spa, uint64_t spa_err_obj)
-{
-       if (spa_err_obj == 0)
-               return (0);
-       uint64_t total = 0;
-
-       zap_cursor_t zc;
-       zap_attribute_t za;
-       for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
-           zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {
-
-               zap_cursor_t head_ds_cursor;
-               zap_attribute_t head_ds_attr;
-               zbookmark_err_phys_t head_ds_block;
-
-               uint64_t head_ds;
-               name_to_object(za.za_name, &head_ds);
-
-               for (zap_cursor_init(&head_ds_cursor, spa->spa_meta_objset,
-                   za.za_first_integer); zap_cursor_retrieve(&head_ds_cursor,
-                   &head_ds_attr) == 0; zap_cursor_advance(&head_ds_cursor)) {
-
-                       name_to_errphys(head_ds_attr.za_name, &head_ds_block);
-                       (void) process_error_block(spa, head_ds, &head_ds_block,
-                           &total, NULL, B_TRUE);
-               }
-               zap_cursor_fini(&head_ds_cursor);
+       if (error == 0) {
+               error = check_filesystem(spa, top_affected_fs, zep,
+                   uaddr, count);
        }
-       zap_cursor_fini(&zc);
-       return (total);
-}
-
-static uint64_t
-get_errlist_size(spa_t *spa, avl_tree_t *tree)
-{
-       if (avl_numnodes(tree) == 0)
-               return (0);
-       uint64_t total = 0;
 
-       spa_error_entry_t *se;
-       for (se = avl_first(tree); se != NULL; se = AVL_NEXT(tree, se)) {
-               zbookmark_err_phys_t zep;
-               zep.zb_object = se->se_bookmark.zb_object;
-               zep.zb_level = se->se_bookmark.zb_level;
-               zep.zb_blkid = se->se_bookmark.zb_blkid;
-               zep.zb_birth = 0;
-
-               /*
-                * If we cannot find out the head dataset and birth txg of
-                * the present error block, we opt not to error out. In the
-                * next pool sync this information will be retrieved by
-                * sync_error_list() and written to the on-disk error log.
-                */
-               uint64_t head_ds_obj;
-               int error = get_head_and_birth_txg(spa, &zep,
-                   se->se_bookmark.zb_objset, &head_ds_obj);
-
-               if (!error)
-                       (void) process_error_block(spa, head_ds_obj, &zep,
-                           &total, NULL, B_TRUE);
-       }
-       return (total);
+       return (error);
 }
 #endif
 
@@ -677,13 +597,33 @@ spa_remove_error(spa_t *spa, zbookmark_phys_t *zb)
        spa_add_healed_error(spa, spa->spa_errlog_scrub, zb);
 }
 
+static uint64_t
+approx_errlog_size_impl(spa_t *spa, uint64_t spa_err_obj)
+{
+       if (spa_err_obj == 0)
+               return (0);
+       uint64_t total = 0;
+
+       zap_cursor_t zc;
+       zap_attribute_t za;
+       for (zap_cursor_init(&zc, spa->spa_meta_objset, spa_err_obj);
+           zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) {
+               uint64_t count;
+               if (zap_count(spa->spa_meta_objset, za.za_first_integer,
+                   &count) == 0)
+                       total += count;
+       }
+       zap_cursor_fini(&zc);
+       return (total);
+}
+
 /*
- * Return the number of errors currently in the error log.  This is actually the
- * sum of both the last log and the current log, since we don't know the union
- * of these logs until we reach userland.
+ * Return the approximate number of errors currently in the error log.  This
+ * will be nonzero if there are some errors, but otherwise it may be more
+ * or less than the number of entries returned by spa_get_errlog().
  */
 uint64_t
-spa_get_errlog_size(spa_t *spa)
+spa_approx_errlog_size(spa_t *spa)
 {
        uint64_t total = 0;
 
@@ -701,23 +641,16 @@ spa_get_errlog_size(spa_t *spa)
                        total += count;
                mutex_exit(&spa->spa_errlog_lock);
 
-               mutex_enter(&spa->spa_errlist_lock);
-               total += avl_numnodes(&spa->spa_errlist_last);
-               total += avl_numnodes(&spa->spa_errlist_scrub);
-               mutex_exit(&spa->spa_errlist_lock);
        } else {
-#ifdef _KERNEL
                mutex_enter(&spa->spa_errlog_lock);
-               total += get_errlog_size(spa, spa->spa_errlog_last);
-               total += get_errlog_size(spa, spa->spa_errlog_scrub);
+               total += approx_errlog_size_impl(spa, spa->spa_errlog_last);
+               total += approx_errlog_size_impl(spa, spa->spa_errlog_scrub);
                mutex_exit(&spa->spa_errlog_lock);
-
-               mutex_enter(&spa->spa_errlist_lock);
-               total += get_errlist_size(spa, &spa->spa_errlist_last);
-               total += get_errlist_size(spa, &spa->spa_errlist_scrub);
-               mutex_exit(&spa->spa_errlist_lock);
-#endif
        }
+       mutex_enter(&spa->spa_errlist_lock);
+       total += avl_numnodes(&spa->spa_errlist_last);
+       total += avl_numnodes(&spa->spa_errlist_scrub);
+       mutex_exit(&spa->spa_errlist_lock);
        return (total);
 }
 
@@ -860,8 +793,7 @@ spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx)
 
 #ifdef _KERNEL
 /*
- * If an error block is shared by two datasets it will be counted twice. For
- * detailed message see spa_get_errlog_size() above.
+ * If an error block is shared by two datasets it will be counted twice.
  */
 static int
 process_error_log(spa_t *spa, uint64_t obj, void *uaddr, uint64_t *count)
@@ -884,14 +816,11 @@ process_error_log(spa_t *spa, uint64_t obj, void *uaddr, uint64_t *count)
                        zbookmark_phys_t zb;
                        name_to_bookmark(za.za_name, &zb);
 
-                       if (copyout(&zb, (char *)uaddr +
-                           (*count - 1) * sizeof (zbookmark_phys_t),
-                           sizeof (zbookmark_phys_t)) != 0) {
+                       int error = copyout_entry(&zb, uaddr, count);
+                       if (error != 0) {
                                zap_cursor_fini(&zc);
-                               return (SET_ERROR(EFAULT));
+                               return (error);
                        }
-                       *count -= 1;
-
                }
                zap_cursor_fini(&zc);
                return (0);
@@ -914,7 +843,7 @@ process_error_log(spa_t *spa, uint64_t obj, void *uaddr, uint64_t *count)
                        zbookmark_err_phys_t head_ds_block;
                        name_to_errphys(head_ds_attr.za_name, &head_ds_block);
                        int error = process_error_block(spa, head_ds,
-                           &head_ds_block, count, uaddr, B_FALSE);
+                           &head_ds_block, uaddr, count);
 
                        if (error != 0) {
                                zap_cursor_fini(&head_ds_cursor);
@@ -936,16 +865,11 @@ process_error_list(spa_t *spa, avl_tree_t *list, void *uaddr, uint64_t *count)
        if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG)) {
                for (se = avl_first(list); se != NULL;
                    se = AVL_NEXT(list, se)) {
-
-                       if (*count == 0)
-                               return (SET_ERROR(ENOMEM));
-
-                       if (copyout(&se->se_bookmark, (char *)uaddr +
-                           (*count - 1) * sizeof (zbookmark_phys_t),
-                           sizeof (zbookmark_phys_t)) != 0)
-                               return (SET_ERROR(EFAULT));
-
-                       *count -= 1;
+                       int error =
+                           copyout_entry(&se->se_bookmark, uaddr, count);
+                       if (error != 0) {
+                               return (error);
+                       }
                }
                return (0);
        }
@@ -963,7 +887,7 @@ process_error_list(spa_t *spa, avl_tree_t *list, void *uaddr, uint64_t *count)
 
                if (!error)
                        error = process_error_block(spa, head_ds_obj, &zep,
-                           count, uaddr, B_FALSE);
+                           uaddr, count);
                if (error)
                        return (error);
        }
@@ -988,6 +912,12 @@ spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count)
        int ret = 0;
 
 #ifdef _KERNEL
+       /*
+        * The pool config lock is needed to hold a dataset_t via (among other
+        * places) process_error_list() -> get_head_and_birth_txg(), and lock
+        * ordering requires that we get it before the spa_errlog_lock.
+        */
+       dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
        mutex_enter(&spa->spa_errlog_lock);
 
        ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count);
@@ -1006,6 +936,7 @@ spa_get_errlog(spa_t *spa, void *uaddr, uint64_t *count)
        mutex_exit(&spa->spa_errlist_lock);
 
        mutex_exit(&spa->spa_errlog_lock);
+       dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
 #else
        (void) spa, (void) uaddr, (void) count;
 #endif
@@ -1174,6 +1105,13 @@ spa_errlog_sync(spa_t *spa, uint64_t txg)
        spa->spa_scrub_finished = B_FALSE;
 
        mutex_exit(&spa->spa_errlist_lock);
+
+       /*
+        * The pool config lock is needed to hold a dataset_t via
+        * sync_error_list() -> get_head_and_birth_txg(), and lock ordering
+        * requires that we get it before the spa_errlog_lock.
+        */
+       dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
        mutex_enter(&spa->spa_errlog_lock);
 
        tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
@@ -1218,6 +1156,7 @@ spa_errlog_sync(spa_t *spa, uint64_t txg)
        dmu_tx_commit(tx);
 
        mutex_exit(&spa->spa_errlog_lock);
+       dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
 }
 
 static void
@@ -1354,7 +1293,7 @@ spa_swap_errlog(spa_t *spa, uint64_t new_head_ds, uint64_t old_head_ds,
 #if defined(_KERNEL)
 /* error handling */
 EXPORT_SYMBOL(spa_log_error);
-EXPORT_SYMBOL(spa_get_errlog_size);
+EXPORT_SYMBOL(spa_approx_errlog_size);
 EXPORT_SYMBOL(spa_get_errlog);
 EXPORT_SYMBOL(spa_errlog_rotate);
 EXPORT_SYMBOL(spa_errlog_drain);
index a55d9f88ab7d62ac53b2aba3b8c0ace22794e52f..f913c3509b619f7f9fb27c09807ec39a128582a9 100644 (file)
@@ -5695,17 +5695,12 @@ zfs_ioc_error_log(zfs_cmd_t *zc)
 {
        spa_t *spa;
        int error;
-       uint64_t count = zc->zc_nvlist_dst_size;
 
        if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0)
                return (error);
 
        error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst,
-           &count);
-       if (error == 0)
-               zc->zc_nvlist_dst_size = count;
-       else
-               zc->zc_nvlist_dst_size = spa_get_errlog_size(spa);
+           &zc->zc_nvlist_dst_size);
 
        spa_close(spa, FTAG);