git.proxmox.com Git - mirror_zfs.git/commitdiff
Fixes for procfs files backed by linked lists
author John Gallagher <jgallag88@gmail.com>
Wed, 26 Sep 2018 18:08:12 +0000 (11:08 -0700)
committer Brian Behlendorf <behlendorf1@llnl.gov>
Wed, 26 Sep 2018 18:08:12 +0000 (11:08 -0700)
There are some issues with the way the seq_file interface is implemented
for kstats backed by linked lists (zfs_dbgmsgs and certain per-pool
debugging info):

* We don't account for the fact that seq_file sometimes visits a node
  multiple times, which results in missing messages when read through
  procfs.
* We don't keep separate state for each reader of a file, so concurrent
  readers will receive incorrect results.
* We don't account for the fact that entries may have been removed from
  the list between read syscalls, so reading from these files in procfs
  can cause the system to crash.

This change fixes these issues and adds procfs_list, a wrapper around a
linked list which abstracts away the details of implementing the
seq_file interface for a list and exposing the contents of the list
through procfs.

Reviewed by: Don Brady <don.brady@delphix.com>
Reviewed-by: Serapheim Dimitropoulos <serapheim@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed-by: Brian Behlendorf <behlendorf1@llnl.gov>
Signed-off-by: John Gallagher <john.gallagher@delphix.com>
External-issue: LX-1211
Closes #7819

27 files changed:
configure.ac
include/spl/sys/Makefile.am
include/spl/sys/kstat.h
include/spl/sys/procfs_list.h [new file with mode: 0644]
include/sys/spa.h
include/sys/zfs_context.h
include/sys/zfs_debug.h
lib/libzpool/kernel.c
module/spl/Makefile.in
module/spl/spl-kstat.c
module/spl/spl-procfs-list.c [new file with mode: 0644]
module/zfs/spa_stats.c
module/zfs/vdev_queue.c
module/zfs/zfs_debug.c
tests/runfiles/linux.run
tests/zfs-tests/tests/functional/Makefile.am
tests/zfs-tests/tests/functional/kstat/Makefile.am [deleted file]
tests/zfs-tests/tests/functional/kstat/cleanup.ksh [deleted file]
tests/zfs-tests/tests/functional/kstat/setup.ksh [deleted file]
tests/zfs-tests/tests/functional/kstat/state.ksh [deleted file]
tests/zfs-tests/tests/functional/procfs/Makefile.am [new file with mode: 0644]
tests/zfs-tests/tests/functional/procfs/cleanup.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/procfs/pool_state.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/procfs/procfs_list_basic.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/procfs/procfs_list_concurrent_readers.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/procfs/procfs_list_stale_read.ksh [new file with mode: 0755]
tests/zfs-tests/tests/functional/procfs/setup.ksh [new file with mode: 0755]

index 18d91b3599115508487dcdd2949f91a17c431445..301258e7f7565b6766cc3407378dbdbf973441f4 100644 (file)
@@ -283,7 +283,6 @@ AC_CONFIG_FILES([
        tests/zfs-tests/tests/functional/inheritance/Makefile
        tests/zfs-tests/tests/functional/inuse/Makefile
        tests/zfs-tests/tests/functional/io/Makefile
-       tests/zfs-tests/tests/functional/kstat/Makefile
        tests/zfs-tests/tests/functional/large_files/Makefile
        tests/zfs-tests/tests/functional/largest_pool/Makefile
        tests/zfs-tests/tests/functional/link_count/Makefile
@@ -301,6 +300,7 @@ AC_CONFIG_FILES([
        tests/zfs-tests/tests/functional/pool_checkpoint/Makefile
        tests/zfs-tests/tests/functional/poolversion/Makefile
        tests/zfs-tests/tests/functional/privilege/Makefile
+       tests/zfs-tests/tests/functional/procfs/Makefile
        tests/zfs-tests/tests/functional/projectquota/Makefile
        tests/zfs-tests/tests/functional/pyzfs/Makefile
        tests/zfs-tests/tests/functional/quota/Makefile
index d58ed0e20b1b4584d48a5abafe9362fa14b9a05a..e596ff3732f43ac31cca25e3077102298ef1224b 100644 (file)
@@ -28,6 +28,7 @@ KERNEL_H = \
        $(top_srcdir)/include/spl/sys/param.h \
        $(top_srcdir)/include/spl/sys/processor.h \
        $(top_srcdir)/include/spl/sys/proc.h \
+       $(top_srcdir)/include/spl/sys/procfs_list.h \
        $(top_srcdir)/include/spl/sys/random.h \
        $(top_srcdir)/include/spl/sys/rwlock.h \
        $(top_srcdir)/include/spl/sys/shrinker.h \
index f197ce455e65a4f1cca0a82c5baaeb95992c127e..53274d8f59c4e93df9af02e9afc8382ec5cad6f0 100644 (file)
@@ -98,30 +98,34 @@ typedef struct kstat_raw_ops {
        void *(*addr)(kstat_t *ksp, loff_t index);
 } kstat_raw_ops_t;
 
+typedef struct kstat_proc_entry {
+       char    kpe_name[KSTAT_STRLEN+1];       /* kstat name */
+       char    kpe_module[KSTAT_STRLEN+1];     /* provider module name */
+       kstat_module_t          *kpe_owner;     /* kstat module linkage */
+       struct list_head        kpe_list;       /* kstat linkage */
+       struct proc_dir_entry   *kpe_proc;      /* procfs entry */
+} kstat_proc_entry_t;
+
 struct kstat_s {
        int             ks_magic;               /* magic value */
        kid_t           ks_kid;                 /* unique kstat ID */
        hrtime_t        ks_crtime;              /* creation time */
        hrtime_t        ks_snaptime;            /* last access time */
-       char            ks_module[KSTAT_STRLEN+1]; /* provider module name */
        int             ks_instance;            /* provider module instance */
-       char            ks_name[KSTAT_STRLEN+1]; /* kstat name */
        char            ks_class[KSTAT_STRLEN+1]; /* kstat class */
        uchar_t         ks_type;                /* kstat data type */
        uchar_t         ks_flags;               /* kstat flags */
        void            *ks_data;               /* kstat type-specific data */
        uint_t          ks_ndata;               /* # of data records */
        size_t          ks_data_size;           /* size of kstat data section */
-       struct proc_dir_entry *ks_proc;         /* proc linkage */
        kstat_update_t  *ks_update;             /* dynamic updates */
        void            *ks_private;            /* private data */
        kmutex_t        ks_private_lock;        /* kstat private data lock */
        kmutex_t        *ks_lock;               /* kstat data lock */
-       struct list_head ks_list;               /* kstat linkage */
-       kstat_module_t  *ks_owner;              /* kstat module linkage */
        kstat_raw_ops_t ks_raw_ops;             /* ops table for raw type */
        char            *ks_raw_buf;            /* buf used for raw ops */
        size_t          ks_raw_bufsize;         /* size of raw ops buffer */
+       kstat_proc_entry_t      ks_proc;        /* data for procfs entry */
 };
 
 typedef struct kstat_named_s {
@@ -189,6 +193,12 @@ extern kstat_t *__kstat_create(const char *ks_module, int ks_instance,
     const char *ks_name, const char *ks_class, uchar_t ks_type,
     uint_t ks_ndata, uchar_t ks_flags);
 
+extern void kstat_proc_entry_init(kstat_proc_entry_t *kpep,
+    const char *module, const char *name);
+extern void kstat_proc_entry_delete(kstat_proc_entry_t *kpep);
+extern void kstat_proc_entry_install(kstat_proc_entry_t *kpep,
+    const struct file_operations *file_ops, void *data);
+
 extern void __kstat_install(kstat_t *ksp);
 extern void __kstat_delete(kstat_t *ksp);
 extern void kstat_waitq_enter(kstat_io_t *);
diff --git a/include/spl/sys/procfs_list.h b/include/spl/sys/procfs_list.h
new file mode 100644 (file)
index 0000000..cbcb4bc
--- /dev/null
@@ -0,0 +1,71 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2018 by Delphix. All rights reserved.
+ */
+
+#ifndef        _SPL_PROCFS_LIST_H
+#define        _SPL_PROCFS_LIST_H
+
+#include <sys/kstat.h>
+#include <sys/mutex.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+
+typedef struct procfs_list procfs_list_t;
+struct procfs_list {
+       /* Accessed only by user of a procfs_list */
+       void            *pl_private;
+
+       /*
+        * Accessed both by user of a procfs_list and by procfs_list
+        * implementation
+        */
+       kmutex_t        pl_lock;
+       list_t          pl_list;
+
+       /* Accessed only by procfs_list implementation */
+       uint64_t        pl_next_id;
+       int             (*pl_show)(struct seq_file *f, void *p);
+       int             (*pl_show_header)(struct seq_file *f);
+       int             (*pl_clear)(procfs_list_t *procfs_list);
+       size_t          pl_node_offset;
+       kstat_proc_entry_t      pl_kstat_entry;
+};
+
+typedef struct procfs_list_node {
+       list_node_t     pln_link;
+       uint64_t        pln_id;
+} procfs_list_node_t;
+
+void procfs_list_install(const char *module,
+    const char *name,
+    procfs_list_t *procfs_list,
+    int (*show)(struct seq_file *f, void *p),
+    int (*show_header)(struct seq_file *f),
+    int (*clear)(procfs_list_t *procfs_list),
+    size_t procfs_list_node_off);
+void procfs_list_uninstall(procfs_list_t *procfs_list);
+void procfs_list_destroy(procfs_list_t *procfs_list);
+
+void procfs_list_add(procfs_list_t *procfs_list, void *p);
+
+#endif /* _SPL_PROCFS_LIST_H */
index b86c655575bc167abac46b71b91aa33143ab6569..443d835a1bd05741fe21e5126e6005fe28364c10 100644 (file)
@@ -863,22 +863,27 @@ extern boolean_t spa_refcount_zero(spa_t *spa);
 #define        SCL_STATE_ALL   (SCL_STATE | SCL_L2ARC | SCL_ZIO)
 
 /* Historical pool statistics */
-typedef struct spa_stats_history {
+typedef struct spa_history_kstat {
        kmutex_t                lock;
        uint64_t                count;
        uint64_t                size;
        kstat_t                 *kstat;
        void                    *private;
        list_t                  list;
-} spa_stats_history_t;
+} spa_history_kstat_t;
+
+typedef struct spa_history_list {
+       uint64_t                size;
+       procfs_list_t           procfs_list;
+} spa_history_list_t;
 
 typedef struct spa_stats {
-       spa_stats_history_t     read_history;
-       spa_stats_history_t     txg_history;
-       spa_stats_history_t     tx_assign_histogram;
-       spa_stats_history_t     io_history;
-       spa_stats_history_t     mmp_history;
-       spa_stats_history_t     state;          /* pool state */
+       spa_history_list_t      read_history;
+       spa_history_list_t      txg_history;
+       spa_history_kstat_t     tx_assign_histogram;
+       spa_history_kstat_t     io_history;
+       spa_history_list_t      mmp_history;
+       spa_history_kstat_t     state;          /* pool state */
 } spa_stats_t;
 
 typedef enum txg_state {
@@ -911,7 +916,7 @@ extern void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs);
 extern int spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_kstat_id);
 extern int spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error,
     hrtime_t duration);
-extern void *spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
+extern void spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
     uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_kstat_id,
     int error);
 
index 6f502897ec18bedb9ad35c4687c2c61b7d733dec..11c048c2399c817ccb58da386dfdea1edf542d74 100644 (file)
@@ -62,6 +62,7 @@
 #include <sys/ctype.h>
 #include <sys/disp.h>
 #include <sys/trace.h>
+#include <sys/procfs_list.h>
 #include <linux/dcache_compat.h>
 #include <linux/utsname_compat.h>
 
@@ -351,6 +352,37 @@ extern void kstat_set_raw_ops(kstat_t *ksp,
     int (*data)(char *buf, size_t size, void *data),
     void *(*addr)(kstat_t *ksp, loff_t index));
 
+/*
+ * procfs list manipulation
+ */
+
+struct seq_file { };
+void seq_printf(struct seq_file *m, const char *fmt, ...);
+
+typedef struct procfs_list {
+       void            *pl_private;
+       kmutex_t        pl_lock;
+       list_t          pl_list;
+       uint64_t        pl_next_id;
+       size_t          pl_node_offset;
+} procfs_list_t;
+
+typedef struct procfs_list_node {
+       list_node_t     pln_link;
+       uint64_t        pln_id;
+} procfs_list_node_t;
+
+void procfs_list_install(const char *module,
+    const char *name,
+    procfs_list_t *procfs_list,
+    int (*show)(struct seq_file *f, void *p),
+    int (*show_header)(struct seq_file *f),
+    int (*clear)(procfs_list_t *procfs_list),
+    size_t procfs_list_node_off);
+void procfs_list_uninstall(procfs_list_t *procfs_list);
+void procfs_list_destroy(procfs_list_t *procfs_list);
+void procfs_list_add(procfs_list_t *procfs_list, void *p);
+
 /*
  * Kernel memory
  */
index aa9bfe21f02585abf0af850034e258d2a18911a7..f3a936ae7ba0d7ad2f102196832684180895d291 100644 (file)
@@ -76,13 +76,6 @@ extern void __dprintf(const char *file, const char *func,
 
 extern void zfs_panic_recover(const char *fmt, ...);
 
-typedef struct zfs_dbgmsg {
-       list_node_t zdm_node;
-       time_t zdm_timestamp;
-       int zdm_size;
-       char zdm_msg[1]; /* variable length allocation */
-} zfs_dbgmsg_t;
-
 extern void zfs_dbgmsg_init(void);
 extern void zfs_dbgmsg_fini(void);
 
index 341548ac300a1077cd809e3003f79bfecbc809e5..5baf52514a7b58e232da66f709ee5b8c82a86679 100644 (file)
@@ -424,6 +424,57 @@ cv_broadcast(kcondvar_t *cv)
        VERIFY0(pthread_cond_broadcast(cv));
 }
 
+/*
+ * =========================================================================
+ * procfs list
+ * =========================================================================
+ */
+
+void
+seq_printf(struct seq_file *m, const char *fmt, ...)
+{}
+
+void
+procfs_list_install(const char *module,
+    const char *name,
+    procfs_list_t *procfs_list,
+    int (*show)(struct seq_file *f, void *p),
+    int (*show_header)(struct seq_file *f),
+    int (*clear)(procfs_list_t *procfs_list),
+    size_t procfs_list_node_off)
+{
+       mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL);
+       list_create(&procfs_list->pl_list,
+           procfs_list_node_off + sizeof (procfs_list_node_t),
+           procfs_list_node_off + offsetof(procfs_list_node_t, pln_link));
+       procfs_list->pl_next_id = 1;
+       procfs_list->pl_node_offset = procfs_list_node_off;
+}
+
+void
+procfs_list_uninstall(procfs_list_t *procfs_list)
+{}
+
+void
+procfs_list_destroy(procfs_list_t *procfs_list)
+{
+       ASSERT(list_is_empty(&procfs_list->pl_list));
+       list_destroy(&procfs_list->pl_list);
+       mutex_destroy(&procfs_list->pl_lock);
+}
+
+#define        NODE_ID(procfs_list, obj) \
+               (((procfs_list_node_t *)(((char *)obj) + \
+               (procfs_list)->pl_node_offset))->pln_id)
+
+void
+procfs_list_add(procfs_list_t *procfs_list, void *p)
+{
+       ASSERT(MUTEX_HELD(&procfs_list->pl_lock));
+       NODE_ID(procfs_list, p) = procfs_list->pl_next_id++;
+       list_insert_tail(&procfs_list->pl_list, p);
+}
+
 /*
  * =========================================================================
  * vnode operations
index 97a431f22f12924aec3635216ea87abb0b4ec01b..3bcbf63cbc633214b7a7e70af493fbcd3aad382c 100644 (file)
@@ -18,6 +18,7 @@ $(MODULE)-objs += spl-kobj.o
 $(MODULE)-objs += spl-kstat.o
 $(MODULE)-objs += spl-mutex.o
 $(MODULE)-objs += spl-proc.o
+$(MODULE)-objs += spl-procfs-list.o
 $(MODULE)-objs += spl-rwlock.o
 $(MODULE)-objs += spl-taskq.o
 $(MODULE)-objs += spl-thread.o
index c3fc2e4b24f0e36d2628aed2439f99063e16ae08..8683693c8e6735cc047d76a3e1857ff185b60a19 100644 (file)
@@ -530,6 +530,18 @@ __kstat_set_raw_ops(kstat_t *ksp,
 }
 EXPORT_SYMBOL(__kstat_set_raw_ops);
 
+void
+kstat_proc_entry_init(kstat_proc_entry_t *kpep, const char *module,
+    const char *name)
+{
+       kpep->kpe_owner = NULL;
+       kpep->kpe_proc = NULL;
+       INIT_LIST_HEAD(&kpep->kpe_list);
+       strncpy(kpep->kpe_module, module, KSTAT_STRLEN);
+       strncpy(kpep->kpe_name, name, KSTAT_STRLEN);
+}
+EXPORT_SYMBOL(kstat_proc_entry_init);
+
 kstat_t *
 __kstat_create(const char *ks_module, int ks_instance, const char *ks_name,
     const char *ks_class, uchar_t ks_type, uint_t ks_ndata,
@@ -556,13 +568,10 @@ __kstat_create(const char *ks_module, int ks_instance, const char *ks_name,
        ksp->ks_magic = KS_MAGIC;
        mutex_init(&ksp->ks_private_lock, NULL, MUTEX_DEFAULT, NULL);
        ksp->ks_lock = &ksp->ks_private_lock;
-       INIT_LIST_HEAD(&ksp->ks_list);
 
        ksp->ks_crtime = gethrtime();
        ksp->ks_snaptime = ksp->ks_crtime;
-       strncpy(ksp->ks_module, ks_module, KSTAT_STRLEN);
        ksp->ks_instance = ks_instance;
-       strncpy(ksp->ks_name, ks_name, KSTAT_STRLEN);
        strncpy(ksp->ks_class, ks_class, KSTAT_STRLEN);
        ksp->ks_type = ks_type;
        ksp->ks_flags = ks_flags;
@@ -573,6 +582,7 @@ __kstat_create(const char *ks_module, int ks_instance, const char *ks_name,
        ksp->ks_raw_ops.addr = NULL;
        ksp->ks_raw_buf = NULL;
        ksp->ks_raw_bufsize = 0;
+       kstat_proc_entry_init(&ksp->ks_proc, ks_module, ks_name);
 
        switch (ksp->ks_type) {
                case KSTAT_TYPE_RAW:
@@ -614,14 +624,14 @@ __kstat_create(const char *ks_module, int ks_instance, const char *ks_name,
 EXPORT_SYMBOL(__kstat_create);
 
 static int
-kstat_detect_collision(kstat_t *ksp)
+kstat_detect_collision(kstat_proc_entry_t *kpep)
 {
        kstat_module_t *module;
-       kstat_t *tmp;
+       kstat_proc_entry_t *tmp;
        char *parent;
        char *cp;
 
-       parent = kmem_asprintf("%s", ksp->ks_module);
+       parent = kmem_asprintf("%s", kpep->kpe_module);
 
        if ((cp = strrchr(parent, '/')) == NULL) {
                strfree(parent);
@@ -630,8 +640,8 @@ kstat_detect_collision(kstat_t *ksp)
 
        cp[0] = '\0';
        if ((module = kstat_find_module(parent)) != NULL) {
-               list_for_each_entry(tmp, &module->ksm_kstat_list, ks_list) {
-                       if (strncmp(tmp->ks_name, cp+1, KSTAT_STRLEN) == 0) {
+               list_for_each_entry(tmp, &module->ksm_kstat_list, kpe_list) {
+                       if (strncmp(tmp->kpe_name, cp+1, KSTAT_STRLEN) == 0) {
                                strfree(parent);
                                return (EEXIST);
                        }
@@ -642,24 +652,30 @@ kstat_detect_collision(kstat_t *ksp)
        return (0);
 }
 
+/*
+ * Add a file to the proc filesystem under the kstat namespace (i.e.
+ * /proc/spl/kstat/). The file need not necessarily be implemented as a
+ * kstat.
+ */
 void
-__kstat_install(kstat_t *ksp)
+kstat_proc_entry_install(kstat_proc_entry_t *kpep,
+    const struct file_operations *file_ops, void *data)
 {
        kstat_module_t *module;
-       kstat_t *tmp;
+       kstat_proc_entry_t *tmp;
 
-       ASSERT(ksp);
+       ASSERT(kpep);
 
        mutex_enter(&kstat_module_lock);
 
-       module = kstat_find_module(ksp->ks_module);
+       module = kstat_find_module(kpep->kpe_module);
        if (module == NULL) {
-               if (kstat_detect_collision(ksp) != 0) {
+               if (kstat_detect_collision(kpep) != 0) {
                        cmn_err(CE_WARN, "kstat_create('%s', '%s'): namespace" \
-                           " collision", ksp->ks_module, ksp->ks_name);
+                           " collision", kpep->kpe_module, kpep->kpe_name);
                        goto out;
                }
-               module = kstat_create_module(ksp->ks_module);
+               module = kstat_create_module(kpep->kpe_module);
                if (module == NULL)
                        goto out;
        }
@@ -668,44 +684,60 @@ __kstat_install(kstat_t *ksp)
         * Only one entry by this name per-module, on failure the module
         * shouldn't be deleted because we know it has at least one entry.
         */
-       list_for_each_entry(tmp, &module->ksm_kstat_list, ks_list) {
-               if (strncmp(tmp->ks_name, ksp->ks_name, KSTAT_STRLEN) == 0)
+       list_for_each_entry(tmp, &module->ksm_kstat_list, kpe_list) {
+               if (strncmp(tmp->kpe_name, kpep->kpe_name, KSTAT_STRLEN) == 0)
                        goto out;
        }
 
-       list_add_tail(&ksp->ks_list, &module->ksm_kstat_list);
+       list_add_tail(&kpep->kpe_list, &module->ksm_kstat_list);
 
-       mutex_enter(ksp->ks_lock);
-       ksp->ks_owner = module;
-       ksp->ks_proc = proc_create_data(ksp->ks_name, 0644,
-           module->ksm_proc, &proc_kstat_operations, (void *)ksp);
-       if (ksp->ks_proc == NULL) {
-               list_del_init(&ksp->ks_list);
+       kpep->kpe_owner = module;
+       kpep->kpe_proc = proc_create_data(kpep->kpe_name, 0644,
+           module->ksm_proc, file_ops, data);
+       if (kpep->kpe_proc == NULL) {
+               list_del_init(&kpep->kpe_list);
                if (list_empty(&module->ksm_kstat_list))
                        kstat_delete_module(module);
        }
-       mutex_exit(ksp->ks_lock);
 out:
        mutex_exit(&kstat_module_lock);
+
+}
+EXPORT_SYMBOL(kstat_proc_entry_install);
+
+void
+__kstat_install(kstat_t *ksp)
+{
+       ASSERT(ksp);
+       kstat_proc_entry_install(&ksp->ks_proc, &proc_kstat_operations, ksp);
 }
 EXPORT_SYMBOL(__kstat_install);
 
 void
-__kstat_delete(kstat_t *ksp)
+kstat_proc_entry_delete(kstat_proc_entry_t *kpep)
 {
-       kstat_module_t *module = ksp->ks_owner;
+       kstat_module_t *module = kpep->kpe_owner;
+       if (kpep->kpe_proc)
+               remove_proc_entry(kpep->kpe_name, module->ksm_proc);
 
        mutex_enter(&kstat_module_lock);
-       list_del_init(&ksp->ks_list);
+       list_del_init(&kpep->kpe_list);
+
+       /*
+        * Remove top level module directory if it wasn't empty before, but now
+        * is.
+        */
+       if (kpep->kpe_proc && list_empty(&module->ksm_kstat_list))
+               kstat_delete_module(module);
        mutex_exit(&kstat_module_lock);
 
-       if (ksp->ks_proc) {
-               remove_proc_entry(ksp->ks_name, module->ksm_proc);
+}
+EXPORT_SYMBOL(kstat_proc_entry_delete);
 
-               /* Remove top level module directory if it's empty */
-               if (list_empty(&module->ksm_kstat_list))
-                       kstat_delete_module(module);
-       }
+void
+__kstat_delete(kstat_t *ksp)
+{
+       kstat_proc_entry_delete(&ksp->ks_proc);
 
        if (!(ksp->ks_flags & KSTAT_FLAG_VIRTUAL))
                kmem_free(ksp->ks_data, ksp->ks_data_size);
diff --git a/module/spl/spl-procfs-list.c b/module/spl/spl-procfs-list.c
new file mode 100644 (file)
index 0000000..4902e0a
--- /dev/null
@@ -0,0 +1,256 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/list.h>
+#include <sys/mutex.h>
+#include <sys/procfs_list.h>
+#include <linux/proc_fs.h>
+
+/*
+ * A procfs_list is a wrapper around a linked list which implements the seq_file
+ * interface, allowing the contents of the list to be exposed through procfs.
+ * The kernel already has some utilities to help implement the seq_file
+ * interface for linked lists (seq_list_*), but they aren't appropriate for use
+ * with lists that have many entries, because seq_list_start walks the list at
+ * the start of each read syscall to find where it left off, so reading a file
+ * ends up being quadratic in the number of entries in the list.
+ *
+ * This implementation avoids this penalty by maintaining a separate cursor into
+ * the list per instance of the file that is open. It also maintains some extra
+ * information in each node of the list to prevent reads of entries that have
+ * been dropped from the list.
+ *
+ * Callers should only add elements to the list using procfs_list_add, which
+ * adds an element to the tail of the list. Other operations can be performed
+ * directly on the wrapped list using the normal list manipulation functions,
+ * but elements should only be removed from the head of the list.
+ */
+
+#define        NODE_ID(procfs_list, obj) \
+               (((procfs_list_node_t *)(((char *)obj) + \
+               (procfs_list)->pl_node_offset))->pln_id)
+
+typedef struct procfs_list_cursor {
+       procfs_list_t   *procfs_list;   /* List into which this cursor points */
+       void            *cached_node;   /* Most recently accessed node */
+       loff_t          cached_pos;     /* Position of cached_node */
+} procfs_list_cursor_t;
+
+static int
+procfs_list_seq_show(struct seq_file *f, void *p)
+{
+       procfs_list_cursor_t *cursor = f->private;
+       procfs_list_t *procfs_list = cursor->procfs_list;
+
+       ASSERT(MUTEX_HELD(&procfs_list->pl_lock));
+       if (p == SEQ_START_TOKEN) {
+               if (procfs_list->pl_show_header != NULL)
+                       return (procfs_list->pl_show_header(f));
+               else
+                       return (0);
+       }
+       return (procfs_list->pl_show(f, p));
+}
+
+static void *
+procfs_list_next_node(procfs_list_cursor_t *cursor, loff_t *pos)
+{
+       void *next_node;
+       procfs_list_t *procfs_list = cursor->procfs_list;
+
+       if (cursor->cached_node == SEQ_START_TOKEN)
+               next_node = list_head(&procfs_list->pl_list);
+       else
+               next_node = list_next(&procfs_list->pl_list,
+                   cursor->cached_node);
+
+       if (next_node != NULL) {
+               cursor->cached_node = next_node;
+               cursor->cached_pos = NODE_ID(procfs_list, cursor->cached_node);
+               *pos = cursor->cached_pos;
+       }
+       return (next_node);
+}
+
+static void *
+procfs_list_seq_start(struct seq_file *f, loff_t *pos)
+{
+       procfs_list_cursor_t *cursor = f->private;
+       procfs_list_t *procfs_list = cursor->procfs_list;
+
+       mutex_enter(&procfs_list->pl_lock);
+
+       if (*pos == 0) {
+               cursor->cached_node = SEQ_START_TOKEN;
+               cursor->cached_pos = 0;
+               return (SEQ_START_TOKEN);
+       }
+
+       /*
+        * Check if our cached pointer has become stale, which happens if the
+        * message where we left off has been dropped from the list since
+        * the last read syscall completed.
+        */
+       void *oldest_node = list_head(&procfs_list->pl_list);
+       if (cursor->cached_node != SEQ_START_TOKEN && (oldest_node == NULL ||
+           NODE_ID(procfs_list, oldest_node) > cursor->cached_pos))
+               return (ERR_PTR(-EIO));
+
+       /*
+        * If it isn't starting from the beginning of the file, the seq_file
+        * code will either pick up at the same position it visited last or the
+        * following one.
+        */
+       if (*pos == cursor->cached_pos) {
+               return (cursor->cached_node);
+       } else {
+               ASSERT3U(*pos, ==, cursor->cached_pos + 1);
+               return (procfs_list_next_node(cursor, pos));
+       }
+}
+
+static void *
+procfs_list_seq_next(struct seq_file *f, void *p, loff_t *pos)
+{
+       procfs_list_cursor_t *cursor = f->private;
+       ASSERT(MUTEX_HELD(&cursor->procfs_list->pl_lock));
+       return (procfs_list_next_node(cursor, pos));
+}
+
+static void
+procfs_list_seq_stop(struct seq_file *f, void *p)
+{
+       procfs_list_cursor_t *cursor = f->private;
+       procfs_list_t *procfs_list = cursor->procfs_list;
+       mutex_exit(&procfs_list->pl_lock);
+}
+
+static struct seq_operations procfs_list_seq_ops = {
+       .show  = procfs_list_seq_show,
+       .start = procfs_list_seq_start,
+       .next  = procfs_list_seq_next,
+       .stop  = procfs_list_seq_stop,
+};
+
+static int
+procfs_list_open(struct inode *inode, struct file *filp)
+{
+       int rc = seq_open_private(filp, &procfs_list_seq_ops,
+           sizeof (procfs_list_cursor_t));
+       if (rc != 0)
+               return (rc);
+
+       struct seq_file *f = filp->private_data;
+       procfs_list_cursor_t *cursor = f->private;
+       cursor->procfs_list = PDE_DATA(inode);
+       cursor->cached_node = NULL;
+       cursor->cached_pos = 0;
+
+       return (0);
+}
+
+static ssize_t
+procfs_list_write(struct file *filp, const char __user *buf, size_t len,
+    loff_t *ppos)
+{
+       struct seq_file *f = filp->private_data;
+       procfs_list_cursor_t *cursor = f->private;
+       procfs_list_t *procfs_list = cursor->procfs_list;
+       int rc;
+
+       if (procfs_list->pl_clear != NULL &&
+           (rc = procfs_list->pl_clear(procfs_list)) != 0)
+               return (-rc);
+       return (len);
+}
+
+static struct file_operations procfs_list_operations = {
+       .owner          = THIS_MODULE,
+       .open           = procfs_list_open,
+       .write          = procfs_list_write,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = seq_release_private,
+};
+
+/*
+ * Initialize a procfs_list and create a file for it in the proc filesystem
+ * under the kstat namespace.
+ */
+void
+procfs_list_install(const char *module,
+    const char *name,
+    procfs_list_t *procfs_list,
+    int (*show)(struct seq_file *f, void *p),
+    int (*show_header)(struct seq_file *f),
+    int (*clear)(procfs_list_t *procfs_list),
+    size_t procfs_list_node_off)
+{
+       mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL);
+       list_create(&procfs_list->pl_list,
+           procfs_list_node_off + sizeof (procfs_list_node_t),
+           procfs_list_node_off + offsetof(procfs_list_node_t, pln_link));
+       procfs_list->pl_next_id = 1; /* Save id 0 for SEQ_START_TOKEN */
+       procfs_list->pl_show = show;
+       procfs_list->pl_show_header = show_header;
+       procfs_list->pl_clear = clear;
+       procfs_list->pl_node_offset = procfs_list_node_off;
+
+       kstat_proc_entry_init(&procfs_list->pl_kstat_entry, module, name);
+       kstat_proc_entry_install(&procfs_list->pl_kstat_entry,
+           &procfs_list_operations, procfs_list);
+}
+EXPORT_SYMBOL(procfs_list_install);
+
+/* Remove the proc filesystem file corresponding to the given list */
+void
+procfs_list_uninstall(procfs_list_t *procfs_list)
+{
+       kstat_proc_entry_delete(&procfs_list->pl_kstat_entry);
+}
+EXPORT_SYMBOL(procfs_list_uninstall);
+
+void
+procfs_list_destroy(procfs_list_t *procfs_list)
+{
+       ASSERT(list_is_empty(&procfs_list->pl_list));
+       list_destroy(&procfs_list->pl_list);
+       mutex_destroy(&procfs_list->pl_lock);
+}
+EXPORT_SYMBOL(procfs_list_destroy);
+
+/*
+ * Add a new node to the tail of the list. While the standard list manipulation
+ * functions can be used for all other operations, adding elements to the list
+ * should only be done using this helper so that the id of the new node is set
+ * correctly.
+ */
+void
+procfs_list_add(procfs_list_t *procfs_list, void *p)
+{
+       ASSERT(MUTEX_HELD(&procfs_list->pl_lock));
+       NODE_ID(procfs_list, p) = procfs_list->pl_next_id++;
+       list_insert_tail(&procfs_list->pl_list, p);
+}
+EXPORT_SYMBOL(procfs_list_add);
index fa1cf9e986e5d9fb1bfb278ff4e03dfb074eeaba..c02ef86b51c66062edfc84e8d9d75fdc30913d51 100644 (file)
@@ -55,7 +55,6 @@ int zfs_multihost_history = 0;
  * Read statistics - Information exported regarding each arc_read call
  */
 typedef struct spa_read_history {
-       uint64_t        uid;            /* unique identifier */
        hrtime_t        start;          /* time read completed */
        uint64_t        objset;         /* read from this objset */
        uint64_t        object;         /* read of this object number */
@@ -65,13 +64,13 @@ typedef struct spa_read_history {
        uint32_t        aflags;         /* ARC flags (cached, prefetch, etc.) */
        pid_t           pid;            /* PID of task doing read */
        char            comm[16];       /* process name of task doing read */
-       list_node_t     srh_link;
+       procfs_list_node_t      srh_node;
 } spa_read_history_t;
 
 static int
-spa_read_history_headers(char *buf, size_t size)
+spa_read_history_show_header(struct seq_file *f)
 {
-       (void) snprintf(buf, size, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s "
+       seq_printf(f, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s "
            "%-24s %-8s %-16s\n", "UID", "start", "objset", "object",
            "level", "blkid", "aflags", "origin", "pid", "process");
 
@@ -79,13 +78,13 @@ spa_read_history_headers(char *buf, size_t size)
 }
 
 static int
-spa_read_history_data(char *buf, size_t size, void *data)
+spa_read_history_show(struct seq_file *f, void *data)
 {
        spa_read_history_t *srh = (spa_read_history_t *)data;
 
-       (void) snprintf(buf, size, "%-8llu %-16llu 0x%-6llx "
+       seq_printf(f, "%-8llu %-16llu 0x%-6llx "
            "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n",
-           (u_longlong_t)srh->uid, srh->start,
+           (u_longlong_t)srh->srh_node.pln_id, srh->start,
            (longlong_t)srh->objset, (longlong_t)srh->object,
            (longlong_t)srh->level, (longlong_t)srh->blkid,
            srh->aflags, srh->origin, srh->pid, srh->comm);
@@ -93,120 +92,73 @@ spa_read_history_data(char *buf, size_t size, void *data)
        return (0);
 }
 
-/*
- * Calculate the address for the next spa_stats_history_t entry.  The
- * ssh->lock will be held until ksp->ks_ndata entries are processed.
- */
-static void *
-spa_read_history_addr(kstat_t *ksp, loff_t n)
+/* Remove oldest elements from list until there are no more than 'size' left */
+static void
+spa_read_history_truncate(spa_history_list_t *shl, unsigned int size)
 {
-       spa_t *spa = ksp->ks_private;
-       spa_stats_history_t *ssh = &spa->spa_stats.read_history;
-
-       ASSERT(MUTEX_HELD(&ssh->lock));
-
-       if (n == 0)
-               ssh->private = list_tail(&ssh->list);
-       else if (ssh->private)
-               ssh->private = list_prev(&ssh->list, ssh->private);
+       spa_read_history_t *srh;
+       while (shl->size > size) {
+               srh = list_remove_head(&shl->procfs_list.pl_list);
+               ASSERT3P(srh, !=, NULL);
+               kmem_free(srh, sizeof (spa_read_history_t));
+               shl->size--;
+       }
 
-       return (ssh->private);
+       if (size == 0)
+               ASSERT(list_is_empty(&shl->procfs_list.pl_list));
 }
 
-/*
- * When the kstat is written discard all spa_read_history_t entries.  The
- * ssh->lock will be held until ksp->ks_ndata entries are processed.
- */
 static int
-spa_read_history_update(kstat_t *ksp, int rw)
+spa_read_history_clear(procfs_list_t *procfs_list)
 {
-       spa_t *spa = ksp->ks_private;
-       spa_stats_history_t *ssh = &spa->spa_stats.read_history;
-
-       if (rw == KSTAT_WRITE) {
-               spa_read_history_t *srh;
-
-               while ((srh = list_remove_head(&ssh->list))) {
-                       ssh->size--;
-                       kmem_free(srh, sizeof (spa_read_history_t));
-               }
-
-               ASSERT3U(ssh->size, ==, 0);
-       }
-
-       ksp->ks_ndata = ssh->size;
-       ksp->ks_data_size = ssh->size * sizeof (spa_read_history_t);
-
+       spa_history_list_t *shl = procfs_list->pl_private;
+       mutex_enter(&procfs_list->pl_lock);
+       spa_read_history_truncate(shl, 0);
+       mutex_exit(&procfs_list->pl_lock);
        return (0);
 }
 
 static void
 spa_read_history_init(spa_t *spa)
 {
-       spa_stats_history_t *ssh = &spa->spa_stats.read_history;
-       char *name;
-       kstat_t *ksp;
+       spa_history_list_t *shl = &spa->spa_stats.read_history;
+       char *module;
 
-       mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);
-       list_create(&ssh->list, sizeof (spa_read_history_t),
-           offsetof(spa_read_history_t, srh_link));
+       shl->size = 0;
 
-       ssh->count = 0;
-       ssh->size = 0;
-       ssh->private = NULL;
+       module = kmem_asprintf("zfs/%s", spa_name(spa));
 
-       name = kmem_asprintf("zfs/%s", spa_name(spa));
+       shl->procfs_list.pl_private = shl;
+       procfs_list_install(module,
+           "reads",
+           &shl->procfs_list,
+           spa_read_history_show,
+           spa_read_history_show_header,
+           spa_read_history_clear,
+           offsetof(spa_read_history_t, srh_node));
 
-       ksp = kstat_create(name, 0, "reads", "misc",
-           KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
-       ssh->kstat = ksp;
-
-       if (ksp) {
-               ksp->ks_lock = &ssh->lock;
-               ksp->ks_data = NULL;
-               ksp->ks_private = spa;
-               ksp->ks_update = spa_read_history_update;
-               kstat_set_raw_ops(ksp, spa_read_history_headers,
-                   spa_read_history_data, spa_read_history_addr);
-               kstat_install(ksp);
-       }
-       strfree(name);
+       strfree(module);
 }
 
 static void
 spa_read_history_destroy(spa_t *spa)
 {
-       spa_stats_history_t *ssh = &spa->spa_stats.read_history;
-       spa_read_history_t *srh;
-       kstat_t *ksp;
-
-       ksp = ssh->kstat;
-       if (ksp)
-               kstat_delete(ksp);
-
-       mutex_enter(&ssh->lock);
-       while ((srh = list_remove_head(&ssh->list))) {
-               ssh->size--;
-               kmem_free(srh, sizeof (spa_read_history_t));
-       }
-
-       ASSERT3U(ssh->size, ==, 0);
-       list_destroy(&ssh->list);
-       mutex_exit(&ssh->lock);
-
-       mutex_destroy(&ssh->lock);
+       spa_history_list_t *shl = &spa->spa_stats.read_history;
+       procfs_list_uninstall(&shl->procfs_list);
+       spa_read_history_truncate(shl, 0);
+       procfs_list_destroy(&shl->procfs_list);
 }
 
 void
 spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags)
 {
-       spa_stats_history_t *ssh = &spa->spa_stats.read_history;
-       spa_read_history_t *srh, *rm;
+       spa_history_list_t *shl = &spa->spa_stats.read_history;
+       spa_read_history_t *srh;
 
        ASSERT3P(spa, !=, NULL);
        ASSERT3P(zb,  !=, NULL);
 
-       if (zfs_read_history == 0 && ssh->size == 0)
+       if (zfs_read_history == 0 && shl->size == 0)
                return;
 
        if (zfs_read_history_hits == 0 && (aflags & ARC_FLAG_CACHED))
@@ -222,19 +174,14 @@ spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags)
        srh->aflags = aflags;
        srh->pid    = getpid();
 
-       mutex_enter(&ssh->lock);
+       mutex_enter(&shl->procfs_list.pl_lock);
 
-       srh->uid = ssh->count++;
-       list_insert_head(&ssh->list, srh);
-       ssh->size++;
+       procfs_list_add(&shl->procfs_list, srh);
+       shl->size++;
 
-       while (ssh->size > zfs_read_history) {
-               ssh->size--;
-               rm = list_remove_tail(&ssh->list);
-               kmem_free(rm, sizeof (spa_read_history_t));
-       }
+       spa_read_history_truncate(shl, zfs_read_history);
 
-       mutex_exit(&ssh->lock);
+       mutex_exit(&shl->procfs_list.pl_lock);
 }
 
 /*
@@ -256,22 +203,21 @@ typedef struct spa_txg_history {
        uint64_t        writes;         /* number of write operations */
        uint64_t        ndirty;         /* number of dirty bytes */
        hrtime_t        times[TXG_STATE_COMMITTED]; /* completion times */
-       list_node_t     sth_link;
+       procfs_list_node_t      sth_node;
 } spa_txg_history_t;
 
 static int
-spa_txg_history_headers(char *buf, size_t size)
+spa_txg_history_show_header(struct seq_file *f)
 {
-       (void) snprintf(buf, size, "%-8s %-16s %-5s %-12s %-12s %-12s "
+       seq_printf(f, "%-8s %-16s %-5s %-12s %-12s %-12s "
            "%-8s %-8s %-12s %-12s %-12s %-12s\n", "txg", "birth", "state",
            "ndirty", "nread", "nwritten", "reads", "writes",
            "otime", "qtime", "wtime", "stime");
-
        return (0);
 }
 
 static int
-spa_txg_history_data(char *buf, size_t size, void *data)
+spa_txg_history_show(struct seq_file *f, void *data)
 {
        spa_txg_history_t *sth = (spa_txg_history_t *)data;
        uint64_t open = 0, quiesce = 0, wait = 0, sync = 0;
@@ -303,7 +249,7 @@ spa_txg_history_data(char *buf, size_t size, void *data)
                sync = sth->times[TXG_STATE_SYNCED] -
                    sth->times[TXG_STATE_WAIT_FOR_SYNC];
 
-       (void) snprintf(buf, size, "%-8llu %-16llu %-5c %-12llu "
+       seq_printf(f, "%-8llu %-16llu %-5c %-12llu "
            "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu %-12llu\n",
            (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state,
            (u_longlong_t)sth->ndirty,
@@ -315,110 +261,62 @@ spa_txg_history_data(char *buf, size_t size, void *data)
        return (0);
 }
 
-/*
- * Calculate the address for the next spa_stats_history_t entry.  The
- * ssh->lock will be held until ksp->ks_ndata entries are processed.
- */
-static void *
-spa_txg_history_addr(kstat_t *ksp, loff_t n)
+/* Remove oldest elements from list until there are no more than 'size' left */
+static void
+spa_txg_history_truncate(spa_history_list_t *shl, unsigned int size)
 {
-       spa_t *spa = ksp->ks_private;
-       spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
-
-       ASSERT(MUTEX_HELD(&ssh->lock));
+       spa_txg_history_t *sth;
+       while (shl->size > size) {
+               sth = list_remove_head(&shl->procfs_list.pl_list);
+               ASSERT3P(sth, !=, NULL);
+               kmem_free(sth, sizeof (spa_txg_history_t));
+               shl->size--;
+       }
 
-       if (n == 0)
-               ssh->private = list_tail(&ssh->list);
-       else if (ssh->private)
-               ssh->private = list_prev(&ssh->list, ssh->private);
+       if (size == 0)
+               ASSERT(list_is_empty(&shl->procfs_list.pl_list));
 
-       return (ssh->private);
 }
 
-/*
- * When the kstat is written discard all spa_txg_history_t entries.  The
- * ssh->lock will be held until ksp->ks_ndata entries are processed.
- */
 static int
-spa_txg_history_update(kstat_t *ksp, int rw)
+spa_txg_history_clear(procfs_list_t *procfs_list)
 {
-       spa_t *spa = ksp->ks_private;
-       spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
-
-       ASSERT(MUTEX_HELD(&ssh->lock));
-
-       if (rw == KSTAT_WRITE) {
-               spa_txg_history_t *sth;
-
-               while ((sth = list_remove_head(&ssh->list))) {
-                       ssh->size--;
-                       kmem_free(sth, sizeof (spa_txg_history_t));
-               }
-
-               ASSERT3U(ssh->size, ==, 0);
-       }
-
-       ksp->ks_ndata = ssh->size;
-       ksp->ks_data_size = ssh->size * sizeof (spa_txg_history_t);
-
+       spa_history_list_t *shl = procfs_list->pl_private;
+       mutex_enter(&procfs_list->pl_lock);
+       spa_txg_history_truncate(shl, 0);
+       mutex_exit(&procfs_list->pl_lock);
        return (0);
 }
 
 static void
 spa_txg_history_init(spa_t *spa)
 {
-       spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
-       char *name;
-       kstat_t *ksp;
+       spa_history_list_t *shl = &spa->spa_stats.txg_history;
+       char *module;
 
-       mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);
-       list_create(&ssh->list, sizeof (spa_txg_history_t),
-           offsetof(spa_txg_history_t, sth_link));
+       shl->size = 0;
 
-       ssh->count = 0;
-       ssh->size = 0;
-       ssh->private = NULL;
+       module = kmem_asprintf("zfs/%s", spa_name(spa));
 
-       name = kmem_asprintf("zfs/%s", spa_name(spa));
+       shl->procfs_list.pl_private = shl;
+       procfs_list_install(module,
+           "txgs",
+           &shl->procfs_list,
+           spa_txg_history_show,
+           spa_txg_history_show_header,
+           spa_txg_history_clear,
+           offsetof(spa_txg_history_t, sth_node));
 
-       ksp = kstat_create(name, 0, "txgs", "misc",
-           KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
-       ssh->kstat = ksp;
-
-       if (ksp) {
-               ksp->ks_lock = &ssh->lock;
-               ksp->ks_data = NULL;
-               ksp->ks_private = spa;
-               ksp->ks_update = spa_txg_history_update;
-               kstat_set_raw_ops(ksp, spa_txg_history_headers,
-                   spa_txg_history_data, spa_txg_history_addr);
-               kstat_install(ksp);
-       }
-       strfree(name);
+       strfree(module);
 }
 
 static void
 spa_txg_history_destroy(spa_t *spa)
 {
-       spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
-       spa_txg_history_t *sth;
-       kstat_t *ksp;
-
-       ksp = ssh->kstat;
-       if (ksp)
-               kstat_delete(ksp);
-
-       mutex_enter(&ssh->lock);
-       while ((sth = list_remove_head(&ssh->list))) {
-               ssh->size--;
-               kmem_free(sth, sizeof (spa_txg_history_t));
-       }
-
-       ASSERT3U(ssh->size, ==, 0);
-       list_destroy(&ssh->list);
-       mutex_exit(&ssh->lock);
-
-       mutex_destroy(&ssh->lock);
+       spa_history_list_t *shl = &spa->spa_stats.txg_history;
+       procfs_list_uninstall(&shl->procfs_list);
+       spa_txg_history_truncate(shl, 0);
+       procfs_list_destroy(&shl->procfs_list);
 }
 
 /*
@@ -427,10 +325,10 @@ spa_txg_history_destroy(spa_t *spa)
 void
 spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time)
 {
-       spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
-       spa_txg_history_t *sth, *rm;
+       spa_history_list_t *shl = &spa->spa_stats.txg_history;
+       spa_txg_history_t *sth;
 
-       if (zfs_txg_history == 0 && ssh->size == 0)
+       if (zfs_txg_history == 0 && shl->size == 0)
                return;
 
        sth = kmem_zalloc(sizeof (spa_txg_history_t), KM_SLEEP);
@@ -438,18 +336,11 @@ spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time)
        sth->state = TXG_STATE_OPEN;
        sth->times[TXG_STATE_BIRTH] = birth_time;
 
-       mutex_enter(&ssh->lock);
-
-       list_insert_head(&ssh->list, sth);
-       ssh->size++;
-
-       while (ssh->size > zfs_txg_history) {
-               ssh->size--;
-               rm = list_remove_tail(&ssh->list);
-               kmem_free(rm, sizeof (spa_txg_history_t));
-       }
-
-       mutex_exit(&ssh->lock);
+       mutex_enter(&shl->procfs_list.pl_lock);
+       procfs_list_add(&shl->procfs_list, sth);
+       shl->size++;
+       spa_txg_history_truncate(shl, zfs_txg_history);
+       mutex_exit(&shl->procfs_list.pl_lock);
 }
 
 /*
@@ -459,16 +350,16 @@ int
 spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state,
     hrtime_t completed_time)
 {
-       spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
+       spa_history_list_t *shl = &spa->spa_stats.txg_history;
        spa_txg_history_t *sth;
        int error = ENOENT;
 
        if (zfs_txg_history == 0)
                return (0);
 
-       mutex_enter(&ssh->lock);
-       for (sth = list_head(&ssh->list); sth != NULL;
-           sth = list_next(&ssh->list, sth)) {
+       mutex_enter(&shl->procfs_list.pl_lock);
+       for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL;
+           sth = list_prev(&shl->procfs_list.pl_list, sth)) {
                if (sth->txg == txg) {
                        sth->times[completed_state] = completed_time;
                        sth->state++;
@@ -476,7 +367,7 @@ spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state,
                        break;
                }
        }
-       mutex_exit(&ssh->lock);
+       mutex_exit(&shl->procfs_list.pl_lock);
 
        return (error);
 }
@@ -488,16 +379,16 @@ static int
 spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread,
     uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty)
 {
-       spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
+       spa_history_list_t *shl = &spa->spa_stats.txg_history;
        spa_txg_history_t *sth;
        int error = ENOENT;
 
        if (zfs_txg_history == 0)
                return (0);
 
-       mutex_enter(&ssh->lock);
-       for (sth = list_head(&ssh->list); sth != NULL;
-           sth = list_next(&ssh->list, sth)) {
+       mutex_enter(&shl->procfs_list.pl_lock);
+       for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL;
+           sth = list_prev(&shl->procfs_list.pl_list, sth)) {
                if (sth->txg == txg) {
                        sth->nread = nread;
                        sth->nwritten = nwritten;
@@ -508,7 +399,7 @@ spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread,
                        break;
                }
        }
-       mutex_exit(&ssh->lock);
+       mutex_exit(&shl->procfs_list.pl_lock);
 
        return (error);
 }
@@ -580,16 +471,16 @@ static int
 spa_tx_assign_update(kstat_t *ksp, int rw)
 {
        spa_t *spa = ksp->ks_private;
-       spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram;
+       spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
        int i;
 
        if (rw == KSTAT_WRITE) {
-               for (i = 0; i < ssh->count; i++)
-                       ((kstat_named_t *)ssh->private)[i].value.ui64 = 0;
+               for (i = 0; i < shk->count; i++)
+                       ((kstat_named_t *)shk->private)[i].value.ui64 = 0;
        }
 
-       for (i = ssh->count; i > 0; i--)
-               if (((kstat_named_t *)ssh->private)[i-1].value.ui64 != 0)
+       for (i = shk->count; i > 0; i--)
+               if (((kstat_named_t *)shk->private)[i-1].value.ui64 != 0)
                        break;
 
        ksp->ks_ndata = i;
@@ -601,22 +492,22 @@ spa_tx_assign_update(kstat_t *ksp, int rw)
 static void
 spa_tx_assign_init(spa_t *spa)
 {
-       spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram;
+       spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
        char *name;
        kstat_named_t *ks;
        kstat_t *ksp;
        int i;
 
-       mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);
+       mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
 
-       ssh->count = 42; /* power of two buckets for 1ns to 2,199s */
-       ssh->size = ssh->count * sizeof (kstat_named_t);
-       ssh->private = kmem_alloc(ssh->size, KM_SLEEP);
+       shk->count = 42; /* power of two buckets for 1ns to 2,199s */
+       shk->size = shk->count * sizeof (kstat_named_t);
+       shk->private = kmem_alloc(shk->size, KM_SLEEP);
 
        name = kmem_asprintf("zfs/%s", spa_name(spa));
 
-       for (i = 0; i < ssh->count; i++) {
-               ks = &((kstat_named_t *)ssh->private)[i];
+       for (i = 0; i < shk->count; i++) {
+               ks = &((kstat_named_t *)shk->private)[i];
                ks->data_type = KSTAT_DATA_UINT64;
                ks->value.ui64 = 0;
                (void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns",
@@ -625,13 +516,13 @@ spa_tx_assign_init(spa_t *spa)
 
        ksp = kstat_create(name, 0, "dmu_tx_assign", "misc",
            KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL);
-       ssh->kstat = ksp;
+       shk->kstat = ksp;
 
        if (ksp) {
-               ksp->ks_lock = &ssh->lock;
-               ksp->ks_data = ssh->private;
-               ksp->ks_ndata = ssh->count;
-               ksp->ks_data_size = ssh->size;
+               ksp->ks_lock = &shk->lock;
+               ksp->ks_data = shk->private;
+               ksp->ks_ndata = shk->count;
+               ksp->ks_data_size = shk->size;
                ksp->ks_private = spa;
                ksp->ks_update = spa_tx_assign_update;
                kstat_install(ksp);
@@ -642,27 +533,27 @@ spa_tx_assign_init(spa_t *spa)
 static void
 spa_tx_assign_destroy(spa_t *spa)
 {
-       spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram;
+       spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
        kstat_t *ksp;
 
-       ksp = ssh->kstat;
+       ksp = shk->kstat;
        if (ksp)
                kstat_delete(ksp);
 
-       kmem_free(ssh->private, ssh->size);
-       mutex_destroy(&ssh->lock);
+       kmem_free(shk->private, shk->size);
+       mutex_destroy(&shk->lock);
 }
 
 void
 spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs)
 {
-       spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram;
+       spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
        uint64_t idx = 0;
 
-       while (((1ULL << idx) < nsecs) && (idx < ssh->size - 1))
+       while (((1ULL << idx) < nsecs) && (idx < shk->size - 1))
                idx++;
 
-       atomic_inc_64(&((kstat_named_t *)ssh->private)[idx].value.ui64);
+       atomic_inc_64(&((kstat_named_t *)shk->private)[idx].value.ui64);
 }
 
 /*
@@ -682,19 +573,19 @@ spa_io_history_update(kstat_t *ksp, int rw)
 static void
 spa_io_history_init(spa_t *spa)
 {
-       spa_stats_history_t *ssh = &spa->spa_stats.io_history;
+       spa_history_kstat_t *shk = &spa->spa_stats.io_history;
        char *name;
        kstat_t *ksp;
 
-       mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);
+       mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
 
        name = kmem_asprintf("zfs/%s", spa_name(spa));
 
        ksp = kstat_create(name, 0, "io", "disk", KSTAT_TYPE_IO, 1, 0);
-       ssh->kstat = ksp;
+       shk->kstat = ksp;
 
        if (ksp) {
-               ksp->ks_lock = &ssh->lock;
+               ksp->ks_lock = &shk->lock;
                ksp->ks_private = spa;
                ksp->ks_update = spa_io_history_update;
                kstat_install(ksp);
@@ -705,12 +596,12 @@ spa_io_history_init(spa_t *spa)
 static void
 spa_io_history_destroy(spa_t *spa)
 {
-       spa_stats_history_t *ssh = &spa->spa_stats.io_history;
+       spa_history_kstat_t *shk = &spa->spa_stats.io_history;
 
-       if (ssh->kstat)
-               kstat_delete(ssh->kstat);
+       if (shk->kstat)
+               kstat_delete(shk->kstat);
 
-       mutex_destroy(&ssh->lock);
+       mutex_destroy(&shk->lock);
 }
 
 /*
@@ -733,7 +624,7 @@ spa_io_history_destroy(spa_t *spa)
  */
 
 typedef struct spa_mmp_history {
-       uint64_t        mmp_kstat_id;   /* unique # for updates */
+       uint64_t        mmp_node_id;    /* unique # for updates */
        uint64_t        txg;            /* txg of last sync */
        uint64_t        timestamp;      /* UTC time MMP write issued */
        uint64_t        mmp_delay;      /* mmp_thread.mmp_delay at timestamp */
@@ -743,20 +634,20 @@ typedef struct spa_mmp_history {
        int             io_error;       /* error status of MMP write */
        hrtime_t        error_start;    /* hrtime of start of error period */
        hrtime_t        duration;       /* time from submission to completion */
-       list_node_t     smh_link;
+       procfs_list_node_t      smh_node;
 } spa_mmp_history_t;
 
 static int
-spa_mmp_history_headers(char *buf, size_t size)
+spa_mmp_history_show_header(struct seq_file *f)
 {
-       (void) snprintf(buf, size, "%-10s %-10s %-10s %-6s %-10s %-12s %-24s "
+       seq_printf(f, "%-10s %-10s %-10s %-6s %-10s %-12s %-24s "
            "%-10s %s\n", "id", "txg", "timestamp", "error", "duration",
            "mmp_delay", "vdev_guid", "vdev_label", "vdev_path");
        return (0);
 }
 
 static int
-spa_mmp_history_data(char *buf, size_t size, void *data)
+spa_mmp_history_show(struct seq_file *f, void *data)
 {
        spa_mmp_history_t *smh = (spa_mmp_history_t *)data;
        char skip_fmt[] = "%-10llu %-10llu %10llu %#6llx %10lld %12llu %-24llu "
@@ -764,8 +655,8 @@ spa_mmp_history_data(char *buf, size_t size, void *data)
        char write_fmt[] = "%-10llu %-10llu %10llu %6lld %10lld %12llu %-24llu "
            "%-10lld %s\n";
 
-       (void) snprintf(buf, size, (smh->error_start ? skip_fmt : write_fmt),
-           (u_longlong_t)smh->mmp_kstat_id, (u_longlong_t)smh->txg,
+       seq_printf(f, (smh->error_start ? skip_fmt : write_fmt),
+           (u_longlong_t)smh->mmp_node_id, (u_longlong_t)smh->txg,
            (u_longlong_t)smh->timestamp, (longlong_t)smh->io_error,
            (longlong_t)smh->duration, (u_longlong_t)smh->mmp_delay,
            (u_longlong_t)smh->vdev_guid, (u_longlong_t)smh->vdev_label,
@@ -774,137 +665,86 @@ spa_mmp_history_data(char *buf, size_t size, void *data)
        return (0);
 }
 
-/*
- * Calculate the address for the next spa_stats_history_t entry.  The
- * ssh->lock will be held until ksp->ks_ndata entries are processed.
- */
-static void *
-spa_mmp_history_addr(kstat_t *ksp, loff_t n)
+/* Remove oldest elements from list until there are no more than 'size' left */
+static void
+spa_mmp_history_truncate(spa_history_list_t *shl, unsigned int size)
 {
-       spa_t *spa = ksp->ks_private;
-       spa_stats_history_t *ssh = &spa->spa_stats.mmp_history;
-
-       ASSERT(MUTEX_HELD(&ssh->lock));
+       spa_mmp_history_t *smh;
+       while (shl->size > size) {
+               smh = list_remove_head(&shl->procfs_list.pl_list);
+               if (smh->vdev_path)
+                       strfree(smh->vdev_path);
+               kmem_free(smh, sizeof (spa_mmp_history_t));
+               shl->size--;
+       }
 
-       if (n == 0)
-               ssh->private = list_tail(&ssh->list);
-       else if (ssh->private)
-               ssh->private = list_prev(&ssh->list, ssh->private);
+       if (size == 0)
+               ASSERT(list_is_empty(&shl->procfs_list.pl_list));
 
-       return (ssh->private);
 }
 
-/*
- * When the kstat is written discard all spa_mmp_history_t entries.  The
- * ssh->lock will be held until ksp->ks_ndata entries are processed.
- */
 static int
-spa_mmp_history_update(kstat_t *ksp, int rw)
+spa_mmp_history_clear(procfs_list_t *procfs_list)
 {
-       spa_t *spa = ksp->ks_private;
-       spa_stats_history_t *ssh = &spa->spa_stats.mmp_history;
-
-       ASSERT(MUTEX_HELD(&ssh->lock));
-
-       if (rw == KSTAT_WRITE) {
-               spa_mmp_history_t *smh;
-
-               while ((smh = list_remove_head(&ssh->list))) {
-                       ssh->size--;
-                       if (smh->vdev_path)
-                               strfree(smh->vdev_path);
-                       kmem_free(smh, sizeof (spa_mmp_history_t));
-               }
-
-               ASSERT3U(ssh->size, ==, 0);
-       }
-
-       ksp->ks_ndata = ssh->size;
-       ksp->ks_data_size = ssh->size * sizeof (spa_mmp_history_t);
-
+       spa_history_list_t *shl = procfs_list->pl_private;
+       mutex_enter(&procfs_list->pl_lock);
+       spa_mmp_history_truncate(shl, 0);
+       mutex_exit(&procfs_list->pl_lock);
        return (0);
 }
 
 static void
 spa_mmp_history_init(spa_t *spa)
 {
-       spa_stats_history_t *ssh = &spa->spa_stats.mmp_history;
-       char *name;
-       kstat_t *ksp;
+       spa_history_list_t *shl = &spa->spa_stats.mmp_history;
+       char *module;
 
-       mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);
-       list_create(&ssh->list, sizeof (spa_mmp_history_t),
-           offsetof(spa_mmp_history_t, smh_link));
+       shl->size = 0;
 
-       ssh->count = 0;
-       ssh->size = 0;
-       ssh->private = NULL;
-
-       name = kmem_asprintf("zfs/%s", spa_name(spa));
+       module = kmem_asprintf("zfs/%s", spa_name(spa));
 
-       ksp = kstat_create(name, 0, "multihost", "misc",
-           KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
-       ssh->kstat = ksp;
+       shl->procfs_list.pl_private = shl;
+       procfs_list_install(module,
+           "multihost",
+           &shl->procfs_list,
+           spa_mmp_history_show,
+           spa_mmp_history_show_header,
+           spa_mmp_history_clear,
+           offsetof(spa_mmp_history_t, smh_node));
 
-       if (ksp) {
-               ksp->ks_lock = &ssh->lock;
-               ksp->ks_data = NULL;
-               ksp->ks_private = spa;
-               ksp->ks_update = spa_mmp_history_update;
-               kstat_set_raw_ops(ksp, spa_mmp_history_headers,
-                   spa_mmp_history_data, spa_mmp_history_addr);
-               kstat_install(ksp);
-       }
-       strfree(name);
+       strfree(module);
 }
 
 static void
 spa_mmp_history_destroy(spa_t *spa)
 {
-       spa_stats_history_t *ssh = &spa->spa_stats.mmp_history;
-       spa_mmp_history_t *smh;
-       kstat_t *ksp;
-
-       ksp = ssh->kstat;
-       if (ksp)
-               kstat_delete(ksp);
-
-       mutex_enter(&ssh->lock);
-       while ((smh = list_remove_head(&ssh->list))) {
-               ssh->size--;
-               if (smh->vdev_path)
-                       strfree(smh->vdev_path);
-               kmem_free(smh, sizeof (spa_mmp_history_t));
-       }
-
-       ASSERT3U(ssh->size, ==, 0);
-       list_destroy(&ssh->list);
-       mutex_exit(&ssh->lock);
-
-       mutex_destroy(&ssh->lock);
+       spa_history_list_t *shl = &spa->spa_stats.mmp_history;
+       procfs_list_uninstall(&shl->procfs_list);
+       spa_mmp_history_truncate(shl, 0);
+       procfs_list_destroy(&shl->procfs_list);
 }
 
 /*
  * Set duration in existing "skip" record to how long we have waited for a leaf
  * vdev to become available.
  *
- * Important that we start search at the head of the list where new
+ * Important that we start search at the tail of the list where new
  * records are inserted, so this is normally an O(1) operation.
  */
 int
-spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_kstat_id)
+spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_node_id)
 {
-       spa_stats_history_t *ssh = &spa->spa_stats.mmp_history;
+       spa_history_list_t *shl = &spa->spa_stats.mmp_history;
        spa_mmp_history_t *smh;
        int error = ENOENT;
 
-       if (zfs_multihost_history == 0 && ssh->size == 0)
+       if (zfs_multihost_history == 0 && shl->size == 0)
                return (0);
 
-       mutex_enter(&ssh->lock);
-       for (smh = list_head(&ssh->list); smh != NULL;
-           smh = list_next(&ssh->list, smh)) {
-               if (smh->mmp_kstat_id == mmp_kstat_id) {
+       mutex_enter(&shl->procfs_list.pl_lock);
+       for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL;
+           smh = list_prev(&shl->procfs_list.pl_list, smh)) {
+               if (smh->mmp_node_id == mmp_node_id) {
                        ASSERT3U(smh->io_error, !=, 0);
                        smh->duration = gethrtime() - smh->error_start;
                        smh->vdev_guid++;
@@ -912,7 +752,7 @@ spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_kstat_id)
                        break;
                }
        }
-       mutex_exit(&ssh->lock);
+       mutex_exit(&shl->procfs_list.pl_lock);
 
        return (error);
 }
@@ -922,20 +762,20 @@ spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_kstat_id)
  * See comment re: search order above spa_mmp_history_set_skip().
  */
 int
-spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error,
+spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error,
     hrtime_t duration)
 {
-       spa_stats_history_t *ssh = &spa->spa_stats.mmp_history;
+       spa_history_list_t *shl = &spa->spa_stats.mmp_history;
        spa_mmp_history_t *smh;
        int error = ENOENT;
 
-       if (zfs_multihost_history == 0 && ssh->size == 0)
+       if (zfs_multihost_history == 0 && shl->size == 0)
                return (0);
 
-       mutex_enter(&ssh->lock);
-       for (smh = list_head(&ssh->list); smh != NULL;
-           smh = list_next(&ssh->list, smh)) {
-               if (smh->mmp_kstat_id == mmp_kstat_id) {
+       mutex_enter(&shl->procfs_list.pl_lock);
+       for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL;
+           smh = list_prev(&shl->procfs_list.pl_list, smh)) {
+               if (smh->mmp_node_id == mmp_node_id) {
                        ASSERT(smh->io_error == 0);
                        smh->io_error = io_error;
                        smh->duration = duration;
@@ -943,7 +783,7 @@ spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error,
                        break;
                }
        }
-       mutex_exit(&ssh->lock);
+       mutex_exit(&shl->procfs_list.pl_lock);
 
        return (error);
 }
@@ -953,16 +793,16 @@ spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error,
  * error == 0 : a write was issued.
  * error != 0 : a write was not issued because no leaves were found.
  */
-void *
+void
 spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
-    uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_kstat_id,
+    uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_node_id,
     int error)
 {
-       spa_stats_history_t *ssh = &spa->spa_stats.mmp_history;
-       spa_mmp_history_t *smh, *rm;
+       spa_history_list_t *shl = &spa->spa_stats.mmp_history;
+       spa_mmp_history_t *smh;
 
-       if (zfs_multihost_history == 0 && ssh->size == 0)
-               return (NULL);
+       if (zfs_multihost_history == 0 && shl->size == 0)
+               return;
 
        smh = kmem_zalloc(sizeof (spa_mmp_history_t), KM_SLEEP);
        smh->txg = txg;
@@ -974,7 +814,7 @@ spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
                        smh->vdev_path = strdup(vd->vdev_path);
        }
        smh->vdev_label = label;
-       smh->mmp_kstat_id = mmp_kstat_id;
+       smh->mmp_node_id = mmp_node_id;
 
        if (error) {
                smh->io_error = error;
@@ -982,21 +822,11 @@ spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
                smh->vdev_guid = 1;
        }
 
-       mutex_enter(&ssh->lock);
-
-       list_insert_head(&ssh->list, smh);
-       ssh->size++;
-
-       while (ssh->size > zfs_multihost_history) {
-               ssh->size--;
-               rm = list_remove_tail(&ssh->list);
-               if (rm->vdev_path)
-                       strfree(rm->vdev_path);
-               kmem_free(rm, sizeof (spa_mmp_history_t));
-       }
-
-       mutex_exit(&ssh->lock);
-       return ((void *)smh);
+       mutex_enter(&shl->procfs_list.pl_lock);
+       procfs_list_add(&shl->procfs_list, smh);
+       shl->size++;
+       spa_mmp_history_truncate(shl, zfs_multihost_history);
+       mutex_exit(&shl->procfs_list.pl_lock);
 }
 
 static void *
@@ -1023,19 +853,19 @@ spa_state_data(char *buf, size_t size, void *data)
 static void
 spa_state_init(spa_t *spa)
 {
-       spa_stats_history_t *ssh = &spa->spa_stats.state;
+       spa_history_kstat_t *shk = &spa->spa_stats.state;
        char *name;
        kstat_t *ksp;
 
-       mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);
+       mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
 
        name = kmem_asprintf("zfs/%s", spa_name(spa));
        ksp = kstat_create(name, 0, "state", "misc",
            KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
 
-       ssh->kstat = ksp;
+       shk->kstat = ksp;
        if (ksp) {
-               ksp->ks_lock = &ssh->lock;
+               ksp->ks_lock = &shk->lock;
                ksp->ks_data = NULL;
                ksp->ks_private = spa;
                ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS;
@@ -1049,12 +879,12 @@ spa_state_init(spa_t *spa)
 static void
 spa_health_destroy(spa_t *spa)
 {
-       spa_stats_history_t *ssh = &spa->spa_stats.state;
-       kstat_t *ksp = ssh->kstat;
+       spa_history_kstat_t *shk = &spa->spa_stats.state;
+       kstat_t *ksp = shk->kstat;
        if (ksp)
                kstat_delete(ksp);
 
-       mutex_destroy(&ssh->lock);
+       mutex_destroy(&shk->lock);
 }
 
 void
index 30a883f853ad60ba26095590f5c5379a181dbe8f..89cdf7d81099708cc726cc24744dfe2037e6da37 100644 (file)
@@ -429,16 +429,16 @@ static void
 vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
 {
        spa_t *spa = zio->io_spa;
-       spa_stats_history_t *ssh = &spa->spa_stats.io_history;
+       spa_history_kstat_t *shk = &spa->spa_stats.io_history;
 
        ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
        avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
        avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);
 
-       if (ssh->kstat != NULL) {
-               mutex_enter(&ssh->lock);
-               kstat_waitq_enter(ssh->kstat->ks_data);
-               mutex_exit(&ssh->lock);
+       if (shk->kstat != NULL) {
+               mutex_enter(&shk->lock);
+               kstat_waitq_enter(shk->kstat->ks_data);
+               mutex_exit(&shk->lock);
        }
 }
 
@@ -446,16 +446,16 @@ static void
 vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
 {
        spa_t *spa = zio->io_spa;
-       spa_stats_history_t *ssh = &spa->spa_stats.io_history;
+       spa_history_kstat_t *shk = &spa->spa_stats.io_history;
 
        ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
        avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
        avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);
 
-       if (ssh->kstat != NULL) {
-               mutex_enter(&ssh->lock);
-               kstat_waitq_exit(ssh->kstat->ks_data);
-               mutex_exit(&ssh->lock);
+       if (shk->kstat != NULL) {
+               mutex_enter(&shk->lock);
+               kstat_waitq_exit(shk->kstat->ks_data);
+               mutex_exit(&shk->lock);
        }
 }
 
@@ -463,17 +463,17 @@ static void
 vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
 {
        spa_t *spa = zio->io_spa;
-       spa_stats_history_t *ssh = &spa->spa_stats.io_history;
+       spa_history_kstat_t *shk = &spa->spa_stats.io_history;
 
        ASSERT(MUTEX_HELD(&vq->vq_lock));
        ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
        vq->vq_class[zio->io_priority].vqc_active++;
        avl_add(&vq->vq_active_tree, zio);
 
-       if (ssh->kstat != NULL) {
-               mutex_enter(&ssh->lock);
-               kstat_runq_enter(ssh->kstat->ks_data);
-               mutex_exit(&ssh->lock);
+       if (shk->kstat != NULL) {
+               mutex_enter(&shk->lock);
+               kstat_runq_enter(shk->kstat->ks_data);
+               mutex_exit(&shk->lock);
        }
 }
 
@@ -481,17 +481,17 @@ static void
 vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
 {
        spa_t *spa = zio->io_spa;
-       spa_stats_history_t *ssh = &spa->spa_stats.io_history;
+       spa_history_kstat_t *shk = &spa->spa_stats.io_history;
 
        ASSERT(MUTEX_HELD(&vq->vq_lock));
        ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
        vq->vq_class[zio->io_priority].vqc_active--;
        avl_remove(&vq->vq_active_tree, zio);
 
-       if (ssh->kstat != NULL) {
-               kstat_io_t *ksio = ssh->kstat->ks_data;
+       if (shk->kstat != NULL) {
+               kstat_io_t *ksio = shk->kstat->ks_data;
 
-               mutex_enter(&ssh->lock);
+               mutex_enter(&shk->lock);
                kstat_runq_exit(ksio);
                if (zio->io_type == ZIO_TYPE_READ) {
                        ksio->reads++;
@@ -500,7 +500,7 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
                        ksio->writes++;
                        ksio->nwritten += zio->io_size;
                }
-               mutex_exit(&ssh->lock);
+               mutex_exit(&shk->lock);
        }
 }
 
index ca79893c918404e607383fa30e73b36babb1ef33..b5f93fd9bebb4cb31136e89c7470de8850b6bc5d 100644 (file)
  */
 
 #include <sys/zfs_context.h>
-#include <sys/kstat.h>
 
-list_t zfs_dbgmsgs;
+typedef struct zfs_dbgmsg {
+       procfs_list_node_t      zdm_node;
+       time_t                  zdm_timestamp;
+       int                     zdm_size;
+       char                    zdm_msg[1]; /* variable length allocation */
+} zfs_dbgmsg_t;
+
+procfs_list_t zfs_dbgmsgs;
 int zfs_dbgmsg_size = 0;
-kmutex_t zfs_dbgmsgs_lock;
 int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */
-kstat_t *zfs_dbgmsg_kstat;
 
 /*
  * Internal ZFS debug messages are enabled by default.
@@ -47,122 +51,70 @@ kstat_t *zfs_dbgmsg_kstat;
 int zfs_dbgmsg_enable = 1;
 
 static int
-zfs_dbgmsg_headers(char *buf, size_t size)
+zfs_dbgmsg_show_header(struct seq_file *f)
 {
-       (void) snprintf(buf, size, "%-12s %-8s\n", "timestamp", "message");
-
+       seq_printf(f, "%-12s %-8s\n", "timestamp", "message");
        return (0);
 }
 
 static int
-zfs_dbgmsg_data(char *buf, size_t size, void *data)
+zfs_dbgmsg_show(struct seq_file *f, void *p)
 {
-       zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)data;
-
-       (void) snprintf(buf, size, "%-12llu %-s\n",
+       zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)p;
+       seq_printf(f, "%-12llu %-s\n",
            (u_longlong_t)zdm->zdm_timestamp, zdm->zdm_msg);
-
        return (0);
 }
 
-static void *
-zfs_dbgmsg_addr(kstat_t *ksp, loff_t n)
-{
-       zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)ksp->ks_private;
-
-       ASSERT(MUTEX_HELD(&zfs_dbgmsgs_lock));
-
-       if (n == 0)
-               ksp->ks_private = list_head(&zfs_dbgmsgs);
-       else if (zdm)
-               ksp->ks_private = list_next(&zfs_dbgmsgs, zdm);
-
-       return (ksp->ks_private);
-}
-
 static void
 zfs_dbgmsg_purge(int max_size)
 {
-       zfs_dbgmsg_t *zdm;
-       int size;
-
-       ASSERT(MUTEX_HELD(&zfs_dbgmsgs_lock));
-
        while (zfs_dbgmsg_size > max_size) {
-               zdm = list_remove_head(&zfs_dbgmsgs);
+               zfs_dbgmsg_t *zdm = list_remove_head(&zfs_dbgmsgs.pl_list);
                if (zdm == NULL)
                        return;
 
-               size = zdm->zdm_size;
+               int size = zdm->zdm_size;
                kmem_free(zdm, size);
                zfs_dbgmsg_size -= size;
        }
 }
 
 static int
-zfs_dbgmsg_update(kstat_t *ksp, int rw)
+zfs_dbgmsg_clear(procfs_list_t *procfs_list)
 {
-       if (rw == KSTAT_WRITE)
-               zfs_dbgmsg_purge(0);
-
+       mutex_enter(&zfs_dbgmsgs.pl_lock);
+       zfs_dbgmsg_purge(0);
+       mutex_exit(&zfs_dbgmsgs.pl_lock);
        return (0);
 }
 
 void
 zfs_dbgmsg_init(void)
 {
-       list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t),
+       procfs_list_install("zfs",
+           "dbgmsg",
+           &zfs_dbgmsgs,
+           zfs_dbgmsg_show,
+           zfs_dbgmsg_show_header,
+           zfs_dbgmsg_clear,
            offsetof(zfs_dbgmsg_t, zdm_node));
-       mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL);
-
-       zfs_dbgmsg_kstat = kstat_create("zfs", 0, "dbgmsg", "misc",
-           KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
-       if (zfs_dbgmsg_kstat) {
-               zfs_dbgmsg_kstat->ks_lock = &zfs_dbgmsgs_lock;
-               zfs_dbgmsg_kstat->ks_ndata = UINT32_MAX;
-               zfs_dbgmsg_kstat->ks_private = NULL;
-               zfs_dbgmsg_kstat->ks_update = zfs_dbgmsg_update;
-               kstat_set_raw_ops(zfs_dbgmsg_kstat, zfs_dbgmsg_headers,
-                   zfs_dbgmsg_data, zfs_dbgmsg_addr);
-               kstat_install(zfs_dbgmsg_kstat);
-       }
 }
 
 void
 zfs_dbgmsg_fini(void)
 {
-       if (zfs_dbgmsg_kstat)
-               kstat_delete(zfs_dbgmsg_kstat);
+       procfs_list_uninstall(&zfs_dbgmsgs);
+       zfs_dbgmsg_purge(0);
+
        /*
         * TODO - decide how to make this permanent
         */
 #ifdef _KERNEL
-       mutex_enter(&zfs_dbgmsgs_lock);
-       zfs_dbgmsg_purge(0);
-       mutex_exit(&zfs_dbgmsgs_lock);
-       mutex_destroy(&zfs_dbgmsgs_lock);
+       procfs_list_destroy(&zfs_dbgmsgs);
 #endif
 }
 
-void
-__zfs_dbgmsg(char *buf)
-{
-       zfs_dbgmsg_t *zdm;
-       int size;
-
-       size = sizeof (zfs_dbgmsg_t) + strlen(buf);
-       zdm = kmem_zalloc(size, KM_SLEEP);
-       zdm->zdm_size = size;
-       zdm->zdm_timestamp = gethrestime_sec();
-       strcpy(zdm->zdm_msg, buf);
-
-       mutex_enter(&zfs_dbgmsgs_lock);
-       list_insert_tail(&zfs_dbgmsgs, zdm);
-       zfs_dbgmsg_size += size;
-       zfs_dbgmsg_purge(MAX(zfs_dbgmsg_maxsize, 0));
-       mutex_exit(&zfs_dbgmsgs_lock);
-}
-
 void
 __set_error(const char *file, const char *func, int line, int err)
 {
@@ -176,6 +128,22 @@ __set_error(const char *file, const char *func, int line, int err)
 }
 
 #ifdef _KERNEL
+static void
+__zfs_dbgmsg(char *buf)
+{
+       int size = sizeof (zfs_dbgmsg_t) + strlen(buf);
+       zfs_dbgmsg_t *zdm = kmem_zalloc(size, KM_SLEEP);
+       zdm->zdm_size = size;
+       zdm->zdm_timestamp = gethrestime_sec();
+       strcpy(zdm->zdm_msg, buf);
+
+       mutex_enter(&zfs_dbgmsgs.pl_lock);
+       procfs_list_add(&zfs_dbgmsgs, zdm);
+       zfs_dbgmsg_size += size;
+       zfs_dbgmsg_purge(MAX(zfs_dbgmsg_maxsize, 0));
+       mutex_exit(&zfs_dbgmsgs.pl_lock);
+}
+
 void
 __dprintf(const char *file, const char *func, int line, const char *fmt, ...)
 {
@@ -244,14 +212,12 @@ __dprintf(const char *file, const char *func, int line, const char *fmt, ...)
 void
 zfs_dbgmsg_print(const char *tag)
 {
-       zfs_dbgmsg_t *zdm;
-
        (void) printf("ZFS_DBGMSG(%s):\n", tag);
-       mutex_enter(&zfs_dbgmsgs_lock);
-       for (zdm = list_head(&zfs_dbgmsgs); zdm;
-           zdm = list_next(&zfs_dbgmsgs, zdm))
+       mutex_enter(&zfs_dbgmsgs.pl_lock);
+       for (zfs_dbgmsg_t *zdm = list_head(&zfs_dbgmsgs.pl_list); zdm != NULL;
+           zdm = list_next(&zfs_dbgmsgs.pl_list, zdm))
                (void) printf("%s\n", zdm->zdm_msg);
-       mutex_exit(&zfs_dbgmsgs_lock);
+       mutex_exit(&zfs_dbgmsgs.pl_lock);
 }
 #endif /* _KERNEL */
 
index 4b41c3f743cae17b034a25ccc10d73517cf676d0..95e70f043203b4c16852f0ea63686f6dfd0f90f4 100644 (file)
@@ -584,10 +584,6 @@ tests = ['inuse_001_pos', 'inuse_003_pos', 'inuse_004_pos',
 post =
 tags = ['functional', 'inuse']
 
-[tests/functional/kstat]
-tests = ['state']
-tags = ['functional', 'kstat']
-
 [tests/functional/large_files]
 tests = ['large_files_001_pos', 'large_files_002_pos']
 tags = ['functional', 'large_files']
@@ -672,6 +668,11 @@ tags = ['functional', 'poolversion']
 tests = ['privilege_001_pos', 'privilege_002_pos']
 tags = ['functional', 'privilege']
 
+[tests/functional/procfs]
+tests = ['procfs_list_basic', 'procfs_list_concurrent_readers',
+    'procfs_list_stale_read', 'pool_state']
+tags = ['functional', 'procfs']
+
 [tests/functional/projectquota]
 tests = ['projectid_001_pos', 'projectid_002_pos', 'projectid_003_pos',
     'projectquota_001_pos', 'projectquota_002_pos', 'projectquota_003_pos',
index e0a4aca99cb45fad2d61a1f6464d39d365c9a0eb..961a340272a9c41ecc5a80239b5a2e7c1f8521ab 100644 (file)
@@ -29,7 +29,6 @@ SUBDIRS = \
        inheritance \
        inuse \
        io \
-       kstat \
        large_files \
        largest_pool \
        libzfs \
@@ -48,6 +47,7 @@ SUBDIRS = \
        pool_names \
        poolversion \
        privilege \
+       procfs \
        projectquota \
        quota \
        raidz \
diff --git a/tests/zfs-tests/tests/functional/kstat/Makefile.am b/tests/zfs-tests/tests/functional/kstat/Makefile.am
deleted file mode 100644 (file)
index 8ad83ec..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/kstat
-dist_pkgdata_SCRIPTS = \
-       setup.ksh \
-       cleanup.ksh \
-       state.ksh
diff --git a/tests/zfs-tests/tests/functional/kstat/cleanup.ksh b/tests/zfs-tests/tests/functional/kstat/cleanup.ksh
deleted file mode 100755 (executable)
index 8a212ce..0000000
+++ /dev/null
@@ -1,28 +0,0 @@
-#!/bin/ksh -p
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-#
-#
-# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
-#
-
-. $STF_SUITE/include/libtest.shlib
-
-default_cleanup
diff --git a/tests/zfs-tests/tests/functional/kstat/setup.ksh b/tests/zfs-tests/tests/functional/kstat/setup.ksh
deleted file mode 100755 (executable)
index 57717a0..0000000
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/ksh -p
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-#
-#
-# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
-#
-
-. $STF_SUITE/include/libtest.shlib
-
-if ! is_linux ; then
-       log_unsupported "/proc/spl/kstat/<pool>/health only supported on Linux"
-fi
-
-default_mirror_setup $DISKS
-
-log_pass
diff --git a/tests/zfs-tests/tests/functional/kstat/state.ksh b/tests/zfs-tests/tests/functional/kstat/state.ksh
deleted file mode 100755 (executable)
index 3c29266..0000000
+++ /dev/null
@@ -1,144 +0,0 @@
-#!/bin/ksh -p
-#
-# CDDL HEADER START
-#
-# The contents of this file are subject to the terms of the
-# Common Development and Distribution License (the "License").
-# You may not use this file except in compliance with the License.
-#
-# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
-# or http://www.opensolaris.org/os/licensing.
-# See the License for the specific language governing permissions
-# and limitations under the License.
-#
-# When distributing Covered Code, include this CDDL HEADER in each
-# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
-# If applicable, add the following below this CDDL HEADER, with the
-# fields enclosed by brackets "[]" replaced with your own identifying
-# information: Portions Copyright [yyyy] [name of copyright owner]
-#
-# CDDL HEADER END
-
-#
-# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
-#
-
-#
-# DESCRIPTION:
-# Test /proc/spl/kstat/zfs/<pool>/state kstat
-#
-# STRATEGY:
-# 1. Create a mirrored pool
-# 2. Check that pool is ONLINE
-# 3. Fault one disk
-# 4. Check that pool is DEGRADED
-# 5. Create a new pool with a single scsi_debug disk
-# 6. Remove the disk
-# 7. Check that pool is SUSPENDED
-# 8. Add the disk back in
-# 9. Clear errors and destroy the pools
-
-. $STF_SUITE/include/libtest.shlib
-
-verify_runnable "both"
-
-function cleanup
-{
-       # Destroy the scsi_debug pool
-       if [ -n "$TESTPOOL2" ] ; then
-               if  [ -n "$host" ] ; then
-                       # Re-enable the disk
-                       scan_scsi_hosts $host
-
-                       # Device may have changed names after being inserted
-                       SDISK=$(get_debug_device)
-                       log_must ln $DEV_RDSKDIR/$SDISK $REALDISK
-               fi
-
-               # Restore our working pool image
-               if [ -n "$BACKUP" ] ; then
-                       gunzip -c $BACKUP > $REALDISK
-                       log_must rm -f $BACKUP
-               fi
-
-               # Our disk is back.  Now we can clear errors and destroy the
-               # pool cleanly.
-               log_must zpool clear $TESTPOOL2
-
-               # Now that the disk is back and errors cleared, wait for our
-               # hung 'zpool scrub' to finish.
-               wait
-
-               destroy_pool $TESTPOOL2
-               log_must rm $REALDISK
-               unload_scsi_debug
-       fi
-}
-
-# Check that our pool state values match what's expected
-#
-# $1: pool name
-# $2: expected state ("ONLINE", "DEGRADED", "SUSPENDED", etc)
-function check_all
-{
-       pool=$1
-       expected=$2
-
-       state1=$(zpool status $pool | awk '/state: /{print $2}');
-       state2=$(zpool list -H -o health $pool)
-       state3=$(cat /proc/spl/kstat/zfs/$pool/state)
-       log_note "Checking $expected = $state1 = $state2 = $state3"
-       if [[ "$expected" == "$state1" &&  "$expected" == "$state2" && \
-           "$expected" == "$state3" ]] ; then
-               true
-       else
-               false
-       fi
-}
-
-log_onexit cleanup
-
-log_assert "Testing /proc/spl/kstat/zfs/<pool>/state kstat"
-
-# Test that the initial pool is healthy
-check_all $TESTPOOL "ONLINE"
-
-# Fault one of the disks, and check that pool is degraded
-DISK1=$(echo "$DISKS" | awk '{print $2}')
-zpool offline -tf $TESTPOOL $DISK1
-check_all $TESTPOOL "DEGRADED"
-
-# Create a new pool out of a scsi_debug disk
-TESTPOOL2=testpool2
-MINVDEVSIZE_MB=$((MINVDEVSIZE / 1048576))
-load_scsi_debug $MINVDEVSIZE_MB 1 1 1 '512b'
-
-SDISK=$(get_debug_device)
-host=$(get_scsi_host $SDISK)
-
-# Use $REALDISK instead of $SDISK in our pool because $SDISK can change names
-# as we remove/add the disk (i.e. /dev/sdf -> /dev/sdg).
-REALDISK=/dev/kstat-state-realdisk
-log_must [ ! -e $REALDISK ]
-ln $DEV_RDSKDIR/$SDISK $REALDISK
-
-log_must zpool create $TESTPOOL2 $REALDISK
-
-# Backup the contents of the disk image
-BACKUP=$TEST_BASE_DIR/kstat-state-realdisk.gz
-log_must [ ! -e $BACKUP ]
-gzip -c $REALDISK > $BACKUP
-
-# Yank out the disk from under the pool
-log_must rm $REALDISK
-remove_disk $SDISK
-
-# Run a 'zpool scrub' in the background to suspend the pool.  We run it in the
-# background since the command will hang when the pool gets suspended.  The
-# command will resume and exit after we restore the missing disk later on.
-zpool scrub $TESTPOOL2 &
-sleep 1                # Give the scrub some time to run before we check if it fails
-
-log_must check_all $TESTPOOL2 "SUSPENDED"
-
-log_pass "/proc/spl/kstat/zfs/<pool>/state test successful"
diff --git a/tests/zfs-tests/tests/functional/procfs/Makefile.am b/tests/zfs-tests/tests/functional/procfs/Makefile.am
new file mode 100644 (file)
index 0000000..a7f022d
--- /dev/null
@@ -0,0 +1,8 @@
+pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/procfs
+dist_pkgdata_SCRIPTS = \
+       setup.ksh \
+       cleanup.ksh \
+       procfs_list_basic.ksh \
+       procfs_list_concurrent_readers.ksh \
+       procfs_list_stale_read.ksh \
+       pool_state.ksh
diff --git a/tests/zfs-tests/tests/functional/procfs/cleanup.ksh b/tests/zfs-tests/tests/functional/procfs/cleanup.ksh
new file mode 100755 (executable)
index 0000000..8fe4657
--- /dev/null
@@ -0,0 +1,29 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+default_cleanup
diff --git a/tests/zfs-tests/tests/functional/procfs/pool_state.ksh b/tests/zfs-tests/tests/functional/procfs/pool_state.ksh
new file mode 100755 (executable)
index 0000000..3c29266
--- /dev/null
@@ -0,0 +1,144 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+
+#
+# Copyright (c) 2018 by Lawrence Livermore National Security, LLC.
+#
+
+#
+# DESCRIPTION:
+# Test /proc/spl/kstat/zfs/<pool>/state kstat
+#
+# STRATEGY:
+# 1. Create a mirrored pool
+# 2. Check that pool is ONLINE
+# 3. Fault one disk
+# 4. Check that pool is DEGRADED
+# 5. Create a new pool with a single scsi_debug disk
+# 6. Remove the disk
+# 7. Check that pool is SUSPENDED
+# 8. Add the disk back in
+# 9. Clear errors and destroy the pools
+
+. $STF_SUITE/include/libtest.shlib
+
+verify_runnable "both"
+
+function cleanup
+{
+       # Destroy the scsi_debug pool
+       if [ -n "$TESTPOOL2" ] ; then
+               if  [ -n "$host" ] ; then
+                       # Re-enable the disk
+                       scan_scsi_hosts $host
+
+                       # Device may have changed names after being inserted
+                       SDISK=$(get_debug_device)
+                       log_must ln $DEV_RDSKDIR/$SDISK $REALDISK
+               fi
+
+               # Restore our working pool image
+               if [ -n "$BACKUP" ] ; then
+                       gunzip -c $BACKUP > $REALDISK
+                       log_must rm -f $BACKUP
+               fi
+
+               # Our disk is back.  Now we can clear errors and destroy the
+               # pool cleanly.
+               log_must zpool clear $TESTPOOL2
+
+               # Now that the disk is back and errors cleared, wait for our
+               # hung 'zpool scrub' to finish.
+               wait
+
+               destroy_pool $TESTPOOL2
+               log_must rm $REALDISK
+               unload_scsi_debug
+       fi
+}
+
+# Check that our pool state values match what's expected
+#
+# $1: pool name
+# $2: expected state ("ONLINE", "DEGRADED", "SUSPENDED", etc)
+function check_all
+{
+       pool=$1
+       expected=$2
+
+       state1=$(zpool status $pool | awk '/state: /{print $2}');
+       state2=$(zpool list -H -o health $pool)
+       state3=$(cat /proc/spl/kstat/zfs/$pool/state)
+       log_note "Checking $expected = $state1 = $state2 = $state3"
+       if [[ "$expected" == "$state1" &&  "$expected" == "$state2" && \
+           "$expected" == "$state3" ]] ; then
+               true
+       else
+               false
+       fi
+}
+
+log_onexit cleanup
+
+log_assert "Testing /proc/spl/kstat/zfs/<pool>/state kstat"
+
+# Test that the initial pool is healthy
+check_all $TESTPOOL "ONLINE"
+
+# Fault one of the disks, and check that pool is degraded
+DISK1=$(echo "$DISKS" | awk '{print $2}')
+zpool offline -tf $TESTPOOL $DISK1
+check_all $TESTPOOL "DEGRADED"
+
+# Create a new pool out of a scsi_debug disk
+TESTPOOL2=testpool2
+MINVDEVSIZE_MB=$((MINVDEVSIZE / 1048576))
+load_scsi_debug $MINVDEVSIZE_MB 1 1 1 '512b'
+
+SDISK=$(get_debug_device)
+host=$(get_scsi_host $SDISK)
+
+# Use $REALDISK instead of $SDISK in our pool because $SDISK can change names
+# as we remove/add the disk (i.e. /dev/sdf -> /dev/sdg).
+REALDISK=/dev/kstat-state-realdisk
+log_must [ ! -e $REALDISK ]
+ln $DEV_RDSKDIR/$SDISK $REALDISK
+
+log_must zpool create $TESTPOOL2 $REALDISK
+
+# Backup the contents of the disk image
+BACKUP=$TEST_BASE_DIR/kstat-state-realdisk.gz
+log_must [ ! -e $BACKUP ]
+gzip -c $REALDISK > $BACKUP
+
+# Yank out the disk from under the pool
+log_must rm $REALDISK
+remove_disk $SDISK
+
+# Run a 'zpool scrub' in the background to suspend the pool.  We run it in the
+# background since the command will hang when the pool gets suspended.  The
+# command will resume and exit after we restore the missing disk later on.
+zpool scrub $TESTPOOL2 &
+sleep 1                # Give the scrub some time to run before we check if it fails
+
+log_must check_all $TESTPOOL2 "SUSPENDED"
+
+log_pass "/proc/spl/kstat/zfs/<pool>/state test successful"
diff --git a/tests/zfs-tests/tests/functional/procfs/procfs_list_basic.ksh b/tests/zfs-tests/tests/functional/procfs/procfs_list_basic.ksh
new file mode 100755 (executable)
index 0000000..c9eff36
--- /dev/null
@@ -0,0 +1,95 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Test that we can read from and write to a file in procfs whose contents are
+# backed by a linked list.
+#
+# STRATEGY:
+# 1. Take some snapshots of a filesystem, which will cause some messages to be
+#    written to the zfs dbgmsgs.
+# 2. Read the dbgmsgs via procfs and verify that the expected messages are
+#    present.
+# 3. Write to the dbgmsgs file to clear the messages.
+# 4. Read the dbgmsgs again, and make sure the messages are no longer present.
+#
+
+function cleanup
+{
+       datasetexists $FS && log_must zfs destroy -r $FS
+}
+
+function count_snap_cmds
+{
+       typeset expected_count=$1
+       count=$(grep "command: zfs snapshot $FS@testsnapshot" | wc -l)
+       log_must eval "[[ $count -eq $expected_count ]]"
+}
+
+typeset -r ZFS_DBGMSG=/proc/spl/kstat/zfs/dbgmsg
+typeset -r FS=$TESTPOOL/fs
+typeset snap_msgs
+
+log_onexit cleanup
+
+# Clear out old messages
+echo 0 >$ZFS_DBGMSG || log_fail "failed to write to $ZFS_DBGMSG"
+
+log_must zfs create $FS
+for i in {1..20}; do
+       log_must zfs snapshot "$FS@testsnapshot$i"
+done
+log_must zpool sync $TESTPOOL
+
+#
+# Read the debug message file in small chunks to make sure that the read is
+# split up into multiple syscalls. This tests that when a syscall begins we
+# correctly pick up in the list of messages where the previous syscall left
+# off. The size of the read can affect how many bytes the seq_file code has
+# left in its internal buffer, which in turn can affect the relative pos that
+# the seq_file code picks up at when the next read starts. Try a few
+# different size reads to make sure we can handle each case.
+#
+# Check that the file has the right contents by grepping for some of the
+# messages that we expect to be present.
+#
+for chunk_sz in {1,64,256,1024,4096}; do
+       dd if=$ZFS_DBGMSG bs=$chunk_sz | count_snap_cmds 20
+done
+
+# Clear out old messages and check that they really are gone
+echo 0 >$ZFS_DBGMSG || log_fail "failed to write to $ZFS_DBGMSG"
+cat $ZFS_DBGMSG | count_snap_cmds 0
+#
+# Even though we don't expect any messages in the file, reading should still
+# succeed.
+#
+log_must cat $ZFS_DBGMSG
+
+log_pass "Basic reading/writing of procfs file backed by linked list successful"
diff --git a/tests/zfs-tests/tests/functional/procfs/procfs_list_concurrent_readers.ksh b/tests/zfs-tests/tests/functional/procfs/procfs_list_concurrent_readers.ksh
new file mode 100755 (executable)
index 0000000..473de5c
--- /dev/null
@@ -0,0 +1,82 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Make sure that interleaving reads from different readers does not affect the
+# results that are returned.
+#
+# STRATEGY:
+# 1. Make sure a few debug messages have been logged.
+# 2. Open the procfs file and start reading from it.
+# 3. Open the file again, and read its entire contents.
+# 4. Resume reading from the first instance.
+# 5. Check that the contents read by the two instances are identical.
+#
+
+#
+# Exit hook: delete whichever temporary files were created, then remove the
+# test filesystem if it exists.
+#
+function cleanup
+{
+       typeset tmpfile
+
+       for tmpfile in "$msgs1" "$msgs2"; do
+               [[ -n $tmpfile ]] && log_must rm $tmpfile
+       done
+       datasetexists $FS && log_must zfs destroy -r $FS
+}
+
+typeset -r ZFS_DBGMSG=/proc/spl/kstat/zfs/dbgmsg
+typeset -r FS=$TESTPOOL/fs
+typeset msgs1 msgs2
+
+log_onexit cleanup
+
+# Clear out old messages
+echo 0 >$ZFS_DBGMSG || log_fail "failed to write to $ZFS_DBGMSG"
+
+# Add some new messages
+log_must zfs create $FS
+for i in {1..20}; do
+       log_must zfs snapshot "$FS@testsnapshot$i"
+done
+log_must zpool sync $TESTPOOL
+
+# Scratch files holding what each of the two readers saw; removed by cleanup
+msgs1=$(mktemp) || log_fail "failed to create temporary file (msgs1)"
+msgs2=$(mktemp) || log_fail "failed to create temporary file (msgs2)"
+
+#
+# Start reading file, pause and read it from another process, and then finish
+# reading. The redirections on the brace group are shared by the dd and the
+# final cat, so together they form a single reader whose output goes to
+# $msgs2, while the middle cat opens the file separately and fills $msgs1.
+#
+{ dd bs=512 count=4; cat $ZFS_DBGMSG >$msgs1; cat; } <$ZFS_DBGMSG >$msgs2
+
+#
+# Truncate the result of the read that completed second in case it picked up an
+# extra message that was logged after the first read completed.
+#
+log_must truncate -s $(stat -c "%s" $msgs1) $msgs2
+
+log_must diff $msgs1 $msgs2
+
+log_pass "Concurrent readers receive identical results"
diff --git a/tests/zfs-tests/tests/functional/procfs/procfs_list_stale_read.ksh b/tests/zfs-tests/tests/functional/procfs/procfs_list_stale_read.ksh
new file mode 100755 (executable)
index 0000000..c363e7f
--- /dev/null
@@ -0,0 +1,98 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+#
+# DESCRIPTION:
+# Make sure errors caused by messages being dropped from the list backing the
+# procfs file are handled gracefully.
+#
+# STRATEGY:
+# 1. Make sure a few entries have been logged.
+# 2. Open the procfs file and start reading from it.
+# 3. Write to the file to cause its contents to be dropped.
+# 4. Resume reading from the first instance, and check that the expected
+#    error is received.
+# 5. Repeat steps 1-4, except instead of dropping all the messages by writing
+#    to the file, cause enough new messages to be written that the old messages
+#    are dropped.
+#
+
+#
+# Exit hook: restore the zfs_txg_history limit that was in effect before the
+# test lowered it.
+#
+function cleanup
+{
+       # Nothing to restore if we failed before saving the original value.
+       [[ -z $default_max_entries ]] && return 0
+
+       echo $default_max_entries >$MAX_ENTRIES_PARAM || \
+           log_fail "failed to restore $MAX_ENTRIES_PARAM"
+}
+
+#
+# Run "zpool sync $TESTPOOL" $1 times. Always returns 0; a failed sync aborts
+# the test through log_must.
+#
+function sync_n
+{
+       typeset -i count=$1
+       typeset -i i
+
+       #
+       # Use a counted loop rather than {1..$1}: the brace range counts
+       # downwards when $1 is 0 and would sync twice instead of not at all.
+       #
+       for ((i = 0; i < count; i++)); do
+               log_must zpool sync $TESTPOOL
+       done
+       return 0
+}
+
+#
+# Start reading $TXG_HIST, run the command given as $1 to invalidate the
+# reader's position in the list, and check that resuming the read reports an
+# I/O error.
+#
+function do_test
+{
+       typeset cmd=$1
+
+       # Clear out old entries
+       echo 0 >$TXG_HIST || log_fail "failed to write to $TXG_HIST"
+
+       # Add some new entries
+       sync_n 20
+
+       # Confirm that there actually is something in the file.
+       [[ $(wc -l <$TXG_HIST) -ge 20 ]] || log_fail "expected more entries"
+
+       #
+       # Start reading file, pause and run a command that will cause the
+       # current offset into the file to become invalid, and then try to
+       # finish reading.
+       #
+       # cat runs in the C locale so that the error text grep looks for does
+       # not depend on the locale of the environment running the test.
+       #
+       {
+               log_must dd bs=512 count=4 >/dev/null
+               log_must eval "$cmd"
+               LC_ALL=C cat 2>&1 >/dev/null | \
+                   log_must grep "Input/output error"
+       } <$TXG_HIST
+}
+
+typeset -r TXG_HIST=/proc/spl/kstat/zfs/$TESTPOOL/txgs
+typeset -r MAX_ENTRIES_PARAM=/sys/module/zfs/parameters/zfs_txg_history
+typeset default_max_entries
+
+log_onexit cleanup
+
+# Save the current limit so that cleanup can restore it, then lower it to 50
+default_max_entries=$(cat $MAX_ENTRIES_PARAM) || \
+    log_fail "failed to read $MAX_ENTRIES_PARAM"
+echo 50 >$MAX_ENTRIES_PARAM || \
+    log_fail "failed to write to $MAX_ENTRIES_PARAM"
+
+# Clear all of the existing entries.
+do_test "echo 0 >$TXG_HIST"
+
+# Add enough new entries to the list that all of the old ones are dropped.
+do_test "sync_n 60"
+
+log_pass "Attempting to read dropped message returns expected error"
diff --git a/tests/zfs-tests/tests/functional/procfs/setup.ksh b/tests/zfs-tests/tests/functional/procfs/setup.ksh
new file mode 100755 (executable)
index 0000000..b3812db
--- /dev/null
@@ -0,0 +1,34 @@
+#!/bin/ksh -p
+#
+# CDDL HEADER START
+#
+# The contents of this file are subject to the terms of the
+# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License.
+#
+# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+# or http://www.opensolaris.org/os/licensing.
+# See the License for the specific language governing permissions
+# and limitations under the License.
+#
+# When distributing Covered Code, include this CDDL HEADER in each
+# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+# If applicable, add the following below this CDDL HEADER, with the
+# fields enclosed by brackets "[]" replaced with your own identifying
+# information: Portions Copyright [yyyy] [name of copyright owner]
+#
+# CDDL HEADER END
+#
+
+#
+# Copyright (c) 2018 by Delphix. All rights reserved.
+#
+
+. $STF_SUITE/include/libtest.shlib
+
+# procfs is only used on Linux, so there is nothing to test elsewhere
+is_linux || log_unsupported "procfs is only used on Linux"
+
+default_mirror_setup $DISKS
+log_pass