From d12614521a307c709778e5f7f91ae6085f63f9e0 Mon Sep 17 00:00:00 2001 From: John Gallagher Date: Wed, 26 Sep 2018 11:08:12 -0700 Subject: [PATCH] Fixes for procfs files backed by linked lists There are some issues with the way the seq_file interface is implemented for kstats backed by linked lists (zfs_dbgmsgs and certain per-pool debugging info): * We don't account for the fact that seq_file sometimes visits a node multiple times, which results in missing messages when read through procfs. * We don't keep separate state for each reader of a file, so concurrent readers will receive incorrect results. * We don't account for the fact that entries may have been removed from the list between read syscalls, so reading from these files in procfs can cause the system to crash. This change fixes these issues and adds procfs_list, a wrapper around a linked list which abstracts away the details of implementing the seq_file interface for a list and exposing the contents of the list through procfs. 
Reviewed by: Don Brady Reviewed-by: Serapheim Dimitropoulos Reviewed by: Brad Lewis Reviewed-by: Brian Behlendorf Signed-off-by: John Gallagher External-issue: LX-1211 Closes #7819 --- configure.ac | 2 +- include/spl/sys/Makefile.am | 1 + include/spl/sys/kstat.h | 20 +- include/spl/sys/procfs_list.h | 71 +++ include/sys/spa.h | 23 +- include/sys/zfs_context.h | 32 + include/sys/zfs_debug.h | 7 - lib/libzpool/kernel.c | 51 ++ module/spl/Makefile.in | 1 + module/spl/spl-kstat.c | 100 ++- module/spl/spl-procfs-list.c | 256 ++++++++ module/zfs/spa_stats.c | 600 +++++++----------- module/zfs/vdev_queue.c | 40 +- module/zfs/zfs_debug.c | 132 ++-- tests/runfiles/linux.run | 9 +- tests/zfs-tests/tests/functional/Makefile.am | 2 +- .../tests/functional/kstat/Makefile.am | 5 - .../tests/functional/procfs/Makefile.am | 8 + .../functional/{kstat => procfs}/cleanup.ksh | 3 +- .../state.ksh => procfs/pool_state.ksh} | 0 .../functional/procfs/procfs_list_basic.ksh | 95 +++ .../procfs/procfs_list_concurrent_readers.ksh | 82 +++ .../procfs/procfs_list_stale_read.ksh | 98 +++ .../functional/{kstat => procfs}/setup.ksh | 6 +- 24 files changed, 1086 insertions(+), 558 deletions(-) create mode 100644 include/spl/sys/procfs_list.h create mode 100644 module/spl/spl-procfs-list.c delete mode 100644 tests/zfs-tests/tests/functional/kstat/Makefile.am create mode 100644 tests/zfs-tests/tests/functional/procfs/Makefile.am rename tests/zfs-tests/tests/functional/{kstat => procfs}/cleanup.ksh (92%) rename tests/zfs-tests/tests/functional/{kstat/state.ksh => procfs/pool_state.ksh} (100%) create mode 100755 tests/zfs-tests/tests/functional/procfs/procfs_list_basic.ksh create mode 100755 tests/zfs-tests/tests/functional/procfs/procfs_list_concurrent_readers.ksh create mode 100755 tests/zfs-tests/tests/functional/procfs/procfs_list_stale_read.ksh rename tests/zfs-tests/tests/functional/{kstat => procfs}/setup.ksh (86%) diff --git a/configure.ac b/configure.ac index 18d91b359..301258e7f 100644 --- 
a/configure.ac +++ b/configure.ac @@ -283,7 +283,6 @@ AC_CONFIG_FILES([ tests/zfs-tests/tests/functional/inheritance/Makefile tests/zfs-tests/tests/functional/inuse/Makefile tests/zfs-tests/tests/functional/io/Makefile - tests/zfs-tests/tests/functional/kstat/Makefile tests/zfs-tests/tests/functional/large_files/Makefile tests/zfs-tests/tests/functional/largest_pool/Makefile tests/zfs-tests/tests/functional/link_count/Makefile @@ -301,6 +300,7 @@ AC_CONFIG_FILES([ tests/zfs-tests/tests/functional/pool_checkpoint/Makefile tests/zfs-tests/tests/functional/poolversion/Makefile tests/zfs-tests/tests/functional/privilege/Makefile + tests/zfs-tests/tests/functional/procfs/Makefile tests/zfs-tests/tests/functional/projectquota/Makefile tests/zfs-tests/tests/functional/pyzfs/Makefile tests/zfs-tests/tests/functional/quota/Makefile diff --git a/include/spl/sys/Makefile.am b/include/spl/sys/Makefile.am index d58ed0e20..e596ff373 100644 --- a/include/spl/sys/Makefile.am +++ b/include/spl/sys/Makefile.am @@ -28,6 +28,7 @@ KERNEL_H = \ $(top_srcdir)/include/spl/sys/param.h \ $(top_srcdir)/include/spl/sys/processor.h \ $(top_srcdir)/include/spl/sys/proc.h \ + $(top_srcdir)/include/spl/sys/procfs_list.h \ $(top_srcdir)/include/spl/sys/random.h \ $(top_srcdir)/include/spl/sys/rwlock.h \ $(top_srcdir)/include/spl/sys/shrinker.h \ diff --git a/include/spl/sys/kstat.h b/include/spl/sys/kstat.h index f197ce455..53274d8f5 100644 --- a/include/spl/sys/kstat.h +++ b/include/spl/sys/kstat.h @@ -98,30 +98,34 @@ typedef struct kstat_raw_ops { void *(*addr)(kstat_t *ksp, loff_t index); } kstat_raw_ops_t; +typedef struct kstat_proc_entry { + char kpe_name[KSTAT_STRLEN+1]; /* kstat name */ + char kpe_module[KSTAT_STRLEN+1]; /* provider module name */ + kstat_module_t *kpe_owner; /* kstat module linkage */ + struct list_head kpe_list; /* kstat linkage */ + struct proc_dir_entry *kpe_proc; /* procfs entry */ +} kstat_proc_entry_t; + struct kstat_s { int ks_magic; /* magic value */ kid_t ks_kid; 
/* unique kstat ID */ hrtime_t ks_crtime; /* creation time */ hrtime_t ks_snaptime; /* last access time */ - char ks_module[KSTAT_STRLEN+1]; /* provider module name */ int ks_instance; /* provider module instance */ - char ks_name[KSTAT_STRLEN+1]; /* kstat name */ char ks_class[KSTAT_STRLEN+1]; /* kstat class */ uchar_t ks_type; /* kstat data type */ uchar_t ks_flags; /* kstat flags */ void *ks_data; /* kstat type-specific data */ uint_t ks_ndata; /* # of data records */ size_t ks_data_size; /* size of kstat data section */ - struct proc_dir_entry *ks_proc; /* proc linkage */ kstat_update_t *ks_update; /* dynamic updates */ void *ks_private; /* private data */ kmutex_t ks_private_lock; /* kstat private data lock */ kmutex_t *ks_lock; /* kstat data lock */ - struct list_head ks_list; /* kstat linkage */ - kstat_module_t *ks_owner; /* kstat module linkage */ kstat_raw_ops_t ks_raw_ops; /* ops table for raw type */ char *ks_raw_buf; /* buf used for raw ops */ size_t ks_raw_bufsize; /* size of raw ops buffer */ + kstat_proc_entry_t ks_proc; /* data for procfs entry */ }; typedef struct kstat_named_s { @@ -189,6 +193,12 @@ extern kstat_t *__kstat_create(const char *ks_module, int ks_instance, const char *ks_name, const char *ks_class, uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags); +extern void kstat_proc_entry_init(kstat_proc_entry_t *kpep, + const char *module, const char *name); +extern void kstat_proc_entry_delete(kstat_proc_entry_t *kpep); +extern void kstat_proc_entry_install(kstat_proc_entry_t *kpep, + const struct file_operations *file_ops, void *data); + extern void __kstat_install(kstat_t *ksp); extern void __kstat_delete(kstat_t *ksp); extern void kstat_waitq_enter(kstat_io_t *); diff --git a/include/spl/sys/procfs_list.h b/include/spl/sys/procfs_list.h new file mode 100644 index 000000000..cbcb4bcff --- /dev/null +++ b/include/spl/sys/procfs_list.h @@ -0,0 +1,71 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of 
the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018 by Delphix. All rights reserved. + */ + +#ifndef _SPL_PROCFS_LIST_H +#define _SPL_PROCFS_LIST_H + +#include +#include +#include +#include + +typedef struct procfs_list procfs_list_t; +struct procfs_list { + /* Accessed only by user of a procfs_list */ + void *pl_private; + + /* + * Accessed both by user of a procfs_list and by procfs_list + * implementation + */ + kmutex_t pl_lock; + list_t pl_list; + + /* Accessed only by procfs_list implementation */ + uint64_t pl_next_id; + int (*pl_show)(struct seq_file *f, void *p); + int (*pl_show_header)(struct seq_file *f); + int (*pl_clear)(procfs_list_t *procfs_list); + size_t pl_node_offset; + kstat_proc_entry_t pl_kstat_entry; +}; + +typedef struct procfs_list_node { + list_node_t pln_link; + uint64_t pln_id; +} procfs_list_node_t; + +void procfs_list_install(const char *module, + const char *name, + procfs_list_t *procfs_list, + int (*show)(struct seq_file *f, void *p), + int (*show_header)(struct seq_file *f), + int (*clear)(procfs_list_t *procfs_list), + size_t procfs_list_node_off); +void procfs_list_uninstall(procfs_list_t *procfs_list); +void procfs_list_destroy(procfs_list_t *procfs_list); + +void procfs_list_add(procfs_list_t *procfs_list, void 
*p); + +#endif /* _SPL_PROCFS_LIST_H */ diff --git a/include/sys/spa.h b/include/sys/spa.h index b86c65557..443d835a1 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -863,22 +863,27 @@ extern boolean_t spa_refcount_zero(spa_t *spa); #define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO) /* Historical pool statistics */ -typedef struct spa_stats_history { +typedef struct spa_history_kstat { kmutex_t lock; uint64_t count; uint64_t size; kstat_t *kstat; void *private; list_t list; -} spa_stats_history_t; +} spa_history_kstat_t; + +typedef struct spa_history_list { + uint64_t size; + procfs_list_t procfs_list; +} spa_history_list_t; typedef struct spa_stats { - spa_stats_history_t read_history; - spa_stats_history_t txg_history; - spa_stats_history_t tx_assign_histogram; - spa_stats_history_t io_history; - spa_stats_history_t mmp_history; - spa_stats_history_t state; /* pool state */ + spa_history_list_t read_history; + spa_history_list_t txg_history; + spa_history_kstat_t tx_assign_histogram; + spa_history_kstat_t io_history; + spa_history_list_t mmp_history; + spa_history_kstat_t state; /* pool state */ } spa_stats_t; typedef enum txg_state { @@ -911,7 +916,7 @@ extern void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs); extern int spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_kstat_id); extern int spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error, hrtime_t duration); -extern void *spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, +extern void spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_kstat_id, int error); diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h index 6f502897e..11c048c23 100644 --- a/include/sys/zfs_context.h +++ b/include/sys/zfs_context.h @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -351,6 +352,37 @@ extern void kstat_set_raw_ops(kstat_t *ksp, int (*data)(char *buf, size_t 
size, void *data), void *(*addr)(kstat_t *ksp, loff_t index)); +/* + * procfs list manipulation + */ + +struct seq_file { }; +void seq_printf(struct seq_file *m, const char *fmt, ...); + +typedef struct procfs_list { + void *pl_private; + kmutex_t pl_lock; + list_t pl_list; + uint64_t pl_next_id; + size_t pl_node_offset; +} procfs_list_t; + +typedef struct procfs_list_node { + list_node_t pln_link; + uint64_t pln_id; +} procfs_list_node_t; + +void procfs_list_install(const char *module, + const char *name, + procfs_list_t *procfs_list, + int (*show)(struct seq_file *f, void *p), + int (*show_header)(struct seq_file *f), + int (*clear)(procfs_list_t *procfs_list), + size_t procfs_list_node_off); +void procfs_list_uninstall(procfs_list_t *procfs_list); +void procfs_list_destroy(procfs_list_t *procfs_list); +void procfs_list_add(procfs_list_t *procfs_list, void *p); + /* * Kernel memory */ diff --git a/include/sys/zfs_debug.h b/include/sys/zfs_debug.h index aa9bfe21f..f3a936ae7 100644 --- a/include/sys/zfs_debug.h +++ b/include/sys/zfs_debug.h @@ -76,13 +76,6 @@ extern void __dprintf(const char *file, const char *func, extern void zfs_panic_recover(const char *fmt, ...); -typedef struct zfs_dbgmsg { - list_node_t zdm_node; - time_t zdm_timestamp; - int zdm_size; - char zdm_msg[1]; /* variable length allocation */ -} zfs_dbgmsg_t; - extern void zfs_dbgmsg_init(void); extern void zfs_dbgmsg_fini(void); diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c index 341548ac3..5baf52514 100644 --- a/lib/libzpool/kernel.c +++ b/lib/libzpool/kernel.c @@ -424,6 +424,57 @@ cv_broadcast(kcondvar_t *cv) VERIFY0(pthread_cond_broadcast(cv)); } +/* + * ========================================================================= + * procfs list + * ========================================================================= + */ + +void +seq_printf(struct seq_file *m, const char *fmt, ...) 
+{} + +void +procfs_list_install(const char *module, + const char *name, + procfs_list_t *procfs_list, + int (*show)(struct seq_file *f, void *p), + int (*show_header)(struct seq_file *f), + int (*clear)(procfs_list_t *procfs_list), + size_t procfs_list_node_off) +{ + mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&procfs_list->pl_list, + procfs_list_node_off + sizeof (procfs_list_node_t), + procfs_list_node_off + offsetof(procfs_list_node_t, pln_link)); + procfs_list->pl_next_id = 1; + procfs_list->pl_node_offset = procfs_list_node_off; +} + +void +procfs_list_uninstall(procfs_list_t *procfs_list) +{} + +void +procfs_list_destroy(procfs_list_t *procfs_list) +{ + ASSERT(list_is_empty(&procfs_list->pl_list)); + list_destroy(&procfs_list->pl_list); + mutex_destroy(&procfs_list->pl_lock); +} + +#define NODE_ID(procfs_list, obj) \ + (((procfs_list_node_t *)(((char *)obj) + \ + (procfs_list)->pl_node_offset))->pln_id) + +void +procfs_list_add(procfs_list_t *procfs_list, void *p) +{ + ASSERT(MUTEX_HELD(&procfs_list->pl_lock)); + NODE_ID(procfs_list, p) = procfs_list->pl_next_id++; + list_insert_tail(&procfs_list->pl_list, p); +} + /* * ========================================================================= * vnode operations diff --git a/module/spl/Makefile.in b/module/spl/Makefile.in index 97a431f22..3bcbf63cb 100644 --- a/module/spl/Makefile.in +++ b/module/spl/Makefile.in @@ -18,6 +18,7 @@ $(MODULE)-objs += spl-kobj.o $(MODULE)-objs += spl-kstat.o $(MODULE)-objs += spl-mutex.o $(MODULE)-objs += spl-proc.o +$(MODULE)-objs += spl-procfs-list.o $(MODULE)-objs += spl-rwlock.o $(MODULE)-objs += spl-taskq.o $(MODULE)-objs += spl-thread.o diff --git a/module/spl/spl-kstat.c b/module/spl/spl-kstat.c index c3fc2e4b2..8683693c8 100644 --- a/module/spl/spl-kstat.c +++ b/module/spl/spl-kstat.c @@ -530,6 +530,18 @@ __kstat_set_raw_ops(kstat_t *ksp, } EXPORT_SYMBOL(__kstat_set_raw_ops); +void +kstat_proc_entry_init(kstat_proc_entry_t *kpep, const char 
*module, + const char *name) +{ + kpep->kpe_owner = NULL; + kpep->kpe_proc = NULL; + INIT_LIST_HEAD(&kpep->kpe_list); + strncpy(kpep->kpe_module, module, KSTAT_STRLEN); + strncpy(kpep->kpe_name, name, KSTAT_STRLEN); +} +EXPORT_SYMBOL(kstat_proc_entry_init); + kstat_t * __kstat_create(const char *ks_module, int ks_instance, const char *ks_name, const char *ks_class, uchar_t ks_type, uint_t ks_ndata, @@ -556,13 +568,10 @@ __kstat_create(const char *ks_module, int ks_instance, const char *ks_name, ksp->ks_magic = KS_MAGIC; mutex_init(&ksp->ks_private_lock, NULL, MUTEX_DEFAULT, NULL); ksp->ks_lock = &ksp->ks_private_lock; - INIT_LIST_HEAD(&ksp->ks_list); ksp->ks_crtime = gethrtime(); ksp->ks_snaptime = ksp->ks_crtime; - strncpy(ksp->ks_module, ks_module, KSTAT_STRLEN); ksp->ks_instance = ks_instance; - strncpy(ksp->ks_name, ks_name, KSTAT_STRLEN); strncpy(ksp->ks_class, ks_class, KSTAT_STRLEN); ksp->ks_type = ks_type; ksp->ks_flags = ks_flags; @@ -573,6 +582,7 @@ __kstat_create(const char *ks_module, int ks_instance, const char *ks_name, ksp->ks_raw_ops.addr = NULL; ksp->ks_raw_buf = NULL; ksp->ks_raw_bufsize = 0; + kstat_proc_entry_init(&ksp->ks_proc, ks_module, ks_name); switch (ksp->ks_type) { case KSTAT_TYPE_RAW: @@ -614,14 +624,14 @@ __kstat_create(const char *ks_module, int ks_instance, const char *ks_name, EXPORT_SYMBOL(__kstat_create); static int -kstat_detect_collision(kstat_t *ksp) +kstat_detect_collision(kstat_proc_entry_t *kpep) { kstat_module_t *module; - kstat_t *tmp; + kstat_proc_entry_t *tmp; char *parent; char *cp; - parent = kmem_asprintf("%s", ksp->ks_module); + parent = kmem_asprintf("%s", kpep->kpe_module); if ((cp = strrchr(parent, '/')) == NULL) { strfree(parent); @@ -630,8 +640,8 @@ kstat_detect_collision(kstat_t *ksp) cp[0] = '\0'; if ((module = kstat_find_module(parent)) != NULL) { - list_for_each_entry(tmp, &module->ksm_kstat_list, ks_list) { - if (strncmp(tmp->ks_name, cp+1, KSTAT_STRLEN) == 0) { + list_for_each_entry(tmp, 
&module->ksm_kstat_list, kpe_list) { + if (strncmp(tmp->kpe_name, cp+1, KSTAT_STRLEN) == 0) { strfree(parent); return (EEXIST); } @@ -642,24 +652,30 @@ kstat_detect_collision(kstat_t *ksp) return (0); } +/* + * Add a file to the proc filesystem under the kstat namespace (i.e. + * /proc/spl/kstat/). The file need not necessarily be implemented as a + * kstat. + */ void -__kstat_install(kstat_t *ksp) +kstat_proc_entry_install(kstat_proc_entry_t *kpep, + const struct file_operations *file_ops, void *data) { kstat_module_t *module; - kstat_t *tmp; + kstat_proc_entry_t *tmp; - ASSERT(ksp); + ASSERT(kpep); mutex_enter(&kstat_module_lock); - module = kstat_find_module(ksp->ks_module); + module = kstat_find_module(kpep->kpe_module); if (module == NULL) { - if (kstat_detect_collision(ksp) != 0) { + if (kstat_detect_collision(kpep) != 0) { cmn_err(CE_WARN, "kstat_create('%s', '%s'): namespace" \ - " collision", ksp->ks_module, ksp->ks_name); + " collision", kpep->kpe_module, kpep->kpe_name); goto out; } - module = kstat_create_module(ksp->ks_module); + module = kstat_create_module(kpep->kpe_module); if (module == NULL) goto out; } @@ -668,44 +684,60 @@ __kstat_install(kstat_t *ksp) * Only one entry by this name per-module, on failure the module * shouldn't be deleted because we know it has at least one entry. 
*/ - list_for_each_entry(tmp, &module->ksm_kstat_list, ks_list) { - if (strncmp(tmp->ks_name, ksp->ks_name, KSTAT_STRLEN) == 0) + list_for_each_entry(tmp, &module->ksm_kstat_list, kpe_list) { + if (strncmp(tmp->kpe_name, kpep->kpe_name, KSTAT_STRLEN) == 0) goto out; } - list_add_tail(&ksp->ks_list, &module->ksm_kstat_list); + list_add_tail(&kpep->kpe_list, &module->ksm_kstat_list); - mutex_enter(ksp->ks_lock); - ksp->ks_owner = module; - ksp->ks_proc = proc_create_data(ksp->ks_name, 0644, - module->ksm_proc, &proc_kstat_operations, (void *)ksp); - if (ksp->ks_proc == NULL) { - list_del_init(&ksp->ks_list); + kpep->kpe_owner = module; + kpep->kpe_proc = proc_create_data(kpep->kpe_name, 0644, + module->ksm_proc, file_ops, data); + if (kpep->kpe_proc == NULL) { + list_del_init(&kpep->kpe_list); if (list_empty(&module->ksm_kstat_list)) kstat_delete_module(module); } - mutex_exit(ksp->ks_lock); out: mutex_exit(&kstat_module_lock); + +} +EXPORT_SYMBOL(kstat_proc_entry_install); + +void +__kstat_install(kstat_t *ksp) +{ + ASSERT(ksp); + kstat_proc_entry_install(&ksp->ks_proc, &proc_kstat_operations, ksp); } EXPORT_SYMBOL(__kstat_install); void -__kstat_delete(kstat_t *ksp) +kstat_proc_entry_delete(kstat_proc_entry_t *kpep) { - kstat_module_t *module = ksp->ks_owner; + kstat_module_t *module = kpep->kpe_owner; + if (kpep->kpe_proc) + remove_proc_entry(kpep->kpe_name, module->ksm_proc); mutex_enter(&kstat_module_lock); - list_del_init(&ksp->ks_list); + list_del_init(&kpep->kpe_list); + + /* + * Remove top level module directory if it wasn't empty before, but now + * is. 
+ */ + if (kpep->kpe_proc && list_empty(&module->ksm_kstat_list)) + kstat_delete_module(module); mutex_exit(&kstat_module_lock); - if (ksp->ks_proc) { - remove_proc_entry(ksp->ks_name, module->ksm_proc); +} +EXPORT_SYMBOL(kstat_proc_entry_delete); - /* Remove top level module directory if it's empty */ - if (list_empty(&module->ksm_kstat_list)) - kstat_delete_module(module); - } +void +__kstat_delete(kstat_t *ksp) +{ + kstat_proc_entry_delete(&ksp->ks_proc); if (!(ksp->ks_flags & KSTAT_FLAG_VIRTUAL)) kmem_free(ksp->ks_data, ksp->ks_data_size); diff --git a/module/spl/spl-procfs-list.c b/module/spl/spl-procfs-list.c new file mode 100644 index 000000000..4902e0a56 --- /dev/null +++ b/module/spl/spl-procfs-list.c @@ -0,0 +1,256 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include + +/* + * A procfs_list is a wrapper around a linked list which implements the seq_file + * interface, allowing the contents of the list to be exposed through procfs. 
+ * The kernel already has some utilities to help implement the seq_file + * interface for linked lists (seq_list_*), but they aren't appropriate for use + * with lists that have many entries, because seq_list_start walks the list at + * the start of each read syscall to find where it left off, so reading a file + * ends up being quadratic in the number of entries in the list. + * + * This implementation avoids this penalty by maintaining a separate cursor into + * the list per instance of the file that is open. It also maintains some extra + * information in each node of the list to prevent reads of entries that have + * been dropped from the list. + * + * Callers should only add elements to the list using procfs_list_add, which + * adds an element to the tail of the list. Other operations can be performed + * directly on the wrapped list using the normal list manipulation functions, + * but elements should only be removed from the head of the list. + */ + +#define NODE_ID(procfs_list, obj) \ + (((procfs_list_node_t *)(((char *)obj) + \ + (procfs_list)->pl_node_offset))->pln_id) + +typedef struct procfs_list_cursor { + procfs_list_t *procfs_list; /* List into which this cursor points */ + void *cached_node; /* Most recently accessed node */ + loff_t cached_pos; /* Position of cached_node */ +} procfs_list_cursor_t; + +static int +procfs_list_seq_show(struct seq_file *f, void *p) +{ + procfs_list_cursor_t *cursor = f->private; + procfs_list_t *procfs_list = cursor->procfs_list; + + ASSERT(MUTEX_HELD(&procfs_list->pl_lock)); + if (p == SEQ_START_TOKEN) { + if (procfs_list->pl_show_header != NULL) + return (procfs_list->pl_show_header(f)); + else + return (0); + } + return (procfs_list->pl_show(f, p)); +} + +static void * +procfs_list_next_node(procfs_list_cursor_t *cursor, loff_t *pos) +{ + void *next_node; + procfs_list_t *procfs_list = cursor->procfs_list; + + if (cursor->cached_node == SEQ_START_TOKEN) + next_node = list_head(&procfs_list->pl_list); + else + 
next_node = list_next(&procfs_list->pl_list, + cursor->cached_node); + + if (next_node != NULL) { + cursor->cached_node = next_node; + cursor->cached_pos = NODE_ID(procfs_list, cursor->cached_node); + *pos = cursor->cached_pos; + } + return (next_node); +} + +static void * +procfs_list_seq_start(struct seq_file *f, loff_t *pos) +{ + procfs_list_cursor_t *cursor = f->private; + procfs_list_t *procfs_list = cursor->procfs_list; + + mutex_enter(&procfs_list->pl_lock); + + if (*pos == 0) { + cursor->cached_node = SEQ_START_TOKEN; + cursor->cached_pos = 0; + return (SEQ_START_TOKEN); + } + + /* + * Check if our cached pointer has become stale, which happens if the + * message where we left off has been dropped from the list since + * the last read syscall completed. + */ + void *oldest_node = list_head(&procfs_list->pl_list); + if (cursor->cached_node != SEQ_START_TOKEN && (oldest_node == NULL || + NODE_ID(procfs_list, oldest_node) > cursor->cached_pos)) + return (ERR_PTR(-EIO)); + + /* + * If it isn't starting from the beginning of the file, the seq_file + * code will either pick up at the same position it visited last or the + * following one. 
+ */ + if (*pos == cursor->cached_pos) { + return (cursor->cached_node); + } else { + ASSERT3U(*pos, ==, cursor->cached_pos + 1); + return (procfs_list_next_node(cursor, pos)); + } +} + +static void * +procfs_list_seq_next(struct seq_file *f, void *p, loff_t *pos) +{ + procfs_list_cursor_t *cursor = f->private; + ASSERT(MUTEX_HELD(&cursor->procfs_list->pl_lock)); + return (procfs_list_next_node(cursor, pos)); +} + +static void +procfs_list_seq_stop(struct seq_file *f, void *p) +{ + procfs_list_cursor_t *cursor = f->private; + procfs_list_t *procfs_list = cursor->procfs_list; + mutex_exit(&procfs_list->pl_lock); +} + +static struct seq_operations procfs_list_seq_ops = { + .show = procfs_list_seq_show, + .start = procfs_list_seq_start, + .next = procfs_list_seq_next, + .stop = procfs_list_seq_stop, +}; + +static int +procfs_list_open(struct inode *inode, struct file *filp) +{ + int rc = seq_open_private(filp, &procfs_list_seq_ops, + sizeof (procfs_list_cursor_t)); + if (rc != 0) + return (rc); + + struct seq_file *f = filp->private_data; + procfs_list_cursor_t *cursor = f->private; + cursor->procfs_list = PDE_DATA(inode); + cursor->cached_node = NULL; + cursor->cached_pos = 0; + + return (0); +} + +static ssize_t +procfs_list_write(struct file *filp, const char __user *buf, size_t len, + loff_t *ppos) +{ + struct seq_file *f = filp->private_data; + procfs_list_cursor_t *cursor = f->private; + procfs_list_t *procfs_list = cursor->procfs_list; + int rc; + + if (procfs_list->pl_clear != NULL && + (rc = procfs_list->pl_clear(procfs_list)) != 0) + return (-rc); + return (len); +} + +static struct file_operations procfs_list_operations = { + .owner = THIS_MODULE, + .open = procfs_list_open, + .write = procfs_list_write, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +}; + +/* + * Initialize a procfs_list and create a file for it in the proc filesystem + * under the kstat namespace. 
+ */ +void +procfs_list_install(const char *module, + const char *name, + procfs_list_t *procfs_list, + int (*show)(struct seq_file *f, void *p), + int (*show_header)(struct seq_file *f), + int (*clear)(procfs_list_t *procfs_list), + size_t procfs_list_node_off) +{ + mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&procfs_list->pl_list, + procfs_list_node_off + sizeof (procfs_list_node_t), + procfs_list_node_off + offsetof(procfs_list_node_t, pln_link)); + procfs_list->pl_next_id = 1; /* Save id 0 for SEQ_START_TOKEN */ + procfs_list->pl_show = show; + procfs_list->pl_show_header = show_header; + procfs_list->pl_clear = clear; + procfs_list->pl_node_offset = procfs_list_node_off; + + kstat_proc_entry_init(&procfs_list->pl_kstat_entry, module, name); + kstat_proc_entry_install(&procfs_list->pl_kstat_entry, + &procfs_list_operations, procfs_list); +} +EXPORT_SYMBOL(procfs_list_install); + +/* Remove the proc filesystem file corresponding to the given list */ +void +procfs_list_uninstall(procfs_list_t *procfs_list) +{ + kstat_proc_entry_delete(&procfs_list->pl_kstat_entry); +} +EXPORT_SYMBOL(procfs_list_uninstall); + +void +procfs_list_destroy(procfs_list_t *procfs_list) +{ + ASSERT(list_is_empty(&procfs_list->pl_list)); + list_destroy(&procfs_list->pl_list); + mutex_destroy(&procfs_list->pl_lock); +} +EXPORT_SYMBOL(procfs_list_destroy); + +/* + * Add a new node to the tail of the list. While the standard list manipulation + * functions can be used for all other operations, adding elements to the list + * should only be done using this helper so that the id of the new node is set + * correctly. 
+ */ +void +procfs_list_add(procfs_list_t *procfs_list, void *p) +{ + ASSERT(MUTEX_HELD(&procfs_list->pl_lock)); + NODE_ID(procfs_list, p) = procfs_list->pl_next_id++; + list_insert_tail(&procfs_list->pl_list, p); +} +EXPORT_SYMBOL(procfs_list_add); diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c index fa1cf9e98..c02ef86b5 100644 --- a/module/zfs/spa_stats.c +++ b/module/zfs/spa_stats.c @@ -55,7 +55,6 @@ int zfs_multihost_history = 0; * Read statistics - Information exported regarding each arc_read call */ typedef struct spa_read_history { - uint64_t uid; /* unique identifier */ hrtime_t start; /* time read completed */ uint64_t objset; /* read from this objset */ uint64_t object; /* read of this object number */ @@ -65,13 +64,13 @@ typedef struct spa_read_history { uint32_t aflags; /* ARC flags (cached, prefetch, etc.) */ pid_t pid; /* PID of task doing read */ char comm[16]; /* process name of task doing read */ - list_node_t srh_link; + procfs_list_node_t srh_node; } spa_read_history_t; static int -spa_read_history_headers(char *buf, size_t size) +spa_read_history_show_header(struct seq_file *f) { - (void) snprintf(buf, size, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s " + seq_printf(f, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s " "%-24s %-8s %-16s\n", "UID", "start", "objset", "object", "level", "blkid", "aflags", "origin", "pid", "process"); @@ -79,13 +78,13 @@ spa_read_history_headers(char *buf, size_t size) } static int -spa_read_history_data(char *buf, size_t size, void *data) +spa_read_history_show(struct seq_file *f, void *data) { spa_read_history_t *srh = (spa_read_history_t *)data; - (void) snprintf(buf, size, "%-8llu %-16llu 0x%-6llx " + seq_printf(f, "%-8llu %-16llu 0x%-6llx " "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n", - (u_longlong_t)srh->uid, srh->start, + (u_longlong_t)srh->srh_node.pln_id, srh->start, (longlong_t)srh->objset, (longlong_t)srh->object, (longlong_t)srh->level, (longlong_t)srh->blkid, srh->aflags, srh->origin, srh->pid, 
srh->comm); @@ -93,120 +92,73 @@ spa_read_history_data(char *buf, size_t size, void *data) return (0); } -/* - * Calculate the address for the next spa_stats_history_t entry. The - * ssh->lock will be held until ksp->ks_ndata entries are processed. - */ -static void * -spa_read_history_addr(kstat_t *ksp, loff_t n) +/* Remove oldest elements from list until there are no more than 'size' left */ +static void +spa_read_history_truncate(spa_history_list_t *shl, unsigned int size) { - spa_t *spa = ksp->ks_private; - spa_stats_history_t *ssh = &spa->spa_stats.read_history; - - ASSERT(MUTEX_HELD(&ssh->lock)); - - if (n == 0) - ssh->private = list_tail(&ssh->list); - else if (ssh->private) - ssh->private = list_prev(&ssh->list, ssh->private); + spa_read_history_t *srh; + while (shl->size > size) { + srh = list_remove_head(&shl->procfs_list.pl_list); + ASSERT3P(srh, !=, NULL); + kmem_free(srh, sizeof (spa_read_history_t)); + shl->size--; + } - return (ssh->private); + if (size == 0) + ASSERT(list_is_empty(&shl->procfs_list.pl_list)); } -/* - * When the kstat is written discard all spa_read_history_t entries. The - * ssh->lock will be held until ksp->ks_ndata entries are processed. 
- */ static int -spa_read_history_update(kstat_t *ksp, int rw) +spa_read_history_clear(procfs_list_t *procfs_list) { - spa_t *spa = ksp->ks_private; - spa_stats_history_t *ssh = &spa->spa_stats.read_history; - - if (rw == KSTAT_WRITE) { - spa_read_history_t *srh; - - while ((srh = list_remove_head(&ssh->list))) { - ssh->size--; - kmem_free(srh, sizeof (spa_read_history_t)); - } - - ASSERT3U(ssh->size, ==, 0); - } - - ksp->ks_ndata = ssh->size; - ksp->ks_data_size = ssh->size * sizeof (spa_read_history_t); - + spa_history_list_t *shl = procfs_list->pl_private; + mutex_enter(&procfs_list->pl_lock); + spa_read_history_truncate(shl, 0); + mutex_exit(&procfs_list->pl_lock); return (0); } static void spa_read_history_init(spa_t *spa) { - spa_stats_history_t *ssh = &spa->spa_stats.read_history; - char *name; - kstat_t *ksp; + spa_history_list_t *shl = &spa->spa_stats.read_history; + char *module; - mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&ssh->list, sizeof (spa_read_history_t), - offsetof(spa_read_history_t, srh_link)); + shl->size = 0; - ssh->count = 0; - ssh->size = 0; - ssh->private = NULL; + module = kmem_asprintf("zfs/%s", spa_name(spa)); - name = kmem_asprintf("zfs/%s", spa_name(spa)); + shl->procfs_list.pl_private = shl; + procfs_list_install(module, + "reads", + &shl->procfs_list, + spa_read_history_show, + spa_read_history_show_header, + spa_read_history_clear, + offsetof(spa_read_history_t, srh_node)); - ksp = kstat_create(name, 0, "reads", "misc", - KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); - ssh->kstat = ksp; - - if (ksp) { - ksp->ks_lock = &ssh->lock; - ksp->ks_data = NULL; - ksp->ks_private = spa; - ksp->ks_update = spa_read_history_update; - kstat_set_raw_ops(ksp, spa_read_history_headers, - spa_read_history_data, spa_read_history_addr); - kstat_install(ksp); - } - strfree(name); + strfree(module); } static void spa_read_history_destroy(spa_t *spa) { - spa_stats_history_t *ssh = &spa->spa_stats.read_history; - spa_read_history_t *srh; 
- kstat_t *ksp; - - ksp = ssh->kstat; - if (ksp) - kstat_delete(ksp); - - mutex_enter(&ssh->lock); - while ((srh = list_remove_head(&ssh->list))) { - ssh->size--; - kmem_free(srh, sizeof (spa_read_history_t)); - } - - ASSERT3U(ssh->size, ==, 0); - list_destroy(&ssh->list); - mutex_exit(&ssh->lock); - - mutex_destroy(&ssh->lock); + spa_history_list_t *shl = &spa->spa_stats.read_history; + procfs_list_uninstall(&shl->procfs_list); + spa_read_history_truncate(shl, 0); + procfs_list_destroy(&shl->procfs_list); } void spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags) { - spa_stats_history_t *ssh = &spa->spa_stats.read_history; - spa_read_history_t *srh, *rm; + spa_history_list_t *shl = &spa->spa_stats.read_history; + spa_read_history_t *srh; ASSERT3P(spa, !=, NULL); ASSERT3P(zb, !=, NULL); - if (zfs_read_history == 0 && ssh->size == 0) + if (zfs_read_history == 0 && shl->size == 0) return; if (zfs_read_history_hits == 0 && (aflags & ARC_FLAG_CACHED)) @@ -222,19 +174,14 @@ spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags) srh->aflags = aflags; srh->pid = getpid(); - mutex_enter(&ssh->lock); + mutex_enter(&shl->procfs_list.pl_lock); - srh->uid = ssh->count++; - list_insert_head(&ssh->list, srh); - ssh->size++; + procfs_list_add(&shl->procfs_list, srh); + shl->size++; - while (ssh->size > zfs_read_history) { - ssh->size--; - rm = list_remove_tail(&ssh->list); - kmem_free(rm, sizeof (spa_read_history_t)); - } + spa_read_history_truncate(shl, zfs_read_history); - mutex_exit(&ssh->lock); + mutex_exit(&shl->procfs_list.pl_lock); } /* @@ -256,22 +203,21 @@ typedef struct spa_txg_history { uint64_t writes; /* number of write operations */ uint64_t ndirty; /* number of dirty bytes */ hrtime_t times[TXG_STATE_COMMITTED]; /* completion times */ - list_node_t sth_link; + procfs_list_node_t sth_node; } spa_txg_history_t; static int -spa_txg_history_headers(char *buf, size_t size) +spa_txg_history_show_header(struct seq_file *f) 
{ - (void) snprintf(buf, size, "%-8s %-16s %-5s %-12s %-12s %-12s " + seq_printf(f, "%-8s %-16s %-5s %-12s %-12s %-12s " "%-8s %-8s %-12s %-12s %-12s %-12s\n", "txg", "birth", "state", "ndirty", "nread", "nwritten", "reads", "writes", "otime", "qtime", "wtime", "stime"); - return (0); } static int -spa_txg_history_data(char *buf, size_t size, void *data) +spa_txg_history_show(struct seq_file *f, void *data) { spa_txg_history_t *sth = (spa_txg_history_t *)data; uint64_t open = 0, quiesce = 0, wait = 0, sync = 0; @@ -303,7 +249,7 @@ spa_txg_history_data(char *buf, size_t size, void *data) sync = sth->times[TXG_STATE_SYNCED] - sth->times[TXG_STATE_WAIT_FOR_SYNC]; - (void) snprintf(buf, size, "%-8llu %-16llu %-5c %-12llu " + seq_printf(f, "%-8llu %-16llu %-5c %-12llu " "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu %-12llu\n", (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state, (u_longlong_t)sth->ndirty, @@ -315,110 +261,62 @@ spa_txg_history_data(char *buf, size_t size, void *data) return (0); } -/* - * Calculate the address for the next spa_stats_history_t entry. The - * ssh->lock will be held until ksp->ks_ndata entries are processed. 
- */ -static void * -spa_txg_history_addr(kstat_t *ksp, loff_t n) +/* Remove oldest elements from list until there are no more than 'size' left */ +static void +spa_txg_history_truncate(spa_history_list_t *shl, unsigned int size) { - spa_t *spa = ksp->ks_private; - spa_stats_history_t *ssh = &spa->spa_stats.txg_history; - - ASSERT(MUTEX_HELD(&ssh->lock)); + spa_txg_history_t *sth; + while (shl->size > size) { + sth = list_remove_head(&shl->procfs_list.pl_list); + ASSERT3P(sth, !=, NULL); + kmem_free(sth, sizeof (spa_txg_history_t)); + shl->size--; + } - if (n == 0) - ssh->private = list_tail(&ssh->list); - else if (ssh->private) - ssh->private = list_prev(&ssh->list, ssh->private); + if (size == 0) + ASSERT(list_is_empty(&shl->procfs_list.pl_list)); - return (ssh->private); } -/* - * When the kstat is written discard all spa_txg_history_t entries. The - * ssh->lock will be held until ksp->ks_ndata entries are processed. - */ static int -spa_txg_history_update(kstat_t *ksp, int rw) +spa_txg_history_clear(procfs_list_t *procfs_list) { - spa_t *spa = ksp->ks_private; - spa_stats_history_t *ssh = &spa->spa_stats.txg_history; - - ASSERT(MUTEX_HELD(&ssh->lock)); - - if (rw == KSTAT_WRITE) { - spa_txg_history_t *sth; - - while ((sth = list_remove_head(&ssh->list))) { - ssh->size--; - kmem_free(sth, sizeof (spa_txg_history_t)); - } - - ASSERT3U(ssh->size, ==, 0); - } - - ksp->ks_ndata = ssh->size; - ksp->ks_data_size = ssh->size * sizeof (spa_txg_history_t); - + spa_history_list_t *shl = procfs_list->pl_private; + mutex_enter(&procfs_list->pl_lock); + spa_txg_history_truncate(shl, 0); + mutex_exit(&procfs_list->pl_lock); return (0); } static void spa_txg_history_init(spa_t *spa) { - spa_stats_history_t *ssh = &spa->spa_stats.txg_history; - char *name; - kstat_t *ksp; + spa_history_list_t *shl = &spa->spa_stats.txg_history; + char *module; - mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&ssh->list, sizeof (spa_txg_history_t), - offsetof(spa_txg_history_t, 
sth_link)); + shl->size = 0; - ssh->count = 0; - ssh->size = 0; - ssh->private = NULL; + module = kmem_asprintf("zfs/%s", spa_name(spa)); - name = kmem_asprintf("zfs/%s", spa_name(spa)); + shl->procfs_list.pl_private = shl; + procfs_list_install(module, + "txgs", + &shl->procfs_list, + spa_txg_history_show, + spa_txg_history_show_header, + spa_txg_history_clear, + offsetof(spa_txg_history_t, sth_node)); - ksp = kstat_create(name, 0, "txgs", "misc", - KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); - ssh->kstat = ksp; - - if (ksp) { - ksp->ks_lock = &ssh->lock; - ksp->ks_data = NULL; - ksp->ks_private = spa; - ksp->ks_update = spa_txg_history_update; - kstat_set_raw_ops(ksp, spa_txg_history_headers, - spa_txg_history_data, spa_txg_history_addr); - kstat_install(ksp); - } - strfree(name); + strfree(module); } static void spa_txg_history_destroy(spa_t *spa) { - spa_stats_history_t *ssh = &spa->spa_stats.txg_history; - spa_txg_history_t *sth; - kstat_t *ksp; - - ksp = ssh->kstat; - if (ksp) - kstat_delete(ksp); - - mutex_enter(&ssh->lock); - while ((sth = list_remove_head(&ssh->list))) { - ssh->size--; - kmem_free(sth, sizeof (spa_txg_history_t)); - } - - ASSERT3U(ssh->size, ==, 0); - list_destroy(&ssh->list); - mutex_exit(&ssh->lock); - - mutex_destroy(&ssh->lock); + spa_history_list_t *shl = &spa->spa_stats.txg_history; + procfs_list_uninstall(&shl->procfs_list); + spa_txg_history_truncate(shl, 0); + procfs_list_destroy(&shl->procfs_list); } /* @@ -427,10 +325,10 @@ spa_txg_history_destroy(spa_t *spa) void spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time) { - spa_stats_history_t *ssh = &spa->spa_stats.txg_history; - spa_txg_history_t *sth, *rm; + spa_history_list_t *shl = &spa->spa_stats.txg_history; + spa_txg_history_t *sth; - if (zfs_txg_history == 0 && ssh->size == 0) + if (zfs_txg_history == 0 && shl->size == 0) return; sth = kmem_zalloc(sizeof (spa_txg_history_t), KM_SLEEP); @@ -438,18 +336,11 @@ spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t 
birth_time) sth->state = TXG_STATE_OPEN; sth->times[TXG_STATE_BIRTH] = birth_time; - mutex_enter(&ssh->lock); - - list_insert_head(&ssh->list, sth); - ssh->size++; - - while (ssh->size > zfs_txg_history) { - ssh->size--; - rm = list_remove_tail(&ssh->list); - kmem_free(rm, sizeof (spa_txg_history_t)); - } - - mutex_exit(&ssh->lock); + mutex_enter(&shl->procfs_list.pl_lock); + procfs_list_add(&shl->procfs_list, sth); + shl->size++; + spa_txg_history_truncate(shl, zfs_txg_history); + mutex_exit(&shl->procfs_list.pl_lock); } /* @@ -459,16 +350,16 @@ int spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, hrtime_t completed_time) { - spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + spa_history_list_t *shl = &spa->spa_stats.txg_history; spa_txg_history_t *sth; int error = ENOENT; if (zfs_txg_history == 0) return (0); - mutex_enter(&ssh->lock); - for (sth = list_head(&ssh->list); sth != NULL; - sth = list_next(&ssh->list, sth)) { + mutex_enter(&shl->procfs_list.pl_lock); + for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL; + sth = list_prev(&shl->procfs_list.pl_list, sth)) { if (sth->txg == txg) { sth->times[completed_state] = completed_time; sth->state++; @@ -476,7 +367,7 @@ spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, break; } } - mutex_exit(&ssh->lock); + mutex_exit(&shl->procfs_list.pl_lock); return (error); } @@ -488,16 +379,16 @@ static int spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty) { - spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + spa_history_list_t *shl = &spa->spa_stats.txg_history; spa_txg_history_t *sth; int error = ENOENT; if (zfs_txg_history == 0) return (0); - mutex_enter(&ssh->lock); - for (sth = list_head(&ssh->list); sth != NULL; - sth = list_next(&ssh->list, sth)) { + mutex_enter(&shl->procfs_list.pl_lock); + for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL; + sth = 
list_prev(&shl->procfs_list.pl_list, sth)) { if (sth->txg == txg) { sth->nread = nread; sth->nwritten = nwritten; @@ -508,7 +399,7 @@ spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, break; } } - mutex_exit(&ssh->lock); + mutex_exit(&shl->procfs_list.pl_lock); return (error); } @@ -580,16 +471,16 @@ static int spa_tx_assign_update(kstat_t *ksp, int rw) { spa_t *spa = ksp->ks_private; - spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram; + spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; int i; if (rw == KSTAT_WRITE) { - for (i = 0; i < ssh->count; i++) - ((kstat_named_t *)ssh->private)[i].value.ui64 = 0; + for (i = 0; i < shk->count; i++) + ((kstat_named_t *)shk->private)[i].value.ui64 = 0; } - for (i = ssh->count; i > 0; i--) - if (((kstat_named_t *)ssh->private)[i-1].value.ui64 != 0) + for (i = shk->count; i > 0; i--) + if (((kstat_named_t *)shk->private)[i-1].value.ui64 != 0) break; ksp->ks_ndata = i; @@ -601,22 +492,22 @@ spa_tx_assign_update(kstat_t *ksp, int rw) static void spa_tx_assign_init(spa_t *spa) { - spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram; + spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; char *name; kstat_named_t *ks; kstat_t *ksp; int i; - mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); - ssh->count = 42; /* power of two buckets for 1ns to 2,199s */ - ssh->size = ssh->count * sizeof (kstat_named_t); - ssh->private = kmem_alloc(ssh->size, KM_SLEEP); + shk->count = 42; /* power of two buckets for 1ns to 2,199s */ + shk->size = shk->count * sizeof (kstat_named_t); + shk->private = kmem_alloc(shk->size, KM_SLEEP); name = kmem_asprintf("zfs/%s", spa_name(spa)); - for (i = 0; i < ssh->count; i++) { - ks = &((kstat_named_t *)ssh->private)[i]; + for (i = 0; i < shk->count; i++) { + ks = &((kstat_named_t *)shk->private)[i]; ks->data_type = KSTAT_DATA_UINT64; ks->value.ui64 = 0; (void) snprintf(ks->name, KSTAT_STRLEN, 
"%llu ns", @@ -625,13 +516,13 @@ spa_tx_assign_init(spa_t *spa) ksp = kstat_create(name, 0, "dmu_tx_assign", "misc", KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL); - ssh->kstat = ksp; + shk->kstat = ksp; if (ksp) { - ksp->ks_lock = &ssh->lock; - ksp->ks_data = ssh->private; - ksp->ks_ndata = ssh->count; - ksp->ks_data_size = ssh->size; + ksp->ks_lock = &shk->lock; + ksp->ks_data = shk->private; + ksp->ks_ndata = shk->count; + ksp->ks_data_size = shk->size; ksp->ks_private = spa; ksp->ks_update = spa_tx_assign_update; kstat_install(ksp); @@ -642,27 +533,27 @@ spa_tx_assign_init(spa_t *spa) static void spa_tx_assign_destroy(spa_t *spa) { - spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram; + spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; kstat_t *ksp; - ksp = ssh->kstat; + ksp = shk->kstat; if (ksp) kstat_delete(ksp); - kmem_free(ssh->private, ssh->size); - mutex_destroy(&ssh->lock); + kmem_free(shk->private, shk->size); + mutex_destroy(&shk->lock); } void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs) { - spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram; + spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; uint64_t idx = 0; - while (((1ULL << idx) < nsecs) && (idx < ssh->size - 1)) + while (((1ULL << idx) < nsecs) && (idx < shk->size - 1)) idx++; - atomic_inc_64(&((kstat_named_t *)ssh->private)[idx].value.ui64); + atomic_inc_64(&((kstat_named_t *)shk->private)[idx].value.ui64); } /* @@ -682,19 +573,19 @@ spa_io_history_update(kstat_t *ksp, int rw) static void spa_io_history_init(spa_t *spa) { - spa_stats_history_t *ssh = &spa->spa_stats.io_history; + spa_history_kstat_t *shk = &spa->spa_stats.io_history; char *name; kstat_t *ksp; - mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); name = kmem_asprintf("zfs/%s", spa_name(spa)); ksp = kstat_create(name, 0, "io", "disk", KSTAT_TYPE_IO, 1, 0); - ssh->kstat = ksp; + shk->kstat = ksp; if (ksp) { - ksp->ks_lock = 
&ssh->lock; + ksp->ks_lock = &shk->lock; ksp->ks_private = spa; ksp->ks_update = spa_io_history_update; kstat_install(ksp); @@ -705,12 +596,12 @@ spa_io_history_init(spa_t *spa) static void spa_io_history_destroy(spa_t *spa) { - spa_stats_history_t *ssh = &spa->spa_stats.io_history; + spa_history_kstat_t *shk = &spa->spa_stats.io_history; - if (ssh->kstat) - kstat_delete(ssh->kstat); + if (shk->kstat) + kstat_delete(shk->kstat); - mutex_destroy(&ssh->lock); + mutex_destroy(&shk->lock); } /* @@ -733,7 +624,7 @@ spa_io_history_destroy(spa_t *spa) */ typedef struct spa_mmp_history { - uint64_t mmp_kstat_id; /* unique # for updates */ + uint64_t mmp_node_id; /* unique # for updates */ uint64_t txg; /* txg of last sync */ uint64_t timestamp; /* UTC time MMP write issued */ uint64_t mmp_delay; /* mmp_thread.mmp_delay at timestamp */ @@ -743,20 +634,20 @@ typedef struct spa_mmp_history { int io_error; /* error status of MMP write */ hrtime_t error_start; /* hrtime of start of error period */ hrtime_t duration; /* time from submission to completion */ - list_node_t smh_link; + procfs_list_node_t smh_node; } spa_mmp_history_t; static int -spa_mmp_history_headers(char *buf, size_t size) +spa_mmp_history_show_header(struct seq_file *f) { - (void) snprintf(buf, size, "%-10s %-10s %-10s %-6s %-10s %-12s %-24s " + seq_printf(f, "%-10s %-10s %-10s %-6s %-10s %-12s %-24s " "%-10s %s\n", "id", "txg", "timestamp", "error", "duration", "mmp_delay", "vdev_guid", "vdev_label", "vdev_path"); return (0); } static int -spa_mmp_history_data(char *buf, size_t size, void *data) +spa_mmp_history_show(struct seq_file *f, void *data) { spa_mmp_history_t *smh = (spa_mmp_history_t *)data; char skip_fmt[] = "%-10llu %-10llu %10llu %#6llx %10lld %12llu %-24llu " @@ -764,8 +655,8 @@ spa_mmp_history_data(char *buf, size_t size, void *data) char write_fmt[] = "%-10llu %-10llu %10llu %6lld %10lld %12llu %-24llu " "%-10lld %s\n"; - (void) snprintf(buf, size, (smh->error_start ? 
skip_fmt : write_fmt), - (u_longlong_t)smh->mmp_kstat_id, (u_longlong_t)smh->txg, + seq_printf(f, (smh->error_start ? skip_fmt : write_fmt), + (u_longlong_t)smh->mmp_node_id, (u_longlong_t)smh->txg, (u_longlong_t)smh->timestamp, (longlong_t)smh->io_error, (longlong_t)smh->duration, (u_longlong_t)smh->mmp_delay, (u_longlong_t)smh->vdev_guid, (u_longlong_t)smh->vdev_label, @@ -774,137 +665,86 @@ spa_mmp_history_data(char *buf, size_t size, void *data) return (0); } -/* - * Calculate the address for the next spa_stats_history_t entry. The - * ssh->lock will be held until ksp->ks_ndata entries are processed. - */ -static void * -spa_mmp_history_addr(kstat_t *ksp, loff_t n) +/* Remove oldest elements from list until there are no more than 'size' left */ +static void +spa_mmp_history_truncate(spa_history_list_t *shl, unsigned int size) { - spa_t *spa = ksp->ks_private; - spa_stats_history_t *ssh = &spa->spa_stats.mmp_history; - - ASSERT(MUTEX_HELD(&ssh->lock)); + spa_mmp_history_t *smh; + while (shl->size > size) { + smh = list_remove_head(&shl->procfs_list.pl_list); + if (smh->vdev_path) + strfree(smh->vdev_path); + kmem_free(smh, sizeof (spa_mmp_history_t)); + shl->size--; + } - if (n == 0) - ssh->private = list_tail(&ssh->list); - else if (ssh->private) - ssh->private = list_prev(&ssh->list, ssh->private); + if (size == 0) + ASSERT(list_is_empty(&shl->procfs_list.pl_list)); - return (ssh->private); } -/* - * When the kstat is written discard all spa_mmp_history_t entries. The - * ssh->lock will be held until ksp->ks_ndata entries are processed. 
- */ static int -spa_mmp_history_update(kstat_t *ksp, int rw) +spa_mmp_history_clear(procfs_list_t *procfs_list) { - spa_t *spa = ksp->ks_private; - spa_stats_history_t *ssh = &spa->spa_stats.mmp_history; - - ASSERT(MUTEX_HELD(&ssh->lock)); - - if (rw == KSTAT_WRITE) { - spa_mmp_history_t *smh; - - while ((smh = list_remove_head(&ssh->list))) { - ssh->size--; - if (smh->vdev_path) - strfree(smh->vdev_path); - kmem_free(smh, sizeof (spa_mmp_history_t)); - } - - ASSERT3U(ssh->size, ==, 0); - } - - ksp->ks_ndata = ssh->size; - ksp->ks_data_size = ssh->size * sizeof (spa_mmp_history_t); - + spa_history_list_t *shl = procfs_list->pl_private; + mutex_enter(&procfs_list->pl_lock); + spa_mmp_history_truncate(shl, 0); + mutex_exit(&procfs_list->pl_lock); return (0); } static void spa_mmp_history_init(spa_t *spa) { - spa_stats_history_t *ssh = &spa->spa_stats.mmp_history; - char *name; - kstat_t *ksp; + spa_history_list_t *shl = &spa->spa_stats.mmp_history; + char *module; - mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&ssh->list, sizeof (spa_mmp_history_t), - offsetof(spa_mmp_history_t, smh_link)); + shl->size = 0; - ssh->count = 0; - ssh->size = 0; - ssh->private = NULL; - - name = kmem_asprintf("zfs/%s", spa_name(spa)); + module = kmem_asprintf("zfs/%s", spa_name(spa)); - ksp = kstat_create(name, 0, "multihost", "misc", - KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); - ssh->kstat = ksp; + shl->procfs_list.pl_private = shl; + procfs_list_install(module, + "multihost", + &shl->procfs_list, + spa_mmp_history_show, + spa_mmp_history_show_header, + spa_mmp_history_clear, + offsetof(spa_mmp_history_t, smh_node)); - if (ksp) { - ksp->ks_lock = &ssh->lock; - ksp->ks_data = NULL; - ksp->ks_private = spa; - ksp->ks_update = spa_mmp_history_update; - kstat_set_raw_ops(ksp, spa_mmp_history_headers, - spa_mmp_history_data, spa_mmp_history_addr); - kstat_install(ksp); - } - strfree(name); + strfree(module); } static void spa_mmp_history_destroy(spa_t *spa) { - 
spa_stats_history_t *ssh = &spa->spa_stats.mmp_history; - spa_mmp_history_t *smh; - kstat_t *ksp; - - ksp = ssh->kstat; - if (ksp) - kstat_delete(ksp); - - mutex_enter(&ssh->lock); - while ((smh = list_remove_head(&ssh->list))) { - ssh->size--; - if (smh->vdev_path) - strfree(smh->vdev_path); - kmem_free(smh, sizeof (spa_mmp_history_t)); - } - - ASSERT3U(ssh->size, ==, 0); - list_destroy(&ssh->list); - mutex_exit(&ssh->lock); - - mutex_destroy(&ssh->lock); + spa_history_list_t *shl = &spa->spa_stats.mmp_history; + procfs_list_uninstall(&shl->procfs_list); + spa_mmp_history_truncate(shl, 0); + procfs_list_destroy(&shl->procfs_list); } /* * Set duration in existing "skip" record to how long we have waited for a leaf * vdev to become available. * - * Important that we start search at the head of the list where new + * Important that we start search at the tail of the list where new * records are inserted, so this is normally an O(1) operation. */ int -spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_kstat_id) +spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_node_id) { - spa_stats_history_t *ssh = &spa->spa_stats.mmp_history; + spa_history_list_t *shl = &spa->spa_stats.mmp_history; spa_mmp_history_t *smh; int error = ENOENT; - if (zfs_multihost_history == 0 && ssh->size == 0) + if (zfs_multihost_history == 0 && shl->size == 0) return (0); - mutex_enter(&ssh->lock); - for (smh = list_head(&ssh->list); smh != NULL; - smh = list_next(&ssh->list, smh)) { - if (smh->mmp_kstat_id == mmp_kstat_id) { + mutex_enter(&shl->procfs_list.pl_lock); + for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL; + smh = list_prev(&shl->procfs_list.pl_list, smh)) { + if (smh->mmp_node_id == mmp_node_id) { ASSERT3U(smh->io_error, !=, 0); smh->duration = gethrtime() - smh->error_start; smh->vdev_guid++; @@ -912,7 +752,7 @@ spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_kstat_id) break; } } - mutex_exit(&ssh->lock); + mutex_exit(&shl->procfs_list.pl_lock); return (error); } @@ 
-922,20 +762,20 @@ spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_kstat_id) * See comment re: search order above spa_mmp_history_set_skip(). */ int -spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error, +spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error, hrtime_t duration) { - spa_stats_history_t *ssh = &spa->spa_stats.mmp_history; + spa_history_list_t *shl = &spa->spa_stats.mmp_history; spa_mmp_history_t *smh; int error = ENOENT; - if (zfs_multihost_history == 0 && ssh->size == 0) + if (zfs_multihost_history == 0 && shl->size == 0) return (0); - mutex_enter(&ssh->lock); - for (smh = list_head(&ssh->list); smh != NULL; - smh = list_next(&ssh->list, smh)) { - if (smh->mmp_kstat_id == mmp_kstat_id) { + mutex_enter(&shl->procfs_list.pl_lock); + for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL; + smh = list_prev(&shl->procfs_list.pl_list, smh)) { + if (smh->mmp_node_id == mmp_node_id) { ASSERT(smh->io_error == 0); smh->io_error = io_error; smh->duration = duration; @@ -943,7 +783,7 @@ spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error, break; } } - mutex_exit(&ssh->lock); + mutex_exit(&shl->procfs_list.pl_lock); return (error); } @@ -953,16 +793,16 @@ spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error, * error == 0 : a write was issued. * error != 0 : a write was not issued because no leaves were found. 
*/ -void * +void spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, - uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_kstat_id, + uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_node_id, int error) { - spa_stats_history_t *ssh = &spa->spa_stats.mmp_history; - spa_mmp_history_t *smh, *rm; + spa_history_list_t *shl = &spa->spa_stats.mmp_history; + spa_mmp_history_t *smh; - if (zfs_multihost_history == 0 && ssh->size == 0) - return (NULL); + if (zfs_multihost_history == 0 && shl->size == 0) + return; smh = kmem_zalloc(sizeof (spa_mmp_history_t), KM_SLEEP); smh->txg = txg; @@ -974,7 +814,7 @@ spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, smh->vdev_path = strdup(vd->vdev_path); } smh->vdev_label = label; - smh->mmp_kstat_id = mmp_kstat_id; + smh->mmp_node_id = mmp_node_id; if (error) { smh->io_error = error; @@ -982,21 +822,11 @@ spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, smh->vdev_guid = 1; } - mutex_enter(&ssh->lock); - - list_insert_head(&ssh->list, smh); - ssh->size++; - - while (ssh->size > zfs_multihost_history) { - ssh->size--; - rm = list_remove_tail(&ssh->list); - if (rm->vdev_path) - strfree(rm->vdev_path); - kmem_free(rm, sizeof (spa_mmp_history_t)); - } - - mutex_exit(&ssh->lock); - return ((void *)smh); + mutex_enter(&shl->procfs_list.pl_lock); + procfs_list_add(&shl->procfs_list, smh); + shl->size++; + spa_mmp_history_truncate(shl, zfs_multihost_history); + mutex_exit(&shl->procfs_list.pl_lock); } static void * @@ -1023,19 +853,19 @@ spa_state_data(char *buf, size_t size, void *data) static void spa_state_init(spa_t *spa) { - spa_stats_history_t *ssh = &spa->spa_stats.state; + spa_history_kstat_t *shk = &spa->spa_stats.state; char *name; kstat_t *ksp; - mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); name = kmem_asprintf("zfs/%s", spa_name(spa)); ksp = kstat_create(name, 0, "state", "misc", KSTAT_TYPE_RAW, 0, 
KSTAT_FLAG_VIRTUAL); - ssh->kstat = ksp; + shk->kstat = ksp; if (ksp) { - ksp->ks_lock = &ssh->lock; + ksp->ks_lock = &shk->lock; ksp->ks_data = NULL; ksp->ks_private = spa; ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS; @@ -1049,12 +879,12 @@ spa_state_init(spa_t *spa) static void spa_health_destroy(spa_t *spa) { - spa_stats_history_t *ssh = &spa->spa_stats.state; - kstat_t *ksp = ssh->kstat; + spa_history_kstat_t *shk = &spa->spa_stats.state; + kstat_t *ksp = shk->kstat; if (ksp) kstat_delete(ksp); - mutex_destroy(&ssh->lock); + mutex_destroy(&shk->lock); } void diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c index 30a883f85..89cdf7d81 100644 --- a/module/zfs/vdev_queue.c +++ b/module/zfs/vdev_queue.c @@ -429,16 +429,16 @@ static void vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; - spa_stats_history_t *ssh = &spa->spa_stats.io_history; + spa_history_kstat_t *shk = &spa->spa_stats.io_history; ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); avl_add(vdev_queue_type_tree(vq, zio->io_type), zio); - if (ssh->kstat != NULL) { - mutex_enter(&ssh->lock); - kstat_waitq_enter(ssh->kstat->ks_data); - mutex_exit(&ssh->lock); + if (shk->kstat != NULL) { + mutex_enter(&shk->lock); + kstat_waitq_enter(shk->kstat->ks_data); + mutex_exit(&shk->lock); } } @@ -446,16 +446,16 @@ static void vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; - spa_stats_history_t *ssh = &spa->spa_stats.io_history; + spa_history_kstat_t *shk = &spa->spa_stats.io_history; ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio); - if (ssh->kstat != NULL) { - mutex_enter(&ssh->lock); - kstat_waitq_exit(ssh->kstat->ks_data); - mutex_exit(&ssh->lock); + if (shk->kstat != NULL) { + mutex_enter(&shk->lock); + kstat_waitq_exit(shk->kstat->ks_data); + 
mutex_exit(&shk->lock); } } @@ -463,17 +463,17 @@ static void vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; - spa_stats_history_t *ssh = &spa->spa_stats.io_history; + spa_history_kstat_t *shk = &spa->spa_stats.io_history; ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); vq->vq_class[zio->io_priority].vqc_active++; avl_add(&vq->vq_active_tree, zio); - if (ssh->kstat != NULL) { - mutex_enter(&ssh->lock); - kstat_runq_enter(ssh->kstat->ks_data); - mutex_exit(&ssh->lock); + if (shk->kstat != NULL) { + mutex_enter(&shk->lock); + kstat_runq_enter(shk->kstat->ks_data); + mutex_exit(&shk->lock); } } @@ -481,17 +481,17 @@ static void vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) { spa_t *spa = zio->io_spa; - spa_stats_history_t *ssh = &spa->spa_stats.io_history; + spa_history_kstat_t *shk = &spa->spa_stats.io_history; ASSERT(MUTEX_HELD(&vq->vq_lock)); ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); vq->vq_class[zio->io_priority].vqc_active--; avl_remove(&vq->vq_active_tree, zio); - if (ssh->kstat != NULL) { - kstat_io_t *ksio = ssh->kstat->ks_data; + if (shk->kstat != NULL) { + kstat_io_t *ksio = shk->kstat->ks_data; - mutex_enter(&ssh->lock); + mutex_enter(&shk->lock); kstat_runq_exit(ksio); if (zio->io_type == ZIO_TYPE_READ) { ksio->reads++; @@ -500,7 +500,7 @@ vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) ksio->writes++; ksio->nwritten += zio->io_size; } - mutex_exit(&ssh->lock); + mutex_exit(&shk->lock); } } diff --git a/module/zfs/zfs_debug.c b/module/zfs/zfs_debug.c index ca79893c9..b5f93fd9b 100644 --- a/module/zfs/zfs_debug.c +++ b/module/zfs/zfs_debug.c @@ -24,13 +24,17 @@ */ #include -#include -list_t zfs_dbgmsgs; +typedef struct zfs_dbgmsg { + procfs_list_node_t zdm_node; + time_t zdm_timestamp; + int zdm_size; + char zdm_msg[1]; /* variable length allocation */ +} zfs_dbgmsg_t; + +procfs_list_t zfs_dbgmsgs; int zfs_dbgmsg_size = 0; -kmutex_t 
zfs_dbgmsgs_lock; int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */ -kstat_t *zfs_dbgmsg_kstat; /* * Internal ZFS debug messages are enabled by default. @@ -47,122 +51,70 @@ kstat_t *zfs_dbgmsg_kstat; int zfs_dbgmsg_enable = 1; static int -zfs_dbgmsg_headers(char *buf, size_t size) +zfs_dbgmsg_show_header(struct seq_file *f) { - (void) snprintf(buf, size, "%-12s %-8s\n", "timestamp", "message"); - + seq_printf(f, "%-12s %-8s\n", "timestamp", "message"); return (0); } static int -zfs_dbgmsg_data(char *buf, size_t size, void *data) +zfs_dbgmsg_show(struct seq_file *f, void *p) { - zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)data; - - (void) snprintf(buf, size, "%-12llu %-s\n", + zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)p; + seq_printf(f, "%-12llu %-s\n", (u_longlong_t)zdm->zdm_timestamp, zdm->zdm_msg); - return (0); } -static void * -zfs_dbgmsg_addr(kstat_t *ksp, loff_t n) -{ - zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)ksp->ks_private; - - ASSERT(MUTEX_HELD(&zfs_dbgmsgs_lock)); - - if (n == 0) - ksp->ks_private = list_head(&zfs_dbgmsgs); - else if (zdm) - ksp->ks_private = list_next(&zfs_dbgmsgs, zdm); - - return (ksp->ks_private); -} - static void zfs_dbgmsg_purge(int max_size) { - zfs_dbgmsg_t *zdm; - int size; - - ASSERT(MUTEX_HELD(&zfs_dbgmsgs_lock)); - while (zfs_dbgmsg_size > max_size) { - zdm = list_remove_head(&zfs_dbgmsgs); + zfs_dbgmsg_t *zdm = list_remove_head(&zfs_dbgmsgs.pl_list); if (zdm == NULL) return; - size = zdm->zdm_size; + int size = zdm->zdm_size; kmem_free(zdm, size); zfs_dbgmsg_size -= size; } } static int -zfs_dbgmsg_update(kstat_t *ksp, int rw) +zfs_dbgmsg_clear(procfs_list_t *procfs_list) { - if (rw == KSTAT_WRITE) - zfs_dbgmsg_purge(0); - + mutex_enter(&zfs_dbgmsgs.pl_lock); + zfs_dbgmsg_purge(0); + mutex_exit(&zfs_dbgmsgs.pl_lock); return (0); } void zfs_dbgmsg_init(void) { - list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t), + procfs_list_install("zfs", + "dbgmsg", + &zfs_dbgmsgs, + zfs_dbgmsg_show, + zfs_dbgmsg_show_header, + zfs_dbgmsg_clear, 
offsetof(zfs_dbgmsg_t, zdm_node)); - mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL); - - zfs_dbgmsg_kstat = kstat_create("zfs", 0, "dbgmsg", "misc", - KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); - if (zfs_dbgmsg_kstat) { - zfs_dbgmsg_kstat->ks_lock = &zfs_dbgmsgs_lock; - zfs_dbgmsg_kstat->ks_ndata = UINT32_MAX; - zfs_dbgmsg_kstat->ks_private = NULL; - zfs_dbgmsg_kstat->ks_update = zfs_dbgmsg_update; - kstat_set_raw_ops(zfs_dbgmsg_kstat, zfs_dbgmsg_headers, - zfs_dbgmsg_data, zfs_dbgmsg_addr); - kstat_install(zfs_dbgmsg_kstat); - } } void zfs_dbgmsg_fini(void) { - if (zfs_dbgmsg_kstat) - kstat_delete(zfs_dbgmsg_kstat); + procfs_list_uninstall(&zfs_dbgmsgs); + zfs_dbgmsg_purge(0); + /* * TODO - decide how to make this permanent */ #ifdef _KERNEL - mutex_enter(&zfs_dbgmsgs_lock); - zfs_dbgmsg_purge(0); - mutex_exit(&zfs_dbgmsgs_lock); - mutex_destroy(&zfs_dbgmsgs_lock); + procfs_list_destroy(&zfs_dbgmsgs); #endif } -void -__zfs_dbgmsg(char *buf) -{ - zfs_dbgmsg_t *zdm; - int size; - - size = sizeof (zfs_dbgmsg_t) + strlen(buf); - zdm = kmem_zalloc(size, KM_SLEEP); - zdm->zdm_size = size; - zdm->zdm_timestamp = gethrestime_sec(); - strcpy(zdm->zdm_msg, buf); - - mutex_enter(&zfs_dbgmsgs_lock); - list_insert_tail(&zfs_dbgmsgs, zdm); - zfs_dbgmsg_size += size; - zfs_dbgmsg_purge(MAX(zfs_dbgmsg_maxsize, 0)); - mutex_exit(&zfs_dbgmsgs_lock); -} - void __set_error(const char *file, const char *func, int line, int err) { @@ -176,6 +128,22 @@ __set_error(const char *file, const char *func, int line, int err) } #ifdef _KERNEL +static void +__zfs_dbgmsg(char *buf) +{ + int size = sizeof (zfs_dbgmsg_t) + strlen(buf); + zfs_dbgmsg_t *zdm = kmem_zalloc(size, KM_SLEEP); + zdm->zdm_size = size; + zdm->zdm_timestamp = gethrestime_sec(); + strcpy(zdm->zdm_msg, buf); + + mutex_enter(&zfs_dbgmsgs.pl_lock); + procfs_list_add(&zfs_dbgmsgs, zdm); + zfs_dbgmsg_size += size; + zfs_dbgmsg_purge(MAX(zfs_dbgmsg_maxsize, 0)); + mutex_exit(&zfs_dbgmsgs.pl_lock); +} + void __dprintf(const 
char *file, const char *func, int line, const char *fmt, ...) { @@ -244,14 +212,12 @@ __dprintf(const char *file, const char *func, int line, const char *fmt, ...) void zfs_dbgmsg_print(const char *tag) { - zfs_dbgmsg_t *zdm; - (void) printf("ZFS_DBGMSG(%s):\n", tag); - mutex_enter(&zfs_dbgmsgs_lock); - for (zdm = list_head(&zfs_dbgmsgs); zdm; - zdm = list_next(&zfs_dbgmsgs, zdm)) + mutex_enter(&zfs_dbgmsgs.pl_lock); + for (zfs_dbgmsg_t *zdm = list_head(&zfs_dbgmsgs.pl_list); zdm != NULL; + zdm = list_next(&zfs_dbgmsgs.pl_list, zdm)) (void) printf("%s\n", zdm->zdm_msg); - mutex_exit(&zfs_dbgmsgs_lock); + mutex_exit(&zfs_dbgmsgs.pl_lock); } #endif /* _KERNEL */ diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 4b41c3f74..95e70f043 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -584,10 +584,6 @@ tests = ['inuse_001_pos', 'inuse_003_pos', 'inuse_004_pos', post = tags = ['functional', 'inuse'] -[tests/functional/kstat] -tests = ['state'] -tags = ['functional', 'kstat'] - [tests/functional/large_files] tests = ['large_files_001_pos', 'large_files_002_pos'] tags = ['functional', 'large_files'] @@ -672,6 +668,11 @@ tags = ['functional', 'poolversion'] tests = ['privilege_001_pos', 'privilege_002_pos'] tags = ['functional', 'privilege'] +[tests/functional/procfs] +tests = ['procfs_list_basic', 'procfs_list_concurrent_readers', + 'procfs_list_stale_read', 'pool_state'] +tags = ['functional', 'procfs'] + [tests/functional/projectquota] tests = ['projectid_001_pos', 'projectid_002_pos', 'projectid_003_pos', 'projectquota_001_pos', 'projectquota_002_pos', 'projectquota_003_pos', diff --git a/tests/zfs-tests/tests/functional/Makefile.am b/tests/zfs-tests/tests/functional/Makefile.am index e0a4aca99..961a34027 100644 --- a/tests/zfs-tests/tests/functional/Makefile.am +++ b/tests/zfs-tests/tests/functional/Makefile.am @@ -29,7 +29,6 @@ SUBDIRS = \ inheritance \ inuse \ io \ - kstat \ large_files \ largest_pool \ libzfs \ @@ -48,6 
+47,7 @@ SUBDIRS = \ pool_names \ poolversion \ privilege \ + procfs \ projectquota \ quota \ raidz \ diff --git a/tests/zfs-tests/tests/functional/kstat/Makefile.am b/tests/zfs-tests/tests/functional/kstat/Makefile.am deleted file mode 100644 index 8ad83ec3e..000000000 --- a/tests/zfs-tests/tests/functional/kstat/Makefile.am +++ /dev/null @@ -1,5 +0,0 @@ -pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/kstat -dist_pkgdata_SCRIPTS = \ - setup.ksh \ - cleanup.ksh \ - state.ksh diff --git a/tests/zfs-tests/tests/functional/procfs/Makefile.am b/tests/zfs-tests/tests/functional/procfs/Makefile.am new file mode 100644 index 000000000..a7f022d9f --- /dev/null +++ b/tests/zfs-tests/tests/functional/procfs/Makefile.am @@ -0,0 +1,8 @@ +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/tests/functional/procfs +dist_pkgdata_SCRIPTS = \ + setup.ksh \ + cleanup.ksh \ + procfs_list_basic.ksh \ + procfs_list_concurrent_readers.ksh \ + procfs_list_stale_read.ksh \ + pool_state.ksh diff --git a/tests/zfs-tests/tests/functional/kstat/cleanup.ksh b/tests/zfs-tests/tests/functional/procfs/cleanup.ksh similarity index 92% rename from tests/zfs-tests/tests/functional/kstat/cleanup.ksh rename to tests/zfs-tests/tests/functional/procfs/cleanup.ksh index 8a212ce37..8fe46577e 100755 --- a/tests/zfs-tests/tests/functional/kstat/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/procfs/cleanup.ksh @@ -19,8 +19,9 @@ # # CDDL HEADER END # + # -# Copyright (c) 2018 by Lawrence Livermore National Security, LLC. +# Copyright (c) 2018 by Delphix. All rights reserved. # . 
$STF_SUITE/include/libtest.shlib diff --git a/tests/zfs-tests/tests/functional/kstat/state.ksh b/tests/zfs-tests/tests/functional/procfs/pool_state.ksh similarity index 100% rename from tests/zfs-tests/tests/functional/kstat/state.ksh rename to tests/zfs-tests/tests/functional/procfs/pool_state.ksh diff --git a/tests/zfs-tests/tests/functional/procfs/procfs_list_basic.ksh b/tests/zfs-tests/tests/functional/procfs/procfs_list_basic.ksh new file mode 100755 index 000000000..c9eff3649 --- /dev/null +++ b/tests/zfs-tests/tests/functional/procfs/procfs_list_basic.ksh @@ -0,0 +1,95 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Test that we can read from and write to a file in procfs whose contents are +# backed by a linked list. +# +# STRATEGY: +# 1. Take some snapshots of a filesystem, which will cause some messages to be +# written to the zfs dbgmsgs. +# 2. Read the dbgmsgs via procfs and verify that the expected messages are +# present. +# 3. Write to the dbgmsgs file to clear the messages. +# 4. Read the dbgmsgs again, and make sure the messages are no longer present. 
+# + +function cleanup +{ + datasetexists $FS && log_must zfs destroy -r $FS +} + +function count_snap_cmds +{ + typeset expected_count=$1 + count=$(grep "command: zfs snapshot $FS@testsnapshot" | wc -l) + log_must eval "[[ $count -eq $expected_count ]]" +} + +typeset -r ZFS_DBGMSG=/proc/spl/kstat/zfs/dbgmsg +typeset -r FS=$TESTPOOL/fs +typeset snap_msgs + +log_onexit cleanup + +# Clear out old messages +echo 0 >$ZFS_DBGMSG || log_fail "failed to write to $ZFS_DBGMSG" + +log_must zfs create $FS +for i in {1..20}; do + log_must zfs snapshot "$FS@testsnapshot$i" +done +log_must zpool sync $TESTPOOL + +# +# Read the debug message file in small chunks to make sure that the read is +# split up into multiple syscalls. This tests that when a syscall begins we +# correctly pick up in the list of messages where the previous syscall left +# off. The size of the read can affect how many bytes the seq_file code has +# left in its internal buffer, which in turn can affect the relative pos that +# the seq_file code picks up at when the next read starts. Try a few +# different size reads to make sure we can handle each case. +# +# Check that the file has the right contents by grepping for some of the +# messages that we expect to be present. +# +for chunk_sz in {1,64,256,1024,4096}; do + dd if=$ZFS_DBGMSG bs=$chunk_sz | count_snap_cmds 20 +done + +# Clear out old messages and check that they really are gone +echo 0 >$ZFS_DBGMSG || log_fail "failed to write to $ZFS_DBGMSG" +cat $ZFS_DBGMSG | count_snap_cmds 0 +# +# Even though we don't expect any messages in the file, reading should still +# succeed. 
+# +log_must cat $ZFS_DBGMSG + +log_pass "Basic reading/writing of procfs file backed by linked list successful" diff --git a/tests/zfs-tests/tests/functional/procfs/procfs_list_concurrent_readers.ksh b/tests/zfs-tests/tests/functional/procfs/procfs_list_concurrent_readers.ksh new file mode 100755 index 000000000..473de5c84 --- /dev/null +++ b/tests/zfs-tests/tests/functional/procfs/procfs_list_concurrent_readers.ksh @@ -0,0 +1,82 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Make sure that interleaving reads from different readers does not affect the +# results that are returned. +# +# STRATEGY: +# 1. Make sure a few debug messages have been logged. +# 2. Open the procfs file and start reading from it. +# 3. Open the file again, and read its entire contents. +# 4. Resume reading from the first instance. +# 5. Check that the contents read by the two instances are identical. 
+# + +function cleanup +{ + [[ -z $msgs1 ]] || log_must rm $msgs1 + [[ -z $msgs2 ]] || log_must rm $msgs2 + datasetexists $FS && log_must zfs destroy -r $FS +} + +typeset -r ZFS_DBGMSG=/proc/spl/kstat/zfs/dbgmsg +typeset -r FS=$TESTPOOL/fs +typeset msgs1 msgs2 + +log_onexit cleanup + +# Clear out old messages +echo 0 >$ZFS_DBGMSG || log_fail "failed to write to $ZFS_DBGMSG" + +# Add some new messages +log_must zfs create $FS +for i in {1..20}; do + log_must zfs snapshot "$FS@testsnapshot$i" +done +log_must zpool sync $TESTPOOL + +msgs1=$(mktemp) || log_fail +msgs2=$(mktemp) || log_fail + +# +# Start reading file, pause and read it from another process, and then finish +# reading. +# +{ dd bs=512 count=4; cat $ZFS_DBGMSG >$msgs1; cat; } <$ZFS_DBGMSG >$msgs2 + +# +# Truncate the result of the read that completed second in case it picked up an +# extra message that was logged after the first read completed. +# +log_must truncate -s $(stat -c "%s" $msgs1) $msgs2 + +log_must diff $msgs1 $msgs2 + +log_pass "Concurrent readers receive identical results" diff --git a/tests/zfs-tests/tests/functional/procfs/procfs_list_stale_read.ksh b/tests/zfs-tests/tests/functional/procfs/procfs_list_stale_read.ksh new file mode 100755 index 000000000..c363e7f8b --- /dev/null +++ b/tests/zfs-tests/tests/functional/procfs/procfs_list_stale_read.ksh @@ -0,0 +1,98 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2018 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Make sure errors caused by messages being dropped from the list backing the +# procfs file are handled gracefully. +# +# STRATEGY: +# 1. Make sure a few entries have been logged. +# 2. Open the procfs file and start reading from it. +# 3. Write to the file to cause its contents to be dropped. +# 4. Resume reading from the first instance, and check that the expected +# error is received. +# 5. Repeat steps 1-4, except instead of dropping all the messages by writing +# to the file, cause enough new messages to be written that the old messages +# are dropped. +# + +function cleanup +{ + echo $default_max_entries >$MAX_ENTRIES_PARAM || log_fail +} + +function sync_n +{ + for i in {1..$1}; do + log_must zpool sync $TESTPOOL + done + return 0 +} + +function do_test +{ + typeset cmd=$1 + + # Clear out old entries + echo 0 >$TXG_HIST || log_fail + + # Add some new entries + sync_n 20 + + # Confirm that there actually is something in the file. + [[ $(wc -l <$TXG_HIST) -ge 20 ]] || log_fail "expected more entries" + + # + # Start reading file, pause and run a command that will cause the + # current offset into the file to become invalid, and then try to + # finish reading. 
+ # + { + log_must dd bs=512 count=4 >/dev/null + log_must eval "$cmd" + cat 2>&1 >/dev/null | log_must grep "Input/output error" + } <$TXG_HIST +} + +typeset -r TXG_HIST=/proc/spl/kstat/zfs/$TESTPOOL/txgs +typeset MAX_ENTRIES_PARAM=/sys/module/zfs/parameters/zfs_txg_history +typeset default_max_entries + +log_onexit cleanup + +default_max_entries=$(cat $MAX_ENTRIES_PARAM) || log_fail +echo 50 >$MAX_ENTRIES_PARAM || log_fail + +# Clear all of the existing entries. +do_test "echo 0 >$TXG_HIST" + +# Add enough new entries to the list that all of the old ones are dropped. +do_test "sync_n 60" + +log_pass "Attempting to read dropped message returns expected error" diff --git a/tests/zfs-tests/tests/functional/kstat/setup.ksh b/tests/zfs-tests/tests/functional/procfs/setup.ksh similarity index 86% rename from tests/zfs-tests/tests/functional/kstat/setup.ksh rename to tests/zfs-tests/tests/functional/procfs/setup.ksh index 57717a096..b3812dbdc 100755 --- a/tests/zfs-tests/tests/functional/kstat/setup.ksh +++ b/tests/zfs-tests/tests/functional/procfs/setup.ksh @@ -19,16 +19,16 @@ # # CDDL HEADER END # + # -# Copyright (c) 2018 by Lawrence Livermore National Security, LLC. +# Copyright (c) 2018 by Delphix. All rights reserved. # . $STF_SUITE/include/libtest.shlib if ! is_linux ; then - log_unsupported "/proc/spl/kstat//health only supported on Linux" + log_unsupported "procfs is only used on Linux" fi default_mirror_setup $DISKS - log_pass -- 2.39.2