X-Git-Url: https://git.proxmox.com/?a=blobdiff_plain;f=module%2Fzfs%2Fspa_stats.c;h=c02ef86b51c66062edfc84e8d9d75fdc30913d51;hb=d12614521a;hp=dbc761e115d5a7fea577011600ad6a119244a790;hpb=3ccab25205cc8836ceb79bbd164208021468233a;p=mirror_zfs.git diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c index dbc761e11..c02ef86b5 100644 --- a/module/zfs/spa_stats.c +++ b/module/zfs/spa_stats.c @@ -21,6 +21,9 @@ #include #include +#include +#include +#include /* * Keeps stats on last N reads per spa_t, disabled by default. @@ -33,9 +36,14 @@ int zfs_read_history = 0; int zfs_read_history_hits = 0; /* - * Keeps stats on the last N txgs, disabled by default. + * Keeps stats on the last 100 txgs by default. */ -int zfs_txg_history = 0; +int zfs_txg_history = 100; + +/* + * Keeps stats on the last N MMP updates, disabled by default. + */ +int zfs_multihost_history = 0; /* * ========================================================================== @@ -47,7 +55,6 @@ int zfs_txg_history = 0; * Read statistics - Information exported regarding each arc_read call */ typedef struct spa_read_history { - uint64_t uid; /* unique identifier */ hrtime_t start; /* time read completed */ uint64_t objset; /* read from this objset */ uint64_t object; /* read of this object number */ @@ -57,156 +64,107 @@ typedef struct spa_read_history { uint32_t aflags; /* ARC flags (cached, prefetch, etc.) 
*/ pid_t pid; /* PID of task doing read */ char comm[16]; /* process name of task doing read */ - list_node_t srh_link; + procfs_list_node_t srh_node; } spa_read_history_t; static int -spa_read_history_headers(char *buf, size_t size) +spa_read_history_show_header(struct seq_file *f) { - size = snprintf(buf, size - 1, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s " + seq_printf(f, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s " "%-24s %-8s %-16s\n", "UID", "start", "objset", "object", "level", "blkid", "aflags", "origin", "pid", "process"); - buf[size] = '\0'; return (0); } static int -spa_read_history_data(char *buf, size_t size, void *data) +spa_read_history_show(struct seq_file *f, void *data) { spa_read_history_t *srh = (spa_read_history_t *)data; - size = snprintf(buf, size - 1, "%-8llu %-16llu 0x%-6llx " + seq_printf(f, "%-8llu %-16llu 0x%-6llx " "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n", - (u_longlong_t)srh->uid, srh->start, + (u_longlong_t)srh->srh_node.pln_id, srh->start, (longlong_t)srh->objset, (longlong_t)srh->object, (longlong_t)srh->level, (longlong_t)srh->blkid, srh->aflags, srh->origin, srh->pid, srh->comm); - buf[size] = '\0'; return (0); } -/* - * Calculate the address for the next spa_stats_history_t entry. The - * ssh->lock will be held until ksp->ks_ndata entries are processed. 
- */ -static void * -spa_read_history_addr(kstat_t *ksp, loff_t n) +/* Remove oldest elements from list until there are no more than 'size' left */ +static void +spa_read_history_truncate(spa_history_list_t *shl, unsigned int size) { - spa_t *spa = ksp->ks_private; - spa_stats_history_t *ssh = &spa->spa_stats.read_history; - - ASSERT(MUTEX_HELD(&ssh->lock)); - - if (n == 0) - ssh->private = list_tail(&ssh->list); - else if (ssh->private) - ssh->private = list_prev(&ssh->list, ssh->private); + spa_read_history_t *srh; + while (shl->size > size) { + srh = list_remove_head(&shl->procfs_list.pl_list); + ASSERT3P(srh, !=, NULL); + kmem_free(srh, sizeof (spa_read_history_t)); + shl->size--; + } - return (ssh->private); + if (size == 0) + ASSERT(list_is_empty(&shl->procfs_list.pl_list)); } -/* - * When the kstat is written discard all spa_read_history_t entires. The - * ssh->lock will be held until ksp->ks_ndata entries are processed. - */ static int -spa_read_history_update(kstat_t *ksp, int rw) +spa_read_history_clear(procfs_list_t *procfs_list) { - spa_t *spa = ksp->ks_private; - spa_stats_history_t *ssh = &spa->spa_stats.read_history; - - if (rw == KSTAT_WRITE) { - spa_read_history_t *srh; - - while ((srh = list_remove_head(&ssh->list))) { - ssh->size--; - kmem_free(srh, sizeof (spa_read_history_t)); - } - - ASSERT3U(ssh->size, ==, 0); - } - - ksp->ks_ndata = ssh->size; - ksp->ks_data_size = ssh->size * sizeof (spa_read_history_t); - + spa_history_list_t *shl = procfs_list->pl_private; + mutex_enter(&procfs_list->pl_lock); + spa_read_history_truncate(shl, 0); + mutex_exit(&procfs_list->pl_lock); return (0); } static void spa_read_history_init(spa_t *spa) { - spa_stats_history_t *ssh = &spa->spa_stats.read_history; - char name[KSTAT_STRLEN]; - kstat_t *ksp; + spa_history_list_t *shl = &spa->spa_stats.read_history; + char *module; - mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&ssh->list, sizeof (spa_read_history_t), - offsetof(spa_read_history_t, 
srh_link)); + shl->size = 0; - ssh->count = 0; - ssh->size = 0; - ssh->private = NULL; + module = kmem_asprintf("zfs/%s", spa_name(spa)); - (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa)); - name[KSTAT_STRLEN-1] = '\0'; - - ksp = kstat_create(name, 0, "reads", "misc", - KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); - ssh->kstat = ksp; + shl->procfs_list.pl_private = shl; + procfs_list_install(module, + "reads", + &shl->procfs_list, + spa_read_history_show, + spa_read_history_show_header, + spa_read_history_clear, + offsetof(spa_read_history_t, srh_node)); - if (ksp) { - ksp->ks_lock = &ssh->lock; - ksp->ks_data = NULL; - ksp->ks_private = spa; - ksp->ks_update = spa_read_history_update; - kstat_set_raw_ops(ksp, spa_read_history_headers, - spa_read_history_data, spa_read_history_addr); - kstat_install(ksp); - } + strfree(module); } static void spa_read_history_destroy(spa_t *spa) { - spa_stats_history_t *ssh = &spa->spa_stats.read_history; - spa_read_history_t *srh; - kstat_t *ksp; - - ksp = ssh->kstat; - if (ksp) - kstat_delete(ksp); - - mutex_enter(&ssh->lock); - while ((srh = list_remove_head(&ssh->list))) { - ssh->size--; - kmem_free(srh, sizeof (spa_read_history_t)); - } - - ASSERT3U(ssh->size, ==, 0); - list_destroy(&ssh->list); - mutex_exit(&ssh->lock); - - mutex_destroy(&ssh->lock); + spa_history_list_t *shl = &spa->spa_stats.read_history; + procfs_list_uninstall(&shl->procfs_list); + spa_read_history_truncate(shl, 0); + procfs_list_destroy(&shl->procfs_list); } void -spa_read_history_add(spa_t *spa, const zbookmark_t *zb, uint32_t aflags) +spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags) { - spa_stats_history_t *ssh = &spa->spa_stats.read_history; - spa_read_history_t *srh, *rm; + spa_history_list_t *shl = &spa->spa_stats.read_history; + spa_read_history_t *srh; ASSERT3P(spa, !=, NULL); ASSERT3P(zb, !=, NULL); - if (zfs_read_history == 0 && ssh->size == 0) + if (zfs_read_history == 0 && shl->size == 0) return; - if 
(zfs_read_history_hits == 0 && (aflags & ARC_CACHED)) + if (zfs_read_history_hits == 0 && (aflags & ARC_FLAG_CACHED)) return; - srh = kmem_zalloc(sizeof (spa_read_history_t), KM_PUSHPAGE); + srh = kmem_zalloc(sizeof (spa_read_history_t), KM_SLEEP); strlcpy(srh->comm, getcomm(), sizeof (srh->comm)); srh->start = gethrtime(); srh->objset = zb->zb_objset; @@ -216,19 +174,14 @@ spa_read_history_add(spa_t *spa, const zbookmark_t *zb, uint32_t aflags) srh->aflags = aflags; srh->pid = getpid(); - mutex_enter(&ssh->lock); + mutex_enter(&shl->procfs_list.pl_lock); - srh->uid = ssh->count++; - list_insert_head(&ssh->list, srh); - ssh->size++; + procfs_list_add(&shl->procfs_list, srh); + shl->size++; - while (ssh->size > zfs_read_history) { - ssh->size--; - rm = list_remove_tail(&ssh->list); - kmem_free(rm, sizeof (spa_read_history_t)); - } + spa_read_history_truncate(shl, zfs_read_history); - mutex_exit(&ssh->lock); + mutex_exit(&shl->procfs_list.pl_lock); } /* @@ -250,23 +203,21 @@ typedef struct spa_txg_history { uint64_t writes; /* number of write operations */ uint64_t ndirty; /* number of dirty bytes */ hrtime_t times[TXG_STATE_COMMITTED]; /* completion times */ - list_node_t sth_link; + procfs_list_node_t sth_node; } spa_txg_history_t; static int -spa_txg_history_headers(char *buf, size_t size) +spa_txg_history_show_header(struct seq_file *f) { - size = snprintf(buf, size - 1, "%-8s %-16s %-5s %-12s %-12s %-12s " + seq_printf(f, "%-8s %-16s %-5s %-12s %-12s %-12s " "%-8s %-8s %-12s %-12s %-12s %-12s\n", "txg", "birth", "state", "ndirty", "nread", "nwritten", "reads", "writes", "otime", "qtime", "wtime", "stime"); - buf[size] = '\0'; - return (0); } static int -spa_txg_history_data(char *buf, size_t size, void *data) +spa_txg_history_show(struct seq_file *f, void *data) { spa_txg_history_t *sth = (spa_txg_history_t *)data; uint64_t open = 0, quiesce = 0, wait = 0, sync = 0; @@ -298,7 +249,7 @@ spa_txg_history_data(char *buf, size_t size, void *data) sync = 
sth->times[TXG_STATE_SYNCED] - sth->times[TXG_STATE_WAIT_FOR_SYNC]; - size = snprintf(buf, size - 1, "%-8llu %-16llu %-5c %-12llu " + seq_printf(f, "%-8llu %-16llu %-5c %-12llu " "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu %-12llu\n", (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state, (u_longlong_t)sth->ndirty, @@ -306,115 +257,66 @@ spa_txg_history_data(char *buf, size_t size, void *data) (u_longlong_t)sth->reads, (u_longlong_t)sth->writes, (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)wait, (u_longlong_t)sync); - buf[size] = '\0'; return (0); } -/* - * Calculate the address for the next spa_stats_history_t entry. The - * ssh->lock will be held until ksp->ks_ndata entries are processed. - */ -static void * -spa_txg_history_addr(kstat_t *ksp, loff_t n) +/* Remove oldest elements from list until there are no more than 'size' left */ +static void +spa_txg_history_truncate(spa_history_list_t *shl, unsigned int size) { - spa_t *spa = ksp->ks_private; - spa_stats_history_t *ssh = &spa->spa_stats.txg_history; - - ASSERT(MUTEX_HELD(&ssh->lock)); + spa_txg_history_t *sth; + while (shl->size > size) { + sth = list_remove_head(&shl->procfs_list.pl_list); + ASSERT3P(sth, !=, NULL); + kmem_free(sth, sizeof (spa_txg_history_t)); + shl->size--; + } - if (n == 0) - ssh->private = list_tail(&ssh->list); - else if (ssh->private) - ssh->private = list_prev(&ssh->list, ssh->private); + if (size == 0) + ASSERT(list_is_empty(&shl->procfs_list.pl_list)); - return (ssh->private); } -/* - * When the kstat is written discard all spa_txg_history_t entires. The - * ssh->lock will be held until ksp->ks_ndata entries are processed. 
- */ static int -spa_txg_history_update(kstat_t *ksp, int rw) +spa_txg_history_clear(procfs_list_t *procfs_list) { - spa_t *spa = ksp->ks_private; - spa_stats_history_t *ssh = &spa->spa_stats.txg_history; - - ASSERT(MUTEX_HELD(&ssh->lock)); - - if (rw == KSTAT_WRITE) { - spa_txg_history_t *sth; - - while ((sth = list_remove_head(&ssh->list))) { - ssh->size--; - kmem_free(sth, sizeof (spa_txg_history_t)); - } - - ASSERT3U(ssh->size, ==, 0); - } - - ksp->ks_ndata = ssh->size; - ksp->ks_data_size = ssh->size * sizeof (spa_txg_history_t); - + spa_history_list_t *shl = procfs_list->pl_private; + mutex_enter(&procfs_list->pl_lock); + spa_txg_history_truncate(shl, 0); + mutex_exit(&procfs_list->pl_lock); return (0); } static void spa_txg_history_init(spa_t *spa) { - spa_stats_history_t *ssh = &spa->spa_stats.txg_history; - char name[KSTAT_STRLEN]; - kstat_t *ksp; + spa_history_list_t *shl = &spa->spa_stats.txg_history; + char *module; - mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&ssh->list, sizeof (spa_txg_history_t), - offsetof(spa_txg_history_t, sth_link)); + shl->size = 0; - ssh->count = 0; - ssh->size = 0; - ssh->private = NULL; + module = kmem_asprintf("zfs/%s", spa_name(spa)); - (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa)); - name[KSTAT_STRLEN-1] = '\0'; + shl->procfs_list.pl_private = shl; + procfs_list_install(module, + "txgs", + &shl->procfs_list, + spa_txg_history_show, + spa_txg_history_show_header, + spa_txg_history_clear, + offsetof(spa_txg_history_t, sth_node)); - ksp = kstat_create(name, 0, "txgs", "misc", - KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); - ssh->kstat = ksp; - - if (ksp) { - ksp->ks_lock = &ssh->lock; - ksp->ks_data = NULL; - ksp->ks_private = spa; - ksp->ks_update = spa_txg_history_update; - kstat_set_raw_ops(ksp, spa_txg_history_headers, - spa_txg_history_data, spa_txg_history_addr); - kstat_install(ksp); - } + strfree(module); } static void spa_txg_history_destroy(spa_t *spa) { - spa_stats_history_t *ssh = 
&spa->spa_stats.txg_history; - spa_txg_history_t *sth; - kstat_t *ksp; - - ksp = ssh->kstat; - if (ksp) - kstat_delete(ksp); - - mutex_enter(&ssh->lock); - while ((sth = list_remove_head(&ssh->list))) { - ssh->size--; - kmem_free(sth, sizeof (spa_txg_history_t)); - } - - ASSERT3U(ssh->size, ==, 0); - list_destroy(&ssh->list); - mutex_exit(&ssh->lock); - - mutex_destroy(&ssh->lock); + spa_history_list_t *shl = &spa->spa_stats.txg_history; + procfs_list_uninstall(&shl->procfs_list); + spa_txg_history_truncate(shl, 0); + procfs_list_destroy(&shl->procfs_list); } /* @@ -423,29 +325,22 @@ spa_txg_history_destroy(spa_t *spa) void spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time) { - spa_stats_history_t *ssh = &spa->spa_stats.txg_history; - spa_txg_history_t *sth, *rm; + spa_history_list_t *shl = &spa->spa_stats.txg_history; + spa_txg_history_t *sth; - if (zfs_txg_history == 0 && ssh->size == 0) + if (zfs_txg_history == 0 && shl->size == 0) return; - sth = kmem_zalloc(sizeof (spa_txg_history_t), KM_PUSHPAGE); + sth = kmem_zalloc(sizeof (spa_txg_history_t), KM_SLEEP); sth->txg = txg; sth->state = TXG_STATE_OPEN; sth->times[TXG_STATE_BIRTH] = birth_time; - mutex_enter(&ssh->lock); - - list_insert_head(&ssh->list, sth); - ssh->size++; - - while (ssh->size > zfs_txg_history) { - ssh->size--; - rm = list_remove_tail(&ssh->list); - kmem_free(rm, sizeof (spa_txg_history_t)); - } - - mutex_exit(&ssh->lock); + mutex_enter(&shl->procfs_list.pl_lock); + procfs_list_add(&shl->procfs_list, sth); + shl->size++; + spa_txg_history_truncate(shl, zfs_txg_history); + mutex_exit(&shl->procfs_list.pl_lock); } /* @@ -455,16 +350,16 @@ int spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, hrtime_t completed_time) { - spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + spa_history_list_t *shl = &spa->spa_stats.txg_history; spa_txg_history_t *sth; int error = ENOENT; if (zfs_txg_history == 0) return (0); - mutex_enter(&ssh->lock); - for (sth = 
list_head(&ssh->list); sth != NULL; - sth = list_next(&ssh->list, sth)) { + mutex_enter(&shl->procfs_list.pl_lock); + for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL; + sth = list_prev(&shl->procfs_list.pl_list, sth)) { if (sth->txg == txg) { sth->times[completed_state] = completed_time; sth->state++; @@ -472,7 +367,7 @@ spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, break; } } - mutex_exit(&ssh->lock); + mutex_exit(&shl->procfs_list.pl_lock); return (error); } @@ -480,20 +375,20 @@ spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, /* * Set txg IO stats. */ -int +static int spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty) { - spa_stats_history_t *ssh = &spa->spa_stats.txg_history; + spa_history_list_t *shl = &spa->spa_stats.txg_history; spa_txg_history_t *sth; int error = ENOENT; if (zfs_txg_history == 0) return (0); - mutex_enter(&ssh->lock); - for (sth = list_head(&ssh->list); sth != NULL; - sth = list_next(&ssh->list, sth)) { + mutex_enter(&shl->procfs_list.pl_lock); + for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL; + sth = list_prev(&shl->procfs_list.pl_list, sth)) { if (sth->txg == txg) { sth->nread = nread; sth->nwritten = nwritten; @@ -504,11 +399,59 @@ spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, break; } } - mutex_exit(&ssh->lock); + mutex_exit(&shl->procfs_list.pl_lock); return (error); } +txg_stat_t * +spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp) +{ + txg_stat_t *ts; + + if (zfs_txg_history == 0) + return (NULL); + + ts = kmem_alloc(sizeof (txg_stat_t), KM_SLEEP); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); + vdev_get_stats(spa->spa_root_vdev, &ts->vs1); + spa_config_exit(spa, SCL_ALL, FTAG); + + ts->txg = txg; + ts->ndirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; + + spa_txg_history_set(spa, txg, TXG_STATE_WAIT_FOR_SYNC, gethrtime()); + + 
return (ts); +} + +void +spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts) +{ + if (ts == NULL) + return; + + if (zfs_txg_history == 0) { + kmem_free(ts, sizeof (txg_stat_t)); + return; + } + + spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); + vdev_get_stats(spa->spa_root_vdev, &ts->vs2); + spa_config_exit(spa, SCL_ALL, FTAG); + + spa_txg_history_set(spa, ts->txg, TXG_STATE_SYNCED, gethrtime()); + spa_txg_history_set_io(spa, ts->txg, + ts->vs2.vs_bytes[ZIO_TYPE_READ] - ts->vs1.vs_bytes[ZIO_TYPE_READ], + ts->vs2.vs_bytes[ZIO_TYPE_WRITE] - ts->vs1.vs_bytes[ZIO_TYPE_WRITE], + ts->vs2.vs_ops[ZIO_TYPE_READ] - ts->vs1.vs_ops[ZIO_TYPE_READ], + ts->vs2.vs_ops[ZIO_TYPE_WRITE] - ts->vs1.vs_ops[ZIO_TYPE_WRITE], + ts->ndirty); + + kmem_free(ts, sizeof (txg_stat_t)); +} + /* * ========================================================================== * SPA TX Assign Histogram Routines @@ -528,16 +471,16 @@ static int spa_tx_assign_update(kstat_t *ksp, int rw) { spa_t *spa = ksp->ks_private; - spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram; + spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; int i; if (rw == KSTAT_WRITE) { - for (i = 0; i < ssh->count; i++) - ((kstat_named_t *)ssh->private)[i].value.ui64 = 0; + for (i = 0; i < shk->count; i++) + ((kstat_named_t *)shk->private)[i].value.ui64 = 0; } - for (i = ssh->count; i > 0; i--) - if (((kstat_named_t *)ssh->private)[i-1].value.ui64 != 0) + for (i = shk->count; i > 0; i--) + if (((kstat_named_t *)shk->private)[i-1].value.ui64 != 0) break; ksp->ks_ndata = i; @@ -549,23 +492,22 @@ spa_tx_assign_update(kstat_t *ksp, int rw) static void spa_tx_assign_init(spa_t *spa) { - spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram; - char name[KSTAT_STRLEN]; + spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; + char *name; kstat_named_t *ks; kstat_t *ksp; int i; - mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); - ssh->count = 
42; /* power of two buckets for 1ns to 2,199s */ - ssh->size = ssh->count * sizeof (kstat_named_t); - ssh->private = kmem_alloc(ssh->size, KM_SLEEP); + shk->count = 42; /* power of two buckets for 1ns to 2,199s */ + shk->size = shk->count * sizeof (kstat_named_t); + shk->private = kmem_alloc(shk->size, KM_SLEEP); - (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa)); - name[KSTAT_STRLEN-1] = '\0'; + name = kmem_asprintf("zfs/%s", spa_name(spa)); - for (i = 0; i < ssh->count; i++) { - ks = &((kstat_named_t *)ssh->private)[i]; + for (i = 0; i < shk->count; i++) { + ks = &((kstat_named_t *)shk->private)[i]; ks->data_type = KSTAT_DATA_UINT64; ks->value.ui64 = 0; (void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns", @@ -574,43 +516,44 @@ spa_tx_assign_init(spa_t *spa) ksp = kstat_create(name, 0, "dmu_tx_assign", "misc", KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL); - ssh->kstat = ksp; + shk->kstat = ksp; if (ksp) { - ksp->ks_lock = &ssh->lock; - ksp->ks_data = ssh->private; - ksp->ks_ndata = ssh->count; - ksp->ks_data_size = ssh->size; + ksp->ks_lock = &shk->lock; + ksp->ks_data = shk->private; + ksp->ks_ndata = shk->count; + ksp->ks_data_size = shk->size; ksp->ks_private = spa; ksp->ks_update = spa_tx_assign_update; kstat_install(ksp); } + strfree(name); } static void spa_tx_assign_destroy(spa_t *spa) { - spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram; + spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; kstat_t *ksp; - ksp = ssh->kstat; + ksp = shk->kstat; if (ksp) kstat_delete(ksp); - kmem_free(ssh->private, ssh->size); - mutex_destroy(&ssh->lock); + kmem_free(shk->private, shk->size); + mutex_destroy(&shk->lock); } void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs) { - spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram; + spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; uint64_t idx = 0; - while (((1 << idx) < nsecs) && (idx < ssh->size - 1)) + while (((1ULL << idx) < nsecs) && (idx < shk->size - 1)) 
idx++; - atomic_inc_64(&((kstat_named_t *)ssh->private)[idx].value.ui64); + atomic_inc_64(&((kstat_named_t *)shk->private)[idx].value.ui64); } /* @@ -630,35 +573,318 @@ spa_io_history_update(kstat_t *ksp, int rw) static void spa_io_history_init(spa_t *spa) { - spa_stats_history_t *ssh = &spa->spa_stats.io_history; - char name[KSTAT_STRLEN]; + spa_history_kstat_t *shk = &spa->spa_stats.io_history; + char *name; kstat_t *ksp; - mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); - (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa)); - name[KSTAT_STRLEN-1] = '\0'; + name = kmem_asprintf("zfs/%s", spa_name(spa)); ksp = kstat_create(name, 0, "io", "disk", KSTAT_TYPE_IO, 1, 0); - ssh->kstat = ksp; + shk->kstat = ksp; if (ksp) { - ksp->ks_lock = &ssh->lock; + ksp->ks_lock = &shk->lock; ksp->ks_private = spa; ksp->ks_update = spa_io_history_update; kstat_install(ksp); } + strfree(name); } static void spa_io_history_destroy(spa_t *spa) { - spa_stats_history_t *ssh = &spa->spa_stats.io_history; + spa_history_kstat_t *shk = &spa->spa_stats.io_history; - if (ssh->kstat) - kstat_delete(ssh->kstat); + if (shk->kstat) + kstat_delete(shk->kstat); - mutex_destroy(&ssh->lock); + mutex_destroy(&shk->lock); +} + +/* + * ========================================================================== + * SPA MMP History Routines + * ========================================================================== + */ + +/* + * MMP statistics - Information exported regarding attempted MMP writes + * For MMP writes issued, fields used as per comments below. + * For MMP writes skipped, an entry represents a span of time when + * writes were skipped for same reason (error from mmp_random_leaf). 
+ * Differences are: + * timestamp time first write skipped, if >1 skipped in a row + * mmp_delay delay value at timestamp + * vdev_guid number of writes skipped + * io_error one of enum mmp_error + * duration time span (ns) of skipped writes + */ + +typedef struct spa_mmp_history { + uint64_t mmp_node_id; /* unique # for updates */ + uint64_t txg; /* txg of last sync */ + uint64_t timestamp; /* UTC time MMP write issued */ + uint64_t mmp_delay; /* mmp_thread.mmp_delay at timestamp */ + uint64_t vdev_guid; /* unique ID of leaf vdev */ + char *vdev_path; + int vdev_label; /* vdev label */ + int io_error; /* error status of MMP write */ + hrtime_t error_start; /* hrtime of start of error period */ + hrtime_t duration; /* time from submission to completion */ + procfs_list_node_t smh_node; +} spa_mmp_history_t; + +static int +spa_mmp_history_show_header(struct seq_file *f) +{ + seq_printf(f, "%-10s %-10s %-10s %-6s %-10s %-12s %-24s " + "%-10s %s\n", "id", "txg", "timestamp", "error", "duration", + "mmp_delay", "vdev_guid", "vdev_label", "vdev_path"); + return (0); +} + +static int +spa_mmp_history_show(struct seq_file *f, void *data) +{ + spa_mmp_history_t *smh = (spa_mmp_history_t *)data; + char skip_fmt[] = "%-10llu %-10llu %10llu %#6llx %10lld %12llu %-24llu " + "%-10lld %s\n"; + char write_fmt[] = "%-10llu %-10llu %10llu %6lld %10lld %12llu %-24llu " + "%-10lld %s\n"; + + seq_printf(f, (smh->error_start ? skip_fmt : write_fmt), + (u_longlong_t)smh->mmp_node_id, (u_longlong_t)smh->txg, + (u_longlong_t)smh->timestamp, (longlong_t)smh->io_error, + (longlong_t)smh->duration, (u_longlong_t)smh->mmp_delay, + (u_longlong_t)smh->vdev_guid, (u_longlong_t)smh->vdev_label, + (smh->vdev_path ? 
smh->vdev_path : "-")); + + return (0); +} + +/* Remove oldest elements from list until there are no more than 'size' left */ +static void +spa_mmp_history_truncate(spa_history_list_t *shl, unsigned int size) +{ + spa_mmp_history_t *smh; + while (shl->size > size) { + smh = list_remove_head(&shl->procfs_list.pl_list); + if (smh->vdev_path) + strfree(smh->vdev_path); + kmem_free(smh, sizeof (spa_mmp_history_t)); + shl->size--; + } + + if (size == 0) + ASSERT(list_is_empty(&shl->procfs_list.pl_list)); + +} + +static int +spa_mmp_history_clear(procfs_list_t *procfs_list) +{ + spa_history_list_t *shl = procfs_list->pl_private; + mutex_enter(&procfs_list->pl_lock); + spa_mmp_history_truncate(shl, 0); + mutex_exit(&procfs_list->pl_lock); + return (0); +} + +static void +spa_mmp_history_init(spa_t *spa) +{ + spa_history_list_t *shl = &spa->spa_stats.mmp_history; + char *module; + + shl->size = 0; + + module = kmem_asprintf("zfs/%s", spa_name(spa)); + + shl->procfs_list.pl_private = shl; + procfs_list_install(module, + "multihost", + &shl->procfs_list, + spa_mmp_history_show, + spa_mmp_history_show_header, + spa_mmp_history_clear, + offsetof(spa_mmp_history_t, smh_node)); + + strfree(module); +} + +static void +spa_mmp_history_destroy(spa_t *spa) +{ + spa_history_list_t *shl = &spa->spa_stats.mmp_history; + procfs_list_uninstall(&shl->procfs_list); + spa_mmp_history_truncate(shl, 0); + procfs_list_destroy(&shl->procfs_list); +} + +/* + * Set duration in existing "skip" record to how long we have waited for a leaf + * vdev to become available. + * + * Important that we start search at the tail of the list where new + * records are inserted, so this is normally an O(1) operation. 
+ */ +int +spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_node_id) +{ + spa_history_list_t *shl = &spa->spa_stats.mmp_history; + spa_mmp_history_t *smh; + int error = ENOENT; + + if (zfs_multihost_history == 0 && shl->size == 0) + return (0); + + mutex_enter(&shl->procfs_list.pl_lock); + for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL; + smh = list_prev(&shl->procfs_list.pl_list, smh)) { + if (smh->mmp_node_id == mmp_node_id) { + ASSERT3U(smh->io_error, !=, 0); + smh->duration = gethrtime() - smh->error_start; + smh->vdev_guid++; + error = 0; + break; + } + } + mutex_exit(&shl->procfs_list.pl_lock); + + return (error); +} + +/* + * Set MMP write duration and error status in existing record. + * See comment re: search order above spa_mmp_history_set_skip(). + */ +int +spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error, + hrtime_t duration) +{ + spa_history_list_t *shl = &spa->spa_stats.mmp_history; + spa_mmp_history_t *smh; + int error = ENOENT; + + if (zfs_multihost_history == 0 && shl->size == 0) + return (0); + + mutex_enter(&shl->procfs_list.pl_lock); + for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL; + smh = list_prev(&shl->procfs_list.pl_list, smh)) { + if (smh->mmp_node_id == mmp_node_id) { + ASSERT(smh->io_error == 0); + smh->io_error = io_error; + smh->duration = duration; + error = 0; + break; + } + } + mutex_exit(&shl->procfs_list.pl_lock); + + return (error); +} + +/* + * Add a new MMP historical record. + * error == 0 : a write was issued. + * error != 0 : a write was not issued because no leaves were found. 
+ */ +void +spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, + uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_node_id, + int error) +{ + spa_history_list_t *shl = &spa->spa_stats.mmp_history; + spa_mmp_history_t *smh; + + if (zfs_multihost_history == 0 && shl->size == 0) + return; + + smh = kmem_zalloc(sizeof (spa_mmp_history_t), KM_SLEEP); + smh->txg = txg; + smh->timestamp = timestamp; + smh->mmp_delay = mmp_delay; + if (vd) { + smh->vdev_guid = vd->vdev_guid; + if (vd->vdev_path) + smh->vdev_path = strdup(vd->vdev_path); + } + smh->vdev_label = label; + smh->mmp_node_id = mmp_node_id; + + if (error) { + smh->io_error = error; + smh->error_start = gethrtime(); + smh->vdev_guid = 1; + } + + mutex_enter(&shl->procfs_list.pl_lock); + procfs_list_add(&shl->procfs_list, smh); + shl->size++; + spa_mmp_history_truncate(shl, zfs_multihost_history); + mutex_exit(&shl->procfs_list.pl_lock); +} + +static void * +spa_state_addr(kstat_t *ksp, loff_t n) +{ + return (ksp->ks_private); /* return the spa_t */ +} + +static int +spa_state_data(char *buf, size_t size, void *data) +{ + spa_t *spa = (spa_t *)data; + (void) snprintf(buf, size, "%s\n", spa_state_to_name(spa)); + return (0); +} + +/* + * Return the state of the pool in /proc/spl/kstat/zfs/<pool>/state. + * + * This is a lock-less read of the pool's state (unlike using 'zpool', which + * can potentially block for seconds). Because it doesn't block, it can be useful
+ */ +static void +spa_state_init(spa_t *spa) +{ + spa_history_kstat_t *shk = &spa->spa_stats.state; + char *name; + kstat_t *ksp; + + mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); + + name = kmem_asprintf("zfs/%s", spa_name(spa)); + ksp = kstat_create(name, 0, "state", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + + shk->kstat = ksp; + if (ksp) { + ksp->ks_lock = &shk->lock; + ksp->ks_data = NULL; + ksp->ks_private = spa; + ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS; + kstat_set_raw_ops(ksp, NULL, spa_state_data, spa_state_addr); + kstat_install(ksp); + } + + strfree(name); +} + +static void +spa_health_destroy(spa_t *spa) +{ + spa_history_kstat_t *shk = &spa->spa_stats.state; + kstat_t *ksp = shk->kstat; + if (ksp) + kstat_delete(ksp); + + mutex_destroy(&shk->lock); } void @@ -668,24 +894,37 @@ spa_stats_init(spa_t *spa) spa_txg_history_init(spa); spa_tx_assign_init(spa); spa_io_history_init(spa); + spa_mmp_history_init(spa); + spa_state_init(spa); } void spa_stats_destroy(spa_t *spa) { + spa_health_destroy(spa); spa_tx_assign_destroy(spa); spa_txg_history_destroy(spa); spa_read_history_destroy(spa); spa_io_history_destroy(spa); + spa_mmp_history_destroy(spa); } -#if defined(_KERNEL) && defined(HAVE_SPL) +#if defined(_KERNEL) +/* CSTYLED */ module_param(zfs_read_history, int, 0644); -MODULE_PARM_DESC(zfs_read_history, "Historic statistics for the last N reads"); +MODULE_PARM_DESC(zfs_read_history, + "Historical statistics for the last N reads"); module_param(zfs_read_history_hits, int, 0644); -MODULE_PARM_DESC(zfs_read_history_hits, "Include cache hits in read history"); +MODULE_PARM_DESC(zfs_read_history_hits, + "Include cache hits in read history"); module_param(zfs_txg_history, int, 0644); -MODULE_PARM_DESC(zfs_txg_history, "Historic statistics for the last N txgs"); +MODULE_PARM_DESC(zfs_txg_history, + "Historical statistics for the last N txgs"); + +module_param(zfs_multihost_history, int, 0644); +MODULE_PARM_DESC(zfs_multihost_history, + "Historical 
statistics for last N multihost writes"); +/* END CSTYLED */ #endif