]> git.proxmox.com Git - mirror_zfs.git/commitdiff
Add visibility in to arc_read
authorPrakash Surya <surya1@llnl.gov>
Fri, 6 Sep 2013 23:09:05 +0000 (16:09 -0700)
committerBrian Behlendorf <behlendorf1@llnl.gov>
Fri, 25 Oct 2013 20:57:25 +0000 (13:57 -0700)
This change is an attempt to add visibility into the arc_read calls
occurring on a system, in real time. To do this, a list was added to the
in memory SPA data structure for a pool, with each element on the list
corresponding to a call to arc_read. These entries are then exported
through the kstat interface, which can then be interpreted in userspace.

For each arc_read call, the following information is exported:

 * A unique identifier (uint64_t)
 * The time the entry was added to the list (hrtime_t)
   (*not* wall clock time; relative to the other entries on the list)
 * The objset ID (uint64_t)
 * The object number (uint64_t)
 * The indirection level (uint64_t)
 * The block ID (uint64_t)
 * The name of the function originating the arc_read call (char[24])
 * The arc_flags from the arc_read call (uint32_t)
 * The PID of the reading thread (pid_t)
 * The command or name of thread originating read (char[16])

From this exported information one can see, in real time, exactly what
is being read, what function is generating the read, and whether or not
the read was found to be already cached.

There is still some work to be done, but this should serve as a good
starting point.

Specifically, dbuf_read's are not accounted for in the currently
exported information. Thus, a follow up patch should probably be added
to export these calls that never call into arc_read (they only hit the
dbuf hash table). In addition, it might be nice to create a utility
similar to "arcstat.py" to digest the exported information and display
it in a more readable format. Or perhaps, log the information and allow
for it to be "replayed" at a later time.

Signed-off-by: Prakash Surya <surya1@llnl.gov>
Signed-off-by: Brian Behlendorf <behlendorf1@llnl.gov>
include/sys/spa.h
include/sys/spa_impl.h
include/sys/zfs_context.h
include/sys/zio.h
lib/libzpool/Makefile.am
lib/libzpool/kernel.c
module/zfs/Makefile.in
module/zfs/arc.c
module/zfs/dmu_traverse.c
module/zfs/spa_misc.c
module/zfs/spa_stats.c [new file with mode: 0644]

index 401ae8343e96f678776afee5413c7dbeb93ff9fe..a4a717ac04509db96891c9bf1c3bd8e19c26e527 100644 (file)
@@ -51,6 +51,8 @@ typedef struct zilog zilog_t;
 typedef struct spa_aux_vdev spa_aux_vdev_t;
 typedef struct ddt ddt_t;
 typedef struct ddt_entry ddt_entry_t;
+typedef struct zbookmark zbookmark_t;
+
 struct dsl_pool;
 struct dsl_dataset;
 
@@ -534,6 +536,25 @@ extern boolean_t spa_refcount_zero(spa_t *spa);
 #define        SCL_ALL         ((1 << SCL_LOCKS) - 1)
 #define        SCL_STATE_ALL   (SCL_STATE | SCL_L2ARC | SCL_ZIO)
 
+/* Historical pool statistics */
+typedef struct spa_stats_history {
+       kmutex_t                lock;
+       uint64_t                count;
+       uint64_t                size;
+       kstat_t                 *kstat;
+       void                    *private;
+       list_t                  list;
+} spa_stats_history_t;
+
+typedef struct spa_stats {
+       spa_stats_history_t     read_history;
+} spa_stats_t;
+
+extern void spa_stats_init(spa_t *spa);
+extern void spa_stats_destroy(spa_t *spa);
+extern void spa_read_history_add(spa_t *spa, const zbookmark_t *zb,
+    uint32_t aflags);
+
 /* Pool configuration locks */
 extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw);
 extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw);
index 47dfe432eed2f38fa536cec8a34198f7add27012..107d14a76cfd0aa287451f16ae767eaef0315f11 100644 (file)
@@ -236,6 +236,8 @@ struct spa {
        uint64_t        spa_deadman_calls;      /* number of deadman calls */
        uint64_t        spa_sync_starttime;     /* starting time fo spa_sync */
        uint64_t        spa_deadman_synctime;   /* deadman expiration timer */
+       spa_stats_t     spa_stats;              /* assorted spa statistics */
+
        /*
         * spa_refcnt & spa_config_lock must be the last elements
         * because refcount_t changes size based on compilation options.
index a126c058ebd9d411195ec4988339ee845a43409c..dfd11d9f1e5e0fb9afd79606b327977c99a92d4a 100644 (file)
@@ -210,6 +210,7 @@ typedef struct kthread {
 } kthread_t;
 
 #define        curthread                       zk_thread_current()
+#define        getcomm()                       "unknown"
 #define        thread_exit                     zk_thread_exit
 #define        thread_create(stk, stksize, func, arg, len, pp, state, pri)     \
        zk_thread_create(stk, stksize, (thread_func_t)func, arg,        \
@@ -347,6 +348,10 @@ extern kstat_t *kstat_create(char *, int,
     char *, char *, uchar_t, ulong_t, uchar_t);
 extern void kstat_install(kstat_t *);
 extern void kstat_delete(kstat_t *);
+extern void kstat_set_raw_ops(kstat_t *ksp,
+    int (*headers)(char *buf, size_t size),
+    int (*data)(char *buf, size_t size, void *data),
+    void *(*addr)(kstat_t *ksp, loff_t index));
 
 /*
  * Kernel memory
index 189966bef54c4ed5a2a9fdf931e53e90737807fc..f5a128e0b31957446d0f32344fdf3aaeda13ea09 100644 (file)
@@ -256,12 +256,13 @@ extern char *zio_type_name[ZIO_TYPES];
  * Therefore it must not change size or alignment between 32/64 bit
  * compilation options.
  */
-typedef struct zbookmark {
+struct zbookmark {
        uint64_t        zb_objset;
        uint64_t        zb_object;
        int64_t         zb_level;
        uint64_t        zb_blkid;
-} zbookmark_t;
+       char *          zb_func;
+};
 
 #define        SET_BOOKMARK(zb, objset, object, level, blkid)  \
 {                                                       \
@@ -269,6 +270,7 @@ typedef struct zbookmark {
        (zb)->zb_object = object;                       \
        (zb)->zb_level = level;                         \
        (zb)->zb_blkid = blkid;                         \
+       (zb)->zb_func = FTAG;                           \
 }
 
 #define        ZB_DESTROYED_OBJSET     (-1ULL)
index da81659d240c0b869903bf60b91b2327a8582058..b2d6394546a8766fde9fb15890213f3eecaa77fd 100644 (file)
@@ -62,6 +62,7 @@ libzpool_la_SOURCES = \
        $(top_srcdir)/module/zfs/spa_errlog.c \
        $(top_srcdir)/module/zfs/spa_history.c \
        $(top_srcdir)/module/zfs/spa_misc.c \
+       $(top_srcdir)/module/zfs/spa_stats.c \
        $(top_srcdir)/module/zfs/space_map.c \
        $(top_srcdir)/module/zfs/txg.c \
        $(top_srcdir)/module/zfs/uberblock.c \
index e4d645cf7797db2519834c2ea95fe47bc0f1789e..57a3739e23e81fb682ae9500daa93d9793522787 100644 (file)
@@ -240,6 +240,14 @@ void
 kstat_delete(kstat_t *ksp)
 {}
 
+/*ARGSUSED*/
+void
+kstat_set_raw_ops(kstat_t *ksp,
+    int (*headers)(char *buf, size_t size),
+    int (*data)(char *buf, size_t size, void *data),
+    void *(*addr)(kstat_t *ksp, loff_t index))
+{}
+
 /*
  * =========================================================================
  * mutexes
index e71228454110d48861f2bec8a181916fd9189998..6f0f6ef05ae19b98aab37fc527b775a934fcf518 100644 (file)
@@ -44,6 +44,7 @@ $(MODULE)-objs += @top_srcdir@/module/zfs/spa_config.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/spa_errlog.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/spa_history.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/spa_misc.o
+$(MODULE)-objs += @top_srcdir@/module/zfs/spa_stats.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/space_map.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/txg.o
 $(MODULE)-objs += @top_srcdir@/module/zfs/uberblock.o
index 1298c5b91bc6a5157f47c038ef3aa203b8a0b7ef..366f5bf9b5daa66f655c7a0a408cb25fe4c5b2b7 100644 (file)
@@ -2943,6 +2943,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done,
        kmutex_t *hash_lock;
        zio_t *rzio;
        uint64_t guid = spa_load_guid(spa);
+       int rc = 0;
 
 top:
        hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp),
@@ -2976,10 +2977,10 @@ top:
                                hdr->b_acb = acb;
                                add_reference(hdr, hash_lock, private);
                                mutex_exit(hash_lock);
-                               return (0);
+                               goto out;
                        }
                        mutex_exit(hash_lock);
-                       return (0);
+                       goto out;
                }
 
                ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu);
@@ -3174,12 +3175,12 @@ top:
 
                                if (*arc_flags & ARC_NOWAIT) {
                                        zio_nowait(rzio);
-                                       return (0);
+                                       goto out;
                                }
 
                                ASSERT(*arc_flags & ARC_WAIT);
                                if (zio_wait(rzio) == 0)
-                                       return (0);
+                                       goto out;
 
                                /* l2arc read error; goto zio_read() */
                        } else {
@@ -3203,13 +3204,18 @@ top:
                rzio = zio_read(pio, spa, bp, buf->b_data, size,
                    arc_read_done, buf, priority, zio_flags, zb);
 
-               if (*arc_flags & ARC_WAIT)
-                       return (zio_wait(rzio));
+               if (*arc_flags & ARC_WAIT) {
+                       rc = zio_wait(rzio);
+                       goto out;
+               }
 
                ASSERT(*arc_flags & ARC_NOWAIT);
                zio_nowait(rzio);
        }
-       return (0);
+
+out:
+       spa_read_history_add(spa, zb, *arc_flags);
+       return (rc);
 }
 
 arc_prune_t *
index 32b3e50fc005a079e28e56dc77a585e4e8db7d83..1827ffc7413fe1bb7362023a1c9164626b81a955 100644 (file)
@@ -517,6 +517,9 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
        mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL);
        cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL);
 
+       SET_BOOKMARK(czb, td->td_objset,
+           ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
+
        /* See comment on ZIL traversal in dsl_scan_visitds. */
        if (ds != NULL && !dsl_dataset_is_snapshot(ds) && !BP_IS_HOLE(rootbp)) {
                uint32_t flags = ARC_WAIT;
@@ -525,7 +528,7 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
 
                err = arc_read(NULL, td->td_spa, rootbp,
                    arc_getbuf_func, &buf,
-                   ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, NULL);
+                   ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, czb);
                if (err != 0)
                        return (err);
 
@@ -539,8 +542,6 @@ traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp,
            td, TQ_NOQUEUE))
                pd->pd_exited = B_TRUE;
 
-       SET_BOOKMARK(czb, td->td_objset,
-           ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
        err = traverse_visitbp(td, NULL, rootbp, czb);
 
        mutex_enter(&pd->pd_mtx);
index 9ba6e896338bd54d8769fd5690cfbb7dd25fd879..a0b4797c97cb09d5a38bd20efef909ef2d7bc4d1 100644 (file)
@@ -48,6 +48,7 @@
 #include <sys/metaslab_impl.h>
 #include <sys/arc.h>
 #include <sys/ddt.h>
+#include <sys/kstat.h>
 #include "zfs_prop.h"
 #include "zfeature_common.h"
 
@@ -253,7 +254,6 @@ unsigned long zfs_deadman_synctime = 1000ULL;
  */
 int zfs_deadman_enabled = 1;
 
-
 /*
  * ==========================================================================
  * SPA config locking
@@ -495,6 +495,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
 
        refcount_create(&spa->spa_refcount);
        spa_config_lock_init(spa);
+       spa_stats_init(spa);
 
        avl_add(&spa_namespace_avl, spa);
 
@@ -580,6 +581,7 @@ spa_remove(spa_t *spa)
 
        refcount_destroy(&spa->spa_refcount);
 
+       spa_stats_destroy(spa);
        spa_config_lock_destroy(spa);
 
        for (t = 0; t < TXG_SIZE; t++)
diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c
new file mode 100644 (file)
index 0000000..271a3c3
--- /dev/null
@@ -0,0 +1,248 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa_impl.h>
+
+/*
+ * Keeps stats on last N reads per spa_t, disabled by default.
+ */
+int zfs_read_history = 0;
+
+/*
+ * Include cache hits in history, disabled by default.
+ */
+int zfs_read_history_hits = 0;
+
+/*
+ * ==========================================================================
+ * SPA Read History Routines
+ * ==========================================================================
+ */
+
+/*
+ * Read statistics - Information exported regarding each arc_read call
+ */
+typedef struct spa_read_history {
+       uint64_t        uid;            /* unique identifier */
+       hrtime_t        start;          /* time read completed */
+       uint64_t        objset;         /* read from this objset */
+       uint64_t        object;         /* read of this object number */
+       uint64_t        level;          /* block's indirection level */
+       uint64_t        blkid;          /* read of this block id */
+       char            origin[24];     /* read originated from here */
+       uint32_t        aflags;         /* ARC flags (cached, prefetch, etc.) */
+       pid_t           pid;            /* PID of task doing read */
+       char            comm[16];       /* process name of task doing read */
+       list_node_t     srh_link;
+} spa_read_history_t;
+
+static int
+spa_read_history_headers(char *buf, size_t size)
+{
+       size = snprintf(buf, size - 1, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s "
+           "%-24s %-8s %-16s\n", "UID", "start", "objset", "object",
+           "level", "blkid", "aflags", "origin", "pid", "process");
+       buf[size] = '\0';
+
+       return (0);
+}
+
+static int
+spa_read_history_data(char *buf, size_t size, void *data)
+{
+       spa_read_history_t *srh = (spa_read_history_t *)data;
+
+       size = snprintf(buf, size - 1, "%-8llu %-16llu 0x%-6llx "
+           "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n",
+           (u_longlong_t)srh->uid, srh->start,
+           (longlong_t)srh->objset, (longlong_t)srh->object,
+           (longlong_t)srh->level, (longlong_t)srh->blkid,
+           srh->aflags, srh->origin, srh->pid, srh->comm);
+       buf[size] = '\0';
+
+       return (0);
+}
+
+/*
+ * Calculate the address for the next spa_stats_history_t entry.  The
+ * ssh->lock will be held until ksp->ks_ndata entries are processed.
+ */
+static void *
+spa_read_history_addr(kstat_t *ksp, loff_t n)
+{
+       spa_t *spa = ksp->ks_private;
+       spa_stats_history_t *ssh = &spa->spa_stats.read_history;
+
+       ASSERT(MUTEX_HELD(&ssh->lock));
+
+       if (n == 0)
+               ssh->private = list_tail(&ssh->list);
+       else if (ssh->private)
+               ssh->private = list_prev(&ssh->list, ssh->private);
+
+       return (ssh->private);
+}
+
+/*
+ * When the kstat is written discard all spa_read_history_t entires.  The
+ * ssh->lock will be held until ksp->ks_ndata entries are processed.
+ */
+static int
+spa_read_history_update(kstat_t *ksp, int rw)
+{
+       spa_t *spa = ksp->ks_private;
+       spa_stats_history_t *ssh = &spa->spa_stats.read_history;
+
+       if (rw == KSTAT_WRITE) {
+               spa_read_history_t *srh;
+
+               while ((srh = list_remove_head(&ssh->list))) {
+                       ssh->size--;
+                       kmem_free(srh, sizeof(spa_read_history_t));
+               }
+
+               ASSERT3U(ssh->size, ==, 0);
+       }
+
+       ksp->ks_ndata = ssh->size;
+       ksp->ks_data_size = ssh->size * sizeof(spa_read_history_t);
+
+       return (0);
+}
+
+static void
+spa_read_history_init(spa_t *spa)
+{
+       spa_stats_history_t *ssh = &spa->spa_stats.read_history;
+       char name[KSTAT_STRLEN];
+       kstat_t *ksp;
+
+       mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);
+       list_create(&ssh->list, sizeof (spa_read_history_t),
+           offsetof(spa_read_history_t, srh_link));
+
+       ssh->count = 0;
+       ssh->size = 0;
+       ssh->private = NULL;
+
+       (void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa));
+       name[KSTAT_STRLEN-1] = '\0';
+
+       ksp = kstat_create(name, 0, "reads", "misc",
+           KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+       ssh->kstat = ksp;
+
+       if (ksp) {
+               ksp->ks_lock = &ssh->lock;
+               ksp->ks_data = NULL;
+               ksp->ks_private = spa;
+               ksp->ks_update = spa_read_history_update;
+               kstat_set_raw_ops(ksp, spa_read_history_headers,
+                   spa_read_history_data, spa_read_history_addr);
+               kstat_install(ksp);
+       }
+}
+
+static void
+spa_read_history_destroy(spa_t *spa)
+{
+       spa_stats_history_t *ssh = &spa->spa_stats.read_history;
+       spa_read_history_t *srh;
+       kstat_t *ksp;
+
+       ksp = ssh->kstat;
+       if (ksp)
+               kstat_delete(ksp);
+
+       mutex_enter(&ssh->lock);
+       while ((srh = list_remove_head(&ssh->list))) {
+               ssh->size--;
+               kmem_free(srh, sizeof(spa_read_history_t));
+       }
+
+       ASSERT3U(ssh->size, ==, 0);
+       list_destroy(&ssh->list);
+       mutex_exit(&ssh->lock);
+
+       mutex_destroy(&ssh->lock);
+}
+
+void
+spa_read_history_add(spa_t *spa, const zbookmark_t *zb, uint32_t aflags)
+{
+       spa_stats_history_t *ssh = &spa->spa_stats.read_history;
+       spa_read_history_t *srh, *rm;
+
+       ASSERT3P(spa, !=, NULL);
+       ASSERT3P(zb,  !=, NULL);
+
+       if (zfs_read_history == 0 && ssh->size == 0)
+               return;
+
+       if (zfs_read_history_hits == 0 && (aflags & ARC_CACHED))
+               return;
+
+       srh = kmem_zalloc(sizeof(spa_read_history_t), KM_PUSHPAGE);
+       strlcpy(srh->origin, zb->zb_func, sizeof(srh->origin));
+       strlcpy(srh->comm, getcomm(), sizeof(srh->comm));
+       srh->start  = gethrtime();
+       srh->objset = zb->zb_objset;
+       srh->object = zb->zb_object;
+       srh->level  = zb->zb_level;
+       srh->blkid  = zb->zb_blkid;
+       srh->aflags = aflags;
+       srh->pid    = getpid();
+
+       mutex_enter(&ssh->lock);
+
+       srh->uid = ssh->count++;
+       list_insert_head(&ssh->list, srh);
+       ssh->size++;
+
+       while (ssh->size > zfs_read_history) {
+               ssh->size--;
+               rm = list_remove_tail(&ssh->list);
+               kmem_free(rm, sizeof(spa_read_history_t));
+       }
+
+       mutex_exit(&ssh->lock);
+}
+
+void
+spa_stats_init(spa_t *spa)
+{
+       spa_read_history_init(spa);
+}
+
+void
+spa_stats_destroy(spa_t *spa)
+{
+       spa_read_history_destroy(spa);
+}
+
+#if defined(_KERNEL) && defined(HAVE_SPL)
+module_param(zfs_read_history, int, 0644);
+MODULE_PARM_DESC(zfs_read_history, "Historic statistics for the last N reads");
+
+module_param(zfs_read_history_hits, int, 0644);
+MODULE_PARM_DESC(zfs_read_history_hits, "Include cache hits in read history");
+#endif