4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
22 #include <sys/zfs_context.h>
23 #include <sys/spa_impl.h>
24 #include <sys/vdev_impl.h>
26 #include <zfs_comutil.h>
/*
 * Keeps stats on last N reads per spa_t, disabled by default.
 */
int zfs_read_history = 0;
/*
 * Include cache hits in history, disabled by default.
 */
int zfs_read_history_hits = 0;
/*
 * Keeps stats on the last N txgs, disabled by default.
 */
int zfs_txg_history = 0;
/*
 * Keeps stats on the last N MMP updates, disabled by default.
 */
int zfs_multihost_history = 0;
49 * ==========================================================================
50 * SPA Read History Routines
51 * ==========================================================================
55 * Read statistics - Information exported regarding each arc_read call
57 typedef struct spa_read_history
{
58 uint64_t uid
; /* unique identifier */
59 hrtime_t start
; /* time read completed */
60 uint64_t objset
; /* read from this objset */
61 uint64_t object
; /* read of this object number */
62 uint64_t level
; /* block's indirection level */
63 uint64_t blkid
; /* read of this block id */
64 char origin
[24]; /* read originated from here */
65 uint32_t aflags
; /* ARC flags (cached, prefetch, etc.) */
66 pid_t pid
; /* PID of task doing read */
67 char comm
[16]; /* process name of task doing read */
72 spa_read_history_headers(char *buf
, size_t size
)
74 (void) snprintf(buf
, size
, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s "
75 "%-24s %-8s %-16s\n", "UID", "start", "objset", "object",
76 "level", "blkid", "aflags", "origin", "pid", "process");
82 spa_read_history_data(char *buf
, size_t size
, void *data
)
84 spa_read_history_t
*srh
= (spa_read_history_t
*)data
;
86 (void) snprintf(buf
, size
, "%-8llu %-16llu 0x%-6llx "
87 "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n",
88 (u_longlong_t
)srh
->uid
, srh
->start
,
89 (longlong_t
)srh
->objset
, (longlong_t
)srh
->object
,
90 (longlong_t
)srh
->level
, (longlong_t
)srh
->blkid
,
91 srh
->aflags
, srh
->origin
, srh
->pid
, srh
->comm
);
97 * Calculate the address for the next spa_stats_history_t entry. The
98 * ssh->lock will be held until ksp->ks_ndata entries are processed.
101 spa_read_history_addr(kstat_t
*ksp
, loff_t n
)
103 spa_t
*spa
= ksp
->ks_private
;
104 spa_stats_history_t
*ssh
= &spa
->spa_stats
.read_history
;
106 ASSERT(MUTEX_HELD(&ssh
->lock
));
109 ssh
->private = list_tail(&ssh
->list
);
110 else if (ssh
->private)
111 ssh
->private = list_prev(&ssh
->list
, ssh
->private);
113 return (ssh
->private);
117 * When the kstat is written discard all spa_read_history_t entries. The
118 * ssh->lock will be held until ksp->ks_ndata entries are processed.
121 spa_read_history_update(kstat_t
*ksp
, int rw
)
123 spa_t
*spa
= ksp
->ks_private
;
124 spa_stats_history_t
*ssh
= &spa
->spa_stats
.read_history
;
126 if (rw
== KSTAT_WRITE
) {
127 spa_read_history_t
*srh
;
129 while ((srh
= list_remove_head(&ssh
->list
))) {
131 kmem_free(srh
, sizeof (spa_read_history_t
));
134 ASSERT3U(ssh
->size
, ==, 0);
137 ksp
->ks_ndata
= ssh
->size
;
138 ksp
->ks_data_size
= ssh
->size
* sizeof (spa_read_history_t
);
144 spa_read_history_init(spa_t
*spa
)
146 spa_stats_history_t
*ssh
= &spa
->spa_stats
.read_history
;
147 char name
[KSTAT_STRLEN
];
150 mutex_init(&ssh
->lock
, NULL
, MUTEX_DEFAULT
, NULL
);
151 list_create(&ssh
->list
, sizeof (spa_read_history_t
),
152 offsetof(spa_read_history_t
, srh_link
));
158 (void) snprintf(name
, KSTAT_STRLEN
, "zfs/%s", spa_name(spa
));
160 ksp
= kstat_create(name
, 0, "reads", "misc",
161 KSTAT_TYPE_RAW
, 0, KSTAT_FLAG_VIRTUAL
);
165 ksp
->ks_lock
= &ssh
->lock
;
167 ksp
->ks_private
= spa
;
168 ksp
->ks_update
= spa_read_history_update
;
169 kstat_set_raw_ops(ksp
, spa_read_history_headers
,
170 spa_read_history_data
, spa_read_history_addr
);
176 spa_read_history_destroy(spa_t
*spa
)
178 spa_stats_history_t
*ssh
= &spa
->spa_stats
.read_history
;
179 spa_read_history_t
*srh
;
186 mutex_enter(&ssh
->lock
);
187 while ((srh
= list_remove_head(&ssh
->list
))) {
189 kmem_free(srh
, sizeof (spa_read_history_t
));
192 ASSERT3U(ssh
->size
, ==, 0);
193 list_destroy(&ssh
->list
);
194 mutex_exit(&ssh
->lock
);
196 mutex_destroy(&ssh
->lock
);
200 spa_read_history_add(spa_t
*spa
, const zbookmark_phys_t
*zb
, uint32_t aflags
)
202 spa_stats_history_t
*ssh
= &spa
->spa_stats
.read_history
;
203 spa_read_history_t
*srh
, *rm
;
205 ASSERT3P(spa
, !=, NULL
);
206 ASSERT3P(zb
, !=, NULL
);
208 if (zfs_read_history
== 0 && ssh
->size
== 0)
211 if (zfs_read_history_hits
== 0 && (aflags
& ARC_FLAG_CACHED
))
214 srh
= kmem_zalloc(sizeof (spa_read_history_t
), KM_SLEEP
);
215 strlcpy(srh
->comm
, getcomm(), sizeof (srh
->comm
));
216 srh
->start
= gethrtime();
217 srh
->objset
= zb
->zb_objset
;
218 srh
->object
= zb
->zb_object
;
219 srh
->level
= zb
->zb_level
;
220 srh
->blkid
= zb
->zb_blkid
;
221 srh
->aflags
= aflags
;
224 mutex_enter(&ssh
->lock
);
226 srh
->uid
= ssh
->count
++;
227 list_insert_head(&ssh
->list
, srh
);
230 while (ssh
->size
> zfs_read_history
) {
232 rm
= list_remove_tail(&ssh
->list
);
233 kmem_free(rm
, sizeof (spa_read_history_t
));
236 mutex_exit(&ssh
->lock
);
240 * ==========================================================================
241 * SPA TXG History Routines
242 * ==========================================================================
246 * Txg statistics - Information exported regarding each txg sync
249 typedef struct spa_txg_history
{
250 uint64_t txg
; /* txg id */
251 txg_state_t state
; /* active txg state */
252 uint64_t nread
; /* number of bytes read */
253 uint64_t nwritten
; /* number of bytes written */
254 uint64_t reads
; /* number of read operations */
255 uint64_t writes
; /* number of write operations */
256 uint64_t ndirty
; /* number of dirty bytes */
257 hrtime_t times
[TXG_STATE_COMMITTED
]; /* completion times */
258 list_node_t sth_link
;
262 spa_txg_history_headers(char *buf
, size_t size
)
264 (void) snprintf(buf
, size
, "%-8s %-16s %-5s %-12s %-12s %-12s "
265 "%-8s %-8s %-12s %-12s %-12s %-12s\n", "txg", "birth", "state",
266 "ndirty", "nread", "nwritten", "reads", "writes",
267 "otime", "qtime", "wtime", "stime");
273 spa_txg_history_data(char *buf
, size_t size
, void *data
)
275 spa_txg_history_t
*sth
= (spa_txg_history_t
*)data
;
276 uint64_t open
= 0, quiesce
= 0, wait
= 0, sync
= 0;
279 switch (sth
->state
) {
280 case TXG_STATE_BIRTH
: state
= 'B'; break;
281 case TXG_STATE_OPEN
: state
= 'O'; break;
282 case TXG_STATE_QUIESCED
: state
= 'Q'; break;
283 case TXG_STATE_WAIT_FOR_SYNC
: state
= 'W'; break;
284 case TXG_STATE_SYNCED
: state
= 'S'; break;
285 case TXG_STATE_COMMITTED
: state
= 'C'; break;
286 default: state
= '?'; break;
289 if (sth
->times
[TXG_STATE_OPEN
])
290 open
= sth
->times
[TXG_STATE_OPEN
] -
291 sth
->times
[TXG_STATE_BIRTH
];
293 if (sth
->times
[TXG_STATE_QUIESCED
])
294 quiesce
= sth
->times
[TXG_STATE_QUIESCED
] -
295 sth
->times
[TXG_STATE_OPEN
];
297 if (sth
->times
[TXG_STATE_WAIT_FOR_SYNC
])
298 wait
= sth
->times
[TXG_STATE_WAIT_FOR_SYNC
] -
299 sth
->times
[TXG_STATE_QUIESCED
];
301 if (sth
->times
[TXG_STATE_SYNCED
])
302 sync
= sth
->times
[TXG_STATE_SYNCED
] -
303 sth
->times
[TXG_STATE_WAIT_FOR_SYNC
];
305 (void) snprintf(buf
, size
, "%-8llu %-16llu %-5c %-12llu "
306 "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu %-12llu\n",
307 (longlong_t
)sth
->txg
, sth
->times
[TXG_STATE_BIRTH
], state
,
308 (u_longlong_t
)sth
->ndirty
,
309 (u_longlong_t
)sth
->nread
, (u_longlong_t
)sth
->nwritten
,
310 (u_longlong_t
)sth
->reads
, (u_longlong_t
)sth
->writes
,
311 (u_longlong_t
)open
, (u_longlong_t
)quiesce
, (u_longlong_t
)wait
,
318 * Calculate the address for the next spa_stats_history_t entry. The
319 * ssh->lock will be held until ksp->ks_ndata entries are processed.
322 spa_txg_history_addr(kstat_t
*ksp
, loff_t n
)
324 spa_t
*spa
= ksp
->ks_private
;
325 spa_stats_history_t
*ssh
= &spa
->spa_stats
.txg_history
;
327 ASSERT(MUTEX_HELD(&ssh
->lock
));
330 ssh
->private = list_tail(&ssh
->list
);
331 else if (ssh
->private)
332 ssh
->private = list_prev(&ssh
->list
, ssh
->private);
334 return (ssh
->private);
338 * When the kstat is written discard all spa_txg_history_t entries. The
339 * ssh->lock will be held until ksp->ks_ndata entries are processed.
342 spa_txg_history_update(kstat_t
*ksp
, int rw
)
344 spa_t
*spa
= ksp
->ks_private
;
345 spa_stats_history_t
*ssh
= &spa
->spa_stats
.txg_history
;
347 ASSERT(MUTEX_HELD(&ssh
->lock
));
349 if (rw
== KSTAT_WRITE
) {
350 spa_txg_history_t
*sth
;
352 while ((sth
= list_remove_head(&ssh
->list
))) {
354 kmem_free(sth
, sizeof (spa_txg_history_t
));
357 ASSERT3U(ssh
->size
, ==, 0);
360 ksp
->ks_ndata
= ssh
->size
;
361 ksp
->ks_data_size
= ssh
->size
* sizeof (spa_txg_history_t
);
367 spa_txg_history_init(spa_t
*spa
)
369 spa_stats_history_t
*ssh
= &spa
->spa_stats
.txg_history
;
370 char name
[KSTAT_STRLEN
];
373 mutex_init(&ssh
->lock
, NULL
, MUTEX_DEFAULT
, NULL
);
374 list_create(&ssh
->list
, sizeof (spa_txg_history_t
),
375 offsetof(spa_txg_history_t
, sth_link
));
381 (void) snprintf(name
, KSTAT_STRLEN
, "zfs/%s", spa_name(spa
));
383 ksp
= kstat_create(name
, 0, "txgs", "misc",
384 KSTAT_TYPE_RAW
, 0, KSTAT_FLAG_VIRTUAL
);
388 ksp
->ks_lock
= &ssh
->lock
;
390 ksp
->ks_private
= spa
;
391 ksp
->ks_update
= spa_txg_history_update
;
392 kstat_set_raw_ops(ksp
, spa_txg_history_headers
,
393 spa_txg_history_data
, spa_txg_history_addr
);
399 spa_txg_history_destroy(spa_t
*spa
)
401 spa_stats_history_t
*ssh
= &spa
->spa_stats
.txg_history
;
402 spa_txg_history_t
*sth
;
409 mutex_enter(&ssh
->lock
);
410 while ((sth
= list_remove_head(&ssh
->list
))) {
412 kmem_free(sth
, sizeof (spa_txg_history_t
));
415 ASSERT3U(ssh
->size
, ==, 0);
416 list_destroy(&ssh
->list
);
417 mutex_exit(&ssh
->lock
);
419 mutex_destroy(&ssh
->lock
);
423 * Add a new txg to historical record.
426 spa_txg_history_add(spa_t
*spa
, uint64_t txg
, hrtime_t birth_time
)
428 spa_stats_history_t
*ssh
= &spa
->spa_stats
.txg_history
;
429 spa_txg_history_t
*sth
, *rm
;
431 if (zfs_txg_history
== 0 && ssh
->size
== 0)
434 sth
= kmem_zalloc(sizeof (spa_txg_history_t
), KM_SLEEP
);
436 sth
->state
= TXG_STATE_OPEN
;
437 sth
->times
[TXG_STATE_BIRTH
] = birth_time
;
439 mutex_enter(&ssh
->lock
);
441 list_insert_head(&ssh
->list
, sth
);
444 while (ssh
->size
> zfs_txg_history
) {
446 rm
= list_remove_tail(&ssh
->list
);
447 kmem_free(rm
, sizeof (spa_txg_history_t
));
450 mutex_exit(&ssh
->lock
);
454 * Set txg state completion time and increment current state.
457 spa_txg_history_set(spa_t
*spa
, uint64_t txg
, txg_state_t completed_state
,
458 hrtime_t completed_time
)
460 spa_stats_history_t
*ssh
= &spa
->spa_stats
.txg_history
;
461 spa_txg_history_t
*sth
;
464 if (zfs_txg_history
== 0)
467 mutex_enter(&ssh
->lock
);
468 for (sth
= list_head(&ssh
->list
); sth
!= NULL
;
469 sth
= list_next(&ssh
->list
, sth
)) {
470 if (sth
->txg
== txg
) {
471 sth
->times
[completed_state
] = completed_time
;
477 mutex_exit(&ssh
->lock
);
486 spa_txg_history_set_io(spa_t
*spa
, uint64_t txg
, uint64_t nread
,
487 uint64_t nwritten
, uint64_t reads
, uint64_t writes
, uint64_t ndirty
)
489 spa_stats_history_t
*ssh
= &spa
->spa_stats
.txg_history
;
490 spa_txg_history_t
*sth
;
493 if (zfs_txg_history
== 0)
496 mutex_enter(&ssh
->lock
);
497 for (sth
= list_head(&ssh
->list
); sth
!= NULL
;
498 sth
= list_next(&ssh
->list
, sth
)) {
499 if (sth
->txg
== txg
) {
501 sth
->nwritten
= nwritten
;
503 sth
->writes
= writes
;
504 sth
->ndirty
= ndirty
;
509 mutex_exit(&ssh
->lock
);
515 spa_txg_history_init_io(spa_t
*spa
, uint64_t txg
, dsl_pool_t
*dp
)
519 if (zfs_txg_history
== 0)
522 ts
= kmem_alloc(sizeof (txg_stat_t
), KM_SLEEP
);
524 spa_config_enter(spa
, SCL_ALL
, FTAG
, RW_READER
);
525 vdev_get_stats(spa
->spa_root_vdev
, &ts
->vs1
);
526 spa_config_exit(spa
, SCL_ALL
, FTAG
);
529 ts
->ndirty
= dp
->dp_dirty_pertxg
[txg
& TXG_MASK
];
531 spa_txg_history_set(spa
, txg
, TXG_STATE_WAIT_FOR_SYNC
, gethrtime());
537 spa_txg_history_fini_io(spa_t
*spa
, txg_stat_t
*ts
)
542 if (zfs_txg_history
== 0) {
543 kmem_free(ts
, sizeof (txg_stat_t
));
547 spa_config_enter(spa
, SCL_ALL
, FTAG
, RW_READER
);
548 vdev_get_stats(spa
->spa_root_vdev
, &ts
->vs2
);
549 spa_config_exit(spa
, SCL_ALL
, FTAG
);
551 spa_txg_history_set(spa
, ts
->txg
, TXG_STATE_SYNCED
, gethrtime());
552 spa_txg_history_set_io(spa
, ts
->txg
,
553 ts
->vs2
.vs_bytes
[ZIO_TYPE_READ
] - ts
->vs1
.vs_bytes
[ZIO_TYPE_READ
],
554 ts
->vs2
.vs_bytes
[ZIO_TYPE_WRITE
] - ts
->vs1
.vs_bytes
[ZIO_TYPE_WRITE
],
555 ts
->vs2
.vs_ops
[ZIO_TYPE_READ
] - ts
->vs1
.vs_ops
[ZIO_TYPE_READ
],
556 ts
->vs2
.vs_ops
[ZIO_TYPE_WRITE
] - ts
->vs1
.vs_ops
[ZIO_TYPE_WRITE
],
559 kmem_free(ts
, sizeof (txg_stat_t
));
563 * ==========================================================================
564 * SPA TX Assign Histogram Routines
565 * ==========================================================================
569 * Tx statistics - Information exported regarding dmu_tx_assign time.
573 * When the kstat is written zero all buckets. When the kstat is read
574 * count the number of trailing buckets set to zero and update ks_ndata
575 * such that they are not output.
578 spa_tx_assign_update(kstat_t
*ksp
, int rw
)
580 spa_t
*spa
= ksp
->ks_private
;
581 spa_stats_history_t
*ssh
= &spa
->spa_stats
.tx_assign_histogram
;
584 if (rw
== KSTAT_WRITE
) {
585 for (i
= 0; i
< ssh
->count
; i
++)
586 ((kstat_named_t
*)ssh
->private)[i
].value
.ui64
= 0;
589 for (i
= ssh
->count
; i
> 0; i
--)
590 if (((kstat_named_t
*)ssh
->private)[i
-1].value
.ui64
!= 0)
594 ksp
->ks_data_size
= i
* sizeof (kstat_named_t
);
600 spa_tx_assign_init(spa_t
*spa
)
602 spa_stats_history_t
*ssh
= &spa
->spa_stats
.tx_assign_histogram
;
603 char name
[KSTAT_STRLEN
];
608 mutex_init(&ssh
->lock
, NULL
, MUTEX_DEFAULT
, NULL
);
610 ssh
->count
= 42; /* power of two buckets for 1ns to 2,199s */
611 ssh
->size
= ssh
->count
* sizeof (kstat_named_t
);
612 ssh
->private = kmem_alloc(ssh
->size
, KM_SLEEP
);
614 (void) snprintf(name
, KSTAT_STRLEN
, "zfs/%s", spa_name(spa
));
616 for (i
= 0; i
< ssh
->count
; i
++) {
617 ks
= &((kstat_named_t
*)ssh
->private)[i
];
618 ks
->data_type
= KSTAT_DATA_UINT64
;
620 (void) snprintf(ks
->name
, KSTAT_STRLEN
, "%llu ns",
621 (u_longlong_t
)1 << i
);
624 ksp
= kstat_create(name
, 0, "dmu_tx_assign", "misc",
625 KSTAT_TYPE_NAMED
, 0, KSTAT_FLAG_VIRTUAL
);
629 ksp
->ks_lock
= &ssh
->lock
;
630 ksp
->ks_data
= ssh
->private;
631 ksp
->ks_ndata
= ssh
->count
;
632 ksp
->ks_data_size
= ssh
->size
;
633 ksp
->ks_private
= spa
;
634 ksp
->ks_update
= spa_tx_assign_update
;
640 spa_tx_assign_destroy(spa_t
*spa
)
642 spa_stats_history_t
*ssh
= &spa
->spa_stats
.tx_assign_histogram
;
649 kmem_free(ssh
->private, ssh
->size
);
650 mutex_destroy(&ssh
->lock
);
654 spa_tx_assign_add_nsecs(spa_t
*spa
, uint64_t nsecs
)
656 spa_stats_history_t
*ssh
= &spa
->spa_stats
.tx_assign_histogram
;
659 while (((1ULL << idx
) < nsecs
) && (idx
< ssh
->size
- 1))
662 atomic_inc_64(&((kstat_named_t
*)ssh
->private)[idx
].value
.ui64
);
666 * ==========================================================================
667 * SPA IO History Routines
668 * ==========================================================================
671 spa_io_history_update(kstat_t
*ksp
, int rw
)
673 if (rw
== KSTAT_WRITE
)
674 memset(ksp
->ks_data
, 0, ksp
->ks_data_size
);
680 spa_io_history_init(spa_t
*spa
)
682 spa_stats_history_t
*ssh
= &spa
->spa_stats
.io_history
;
683 char name
[KSTAT_STRLEN
];
686 mutex_init(&ssh
->lock
, NULL
, MUTEX_DEFAULT
, NULL
);
688 (void) snprintf(name
, KSTAT_STRLEN
, "zfs/%s", spa_name(spa
));
690 ksp
= kstat_create(name
, 0, "io", "disk", KSTAT_TYPE_IO
, 1, 0);
694 ksp
->ks_lock
= &ssh
->lock
;
695 ksp
->ks_private
= spa
;
696 ksp
->ks_update
= spa_io_history_update
;
702 spa_io_history_destroy(spa_t
*spa
)
704 spa_stats_history_t
*ssh
= &spa
->spa_stats
.io_history
;
707 kstat_delete(ssh
->kstat
);
709 mutex_destroy(&ssh
->lock
);
713 * ==========================================================================
714 * SPA MMP History Routines
715 * ==========================================================================
719 * MMP statistics - Information exported regarding attempted MMP writes
720 * For MMP writes issued, fields used as per comments below.
721 * For MMP writes skipped, an entry represents a span of time when
722 * writes were skipped for same reason (error from mmp_random_leaf).
724 * timestamp time first write skipped, if >1 skipped in a row
725 * mmp_delay delay value at timestamp
726 * vdev_guid number of writes skipped
727 * io_error one of enum mmp_error
728 * duration time span (ns) of skipped writes
731 typedef struct spa_mmp_history
{
732 uint64_t mmp_kstat_id
; /* unique # for updates */
733 uint64_t txg
; /* txg of last sync */
734 uint64_t timestamp
; /* UTC time MMP write issued */
735 uint64_t mmp_delay
; /* mmp_thread.mmp_delay at timestamp */
736 uint64_t vdev_guid
; /* unique ID of leaf vdev */
738 int vdev_label
; /* vdev label */
739 int io_error
; /* error status of MMP write */
740 hrtime_t error_start
; /* hrtime of start of error period */
741 hrtime_t duration
; /* time from submission to completion */
742 list_node_t smh_link
;
746 spa_mmp_history_headers(char *buf
, size_t size
)
748 (void) snprintf(buf
, size
, "%-10s %-10s %-10s %-6s %-10s %-12s %-24s "
749 "%-10s %s\n", "id", "txg", "timestamp", "error", "duration",
750 "mmp_delay", "vdev_guid", "vdev_label", "vdev_path");
755 spa_mmp_history_data(char *buf
, size_t size
, void *data
)
757 spa_mmp_history_t
*smh
= (spa_mmp_history_t
*)data
;
758 char skip_fmt
[] = "%-10llu %-10llu %10llu %#6llx %10lld %12llu %-24llu "
760 char write_fmt
[] = "%-10llu %-10llu %10llu %6lld %10lld %12llu %-24llu "
763 (void) snprintf(buf
, size
, (smh
->error_start
? skip_fmt
: write_fmt
),
764 (u_longlong_t
)smh
->mmp_kstat_id
, (u_longlong_t
)smh
->txg
,
765 (u_longlong_t
)smh
->timestamp
, (longlong_t
)smh
->io_error
,
766 (longlong_t
)smh
->duration
, (u_longlong_t
)smh
->mmp_delay
,
767 (u_longlong_t
)smh
->vdev_guid
, (u_longlong_t
)smh
->vdev_label
,
768 (smh
->vdev_path
? smh
->vdev_path
: "-"));
774 * Calculate the address for the next spa_stats_history_t entry. The
775 * ssh->lock will be held until ksp->ks_ndata entries are processed.
778 spa_mmp_history_addr(kstat_t
*ksp
, loff_t n
)
780 spa_t
*spa
= ksp
->ks_private
;
781 spa_stats_history_t
*ssh
= &spa
->spa_stats
.mmp_history
;
783 ASSERT(MUTEX_HELD(&ssh
->lock
));
786 ssh
->private = list_tail(&ssh
->list
);
787 else if (ssh
->private)
788 ssh
->private = list_prev(&ssh
->list
, ssh
->private);
790 return (ssh
->private);
794 * When the kstat is written discard all spa_mmp_history_t entries. The
795 * ssh->lock will be held until ksp->ks_ndata entries are processed.
798 spa_mmp_history_update(kstat_t
*ksp
, int rw
)
800 spa_t
*spa
= ksp
->ks_private
;
801 spa_stats_history_t
*ssh
= &spa
->spa_stats
.mmp_history
;
803 ASSERT(MUTEX_HELD(&ssh
->lock
));
805 if (rw
== KSTAT_WRITE
) {
806 spa_mmp_history_t
*smh
;
808 while ((smh
= list_remove_head(&ssh
->list
))) {
811 strfree(smh
->vdev_path
);
812 kmem_free(smh
, sizeof (spa_mmp_history_t
));
815 ASSERT3U(ssh
->size
, ==, 0);
818 ksp
->ks_ndata
= ssh
->size
;
819 ksp
->ks_data_size
= ssh
->size
* sizeof (spa_mmp_history_t
);
825 spa_mmp_history_init(spa_t
*spa
)
827 spa_stats_history_t
*ssh
= &spa
->spa_stats
.mmp_history
;
828 char name
[KSTAT_STRLEN
];
831 mutex_init(&ssh
->lock
, NULL
, MUTEX_DEFAULT
, NULL
);
832 list_create(&ssh
->list
, sizeof (spa_mmp_history_t
),
833 offsetof(spa_mmp_history_t
, smh_link
));
839 (void) snprintf(name
, KSTAT_STRLEN
, "zfs/%s", spa_name(spa
));
841 ksp
= kstat_create(name
, 0, "multihost", "misc",
842 KSTAT_TYPE_RAW
, 0, KSTAT_FLAG_VIRTUAL
);
846 ksp
->ks_lock
= &ssh
->lock
;
848 ksp
->ks_private
= spa
;
849 ksp
->ks_update
= spa_mmp_history_update
;
850 kstat_set_raw_ops(ksp
, spa_mmp_history_headers
,
851 spa_mmp_history_data
, spa_mmp_history_addr
);
857 spa_mmp_history_destroy(spa_t
*spa
)
859 spa_stats_history_t
*ssh
= &spa
->spa_stats
.mmp_history
;
860 spa_mmp_history_t
*smh
;
867 mutex_enter(&ssh
->lock
);
868 while ((smh
= list_remove_head(&ssh
->list
))) {
871 strfree(smh
->vdev_path
);
872 kmem_free(smh
, sizeof (spa_mmp_history_t
));
875 ASSERT3U(ssh
->size
, ==, 0);
876 list_destroy(&ssh
->list
);
877 mutex_exit(&ssh
->lock
);
879 mutex_destroy(&ssh
->lock
);
883 * Set duration in existing "skip" record to how long we have waited for a leaf
884 * vdev to become available.
886 * Important that we start search at the head of the list where new
887 * records are inserted, so this is normally an O(1) operation.
890 spa_mmp_history_set_skip(spa_t
*spa
, uint64_t mmp_kstat_id
)
892 spa_stats_history_t
*ssh
= &spa
->spa_stats
.mmp_history
;
893 spa_mmp_history_t
*smh
;
896 if (zfs_multihost_history
== 0 && ssh
->size
== 0)
899 mutex_enter(&ssh
->lock
);
900 for (smh
= list_head(&ssh
->list
); smh
!= NULL
;
901 smh
= list_next(&ssh
->list
, smh
)) {
902 if (smh
->mmp_kstat_id
== mmp_kstat_id
) {
903 ASSERT3U(smh
->io_error
, !=, 0);
904 smh
->duration
= gethrtime() - smh
->error_start
;
910 mutex_exit(&ssh
->lock
);
916 * Set MMP write duration and error status in existing record.
917 * See comment re: search order above spa_mmp_history_set_skip().
920 spa_mmp_history_set(spa_t
*spa
, uint64_t mmp_kstat_id
, int io_error
,
923 spa_stats_history_t
*ssh
= &spa
->spa_stats
.mmp_history
;
924 spa_mmp_history_t
*smh
;
927 if (zfs_multihost_history
== 0 && ssh
->size
== 0)
930 mutex_enter(&ssh
->lock
);
931 for (smh
= list_head(&ssh
->list
); smh
!= NULL
;
932 smh
= list_next(&ssh
->list
, smh
)) {
933 if (smh
->mmp_kstat_id
== mmp_kstat_id
) {
934 ASSERT(smh
->io_error
== 0);
935 smh
->io_error
= io_error
;
936 smh
->duration
= duration
;
941 mutex_exit(&ssh
->lock
);
947 * Add a new MMP historical record.
948 * error == 0 : a write was issued.
949 * error != 0 : a write was not issued because no leaves were found.
952 spa_mmp_history_add(spa_t
*spa
, uint64_t txg
, uint64_t timestamp
,
953 uint64_t mmp_delay
, vdev_t
*vd
, int label
, uint64_t mmp_kstat_id
,
956 spa_stats_history_t
*ssh
= &spa
->spa_stats
.mmp_history
;
957 spa_mmp_history_t
*smh
, *rm
;
959 if (zfs_multihost_history
== 0 && ssh
->size
== 0)
962 smh
= kmem_zalloc(sizeof (spa_mmp_history_t
), KM_SLEEP
);
964 smh
->timestamp
= timestamp
;
965 smh
->mmp_delay
= mmp_delay
;
967 smh
->vdev_guid
= vd
->vdev_guid
;
969 smh
->vdev_path
= strdup(vd
->vdev_path
);
971 smh
->vdev_label
= label
;
972 smh
->mmp_kstat_id
= mmp_kstat_id
;
975 smh
->io_error
= error
;
976 smh
->error_start
= gethrtime();
980 mutex_enter(&ssh
->lock
);
982 list_insert_head(&ssh
->list
, smh
);
985 while (ssh
->size
> zfs_multihost_history
) {
987 rm
= list_remove_tail(&ssh
->list
);
989 strfree(rm
->vdev_path
);
990 kmem_free(rm
, sizeof (spa_mmp_history_t
));
993 mutex_exit(&ssh
->lock
);
994 return ((void *)smh
);
998 spa_state_addr(kstat_t
*ksp
, loff_t n
)
1000 return (ksp
->ks_private
); /* return the spa_t */
1004 spa_state_data(char *buf
, size_t size
, void *data
)
1006 spa_t
*spa
= (spa_t
*)data
;
1007 (void) snprintf(buf
, size
, "%s\n", spa_state_to_name(spa
));
1012 * Return the state of the pool in /proc/spl/kstat/zfs/<pool>/state.
1014 * This is a lock-less read of the pool's state (unlike using 'zpool', which
1015 * can potentially block for seconds). Because it doesn't block, it can useful
1016 * as a pool heartbeat value.
1019 spa_state_init(spa_t
*spa
)
1021 spa_stats_history_t
*ssh
= &spa
->spa_stats
.state
;
1025 mutex_init(&ssh
->lock
, NULL
, MUTEX_DEFAULT
, NULL
);
1027 name
= kmem_asprintf("zfs/%s", spa_name(spa
));
1028 ksp
= kstat_create(name
, 0, "state", "misc",
1029 KSTAT_TYPE_RAW
, 0, KSTAT_FLAG_VIRTUAL
);
1033 ksp
->ks_lock
= &ssh
->lock
;
1034 ksp
->ks_data
= NULL
;
1035 ksp
->ks_private
= spa
;
1036 ksp
->ks_flags
|= KSTAT_FLAG_NO_HEADERS
;
1037 kstat_set_raw_ops(ksp
, NULL
, spa_state_data
, spa_state_addr
);
1045 spa_health_destroy(spa_t
*spa
)
1047 spa_stats_history_t
*ssh
= &spa
->spa_stats
.state
;
1048 kstat_t
*ksp
= ssh
->kstat
;
1052 mutex_destroy(&ssh
->lock
);
1056 spa_stats_init(spa_t
*spa
)
1058 spa_read_history_init(spa
);
1059 spa_txg_history_init(spa
);
1060 spa_tx_assign_init(spa
);
1061 spa_io_history_init(spa
);
1062 spa_mmp_history_init(spa
);
1063 spa_state_init(spa
);
1067 spa_stats_destroy(spa_t
*spa
)
1069 spa_health_destroy(spa
);
1070 spa_tx_assign_destroy(spa
);
1071 spa_txg_history_destroy(spa
);
1072 spa_read_history_destroy(spa
);
1073 spa_io_history_destroy(spa
);
1074 spa_mmp_history_destroy(spa
);
#if defined(_KERNEL) && defined(HAVE_SPL)
/* Module tunables exposed under /sys/module/zfs/parameters/ */
module_param(zfs_read_history, int, 0644);
MODULE_PARM_DESC(zfs_read_history,
	"Historical statistics for the last N reads");

module_param(zfs_read_history_hits, int, 0644);
MODULE_PARM_DESC(zfs_read_history_hits,
	"Include cache hits in read history");

module_param(zfs_txg_history, int, 0644);
MODULE_PARM_DESC(zfs_txg_history,
	"Historical statistics for the last N txgs");

module_param(zfs_multihost_history, int, 0644);
MODULE_PARM_DESC(zfs_multihost_history,
	"Historical statistics for last N multihost writes");
#endif