/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

#include <sys/zfs_context.h>
#include <sys/spa_impl.h>
#include <sys/vdev_impl.h>
#include <sys/spa.h>
#include <zfs_comutil.h>

/*
 * Keeps stats on last N reads per spa_t, disabled by default.
 */
int zfs_read_history = 0;

/*
 * Include cache hits in history, disabled by default.
 */
int zfs_read_history_hits = 0;

/*
 * Keeps stats on the last N txgs, disabled by default.
 */
int zfs_txg_history = 0;

/*
 * Keeps stats on the last N MMP updates, disabled by default.
 */
int zfs_multihost_history = 0;

/*
 * ==========================================================================
 * SPA Read History Routines
 * ==========================================================================
 */

/*
 * Read statistics - Information exported regarding each arc_read call
 */
typedef struct spa_read_history {
	uint64_t uid;		/* unique identifier */
	hrtime_t start;		/* time read completed */
	uint64_t objset;	/* read from this objset */
	uint64_t object;	/* read of this object number */
	uint64_t level;		/* block's indirection level */
	uint64_t blkid;		/* read of this block id */
	char origin[24];	/* read originated from here */
	uint32_t aflags;	/* ARC flags (cached, prefetch, etc.) */
	pid_t pid;		/* PID of task doing read */
	char comm[16];		/* process name of task doing read */
	list_node_t srh_link;
} spa_read_history_t;

static int
spa_read_history_headers(char *buf, size_t size)
{
	(void) snprintf(buf, size, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s "
	    "%-24s %-8s %-16s\n", "UID", "start", "objset", "object",
	    "level", "blkid", "aflags", "origin", "pid", "process");

	return (0);
}

static int
spa_read_history_data(char *buf, size_t size, void *data)
{
	spa_read_history_t *srh = (spa_read_history_t *)data;

	(void) snprintf(buf, size, "%-8llu %-16llu 0x%-6llx "
	    "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n",
	    (u_longlong_t)srh->uid, srh->start,
	    (longlong_t)srh->objset, (longlong_t)srh->object,
	    (longlong_t)srh->level, (longlong_t)srh->blkid,
	    srh->aflags, srh->origin, srh->pid, srh->comm);

	return (0);
}

/*
 * Calculate the address for the next spa_read_history_t entry. The
 * ssh->lock will be held until ksp->ks_ndata entries are processed.
 */
static void *
spa_read_history_addr(kstat_t *ksp, loff_t n)
{
	spa_t *spa = ksp->ks_private;
	spa_stats_history_t *ssh = &spa->spa_stats.read_history;

	ASSERT(MUTEX_HELD(&ssh->lock));

	if (n == 0)
		ssh->private = list_tail(&ssh->list);
	else if (ssh->private)
		ssh->private = list_prev(&ssh->list, ssh->private);

	return (ssh->private);
}

/*
 * When the kstat is written, discard all spa_read_history_t entries. The
 * ssh->lock will be held until ksp->ks_ndata entries are processed.
 */
static int
spa_read_history_update(kstat_t *ksp, int rw)
{
	spa_t *spa = ksp->ks_private;
	spa_stats_history_t *ssh = &spa->spa_stats.read_history;

	if (rw == KSTAT_WRITE) {
		spa_read_history_t *srh;

		while ((srh = list_remove_head(&ssh->list))) {
			ssh->size--;
			kmem_free(srh, sizeof (spa_read_history_t));
		}

		ASSERT3U(ssh->size, ==, 0);
	}

	ksp->ks_ndata = ssh->size;
	ksp->ks_data_size = ssh->size * sizeof (spa_read_history_t);

	return (0);
}

static void
spa_read_history_init(spa_t *spa)
{
	spa_stats_history_t *ssh = &spa->spa_stats.read_history;
	char name[KSTAT_STRLEN];
	kstat_t *ksp;

	mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&ssh->list, sizeof (spa_read_history_t),
	    offsetof(spa_read_history_t, srh_link));

	ssh->count = 0;
	ssh->size = 0;
	ssh->private = NULL;

	(void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa));

	ksp = kstat_create(name, 0, "reads", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
	ssh->kstat = ksp;

	if (ksp) {
		ksp->ks_lock = &ssh->lock;
		ksp->ks_data = NULL;
		ksp->ks_private = spa;
		ksp->ks_update = spa_read_history_update;
		kstat_set_raw_ops(ksp, spa_read_history_headers,
		    spa_read_history_data, spa_read_history_addr);
		kstat_install(ksp);
	}
}

static void
spa_read_history_destroy(spa_t *spa)
{
	spa_stats_history_t *ssh = &spa->spa_stats.read_history;
	spa_read_history_t *srh;
	kstat_t *ksp;

	ksp = ssh->kstat;
	if (ksp)
		kstat_delete(ksp);

	mutex_enter(&ssh->lock);
	while ((srh = list_remove_head(&ssh->list))) {
		ssh->size--;
		kmem_free(srh, sizeof (spa_read_history_t));
	}

	ASSERT3U(ssh->size, ==, 0);
	list_destroy(&ssh->list);
	mutex_exit(&ssh->lock);

	mutex_destroy(&ssh->lock);
}

void
spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags)
{
	spa_stats_history_t *ssh = &spa->spa_stats.read_history;
	spa_read_history_t *srh, *rm;

	ASSERT3P(spa, !=, NULL);
	ASSERT3P(zb, !=, NULL);

	if (zfs_read_history == 0 && ssh->size == 0)
		return;

	if (zfs_read_history_hits == 0 && (aflags & ARC_FLAG_CACHED))
		return;

	srh = kmem_zalloc(sizeof (spa_read_history_t), KM_SLEEP);
	strlcpy(srh->comm, getcomm(), sizeof (srh->comm));
	srh->start = gethrtime();
	srh->objset = zb->zb_objset;
	srh->object = zb->zb_object;
	srh->level = zb->zb_level;
	srh->blkid = zb->zb_blkid;
	srh->aflags = aflags;
	srh->pid = getpid();

	mutex_enter(&ssh->lock);

	srh->uid = ssh->count++;
	list_insert_head(&ssh->list, srh);
	ssh->size++;

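	/* Trim the oldest entries to the current zfs_read_history limit. */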
	while (ssh->size > zfs_read_history) {
		ssh->size--;
		rm = list_remove_tail(&ssh->list);
		kmem_free(rm, sizeof (spa_read_history_t));
	}

	mutex_exit(&ssh->lock);
}

/*
 * ==========================================================================
 * SPA TXG History Routines
 * ==========================================================================
 */

/*
 * Txg statistics - Information exported regarding each txg sync
 */

typedef struct spa_txg_history {
	uint64_t txg;		/* txg id */
	txg_state_t state;	/* active txg state */
	uint64_t nread;		/* number of bytes read */
	uint64_t nwritten;	/* number of bytes written */
	uint64_t reads;		/* number of read operations */
	uint64_t writes;	/* number of write operations */
	uint64_t ndirty;	/* number of dirty bytes */
	hrtime_t times[TXG_STATE_COMMITTED];	/* completion times */
	list_node_t sth_link;
} spa_txg_history_t;

static int
spa_txg_history_headers(char *buf, size_t size)
{
	(void) snprintf(buf, size, "%-8s %-16s %-5s %-12s %-12s %-12s "
	    "%-8s %-8s %-12s %-12s %-12s %-12s\n", "txg", "birth", "state",
	    "ndirty", "nread", "nwritten", "reads", "writes",
	    "otime", "qtime", "wtime", "stime");

	return (0);
}

static int
spa_txg_history_data(char *buf, size_t size, void *data)
{
	spa_txg_history_t *sth = (spa_txg_history_t *)data;
	uint64_t open = 0, quiesce = 0, wait = 0, sync = 0;
	char state;

	switch (sth->state) {
		case TXG_STATE_BIRTH: state = 'B'; break;
		case TXG_STATE_OPEN: state = 'O'; break;
		case TXG_STATE_QUIESCED: state = 'Q'; break;
		case TXG_STATE_WAIT_FOR_SYNC: state = 'W'; break;
		case TXG_STATE_SYNCED: state = 'S'; break;
		case TXG_STATE_COMMITTED: state = 'C'; break;
		default: state = '?'; break;
	}

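	/*
	 * Compute each phase duration as the difference between
	 * consecutive state completion times.
	 */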
	if (sth->times[TXG_STATE_OPEN])
		open = sth->times[TXG_STATE_OPEN] -
		    sth->times[TXG_STATE_BIRTH];

	if (sth->times[TXG_STATE_QUIESCED])
		quiesce = sth->times[TXG_STATE_QUIESCED] -
		    sth->times[TXG_STATE_OPEN];

	if (sth->times[TXG_STATE_WAIT_FOR_SYNC])
		wait = sth->times[TXG_STATE_WAIT_FOR_SYNC] -
		    sth->times[TXG_STATE_QUIESCED];

	if (sth->times[TXG_STATE_SYNCED])
		sync = sth->times[TXG_STATE_SYNCED] -
		    sth->times[TXG_STATE_WAIT_FOR_SYNC];

	(void) snprintf(buf, size, "%-8llu %-16llu %-5c %-12llu "
	    "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu %-12llu\n",
	    (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state,
	    (u_longlong_t)sth->ndirty,
	    (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten,
	    (u_longlong_t)sth->reads, (u_longlong_t)sth->writes,
	    (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)wait,
	    (u_longlong_t)sync);

	return (0);
}

/*
 * Calculate the address for the next spa_txg_history_t entry. The
 * ssh->lock will be held until ksp->ks_ndata entries are processed.
 */
static void *
spa_txg_history_addr(kstat_t *ksp, loff_t n)
{
	spa_t *spa = ksp->ks_private;
	spa_stats_history_t *ssh = &spa->spa_stats.txg_history;

	ASSERT(MUTEX_HELD(&ssh->lock));

	if (n == 0)
		ssh->private = list_tail(&ssh->list);
	else if (ssh->private)
		ssh->private = list_prev(&ssh->list, ssh->private);

	return (ssh->private);
}

/*
 * When the kstat is written, discard all spa_txg_history_t entries. The
 * ssh->lock will be held until ksp->ks_ndata entries are processed.
 */
static int
spa_txg_history_update(kstat_t *ksp, int rw)
{
	spa_t *spa = ksp->ks_private;
	spa_stats_history_t *ssh = &spa->spa_stats.txg_history;

	ASSERT(MUTEX_HELD(&ssh->lock));

	if (rw == KSTAT_WRITE) {
		spa_txg_history_t *sth;

		while ((sth = list_remove_head(&ssh->list))) {
			ssh->size--;
			kmem_free(sth, sizeof (spa_txg_history_t));
		}

		ASSERT3U(ssh->size, ==, 0);
	}

	ksp->ks_ndata = ssh->size;
	ksp->ks_data_size = ssh->size * sizeof (spa_txg_history_t);

	return (0);
}

static void
spa_txg_history_init(spa_t *spa)
{
	spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
	char name[KSTAT_STRLEN];
	kstat_t *ksp;

	mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&ssh->list, sizeof (spa_txg_history_t),
	    offsetof(spa_txg_history_t, sth_link));

	ssh->count = 0;
	ssh->size = 0;
	ssh->private = NULL;

	(void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa));

	ksp = kstat_create(name, 0, "txgs", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
	ssh->kstat = ksp;

	if (ksp) {
		ksp->ks_lock = &ssh->lock;
		ksp->ks_data = NULL;
		ksp->ks_private = spa;
		ksp->ks_update = spa_txg_history_update;
		kstat_set_raw_ops(ksp, spa_txg_history_headers,
		    spa_txg_history_data, spa_txg_history_addr);
		kstat_install(ksp);
	}
}

static void
spa_txg_history_destroy(spa_t *spa)
{
	spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
	spa_txg_history_t *sth;
	kstat_t *ksp;

	ksp = ssh->kstat;
	if (ksp)
		kstat_delete(ksp);

	mutex_enter(&ssh->lock);
	while ((sth = list_remove_head(&ssh->list))) {
		ssh->size--;
		kmem_free(sth, sizeof (spa_txg_history_t));
	}

	ASSERT3U(ssh->size, ==, 0);
	list_destroy(&ssh->list);
	mutex_exit(&ssh->lock);

	mutex_destroy(&ssh->lock);
}

/*
 * Add a new txg to historical record.
 */
void
spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time)
{
	spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
	spa_txg_history_t *sth, *rm;

	if (zfs_txg_history == 0 && ssh->size == 0)
		return;

	sth = kmem_zalloc(sizeof (spa_txg_history_t), KM_SLEEP);
	sth->txg = txg;
	sth->state = TXG_STATE_OPEN;
	sth->times[TXG_STATE_BIRTH] = birth_time;

	mutex_enter(&ssh->lock);

	list_insert_head(&ssh->list, sth);
	ssh->size++;

	while (ssh->size > zfs_txg_history) {
		ssh->size--;
		rm = list_remove_tail(&ssh->list);
		kmem_free(rm, sizeof (spa_txg_history_t));
	}

	mutex_exit(&ssh->lock);
}

/*
 * Set txg state completion time and increment current state.
 */
int
spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state,
    hrtime_t completed_time)
{
	spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
	spa_txg_history_t *sth;
	int error = ENOENT;

	if (zfs_txg_history == 0)
		return (0);

	mutex_enter(&ssh->lock);
	for (sth = list_head(&ssh->list); sth != NULL;
	    sth = list_next(&ssh->list, sth)) {
		if (sth->txg == txg) {
			sth->times[completed_state] = completed_time;
			sth->state++;
			error = 0;
			break;
		}
	}
	mutex_exit(&ssh->lock);

	return (error);
}

/*
 * Set txg IO stats.
 */
static int
spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread,
    uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty)
{
	spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
	spa_txg_history_t *sth;
	int error = ENOENT;

	if (zfs_txg_history == 0)
		return (0);

	mutex_enter(&ssh->lock);
	for (sth = list_head(&ssh->list); sth != NULL;
	    sth = list_next(&ssh->list, sth)) {
		if (sth->txg == txg) {
			sth->nread = nread;
			sth->nwritten = nwritten;
			sth->reads = reads;
			sth->writes = writes;
			sth->ndirty = ndirty;
			error = 0;
			break;
		}
	}
	mutex_exit(&ssh->lock);

	return (error);
}

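/*
 * Snapshot the root vdev I/O stats before the txg syncs; the matching
 * spa_txg_history_fini_io() call records the per-txg deltas.
 */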
txg_stat_t *
spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp)
{
	txg_stat_t *ts;

	if (zfs_txg_history == 0)
		return (NULL);

	ts = kmem_alloc(sizeof (txg_stat_t), KM_SLEEP);

	spa_config_enter(spa, SCL_ALL, FTAG, RW_READER);
	vdev_get_stats(spa->spa_root_vdev, &ts->vs1);
	spa_config_exit(spa, SCL_ALL, FTAG);

	ts->txg = txg;
	ts->ndirty = dp->dp_dirty_pertxg[txg & TXG_MASK];

	spa_txg_history_set(spa, txg, TXG_STATE_WAIT_FOR_SYNC, gethrtime());

	return (ts);
}

void
spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts)
{
	if (ts == NULL)
		return;

	if (zfs_txg_history == 0) {
		kmem_free(ts, sizeof (txg_stat_t));
		return;
	}

	spa_config_enter(spa, SCL_ALL, FTAG, RW_READER);
	vdev_get_stats(spa->spa_root_vdev, &ts->vs2);
	spa_config_exit(spa, SCL_ALL, FTAG);

	spa_txg_history_set(spa, ts->txg, TXG_STATE_SYNCED, gethrtime());
	spa_txg_history_set_io(spa, ts->txg,
	    ts->vs2.vs_bytes[ZIO_TYPE_READ] - ts->vs1.vs_bytes[ZIO_TYPE_READ],
	    ts->vs2.vs_bytes[ZIO_TYPE_WRITE] - ts->vs1.vs_bytes[ZIO_TYPE_WRITE],
	    ts->vs2.vs_ops[ZIO_TYPE_READ] - ts->vs1.vs_ops[ZIO_TYPE_READ],
	    ts->vs2.vs_ops[ZIO_TYPE_WRITE] - ts->vs1.vs_ops[ZIO_TYPE_WRITE],
	    ts->ndirty);

	kmem_free(ts, sizeof (txg_stat_t));
}

/*
 * ==========================================================================
 * SPA TX Assign Histogram Routines
 * ==========================================================================
 */

/*
 * Tx statistics - Information exported regarding dmu_tx_assign time.
 */

/*
 * When the kstat is written, zero all buckets. When the kstat is read,
 * count the number of trailing buckets set to zero and update ks_ndata
 * such that they are not output.
 */
static int
spa_tx_assign_update(kstat_t *ksp, int rw)
{
	spa_t *spa = ksp->ks_private;
	spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram;
	int i;

	if (rw == KSTAT_WRITE) {
		for (i = 0; i < ssh->count; i++)
			((kstat_named_t *)ssh->private)[i].value.ui64 = 0;
	}

	for (i = ssh->count; i > 0; i--)
		if (((kstat_named_t *)ssh->private)[i-1].value.ui64 != 0)
			break;

	ksp->ks_ndata = i;
	ksp->ks_data_size = i * sizeof (kstat_named_t);

	return (0);
}

static void
spa_tx_assign_init(spa_t *spa)
{
	spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram;
	char name[KSTAT_STRLEN];
	kstat_named_t *ks;
	kstat_t *ksp;
	int i;

	mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);

	ssh->count = 42; /* power of two buckets for 1ns to 2,199s */
	ssh->size = ssh->count * sizeof (kstat_named_t);
	ssh->private = kmem_alloc(ssh->size, KM_SLEEP);

	(void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa));

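	/* Bucket i is labeled with its upper latency bound of 2^i ns. */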
	for (i = 0; i < ssh->count; i++) {
		ks = &((kstat_named_t *)ssh->private)[i];
		ks->data_type = KSTAT_DATA_UINT64;
		ks->value.ui64 = 0;
		(void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns",
		    (u_longlong_t)1 << i);
	}

	ksp = kstat_create(name, 0, "dmu_tx_assign", "misc",
	    KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL);
	ssh->kstat = ksp;

	if (ksp) {
		ksp->ks_lock = &ssh->lock;
		ksp->ks_data = ssh->private;
		ksp->ks_ndata = ssh->count;
		ksp->ks_data_size = ssh->size;
		ksp->ks_private = spa;
		ksp->ks_update = spa_tx_assign_update;
		kstat_install(ksp);
	}
}

static void
spa_tx_assign_destroy(spa_t *spa)
{
	spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram;
	kstat_t *ksp;

	ksp = ssh->kstat;
	if (ksp)
		kstat_delete(ksp);

	kmem_free(ssh->private, ssh->size);
	mutex_destroy(&ssh->lock);
}

void
spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs)
{
	spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram;
	uint64_t idx = 0;

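	/* Find the smallest power-of-two bucket that can hold this delay. */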
	while (((1ULL << idx) < nsecs) && (idx < ssh->count - 1))
		idx++;

	atomic_inc_64(&((kstat_named_t *)ssh->private)[idx].value.ui64);
}

/*
 * ==========================================================================
 * SPA IO History Routines
 * ==========================================================================
 */
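/*
 * Writing to this kstat zeroes the accumulated I/O counters.
 */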
static int
spa_io_history_update(kstat_t *ksp, int rw)
{
	if (rw == KSTAT_WRITE)
		memset(ksp->ks_data, 0, ksp->ks_data_size);

	return (0);
}

static void
spa_io_history_init(spa_t *spa)
{
	spa_stats_history_t *ssh = &spa->spa_stats.io_history;
	char name[KSTAT_STRLEN];
	kstat_t *ksp;

	mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);

	(void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa));

	ksp = kstat_create(name, 0, "io", "disk", KSTAT_TYPE_IO, 1, 0);
	ssh->kstat = ksp;

	if (ksp) {
		ksp->ks_lock = &ssh->lock;
		ksp->ks_private = spa;
		ksp->ks_update = spa_io_history_update;
		kstat_install(ksp);
	}
}

static void
spa_io_history_destroy(spa_t *spa)
{
	spa_stats_history_t *ssh = &spa->spa_stats.io_history;

	if (ssh->kstat)
		kstat_delete(ssh->kstat);

	mutex_destroy(&ssh->lock);
}

/*
 * ==========================================================================
 * SPA MMP History Routines
 * ==========================================================================
 */

/*
 * MMP statistics - Information exported regarding attempted MMP writes
 * For MMP writes issued, fields are used as per the comments below.
 * For MMP writes skipped, an entry represents a span of time when
 * writes were skipped for the same reason (error from mmp_random_leaf).
 * Differences are:
 * timestamp	time the first write was skipped, if >1 skipped in a row
 * mmp_delay	delay value at timestamp
 * vdev_guid	number of writes skipped
 * io_error	one of enum mmp_error
 * duration	time span (ns) of skipped writes
 */

typedef struct spa_mmp_history {
	uint64_t mmp_kstat_id;	/* unique # for updates */
	uint64_t txg;		/* txg of last sync */
	uint64_t timestamp;	/* UTC time MMP write issued */
	uint64_t mmp_delay;	/* mmp_thread.mmp_delay at timestamp */
	uint64_t vdev_guid;	/* unique ID of leaf vdev */
	char *vdev_path;
	int vdev_label;		/* vdev label */
	int io_error;		/* error status of MMP write */
	hrtime_t error_start;	/* hrtime of start of error period */
	hrtime_t duration;	/* time from submission to completion */
	list_node_t smh_link;
} spa_mmp_history_t;

static int
spa_mmp_history_headers(char *buf, size_t size)
{
	(void) snprintf(buf, size, "%-10s %-10s %-10s %-6s %-10s %-12s %-24s "
	    "%-10s %s\n", "id", "txg", "timestamp", "error", "duration",
	    "mmp_delay", "vdev_guid", "vdev_label", "vdev_path");
	return (0);
}

static int
spa_mmp_history_data(char *buf, size_t size, void *data)
{
	spa_mmp_history_t *smh = (spa_mmp_history_t *)data;
	char skip_fmt[] = "%-10llu %-10llu %10llu %#6llx %10lld %12llu %-24llu "
	    "%-10lld %s\n";
	char write_fmt[] = "%-10llu %-10llu %10llu %6lld %10lld %12llu %-24llu "
	    "%-10lld %s\n";

	(void) snprintf(buf, size, (smh->error_start ? skip_fmt : write_fmt),
	    (u_longlong_t)smh->mmp_kstat_id, (u_longlong_t)smh->txg,
	    (u_longlong_t)smh->timestamp, (longlong_t)smh->io_error,
	    (longlong_t)smh->duration, (u_longlong_t)smh->mmp_delay,
	    (u_longlong_t)smh->vdev_guid, (u_longlong_t)smh->vdev_label,
	    (smh->vdev_path ? smh->vdev_path : "-"));

	return (0);
}

/*
 * Calculate the address for the next spa_mmp_history_t entry. The
 * ssh->lock will be held until ksp->ks_ndata entries are processed.
 */
static void *
spa_mmp_history_addr(kstat_t *ksp, loff_t n)
{
	spa_t *spa = ksp->ks_private;
	spa_stats_history_t *ssh = &spa->spa_stats.mmp_history;

	ASSERT(MUTEX_HELD(&ssh->lock));

	if (n == 0)
		ssh->private = list_tail(&ssh->list);
	else if (ssh->private)
		ssh->private = list_prev(&ssh->list, ssh->private);

	return (ssh->private);
}

/*
 * When the kstat is written, discard all spa_mmp_history_t entries. The
 * ssh->lock will be held until ksp->ks_ndata entries are processed.
 */
static int
spa_mmp_history_update(kstat_t *ksp, int rw)
{
	spa_t *spa = ksp->ks_private;
	spa_stats_history_t *ssh = &spa->spa_stats.mmp_history;

	ASSERT(MUTEX_HELD(&ssh->lock));

	if (rw == KSTAT_WRITE) {
		spa_mmp_history_t *smh;

		while ((smh = list_remove_head(&ssh->list))) {
			ssh->size--;
			if (smh->vdev_path)
				strfree(smh->vdev_path);
			kmem_free(smh, sizeof (spa_mmp_history_t));
		}

		ASSERT3U(ssh->size, ==, 0);
	}

	ksp->ks_ndata = ssh->size;
	ksp->ks_data_size = ssh->size * sizeof (spa_mmp_history_t);

	return (0);
}

static void
spa_mmp_history_init(spa_t *spa)
{
	spa_stats_history_t *ssh = &spa->spa_stats.mmp_history;
	char name[KSTAT_STRLEN];
	kstat_t *ksp;

	mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&ssh->list, sizeof (spa_mmp_history_t),
	    offsetof(spa_mmp_history_t, smh_link));

	ssh->count = 0;
	ssh->size = 0;
	ssh->private = NULL;

	(void) snprintf(name, KSTAT_STRLEN, "zfs/%s", spa_name(spa));

	ksp = kstat_create(name, 0, "multihost", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
	ssh->kstat = ksp;

	if (ksp) {
		ksp->ks_lock = &ssh->lock;
		ksp->ks_data = NULL;
		ksp->ks_private = spa;
		ksp->ks_update = spa_mmp_history_update;
		kstat_set_raw_ops(ksp, spa_mmp_history_headers,
		    spa_mmp_history_data, spa_mmp_history_addr);
		kstat_install(ksp);
	}
}

static void
spa_mmp_history_destroy(spa_t *spa)
{
	spa_stats_history_t *ssh = &spa->spa_stats.mmp_history;
	spa_mmp_history_t *smh;
	kstat_t *ksp;

	ksp = ssh->kstat;
	if (ksp)
		kstat_delete(ksp);

	mutex_enter(&ssh->lock);
	while ((smh = list_remove_head(&ssh->list))) {
		ssh->size--;
		if (smh->vdev_path)
			strfree(smh->vdev_path);
		kmem_free(smh, sizeof (spa_mmp_history_t));
	}

	ASSERT3U(ssh->size, ==, 0);
	list_destroy(&ssh->list);
	mutex_exit(&ssh->lock);

	mutex_destroy(&ssh->lock);
}

/*
 * Set the duration in an existing "skip" record to how long we have waited
 * for a leaf vdev to become available.
 *
 * It is important that we start the search at the head of the list, where
 * new records are inserted, so this is normally an O(1) operation.
 */
int
spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_kstat_id)
{
	spa_stats_history_t *ssh = &spa->spa_stats.mmp_history;
	spa_mmp_history_t *smh;
	int error = ENOENT;

	if (zfs_multihost_history == 0 && ssh->size == 0)
		return (0);

	mutex_enter(&ssh->lock);
	for (smh = list_head(&ssh->list); smh != NULL;
	    smh = list_next(&ssh->list, smh)) {
		if (smh->mmp_kstat_id == mmp_kstat_id) {
			ASSERT3U(smh->io_error, !=, 0);
			smh->duration = gethrtime() - smh->error_start;
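			/* For skip records, vdev_guid counts skipped writes. */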
			smh->vdev_guid++;
			error = 0;
			break;
		}
	}
	mutex_exit(&ssh->lock);

	return (error);
}

/*
 * Set MMP write duration and error status in existing record.
 * See comment re: search order above spa_mmp_history_set_skip().
 */
int
spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error,
    hrtime_t duration)
{
	spa_stats_history_t *ssh = &spa->spa_stats.mmp_history;
	spa_mmp_history_t *smh;
	int error = ENOENT;

	if (zfs_multihost_history == 0 && ssh->size == 0)
		return (0);

	mutex_enter(&ssh->lock);
	for (smh = list_head(&ssh->list); smh != NULL;
	    smh = list_next(&ssh->list, smh)) {
		if (smh->mmp_kstat_id == mmp_kstat_id) {
			ASSERT(smh->io_error == 0);
			smh->io_error = io_error;
			smh->duration = duration;
			error = 0;
			break;
		}
	}
	mutex_exit(&ssh->lock);

	return (error);
}

/*
 * Add a new MMP historical record.
 * error == 0 : a write was issued.
 * error != 0 : a write was not issued because no leaves were found.
 */
void *
spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
    uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_kstat_id,
    int error)
{
	spa_stats_history_t *ssh = &spa->spa_stats.mmp_history;
	spa_mmp_history_t *smh, *rm;

	if (zfs_multihost_history == 0 && ssh->size == 0)
		return (NULL);

	smh = kmem_zalloc(sizeof (spa_mmp_history_t), KM_SLEEP);
	smh->txg = txg;
	smh->timestamp = timestamp;
	smh->mmp_delay = mmp_delay;
	if (vd) {
		smh->vdev_guid = vd->vdev_guid;
		if (vd->vdev_path)
			smh->vdev_path = strdup(vd->vdev_path);
	}
	smh->vdev_label = label;
	smh->mmp_kstat_id = mmp_kstat_id;

	if (error) {
		smh->io_error = error;
		smh->error_start = gethrtime();
		smh->vdev_guid = 1;
	}

	mutex_enter(&ssh->lock);

	list_insert_head(&ssh->list, smh);
	ssh->size++;

	while (ssh->size > zfs_multihost_history) {
		ssh->size--;
		rm = list_remove_tail(&ssh->list);
		if (rm->vdev_path)
			strfree(rm->vdev_path);
		kmem_free(rm, sizeof (spa_mmp_history_t));
	}

	mutex_exit(&ssh->lock);
	return ((void *)smh);
}

static void *
spa_state_addr(kstat_t *ksp, loff_t n)
{
	return (ksp->ks_private); /* return the spa_t */
}

static int
spa_state_data(char *buf, size_t size, void *data)
{
	spa_t *spa = (spa_t *)data;
	(void) snprintf(buf, size, "%s\n", spa_state_to_name(spa));
	return (0);
}

/*
 * Return the state of the pool in /proc/spl/kstat/zfs/<pool>/state.
 *
 * This is a lock-less read of the pool's state (unlike using 'zpool', which
 * can potentially block for seconds). Because it doesn't block, it can be
 * useful as a pool heartbeat value.
 */
static void
spa_state_init(spa_t *spa)
{
	spa_stats_history_t *ssh = &spa->spa_stats.state;
	char *name;
	kstat_t *ksp;

	mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);

	name = kmem_asprintf("zfs/%s", spa_name(spa));
	ksp = kstat_create(name, 0, "state", "misc",
	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);

	ssh->kstat = ksp;
	if (ksp) {
		ksp->ks_lock = &ssh->lock;
		ksp->ks_data = NULL;
		ksp->ks_private = spa;
		ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS;
		kstat_set_raw_ops(ksp, NULL, spa_state_data, spa_state_addr);
		kstat_install(ksp);
	}

	strfree(name);
}

static void
spa_health_destroy(spa_t *spa)
{
	spa_stats_history_t *ssh = &spa->spa_stats.state;
	kstat_t *ksp = ssh->kstat;
	if (ksp)
		kstat_delete(ksp);

	mutex_destroy(&ssh->lock);
}

void
spa_stats_init(spa_t *spa)
{
	spa_read_history_init(spa);
	spa_txg_history_init(spa);
	spa_tx_assign_init(spa);
	spa_io_history_init(spa);
	spa_mmp_history_init(spa);
	spa_state_init(spa);
}

void
spa_stats_destroy(spa_t *spa)
{
	spa_health_destroy(spa);
	spa_tx_assign_destroy(spa);
	spa_txg_history_destroy(spa);
	spa_read_history_destroy(spa);
	spa_io_history_destroy(spa);
	spa_mmp_history_destroy(spa);
}

#if defined(_KERNEL) && defined(HAVE_SPL)
/* CSTYLED */
module_param(zfs_read_history, int, 0644);
MODULE_PARM_DESC(zfs_read_history,
	"Historical statistics for the last N reads");

module_param(zfs_read_history_hits, int, 0644);
MODULE_PARM_DESC(zfs_read_history_hits,
	"Include cache hits in read history");

module_param(zfs_txg_history, int, 0644);
MODULE_PARM_DESC(zfs_txg_history,
	"Historical statistics for the last N txgs");

module_param(zfs_multihost_history, int, 0644);
MODULE_PARM_DESC(zfs_multihost_history,
	"Historical statistics for last N multihost writes");
/* END CSTYLED */
#endif