]> git.proxmox.com Git - mirror_zfs.git/blob - module/zfs/spa_stats.c
Merge branch 'zfsonlinux/merge-spl'
[mirror_zfs.git] / module / zfs / spa_stats.c
1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 #include <sys/zfs_context.h>
23 #include <sys/spa_impl.h>
24 #include <sys/vdev_impl.h>
25
/*
 * Number of most recent reads to retain per spa_t.  Zero (the default)
 * stops new read records from being collected once the list is empty.
 */
int zfs_read_history = 0;

/*
 * When zero (the default), reads flagged ARC_FLAG_CACHED are left out of
 * the read history; any non-zero value records them as well.
 */
int zfs_read_history_hits = 0;

/*
 * Number of most recent txgs to retain per spa_t, 100 by default.
 */
int zfs_txg_history = 100;

/*
 * Number of most recent MMP updates to retain per spa_t.  Zero (the
 * default) stops new MMP records from being collected once the list is
 * empty.
 */
int zfs_multihost_history = 0;
45
46 /*
47 * ==========================================================================
48 * SPA Read History Routines
49 * ==========================================================================
50 */
51
/*
 * Read statistics - Information exported regarding each arc_read call.
 *
 * One record is allocated per read by spa_read_history_add() and linked
 * onto spa->spa_stats.read_history.list through srh_link.
 */
typedef struct spa_read_history {
	uint64_t	uid;		/* unique identifier */
	hrtime_t	start;		/* gethrtime() when record was created */
	uint64_t	objset;		/* read from this objset */
	uint64_t	object;		/* read of this object number */
	uint64_t	level;		/* block's indirection level */
	uint64_t	blkid;		/* read of this block id */
	/*
	 * NOTE(review): nothing in this file writes origin, so it stays
	 * zero-filled from kmem_zalloc() and prints as an empty string.
	 */
	char		origin[24];	/* read originated from here */
	uint32_t	aflags;		/* ARC flags (cached, prefetch, etc.) */
	pid_t		pid;		/* PID of task doing read */
	char		comm[16];	/* process name of task doing read */
	list_node_t	srh_link;	/* linkage on the history list */
} spa_read_history_t;
68
/*
 * Write the column header line for the read history kstat into buf.
 * Output is bounded by size (snprintf truncates); always returns 0.
 */
static int
spa_read_history_headers(char *buf, size_t size)
{
	static const char hdr_fmt[] =
	    "%-8s %-16s %-8s %-8s %-8s %-8s %-8s %-24s %-8s %-16s\n";

	(void) snprintf(buf, size, hdr_fmt,
	    "UID", "start", "objset", "object", "level",
	    "blkid", "aflags", "origin", "pid", "process");

	return (0);
}
78
79 static int
80 spa_read_history_data(char *buf, size_t size, void *data)
81 {
82 spa_read_history_t *srh = (spa_read_history_t *)data;
83
84 (void) snprintf(buf, size, "%-8llu %-16llu 0x%-6llx "
85 "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n",
86 (u_longlong_t)srh->uid, srh->start,
87 (longlong_t)srh->objset, (longlong_t)srh->object,
88 (longlong_t)srh->level, (longlong_t)srh->blkid,
89 srh->aflags, srh->origin, srh->pid, srh->comm);
90
91 return (0);
92 }
93
94 /*
95 * Calculate the address for the next spa_stats_history_t entry. The
96 * ssh->lock will be held until ksp->ks_ndata entries are processed.
97 */
98 static void *
99 spa_read_history_addr(kstat_t *ksp, loff_t n)
100 {
101 spa_t *spa = ksp->ks_private;
102 spa_stats_history_t *ssh = &spa->spa_stats.read_history;
103
104 ASSERT(MUTEX_HELD(&ssh->lock));
105
106 if (n == 0)
107 ssh->private = list_tail(&ssh->list);
108 else if (ssh->private)
109 ssh->private = list_prev(&ssh->list, ssh->private);
110
111 return (ssh->private);
112 }
113
114 /*
115 * When the kstat is written discard all spa_read_history_t entries. The
116 * ssh->lock will be held until ksp->ks_ndata entries are processed.
117 */
118 static int
119 spa_read_history_update(kstat_t *ksp, int rw)
120 {
121 spa_t *spa = ksp->ks_private;
122 spa_stats_history_t *ssh = &spa->spa_stats.read_history;
123
124 if (rw == KSTAT_WRITE) {
125 spa_read_history_t *srh;
126
127 while ((srh = list_remove_head(&ssh->list))) {
128 ssh->size--;
129 kmem_free(srh, sizeof (spa_read_history_t));
130 }
131
132 ASSERT3U(ssh->size, ==, 0);
133 }
134
135 ksp->ks_ndata = ssh->size;
136 ksp->ks_data_size = ssh->size * sizeof (spa_read_history_t);
137
138 return (0);
139 }
140
141 static void
142 spa_read_history_init(spa_t *spa)
143 {
144 spa_stats_history_t *ssh = &spa->spa_stats.read_history;
145 char *name;
146 kstat_t *ksp;
147
148 mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);
149 list_create(&ssh->list, sizeof (spa_read_history_t),
150 offsetof(spa_read_history_t, srh_link));
151
152 ssh->count = 0;
153 ssh->size = 0;
154 ssh->private = NULL;
155
156 name = kmem_asprintf("zfs/%s", spa_name(spa));
157
158 ksp = kstat_create(name, 0, "reads", "misc",
159 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
160 ssh->kstat = ksp;
161
162 if (ksp) {
163 ksp->ks_lock = &ssh->lock;
164 ksp->ks_data = NULL;
165 ksp->ks_private = spa;
166 ksp->ks_update = spa_read_history_update;
167 kstat_set_raw_ops(ksp, spa_read_history_headers,
168 spa_read_history_data, spa_read_history_addr);
169 kstat_install(ksp);
170 }
171 strfree(name);
172 }
173
174 static void
175 spa_read_history_destroy(spa_t *spa)
176 {
177 spa_stats_history_t *ssh = &spa->spa_stats.read_history;
178 spa_read_history_t *srh;
179 kstat_t *ksp;
180
181 ksp = ssh->kstat;
182 if (ksp)
183 kstat_delete(ksp);
184
185 mutex_enter(&ssh->lock);
186 while ((srh = list_remove_head(&ssh->list))) {
187 ssh->size--;
188 kmem_free(srh, sizeof (spa_read_history_t));
189 }
190
191 ASSERT3U(ssh->size, ==, 0);
192 list_destroy(&ssh->list);
193 mutex_exit(&ssh->lock);
194
195 mutex_destroy(&ssh->lock);
196 }
197
198 void
199 spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags)
200 {
201 spa_stats_history_t *ssh = &spa->spa_stats.read_history;
202 spa_read_history_t *srh, *rm;
203
204 ASSERT3P(spa, !=, NULL);
205 ASSERT3P(zb, !=, NULL);
206
207 if (zfs_read_history == 0 && ssh->size == 0)
208 return;
209
210 if (zfs_read_history_hits == 0 && (aflags & ARC_FLAG_CACHED))
211 return;
212
213 srh = kmem_zalloc(sizeof (spa_read_history_t), KM_SLEEP);
214 strlcpy(srh->comm, getcomm(), sizeof (srh->comm));
215 srh->start = gethrtime();
216 srh->objset = zb->zb_objset;
217 srh->object = zb->zb_object;
218 srh->level = zb->zb_level;
219 srh->blkid = zb->zb_blkid;
220 srh->aflags = aflags;
221 srh->pid = getpid();
222
223 mutex_enter(&ssh->lock);
224
225 srh->uid = ssh->count++;
226 list_insert_head(&ssh->list, srh);
227 ssh->size++;
228
229 while (ssh->size > zfs_read_history) {
230 ssh->size--;
231 rm = list_remove_tail(&ssh->list);
232 kmem_free(rm, sizeof (spa_read_history_t));
233 }
234
235 mutex_exit(&ssh->lock);
236 }
237
238 /*
239 * ==========================================================================
240 * SPA TXG History Routines
241 * ==========================================================================
242 */
243
/*
 * Txg statistics - Information exported regarding each txg sync.
 *
 * One record is allocated per txg by spa_txg_history_add() and linked
 * onto spa->spa_stats.txg_history.list through sth_link.
 */

typedef struct spa_txg_history {
	uint64_t	txg;		/* txg id */
	txg_state_t	state;		/* active txg state */
	uint64_t	nread;		/* number of bytes read */
	uint64_t	nwritten;	/* number of bytes written */
	uint64_t	reads;		/* number of read operations */
	uint64_t	writes;		/* number of write operations */
	uint64_t	ndirty;		/* number of dirty bytes */
	/*
	 * Indexed by txg_state_t.  The array has TXG_STATE_COMMITTED slots,
	 * so TXG_STATE_COMMITTED itself is not a valid index.
	 */
	hrtime_t	times[TXG_STATE_COMMITTED]; /* completion times */
	list_node_t	sth_link;	/* linkage on the history list */
} spa_txg_history_t;
259
/*
 * Write the column header line for the txg history kstat into buf.
 * Output is bounded by size (snprintf truncates); always returns 0.
 */
static int
spa_txg_history_headers(char *buf, size_t size)
{
	static const char hdr_fmt[] =
	    "%-8s %-16s %-5s %-12s %-12s %-12s "
	    "%-8s %-8s %-12s %-12s %-12s %-12s\n";

	(void) snprintf(buf, size, hdr_fmt,
	    "txg", "birth", "state", "ndirty", "nread", "nwritten",
	    "reads", "writes", "otime", "qtime", "wtime", "stime");

	return (0);
}
270
271 static int
272 spa_txg_history_data(char *buf, size_t size, void *data)
273 {
274 spa_txg_history_t *sth = (spa_txg_history_t *)data;
275 uint64_t open = 0, quiesce = 0, wait = 0, sync = 0;
276 char state;
277
278 switch (sth->state) {
279 case TXG_STATE_BIRTH: state = 'B'; break;
280 case TXG_STATE_OPEN: state = 'O'; break;
281 case TXG_STATE_QUIESCED: state = 'Q'; break;
282 case TXG_STATE_WAIT_FOR_SYNC: state = 'W'; break;
283 case TXG_STATE_SYNCED: state = 'S'; break;
284 case TXG_STATE_COMMITTED: state = 'C'; break;
285 default: state = '?'; break;
286 }
287
288 if (sth->times[TXG_STATE_OPEN])
289 open = sth->times[TXG_STATE_OPEN] -
290 sth->times[TXG_STATE_BIRTH];
291
292 if (sth->times[TXG_STATE_QUIESCED])
293 quiesce = sth->times[TXG_STATE_QUIESCED] -
294 sth->times[TXG_STATE_OPEN];
295
296 if (sth->times[TXG_STATE_WAIT_FOR_SYNC])
297 wait = sth->times[TXG_STATE_WAIT_FOR_SYNC] -
298 sth->times[TXG_STATE_QUIESCED];
299
300 if (sth->times[TXG_STATE_SYNCED])
301 sync = sth->times[TXG_STATE_SYNCED] -
302 sth->times[TXG_STATE_WAIT_FOR_SYNC];
303
304 (void) snprintf(buf, size, "%-8llu %-16llu %-5c %-12llu "
305 "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu %-12llu\n",
306 (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state,
307 (u_longlong_t)sth->ndirty,
308 (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten,
309 (u_longlong_t)sth->reads, (u_longlong_t)sth->writes,
310 (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)wait,
311 (u_longlong_t)sync);
312
313 return (0);
314 }
315
316 /*
317 * Calculate the address for the next spa_stats_history_t entry. The
318 * ssh->lock will be held until ksp->ks_ndata entries are processed.
319 */
320 static void *
321 spa_txg_history_addr(kstat_t *ksp, loff_t n)
322 {
323 spa_t *spa = ksp->ks_private;
324 spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
325
326 ASSERT(MUTEX_HELD(&ssh->lock));
327
328 if (n == 0)
329 ssh->private = list_tail(&ssh->list);
330 else if (ssh->private)
331 ssh->private = list_prev(&ssh->list, ssh->private);
332
333 return (ssh->private);
334 }
335
336 /*
337 * When the kstat is written discard all spa_txg_history_t entries. The
338 * ssh->lock will be held until ksp->ks_ndata entries are processed.
339 */
340 static int
341 spa_txg_history_update(kstat_t *ksp, int rw)
342 {
343 spa_t *spa = ksp->ks_private;
344 spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
345
346 ASSERT(MUTEX_HELD(&ssh->lock));
347
348 if (rw == KSTAT_WRITE) {
349 spa_txg_history_t *sth;
350
351 while ((sth = list_remove_head(&ssh->list))) {
352 ssh->size--;
353 kmem_free(sth, sizeof (spa_txg_history_t));
354 }
355
356 ASSERT3U(ssh->size, ==, 0);
357 }
358
359 ksp->ks_ndata = ssh->size;
360 ksp->ks_data_size = ssh->size * sizeof (spa_txg_history_t);
361
362 return (0);
363 }
364
365 static void
366 spa_txg_history_init(spa_t *spa)
367 {
368 spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
369 char *name;
370 kstat_t *ksp;
371
372 mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);
373 list_create(&ssh->list, sizeof (spa_txg_history_t),
374 offsetof(spa_txg_history_t, sth_link));
375
376 ssh->count = 0;
377 ssh->size = 0;
378 ssh->private = NULL;
379
380 name = kmem_asprintf("zfs/%s", spa_name(spa));
381
382 ksp = kstat_create(name, 0, "txgs", "misc",
383 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
384 ssh->kstat = ksp;
385
386 if (ksp) {
387 ksp->ks_lock = &ssh->lock;
388 ksp->ks_data = NULL;
389 ksp->ks_private = spa;
390 ksp->ks_update = spa_txg_history_update;
391 kstat_set_raw_ops(ksp, spa_txg_history_headers,
392 spa_txg_history_data, spa_txg_history_addr);
393 kstat_install(ksp);
394 }
395 strfree(name);
396 }
397
398 static void
399 spa_txg_history_destroy(spa_t *spa)
400 {
401 spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
402 spa_txg_history_t *sth;
403 kstat_t *ksp;
404
405 ksp = ssh->kstat;
406 if (ksp)
407 kstat_delete(ksp);
408
409 mutex_enter(&ssh->lock);
410 while ((sth = list_remove_head(&ssh->list))) {
411 ssh->size--;
412 kmem_free(sth, sizeof (spa_txg_history_t));
413 }
414
415 ASSERT3U(ssh->size, ==, 0);
416 list_destroy(&ssh->list);
417 mutex_exit(&ssh->lock);
418
419 mutex_destroy(&ssh->lock);
420 }
421
422 /*
423 * Add a new txg to historical record.
424 */
425 void
426 spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time)
427 {
428 spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
429 spa_txg_history_t *sth, *rm;
430
431 if (zfs_txg_history == 0 && ssh->size == 0)
432 return;
433
434 sth = kmem_zalloc(sizeof (spa_txg_history_t), KM_SLEEP);
435 sth->txg = txg;
436 sth->state = TXG_STATE_OPEN;
437 sth->times[TXG_STATE_BIRTH] = birth_time;
438
439 mutex_enter(&ssh->lock);
440
441 list_insert_head(&ssh->list, sth);
442 ssh->size++;
443
444 while (ssh->size > zfs_txg_history) {
445 ssh->size--;
446 rm = list_remove_tail(&ssh->list);
447 kmem_free(rm, sizeof (spa_txg_history_t));
448 }
449
450 mutex_exit(&ssh->lock);
451 }
452
453 /*
454 * Set txg state completion time and increment current state.
455 */
456 int
457 spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state,
458 hrtime_t completed_time)
459 {
460 spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
461 spa_txg_history_t *sth;
462 int error = ENOENT;
463
464 if (zfs_txg_history == 0)
465 return (0);
466
467 mutex_enter(&ssh->lock);
468 for (sth = list_head(&ssh->list); sth != NULL;
469 sth = list_next(&ssh->list, sth)) {
470 if (sth->txg == txg) {
471 sth->times[completed_state] = completed_time;
472 sth->state++;
473 error = 0;
474 break;
475 }
476 }
477 mutex_exit(&ssh->lock);
478
479 return (error);
480 }
481
482 /*
483 * Set txg IO stats.
484 */
485 static int
486 spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread,
487 uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty)
488 {
489 spa_stats_history_t *ssh = &spa->spa_stats.txg_history;
490 spa_txg_history_t *sth;
491 int error = ENOENT;
492
493 if (zfs_txg_history == 0)
494 return (0);
495
496 mutex_enter(&ssh->lock);
497 for (sth = list_head(&ssh->list); sth != NULL;
498 sth = list_next(&ssh->list, sth)) {
499 if (sth->txg == txg) {
500 sth->nread = nread;
501 sth->nwritten = nwritten;
502 sth->reads = reads;
503 sth->writes = writes;
504 sth->ndirty = ndirty;
505 error = 0;
506 break;
507 }
508 }
509 mutex_exit(&ssh->lock);
510
511 return (error);
512 }
513
514 txg_stat_t *
515 spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp)
516 {
517 txg_stat_t *ts;
518
519 if (zfs_txg_history == 0)
520 return (NULL);
521
522 ts = kmem_alloc(sizeof (txg_stat_t), KM_SLEEP);
523
524 spa_config_enter(spa, SCL_ALL, FTAG, RW_READER);
525 vdev_get_stats(spa->spa_root_vdev, &ts->vs1);
526 spa_config_exit(spa, SCL_ALL, FTAG);
527
528 ts->txg = txg;
529 ts->ndirty = dp->dp_dirty_pertxg[txg & TXG_MASK];
530
531 spa_txg_history_set(spa, txg, TXG_STATE_WAIT_FOR_SYNC, gethrtime());
532
533 return (ts);
534 }
535
536 void
537 spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts)
538 {
539 if (ts == NULL)
540 return;
541
542 if (zfs_txg_history == 0) {
543 kmem_free(ts, sizeof (txg_stat_t));
544 return;
545 }
546
547 spa_config_enter(spa, SCL_ALL, FTAG, RW_READER);
548 vdev_get_stats(spa->spa_root_vdev, &ts->vs2);
549 spa_config_exit(spa, SCL_ALL, FTAG);
550
551 spa_txg_history_set(spa, ts->txg, TXG_STATE_SYNCED, gethrtime());
552 spa_txg_history_set_io(spa, ts->txg,
553 ts->vs2.vs_bytes[ZIO_TYPE_READ] - ts->vs1.vs_bytes[ZIO_TYPE_READ],
554 ts->vs2.vs_bytes[ZIO_TYPE_WRITE] - ts->vs1.vs_bytes[ZIO_TYPE_WRITE],
555 ts->vs2.vs_ops[ZIO_TYPE_READ] - ts->vs1.vs_ops[ZIO_TYPE_READ],
556 ts->vs2.vs_ops[ZIO_TYPE_WRITE] - ts->vs1.vs_ops[ZIO_TYPE_WRITE],
557 ts->ndirty);
558
559 kmem_free(ts, sizeof (txg_stat_t));
560 }
561
562 /*
563 * ==========================================================================
564 * SPA TX Assign Histogram Routines
565 * ==========================================================================
566 */
567
568 /*
569 * Tx statistics - Information exported regarding dmu_tx_assign time.
570 */
571
572 /*
573 * When the kstat is written zero all buckets. When the kstat is read
574 * count the number of trailing buckets set to zero and update ks_ndata
575 * such that they are not output.
576 */
577 static int
578 spa_tx_assign_update(kstat_t *ksp, int rw)
579 {
580 spa_t *spa = ksp->ks_private;
581 spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram;
582 int i;
583
584 if (rw == KSTAT_WRITE) {
585 for (i = 0; i < ssh->count; i++)
586 ((kstat_named_t *)ssh->private)[i].value.ui64 = 0;
587 }
588
589 for (i = ssh->count; i > 0; i--)
590 if (((kstat_named_t *)ssh->private)[i-1].value.ui64 != 0)
591 break;
592
593 ksp->ks_ndata = i;
594 ksp->ks_data_size = i * sizeof (kstat_named_t);
595
596 return (0);
597 }
598
599 static void
600 spa_tx_assign_init(spa_t *spa)
601 {
602 spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram;
603 char *name;
604 kstat_named_t *ks;
605 kstat_t *ksp;
606 int i;
607
608 mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);
609
610 ssh->count = 42; /* power of two buckets for 1ns to 2,199s */
611 ssh->size = ssh->count * sizeof (kstat_named_t);
612 ssh->private = kmem_alloc(ssh->size, KM_SLEEP);
613
614 name = kmem_asprintf("zfs/%s", spa_name(spa));
615
616 for (i = 0; i < ssh->count; i++) {
617 ks = &((kstat_named_t *)ssh->private)[i];
618 ks->data_type = KSTAT_DATA_UINT64;
619 ks->value.ui64 = 0;
620 (void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns",
621 (u_longlong_t)1 << i);
622 }
623
624 ksp = kstat_create(name, 0, "dmu_tx_assign", "misc",
625 KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL);
626 ssh->kstat = ksp;
627
628 if (ksp) {
629 ksp->ks_lock = &ssh->lock;
630 ksp->ks_data = ssh->private;
631 ksp->ks_ndata = ssh->count;
632 ksp->ks_data_size = ssh->size;
633 ksp->ks_private = spa;
634 ksp->ks_update = spa_tx_assign_update;
635 kstat_install(ksp);
636 }
637 strfree(name);
638 }
639
640 static void
641 spa_tx_assign_destroy(spa_t *spa)
642 {
643 spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram;
644 kstat_t *ksp;
645
646 ksp = ssh->kstat;
647 if (ksp)
648 kstat_delete(ksp);
649
650 kmem_free(ssh->private, ssh->size);
651 mutex_destroy(&ssh->lock);
652 }
653
654 void
655 spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs)
656 {
657 spa_stats_history_t *ssh = &spa->spa_stats.tx_assign_histogram;
658 uint64_t idx = 0;
659
660 while (((1ULL << idx) < nsecs) && (idx < ssh->size - 1))
661 idx++;
662
663 atomic_inc_64(&((kstat_named_t *)ssh->private)[idx].value.ui64);
664 }
665
666 /*
667 * ==========================================================================
668 * SPA IO History Routines
669 * ==========================================================================
670 */
671 static int
672 spa_io_history_update(kstat_t *ksp, int rw)
673 {
674 if (rw == KSTAT_WRITE)
675 memset(ksp->ks_data, 0, ksp->ks_data_size);
676
677 return (0);
678 }
679
680 static void
681 spa_io_history_init(spa_t *spa)
682 {
683 spa_stats_history_t *ssh = &spa->spa_stats.io_history;
684 char *name;
685 kstat_t *ksp;
686
687 mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);
688
689 name = kmem_asprintf("zfs/%s", spa_name(spa));
690
691 ksp = kstat_create(name, 0, "io", "disk", KSTAT_TYPE_IO, 1, 0);
692 ssh->kstat = ksp;
693
694 if (ksp) {
695 ksp->ks_lock = &ssh->lock;
696 ksp->ks_private = spa;
697 ksp->ks_update = spa_io_history_update;
698 kstat_install(ksp);
699 }
700 strfree(name);
701 }
702
703 static void
704 spa_io_history_destroy(spa_t *spa)
705 {
706 spa_stats_history_t *ssh = &spa->spa_stats.io_history;
707
708 if (ssh->kstat)
709 kstat_delete(ssh->kstat);
710
711 mutex_destroy(&ssh->lock);
712 }
713
714 /*
715 * ==========================================================================
716 * SPA MMP History Routines
717 * ==========================================================================
718 */
719
/*
 * MMP statistics - Information exported regarding attempted MMP writes.
 *
 * For MMP writes issued, fields are used as per the comments below.
 * For MMP writes skipped, an entry represents a span of time when
 * writes were skipped for the same reason (error from mmp_random_leaf).
 * In a skip entry these fields have a different meaning:
 * timestamp	time first write skipped, if >1 skipped in a row
 * mmp_delay	delay value at timestamp
 * vdev_guid	number of writes skipped
 * io_error	one of enum mmp_error
 * duration	time span (ns) of skipped writes
 *
 * A skip entry is recognized by a non-zero error_start.
 */

typedef struct spa_mmp_history {
	uint64_t	mmp_kstat_id;	/* unique # for updates */
	uint64_t	txg;		/* txg of last sync */
	uint64_t	timestamp;	/* UTC time MMP write issued */
	uint64_t	mmp_delay;	/* mmp_thread.mmp_delay at timestamp */
	uint64_t	vdev_guid;	/* unique ID of leaf vdev */
	char		*vdev_path;	/* copy of vdev path, NULL if none */
	int		vdev_label;	/* vdev label */
	int		io_error;	/* error status of MMP write */
	hrtime_t	error_start;	/* hrtime of start of error period */
	hrtime_t	duration;	/* time from submission to completion */
	list_node_t	smh_link;	/* linkage on the history list */
} spa_mmp_history_t;
746
/*
 * Write the column header line for the multihost history kstat into buf.
 * Output is bounded by size (snprintf truncates); always returns 0.
 */
static int
spa_mmp_history_headers(char *buf, size_t size)
{
	static const char hdr_fmt[] =
	    "%-10s %-10s %-10s %-6s %-10s %-12s %-24s %-10s %s\n";

	(void) snprintf(buf, size, hdr_fmt,
	    "id", "txg", "timestamp", "error", "duration",
	    "mmp_delay", "vdev_guid", "vdev_label", "vdev_path");

	return (0);
}
755
756 static int
757 spa_mmp_history_data(char *buf, size_t size, void *data)
758 {
759 spa_mmp_history_t *smh = (spa_mmp_history_t *)data;
760 char skip_fmt[] = "%-10llu %-10llu %10llu %#6llx %10lld %12llu %-24llu "
761 "%-10lld %s\n";
762 char write_fmt[] = "%-10llu %-10llu %10llu %6lld %10lld %12llu %-24llu "
763 "%-10lld %s\n";
764
765 (void) snprintf(buf, size, (smh->error_start ? skip_fmt : write_fmt),
766 (u_longlong_t)smh->mmp_kstat_id, (u_longlong_t)smh->txg,
767 (u_longlong_t)smh->timestamp, (longlong_t)smh->io_error,
768 (longlong_t)smh->duration, (u_longlong_t)smh->mmp_delay,
769 (u_longlong_t)smh->vdev_guid, (u_longlong_t)smh->vdev_label,
770 (smh->vdev_path ? smh->vdev_path : "-"));
771
772 return (0);
773 }
774
775 /*
776 * Calculate the address for the next spa_stats_history_t entry. The
777 * ssh->lock will be held until ksp->ks_ndata entries are processed.
778 */
779 static void *
780 spa_mmp_history_addr(kstat_t *ksp, loff_t n)
781 {
782 spa_t *spa = ksp->ks_private;
783 spa_stats_history_t *ssh = &spa->spa_stats.mmp_history;
784
785 ASSERT(MUTEX_HELD(&ssh->lock));
786
787 if (n == 0)
788 ssh->private = list_tail(&ssh->list);
789 else if (ssh->private)
790 ssh->private = list_prev(&ssh->list, ssh->private);
791
792 return (ssh->private);
793 }
794
795 /*
796 * When the kstat is written discard all spa_mmp_history_t entries. The
797 * ssh->lock will be held until ksp->ks_ndata entries are processed.
798 */
799 static int
800 spa_mmp_history_update(kstat_t *ksp, int rw)
801 {
802 spa_t *spa = ksp->ks_private;
803 spa_stats_history_t *ssh = &spa->spa_stats.mmp_history;
804
805 ASSERT(MUTEX_HELD(&ssh->lock));
806
807 if (rw == KSTAT_WRITE) {
808 spa_mmp_history_t *smh;
809
810 while ((smh = list_remove_head(&ssh->list))) {
811 ssh->size--;
812 if (smh->vdev_path)
813 strfree(smh->vdev_path);
814 kmem_free(smh, sizeof (spa_mmp_history_t));
815 }
816
817 ASSERT3U(ssh->size, ==, 0);
818 }
819
820 ksp->ks_ndata = ssh->size;
821 ksp->ks_data_size = ssh->size * sizeof (spa_mmp_history_t);
822
823 return (0);
824 }
825
826 static void
827 spa_mmp_history_init(spa_t *spa)
828 {
829 spa_stats_history_t *ssh = &spa->spa_stats.mmp_history;
830 char *name;
831 kstat_t *ksp;
832
833 mutex_init(&ssh->lock, NULL, MUTEX_DEFAULT, NULL);
834 list_create(&ssh->list, sizeof (spa_mmp_history_t),
835 offsetof(spa_mmp_history_t, smh_link));
836
837 ssh->count = 0;
838 ssh->size = 0;
839 ssh->private = NULL;
840
841 name = kmem_asprintf("zfs/%s", spa_name(spa));
842
843 ksp = kstat_create(name, 0, "multihost", "misc",
844 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
845 ssh->kstat = ksp;
846
847 if (ksp) {
848 ksp->ks_lock = &ssh->lock;
849 ksp->ks_data = NULL;
850 ksp->ks_private = spa;
851 ksp->ks_update = spa_mmp_history_update;
852 kstat_set_raw_ops(ksp, spa_mmp_history_headers,
853 spa_mmp_history_data, spa_mmp_history_addr);
854 kstat_install(ksp);
855 }
856 strfree(name);
857 }
858
859 static void
860 spa_mmp_history_destroy(spa_t *spa)
861 {
862 spa_stats_history_t *ssh = &spa->spa_stats.mmp_history;
863 spa_mmp_history_t *smh;
864 kstat_t *ksp;
865
866 ksp = ssh->kstat;
867 if (ksp)
868 kstat_delete(ksp);
869
870 mutex_enter(&ssh->lock);
871 while ((smh = list_remove_head(&ssh->list))) {
872 ssh->size--;
873 if (smh->vdev_path)
874 strfree(smh->vdev_path);
875 kmem_free(smh, sizeof (spa_mmp_history_t));
876 }
877
878 ASSERT3U(ssh->size, ==, 0);
879 list_destroy(&ssh->list);
880 mutex_exit(&ssh->lock);
881
882 mutex_destroy(&ssh->lock);
883 }
884
885 /*
886 * Set duration in existing "skip" record to how long we have waited for a leaf
887 * vdev to become available.
888 *
889 * Important that we start search at the head of the list where new
890 * records are inserted, so this is normally an O(1) operation.
891 */
892 int
893 spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_kstat_id)
894 {
895 spa_stats_history_t *ssh = &spa->spa_stats.mmp_history;
896 spa_mmp_history_t *smh;
897 int error = ENOENT;
898
899 if (zfs_multihost_history == 0 && ssh->size == 0)
900 return (0);
901
902 mutex_enter(&ssh->lock);
903 for (smh = list_head(&ssh->list); smh != NULL;
904 smh = list_next(&ssh->list, smh)) {
905 if (smh->mmp_kstat_id == mmp_kstat_id) {
906 ASSERT3U(smh->io_error, !=, 0);
907 smh->duration = gethrtime() - smh->error_start;
908 smh->vdev_guid++;
909 error = 0;
910 break;
911 }
912 }
913 mutex_exit(&ssh->lock);
914
915 return (error);
916 }
917
918 /*
919 * Set MMP write duration and error status in existing record.
920 * See comment re: search order above spa_mmp_history_set_skip().
921 */
922 int
923 spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error,
924 hrtime_t duration)
925 {
926 spa_stats_history_t *ssh = &spa->spa_stats.mmp_history;
927 spa_mmp_history_t *smh;
928 int error = ENOENT;
929
930 if (zfs_multihost_history == 0 && ssh->size == 0)
931 return (0);
932
933 mutex_enter(&ssh->lock);
934 for (smh = list_head(&ssh->list); smh != NULL;
935 smh = list_next(&ssh->list, smh)) {
936 if (smh->mmp_kstat_id == mmp_kstat_id) {
937 ASSERT(smh->io_error == 0);
938 smh->io_error = io_error;
939 smh->duration = duration;
940 error = 0;
941 break;
942 }
943 }
944 mutex_exit(&ssh->lock);
945
946 return (error);
947 }
948
949 /*
950 * Add a new MMP historical record.
951 * error == 0 : a write was issued.
952 * error != 0 : a write was not issued because no leaves were found.
953 */
954 void *
955 spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
956 uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_kstat_id,
957 int error)
958 {
959 spa_stats_history_t *ssh = &spa->spa_stats.mmp_history;
960 spa_mmp_history_t *smh, *rm;
961
962 if (zfs_multihost_history == 0 && ssh->size == 0)
963 return (NULL);
964
965 smh = kmem_zalloc(sizeof (spa_mmp_history_t), KM_SLEEP);
966 smh->txg = txg;
967 smh->timestamp = timestamp;
968 smh->mmp_delay = mmp_delay;
969 if (vd) {
970 smh->vdev_guid = vd->vdev_guid;
971 if (vd->vdev_path)
972 smh->vdev_path = strdup(vd->vdev_path);
973 }
974 smh->vdev_label = label;
975 smh->mmp_kstat_id = mmp_kstat_id;
976
977 if (error) {
978 smh->io_error = error;
979 smh->error_start = gethrtime();
980 smh->vdev_guid = 1;
981 }
982
983 mutex_enter(&ssh->lock);
984
985 list_insert_head(&ssh->list, smh);
986 ssh->size++;
987
988 while (ssh->size > zfs_multihost_history) {
989 ssh->size--;
990 rm = list_remove_tail(&ssh->list);
991 if (rm->vdev_path)
992 strfree(rm->vdev_path);
993 kmem_free(rm, sizeof (spa_mmp_history_t));
994 }
995
996 mutex_exit(&ssh->lock);
997 return ((void *)smh);
998 }
999
1000 void
1001 spa_stats_init(spa_t *spa)
1002 {
1003 spa_read_history_init(spa);
1004 spa_txg_history_init(spa);
1005 spa_tx_assign_init(spa);
1006 spa_io_history_init(spa);
1007 spa_mmp_history_init(spa);
1008 }
1009
1010 void
1011 spa_stats_destroy(spa_t *spa)
1012 {
1013 spa_tx_assign_destroy(spa);
1014 spa_txg_history_destroy(spa);
1015 spa_read_history_destroy(spa);
1016 spa_io_history_destroy(spa);
1017 spa_mmp_history_destroy(spa);
1018 }
1019
/*
 * Register the history tunables defined near the top of this file as
 * integer module parameters with mode 0644.  Only built when both _KERNEL
 * and HAVE_SPL are defined.
 */
#if defined(_KERNEL) && defined(HAVE_SPL)
/* CSTYLED */
module_param(zfs_read_history, int, 0644);
MODULE_PARM_DESC(zfs_read_history,
	"Historical statistics for the last N reads");

module_param(zfs_read_history_hits, int, 0644);
MODULE_PARM_DESC(zfs_read_history_hits,
	"Include cache hits in read history");

module_param(zfs_txg_history, int, 0644);
MODULE_PARM_DESC(zfs_txg_history,
	"Historical statistics for the last N txgs");

module_param(zfs_multihost_history, int, 0644);
MODULE_PARM_DESC(zfs_multihost_history,
	"Historical statistics for last N multihost writes");
/* END CSTYLED */
#endif