]>
Commit | Line | Data |
---|---|---|
1421c891 PS |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | ||
22 | #include <sys/zfs_context.h> | |
23 | #include <sys/spa_impl.h> | |
379ca9cf | 24 | #include <sys/vdev_impl.h> |
f0ed6c74 TH |
25 | #include <sys/spa.h> |
26 | #include <zfs_comutil.h> | |
1421c891 PS |
27 | |
/*
 * Number of most recent reads to keep statistics for, per spa_t.
 * Zero (the default) disables the read history.
 */
int zfs_read_history = 0;

/*
 * When non-zero, reads satisfied from the ARC cache are recorded in the
 * read history as well.  Disabled by default.
 */
int zfs_read_history_hits = 0;

/*
 * Number of most recent txgs to keep statistics for; 100 by default.
 */
int zfs_txg_history = 100;

/*
 * Number of most recent MMP updates to keep statistics for.
 * Zero (the default) disables the MMP history.
 */
int zfs_multihost_history = 0;
47 | ||
1421c891 PS |
48 | /* |
49 | * ========================================================================== | |
50 | * SPA Read History Routines | |
51 | * ========================================================================== | |
52 | */ | |
53 | ||
54 | /* | |
55 | * Read statistics - Information exported regarding each arc_read call | |
56 | */ | |
57 | typedef struct spa_read_history { | |
1421c891 PS |
58 | hrtime_t start; /* time read completed */ |
59 | uint64_t objset; /* read from this objset */ | |
60 | uint64_t object; /* read of this object number */ | |
61 | uint64_t level; /* block's indirection level */ | |
62 | uint64_t blkid; /* read of this block id */ | |
63 | char origin[24]; /* read originated from here */ | |
64 | uint32_t aflags; /* ARC flags (cached, prefetch, etc.) */ | |
65 | pid_t pid; /* PID of task doing read */ | |
66 | char comm[16]; /* process name of task doing read */ | |
d1261452 | 67 | procfs_list_node_t srh_node; |
1421c891 PS |
68 | } spa_read_history_t; |
69 | ||
/*
 * Emit the column headings for the "reads" procfs list.  Always returns 0.
 */
static int
spa_read_history_show_header(struct seq_file *f)
{
	seq_printf(f,
	    "%-8s %-16s %-8s %-8s %-8s %-8s %-8s %-24s %-8s %-16s\n",
	    "UID", "start", "objset", "object", "level", "blkid",
	    "aflags", "origin", "pid", "process");

	return (0);
}
79 | ||
80 | static int | |
d1261452 | 81 | spa_read_history_show(struct seq_file *f, void *data) |
1421c891 PS |
82 | { |
83 | spa_read_history_t *srh = (spa_read_history_t *)data; | |
84 | ||
d1261452 | 85 | seq_printf(f, "%-8llu %-16llu 0x%-6llx " |
1421c891 | 86 | "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n", |
d1261452 | 87 | (u_longlong_t)srh->srh_node.pln_id, srh->start, |
1421c891 PS |
88 | (longlong_t)srh->objset, (longlong_t)srh->object, |
89 | (longlong_t)srh->level, (longlong_t)srh->blkid, | |
90 | srh->aflags, srh->origin, srh->pid, srh->comm); | |
1421c891 PS |
91 | |
92 | return (0); | |
93 | } | |
94 | ||
d1261452 JG |
95 | /* Remove oldest elements from list until there are no more than 'size' left */ |
96 | static void | |
97 | spa_read_history_truncate(spa_history_list_t *shl, unsigned int size) | |
1421c891 | 98 | { |
d1261452 JG |
99 | spa_read_history_t *srh; |
100 | while (shl->size > size) { | |
101 | srh = list_remove_head(&shl->procfs_list.pl_list); | |
102 | ASSERT3P(srh, !=, NULL); | |
103 | kmem_free(srh, sizeof (spa_read_history_t)); | |
104 | shl->size--; | |
105 | } | |
1421c891 | 106 | |
d1261452 JG |
107 | if (size == 0) |
108 | ASSERT(list_is_empty(&shl->procfs_list.pl_list)); | |
1421c891 PS |
109 | } |
110 | ||
1421c891 | 111 | static int |
d1261452 | 112 | spa_read_history_clear(procfs_list_t *procfs_list) |
1421c891 | 113 | { |
d1261452 JG |
114 | spa_history_list_t *shl = procfs_list->pl_private; |
115 | mutex_enter(&procfs_list->pl_lock); | |
116 | spa_read_history_truncate(shl, 0); | |
117 | mutex_exit(&procfs_list->pl_lock); | |
1421c891 PS |
118 | return (0); |
119 | } | |
120 | ||
121 | static void | |
122 | spa_read_history_init(spa_t *spa) | |
123 | { | |
d1261452 JG |
124 | spa_history_list_t *shl = &spa->spa_stats.read_history; |
125 | char *module; | |
1421c891 | 126 | |
d1261452 | 127 | shl->size = 0; |
1421c891 | 128 | |
d1261452 | 129 | module = kmem_asprintf("zfs/%s", spa_name(spa)); |
1421c891 | 130 | |
d1261452 JG |
131 | shl->procfs_list.pl_private = shl; |
132 | procfs_list_install(module, | |
133 | "reads", | |
134 | &shl->procfs_list, | |
135 | spa_read_history_show, | |
136 | spa_read_history_show_header, | |
137 | spa_read_history_clear, | |
138 | offsetof(spa_read_history_t, srh_node)); | |
1421c891 | 139 | |
d1261452 | 140 | strfree(module); |
1421c891 PS |
141 | } |
142 | ||
143 | static void | |
144 | spa_read_history_destroy(spa_t *spa) | |
145 | { | |
d1261452 JG |
146 | spa_history_list_t *shl = &spa->spa_stats.read_history; |
147 | procfs_list_uninstall(&shl->procfs_list); | |
148 | spa_read_history_truncate(shl, 0); | |
149 | procfs_list_destroy(&shl->procfs_list); | |
1421c891 PS |
150 | } |
151 | ||
152 | void | |
5dbd68a3 | 153 | spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags) |
1421c891 | 154 | { |
d1261452 JG |
155 | spa_history_list_t *shl = &spa->spa_stats.read_history; |
156 | spa_read_history_t *srh; | |
1421c891 PS |
157 | |
158 | ASSERT3P(spa, !=, NULL); | |
159 | ASSERT3P(zb, !=, NULL); | |
160 | ||
d1261452 | 161 | if (zfs_read_history == 0 && shl->size == 0) |
1421c891 PS |
162 | return; |
163 | ||
2a432414 | 164 | if (zfs_read_history_hits == 0 && (aflags & ARC_FLAG_CACHED)) |
1421c891 PS |
165 | return; |
166 | ||
79c76d5b | 167 | srh = kmem_zalloc(sizeof (spa_read_history_t), KM_SLEEP); |
d1d7e268 | 168 | strlcpy(srh->comm, getcomm(), sizeof (srh->comm)); |
1421c891 PS |
169 | srh->start = gethrtime(); |
170 | srh->objset = zb->zb_objset; | |
171 | srh->object = zb->zb_object; | |
172 | srh->level = zb->zb_level; | |
173 | srh->blkid = zb->zb_blkid; | |
174 | srh->aflags = aflags; | |
175 | srh->pid = getpid(); | |
176 | ||
d1261452 | 177 | mutex_enter(&shl->procfs_list.pl_lock); |
1421c891 | 178 | |
d1261452 JG |
179 | procfs_list_add(&shl->procfs_list, srh); |
180 | shl->size++; | |
1421c891 | 181 | |
d1261452 | 182 | spa_read_history_truncate(shl, zfs_read_history); |
1421c891 | 183 | |
d1261452 | 184 | mutex_exit(&shl->procfs_list.pl_lock); |
1421c891 PS |
185 | } |
186 | ||
0b1401ee BB |
187 | /* |
188 | * ========================================================================== | |
189 | * SPA TXG History Routines | |
190 | * ========================================================================== | |
191 | */ | |
192 | ||
193 | /* | |
194 | * Txg statistics - Information exported regarding each txg sync | |
195 | */ | |
196 | ||
197 | typedef struct spa_txg_history { | |
198 | uint64_t txg; /* txg id */ | |
199 | txg_state_t state; /* active txg state */ | |
200 | uint64_t nread; /* number of bytes read */ | |
201 | uint64_t nwritten; /* number of bytes written */ | |
202 | uint64_t reads; /* number of read operations */ | |
203 | uint64_t writes; /* number of write operations */ | |
3ccab252 | 204 | uint64_t ndirty; /* number of dirty bytes */ |
0b1401ee | 205 | hrtime_t times[TXG_STATE_COMMITTED]; /* completion times */ |
d1261452 | 206 | procfs_list_node_t sth_node; |
0b1401ee BB |
207 | } spa_txg_history_t; |
208 | ||
/*
 * Emit the column headings for the "txgs" procfs list.  Always returns 0.
 */
static int
spa_txg_history_show_header(struct seq_file *f)
{
	seq_printf(f,
	    "%-8s %-16s %-5s %-12s %-12s %-12s "
	    "%-8s %-8s %-12s %-12s %-12s %-12s\n",
	    "txg", "birth", "state", "ndirty", "nread", "nwritten",
	    "reads", "writes", "otime", "qtime", "wtime", "stime");

	return (0);
}
218 | ||
219 | static int | |
d1261452 | 220 | spa_txg_history_show(struct seq_file *f, void *data) |
0b1401ee BB |
221 | { |
222 | spa_txg_history_t *sth = (spa_txg_history_t *)data; | |
478d64fd | 223 | uint64_t open = 0, quiesce = 0, wait = 0, sync = 0; |
0b1401ee BB |
224 | char state; |
225 | ||
226 | switch (sth->state) { | |
227 | case TXG_STATE_BIRTH: state = 'B'; break; | |
228 | case TXG_STATE_OPEN: state = 'O'; break; | |
229 | case TXG_STATE_QUIESCED: state = 'Q'; break; | |
478d64fd | 230 | case TXG_STATE_WAIT_FOR_SYNC: state = 'W'; break; |
0b1401ee BB |
231 | case TXG_STATE_SYNCED: state = 'S'; break; |
232 | case TXG_STATE_COMMITTED: state = 'C'; break; | |
233 | default: state = '?'; break; | |
234 | } | |
235 | ||
236 | if (sth->times[TXG_STATE_OPEN]) | |
237 | open = sth->times[TXG_STATE_OPEN] - | |
238 | sth->times[TXG_STATE_BIRTH]; | |
239 | ||
240 | if (sth->times[TXG_STATE_QUIESCED]) | |
241 | quiesce = sth->times[TXG_STATE_QUIESCED] - | |
242 | sth->times[TXG_STATE_OPEN]; | |
243 | ||
478d64fd IL |
244 | if (sth->times[TXG_STATE_WAIT_FOR_SYNC]) |
245 | wait = sth->times[TXG_STATE_WAIT_FOR_SYNC] - | |
246 | sth->times[TXG_STATE_QUIESCED]; | |
247 | ||
0b1401ee BB |
248 | if (sth->times[TXG_STATE_SYNCED]) |
249 | sync = sth->times[TXG_STATE_SYNCED] - | |
478d64fd | 250 | sth->times[TXG_STATE_WAIT_FOR_SYNC]; |
0b1401ee | 251 | |
d1261452 | 252 | seq_printf(f, "%-8llu %-16llu %-5c %-12llu " |
478d64fd | 253 | "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu %-12llu\n", |
0b1401ee | 254 | (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state, |
3ccab252 | 255 | (u_longlong_t)sth->ndirty, |
0b1401ee BB |
256 | (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten, |
257 | (u_longlong_t)sth->reads, (u_longlong_t)sth->writes, | |
478d64fd IL |
258 | (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)wait, |
259 | (u_longlong_t)sync); | |
0b1401ee BB |
260 | |
261 | return (0); | |
262 | } | |
263 | ||
d1261452 JG |
264 | /* Remove oldest elements from list until there are no more than 'size' left */ |
265 | static void | |
266 | spa_txg_history_truncate(spa_history_list_t *shl, unsigned int size) | |
0b1401ee | 267 | { |
d1261452 JG |
268 | spa_txg_history_t *sth; |
269 | while (shl->size > size) { | |
270 | sth = list_remove_head(&shl->procfs_list.pl_list); | |
271 | ASSERT3P(sth, !=, NULL); | |
272 | kmem_free(sth, sizeof (spa_txg_history_t)); | |
273 | shl->size--; | |
274 | } | |
0b1401ee | 275 | |
d1261452 JG |
276 | if (size == 0) |
277 | ASSERT(list_is_empty(&shl->procfs_list.pl_list)); | |
0b1401ee | 278 | |
0b1401ee BB |
279 | } |
280 | ||
0b1401ee | 281 | static int |
d1261452 | 282 | spa_txg_history_clear(procfs_list_t *procfs_list) |
0b1401ee | 283 | { |
d1261452 JG |
284 | spa_history_list_t *shl = procfs_list->pl_private; |
285 | mutex_enter(&procfs_list->pl_lock); | |
286 | spa_txg_history_truncate(shl, 0); | |
287 | mutex_exit(&procfs_list->pl_lock); | |
0b1401ee BB |
288 | return (0); |
289 | } | |
290 | ||
291 | static void | |
292 | spa_txg_history_init(spa_t *spa) | |
293 | { | |
d1261452 JG |
294 | spa_history_list_t *shl = &spa->spa_stats.txg_history; |
295 | char *module; | |
0b1401ee | 296 | |
d1261452 | 297 | shl->size = 0; |
0b1401ee | 298 | |
d1261452 | 299 | module = kmem_asprintf("zfs/%s", spa_name(spa)); |
0b1401ee | 300 | |
d1261452 JG |
301 | shl->procfs_list.pl_private = shl; |
302 | procfs_list_install(module, | |
303 | "txgs", | |
304 | &shl->procfs_list, | |
305 | spa_txg_history_show, | |
306 | spa_txg_history_show_header, | |
307 | spa_txg_history_clear, | |
308 | offsetof(spa_txg_history_t, sth_node)); | |
0b1401ee | 309 | |
d1261452 | 310 | strfree(module); |
0b1401ee BB |
311 | } |
312 | ||
313 | static void | |
314 | spa_txg_history_destroy(spa_t *spa) | |
315 | { | |
d1261452 JG |
316 | spa_history_list_t *shl = &spa->spa_stats.txg_history; |
317 | procfs_list_uninstall(&shl->procfs_list); | |
318 | spa_txg_history_truncate(shl, 0); | |
319 | procfs_list_destroy(&shl->procfs_list); | |
0b1401ee BB |
320 | } |
321 | ||
322 | /* | |
323 | * Add a new txg to historical record. | |
324 | */ | |
325 | void | |
01b738f4 | 326 | spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time) |
0b1401ee | 327 | { |
d1261452 JG |
328 | spa_history_list_t *shl = &spa->spa_stats.txg_history; |
329 | spa_txg_history_t *sth; | |
0b1401ee | 330 | |
d1261452 | 331 | if (zfs_txg_history == 0 && shl->size == 0) |
0b1401ee BB |
332 | return; |
333 | ||
79c76d5b | 334 | sth = kmem_zalloc(sizeof (spa_txg_history_t), KM_SLEEP); |
0b1401ee BB |
335 | sth->txg = txg; |
336 | sth->state = TXG_STATE_OPEN; | |
01b738f4 | 337 | sth->times[TXG_STATE_BIRTH] = birth_time; |
0b1401ee | 338 | |
d1261452 JG |
339 | mutex_enter(&shl->procfs_list.pl_lock); |
340 | procfs_list_add(&shl->procfs_list, sth); | |
341 | shl->size++; | |
342 | spa_txg_history_truncate(shl, zfs_txg_history); | |
343 | mutex_exit(&shl->procfs_list.pl_lock); | |
0b1401ee BB |
344 | } |
345 | ||
346 | /* | |
347 | * Set txg state completion time and increment current state. | |
348 | */ | |
349 | int | |
350 | spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, | |
351 | hrtime_t completed_time) | |
352 | { | |
d1261452 | 353 | spa_history_list_t *shl = &spa->spa_stats.txg_history; |
0b1401ee BB |
354 | spa_txg_history_t *sth; |
355 | int error = ENOENT; | |
356 | ||
357 | if (zfs_txg_history == 0) | |
358 | return (0); | |
359 | ||
d1261452 JG |
360 | mutex_enter(&shl->procfs_list.pl_lock); |
361 | for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL; | |
362 | sth = list_prev(&shl->procfs_list.pl_list, sth)) { | |
0b1401ee BB |
363 | if (sth->txg == txg) { |
364 | sth->times[completed_state] = completed_time; | |
365 | sth->state++; | |
366 | error = 0; | |
367 | break; | |
368 | } | |
369 | } | |
d1261452 | 370 | mutex_exit(&shl->procfs_list.pl_lock); |
0b1401ee BB |
371 | |
372 | return (error); | |
373 | } | |
374 | ||
375 | /* | |
376 | * Set txg IO stats. | |
377 | */ | |
baf67d15 | 378 | static int |
0b1401ee | 379 | spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, |
3ccab252 | 380 | uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty) |
0b1401ee | 381 | { |
d1261452 | 382 | spa_history_list_t *shl = &spa->spa_stats.txg_history; |
0b1401ee BB |
383 | spa_txg_history_t *sth; |
384 | int error = ENOENT; | |
385 | ||
386 | if (zfs_txg_history == 0) | |
387 | return (0); | |
388 | ||
d1261452 JG |
389 | mutex_enter(&shl->procfs_list.pl_lock); |
390 | for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL; | |
391 | sth = list_prev(&shl->procfs_list.pl_list, sth)) { | |
0b1401ee BB |
392 | if (sth->txg == txg) { |
393 | sth->nread = nread; | |
394 | sth->nwritten = nwritten; | |
395 | sth->reads = reads; | |
396 | sth->writes = writes; | |
3ccab252 | 397 | sth->ndirty = ndirty; |
0b1401ee BB |
398 | error = 0; |
399 | break; | |
400 | } | |
401 | } | |
d1261452 | 402 | mutex_exit(&shl->procfs_list.pl_lock); |
0b1401ee BB |
403 | |
404 | return (error); | |
405 | } | |
406 | ||
baf67d15 BB |
407 | txg_stat_t * |
408 | spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp) | |
409 | { | |
410 | txg_stat_t *ts; | |
411 | ||
412 | if (zfs_txg_history == 0) | |
413 | return (NULL); | |
414 | ||
415 | ts = kmem_alloc(sizeof (txg_stat_t), KM_SLEEP); | |
416 | ||
417 | spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); | |
418 | vdev_get_stats(spa->spa_root_vdev, &ts->vs1); | |
419 | spa_config_exit(spa, SCL_ALL, FTAG); | |
420 | ||
421 | ts->txg = txg; | |
422 | ts->ndirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; | |
423 | ||
424 | spa_txg_history_set(spa, txg, TXG_STATE_WAIT_FOR_SYNC, gethrtime()); | |
425 | ||
426 | return (ts); | |
427 | } | |
428 | ||
429 | void | |
430 | spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts) | |
431 | { | |
432 | if (ts == NULL) | |
433 | return; | |
434 | ||
435 | if (zfs_txg_history == 0) { | |
436 | kmem_free(ts, sizeof (txg_stat_t)); | |
437 | return; | |
438 | } | |
439 | ||
440 | spa_config_enter(spa, SCL_ALL, FTAG, RW_READER); | |
441 | vdev_get_stats(spa->spa_root_vdev, &ts->vs2); | |
442 | spa_config_exit(spa, SCL_ALL, FTAG); | |
443 | ||
444 | spa_txg_history_set(spa, ts->txg, TXG_STATE_SYNCED, gethrtime()); | |
445 | spa_txg_history_set_io(spa, ts->txg, | |
446 | ts->vs2.vs_bytes[ZIO_TYPE_READ] - ts->vs1.vs_bytes[ZIO_TYPE_READ], | |
447 | ts->vs2.vs_bytes[ZIO_TYPE_WRITE] - ts->vs1.vs_bytes[ZIO_TYPE_WRITE], | |
448 | ts->vs2.vs_ops[ZIO_TYPE_READ] - ts->vs1.vs_ops[ZIO_TYPE_READ], | |
449 | ts->vs2.vs_ops[ZIO_TYPE_WRITE] - ts->vs1.vs_ops[ZIO_TYPE_WRITE], | |
450 | ts->ndirty); | |
451 | ||
452 | kmem_free(ts, sizeof (txg_stat_t)); | |
453 | } | |
454 | ||
2d37239a BB |
455 | /* |
456 | * ========================================================================== | |
457 | * SPA TX Assign Histogram Routines | |
458 | * ========================================================================== | |
459 | */ | |
460 | ||
461 | /* | |
462 | * Tx statistics - Information exported regarding dmu_tx_assign time. | |
463 | */ | |
464 | ||
465 | /* | |
466 | * When the kstat is written zero all buckets. When the kstat is read | |
467 | * count the number of trailing buckets set to zero and update ks_ndata | |
468 | * such that they are not output. | |
469 | */ | |
470 | static int | |
471 | spa_tx_assign_update(kstat_t *ksp, int rw) | |
472 | { | |
473 | spa_t *spa = ksp->ks_private; | |
d1261452 | 474 | spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; |
2d37239a BB |
475 | int i; |
476 | ||
477 | if (rw == KSTAT_WRITE) { | |
d1261452 JG |
478 | for (i = 0; i < shk->count; i++) |
479 | ((kstat_named_t *)shk->private)[i].value.ui64 = 0; | |
2d37239a BB |
480 | } |
481 | ||
d1261452 JG |
482 | for (i = shk->count; i > 0; i--) |
483 | if (((kstat_named_t *)shk->private)[i-1].value.ui64 != 0) | |
2d37239a BB |
484 | break; |
485 | ||
486 | ksp->ks_ndata = i; | |
d1d7e268 | 487 | ksp->ks_data_size = i * sizeof (kstat_named_t); |
2d37239a BB |
488 | |
489 | return (0); | |
490 | } | |
491 | ||
492 | static void | |
493 | spa_tx_assign_init(spa_t *spa) | |
494 | { | |
d1261452 | 495 | spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; |
761b8ec6 | 496 | char *name; |
2d37239a BB |
497 | kstat_named_t *ks; |
498 | kstat_t *ksp; | |
499 | int i; | |
500 | ||
d1261452 | 501 | mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); |
2d37239a | 502 | |
d1261452 JG |
503 | shk->count = 42; /* power of two buckets for 1ns to 2,199s */ |
504 | shk->size = shk->count * sizeof (kstat_named_t); | |
505 | shk->private = kmem_alloc(shk->size, KM_SLEEP); | |
2d37239a | 506 | |
761b8ec6 | 507 | name = kmem_asprintf("zfs/%s", spa_name(spa)); |
2d37239a | 508 | |
d1261452 JG |
509 | for (i = 0; i < shk->count; i++) { |
510 | ks = &((kstat_named_t *)shk->private)[i]; | |
2d37239a BB |
511 | ks->data_type = KSTAT_DATA_UINT64; |
512 | ks->value.ui64 = 0; | |
513 | (void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns", | |
514 | (u_longlong_t)1 << i); | |
515 | } | |
516 | ||
517 | ksp = kstat_create(name, 0, "dmu_tx_assign", "misc", | |
518 | KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL); | |
d1261452 | 519 | shk->kstat = ksp; |
2d37239a BB |
520 | |
521 | if (ksp) { | |
d1261452 JG |
522 | ksp->ks_lock = &shk->lock; |
523 | ksp->ks_data = shk->private; | |
524 | ksp->ks_ndata = shk->count; | |
525 | ksp->ks_data_size = shk->size; | |
2d37239a BB |
526 | ksp->ks_private = spa; |
527 | ksp->ks_update = spa_tx_assign_update; | |
528 | kstat_install(ksp); | |
529 | } | |
761b8ec6 | 530 | strfree(name); |
2d37239a BB |
531 | } |
532 | ||
533 | static void | |
534 | spa_tx_assign_destroy(spa_t *spa) | |
535 | { | |
d1261452 | 536 | spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; |
2d37239a BB |
537 | kstat_t *ksp; |
538 | ||
d1261452 | 539 | ksp = shk->kstat; |
2d37239a BB |
540 | if (ksp) |
541 | kstat_delete(ksp); | |
542 | ||
d1261452 JG |
543 | kmem_free(shk->private, shk->size); |
544 | mutex_destroy(&shk->lock); | |
2d37239a BB |
545 | } |
546 | ||
547 | void | |
548 | spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs) | |
549 | { | |
d1261452 | 550 | spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram; |
2d37239a BB |
551 | uint64_t idx = 0; |
552 | ||
d1261452 | 553 | while (((1ULL << idx) < nsecs) && (idx < shk->size - 1)) |
2d37239a BB |
554 | idx++; |
555 | ||
d1261452 | 556 | atomic_inc_64(&((kstat_named_t *)shk->private)[idx].value.ui64); |
2d37239a BB |
557 | } |
558 | ||
330847ff MA |
559 | /* |
560 | * ========================================================================== | |
561 | * SPA IO History Routines | |
562 | * ========================================================================== | |
563 | */ | |
564 | static int | |
565 | spa_io_history_update(kstat_t *ksp, int rw) | |
566 | { | |
567 | if (rw == KSTAT_WRITE) | |
568 | memset(ksp->ks_data, 0, ksp->ks_data_size); | |
569 | ||
570 | return (0); | |
571 | } | |
572 | ||
573 | static void | |
574 | spa_io_history_init(spa_t *spa) | |
575 | { | |
d1261452 | 576 | spa_history_kstat_t *shk = &spa->spa_stats.io_history; |
761b8ec6 | 577 | char *name; |
330847ff MA |
578 | kstat_t *ksp; |
579 | ||
d1261452 | 580 | mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); |
330847ff | 581 | |
761b8ec6 | 582 | name = kmem_asprintf("zfs/%s", spa_name(spa)); |
330847ff MA |
583 | |
584 | ksp = kstat_create(name, 0, "io", "disk", KSTAT_TYPE_IO, 1, 0); | |
d1261452 | 585 | shk->kstat = ksp; |
330847ff MA |
586 | |
587 | if (ksp) { | |
d1261452 | 588 | ksp->ks_lock = &shk->lock; |
330847ff MA |
589 | ksp->ks_private = spa; |
590 | ksp->ks_update = spa_io_history_update; | |
591 | kstat_install(ksp); | |
592 | } | |
761b8ec6 | 593 | strfree(name); |
330847ff MA |
594 | } |
595 | ||
596 | static void | |
597 | spa_io_history_destroy(spa_t *spa) | |
598 | { | |
d1261452 | 599 | spa_history_kstat_t *shk = &spa->spa_stats.io_history; |
330847ff | 600 | |
d1261452 JG |
601 | if (shk->kstat) |
602 | kstat_delete(shk->kstat); | |
330847ff | 603 | |
d1261452 | 604 | mutex_destroy(&shk->lock); |
330847ff MA |
605 | } |
606 | ||
379ca9cf OF |
607 | /* |
608 | * ========================================================================== | |
609 | * SPA MMP History Routines | |
610 | * ========================================================================== | |
611 | */ | |
612 | ||
613 | /* | |
d2160d05 OF |
614 | * MMP statistics - Information exported regarding attempted MMP writes |
615 | * For MMP writes issued, fields used as per comments below. | |
616 | * For MMP writes skipped, an entry represents a span of time when | |
617 | * writes were skipped for same reason (error from mmp_random_leaf). | |
618 | * Differences are: | |
619 | * timestamp time first write skipped, if >1 skipped in a row | |
620 | * mmp_delay delay value at timestamp | |
621 | * vdev_guid number of writes skipped | |
622 | * io_error one of enum mmp_error | |
623 | * duration time span (ns) of skipped writes | |
379ca9cf OF |
624 | */ |
625 | ||
626 | typedef struct spa_mmp_history { | |
d1261452 | 627 | uint64_t mmp_node_id; /* unique # for updates */ |
379ca9cf | 628 | uint64_t txg; /* txg of last sync */ |
d2160d05 OF |
629 | uint64_t timestamp; /* UTC time MMP write issued */ |
630 | uint64_t mmp_delay; /* mmp_thread.mmp_delay at timestamp */ | |
379ca9cf OF |
631 | uint64_t vdev_guid; /* unique ID of leaf vdev */ |
632 | char *vdev_path; | |
d2160d05 | 633 | int vdev_label; /* vdev label */ |
7088545d | 634 | int io_error; /* error status of MMP write */ |
d2160d05 | 635 | hrtime_t error_start; /* hrtime of start of error period */ |
7088545d | 636 | hrtime_t duration; /* time from submission to completion */ |
d1261452 | 637 | procfs_list_node_t smh_node; |
379ca9cf OF |
638 | } spa_mmp_history_t; |
639 | ||
/*
 * Emit the column headings for the "multihost" procfs list.
 * Always returns 0.
 */
static int
spa_mmp_history_show_header(struct seq_file *f)
{
	seq_printf(f,
	    "%-10s %-10s %-10s %-6s %-10s %-12s %-24s %-10s %s\n",
	    "id", "txg", "timestamp", "error", "duration",
	    "mmp_delay", "vdev_guid", "vdev_label", "vdev_path");

	return (0);
}
648 | ||
649 | static int | |
d1261452 | 650 | spa_mmp_history_show(struct seq_file *f, void *data) |
379ca9cf OF |
651 | { |
652 | spa_mmp_history_t *smh = (spa_mmp_history_t *)data; | |
d2160d05 OF |
653 | char skip_fmt[] = "%-10llu %-10llu %10llu %#6llx %10lld %12llu %-24llu " |
654 | "%-10lld %s\n"; | |
655 | char write_fmt[] = "%-10llu %-10llu %10llu %6lld %10lld %12llu %-24llu " | |
656 | "%-10lld %s\n"; | |
379ca9cf | 657 | |
d1261452 JG |
658 | seq_printf(f, (smh->error_start ? skip_fmt : write_fmt), |
659 | (u_longlong_t)smh->mmp_node_id, (u_longlong_t)smh->txg, | |
7088545d OF |
660 | (u_longlong_t)smh->timestamp, (longlong_t)smh->io_error, |
661 | (longlong_t)smh->duration, (u_longlong_t)smh->mmp_delay, | |
662 | (u_longlong_t)smh->vdev_guid, (u_longlong_t)smh->vdev_label, | |
379ca9cf OF |
663 | (smh->vdev_path ? smh->vdev_path : "-")); |
664 | ||
665 | return (0); | |
666 | } | |
667 | ||
d1261452 JG |
668 | /* Remove oldest elements from list until there are no more than 'size' left */ |
669 | static void | |
670 | spa_mmp_history_truncate(spa_history_list_t *shl, unsigned int size) | |
379ca9cf | 671 | { |
d1261452 JG |
672 | spa_mmp_history_t *smh; |
673 | while (shl->size > size) { | |
674 | smh = list_remove_head(&shl->procfs_list.pl_list); | |
675 | if (smh->vdev_path) | |
676 | strfree(smh->vdev_path); | |
677 | kmem_free(smh, sizeof (spa_mmp_history_t)); | |
678 | shl->size--; | |
679 | } | |
379ca9cf | 680 | |
d1261452 JG |
681 | if (size == 0) |
682 | ASSERT(list_is_empty(&shl->procfs_list.pl_list)); | |
379ca9cf | 683 | |
379ca9cf OF |
684 | } |
685 | ||
379ca9cf | 686 | static int |
d1261452 | 687 | spa_mmp_history_clear(procfs_list_t *procfs_list) |
379ca9cf | 688 | { |
d1261452 JG |
689 | spa_history_list_t *shl = procfs_list->pl_private; |
690 | mutex_enter(&procfs_list->pl_lock); | |
691 | spa_mmp_history_truncate(shl, 0); | |
692 | mutex_exit(&procfs_list->pl_lock); | |
379ca9cf OF |
693 | return (0); |
694 | } | |
695 | ||
696 | static void | |
697 | spa_mmp_history_init(spa_t *spa) | |
698 | { | |
d1261452 JG |
699 | spa_history_list_t *shl = &spa->spa_stats.mmp_history; |
700 | char *module; | |
379ca9cf | 701 | |
d1261452 | 702 | shl->size = 0; |
379ca9cf | 703 | |
d1261452 | 704 | module = kmem_asprintf("zfs/%s", spa_name(spa)); |
379ca9cf | 705 | |
d1261452 JG |
706 | shl->procfs_list.pl_private = shl; |
707 | procfs_list_install(module, | |
708 | "multihost", | |
709 | &shl->procfs_list, | |
710 | spa_mmp_history_show, | |
711 | spa_mmp_history_show_header, | |
712 | spa_mmp_history_clear, | |
713 | offsetof(spa_mmp_history_t, smh_node)); | |
379ca9cf | 714 | |
d1261452 | 715 | strfree(module); |
379ca9cf OF |
716 | } |
717 | ||
718 | static void | |
719 | spa_mmp_history_destroy(spa_t *spa) | |
720 | { | |
d1261452 JG |
721 | spa_history_list_t *shl = &spa->spa_stats.mmp_history; |
722 | procfs_list_uninstall(&shl->procfs_list); | |
723 | spa_mmp_history_truncate(shl, 0); | |
724 | procfs_list_destroy(&shl->procfs_list); | |
379ca9cf OF |
725 | } |
726 | ||
d2160d05 OF |
727 | /* |
728 | * Set duration in existing "skip" record to how long we have waited for a leaf | |
729 | * vdev to become available. | |
730 | * | |
d1261452 | 731 | * Important that we start search at the tail of the list where new |
d2160d05 OF |
732 | * records are inserted, so this is normally an O(1) operation. |
733 | */ | |
734 | int | |
d1261452 | 735 | spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_node_id) |
d2160d05 | 736 | { |
d1261452 | 737 | spa_history_list_t *shl = &spa->spa_stats.mmp_history; |
d2160d05 OF |
738 | spa_mmp_history_t *smh; |
739 | int error = ENOENT; | |
740 | ||
d1261452 | 741 | if (zfs_multihost_history == 0 && shl->size == 0) |
d2160d05 OF |
742 | return (0); |
743 | ||
d1261452 JG |
744 | mutex_enter(&shl->procfs_list.pl_lock); |
745 | for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL; | |
746 | smh = list_prev(&shl->procfs_list.pl_list, smh)) { | |
747 | if (smh->mmp_node_id == mmp_node_id) { | |
d2160d05 OF |
748 | ASSERT3U(smh->io_error, !=, 0); |
749 | smh->duration = gethrtime() - smh->error_start; | |
750 | smh->vdev_guid++; | |
751 | error = 0; | |
752 | break; | |
753 | } | |
754 | } | |
d1261452 | 755 | mutex_exit(&shl->procfs_list.pl_lock); |
d2160d05 OF |
756 | |
757 | return (error); | |
758 | } | |
759 | ||
379ca9cf | 760 | /* |
7088545d | 761 | * Set MMP write duration and error status in existing record. |
d2160d05 | 762 | * See comment re: search order above spa_mmp_history_set_skip(). |
7088545d OF |
763 | */ |
764 | int | |
d1261452 | 765 | spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error, |
7088545d OF |
766 | hrtime_t duration) |
767 | { | |
d1261452 | 768 | spa_history_list_t *shl = &spa->spa_stats.mmp_history; |
7088545d OF |
769 | spa_mmp_history_t *smh; |
770 | int error = ENOENT; | |
771 | ||
d1261452 | 772 | if (zfs_multihost_history == 0 && shl->size == 0) |
7088545d OF |
773 | return (0); |
774 | ||
d1261452 JG |
775 | mutex_enter(&shl->procfs_list.pl_lock); |
776 | for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL; | |
777 | smh = list_prev(&shl->procfs_list.pl_list, smh)) { | |
778 | if (smh->mmp_node_id == mmp_node_id) { | |
d2160d05 | 779 | ASSERT(smh->io_error == 0); |
7088545d OF |
780 | smh->io_error = io_error; |
781 | smh->duration = duration; | |
782 | error = 0; | |
783 | break; | |
784 | } | |
785 | } | |
d1261452 | 786 | mutex_exit(&shl->procfs_list.pl_lock); |
7088545d OF |
787 | |
788 | return (error); | |
789 | } | |
790 | ||
791 | /* | |
d2160d05 OF |
792 | * Add a new MMP historical record. |
793 | * error == 0 : a write was issued. | |
794 | * error != 0 : a write was not issued because no leaves were found. | |
379ca9cf | 795 | */ |
d1261452 | 796 | void |
d2160d05 | 797 | spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, |
d1261452 | 798 | uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_node_id, |
d2160d05 | 799 | int error) |
379ca9cf | 800 | { |
d1261452 JG |
801 | spa_history_list_t *shl = &spa->spa_stats.mmp_history; |
802 | spa_mmp_history_t *smh; | |
379ca9cf | 803 | |
d1261452 JG |
804 | if (zfs_multihost_history == 0 && shl->size == 0) |
805 | return; | |
379ca9cf OF |
806 | |
807 | smh = kmem_zalloc(sizeof (spa_mmp_history_t), KM_SLEEP); | |
808 | smh->txg = txg; | |
809 | smh->timestamp = timestamp; | |
810 | smh->mmp_delay = mmp_delay; | |
d2160d05 OF |
811 | if (vd) { |
812 | smh->vdev_guid = vd->vdev_guid; | |
813 | if (vd->vdev_path) | |
814 | smh->vdev_path = strdup(vd->vdev_path); | |
815 | } | |
379ca9cf | 816 | smh->vdev_label = label; |
d1261452 | 817 | smh->mmp_node_id = mmp_node_id; |
379ca9cf | 818 | |
d2160d05 OF |
819 | if (error) { |
820 | smh->io_error = error; | |
821 | smh->error_start = gethrtime(); | |
822 | smh->vdev_guid = 1; | |
823 | } | |
824 | ||
d1261452 JG |
825 | mutex_enter(&shl->procfs_list.pl_lock); |
826 | procfs_list_add(&shl->procfs_list, smh); | |
827 | shl->size++; | |
828 | spa_mmp_history_truncate(shl, zfs_multihost_history); | |
829 | mutex_exit(&shl->procfs_list.pl_lock); | |
379ca9cf OF |
830 | } |
831 | ||
f0ed6c74 TH |
832 | static void * |
833 | spa_state_addr(kstat_t *ksp, loff_t n) | |
834 | { | |
835 | return (ksp->ks_private); /* return the spa_t */ | |
836 | } | |
837 | ||
838 | static int | |
839 | spa_state_data(char *buf, size_t size, void *data) | |
840 | { | |
841 | spa_t *spa = (spa_t *)data; | |
842 | (void) snprintf(buf, size, "%s\n", spa_state_to_name(spa)); | |
843 | return (0); | |
844 | } | |
845 | ||
846 | /* | |
847 | * Return the state of the pool in /proc/spl/kstat/zfs/<pool>/state. | |
848 | * | |
849 | * This is a lock-less read of the pool's state (unlike using 'zpool', which | |
850 | * can potentially block for seconds). Because it doesn't block, it can useful | |
851 | * as a pool heartbeat value. | |
852 | */ | |
853 | static void | |
854 | spa_state_init(spa_t *spa) | |
855 | { | |
d1261452 | 856 | spa_history_kstat_t *shk = &spa->spa_stats.state; |
f0ed6c74 TH |
857 | char *name; |
858 | kstat_t *ksp; | |
859 | ||
d1261452 | 860 | mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); |
f0ed6c74 TH |
861 | |
862 | name = kmem_asprintf("zfs/%s", spa_name(spa)); | |
863 | ksp = kstat_create(name, 0, "state", "misc", | |
864 | KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); | |
865 | ||
d1261452 | 866 | shk->kstat = ksp; |
f0ed6c74 | 867 | if (ksp) { |
d1261452 | 868 | ksp->ks_lock = &shk->lock; |
f0ed6c74 TH |
869 | ksp->ks_data = NULL; |
870 | ksp->ks_private = spa; | |
871 | ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS; | |
872 | kstat_set_raw_ops(ksp, NULL, spa_state_data, spa_state_addr); | |
873 | kstat_install(ksp); | |
874 | } | |
875 | ||
876 | strfree(name); | |
877 | } | |
878 | ||
879 | static void | |
880 | spa_health_destroy(spa_t *spa) | |
881 | { | |
d1261452 JG |
882 | spa_history_kstat_t *shk = &spa->spa_stats.state; |
883 | kstat_t *ksp = shk->kstat; | |
f0ed6c74 TH |
884 | if (ksp) |
885 | kstat_delete(ksp); | |
886 | ||
d1261452 | 887 | mutex_destroy(&shk->lock); |
f0ed6c74 TH |
888 | } |
889 | ||
1421c891 PS |
890 | void |
891 | spa_stats_init(spa_t *spa) | |
892 | { | |
893 | spa_read_history_init(spa); | |
0b1401ee | 894 | spa_txg_history_init(spa); |
2d37239a | 895 | spa_tx_assign_init(spa); |
330847ff | 896 | spa_io_history_init(spa); |
379ca9cf | 897 | spa_mmp_history_init(spa); |
f0ed6c74 | 898 | spa_state_init(spa); |
1421c891 PS |
899 | } |
900 | ||
901 | void | |
902 | spa_stats_destroy(spa_t *spa) | |
903 | { | |
f0ed6c74 | 904 | spa_health_destroy(spa); |
2d37239a | 905 | spa_tx_assign_destroy(spa); |
0b1401ee | 906 | spa_txg_history_destroy(spa); |
1421c891 | 907 | spa_read_history_destroy(spa); |
330847ff | 908 | spa_io_history_destroy(spa); |
379ca9cf | 909 | spa_mmp_history_destroy(spa); |
1421c891 PS |
910 | } |
911 | ||
#if defined(_KERNEL)
/*
 * Expose the history tunables as module parameters (mode 0644).
 */
/* CSTYLED */
module_param(zfs_read_history, int, 0644);
MODULE_PARM_DESC(zfs_read_history,
	"Historical statistics for the last N reads");

module_param(zfs_read_history_hits, int, 0644);
MODULE_PARM_DESC(zfs_read_history_hits,
	"Include cache hits in read history");

module_param(zfs_txg_history, int, 0644);
MODULE_PARM_DESC(zfs_txg_history,
	"Historical statistics for the last N txgs");

module_param(zfs_multihost_history, int, 0644);
MODULE_PARM_DESC(zfs_multihost_history,
	"Historical statistics for last N multihost writes");
/* END CSTYLED */
#endif