]>
Commit | Line | Data |
---|---|---|
20effc67 | 1 | local g = import 'grafonnet/grafana.libsonnet'; |
20effc67 | 2 | |
2a845540 TL |
3 | (import 'utils.libsonnet') { |
// Dashboard 'hosts-overview.json' (title 'Host Overview'): dashboard schema, required Grafana
// plugins (grafana, graph, singlestat), the built-in '-- Grafana --' annotation source, and the
// datasource / cluster / job template variables.
// NOTE(review): the positional arguments of $.dashboardSchema and $.addAnnotationSchema are
// defined in utils.libsonnet, which is not visible here — confirm their meaning there.
4 | 'hosts-overview.json': | |
5 | $.dashboardSchema( | |
6 | 'Host Overview', | |
7 | '', | |
8 | 'y0KGL0iZz', | |
9 | 'now-1h', | |
10 | '30s', | |
11 | 16, | |
12 | $._config.dashboardTags, | |
13 | '', | |
14 | ) | |
15 | .addRequired( | |
16 | type='grafana', id='grafana', name='Grafana', version='5.3.2' | |
17 | ) | |
18 | .addRequired( | |
19 | type='panel', id='graph', name='Graph', version='5.0.0' | |
20 | ) | |
21 | .addRequired( | |
22 | type='panel', id='singlestat', name='Singlestat', version='5.0.0' | |
23 | ) | |
24 | .addAnnotation( | |
25 | $.addAnnotationSchema( | |
26 | 1, | |
27 | '-- Grafana --', | |
28 | true, | |
29 | true, | |
30 | 'rgba(0, 211, 255, 1)', | |
31 | 'Annotations & Alerts', | |
32 | 'dashboard' | |
33 | ) | |
34 | ) | |
35 | .addTemplate( | |
36 | g.template.datasource('datasource', | |
37 | 'prometheus', | |
38 | 'default', | |
39 | label='Data Source') | |
40 | ) | |
41 | .addTemplate( | |
42 | $.addClusterTemplate() | |
43 | ) | |
44 | .addTemplate( | |
45 | $.addJobTemplate() | |
46 | ) | |
// Template variables osd_hosts, mon_hosts, mds_hosts and rgw_hosts. Each is filled from a
// label_values() query that is formatted with $.matchers(); the panels below reference them as
// $osd_hosts, $mon_hosts, $mds_hosts and $rgw_hosts inside instance=~"..." selectors.
// NOTE(review): the final string argument looks like a capture regex applied to the returned
// label values (e.g. 'mon.(.*)') — confirm against $.addTemplateSchema in utils.libsonnet.
47 | .addTemplate( | |
48 | $.addTemplateSchema('osd_hosts', | |
49 | '$datasource', | |
50 | 'label_values(ceph_disk_occupation{%(matchers)s}, exported_instance)' % $.matchers(), | |
51 | 1, | |
52 | true, | |
53 | 1, | |
54 | null, | |
55 | '([^.]*).*') | |
56 | ) | |
57 | .addTemplate( | |
58 | $.addTemplateSchema('mon_hosts', | |
59 | '$datasource', | |
60 | 'label_values(ceph_mon_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(), | |
61 | 1, | |
62 | true, | |
63 | 1, | |
64 | null, | |
65 | 'mon.(.*)') | |
66 | ) | |
67 | .addTemplate( | |
68 | $.addTemplateSchema('mds_hosts', | |
69 | '$datasource', | |
70 | 'label_values(ceph_mds_inodes{%(matchers)s}, ceph_daemon)' % $.matchers(), | |
71 | 1, | |
72 | true, | |
73 | 1, | |
74 | null, | |
75 | 'mds.(.*)') | |
76 | ) | |
77 | .addTemplate( | |
78 | $.addTemplateSchema('rgw_hosts', | |
79 | '$datasource', | |
80 | 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(), | |
81 | 1, | |
82 | true, | |
83 | 1, | |
84 | null, | |
85 | 'rgw.(.*)') | |
86 | ) | |
87 | .addPanels([ | |
// Single-stat panel 'OSD Hosts': counts the distinct hostname values found in ceph_osd_metadata
// (count over a sum grouped by hostname), filtered by $.matchers().
// NOTE(review): the trailing four numbers are presumably the grid position x, y, w, h — confirm
// against $.simpleSingleStatPanel in utils.libsonnet.
88 | $.simpleSingleStatPanel( | |
89 | 'none', | |
90 | 'OSD Hosts', | |
20effc67 | 91 | '', |
2a845540 TL |
92 | 'current', |
93 | 'count(sum by (hostname) (ceph_osd_metadata{%(matchers)s}))' % $.matchers(), | |
94 | true, | |
95 | 'time_series', | |
96 | 0, | |
97 | 0, | |
98 | 4, | |
99 | 5 | |
100 | ), | |
// Single-stat panel 'AVG CPU Busy' (unit 'percentunit'): 1 minus the per-instance average idle
// CPU rate, averaged over all instances matching any of the four host template variables.
// The query ORs node_cpu_seconds_total with node_cpu — presumably to cover both old and new
// node_exporter metric names (confirm).
101 | $.simpleSingleStatPanel( | |
102 | 'percentunit', | |
103 | 'AVG CPU Busy', | |
104 | 'Average CPU busy across all hosts (OSD, RGW, MON etc) within the cluster', | |
105 | 'current', | |
106 | ||| | |
107 | avg(1 - ( | |
108 | avg by(instance) ( | |
109 | rate(node_cpu_seconds_total{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) or | |
110 | rate(node_cpu{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) | |
111 | ) | |
112 | )) | |
113 | |||, | |
114 | true, | |
115 | 'time_series', | |
116 | 4, | |
117 | 0, | |
118 | 4, | |
119 | 5 | |
120 | ), | |
// Single-stat panel 'AVG RAM Utilization' (unit 'percentunit'): average over hosts of
// (MemTotal - (MemFree + Cached + Buffers + Slab)) / MemTotal.
// Every term ORs the plain metric name with its '_bytes' variant — presumably to support
// different node_exporter versions (confirm).
121 | $.simpleSingleStatPanel( | |
122 | 'percentunit', | |
123 | 'AVG RAM Utilization', | |
124 | 'Average Memory Usage across all hosts in the cluster (excludes buffer/cache usage)', | |
125 | 'current', | |
126 | ||| | |
127 | avg (( | |
128 | ( | |
129 | node_memory_MemTotal{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or | |
130 | node_memory_MemTotal_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} | |
131 | ) - (( | |
132 | node_memory_MemFree{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or | |
133 | node_memory_MemFree_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}) + | |
134 | ( | |
135 | node_memory_Cached{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or | |
136 | node_memory_Cached_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} | |
137 | ) + ( | |
138 | node_memory_Buffers{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or | |
139 | node_memory_Buffers_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} | |
140 | ) + ( | |
141 | node_memory_Slab{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or | |
142 | node_memory_Slab_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} | |
143 | ) | |
144 | ) | |
145 | ) / ( | |
146 | node_memory_MemTotal{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or | |
147 | node_memory_MemTotal_bytes{instance=~"($osd_hosts|$rgw_hosts|$mon_hosts|$mds_hosts).*"} | |
148 | )) | |
149 | |||, | |
150 | true, | |
151 | 'time_series', | |
152 | 8, | |
153 | 0, | |
154 | 4, | |
155 | 5 | |
156 | ), | |
// Single-stat panel 'Physical IOPS': sum of the read-completed rate plus the write-completed
// rate for instances matching $osd_hosts only. Each rate ORs the plain metric name with its
// '_total' variant.
157 | $.simpleSingleStatPanel( | |
158 | 'none', | |
159 | 'Physical IOPS', | |
160 | 'IOPS Load at the device as reported by the OS on all OSD hosts', | |
161 | 'current', | |
162 | ||| | |
163 | sum (( | |
164 | rate(node_disk_reads_completed{instance=~"($osd_hosts).*"}[$__rate_interval]) or | |
165 | rate(node_disk_reads_completed_total{instance=~"($osd_hosts).*"}[$__rate_interval]) | |
166 | ) + ( | |
167 | rate(node_disk_writes_completed{instance=~"($osd_hosts).*"}[$__rate_interval]) or | |
168 | rate(node_disk_writes_completed_total{instance=~"($osd_hosts).*"}[$__rate_interval]) | |
169 | )) | |
170 | |||, | |
171 | true, | |
172 | 'time_series', | |
173 | 12, | |
174 | 0, | |
175 | 4, | |
176 | 5 | |
177 | ), | |
// Single-stat panel 'AVG Disk Utilization' (unit 'percent'): average of the disk I/O time rate
// (node_disk_io_time_ms / 10, or node_disk_io_time_seconds_total * 100), joined on
// (instance, device) with ceph_disk_occupation_human so that only devices listed there count.
// label_replace() trims "instance" to the part before the first '.' or ':' on both sides and
// strips the "/dev/" prefix from the occupation metric's "device" label so the labels line up.
178 | $.simpleSingleStatPanel( | |
179 | 'percent', | |
180 | 'AVG Disk Utilization', | |
181 | 'Average Disk utilization for all OSD data devices (i.e. excludes journal/WAL)', | |
182 | 'current', | |
183 | ||| | |
184 | avg ( | |
185 | label_replace( | |
186 | (rate(node_disk_io_time_ms[$__rate_interval]) / 10 ) or | |
187 | (rate(node_disk_io_time_seconds_total[$__rate_interval]) * 100), | |
188 | "instance", "$1", "instance", "([^.:]*).*" | |
189 | ) * on(instance, device) group_left(ceph_daemon) label_replace( | |
190 | label_replace( | |
191 | ceph_disk_occupation_human{%(matchers)s, instance=~"($osd_hosts).*"}, | |
192 | "device", "$1", "device", "/dev/(.*)" | |
193 | ), "instance", "$1", "instance", "([^.:]*).*" | |
194 | ) | |
195 | ) | |
196 | ||| % $.matchers(), | |
197 | true, | |
198 | 'time_series', | |
20effc67 | 199 | 16, |
2a845540 TL |
200 | 0, |
201 | 4, | |
202 | 5 | |
203 | ), | |
// Single-stat panel 'Network Load': sum of the receive byte rate plus sum of the transmit byte
// rate over all non-loopback devices of the selected hosts. Series whose (device, instance)
// matches a bonding master (bonding_slaves > 0, with "master" copied into "device") are
// excluded via `unless`.
// Fix: the instance regex used the literal words mon_hosts|mds_hosts|rgw_hosts (missing '$'),
// so only $osd_hosts was interpolated by Grafana and MON/MDS/RGW hosts were effectively left
// out. All four alternatives are now template variables, matching the other panels.
204 | $.simpleSingleStatPanel( | |
205 | 'bytes', | |
206 | 'Network Load', | |
207 | 'Total send/receive network load across all hosts in the ceph cluster', | |
208 | 'current', | |
209 | ||| | |
210 | sum ( | |
211 | ( | |
212 | rate(node_network_receive_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) or | |
213 | rate(node_network_receive_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) | |
214 | ) unless on (device, instance) | |
215 | label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)") | |
216 | ) + | |
217 | sum ( | |
218 | ( | |
219 | rate(node_network_transmit_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) or | |
220 | rate(node_network_transmit_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) | |
221 | ) unless on (device, instance) | |
222 | label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)") | |
223 | ) | |
224 | |||, | |
225 | true, | |
226 | 'time_series', | |
227 | 20, | |
228 | 0, | |
229 | 4, | |
230 | 5 | |
231 | ), | |
// Graph panel 'CPU Busy - Top 10 Hosts' (unit 'percent'): topk(10) of 100 * (1 - average idle
// CPU rate per instance); the legend is the instance label.
// NOTE(review): the leading {} is presumably an (empty) alias-colour map and the trailing four
// numbers the grid position — confirm against $.simpleGraphPanel in utils.libsonnet.
232 | $.simpleGraphPanel( | |
233 | {}, | |
234 | 'CPU Busy - Top 10 Hosts', | |
235 | 'Show the top 10 busiest hosts by cpu', | |
236 | 'percent', | |
237 | null, | |
238 | 0, | |
239 | ||| | |
240 | topk(10, | |
241 | 100 * ( | |
242 | 1 - ( | |
243 | avg by(instance) ( | |
244 | rate(node_cpu_seconds_total{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) or | |
245 | rate(node_cpu{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) | |
246 | ) | |
247 | ) | |
248 | ) | |
249 | ) | |
250 | |||, | |
251 | '{{instance}}', | |
252 | 0, | |
253 | 5, | |
254 | 12, | |
255 | 9 | |
256 | ), | |
// Graph panel 'Network Load - Top 10 Hosts' (unit 'Bps'): topk(10) of the per-instance sum of
// receive plus transmit byte rates on non-loopback devices. Because '+' binds tighter than
// 'unless' in PromQL, the bonding-master exclusion applies to the combined rx + tx vector.
257 | $.simpleGraphPanel( | |
258 | {}, | |
259 | 'Network Load - Top 10 Hosts', | |
260 | 'Top 10 hosts by network load', | |
261 | 'Bps', | |
262 | null, | |
263 | 0, | |
264 | ||| | |
265 | topk(10, (sum by(instance) ( | |
266 | ( | |
267 | rate(node_network_receive_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) or | |
268 | rate(node_network_receive_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) | |
269 | ) + | |
270 | ( | |
271 | rate(node_network_transmit_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) or | |
272 | rate(node_network_transmit_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) | |
273 | ) unless on (device, instance) | |
274 | label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)")) | |
275 | )) | |
276 | |||, | |
277 | '{{instance}}', | |
278 | 12, | |
279 | 5, | |
280 | 12, | |
281 | 9 | |
282 | ), | |
283 | ]), | |
// Dashboard 'host-details.json' (title 'Host Details'): dashboard schema (tags are
// $._config.dashboardTags plus 'overview'), required plugins, annotation source, and the
// datasource / cluster / job / ceph_hosts template variables.
// The ceph_hosts query is 'label_values({<clusterMatcher>}, instance)' when
// $._config.showMultiCluster is set, otherwise 'label_values(instance)'.
// NOTE(review): positional arguments of the $.* helpers are defined in utils.libsonnet (not
// visible here) — confirm their meaning there.
284 | 'host-details.json': | |
285 | $.dashboardSchema( | |
286 | 'Host Details', | |
287 | '', | |
288 | 'rtOg0AiWz', | |
289 | 'now-1h', | |
290 | '30s', | |
291 | 16, | |
292 | $._config.dashboardTags + ['overview'], | |
293 | '' | |
294 | ) | |
295 | .addRequired( | |
296 | type='grafana', id='grafana', name='Grafana', version='5.3.2' | |
297 | ) | |
298 | .addRequired( | |
299 | type='panel', id='graph', name='Graph', version='5.0.0' | |
300 | ) | |
301 | .addRequired( | |
302 | type='panel', id='singlestat', name='Singlestat', version='5.0.0' | |
303 | ) | |
304 | .addAnnotation( | |
305 | $.addAnnotationSchema( | |
306 | 1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard' | |
307 | ) | |
308 | ) | |
309 | .addTemplate( | |
310 | g.template.datasource('datasource', 'prometheus', 'default', label='Data Source') | |
311 | ) | |
312 | .addTemplate( | |
313 | $.addClusterTemplate() | |
314 | ) | |
315 | .addTemplate( | |
316 | $.addJobTemplate() | |
317 | ) | |
318 | .addTemplate( | |
319 | $.addTemplateSchema('ceph_hosts', | |
320 | '$datasource', | |
39ae355f | 321 | if $._config.showMultiCluster then ('label_values({%(clusterMatcher)s}, instance)' % $.matchers()) else 'label_values(instance)', |
2a845540 TL |
322 | 1, |
323 | false, | |
324 | 3, | |
325 | 'Hostname', | |
326 | '([^.:]*).*') | |
327 | ) | |
328 | .addPanels([ | |
// Row header '$ceph_hosts System Overview' (full-width, at y = 0), followed by the single-stat
// panel 'OSDs': the number of distinct ceph_daemon values in ceph_osd_metadata whose hostname
// equals the selected $ceph_hosts value.
329 | $.addRowSchema(false, true, '$ceph_hosts System Overview') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } }, | |
330 | $.simpleSingleStatPanel( | |
331 | 'none', | |
332 | 'OSDs', | |
20effc67 | 333 | '', |
2a845540 TL |
334 | 'current', |
335 | "count(sum by (ceph_daemon) (ceph_osd_metadata{%(matchers)s, hostname='$ceph_hosts'}))" % $.matchers(), | |
336 | null, | |
337 | 'time_series', | |
338 | 0, | |
339 | 1, | |
340 | 3, | |
341 | 5 | |
342 | ), | |
// Graph panel 'CPU Utilization' (unit 'percent', axis '% Utilization'): per-mode CPU rate of
// the selected host divided by the host's total CPU rate, scaled to a percentage. The first
// argument maps names to colours (presumably series-alias colours — confirm against
// $.simpleGraphPanel in utils.libsonnet).
// Fix: the query was `sum by (mode)(...) / (scalar(total) * 100)`, which divides by 100 instead
// of multiplying, so the 'percent' panel showed values of at most 0.01. The `* 100` now applies
// to the quotient: `sum by (mode)(...) / (scalar(total)) * 100`.
343 | $.simpleGraphPanel( | |
20effc67 | 344 | { |
2a845540 TL |
345 | interrupt: '#447EBC', |
346 | steal: '#6D1F62', | |
347 | system: '#890F02', | |
348 | user: '#3F6833', | |
349 | wait: '#C15C17', | |
350 | }, | |
351 | 'CPU Utilization', | |
352 | "Shows the CPU breakdown. When multiple servers are selected, only the first host's cpu data is shown", | |
353 | 'percent', | |
354 | '% Utilization', | |
355 | null, | |
356 | ||| | |
357 | sum by (mode) ( | |
358 | rate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[$__rate_interval]) or | |
359 | rate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[$__rate_interval]) | |
360 | ) / ( | |
361 | scalar( | |
362 | sum(rate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or | |
363 | rate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])) | |
364 | ) | |
365 | ) * 100 | |
366 | |||, | |
367 | '{{mode}}', | |
368 | 3, | |
369 | 1, | |
370 | 6, | |
371 | 10 | |
372 | ), | |
// Graph panel 'RAM Usage' (unit 'bytes') for the selected host. The base target is free memory
// (legend 'Free'); .addTargets() adds 'total' (MemTotal), 'buffers/cache'
// (Cached + Buffers + Slab) and 'used' (MemTotal - (MemFree + Cached + Buffers + Slab)).
// The series override draws the 'total' series unfilled, unstacked, with line width 2.
// Every term ORs the plain metric name with its '_bytes' variant.
373 | $.simpleGraphPanel( | |
374 | { | |
375 | Available: '#508642', | |
376 | Free: '#508642', | |
377 | Total: '#bf1b00', | |
378 | Used: '#bf1b00', | |
379 | total: '#bf1b00', | |
380 | used: '#0a50a1', | |
381 | }, | |
382 | 'RAM Usage', | |
383 | '', | |
384 | 'bytes', | |
385 | 'RAM used', | |
386 | null, | |
387 | ||| | |
388 | node_memory_MemFree{instance=~"$ceph_hosts([\\\\.:].*)?"} or | |
389 | node_memory_MemFree_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} | |
390 | |||, | |
391 | 'Free', | |
392 | 9, | |
393 | 1, | |
394 | 6, | |
395 | 10 | |
20effc67 | 396 | ) |
2a845540 TL |
397 | .addTargets( |
398 | [ | |
399 | $.addTargetSchema( | |
400 | ||| | |
401 | node_memory_MemTotal{instance=~"$ceph_hosts([\\\\.:].*)?"} or | |
402 | node_memory_MemTotal_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} | |
403 | |||, | |
404 | 'total' | |
405 | ), | |
406 | $.addTargetSchema( | |
407 | ||| | |
408 | ( | |
409 | node_memory_Cached{instance=~"$ceph_hosts([\\\\.:].*)?"} or | |
410 | node_memory_Cached_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} | |
411 | ) + ( | |
412 | node_memory_Buffers{instance=~"$ceph_hosts([\\\\.:].*)?"} or | |
413 | node_memory_Buffers_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} | |
414 | ) + ( | |
415 | node_memory_Slab{instance=~"$ceph_hosts([\\\\.:].*)?"} or | |
416 | node_memory_Slab_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} | |
417 | ) | |
418 | |||, | |
419 | 'buffers/cache' | |
420 | ), | |
421 | $.addTargetSchema( | |
422 | ||| | |
423 | ( | |
424 | node_memory_MemTotal{instance=~"$ceph_hosts([\\\\.:].*)?"} or | |
425 | node_memory_MemTotal_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} | |
426 | ) - ( | |
427 | ( | |
428 | node_memory_MemFree{instance=~"$ceph_hosts([\\\\.:].*)?"} or | |
429 | node_memory_MemFree_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} | |
430 | ) + ( | |
431 | node_memory_Cached{instance=~"$ceph_hosts([\\\\.:].*)?"} or | |
432 | node_memory_Cached_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} | |
433 | ) + ( | |
434 | node_memory_Buffers{instance=~"$ceph_hosts([\\\\.:].*)?"} or | |
435 | node_memory_Buffers_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} | |
436 | ) + | |
437 | ( | |
438 | node_memory_Slab{instance=~"$ceph_hosts([\\\\.:].*)?"} or | |
439 | node_memory_Slab_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"} | |
440 | ) | |
441 | ) | |
442 | |||, | |
443 | 'used' | |
444 | ), | |
445 | ] | |
20effc67 | 446 | ) |
2a845540 TL |
447 | .addSeriesOverride( |
448 | { | |
449 | alias: 'total', | |
450 | color: '#bf1b00', | |
451 | fill: 0, | |
452 | linewidth: 2, | |
453 | stack: false, | |
454 | } | |
455 | ), | |
// Graph panel 'Network Load' (unit 'decbytes') for the selected host: per-device receive byte
// rate (legend '{{device}}.rx') and, via .addTargets(), per-device transmit byte rate
// (legend '{{device}}.tx'); loopback ('lo') is excluded.
// The series override applies the 'negative-Y' transform to series matching /.*tx/, so transmit
// is drawn below the axis, as the 'Send (-) / Receive (+)' axis label indicates.
456 | $.simpleGraphPanel( | |
457 | {}, | |
458 | 'Network Load', | |
459 | "Show the network load (rx,tx) across all interfaces (excluding loopback 'lo')", | |
460 | 'decbytes', | |
461 | 'Send (-) / Receive (+)', | |
462 | null, | |
463 | ||| | |
464 | sum by (device) ( | |
465 | rate( | |
466 | node_network_receive_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]) or | |
467 | rate(node_network_receive_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval] | |
468 | ) | |
469 | ) | |
470 | |||, | |
471 | '{{device}}.rx', | |
472 | 15, | |
473 | 1, | |
474 | 6, | |
475 | 10 | |
20effc67 | 476 | ) |
2a845540 TL |
477 | .addTargets( |
478 | [ | |
479 | $.addTargetSchema( | |
480 | ||| | |
481 | sum by (device) ( | |
482 | rate(node_network_transmit_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]) or | |
483 | rate(node_network_transmit_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]) | |
484 | ) | |
485 | |||, | |
486 | '{{device}}.tx' | |
487 | ), | |
488 | ] | |
20effc67 | 489 | ) |
2a845540 TL |
490 | .addSeriesOverride( |
491 | { alias: '/.*tx/', transform: 'negative-Y' } | |
492 | ), | |
// Graph panel 'Network drop rate' (unit 'pps') for the selected host: receive drop rate
// (legend '{{device}}.rx') plus an added transmit drop rate target (legend '{{device}}.tx').
// Series matching /.*tx/ get the 'negative-Y' transform. Each rate ORs the plain metric name
// with its '_total' variant.
493 | $.simpleGraphPanel( | |
494 | {}, | |
495 | 'Network drop rate', | |
496 | '', | |
497 | 'pps', | |
498 | 'Send (-) / Receive (+)', | |
499 | null, | |
500 | ||| | |
501 | rate(node_network_receive_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or | |
502 | rate(node_network_receive_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) | |
503 | |||, | |
504 | '{{device}}.rx', | |
505 | 21, | |
506 | 1, | |
507 | 3, | |
508 | 5 | |
20effc67 | 509 | ) |
2a845540 TL |
510 | .addTargets( |
511 | [ | |
512 | $.addTargetSchema( | |
513 | ||| | |
514 | rate(node_network_transmit_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or | |
515 | rate(node_network_transmit_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) | |
516 | |||, | |
517 | '{{device}}.tx' | |
518 | ), | |
519 | ] | |
20effc67 | 520 | ) |
2a845540 | 521 | .addSeriesOverride( |
20effc67 | 522 | { |
2a845540 TL |
523 | alias: '/.*tx/', |
524 | transform: 'negative-Y', | |
20effc67 | 525 | } |
2a845540 TL |
526 | ), |
// Single-stat panel 'Raw Capacity' (unit 'bytes'): sum of ceph_osd_stat_bytes restricted, via
// `and on (ceph_daemon)`, to daemons that appear in ceph_disk_occupation for the selected host.
// Both selectors are filtered with $.matchers().
527 | $.simpleSingleStatPanel( | |
528 | 'bytes', | |
529 | 'Raw Capacity', | |
530 | 'Each OSD consists of a Journal/WAL partition and a data partition. The RAW Capacity shown is the sum of the data partitions across all OSDs on the selected OSD hosts.', | |
531 | 'current', | |
532 | ||| | |
533 | sum( | |
534 | ceph_osd_stat_bytes{%(matchers)s} and | |
535 | on (ceph_daemon) ceph_disk_occupation{%(matchers)s, instance=~"($ceph_hosts)([\\\\.:].*)?"} | |
536 | ) | |
537 | ||| % $.matchers(), | |
538 | null, | |
539 | 'time_series', | |
540 | 0, | |
541 | 6, | |
542 | 3, | |
543 | 5 | |
544 | ), | |
// Graph panel 'Network error rate' (unit 'pps') for the selected host: receive error rate
// (legend '{{device}}.rx') plus an added transmit error rate target (legend '{{device}}.tx').
// Series matching /.*tx/ get the 'negative-Y' transform. Each rate ORs the plain metric name
// with its '_total' variant.
545 | $.simpleGraphPanel( | |
546 | {}, | |
547 | 'Network error rate', | |
548 | '', | |
549 | 'pps', | |
550 | 'Send (-) / Receive (+)', | |
551 | null, | |
552 | ||| | |
553 | rate(node_network_receive_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or | |
554 | rate(node_network_receive_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) | |
555 | |||, | |
556 | '{{device}}.rx', | |
557 | 21, | |
558 | 6, | |
559 | 3, | |
560 | 5 | |
20effc67 | 561 | ) |
2a845540 TL |
562 | .addTargets( |
563 | [$.addTargetSchema( | |
564 | ||| | |
565 | rate(node_network_transmit_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or | |
566 | rate(node_network_transmit_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) | |
567 | |||, | |
568 | '{{device}}.tx' | |
569 | )] | |
20effc67 | 570 | ) |
2a845540 TL |
571 | .addSeriesOverride( |
572 | { | |
573 | alias: '/.*tx/', | |
574 | transform: 'negative-Y', | |
575 | } | |
576 | ), | |
// Row header 'OSD Disk Performance Statistics' (full-width, at y = 11), followed by the graph
// panel '$ceph_hosts Disk IOPS' (unit 'ops'): write-completed rate per device (legend
// '... writes') and, via .addTargets(), read-completed rate per device (legend '... reads').
// Both are joined on (instance, device) with ceph_disk_occupation_human to bring in the
// ceph_daemon label; label_replace() trims "instance" to the part before the first ':' or '.'
// and strips "/dev/" from the occupation metric's "device" label so the join keys match.
// Series matching /.*reads/ get the 'negative-Y' transform.
577 | $.addRowSchema(false, | |
578 | true, | |
579 | 'OSD Disk Performance Statistics') + { gridPos: { x: 0, y: 11, w: 24, h: 1 } }, | |
580 | $.simpleGraphPanel( | |
581 | {}, | |
582 | '$ceph_hosts Disk IOPS', | |
583 | "For any OSD devices on the host, this chart shows the iops per physical device. Each device is shown by it's name and corresponding OSD id value", | |
584 | 'ops', | |
585 | 'Read (-) / Write (+)', | |
586 | null, | |
587 | ||| | |
588 | label_replace( | |
589 | ( | |
590 | rate(node_disk_writes_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or | |
591 | rate(node_disk_writes_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) | |
592 | ), "instance", "$1", "instance", "([^:.]*).*" | |
593 | ) * on(instance, device) group_left(ceph_daemon) label_replace( | |
594 | label_replace( | |
595 | ceph_disk_occupation_human{%(matchers)s}, "device", "$1", "device", "/dev/(.*)" | |
596 | ), "instance", "$1", "instance", "([^:.]*).*" | |
597 | ) | |
598 | ||| % $.matchers(), | |
599 | '{{device}}({{ceph_daemon}}) writes', | |
600 | 0, | |
601 | 12, | |
602 | 11, | |
603 | 9 | |
20effc67 | 604 | ) |
2a845540 TL |
605 | .addTargets( |
606 | [ | |
607 | $.addTargetSchema( | |
608 | ||| | |
609 | label_replace( | |
610 | ( | |
611 | rate(node_disk_reads_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or | |
612 | rate(node_disk_reads_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) | |
613 | ), "instance", "$1", "instance", "([^:.]*).*" | |
614 | ) * on(instance, device) group_left(ceph_daemon) label_replace( | |
615 | label_replace( | |
616 | ceph_disk_occupation_human{%(matchers)s},"device", "$1", "device", "/dev/(.*)" | |
617 | ), "instance", "$1", "instance", "([^:.]*).*" | |
618 | ) | |
619 | ||| % $.matchers(), | |
620 | '{{device}}({{ceph_daemon}}) reads' | |
621 | ), | |
622 | ] | |
20effc67 | 623 | ) |
2a845540 TL |
624 | .addSeriesOverride( |
625 | { alias: '/.*reads/', transform: 'negative-Y' } | |
626 | ), | |
// Graph panel '$ceph_hosts Throughput by Disk' (unit 'Bps'): bytes-written rate per device
// (legend '... write') and, via .addTargets(), bytes-read rate per device (legend '... read').
// Both are joined on (instance, device) with ceph_disk_occupation_human to bring in the
// ceph_daemon label, using the same label_replace() normalisation of "instance" and "device".
// Series matching /.*read/ get the 'negative-Y' transform.
627 | $.simpleGraphPanel( | |
628 | {}, | |
629 | '$ceph_hosts Throughput by Disk', | |
630 | 'For OSD hosts, this chart shows the disk bandwidth (read bytes/sec + write bytes/sec) of the physical OSD device. Each device is shown by device name, and corresponding OSD id', | |
631 | 'Bps', | |
632 | 'Read (-) / Write (+)', | |
633 | null, | |
634 | ||| | |
635 | label_replace( | |
636 | ( | |
637 | rate(node_disk_bytes_written{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or | |
638 | rate(node_disk_written_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) | |
639 | ), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device) | |
640 | group_left(ceph_daemon) label_replace( | |
641 | label_replace(ceph_disk_occupation_human{%(matchers)s}, "device", "$1", "device", "/dev/(.*)"), | |
642 | "instance", "$1", "instance", "([^:.]*).*" | |
643 | ) | |
644 | ||| % $.matchers(), | |
645 | '{{device}}({{ceph_daemon}}) write', | |
646 | 12, | |
647 | 12, | |
648 | 11, | |
649 | 9 | |
20effc67 | 650 | ) |
2a845540 TL |
651 | .addTargets( |
652 | [$.addTargetSchema( | |
653 | ||| | |
654 | label_replace( | |
655 | ( | |
656 | rate(node_disk_bytes_read{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or | |
657 | rate(node_disk_read_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) | |
658 | ), | |
659 | "instance", "$1", "instance", "([^:.]*).*") * on(instance, device) | |
660 | group_left(ceph_daemon) label_replace( | |
661 | label_replace(ceph_disk_occupation_human{%(matchers)s}, "device", "$1", "device", "/dev/(.*)"), | |
662 | "instance", "$1", "instance", "([^:.]*).*" | |
663 | ) | |
664 | ||| % $.matchers(), | |
665 | '{{device}}({{ceph_daemon}}) read' | |
666 | )] | |
20effc67 | 667 | ) |
2a845540 TL |
668 | .addSeriesOverride( |
669 | { alias: '/.*read/', transform: 'negative-Y' } | |
670 | ), | |
// Graph panel '$ceph_hosts Disk Latency' (unit 's'): per (instance, device) maximum of
// write time / writes completed, `or` read time / reads completed, where the completed-op rate
// is clamped to at least 0.001 to avoid division by zero. The result is joined on
// (instance, device) with ceph_disk_occupation_human to bring in the ceph_daemon label.
// Fix: the text block is formatted with `% $.matchers()` but had no %(matchers)s placeholder,
// so the occupation selector was not filtered by the dashboard's matchers, unlike the
// neighbouring 'Disk utilization' panel. The placeholder is now present.
671 | $.simpleGraphPanel( | |
672 | {}, | |
673 | '$ceph_hosts Disk Latency', | |
674 | "For OSD hosts, this chart shows the latency at the physical drive. Each drive is shown by device name, with it's corresponding OSD id", | |
675 | 's', | |
676 | '', | |
677 | null, | |
678 | ||| | |
679 | max by(instance, device) (label_replace( | |
680 | (rate(node_disk_write_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])) / | |
681 | clamp_min(rate(node_disk_writes_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]), 0.001) or | |
682 | (rate(node_disk_read_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])) / | |
683 | clamp_min(rate(node_disk_reads_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]), 0.001), | |
684 | "instance", "$1", "instance", "([^:.]*).*" | |
685 | )) * on(instance, device) group_left(ceph_daemon) label_replace( | |
686 | label_replace( | |
687 | ceph_disk_occupation_human{%(matchers)s, instance=~"($ceph_hosts)([\\\\.:].*)?"}, | |
688 | "device", "$1", "device", "/dev/(.*)" | |
689 | ), "instance", "$1", "instance", "([^:.]*).*" | |
690 | ) | |
691 | ||| % $.matchers(), | |
692 | '{{device}}({{ceph_daemon}})', | |
693 | 0, | |
694 | 21, | |
695 | 11, | |
696 | 9 | |
697 | ), | |
// Graph panel '$ceph_hosts Disk utilization' (unit 'percent', axis '%Util'): disk I/O time rate
// (node_disk_io_time_ms / 10, or node_disk_io_time_seconds_total * 100) for the selected host,
// joined on (instance, device) with ceph_disk_occupation_human (filtered by $.matchers() and
// the selected host) to bring in the ceph_daemon label shown in the legend.
698 | $.simpleGraphPanel( | |
699 | {}, | |
700 | '$ceph_hosts Disk utilization', | |
701 | 'Show disk utilization % (util) of any OSD devices on the host by the physical device name and associated OSD id.', | |
702 | 'percent', | |
703 | '%Util', | |
704 | null, | |
705 | ||| | |
706 | label_replace( | |
707 | ( | |
708 | (rate(node_disk_io_time_ms{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) / 10) or | |
709 | rate(node_disk_io_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) * 100 | |
710 | ), "instance", "$1", "instance", "([^:.]*).*" | |
711 | ) * on(instance, device) group_left(ceph_daemon) label_replace( | |
712 | label_replace(ceph_disk_occupation_human{%(matchers)s, instance=~"($ceph_hosts)([\\\\.:].*)?"}, | |
713 | "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*" | |
714 | ) | |
715 | ||| % $.matchers(), | |
716 | '{{device}}({{ceph_daemon}})', | |
717 | 12, | |
718 | 21, | |
719 | 11, | |
720 | 9 | |
721 | ), | |
39ae355f TL |
// Table panel 'Top Slow Ops per Host': topk(10) of ceph_daemon_health_metrics with
// type="SLOW_OPS" for OSD daemons, summed by instance. The column styles show 'instance' and
// 'Value' (as 'Slow Ops') and hide every other column; the panel is placed at x=0, y=40, w=4, h=8.
// Fix: the query is formatted with `% $.matchers()` but had no %(matchers)s placeholder, so it
// was not filtered by the dashboard's matchers. The placeholder is now present, in the same
// '{%(matchers)s, ...}' form used by the other queries in this file.
722 | $.addTableSchema( |
723 | '$datasource', | |
724 | 'This table shows the 10 hosts with the highest number of slow ops', | |
725 | { col: 2, desc: true }, | |
726 | [ | |
727 | $.overviewStyle('Instance', 'instance', 'string', 'short'), | |
728 | $.overviewStyle('Slow Ops', 'Value', 'number', 'none'), | |
729 | $.overviewStyle('', '/.*/', 'hidden', 'short'), | |
730 | ], | |
731 | 'Top Slow Ops per Host', | |
732 | 'table' | |
733 | ) | |
734 | .addTarget( | |
735 | $.addTargetSchema( | |
736 | ||| | |
737 | topk(10, | |
738 | (sum by (instance)(ceph_daemon_health_metrics{%(matchers)s, type="SLOW_OPS", ceph_daemon=~"osd.*"})) | |
739 | ) | |
740 | ||| % $.matchers(), | |
741 | '', | |
742 | 'table', | |
743 | 1, | |
744 | true | |
745 | ) | |
746 | ) + { gridPos: { x: 0, y: 40, w: 4, h: 8 } }, | |
2a845540 | 747 | ]), |
20effc67 | 748 | } |