1 local g = import 'grafonnet/grafana.libsonnet';
3 (import 'utils.libsonnet') {
12 $._config.dashboardTags,
16 type='grafana', id='grafana', name='Grafana', version='5.3.2'
19 type='panel', id='graph', name='Graph', version='5.0.0'
22 type='panel', id='singlestat', name='Singlestat', version='5.0.0'
25 $.addAnnotationSchema(
30 'rgba(0, 211, 255, 1)',
31 'Annotations & Alerts',
36 g.template.datasource('datasource',
42 $.addClusterTemplate()
48 $.addTemplateSchema('osd_hosts',
50 'label_values(ceph_disk_occupation{%(matchers)s}, exported_instance)' % $.matchers(),
58 $.addTemplateSchema('mon_hosts',
60 'label_values(ceph_mon_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
68 $.addTemplateSchema('mds_hosts',
70 'label_values(ceph_mds_inodes{%(matchers)s}, ceph_daemon)' % $.matchers(),
78 $.addTemplateSchema('rgw_hosts',
80 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
88 $.simpleSingleStatPanel(
93 'count(sum by (hostname) (ceph_osd_metadata{%(matchers)s}))' % $.matchers(),
101 $.simpleSingleStatPanel(
104 'Average CPU busy across all hosts (OSD, RGW, MON etc) within the cluster',
109 rate(node_cpu_seconds_total{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) or
110 rate(node_cpu{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval])
121 $.simpleSingleStatPanel(
123 'AVG RAM Utilization',
124 'Average Memory Usage across all hosts in the cluster (excludes buffer/cache usage)',
129 node_memory_MemTotal{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
130 node_memory_MemTotal_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}
132 node_memory_MemFree{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
133 node_memory_MemFree_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}) +
135 node_memory_Cached{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
136 node_memory_Cached_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}
138 node_memory_Buffers{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
139 node_memory_Buffers_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}
141 node_memory_Slab{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
142 node_memory_Slab_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}
146 node_memory_MemTotal{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"} or
147 node_memory_MemTotal_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}
157 $.simpleSingleStatPanel(
160 'IOPS Load at the device as reported by the OS on all OSD hosts',
164 rate(node_disk_reads_completed{instance=~"($osd_hosts).*"}[$__rate_interval]) or
165 rate(node_disk_reads_completed_total{instance=~"($osd_hosts).*"}[$__rate_interval])
167 rate(node_disk_writes_completed{instance=~"($osd_hosts).*"}[$__rate_interval]) or
168 rate(node_disk_writes_completed_total{instance=~"($osd_hosts).*"}[$__rate_interval])
178 $.simpleSingleStatPanel(
180 'AVG Disk Utilization',
181 'Average Disk utilization for all OSD data devices (i.e. excludes journal/WAL)',
186 (rate(node_disk_io_time_ms[$__rate_interval]) / 10 ) or
187 (rate(node_disk_io_time_seconds_total[$__rate_interval]) * 100),
188 "instance", "$1", "instance", "([^.:]*).*"
189 ) * on(instance, device) group_left(ceph_daemon) label_replace(
191 ceph_disk_occupation_human{%(matchers)s, instance=~"($osd_hosts).*"},
192 "device", "$1", "device", "/dev/(.*)"
193 ), "instance", "$1", "instance", "([^.:]*).*"
204 $.simpleSingleStatPanel(
207 'Total send/receive network load across all hosts in the ceph cluster',
212 rate(node_network_receive_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) or
213 rate(node_network_receive_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval])
214 ) unless on (device, instance)
215 label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)")
219 rate(node_network_transmit_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) or
220 rate(node_network_transmit_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval])
221 ) unless on (device, instance)
222 label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)")
234 'CPU Busy - Top 10 Hosts',
235 'Show the top 10 busiest hosts by cpu',
244 rate(node_cpu_seconds_total{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval]) or
245 rate(node_cpu{mode='idle',instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*"}[$__rate_interval])
259 'Network Load - Top 10 Hosts',
260 'Top 10 hosts by network load',
265 topk(10, (sum by(instance) (
267 rate(node_network_receive_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) or
268 rate(node_network_receive_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval])
271 rate(node_network_transmit_bytes{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval]) or
272 rate(node_network_transmit_bytes_total{instance=~"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*",device!="lo"}[$__rate_interval])
273 ) unless on (device, instance)
274 label_replace((bonding_slaves > 0), "device", "$1", "master", "(.+)"))
292 $._config.dashboardTags + ['overview'],
296 type='grafana', id='grafana', name='Grafana', version='5.3.2'
299 type='panel', id='graph', name='Graph', version='5.0.0'
302 type='panel', id='singlestat', name='Singlestat', version='5.0.0'
305 $.addAnnotationSchema(
306 1, '-- Grafana --', true, true, 'rgba(0, 211, 255, 1)', 'Annotations & Alerts', 'dashboard'
310 g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
313 $.addClusterTemplate()
319 $.addTemplateSchema('ceph_hosts',
321 'label_values({%(clusterMatcher)s}, instance)' % $.matchers(),
329 $.addRowSchema(false, true, '$ceph_hosts System Overview') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
330 $.simpleSingleStatPanel(
335 "count(sum by (ceph_daemon) (ceph_osd_metadata{%(matchers)s, hostname='$ceph_hosts'}))" % $.matchers(),
345 interrupt: '#447EBC',
352 "Shows the CPU breakdown. When multiple servers are selected, only the first host's cpu data is shown",
358 rate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[$__rate_interval]) or
359 rate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?", mode=~"(irq|nice|softirq|steal|system|user|iowait)"}[$__rate_interval])
362 sum(rate(node_cpu{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or
363 rate(node_cpu_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]))
375 Available: '#508642',
388 node_memory_MemFree{instance=~"$ceph_hosts([\\\\.:].*)?"} or
389 node_memory_MemFree_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
401 node_memory_MemTotal{instance=~"$ceph_hosts([\\\\.:].*)?"} or
402 node_memory_MemTotal_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
409 node_memory_Cached{instance=~"$ceph_hosts([\\\\.:].*)?"} or
410 node_memory_Cached_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
412 node_memory_Buffers{instance=~"$ceph_hosts([\\\\.:].*)?"} or
413 node_memory_Buffers_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
415 node_memory_Slab{instance=~"$ceph_hosts([\\\\.:].*)?"} or
416 node_memory_Slab_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
424 node_memory_MemTotal{instance=~"$ceph_hosts([\\\\.:].*)?"} or
425 node_memory_MemTotal_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
428 node_memory_MemFree{instance=~"$ceph_hosts([\\\\.:].*)?"} or
429 node_memory_MemFree_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
431 node_memory_Cached{instance=~"$ceph_hosts([\\\\.:].*)?"} or
432 node_memory_Cached_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
434 node_memory_Buffers{instance=~"$ceph_hosts([\\\\.:].*)?"} or
435 node_memory_Buffers_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
438 node_memory_Slab{instance=~"$ceph_hosts([\\\\.:].*)?"} or
439 node_memory_Slab_bytes{instance=~"$ceph_hosts([\\\\.:].*)?"}
459 "Show the network load (rx,tx) across all interfaces (excluding loopback 'lo')",
461 'Send (-) / Receive (+)',
466 node_network_receive_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]) or
467 rate(node_network_receive_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]
482 rate(node_network_transmit_bytes{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval]) or
483 rate(node_network_transmit_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?",device!="lo"}[$__rate_interval])
491 { alias: '/.*tx/', transform: 'negative-Y' }
498 'Send (-) / Receive (+)',
501 rate(node_network_receive_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or
502 rate(node_network_receive_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval])
514 rate(node_network_transmit_drop{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or
515 rate(node_network_transmit_drop_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval])
524 transform: 'negative-Y',
527 $.simpleSingleStatPanel(
530 'Each OSD consists of a Journal/WAL partition and a data partition. The RAW Capacity shown is the sum of the data partitions across all OSDs on the selected OSD hosts.',
534 ceph_osd_stat_bytes{%(matchers)s} and
535 on (ceph_daemon) ceph_disk_occupation{%(matchers)s, instance=~"($ceph_hosts)([\\\\.:].*)?"}
547 'Network error rate',
550 'Send (-) / Receive (+)',
553 rate(node_network_receive_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or
554 rate(node_network_receive_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval])
565 rate(node_network_transmit_errs{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval]) or
566 rate(node_network_transmit_errs_total{instance=~"$ceph_hosts([\\\\.:].*)?"}[$__rate_interval])
574 transform: 'negative-Y',
577 $.addRowSchema(false,
579 'OSD Disk Performance Statistics') + { gridPos: { x: 0, y: 11, w: 24, h: 1 } },
582 '$ceph_hosts Disk IOPS',
583 "For any OSD devices on the host, this chart shows the iops per physical device. Each device is shown by it's name and corresponding OSD id value",
585 'Read (-) / Write (+)',
590 rate(node_disk_writes_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or
591 rate(node_disk_writes_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])
592 ), "instance", "$1", "instance", "([^:.]*).*"
593 ) * on(instance, device) group_left(ceph_daemon) label_replace(
595 ceph_disk_occupation_human{%(matchers)s}, "device", "$1", "device", "/dev/(.*)"
596 ), "instance", "$1", "instance", "([^:.]*).*"
599 '{{device}}({{ceph_daemon}}) writes',
611 rate(node_disk_reads_completed{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or
612 rate(node_disk_reads_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])
613 ), "instance", "$1", "instance", "([^:.]*).*"
614 ) * on(instance, device) group_left(ceph_daemon) label_replace(
616 ceph_disk_occupation_human{%(matchers)s},"device", "$1", "device", "/dev/(.*)"
617 ), "instance", "$1", "instance", "([^:.]*).*"
620 '{{device}}({{ceph_daemon}}) reads'
625 { alias: '/.*reads/', transform: 'negative-Y' }
629 '$ceph_hosts Throughput by Disk',
630 'For OSD hosts, this chart shows the disk bandwidth (read bytes/sec + write bytes/sec) of the physical OSD device. Each device is shown by device name, and corresponding OSD id',
632 'Read (-) / Write (+)',
637 rate(node_disk_bytes_written{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or
638 rate(node_disk_written_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])
639 ), "instance", "$1", "instance", "([^:.]*).*") * on(instance, device)
640 group_left(ceph_daemon) label_replace(
641 label_replace(ceph_disk_occupation_human{%(matchers)s}, "device", "$1", "device", "/dev/(.*)"),
642 "instance", "$1", "instance", "([^:.]*).*"
645 '{{device}}({{ceph_daemon}}) write',
656 rate(node_disk_bytes_read{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) or
657 rate(node_disk_read_bytes_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])
659 "instance", "$1", "instance", "([^:.]*).*") * on(instance, device)
660 group_left(ceph_daemon) label_replace(
661 label_replace(ceph_disk_occupation_human{%(matchers)s}, "device", "$1", "device", "/dev/(.*)"),
662 "instance", "$1", "instance", "([^:.]*).*"
665 '{{device}}({{ceph_daemon}}) read'
669 { alias: '/.*read/', transform: 'negative-Y' }
673 '$ceph_hosts Disk Latency',
674 "For OSD hosts, this chart shows the latency at the physical drive. Each drive is shown by device name, with it's corresponding OSD id",
679 max by(instance, device) (label_replace(
680 (rate(node_disk_write_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])) /
681 clamp_min(rate(node_disk_writes_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]), 0.001) or
682 (rate(node_disk_read_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval])) /
683 clamp_min(rate(node_disk_reads_completed_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]), 0.001),
684 "instance", "$1", "instance", "([^:.]*).*"
685 )) * on(instance, device) group_left(ceph_daemon) label_replace(
687 ceph_disk_occupation_human{instance=~"($ceph_hosts)([\\\\.:].*)?"},
688 "device", "$1", "device", "/dev/(.*)"
689 ), "instance", "$1", "instance", "([^:.]*).*"
692 '{{device}}({{ceph_daemon}})',
700 '$ceph_hosts Disk utilization',
701 'Show disk utilization % (util) of any OSD devices on the host by the physical device name and associated OSD id.',
708 (rate(node_disk_io_time_ms{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) / 10) or
709 rate(node_disk_io_time_seconds_total{instance=~"($ceph_hosts)([\\\\.:].*)?"}[$__rate_interval]) * 100
710 ), "instance", "$1", "instance", "([^:.]*).*"
711 ) * on(instance, device) group_left(ceph_daemon) label_replace(
712 label_replace(ceph_disk_occupation_human{%(matchers)s, instance=~"($ceph_hosts)([\\\\.:].*)?"},
713 "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*"
716 '{{device}}({{ceph_daemon}})',