local g = import 'grafonnet/grafana.libsonnet';
(import 'utils.libsonnet') {
    $._config.dashboardTags,
    $.addAnnotationSchema(
      'rgba(0, 211, 255, 1)',
      'Annotations & Alerts',
      type='grafana', id='grafana', name='Grafana', version='5.0.0'
      type='panel', id='grafana-piechart-panel', name='Pie Chart', version='1.3.3'
      type='panel', id='graph', name='Graph', version='5.0.0'
      type='panel', id='table', name='Table', version='5.0.0'
      g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
      $.addClusterTemplate()
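    // All PromQL below is built with jsonnet string templating: '%(matchers)s'
    // placeholders are filled in via '% $.matchers()' from utils.libsonnet.
    // Assuming matchers() yields a cluster-scoped selector such as
    // 'cluster=~"$cluster"' (an illustration -- its definition is not shown in
    // this file), a templated query renders to something like:
    //   rate(ceph_osd_op_r_latency_sum{cluster=~"$cluster"}[$__rate_interval])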
    { '@95%ile': '#e0752d' },
      rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
        on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * 1000
      rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
        on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * 1000
      rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
        on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval])
94 "This table shows the osd's that are delivering the 10 highest read latencies within the cluster",
95 { col: 2, desc: true },
97 $.overviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'),
98 $.overviewStyle('Latency (ms)', 'Value', 'number', 'none'),
99 $.overviewStyle('', '/.*/', 'hidden', 'short'),
101 'Highest READ Latencies',
110 rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
111 on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) *
122 ) + { gridPos: { x: 8, y: 0, w: 4, h: 8 } },
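    // The write-latency panels below mirror the read-latency set above:
    // average latency per op is rate(<op>_latency_sum) / rate(<op>_latency_count),
    // scaled by 1000 to express seconds as milliseconds.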
      '@95%ile write': '#e0752d',
    'OSD Write Latencies',
      rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
        on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval])
      rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
        on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) *
    ||| % $.matchers(), 'MAX write'
      rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
        on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) *
    ||| % $.matchers(), '@95%ile write'
169 "This table shows the osd's that are delivering the 10 highest write latencies within the cluster",
170 { col: 2, desc: true },
173 'OSD ID', 'ceph_daemon', 'string', 'short'
175 $.overviewStyle('Latency (ms)', 'Value', 'number', 'none'),
176 $.overviewStyle('', '/.*/', 'hidden', 'short'),
178 'Highest WRITE Latencies',
186 (rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
187 on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) *
197 ) + { gridPos: { x: 20, y: 0, w: 4, h: 8 } },
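    // Pie: OSD count per CRUSH device class (e.g. hdd/ssd/nvme), taken from
    // the device_class label on ceph_osd_metadata.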
      {}, '', 'OSD Types Summary'
      $.addTargetSchema('count by (device_class) (ceph_osd_metadata{%(matchers)s})' % $.matchers(), '{{device_class}}')
    ) + { gridPos: { x: 0, y: 8, w: 4, h: 8 } },
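    // Objectstore pie: bluestore OSDs are counted directly via their
    // ceph_bluefs_wal_total_bytes series; filestore is inferred by absence.
    // absent() returns 1 only when no bluefs series exists at all, so the
    // product equals the total OSD count on filestore-only clusters and
    // returns no data otherwise.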
      { 'Non-Encrypted': '#E5AC0E' }, '', 'OSD Objectstore Types'
        'count(ceph_bluefs_wal_total_bytes{%(matchers)s})' % $.matchers(), 'bluestore', 'time_series', 2
        'absent(ceph_bluefs_wal_total_bytes{%(matchers)s}) * count(ceph_osd_metadata{%(matchers)s})' % $.matchers(), 'filestore', 'time_series', 2
    ) + { gridPos: { x: 4, y: 8, w: 4, h: 8 } },
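    // The size buckets below use binary units: 1099511627776 B = 1 TiB,
    // 2199023255552 B = 2 TiB, and so on. Chained comparison operators in
    // PromQL act as successive filters, so
    // 'metric >= 1099511627776 < 2199023255552' keeps only OSDs whose size
    // falls in the [1 TiB, 2 TiB) range.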
      {}, 'The pie chart shows the various OSD sizes used within the cluster', 'OSD Size Summary'
    .addTarget($.addTargetSchema(
      'count(ceph_osd_stat_bytes{%(matchers)s} < 1099511627776)' % $.matchers(), '<1TB', 'time_series', 2
    .addTarget($.addTargetSchema(
      'count(ceph_osd_stat_bytes{%(matchers)s} >= 1099511627776 < 2199023255552)' % $.matchers(), '<2TB', 'time_series', 2
    .addTarget($.addTargetSchema(
      'count(ceph_osd_stat_bytes{%(matchers)s} >= 2199023255552 < 3298534883328)' % $.matchers(), '<3TB', 'time_series', 2
    .addTarget($.addTargetSchema(
      'count(ceph_osd_stat_bytes{%(matchers)s} >= 3298534883328 < 4398046511104)' % $.matchers(), '<4TB', 'time_series', 2
    .addTarget($.addTargetSchema(
      'count(ceph_osd_stat_bytes{%(matchers)s} >= 4398046511104 < 6597069766656)' % $.matchers(), '<6TB', 'time_series', 2
    .addTarget($.addTargetSchema(
      'count(ceph_osd_stat_bytes{%(matchers)s} >= 6597069766656 < 8796093022208)' % $.matchers(), '<8TB', 'time_series', 2
    .addTarget($.addTargetSchema(
      'count(ceph_osd_stat_bytes{%(matchers)s} >= 8796093022208 < 10995116277760)' % $.matchers(), '<10TB', 'time_series', 2
    .addTarget($.addTargetSchema(
      'count(ceph_osd_stat_bytes{%(matchers)s} >= 10995116277760 < 13194139533312)' % $.matchers(), '<12TB', 'time_series', 2
    .addTarget($.addTargetSchema(
      'count(ceph_osd_stat_bytes{%(matchers)s} >= 13194139533312)' % $.matchers(), '12TB+', 'time_series', 2
    )) + { gridPos: { x: 8, y: 8, w: 4, h: 8 } },
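    // Histogram of placement-group counts: x_axis_mode='histogram' buckets the
    // instantaneous ceph_osd_numpg values, showing how evenly PGs are spread
    // across OSDs (a wide spread usually indicates poor balancing).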
    g.graphPanel.new(bars=true,
                     datasource='$datasource',
                     title='Distribution of PGs per OSD',
                     x_axis_mode='histogram',
                     x_axis_values=['total'],
                     nullPointMode='null')
    .addTarget($.addTargetSchema(
      'ceph_osd_numpg{%(matchers)s}' % $.matchers(), 'PGs per OSD', 'time_series', 1, true
    )) + { gridPos: { x: 12, y: 8, w: 8, h: 8 } },
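    // onode hit ratio = hits / (hits + misses), aggregated across all OSDs.
    // onodes are BlueStore's cached in-memory object metadata entries, so a
    // persistently low ratio suggests the cache is undersized for the working set.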
    $.gaugeSingleStatPanel(
      'OSD onode Hits Ratio',
      'This gauge shows the onode hit ratio, to help determine whether increasing RAM per OSD could improve cluster performance',
        sum(ceph_bluestore_onode_hits{%(matchers)s}) / (
          sum(ceph_bluestore_onode_hits{%(matchers)s}) +
          sum(ceph_bluestore_onode_misses{%(matchers)s})
    $.addRowSchema(false,
                   'R/W Profile') + { gridPos: { x: 0, y: 16, w: 24, h: 1 } },
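    // Cluster-wide client workload mix: read and write IOPS summed over all
    // pools, useful for spotting read- vs write-heavy periods.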
      'Read/Write Profile',
      'Shows the read/write workload profile over time',
      'round(sum(rate(ceph_pool_rd{%(matchers)s}[$__rate_interval])))' % $.matchers(),
    .addTargets([$.addTargetSchema(
      'round(sum(rate(ceph_pool_wr{%(matchers)s}[$__rate_interval])))' % $.matchers(), 'Writes'
  'osd-device-details.json':
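    // Shared builder for the graph panels on this dashboard. Its parameter
    // list is partially elided here, but the visible arguments (title, two
    // expression/legend pairs, and gridPos coordinates) are threaded through
    // the same graphPanelSchema/addTargetSchema helpers used above.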
    local OsdDeviceDetailsPanel(title,
      $.graphPanelSchema({},
        $.addTargetSchema(expr1,
        $.addTargetSchema(expr2, legendFormat2),
      ) + { gridPos: { x: x, y: y, w: w, h: h } };
      'OSD device details',
      $._config.dashboardTags,
      $.addAnnotationSchema(
        'rgba(0, 211, 255, 1)',
        'Annotations & Alerts',
        type='grafana', id='grafana', name='Grafana', version='5.3.2'
        type='panel', id='graph', name='Graph', version='5.0.0'
        g.template.datasource('datasource',
        $.addClusterTemplate()
        $.addTemplateSchema('osd',
          'label_values(ceph_osd_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
      false, true, 'OSD Performance'
    ) + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
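    // Each $osd panel plots reads and writes on one graph; a 'negative-Y'
    // series override flips the read series below the axis, matching the
    // 'Read (-) / Write (+)' axis label.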
    OsdDeviceDetailsPanel(
      'Read (-) / Write (+)',
        rate(ceph_osd_op_r_latency_sum{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval]) /
          on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval])
        rate(ceph_osd_op_w_latency_sum{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval]) /
          on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval])
      transform: 'negative-Y',
    OsdDeviceDetailsPanel(
      'Read (-) / Write (+)',
      'rate(ceph_osd_op_r{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % $.matchers(),
      'rate(ceph_osd_op_w{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % $.matchers(),
      { alias: 'Reads', transform: 'negative-Y' }
    OsdDeviceDetailsPanel(
      'Read (-) / Write (+)',
      'rate(ceph_osd_op_r_out_bytes{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % $.matchers(),
      'rate(ceph_osd_op_w_in_bytes{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % $.matchers(),
    .addSeriesOverride({ alias: 'Read Bytes', transform: 'negative-Y' }),
      false, true, 'Physical Device Performance'
    ) + { gridPos: { x: 0, y: 10, w: 24, h: 1 } },
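    // The physical-device panels join node_exporter disk metrics against
    // Ceph's ceph_disk_occupation_human mapping so that only the devices
    // backing $osd are charted. Both sides are normalized before the join:
    //   label_replace(..., "instance", "$1", "instance", "([^:.]*).*")
    //     strips the port/domain suffix from the instance label, and
    //   label_replace(..., "device", "$1", "device", "/dev/(.*)")
    //     strips the /dev/ prefix from the device label,
    // after which 'and on (instance, device)' keeps only matching series.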
    OsdDeviceDetailsPanel(
      'Physical Device Latency for $osd',
      'Read (-) / Write (+)',
          rate(node_disk_read_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]) /
            rate(node_disk_reads_completed_total{%(clusterMatcher)s}[$__rate_interval]),
          "instance", "$1", "instance", "([^:.]*).*"
        ) and on (instance, device) label_replace(
            ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
            "device", "$1", "device", "/dev/(.*)"
          ), "instance", "$1", "instance", "([^:.]*).*"
          rate(node_disk_write_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]) /
            rate(node_disk_writes_completed_total{%(clusterMatcher)s}[$__rate_interval]),
          "instance", "$1", "instance", "([^:.]*).*") and on (instance, device)
            ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"
          ), "instance", "$1", "instance", "([^:.]*).*"
      '{{instance}}/{{device}} Reads',
      '{{instance}}/{{device}} Writes',
      { alias: '/.*Reads/', transform: 'negative-Y' }
    OsdDeviceDetailsPanel(
      'Physical Device R/W IOPS for $osd',
      'Read (-) / Write (+)',
          rate(node_disk_writes_completed_total{%(clusterMatcher)s}[$__rate_interval]),
          "instance", "$1", "instance", "([^:.]*).*"
        ) and on (instance, device) label_replace(
            ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
            "device", "$1", "device", "/dev/(.*)"
          ), "instance", "$1", "instance", "([^:.]*).*"
          rate(node_disk_reads_completed_total{%(clusterMatcher)s}[$__rate_interval]),
          "instance", "$1", "instance", "([^:.]*).*"
        ) and on (instance, device) label_replace(
            ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
            "device", "$1", "device", "/dev/(.*)"
          ), "instance", "$1", "instance", "([^:.]*).*"
      '{{device}} on {{instance}} Writes',
      '{{device}} on {{instance}} Reads',
      { alias: '/.*Reads/', transform: 'negative-Y' }
    OsdDeviceDetailsPanel(
      'Physical Device R/W Bytes for $osd',
      'Read (-) / Write (+)',
          rate(node_disk_read_bytes_total{%(clusterMatcher)s}[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*"
        ) and on (instance, device) label_replace(
            ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
            "device", "$1", "device", "/dev/(.*)"
          ), "instance", "$1", "instance", "([^:.]*).*"
          rate(node_disk_written_bytes_total{%(clusterMatcher)s}[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*"
        ) and on (instance, device) label_replace(
            ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
            "device", "$1", "device", "/dev/(.*)"
          ), "instance", "$1", "instance", "([^:.]*).*"
      '{{instance}} {{device}} Reads',
      '{{instance}} {{device}} Writes',
      { alias: '/.*Reads/', transform: 'negative-Y' }
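    // Device utilization: rate(node_disk_io_time_seconds_total) is the
    // fraction of wall-clock time the device spent servicing I/O, i.e. the
    // same busy percentage that iostat reports as %util.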
      'Physical Device Util% for $osd',
    .addTarget($.addTargetSchema(
          rate(node_disk_io_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]),
          "instance", "$1", "instance", "([^:.]*).*"
        ) and on (instance, device) label_replace(
            ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"
          ), "instance", "$1", "instance", "([^:.]*).*"
      '{{device}} on {{instance}}'
    )) + { gridPos: { x: 18, y: 11, w: 6, h: 9 } },