1 local g = import 'grafonnet/grafana.libsonnet';
2 local u = import 'utils.libsonnet';
// Table-column style helper for the OSD overview dashboards.
// Wraps u.addStyle with fixed red/orange/amber-green RGBA threshold colours,
// a 'YYYY-MM-DD HH:mm:ss' date format, 2 decimals and colorMode 1, leaving
// only the column alias, match pattern, render type and unit configurable.
// NOTE(review): the exact meaning of the positional numeric args (2, 1) and
// the empty lists depends on u.addStyle in utils.libsonnet — confirm there.
7 local OsdOverviewStyle(alias, pattern, type, unit) =
8 u.addStyle(alias, null, [
9 'rgba(245, 54, 54, 0.9)',
10 'rgba(237, 129, 40, 0.89)',
11 'rgba(50, 172, 45, 0.97)',
12 ], 'YYYY-MM-DD HH:mm:ss', 2, 1, pattern, [], type, unit, []);
// Graph-panel helper: builds a panel via u.graphPanelSchema with a single
// Prometheus target (u.addTargetSchema) and mixes in a gridPos from the
// x/y/w/h parameters.
// NOTE(review): parameter list is truncated in this view (orig lines 14-24
// not visible) — see u.graphPanelSchema for the full signature.
13 local OsdOverviewGraphPanel(alias,
25 u.graphPanelSchema(alias,
38 [u.addTargetSchema(expr, 1, 'time_series', legendFormat1)]
39 ) + { gridPos: { x: x, y: y, w: w, h: h } };
// Pie-chart panel helper: thin wrapper over u.addPieChartSchema taking the
// series alias/colour map, a description and a panel title.
// NOTE(review): remainder of the call (orig lines 42+) not visible here.
40 local OsdOverviewPieChartPanel(alias, description, title) =
41 u.addPieChartSchema(alias,
// Singlestat panel helper: builds a panel via u.addSingleStatSchema with one
// Prometheus target (empty legend) and mixes in a gridPos from x/y/w/h.
// NOTE(review): parameter list is truncated in this view (orig lines 49-63
// not visible) — see u.addSingleStatSchema for the full signature.
48 local OsdOverviewSingleStatPanel(colors,
64 u.addSingleStatSchema(
78 u.addTargetSchema(expr, 1, targetFormat, '')
79 ) + { gridPos: { x: x, y: y, w: w, h: h } };
91 refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
92 time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
96 u.addAnnotationSchema(
101 'rgba(0, 211, 255, 1)',
102 'Annotations & Alerts',
107 type='grafana', id='grafana', name='Grafana', version='5.0.0'
110 type='panel', id='grafana-piechart-panel', name='Pie Chart', version='1.3.3'
113 type='panel', id='graph', name='Graph', version='5.0.0'
116 type='panel', id='table', name='Table', version='5.0.0'
119 g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
122 OsdOverviewGraphPanel(
123 { '@95%ile': '#e0752d' },
124 'OSD Read Latencies',
129 'avg (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)',
139 'max (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)',
145 'quantile(0.95,\n (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)\n)', 1, 'time_series', '@95%ile'
151 "This table shows the osd's that are delivering the 10 highest read latencies within the cluster",
152 { col: 2, desc: true },
154 OsdOverviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'),
155 OsdOverviewStyle('Latency (ms)', 'Value', 'number', 'none'),
156 OsdOverviewStyle('', '/.*/', 'hidden', 'short'),
158 'Highest READ Latencies',
163 'topk(10,\n (sort(\n (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)\n ))\n)\n\n', 1, 'table', ''
165 ) + { gridPos: { x: 8, y: 0, w: 4, h: 8 } },
166 OsdOverviewGraphPanel(
168 '@95%ile write': '#e0752d',
170 'OSD Write Latencies',
175 'avg (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)',
185 'max (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)',
191 'quantile(0.95,\n (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)\n)', 1, 'time_series', '@95%ile write'
197 "This table shows the osd's that are delivering the 10 highest write latencies within the cluster",
198 { col: 2, desc: true },
201 'OSD ID', 'ceph_daemon', 'string', 'short'
203 OsdOverviewStyle('Latency (ms)', 'Value', 'number', 'none'),
204 OsdOverviewStyle('', '/.*/', 'hidden', 'short'),
206 'Highest WRITE Latencies',
211 'topk(10,\n (sort(\n (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)\n ))\n)\n\n',
216 ) + { gridPos: { x: 20, y: 0, w: 4, h: 8 } },
217 OsdOverviewPieChartPanel(
218 {}, '', 'OSD Types Summary'
221 u.addTargetSchema('count by (device_class) (ceph_osd_metadata)', 1, 'time_series', '{{device_class}}')
222 ) + { gridPos: { x: 0, y: 8, w: 4, h: 8 } },
223 OsdOverviewPieChartPanel(
224 { 'Non-Encrypted': '#E5AC0E' }, '', 'OSD Objectstore Types'
228 'count(ceph_bluefs_wal_total_bytes)', 1, 'time_series', 'bluestore'
233 'absent(ceph_bluefs_wal_total_bytes)*count(ceph_osd_metadata)', 1, 'time_series', 'filestore'
235 ) + { gridPos: { x: 4, y: 8, w: 4, h: 8 } },
236 OsdOverviewPieChartPanel(
237 {}, 'The pie chart shows the various OSD sizes used within the cluster', 'OSD Size Summary'
239 .addTarget(u.addTargetSchema(
240 'count(ceph_osd_stat_bytes < 1099511627776)', 1, 'time_series', '<1TB'
242 .addTarget(u.addTargetSchema(
243 'count(ceph_osd_stat_bytes >= 1099511627776 < 2199023255552)', 1, 'time_series', '<2TB'
245 .addTarget(u.addTargetSchema(
246 'count(ceph_osd_stat_bytes >= 2199023255552 < 3298534883328)', 1, 'time_series', '<3TB'
248 .addTarget(u.addTargetSchema(
249 'count(ceph_osd_stat_bytes >= 3298534883328 < 4398046511104)', 1, 'time_series', '<4TB'
251 .addTarget(u.addTargetSchema(
252 'count(ceph_osd_stat_bytes >= 4398046511104 < 6597069766656)', 1, 'time_series', '<6TB'
254 .addTarget(u.addTargetSchema(
255 'count(ceph_osd_stat_bytes >= 6597069766656 < 8796093022208)', 1, 'time_series', '<8TB'
257 .addTarget(u.addTargetSchema(
258 'count(ceph_osd_stat_bytes >= 8796093022208 < 10995116277760)', 1, 'time_series', '<10TB'
260 .addTarget(u.addTargetSchema(
261 'count(ceph_osd_stat_bytes >= 10995116277760 < 13194139533312)', 1, 'time_series', '<12TB'
263 .addTarget(u.addTargetSchema(
264 'count(ceph_osd_stat_bytes >= 13194139533312)', 1, 'time_series', '<12TB+'
265 )) + { gridPos: { x: 8, y: 8, w: 4, h: 8 } },
266 g.graphPanel.new(bars=true,
267 datasource='$datasource',
268 title='Distribution of PGs per OSD',
270 x_axis_mode='histogram',
271 x_axis_values=['total'],
276 nullPointMode='null')
277 .addTarget(u.addTargetSchema(
278 'ceph_osd_numpg\n', 1, 'time_series', 'PGs per OSD'
279 )) + { gridPos: { x: 12, y: 8, w: 8, h: 8 } },
280 OsdOverviewSingleStatPanel(
281 ['#d44a3a', '#299c46'],
283 'OSD onode Hits Ratio',
284 'This gauge panel shows onode Hits ratio to help determine if increasing RAM per OSD could help improve the performance of the cluster',
291 'sum(ceph_bluestore_onode_hits)/(sum(ceph_bluestore_onode_hits) + sum(ceph_bluestore_onode_misses))',
298 u.addRowSchema(false,
300 'R/W Profile') + { gridPos: { x: 0, y: 16, w: 24, h: 1 } },
301 OsdOverviewGraphPanel(
303 'Read/Write Profile',
304 'Show the read/write workload profile overtime',
308 'round(sum(irate(ceph_pool_rd[30s])))',
315 .addTargets([u.addTargetSchema(
316 'round(sum(irate(ceph_pool_wr[30s])))', 1, 'time_series', 'Writes'
319 'osd-device-details.json':
// Two-series graph-panel helper for the OSD device-details dashboard:
// builds a panel via u.graphPanelSchema with two Prometheus targets
// (expr1/expr2 with their legend formats) and mixes in a gridPos.
// NOTE(review): parameter list is truncated in this view (orig lines 321-331
// not visible) — confirm against u.graphPanelSchema.
320 local OsdDeviceDetailsPanel(title,
332 u.graphPanelSchema({},
346 u.addTargetSchema(expr1,
350 u.addTargetSchema(expr2, 1, 'time_series', legendFormat2),
352 ) + { gridPos: { x: x, y: y, w: w, h: h } };
355 'OSD device details',
364 refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
365 time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
369 u.addAnnotationSchema(
374 'rgba(0, 211, 255, 1)',
375 'Annotations & Alerts',
380 type='grafana', id='grafana', name='Grafana', version='5.3.2'
383 type='panel', id='graph', name='Graph', version='5.0.0'
386 g.template.datasource('datasource',
392 u.addTemplateSchema('osd',
394 'label_values(ceph_osd_metadata,ceph_daemon)',
403 false, true, 'OSD Performance'
404 ) + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
405 OsdDeviceDetailsPanel(
409 'Read (-) / Write (+)',
410 'irate(ceph_osd_op_r_latency_sum{ceph_daemon=~"$osd"}[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m])',
411 'irate(ceph_osd_op_w_latency_sum{ceph_daemon=~"$osd"}[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m])',
422 transform: 'negative-Y',
425 OsdDeviceDetailsPanel(
429 'Read (-) / Write (+)',
430 'irate(ceph_osd_op_r{ceph_daemon=~"$osd"}[1m])',
431 'irate(ceph_osd_op_w{ceph_daemon=~"$osd"}[1m])',
440 { alias: 'Reads', transform: 'negative-Y' }
442 OsdDeviceDetailsPanel(
446 'Read (-) / Write (+)',
447 'irate(ceph_osd_op_r_out_bytes{ceph_daemon=~"$osd"}[1m])',
448 'irate(ceph_osd_op_w_in_bytes{ceph_daemon=~"$osd"}[1m])',
456 .addSeriesOverride({ alias: 'Read Bytes', transform: 'negative-Y' }),
458 false, true, 'Physical Device Performance'
459 ) + { gridPos: { x: 0, y: 10, w: 24, h: 1 } },
460 OsdDeviceDetailsPanel(
461 'Physical Device Latency for $osd',
464 'Read (-) / Write (+)',
465 '(label_replace(irate(node_disk_read_time_seconds_total[1m]) / irate(node_disk_reads_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*"))',
466 '(label_replace(irate(node_disk_write_time_seconds_total[1m]) / irate(node_disk_writes_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*"))',
467 '{{instance}}/{{device}} Reads',
468 '{{instance}}/{{device}} Writes',
475 { alias: '/.*Reads/', transform: 'negative-Y' }
477 OsdDeviceDetailsPanel(
478 'Physical Device R/W IOPS for $osd',
481 'Read (-) / Write (+)',
482 'label_replace(irate(node_disk_writes_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
483 'label_replace(irate(node_disk_reads_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
484 '{{device}} on {{instance}} Writes',
485 '{{device}} on {{instance}} Reads',
492 { alias: '/.*Reads/', transform: 'negative-Y' }
494 OsdDeviceDetailsPanel(
495 'Physical Device R/W Bytes for $osd',
498 'Read (-) / Write (+)',
499 'label_replace(irate(node_disk_read_bytes_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
500 'label_replace(irate(node_disk_written_bytes_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
501 '{{instance}} {{device}} Reads',
502 '{{instance}} {{device}} Writes',
509 { alias: '/.*Reads/', transform: 'negative-Y' }
513 'Physical Device Util% for $osd',
525 .addTarget(u.addTargetSchema(
526 'label_replace(irate(node_disk_io_time_seconds_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
529 '{{device}} on {{instance}}'
530 )) + { gridPos: { x: 18, y: 11, w: 6, h: 9 } },