// ceph/monitoring/ceph-mixin/dashboards/osd.libsonnet
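// Builds two Grafana dashboards from the grafonnet library and the local
// utils.libsonnet helpers: 'osds-overview.json' (cluster-wide OSD latencies,
// OSD type/objectstore/size summaries, PG distribution, onode hit ratio and
// R/W profile) and 'osd-device-details.json' (per-OSD and per-physical-device
// latency, IOPS, throughput and utilisation for a selected $osd).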
local g = import 'grafonnet/grafana.libsonnet';
local u = import 'utils.libsonnet';

{
  grafanaDashboards+:: {
    'osds-overview.json':
      local OsdOverviewStyle(alias, pattern, type, unit) =
        u.addStyle(alias, null, [
          'rgba(245, 54, 54, 0.9)',
          'rgba(237, 129, 40, 0.89)',
          'rgba(50, 172, 45, 0.97)',
        ], 'YYYY-MM-DD HH:mm:ss', 2, 1, pattern, [], type, unit, []);
      local OsdOverviewGraphPanel(alias,
                                  title,
                                  description,
                                  formatY1,
                                  labelY1,
                                  min,
                                  expr,
                                  legendFormat1,
                                  x,
                                  y,
                                  w,
                                  h) =
        u.graphPanelSchema(alias,
                           title,
                           description,
                           'null',
                           false,
                           formatY1,
                           'short',
                           labelY1,
                           null,
                           min,
                           1,
                           '$datasource')
        .addTargets(
          [u.addTargetSchema(expr, 1, 'time_series', legendFormat1)]
        ) + { gridPos: { x: x, y: y, w: w, h: h } };
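      // OsdOverviewStyle and OsdOverviewGraphPanel are thin wrappers over the
      // utils.libsonnet builders: the former defines a table column style
      // (red/orange/green threshold colours, date format, column pattern, type
      // and unit), the latter a single-query graph panel whose position on the
      // dashboard grid comes from the { x, y, w, h } object merged in as
      // gridPos. The parameter meanings are inferred from the call sites
      // below; utils.libsonnet holds the authoritative signatures.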
      local OsdOverviewPieChartPanel(alias, description, title) =
        u.addPieChartSchema(alias,
                            '$datasource',
                            description,
                            'Under graph',
                            'pie',
                            title,
                            'current');
      local OsdOverviewSingleStatPanel(colors,
                                       format,
                                       title,
                                       description,
                                       valueName,
                                       colorValue,
                                       gaugeMaxValue,
                                       gaugeShow,
                                       sparkLineShow,
                                       thresholds,
                                       expr,
                                       targetFormat,
                                       x,
                                       y,
                                       w,
                                       h) =
        u.addSingleStatSchema(
          colors,
          '$datasource',
          format,
          title,
          description,
          valueName,
          colorValue,
          gaugeMaxValue,
          gaugeShow,
          sparkLineShow,
          thresholds
        )
        .addTarget(
          u.addTargetSchema(expr, 1, targetFormat, '')
        ) + { gridPos: { x: x, y: y, w: w, h: h } };

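      // The dashboard shell: uid 'lo02I1Aiz', default range now-1h, 10s
      // refresh (the 16 is assumed to be the Grafana schema version, going by
      // the utils.libsonnet argument order). The .addRequired() calls record
      // the plugin dependencies (Grafana, grafana-piechart-panel, graph and
      // table panels), and the 'datasource' template variable provides the
      // Prometheus data source referenced everywhere else as '$datasource'.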
      u.dashboardSchema(
        'OSD Overview',
        '',
        'lo02I1Aiz',
        'now-1h',
        '10s',
        16,
        [],
        '',
        {
          refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
          time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
        }
      )
      .addAnnotation(
        u.addAnnotationSchema(
          1,
          '-- Grafana --',
          true,
          true,
          'rgba(0, 211, 255, 1)',
          'Annotations & Alerts',
          'dashboard'
        )
      )
      .addRequired(
        type='grafana', id='grafana', name='Grafana', version='5.0.0'
      )
      .addRequired(
        type='panel', id='grafana-piechart-panel', name='Pie Chart', version='1.3.3'
      )
      .addRequired(
        type='panel', id='graph', name='Graph', version='5.0.0'
      )
      .addRequired(
        type='panel', id='table', name='Table', version='5.0.0'
      )
      .addTemplate(
        g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
      )
      .addPanels([
        OsdOverviewGraphPanel(
          { '@95%ile': '#e0752d' },
          'OSD Read Latencies',
          '',
          'ms',
          null,
          '0',
          'avg (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)',
          'AVG read',
          0,
          0,
          8,
          8
        )
        .addTargets(
          [
            u.addTargetSchema(
              'max (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)',
              1,
              'time_series',
              'MAX read'
            ),
            u.addTargetSchema(
              'quantile(0.95,\n (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)\n)', 1, 'time_series', '@95%ile'
            ),
          ],
        ),
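        // Latency panels divide the rate of *_latency_sum by the rate of the
        // matching *_latency_count, giving the mean per-operation latency over
        // the 1m window in seconds; '* 1000' converts that to ms to match the
        // panel unit, and avg/max/quantile(0.95) aggregate the per-OSD ratios
        // across the cluster.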
        u.addTableSchema(
          '$datasource',
          'This table shows the OSDs that are delivering the 10 highest read latencies within the cluster',
          { col: 2, desc: true },
          [
            OsdOverviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'),
            OsdOverviewStyle('Latency (ms)', 'Value', 'number', 'none'),
            OsdOverviewStyle('', '/.*/', 'hidden', 'short'),
          ],
          'Highest READ Latencies',
          'table'
        )
        .addTarget(
          u.addTargetSchema(
            'topk(10,\n (sort(\n (irate(ceph_osd_op_r_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m]) * 1000)\n ))\n)\n\n', 1, 'table', ''
          )
        ) + { gridPos: { x: 8, y: 0, w: 4, h: 8 } },
        OsdOverviewGraphPanel(
          {
            '@95%ile write': '#e0752d',
          },
          'OSD Write Latencies',
          '',
          'ms',
          null,
          '0',
          'avg (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)',
          'AVG write',
          12,
          0,
          8,
          8
        )
        .addTargets(
          [
            u.addTargetSchema(
              'max (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)',
              1,
              'time_series',
              'MAX write'
            ),
            u.addTargetSchema(
              'quantile(0.95,\n (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)\n)', 1, 'time_series', '@95%ile write'
            ),
          ],
        ),
        u.addTableSchema(
          '$datasource',
          'This table shows the OSDs that are delivering the 10 highest write latencies within the cluster',
          { col: 2, desc: true },
          [
            OsdOverviewStyle(
              'OSD ID', 'ceph_daemon', 'string', 'short'
            ),
            OsdOverviewStyle('Latency (ms)', 'Value', 'number', 'none'),
            OsdOverviewStyle('', '/.*/', 'hidden', 'short'),
          ],
          'Highest WRITE Latencies',
          'table'
        )
        .addTarget(
          u.addTargetSchema(
            'topk(10,\n (sort(\n (irate(ceph_osd_op_w_latency_sum[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m]) * 1000)\n ))\n)\n\n',
            1,
            'table',
            ''
          )
        ) + { gridPos: { x: 20, y: 0, w: 4, h: 8 } },
        OsdOverviewPieChartPanel(
          {}, '', 'OSD Types Summary'
        )
        .addTarget(
          u.addTargetSchema('count by (device_class) (ceph_osd_metadata)', 1, 'time_series', '{{device_class}}')
        ) + { gridPos: { x: 0, y: 8, w: 4, h: 8 } },
        OsdOverviewPieChartPanel(
          { 'Non-Encrypted': '#E5AC0E' }, '', 'OSD Objectstore Types'
        )
        .addTarget(
          u.addTargetSchema(
            'count(ceph_bluefs_wal_total_bytes)', 1, 'time_series', 'bluestore'
          )
        )
        .addTarget(
          u.addTargetSchema(
            'absent(ceph_bluefs_wal_total_bytes)*count(ceph_osd_metadata)', 1, 'time_series', 'filestore'
          )
        ) + { gridPos: { x: 4, y: 8, w: 4, h: 8 } },
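        // Objectstore mix: ceph_bluefs_wal_total_bytes is only exported by
        // BlueStore OSDs, so count() of that series gives the BlueStore OSD
        // count, while absent(...) * count(ceph_osd_metadata) reports the full
        // OSD count as 'filestore' only when no BlueStore metric exists at all.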
        OsdOverviewPieChartPanel(
          {}, 'The pie chart shows the various OSD sizes used within the cluster', 'OSD Size Summary'
        )
        .addTarget(u.addTargetSchema(
          'count(ceph_osd_stat_bytes < 1099511627776)', 1, 'time_series', '<1TB'
        ))
        .addTarget(u.addTargetSchema(
          'count(ceph_osd_stat_bytes >= 1099511627776 < 2199023255552)', 1, 'time_series', '<2TB'
        ))
        .addTarget(u.addTargetSchema(
          'count(ceph_osd_stat_bytes >= 2199023255552 < 3298534883328)', 1, 'time_series', '<3TB'
        ))
        .addTarget(u.addTargetSchema(
          'count(ceph_osd_stat_bytes >= 3298534883328 < 4398046511104)', 1, 'time_series', '<4TB'
        ))
        .addTarget(u.addTargetSchema(
          'count(ceph_osd_stat_bytes >= 4398046511104 < 6597069766656)', 1, 'time_series', '<6TB'
        ))
        .addTarget(u.addTargetSchema(
          'count(ceph_osd_stat_bytes >= 6597069766656 < 8796093022208)', 1, 'time_series', '<8TB'
        ))
        .addTarget(u.addTargetSchema(
          'count(ceph_osd_stat_bytes >= 8796093022208 < 10995116277760)', 1, 'time_series', '<10TB'
        ))
        .addTarget(u.addTargetSchema(
          'count(ceph_osd_stat_bytes >= 10995116277760 < 13194139533312)', 1, 'time_series', '<12TB'
        ))
        .addTarget(u.addTargetSchema(
          'count(ceph_osd_stat_bytes >= 13194139533312)', 1, 'time_series', '<12TB+'
        )) + { gridPos: { x: 8, y: 8, w: 4, h: 8 } },
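        // The size buckets use PromQL's chained filter comparisons:
        // 'metric >= A < B' keeps series with values >= A (values unchanged)
        // and then filters that result to values < B, so each count() covers
        // one capacity band. The thresholds are binary terabytes
        // (1099511627776 = 1 TiB, 2199023255552 = 2 TiB, ..., 13194139533312 = 12 TiB).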
        g.graphPanel.new(bars=true,
                         datasource='$datasource',
                         title='Distribution of PGs per OSD',
                         x_axis_buckets=20,
                         x_axis_mode='histogram',
                         x_axis_values=['total'],
                         formatY1='short',
                         formatY2='short',
                         labelY1='# of OSDs',
                         min='0',
                         nullPointMode='null')
        .addTarget(u.addTargetSchema(
          'ceph_osd_numpg\n', 1, 'time_series', 'PGs per OSD'
        )) + { gridPos: { x: 12, y: 8, w: 8, h: 8 } },
        OsdOverviewSingleStatPanel(
          ['#d44a3a', '#299c46'],
          'percentunit',
          'OSD onode Hits Ratio',
          'This gauge panel shows the onode hit ratio, to help determine if increasing RAM per OSD could help improve the performance of the cluster',
          'current',
          true,
          1,
          true,
          false,
          '.75',
          'sum(ceph_bluestore_onode_hits)/(sum(ceph_bluestore_onode_hits) + sum(ceph_bluestore_onode_misses))',
          'time_series',
          20,
          8,
          4,
          8
        ),
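        // Onode hit ratio = hits / (hits + misses) from the BlueStore onode
        // cache counters, shown as a 0-1 gauge ('percentunit'); the '.75'
        // threshold is presumably where the gauge colour switches from red
        // (#d44a3a) to green (#299c46), i.e. once ~75% of onode lookups are
        // served from cache.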
        u.addRowSchema(false,
                       true,
                       'R/W Profile') + { gridPos: { x: 0, y: 16, w: 24, h: 1 } },
        OsdOverviewGraphPanel(
          {},
          'Read/Write Profile',
          'Shows the read/write workload profile over time',
          'short',
          null,
          null,
          'round(sum(irate(ceph_pool_rd[30s])))',
          'Reads',
          0,
          17,
          24,
          8
        )
        .addTargets([u.addTargetSchema(
          'round(sum(irate(ceph_pool_wr[30s])))', 1, 'time_series', 'Writes'
        )]),
      ]),
    'osd-device-details.json':
      local OsdDeviceDetailsPanel(title,
                                  description,
                                  formatY1,
                                  labelY1,
                                  expr1,
                                  expr2,
                                  legendFormat1,
                                  legendFormat2,
                                  x,
                                  y,
                                  w,
                                  h) =
        u.graphPanelSchema({},
                           title,
                           description,
                           'null',
                           false,
                           formatY1,
                           'short',
                           labelY1,
                           null,
                           null,
                           1,
                           '$datasource')
        .addTargets(
          [
            u.addTargetSchema(expr1,
                              1,
                              'time_series',
                              legendFormat1),
            u.addTargetSchema(expr2, 1, 'time_series', legendFormat2),
          ]
        ) + { gridPos: { x: x, y: y, w: w, h: h } };

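      // OsdDeviceDetailsPanel (defined above) pairs a read and a write query
      // in a single graph panel; the panels built with it below add a
      // negative-Y series override on the read series, which produces the
      // "Read (-) / Write (+)" layout mirrored around the zero line.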
      u.dashboardSchema(
        'OSD device details',
        '',
        'CrAHE0iZz',
        'now-3h',
        '',
        16,
        [],
        '',
        {
          refresh_intervals: ['5s', '10s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
          time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
        }
      )
      .addAnnotation(
        u.addAnnotationSchema(
          1,
          '-- Grafana --',
          true,
          true,
          'rgba(0, 211, 255, 1)',
          'Annotations & Alerts',
          'dashboard'
        )
      )
      .addRequired(
        type='grafana', id='grafana', name='Grafana', version='5.3.2'
      )
      .addRequired(
        type='panel', id='graph', name='Graph', version='5.0.0'
      )
      .addTemplate(
        g.template.datasource('datasource',
                              'prometheus',
                              'default',
                              label='Data Source')
      )
      .addTemplate(
        u.addTemplateSchema('osd',
                            '$datasource',
                            'label_values(ceph_osd_metadata,ceph_daemon)',
                            1,
                            false,
                            1,
                            'OSD',
                            '(.*)')
      )
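      // The 'osd' template variable is filled from
      // label_values(ceph_osd_metadata,ceph_daemon), i.e. one entry per OSD
      // daemon (e.g. "osd.0"); every panel below filters on
      // ceph_daemon=~"$osd", so the whole dashboard follows the selection.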
      .addPanels([
        u.addRowSchema(
          false, true, 'OSD Performance'
        ) + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
        OsdDeviceDetailsPanel(
          '$osd Latency',
          '',
          's',
          'Read (-) / Write (+)',
          'irate(ceph_osd_op_r_latency_sum{ceph_daemon=~"$osd"}[1m]) / on (ceph_daemon) irate(ceph_osd_op_r_latency_count[1m])',
          'irate(ceph_osd_op_w_latency_sum{ceph_daemon=~"$osd"}[1m]) / on (ceph_daemon) irate(ceph_osd_op_w_latency_count[1m])',
          'read',
          'write',
          0,
          1,
          6,
          9
        )
        .addSeriesOverride(
          {
            alias: 'read',
            transform: 'negative-Y',
          }
        ),
        OsdDeviceDetailsPanel(
          '$osd R/W IOPS',
          '',
          'short',
          'Read (-) / Write (+)',
          'irate(ceph_osd_op_r{ceph_daemon=~"$osd"}[1m])',
          'irate(ceph_osd_op_w{ceph_daemon=~"$osd"}[1m])',
          'Reads',
          'Writes',
          6,
          1,
          6,
          9
        )
        .addSeriesOverride(
          { alias: 'Reads', transform: 'negative-Y' }
        ),
        OsdDeviceDetailsPanel(
          '$osd R/W Bytes',
          '',
          'bytes',
          'Read (-) / Write (+)',
          'irate(ceph_osd_op_r_out_bytes{ceph_daemon=~"$osd"}[1m])',
          'irate(ceph_osd_op_w_in_bytes{ceph_daemon=~"$osd"}[1m])',
          'Read Bytes',
          'Write Bytes',
          12,
          1,
          6,
          9
        )
        .addSeriesOverride({ alias: 'Read Bytes', transform: 'negative-Y' }),
        u.addRowSchema(
          false, true, 'Physical Device Performance'
        ) + { gridPos: { x: 0, y: 10, w: 24, h: 1 } },
        OsdDeviceDetailsPanel(
          'Physical Device Latency for $osd',
          '',
          's',
          'Read (-) / Write (+)',
          '(label_replace(irate(node_disk_read_time_seconds_total[1m]) / irate(node_disk_reads_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*"))',
          '(label_replace(irate(node_disk_write_time_seconds_total[1m]) / irate(node_disk_writes_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*"))',
          '{{instance}}/{{device}} Reads',
          '{{instance}}/{{device}} Writes',
          0,
          11,
          6,
          9
        )
        .addSeriesOverride(
          { alias: '/.*Reads/', transform: 'negative-Y' }
        ),
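        // The physical-device queries join node_exporter disk metrics with
        // Ceph's ceph_disk_occupation_human series: label_replace() trims the
        // instance label to the bare host name ("([^:.]*).*") and strips the
        // "/dev/" prefix from the device label, then 'and on (instance,
        // device)' keeps only the node_disk_* series whose device backs the
        // selected $osd.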
        OsdDeviceDetailsPanel(
          'Physical Device R/W IOPS for $osd',
          '',
          'short',
          'Read (-) / Write (+)',
          'label_replace(irate(node_disk_writes_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
          'label_replace(irate(node_disk_reads_completed_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
          '{{device}} on {{instance}} Writes',
          '{{device}} on {{instance}} Reads',
          6,
          11,
          6,
          9
        )
        .addSeriesOverride(
          { alias: '/.*Reads/', transform: 'negative-Y' }
        ),
        OsdDeviceDetailsPanel(
          'Physical Device R/W Bytes for $osd',
          '',
          'Bps',
          'Read (-) / Write (+)',
          'label_replace(irate(node_disk_read_bytes_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
          'label_replace(irate(node_disk_written_bytes_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
          '{{instance}} {{device}} Reads',
          '{{instance}} {{device}} Writes',
          12,
          11,
          6,
          9
        )
        .addSeriesOverride(
          { alias: '/.*Reads/', transform: 'negative-Y' }
        ),
        u.graphPanelSchema(
          {},
          'Physical Device Util% for $osd',
          '',
          'null',
          false,
          'percentunit',
          'short',
          null,
          null,
          null,
          1,
          '$datasource'
        )
        .addTarget(u.addTargetSchema(
          'label_replace(irate(node_disk_io_time_seconds_total[1m]), "instance", "$1", "instance", "([^:.]*).*") and on (instance, device) label_replace(label_replace(ceph_disk_occupation_human{ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"), "instance", "$1", "instance", "([^:.]*).*")',
          1,
          'time_series',
          '{{device}} on {{instance}}'
        )) + { gridPos: { x: 18, y: 11, w: 6, h: 9 } },
      ]),
  },
}