// ceph/monitoring/ceph-mixin/dashboards/osd.libsonnet
local g = import 'grafonnet/grafana.libsonnet';

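// This mixin emits two dashboards: 'osds-overview.json' (cluster-wide OSD
// latency, type and utilisation overview) and 'osd-device-details.json'
// (per-OSD and per-physical-device drill-down). Helpers such as
// $.dashboardSchema, $.addTargetSchema and $.matchers() come from
// utils.libsonnet; $.matchers() supplies the cluster/job label matchers
// substituted into every PromQL expression below.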
(import 'utils.libsonnet') {
  'osds-overview.json':
    $.dashboardSchema(
      'OSD Overview',
      '',
      'lo02I1Aiz',
      'now-1h',
      '30s',
      16,
      $._config.dashboardTags,
      ''
    )
    .addAnnotation(
      $.addAnnotationSchema(
        1,
        '-- Grafana --',
        true,
        true,
        'rgba(0, 211, 255, 1)',
        'Annotations & Alerts',
        'dashboard'
      )
    )
    .addRequired(
      type='grafana', id='grafana', name='Grafana', version='5.0.0'
    )
    .addRequired(
      type='panel', id='grafana-piechart-panel', name='Pie Chart', version='1.3.3'
    )
    .addRequired(
      type='panel', id='graph', name='Graph', version='5.0.0'
    )
    .addRequired(
      type='panel', id='table', name='Table', version='5.0.0'
    )
    .addTemplate(
      g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
    )
    .addTemplate(
      $.addClusterTemplate()
    )
    .addTemplate(
      $.addJobTemplate()
    )
    .addPanels([
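      // OSD read latency graph: average, maximum and 95th-percentile per-OSD
      // read op latency in milliseconds, computed as
      // rate(latency_sum) / rate(latency_count) * 1000.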
      $.simpleGraphPanel(
        { '@95%ile': '#e0752d' },
        'OSD Read Latencies',
        '',
        'ms',
        null,
        '0',
        |||
          avg (
            rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
              on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * 1000
          )
        ||| % $.matchers(),
        'AVG read',
        0,
        0,
        8,
        8
      )
      .addTargets(
        [
          $.addTargetSchema(
            |||
              max(
                rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
                  on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) * 1000
              )
            ||| % $.matchers(),
            'MAX read'
          ),
          $.addTargetSchema(
            |||
              quantile(0.95,
                (
                  rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
                    on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval])
                    * 1000
                )
              )
            ||| % $.matchers(),
            '@95%ile'
          ),
        ],
      ),
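      // Highest READ latencies table: topk(10) of the same per-OSD read
      // latency expression, sorted descending.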
      $.addTableSchema(
        '$datasource',
        'This table shows the 10 OSDs with the highest read latencies within the cluster',
        { col: 2, desc: true },
        [
          $.overviewStyle('OSD ID', 'ceph_daemon', 'string', 'short'),
          $.overviewStyle('Latency (ms)', 'Value', 'number', 'none'),
          $.overviewStyle('', '/.*/', 'hidden', 'short'),
        ],
        'Highest READ Latencies',
        'table'
      )
      .addTarget(
        $.addTargetSchema(
          |||
            topk(10,
              (sort(
                (
                  rate(ceph_osd_op_r_latency_sum{%(matchers)s}[$__rate_interval]) /
                    on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval]) *
                    1000
                )
              ))
            )
          ||| % $.matchers(),
          '',
          'table',
          1,
          true
        )
      ) + { gridPos: { x: 8, y: 0, w: 4, h: 8 } },
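      // OSD write latency graph and table: mirror of the read-latency panels
      // above, using the ceph_osd_op_w_latency_* counters.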
      $.simpleGraphPanel(
        {
          '@95%ile write': '#e0752d',
        },
        'OSD Write Latencies',
        '',
        'ms',
        null,
        '0',
        |||
          avg(
            rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
              on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval])
              * 1000
          )
        ||| % $.matchers(),
        'AVG write',
        12,
        0,
        8,
        8
      )
      .addTargets(
        [
          $.addTargetSchema(
            |||
              max(
                rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
                  on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) *
                  1000
              )
            ||| % $.matchers(), 'MAX write'
          ),
          $.addTargetSchema(
            |||
              quantile(0.95, (
                rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
                  on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) *
                  1000
              ))
            ||| % $.matchers(), '@95%ile write'
          ),
        ],
      ),
      $.addTableSchema(
        '$datasource',
        'This table shows the 10 OSDs with the highest write latencies within the cluster',
        { col: 2, desc: true },
        [
          $.overviewStyle(
            'OSD ID', 'ceph_daemon', 'string', 'short'
          ),
          $.overviewStyle('Latency (ms)', 'Value', 'number', 'none'),
          $.overviewStyle('', '/.*/', 'hidden', 'short'),
        ],
        'Highest WRITE Latencies',
        'table'
      )
      .addTarget(
        $.addTargetSchema(
          |||
            topk(10,
              (sort(
                (rate(ceph_osd_op_w_latency_sum{%(matchers)s}[$__rate_interval]) /
                  on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval]) *
                  1000)
              ))
            )
          ||| % $.matchers(),
          '',
          'table',
          1,
          true
        )
      ) + { gridPos: { x: 20, y: 0, w: 4, h: 8 } },
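      // OSD device-class breakdown (e.g. hdd/ssd/nvme) from ceph_osd_metadata.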
      $.simplePieChart(
        {}, '', 'OSD Types Summary'
      )
      .addTarget(
        $.addTargetSchema('count by (device_class) (ceph_osd_metadata{%(matchers)s})' % $.matchers(), '{{device_class}}')
      ) + { gridPos: { x: 0, y: 8, w: 4, h: 8 } },
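      // Objectstore breakdown: OSDs exporting ceph_bluefs_wal_total_bytes are
      // counted as bluestore; if that metric is absent entirely, every OSD is
      // counted as filestore instead.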
      $.simplePieChart(
        { 'Non-Encrypted': '#E5AC0E' }, '', 'OSD Objectstore Types'
      )
      .addTarget(
        $.addTargetSchema(
          'count(ceph_bluefs_wal_total_bytes{%(matchers)s})' % $.matchers(), 'bluestore', 'time_series', 2
        )
      )
      .addTarget(
        $.addTargetSchema(
          'absent(ceph_bluefs_wal_total_bytes{%(matchers)s}) * count(ceph_osd_metadata{%(matchers)s})' % $.matchers(), 'filestore', 'time_series', 2
        )
      ) + { gridPos: { x: 4, y: 8, w: 4, h: 8 } },
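      // OSD size buckets from ceph_osd_stat_bytes; the thresholds are TiB
      // multiples in bytes (1099511627776 = 1 TiB), and the chained PromQL
      // comparisons keep only OSDs whose capacity falls inside each range.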
      $.simplePieChart(
        {}, 'The pie chart shows the various OSD sizes used within the cluster', 'OSD Size Summary'
      )
      .addTarget($.addTargetSchema(
        'count(ceph_osd_stat_bytes{%(matchers)s} < 1099511627776)' % $.matchers(), '<1TB', 'time_series', 2
      ))
      .addTarget($.addTargetSchema(
        'count(ceph_osd_stat_bytes{%(matchers)s} >= 1099511627776 < 2199023255552)' % $.matchers(), '<2TB', 'time_series', 2
      ))
      .addTarget($.addTargetSchema(
        'count(ceph_osd_stat_bytes{%(matchers)s} >= 2199023255552 < 3298534883328)' % $.matchers(), '<3TB', 'time_series', 2
      ))
      .addTarget($.addTargetSchema(
        'count(ceph_osd_stat_bytes{%(matchers)s} >= 3298534883328 < 4398046511104)' % $.matchers(), '<4TB', 'time_series', 2
      ))
      .addTarget($.addTargetSchema(
        'count(ceph_osd_stat_bytes{%(matchers)s} >= 4398046511104 < 6597069766656)' % $.matchers(), '<6TB', 'time_series', 2
      ))
      .addTarget($.addTargetSchema(
        'count(ceph_osd_stat_bytes{%(matchers)s} >= 6597069766656 < 8796093022208)' % $.matchers(), '<8TB', 'time_series', 2
      ))
      .addTarget($.addTargetSchema(
        'count(ceph_osd_stat_bytes{%(matchers)s} >= 8796093022208 < 10995116277760)' % $.matchers(), '<10TB', 'time_series', 2
      ))
      .addTarget($.addTargetSchema(
        'count(ceph_osd_stat_bytes{%(matchers)s} >= 10995116277760 < 13194139533312)' % $.matchers(), '<12TB', 'time_series', 2
      ))
      .addTarget($.addTargetSchema(
        'count(ceph_osd_stat_bytes{%(matchers)s} >= 13194139533312)' % $.matchers(), '12TB+', 'time_series', 2
      )) + { gridPos: { x: 8, y: 8, w: 4, h: 8 } },
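      // Histogram of placement-group counts per OSD (ceph_osd_numpg); a wide
      // spread here usually points at an unbalanced PG distribution.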
      g.graphPanel.new(bars=true,
                       datasource='$datasource',
                       title='Distribution of PGs per OSD',
                       x_axis_buckets=20,
                       x_axis_mode='histogram',
                       x_axis_values=['total'],
                       formatY1='short',
                       formatY2='short',
                       labelY1='# of OSDs',
                       min='0',
                       nullPointMode='null')
      .addTarget($.addTargetSchema(
        'ceph_osd_numpg{%(matchers)s}' % $.matchers(), 'PGs per OSD', 'time_series', 1, true
      )) + { gridPos: { x: 12, y: 8, w: 8, h: 8 } },
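      // BlueStore onode cache hit ratio across all OSDs; values below the 0.75
      // gauge threshold suggest more RAM per OSD may help.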
      $.gaugeSingleStatPanel(
        'percentunit',
        'OSD onode Hits Ratio',
        'This gauge panel shows the onode hit ratio, to help determine whether increasing RAM per OSD could improve the performance of the cluster',
        'current',
        true,
        1,
        true,
        false,
        '.75',
        |||
          sum(ceph_bluestore_onode_hits{%(matchers)s}) / (
            sum(ceph_bluestore_onode_hits{%(matchers)s}) +
            sum(ceph_bluestore_onode_misses{%(matchers)s})
          )
        ||| % $.matchers(),
        'time_series',
        20,
        8,
        4,
        8
      ),
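      // R/W Profile row: cluster-wide read and write IOPS derived from the
      // per-pool ceph_pool_rd / ceph_pool_wr counters.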
      $.addRowSchema(false,
                     true,
                     'R/W Profile') + { gridPos: { x: 0, y: 16, w: 24, h: 1 } },
      $.simpleGraphPanel(
        {},
        'Read/Write Profile',
        'Shows the read/write workload profile over time',
        'short',
        null,
        null,
        'round(sum(rate(ceph_pool_rd{%(matchers)s}[$__rate_interval])))' % $.matchers(),
        'Reads',
        0,
        17,
        24,
        8
      )
      .addTargets([$.addTargetSchema(
        'round(sum(rate(ceph_pool_wr{%(matchers)s}[$__rate_interval])))' % $.matchers(), 'Writes'
      )]),
    ]),
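  // Per-OSD drill-down dashboard: latency, IOPS and throughput for a selected
  // OSD ($osd template variable), plus the physical device(s) backing it.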
  'osd-device-details.json':
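    // Helper for the repeated two-series graph panels below: a graph panel
    // with two Prometheus targets and an explicit gridPos.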
    local OsdDeviceDetailsPanel(title,
                                description,
                                formatY1,
                                labelY1,
                                expr1,
                                expr2,
                                legendFormat1,
                                legendFormat2,
                                x,
                                y,
                                w,
                                h) =
      $.graphPanelSchema({},
                         title,
                         description,
                         'null',
                         false,
                         formatY1,
                         'short',
                         labelY1,
                         null,
                         null,
                         1,
                         '$datasource')
      .addTargets(
        [
          $.addTargetSchema(expr1,
                            legendFormat1),
          $.addTargetSchema(expr2, legendFormat2),
        ]
      ) + { gridPos: { x: x, y: y, w: w, h: h } };

    $.dashboardSchema(
      'OSD device details',
      '',
      'CrAHE0iZz',
      'now-3h',
      '30s',
      16,
      $._config.dashboardTags,
      ''
    )
    .addAnnotation(
      $.addAnnotationSchema(
        1,
        '-- Grafana --',
        true,
        true,
        'rgba(0, 211, 255, 1)',
        'Annotations & Alerts',
        'dashboard'
      )
    )
    .addRequired(
      type='grafana', id='grafana', name='Grafana', version='5.3.2'
    )
    .addRequired(
      type='panel', id='graph', name='Graph', version='5.0.0'
    )
    .addTemplate(
      g.template.datasource('datasource',
                            'prometheus',
                            'default',
                            label='Data Source')
    )
    .addTemplate(
      $.addClusterTemplate()
    )
    .addTemplate(
      $.addJobTemplate()
    )
    .addTemplate(
      $.addTemplateSchema('osd',
                          '$datasource',
                          'label_values(ceph_osd_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
                          1,
                          false,
                          1,
                          'OSD',
                          '(.*)')
    )
    .addPanels([
      $.addRowSchema(
        false, true, 'OSD Performance'
      ) + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
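      // Per-OSD op latency, IOPS and bytes for the selected $osd. Reads are
      // drawn below the axis via the negative-Y series override.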
      OsdDeviceDetailsPanel(
        '$osd Latency',
        '',
        's',
        'Read (-) / Write (+)',
        |||
          rate(ceph_osd_op_r_latency_sum{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval]) /
            on (ceph_daemon) rate(ceph_osd_op_r_latency_count{%(matchers)s}[$__rate_interval])
        ||| % $.matchers(),
        |||
          rate(ceph_osd_op_w_latency_sum{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval]) /
            on (ceph_daemon) rate(ceph_osd_op_w_latency_count{%(matchers)s}[$__rate_interval])
        ||| % $.matchers(),
        'read',
        'write',
        0,
        1,
        6,
        9
      )
      .addSeriesOverride(
        {
          alias: 'read',
          transform: 'negative-Y',
        }
      ),
      OsdDeviceDetailsPanel(
        '$osd R/W IOPS',
        '',
        'short',
        'Read (-) / Write (+)',
        'rate(ceph_osd_op_r{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % $.matchers(),
        'rate(ceph_osd_op_w{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % $.matchers(),
        'Reads',
        'Writes',
        6,
        1,
        6,
        9
      )
      .addSeriesOverride(
        { alias: 'Reads', transform: 'negative-Y' }
      ),
      OsdDeviceDetailsPanel(
        '$osd R/W Bytes',
        '',
        'bytes',
        'Read (-) / Write (+)',
        'rate(ceph_osd_op_r_out_bytes{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % $.matchers(),
        'rate(ceph_osd_op_w_in_bytes{%(matchers)s, ceph_daemon=~"$osd"}[$__rate_interval])' % $.matchers(),
        'Read Bytes',
        'Write Bytes',
        12,
        1,
        6,
        9
      )
      .addSeriesOverride({ alias: 'Read Bytes', transform: 'negative-Y' }),
      $.addRowSchema(
        false, true, 'Physical Device Performance'
      ) + { gridPos: { x: 0, y: 10, w: 24, h: 1 } },
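      // The physical-device panels join node_exporter disk metrics with
      // ceph_disk_occupation_human: label_replace() normalises the instance
      // (strips the port/domain) and device (strips /dev/) labels so that
      // 'and on (instance, device)' keeps only the disks backing $osd.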
      OsdDeviceDetailsPanel(
        'Physical Device Latency for $osd',
        '',
        's',
        'Read (-) / Write (+)',
        |||
          (
            label_replace(
              rate(node_disk_read_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]) /
                rate(node_disk_reads_completed_total{%(clusterMatcher)s}[$__rate_interval]),
              "instance", "$1", "instance", "([^:.]*).*"
            ) and on (instance, device) label_replace(
              label_replace(
                ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
                "device", "$1", "device", "/dev/(.*)"
              ), "instance", "$1", "instance", "([^:.]*).*"
            )
          )
        ||| % $.matchers(),
        |||
          (
            label_replace(
              rate(node_disk_write_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]) /
                rate(node_disk_writes_completed_total{%(clusterMatcher)s}[$__rate_interval]),
              "instance", "$1", "instance", "([^:.]*).*") and on (instance, device)
            label_replace(
              label_replace(
                ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"
              ), "instance", "$1", "instance", "([^:.]*).*"
            )
          )
        ||| % $.matchers(),
        '{{instance}}/{{device}} Reads',
        '{{instance}}/{{device}} Writes',
        0,
        11,
        6,
        9
      )
      .addSeriesOverride(
        { alias: '/.*Reads/', transform: 'negative-Y' }
      ),
      OsdDeviceDetailsPanel(
        'Physical Device R/W IOPS for $osd',
        '',
        'short',
        'Read (-) / Write (+)',
        |||
          label_replace(
            rate(node_disk_writes_completed_total{%(clusterMatcher)s}[$__rate_interval]),
            "instance", "$1", "instance", "([^:.]*).*"
          ) and on (instance, device) label_replace(
            label_replace(
              ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
              "device", "$1", "device", "/dev/(.*)"
            ), "instance", "$1", "instance", "([^:.]*).*"
          )
        ||| % $.matchers(),
        |||
          label_replace(
            rate(node_disk_reads_completed_total{%(clusterMatcher)s}[$__rate_interval]),
            "instance", "$1", "instance", "([^:.]*).*"
          ) and on (instance, device) label_replace(
            label_replace(
              ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
              "device", "$1", "device", "/dev/(.*)"
            ), "instance", "$1", "instance", "([^:.]*).*"
          )
        ||| % $.matchers(),
        '{{device}} on {{instance}} Writes',
        '{{device}} on {{instance}} Reads',
        6,
        11,
        6,
        9
      )
      .addSeriesOverride(
        { alias: '/.*Reads/', transform: 'negative-Y' }
      ),
      OsdDeviceDetailsPanel(
        'Physical Device R/W Bytes for $osd',
        '',
        'Bps',
        'Read (-) / Write (+)',
        |||
          label_replace(
            rate(node_disk_read_bytes_total{%(clusterMatcher)s}[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*"
          ) and on (instance, device) label_replace(
            label_replace(
              ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
              "device", "$1", "device", "/dev/(.*)"
            ), "instance", "$1", "instance", "([^:.]*).*"
          )
        ||| % $.matchers(),
        |||
          label_replace(
            rate(node_disk_written_bytes_total{%(clusterMatcher)s}[$__rate_interval]), "instance", "$1", "instance", "([^:.]*).*"
          ) and on (instance, device) label_replace(
            label_replace(
              ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"},
              "device", "$1", "device", "/dev/(.*)"
            ), "instance", "$1", "instance", "([^:.]*).*"
          )
        ||| % $.matchers(),
        '{{instance}} {{device}} Reads',
        '{{instance}} {{device}} Writes',
        12,
        11,
        6,
        9
      )
      .addSeriesOverride(
        { alias: '/.*Reads/', transform: 'negative-Y' }
      ),
      $.graphPanelSchema(
        {},
        'Physical Device Util% for $osd',
        '',
        'null',
        false,
        'percentunit',
        'short',
        null,
        null,
        null,
        1,
        '$datasource'
      )
      .addTarget($.addTargetSchema(
        |||
          label_replace(
            rate(node_disk_io_time_seconds_total{%(clusterMatcher)s}[$__rate_interval]),
            "instance", "$1", "instance", "([^:.]*).*"
          ) and on (instance, device) label_replace(
            label_replace(
              ceph_disk_occupation_human{%(matchers)s, ceph_daemon=~"$osd"}, "device", "$1", "device", "/dev/(.*)"
            ), "instance", "$1", "instance", "([^:.]*).*"
          )
        ||| % $.matchers(),
        '{{device}} on {{instance}}'
      )) + { gridPos: { x: 18, y: 11, w: 6, h: 9 } },
    ]),
}