27 "datasource": "-- Grafana --",
30 "iconColor": "rgba(0, 211, 255, 1)",
31 "name": "Annotations & Alerts",
42 "hideControls": false,
48 "colorBackground": false,
52 "rgba(237, 129, 40, 0.89)",
55 "datasource": "$datasource",
61 "thresholdLabels": false,
62 "thresholdMarkers": true
76 "name": "value to text",
80 "name": "range to text",
85 "nullPointMode": "connected",
88 "postfixFontSize": "50%",
90 "prefixFontSize": "50%",
99 "fillColor": "rgba(31, 118, 189, 0.18)",
101 "lineColor": "rgb(31, 120, 193)",
107 "expr": "count(sum by (hostname) (ceph_osd_metadata))",
108 "format": "time_series",
115 "title": "OSD Hosts",
116 "type": "singlestat",
117 "valueFontSize": "80%",
125 "valueName": "current"
128 "cacheTimeout": null,
129 "colorBackground": false,
133 "rgba(237, 129, 40, 0.89)",
136 "datasource": "$datasource",
137 "description": "Average CPU busy across all hosts (OSD, RGW, MON etc) within the cluster",
138 "format": "percentunit",
143 "thresholdLabels": false,
144 "thresholdMarkers": true
158 "name": "value to text",
162 "name": "range to text",
166 "maxDataPoints": 100,
167 "nullPointMode": "connected",
170 "postfixFontSize": "50%",
172 "prefixFontSize": "50%",
181 "fillColor": "rgba(31, 118, 189, 0.18)",
183 "lineColor": "rgb(31, 120, 193)",
189 "expr": "avg(\n 1 - (\n avg by(instance) \n (irate(node_cpu_seconds_total{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[1m]) or\n irate(node_cpu{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[1m]))\n )\n )",
190 "format": "time_series",
197 "title": "AVG CPU Busy",
198 "type": "singlestat",
199 "valueFontSize": "80%",
207 "valueName": "current"
210 "cacheTimeout": null,
211 "colorBackground": false,
215 "rgba(237, 129, 40, 0.89)",
218 "datasource": "$datasource",
219 "description": "Average Memory Usage across all hosts in the cluster (excludes buffer/cache usage)",
220 "format": "percentunit",
225 "thresholdLabels": false,
226 "thresholdMarkers": true
240 "name": "value to text",
244 "name": "range to text",
248 "maxDataPoints": 100,
249 "nullPointMode": "connected",
252 "postfixFontSize": "50%",
254 "prefixFontSize": "50%",
263 "fillColor": "rgba(31, 118, 189, 0.18)",
265 "lineColor": "rgb(31, 120, 193)",
271 "expr": "avg (((node_memory_MemTotal{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or node_memory_MemTotal_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"})- (\n (node_memory_MemFree{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or node_memory_MemFree_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}) + \n (node_memory_Cached{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or node_memory_Cached_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}) + \n (node_memory_Buffers{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or node_memory_Buffers_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}) +\n (node_memory_Slab{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or node_memory_Slab_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"})\n )) /\n (node_memory_MemTotal{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"} or node_memory_MemTotal_bytes{instance=~\"($osd_hosts|$rgw_hosts|$mon_hosts|$mds_hosts).*\"} ))",
272 "format": "time_series",
279 "title": "AVG RAM Utilization",
280 "type": "singlestat",
281 "valueFontSize": "80%",
289 "valueName": "current"
292 "cacheTimeout": null,
293 "colorBackground": false,
297 "rgba(237, 129, 40, 0.89)",
300 "datasource": "$datasource",
301 "description": "IOPS Load at the device as reported by the OS on all OSD hosts",
307 "thresholdLabels": false,
308 "thresholdMarkers": true
322 "name": "value to text",
326 "name": "range to text",
330 "maxDataPoints": 100,
331 "nullPointMode": "connected",
334 "postfixFontSize": "50%",
336 "prefixFontSize": "50%",
345 "fillColor": "rgba(31, 118, 189, 0.18)",
347 "lineColor": "rgb(31, 120, 193)",
353 "expr": "sum ((irate(node_disk_reads_completed{instance=~\"($osd_hosts).*\"}[5m]) or irate(node_disk_reads_completed_total{instance=~\"($osd_hosts).*\"}[5m]) ) + \n(irate(node_disk_writes_completed{instance=~\"($osd_hosts).*\"}[5m]) or irate(node_disk_writes_completed_total{instance=~\"($osd_hosts).*\"}[5m])))",
354 "format": "time_series",
361 "title": "Physical IOPS",
362 "type": "singlestat",
363 "valueFontSize": "80%",
371 "valueName": "current"
374 "cacheTimeout": null,
375 "colorBackground": false,
379 "rgba(237, 129, 40, 0.89)",
382 "datasource": "$datasource",
383 "description": "Average Disk utilization for all OSD data devices (i.e. excludes journal/WAL)",
389 "thresholdLabels": false,
390 "thresholdMarkers": true
404 "name": "value to text",
408 "name": "range to text",
412 "maxDataPoints": 100,
413 "nullPointMode": "connected",
416 "postfixFontSize": "50%",
418 "prefixFontSize": "50%",
427 "fillColor": "rgba(31, 118, 189, 0.18)",
429 "lineColor": "rgb(31, 120, 193)",
435 "expr": "avg (\n label_replace((irate(node_disk_io_time_ms[5m]) / 10 ) or\n (irate(node_disk_io_time_seconds_total[5m]) * 100), \"instance\", \"$1\", \"instance\", \"([^.:]*).*\"\n ) *\n on(instance, device, ceph_daemon) label_replace(label_replace(ceph_disk_occupation{instance=~\"($osd_hosts).*\"}, \"device\", \"$1\", \"device\", \"/dev/(.*)\"), \"instance\", \"$1\", \"instance\", \"([^.:]*).*\")\n)",
436 "format": "time_series",
443 "title": "AVG Disk Utilization",
444 "type": "singlestat",
445 "valueFontSize": "80%",
453 "valueName": "current"
456 "cacheTimeout": null,
457 "colorBackground": false,
461 "rgba(237, 129, 40, 0.89)",
464 "datasource": "$datasource",
465 "description": "Total send/receive network load across all hosts in the ceph cluster",
471 "thresholdLabels": false,
472 "thresholdMarkers": true
486 "name": "value to text",
490 "name": "range to text",
494 "maxDataPoints": 100,
495 "nullPointMode": "connected",
498 "postfixFontSize": "50%",
500 "prefixFontSize": "50%",
509 "fillColor": "rgba(31, 118, 189, 0.18)",
511 "lineColor": "rgb(31, 120, 193)",
517 "expr": "sum (\n\t(\n\t\tirate(node_network_receive_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m]) or\n\t\tirate(node_network_receive_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m])\n\t) unless on (device, instance)\n\tlabel_replace((bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\")\n) +\nsum (\n\t(\n\t\tirate(node_network_transmit_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m]) or\n\t\tirate(node_network_transmit_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m])\n\t) unless on (device, instance)\n\tlabel_replace((bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\")\n\t)\n",
518 "format": "time_series",
525 "title": "Network Load",
526 "type": "singlestat",
527 "valueFontSize": "80%",
535 "valueName": "current"
542 "datasource": "$datasource",
543 "description": "Show the top 10 busiest hosts by CPU",
553 "alignAsTable": false,
567 "nullPointMode": "null",
573 "seriesOverrides": [ ],
576 "steppedLine": false,
579 "expr": "topk(10,100 * ( 1 - (\n avg by(instance) \n (irate(node_cpu_seconds_total{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[1m]) or\n irate(node_cpu{mode='idle',instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\"}[1m]))\n )\n )\n)",
580 "format": "time_series",
582 "legendFormat": "{{instance}}",
589 "title": "CPU Busy - Top 10 Hosts",
593 "value_type": "individual"
627 "datasource": "$datasource",
628 "description": "Top 10 hosts by network load",
638 "alignAsTable": false,
652 "nullPointMode": "null",
658 "seriesOverrides": [ ],
661 "steppedLine": false,
664 "expr": "topk(10, (sum by(instance) (\n(\n\tirate(node_network_receive_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m]) or\n\tirate(node_network_receive_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m])\n) +\n(\n\tirate(node_network_transmit_bytes{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m]) or\n\tirate(node_network_transmit_bytes_total{instance=~\"($osd_hosts|$mon_hosts|$mds_hosts|$rgw_hosts).*\",device!=\"lo\"}[1m])\n) unless on (device, instance)\n\tlabel_replace((bonding_slaves > 0), \"device\", \"$1\", \"master\", \"(.+)\"))\n))\n",
665 "format": "time_series",
667 "legendFormat": "{{instance}}",
674 "title": "Network Load - Top 10 Hosts",
678 "value_type": "individual"
721 "label": "Data Source",
722 "name": "datasource",
724 "query": "prometheus",
732 "datasource": "$datasource",
739 "query": "label_values(ceph_disk_occupation, exported_instance)",
741 "regex": "([^.]*).*",
743 "tagValuesQuery": "",
752 "datasource": "$datasource",
759 "query": "label_values(ceph_mon_metadata, ceph_daemon)",
763 "tagValuesQuery": "",
772 "datasource": "$datasource",
779 "query": "label_values(ceph_mds_inodes, ceph_daemon)",
783 "tagValuesQuery": "",
792 "datasource": "$datasource",
799 "query": "label_values(ceph_rgw_qlen, ceph_daemon)",
803 "tagValuesQuery": "",
816 "refresh_intervals": [
841 "title": "Host Overview",