1 local g = import 'grafonnet/grafana.libsonnet';
2 local u = import 'utils.libsonnet';
4 (import 'utils.libsonnet') {
5 'radosgw-sync-overview.json':
// RgwSyncOverviewPanel: local helper that builds one panel for the
// 'radosgw-sync-overview.json' dashboard.
//   title             - panel title.
//   formatY1, labelY1 - presumably the left Y-axis unit format and label; how
//                       they are consumed is not visible in this excerpt
//                       (TODO confirm against the full helper body).
//   rgwMetric         - metric name substituted into the %(rgwMetric)s
//                       placeholder of the query template below.
//   x, y, w, h        - grid position and size, merged in as `gridPos`.
// The query template sums rate(<rgwMetric>[$__rate_interval]) by
// `source_zone`. The %(matchers)s placeholder is filled from the object
// returned by $.matchers(), which is extended with `rgwMetric` before
// formatting.
6 local RgwSyncOverviewPanel(title, formatY1, labelY1, rgwMetric, x, y, w, h) =
22 'sum by (source_zone) (rate(%(rgwMetric)s{%(matchers)s}[$__rate_interval]))'
23 % ($.matchers() + { rgwMetric: rgwMetric }),
27 ) + { gridPos: { x: x, y: y, w: w, h: h } };
36 $._config.dashboardTags + ['overview'],
40 $.addAnnotationSchema(
45 'rgba(0, 211, 255, 1)',
46 'Annotations & Alerts',
51 type='grafana', id='grafana', name='Grafana', version='5.0.0'
54 type='panel', id='graph', name='Graph', version='5.0.0'
57 g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
60 $.addClusterTemplate()
69 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
79 'Replication (throughput) from Source Zone',
82 'ceph_data_sync_from_zone_fetch_bytes_sum',
89 'Replication (objects) from Source Zone',
92 'ceph_data_sync_from_zone_fetch_bytes_count',
99 'Polling Request Latency from Source Zone',
102 'ceph_data_sync_from_zone_poll_latency_sum',
108 RgwSyncOverviewPanel(
109 'Unsuccessful Object Replications from Source Zone',
112 'ceph_data_sync_from_zone_fetch_errors',
119 'radosgw-overview.json':
// RgwOverviewPanel: local helper that builds one panel for the
// 'radosgw-overview.json' dashboard.
// NOTE(review): the parameter list and most of the body are not visible in
// this excerpt. From the visible lines:
//   - `datasource='$datasource'`, `legend_alignAsTable=false` and
//     `legend_current=false` look like defaulted parameters (confirm against
//     the full signature);
//   - a single query target is created from `expr1` / `legendFormat1` via
//     $.addTargetSchema;
//   - the result is extended with `gridPos` built from `x`, `y`, `w`, `h`.
120 local RgwOverviewPanel(
131 datasource='$datasource',
132 legend_alignAsTable=false,
136 legend_current=false,
160 [$.addTargetSchema(expr1, legendFormat1)]
161 ) + { gridPos: { x: x, y: y, w: w, h: h } };
170 $._config.dashboardTags + ['overview'],
174 $.addAnnotationSchema(
179 'rgba(0, 211, 255, 1)',
180 'Annotations & Alerts',
185 type='grafana', id='grafana', name='Grafana', version='5.0.0'
188 type='panel', id='graph', name='Graph', version='5.0.0'
191 g.template.datasource('datasource',
197 $.addClusterTemplate()
206 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
218 'label_values(haproxy_server_http_responses_total{job=~"$job_haproxy", instance=~"$ingress_service"}, code)',
230 'label_values(haproxy_server_status, job)',
244 'label_values(haproxy_server_status{job=~"$job_haproxy"}, instance)',
253 $.addRowSchema(false,
255 'RGW Overview - All Gateways') +
257 gridPos: { x: 0, y: 0, w: 24, h: 1 },
260 'Average GET/PUT Latencies',
265 rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
266 rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval]) *
267 on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}
278 rate(ceph_rgw_put_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
279 rate(ceph_rgw_put_initial_lat_count{%(matchers)s}[$__rate_interval]) *
280 on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s}
287 'Total Requests/sec by RGW Instance',
294 rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) *
295 on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
296 "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
307 'GET Latencies by RGW Instance',
308 'Latencies are shown stacked, without a yaxis to provide a visual indication of GET latency imbalance across RGW hosts',
313 rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
314 rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval]) *
315 on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
316 "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
326 'Bandwidth Consumed by Type',
327 'Total bytes transferred in/out of all radosgw instances within the cluster',
330 'sum(rate(ceph_rgw_get_b{%(matchers)s}[$__rate_interval]))' % $.matchers(),
337 [$.addTargetSchema('sum(rate(ceph_rgw_put_b{%(matchers)s}[$__rate_interval]))' % $.matchers(),
341 'Bandwidth by RGW Instance',
342 'Total bytes transferred in/out through get/put operations, by radosgw instance',
346 label_replace(sum by (instance_id) (
347 rate(ceph_rgw_get_b{%(matchers)s}[$__rate_interval]) +
348 rate(ceph_rgw_put_b{%(matchers)s}[$__rate_interval])) *
349 on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
350 "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
360 'PUT Latencies by RGW Instance',
361 'Latencies are shown stacked, without a yaxis to provide a visual indication of PUT latency imbalance across RGW hosts',
366 rate(ceph_rgw_put_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
367 rate(ceph_rgw_put_initial_lat_count{%(matchers)s}[$__rate_interval]) *
368 on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s},
369 "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"
379 false, true, 'RGW Overview - HAProxy Metrics'
380 ) + { gridPos: { x: 0, y: 12, w: 9, h: 12 } },
382 'Total responses by HTTP code',
389 haproxy_frontend_http_responses_total{code=~"$code", job=~"$job_haproxy", instance=~"$ingress_service", proxy=~"frontend"}[$__rate_interval]
393 'Frontend {{ code }}',
412 haproxy_backend_http_responses_total{code=~"$code", job=~"$job_haproxy", instance=~"$ingress_service", proxy=~"backend"}[$__rate_interval]
415 |||, 'Backend {{ code }}'
422 transform: 'negative-Y',
424 { alias: '/.*1.*/' },
425 { alias: '/.*2.*/' },
426 { alias: '/.*3.*/' },
427 { alias: '/.*4.*/' },
428 { alias: '/.*5.*/' },
429 { alias: '/.*other.*/' },
432 'Total requests / responses',
439 haproxy_frontend_http_requests_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
462 haproxy_backend_response_errors_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
465 |||, 'Response errors', 'time_series', 2
471 haproxy_frontend_request_errors_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
474 |||, 'Requests errors'
480 haproxy_backend_redispatch_warnings_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
483 |||, 'Backend redispatch', 'time_series', 2
489 haproxy_backend_retry_warnings_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
492 |||, 'Backend retry', 'time_series', 2
498 haproxy_frontend_requests_denied_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
501 |||, 'Request denied', 'time_series', 2
506 haproxy_backend_current_queue{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}
508 |||, 'Backend Queued', 'time_series', 2
514 alias: '/.*Response.*/',
515 transform: 'negative-Y',
518 alias: '/.*Backend.*/',
519 transform: 'negative-Y',
523 'Total number of connections',
530 haproxy_frontend_connections_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
553 haproxy_backend_connection_attempts_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
562 haproxy_backend_connection_errors_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
572 transform: 'negative-Y',
576 'Current total of incoming / outgoing bytes',
583 haproxy_frontend_bytes_in_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
606 haproxy_frontend_bytes_out_total{proxy=~"frontend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
609 |||, 'OUT Front', 'time_series', 2
615 haproxy_backend_bytes_in_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
618 |||, 'IN Back', 'time_series', 2
624 haproxy_backend_bytes_out_total{proxy=~"backend", job=~"$job_haproxy", instance=~"$ingress_service"}[$__rate_interval]
627 |||, 'OUT Back', 'time_series', 2
634 transform: 'negative-Y',
638 'radosgw-detail.json':
// RgwDetailsPanel: local helper that builds one panel for the
// 'radosgw-detail.json' dashboard.
//   aliasColors - passed as the first argument to $.graphPanelSchema;
//                 presumably a map from series alias to color (confirm
//                 against $.graphPanelSchema).
// NOTE(review): the remaining parameters are not visible in this excerpt.
// From the visible lines, the panel gets two query targets, built from
// (expr1, legendFormat1) and (expr2, legendFormat2) via $.addTargetSchema,
// and the result is extended with `gridPos` built from `x`, `y`, `w`, `h`.
639 local RgwDetailsPanel(aliasColors,
652 $.graphPanelSchema(aliasColors,
665 [$.addTargetSchema(expr1, legendFormat1), $.addTargetSchema(expr2, legendFormat2)]
666 ) + { gridPos: { x: x, y: y, w: w, h: h } };
669 'RGW Instance Detail',
675 $._config.dashboardTags + ['overview'],
679 $.addAnnotationSchema(
684 'rgba(0, 211, 255, 1)',
685 'Annotations & Alerts',
690 type='grafana', id='grafana', name='Grafana', version='5.0.0'
694 id='grafana-piechart-panel',
699 type='panel', id='graph', name='Graph', version='5.0.0'
702 g.template.datasource('datasource',
708 $.addClusterTemplate()
714 $.addTemplateSchema('rgw_servers',
716 'label_values(ceph_rgw_metadata{%(matchers)s}, ceph_daemon)' % $.matchers(),
724 $.addRowSchema(false, true, 'RGW Host Detail : $rgw_servers') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
727 '$rgw_servers GET/PUT Latencies',
732 sum by (instance_id) (
733 rate(ceph_rgw_get_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
734 rate(ceph_rgw_get_initial_lat_count{%(matchers)s}[$__rate_interval])
735 ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
738 sum by (instance_id) (
739 rate(ceph_rgw_put_initial_lat_sum{%(matchers)s}[$__rate_interval]) /
740 rate(ceph_rgw_put_initial_lat_count{%(matchers)s}[$__rate_interval])
741 ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
743 'GET {{ceph_daemon}}',
744 'PUT {{ceph_daemon}}',
752 'Bandwidth by HTTP Operation',
757 rate(ceph_rgw_get_b{%(matchers)s}[$__rate_interval]) *
758 on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
761 rate(ceph_rgw_put_b{%(matchers)s}[$__rate_interval]) *
762 on (instance_id) group_left (ceph_daemon)
763 ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
765 'GETs {{ceph_daemon}}',
766 'PUTs {{ceph_daemon}}',
778 'Requests Failed': '#bf1b00',
780 'HTTP Request Breakdown',
785 rate(ceph_rgw_failed_req{%(matchers)s}[$__rate_interval]) *
786 on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s,ceph_daemon=~"$rgw_servers"}
789 rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) *
790 on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
792 'Requests Failed {{ceph_daemon}}',
793 'GETs {{ceph_daemon}}',
803 rate(ceph_rgw_put{%(matchers)s}[$__rate_interval]) *
804 on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
806 'PUTs {{ceph_daemon}}'
811 rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) -
813 rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) +
814 rate(ceph_rgw_put{%(matchers)s}[$__rate_interval])
816 ) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
818 'Other {{ceph_daemon}}'
825 'Other (HEAD,POST,DELETE)': '#447ebc',
829 }, '', 'Workload Breakdown'
831 .addTarget($.addTargetSchema(
833 rate(ceph_rgw_failed_req{%(matchers)s}[$__rate_interval]) *
834 on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
836 'Failures {{ceph_daemon}}'
838 .addTarget($.addTargetSchema(
840 rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) *
841 on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
843 'GETs {{ceph_daemon}}'
845 .addTarget($.addTargetSchema(
847 rate(ceph_rgw_put{%(matchers)s}[$__rate_interval]) *
848 on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
850 'PUTs {{ceph_daemon}}'
852 .addTarget($.addTargetSchema(
855 rate(ceph_rgw_req{%(matchers)s}[$__rate_interval]) -
857 rate(ceph_rgw_get{%(matchers)s}[$__rate_interval]) +
858 rate(ceph_rgw_put{%(matchers)s}[$__rate_interval])
860 ) * on (instance_id) group_left (ceph_daemon)
861 ceph_rgw_metadata{%(matchers)s, ceph_daemon=~"$rgw_servers"}
863 'Other (DELETE,LIST) {{ceph_daemon}}'
864 )) + { gridPos: { x: 20, y: 1, w: 4, h: 8 } },