1 local g = import 'grafonnet/grafana.libsonnet';
2 local u = import 'utils.libsonnet';
6 'radosgw-sync-overview.json':
7 local RgwSyncOverviewPanel(title, formatY1, labelY1, rgwMetric, x, y, w, h) =
21 [u.addTargetSchema('sum by (source_zone) (rate(%s[30s]))' % rgwMetric,
25 ) + { gridPos: { x: x, y: y, w: w, h: h } };
37 refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
38 time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
42 u.addAnnotationSchema(
47 'rgba(0, 211, 255, 1)',
48 'Annotations & Alerts',
53 type='grafana', id='grafana', name='Grafana', version='5.0.0'
56 type='panel', id='graph', name='Graph', version='5.0.0'
u.addTemplateSchema('rgw_servers', '$datasource', 'label_values(ceph_rgw_metadata, ceph_daemon)', 1, true, 1, '', '')
62 g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
66 'Replication (throughput) from Source Zone',
69 'ceph_data_sync_from_zone_fetch_bytes_sum',
76 'Replication (objects) from Source Zone',
79 'ceph_data_sync_from_zone_fetch_bytes_count',
86 'Polling Request Latency from Source Zone',
89 'ceph_data_sync_from_zone_poll_latency_sum',
96 'Unsuccessful Object Replications from Source Zone',
99 'ceph_data_sync_from_zone_fetch_errors',
106 'radosgw-overview.json':
107 local RgwOverviewPanel(
118 datasource='$datasource',
119 legend_alignAsTable=false,
123 legend_current=false,
147 [u.addTargetSchema(expr1, 1, 'time_series', legendFormat1)]
148 ) + { gridPos: { x: x, y: y, w: w, h: h } };
160 refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
161 time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
165 u.addAnnotationSchema(
170 'rgba(0, 211, 255, 1)',
171 'Annotations & Alerts',
176 type='grafana', id='grafana', name='Grafana', version='5.0.0'
179 type='panel', id='graph', name='Graph', version='5.0.0'
185 'label_values(ceph_rgw_metadata, ceph_daemon)',
197 'label_values(haproxy_server_http_responses_total{instance=~"$ingress_service"}, code)',
209 'label_values(haproxy_server_status, instance)',
218 g.template.datasource('datasource',
224 u.addRowSchema(false,
226 'RGW Overview - All Gateways') +
228 gridPos: { x: 0, y: 0, w: 24, h: 1 },
231 'Average GET/PUT Latencies',
235 'rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata',
244 'rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata',
252 'Total Requests/sec by RGW Instance',
256 'sum by (rgw_host) (label_replace(rate(ceph_rgw_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"))',
264 'GET Latencies by RGW Instance',
265 'Latencies are shown stacked, without a yaxis to provide a visual indication of GET latency imbalance across RGW hosts',
268 'label_replace(\n rate(ceph_rgw_get_initial_lat_sum[30s]) /\n rate(ceph_rgw_get_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n"rgw_host", "$1", "ceph_daemon", "rgw.(.*)")',
276 'Bandwidth Consumed by Type',
277 'Total bytes transferred in/out of all radosgw instances within the cluster',
280 'sum(rate(ceph_rgw_get_b[30s]))',
287 [u.addTargetSchema('sum(rate(ceph_rgw_put_b[30s]))',
293 'Bandwidth by RGW Instance',
294 'Total bytes transferred in/out through get/put operations, by radosgw instance',
297 'label_replace(sum by (instance_id) (\n rate(ceph_rgw_get_b[30s]) + \n rate(ceph_rgw_put_b[30s])\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)")',
305 'PUT Latencies by RGW Instance',
306 'Latencies are shown stacked, without a yaxis to provide a visual indication of PUT latency imbalance across RGW hosts',
309 'label_replace(\n rate(ceph_rgw_put_initial_lat_sum[30s]) /\n rate(ceph_rgw_put_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n"rgw_host", "$1", "ceph_daemon", "rgw.(.*)")',
317 false, true, 'RGW Overview - HAProxy Metrics'
318 ) + { gridPos: { x: 0, y: 12, w: 9, h: 12 } },
320 'Total responses by HTTP code',
324 'sum(irate(haproxy_frontend_http_responses_total{code=~"$code",instance=~"$ingress_service",proxy=~"frontend"}[5m])) by (code)',
325 'Frontend {{ code }}',
339 [u.addTargetSchema('sum(irate(haproxy_backend_http_responses_total{code=~"$code",instance=~"$ingress_service",proxy=~"backend"}[5m])) by (code)', 1, 'time_series', 'Backend {{ code }}')]
344 transform: 'negative-Y',
346 { alias: '/.*1.*/' },
347 { alias: '/.*2.*/' },
348 { alias: '/.*3.*/' },
349 { alias: '/.*4.*/' },
350 { alias: '/.*5.*/' },
351 { alias: '/.*other.*/' },
354 'Total requests / responses',
358 'sum(irate(haproxy_frontend_http_requests_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)',
374 u.addTargetSchema('sum(irate(haproxy_backend_response_errors_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Response errors'),
375 u.addTargetSchema('sum(irate(haproxy_frontend_request_errors_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Requests errors'),
376 u.addTargetSchema('sum(irate(haproxy_backend_redispatch_warnings_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Backend redispatch'),
377 u.addTargetSchema('sum(irate(haproxy_backend_retry_warnings_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Backend retry'),
378 u.addTargetSchema('sum(irate(haproxy_frontend_requests_denied_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Request denied'),
379 u.addTargetSchema('sum(haproxy_backend_current_queue{proxy=~"backend",instance=~"$ingress_service"}) by (instance)', 2, 'time_series', 'Backend Queued'),
384 alias: '/.*Response.*/',
385 transform: 'negative-Y',
388 alias: '/.*Backend.*/',
389 transform: 'negative-Y',
393 'Total number of connections',
397 'sum(irate(haproxy_frontend_connections_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)',
413 u.addTargetSchema('sum(irate(haproxy_backend_connection_attempts_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Back'),
414 u.addTargetSchema('sum(irate(haproxy_backend_connection_errors_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Back errors'),
420 transform: 'negative-Y',
424 'Current total of incoming / outgoing bytes',
428 'sum(irate(haproxy_frontend_bytes_in_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])*8) by (instance)',
444 u.addTargetSchema('sum(irate(haproxy_frontend_bytes_out_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'OUT Front'),
445 u.addTargetSchema('sum(irate(haproxy_backend_bytes_in_total{proxy=~"backend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'IN Back'),
446 u.addTargetSchema('sum(irate(haproxy_backend_bytes_out_total{proxy=~"backend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'OUT Back'),
452 transform: 'negative-Y',
456 'radosgw-detail.json':
457 local RgwDetailsPanel(aliasColors,
470 u.graphPanelSchema(aliasColors,
483 [u.addTargetSchema(expr1, 1, 'time_series', legendFormat1), u.addTargetSchema(expr2, 1, 'time_series', legendFormat2)]
484 ) + { gridPos: { x: x, y: y, w: w, h: h } };
487 'RGW Instance Detail',
496 refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
497 time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
501 u.addAnnotationSchema(
506 'rgba(0, 211, 255, 1)',
507 'Annotations & Alerts',
512 type='grafana', id='grafana', name='Grafana', version='5.0.0'
516 id='grafana-piechart-panel',
521 type='panel', id='graph', name='Graph', version='5.0.0'
524 g.template.datasource('datasource',
530 u.addTemplateSchema('rgw_servers',
532 'label_values(ceph_rgw_metadata, ceph_daemon)',
540 u.addRowSchema(false, true, 'RGW Host Detail : $rgw_servers') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
543 '$rgw_servers GET/PUT Latencies',
547 'sum by (instance_id) (rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
548 'sum by (instance_id) (rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
549 'GET {{ceph_daemon}}',
550 'PUT {{ceph_daemon}}',
558 'Bandwidth by HTTP Operation',
562 'rate(ceph_rgw_get_b[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
563 'rate(ceph_rgw_put_b[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
564 'GETs {{ceph_daemon}}',
565 'PUTs {{ceph_daemon}}',
577 'Requests Failed': '#bf1b00',
579 'HTTP Request Breakdown',
583 'rate(ceph_rgw_failed_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
584 'rate(ceph_rgw_get[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
585 'Requests Failed {{ceph_daemon}}',
586 'GETs {{ceph_daemon}}',
595 'rate(ceph_rgw_put[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
598 'PUTs {{ceph_daemon}}'
601 '(\n rate(ceph_rgw_req[30s]) -\n (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s]))\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
604 'Other {{ceph_daemon}}'
611 'Other (HEAD,POST,DELETE)': '#447ebc',
615 }, '$datasource', '', 'Under graph', 'pie', 'Workload Breakdown', 'current'
617 .addTarget(u.addTargetSchema(
618 'rate(ceph_rgw_failed_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
621 'Failures {{ceph_daemon}}'
623 .addTarget(u.addTargetSchema(
624 'rate(ceph_rgw_get[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
627 'GETs {{ceph_daemon}}'
629 .addTarget(u.addTargetSchema(
630 'rate(ceph_rgw_put[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
633 'PUTs {{ceph_daemon}}'
635 .addTarget(u.addTargetSchema(
636 '(\n rate(ceph_rgw_req[30s]) -\n (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s]))\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
639 'Other (DELETE,LIST) {{ceph_daemon}}'
640 )) + { gridPos: { x: 20, y: 1, w: 4, h: 8 } },