local g = import 'grafonnet/grafana.libsonnet';
local u = import 'utils.libsonnet';

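// RADOS Gateway (RGW) dashboards for the ceph-mixin. This file emits three
// dashboards: radosgw-sync-overview (multisite sync traffic per source zone),
// radosgw-overview (cluster-wide RGW plus HAProxy/ingress metrics) and
// radosgw-detail (per-daemon breakdown driven by the $rgw_servers template).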
{
  grafanaDashboards+:: {
    'radosgw-sync-overview.json':
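      // Helper for the sync-overview panels: graphs the per-source_zone rate()
      // of one RGW multisite sync counter and pins the panel at the given
      // gridPos coordinates.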
      local RgwSyncOverviewPanel(title, formatY1, labelY1, rgwMetric, x, y, w, h) =
        u.graphPanelSchema({},
          title,
          '',
          'null as zero',
          true,
          formatY1,
          'short',
          labelY1,
          null,
          0,
          1,
          '$datasource')
        .addTargets(
          [u.addTargetSchema('sum by (source_zone) (rate(%s[30s]))' % rgwMetric,
            1,
            'time_series',
            '{{source_zone}}')]
        ) + { gridPos: { x: x, y: y, w: w, h: h } };

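      // Assemble the dashboard: schema, annotation, plugin requirements, the
      // rgw_servers/datasource templates, and one panel per sync metric.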
      u.dashboardSchema(
        'RGW Sync Overview',
        '',
        'rgw-sync-overview',
        'now-1h',
        '15s',
        16,
        ['overview'],
        '',
        {
          refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
          time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
        }
      )
      .addAnnotation(
        u.addAnnotationSchema(
          1,
          '-- Grafana --',
          true,
          true,
          'rgba(0, 211, 255, 1)',
          'Annotations & Alerts',
          'dashboard'
        )
      )
      .addRequired(
        type='grafana', id='grafana', name='Grafana', version='5.0.0'
      )
      .addRequired(
        type='panel', id='graph', name='Graph', version='5.0.0'
      )
      .addTemplate(
        u.addTemplateSchema('rgw_servers', '$datasource', 'label_values(ceph_rgw_metadata, ceph_daemon)', 1, true, 1, '', '')
      )
      .addTemplate(
        g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
      )
      .addPanels([
        RgwSyncOverviewPanel(
          'Replication (throughput) from Source Zone',
          'Bps',
          null,
          'ceph_data_sync_from_zone_fetch_bytes_sum',
          0,
          0,
          8,
          7
        ),
        RgwSyncOverviewPanel(
          'Replication (objects) from Source Zone',
          'short',
          'Objects/s',
          'ceph_data_sync_from_zone_fetch_bytes_count',
          8,
          0,
          8,
          7
        ),
        RgwSyncOverviewPanel(
          'Polling Request Latency from Source Zone',
          'ms',
          null,
          'ceph_data_sync_from_zone_poll_latency_sum',
          16,
          0,
          8,
          7
        ),
        RgwSyncOverviewPanel(
          'Unsuccessful Object Replications from Source Zone',
          'short',
          'Count/s',
          'ceph_data_sync_from_zone_fetch_errors',
          0,
          7,
          8,
          7
        ),
      ]),
    'radosgw-overview.json':
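      // Helper for the overview panels. One query/legend pair is wired up here;
      // callers chain .addTargets() for extra series, and the legend_* flags turn
      // on the table legend with avg/min/max/current values.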
      local RgwOverviewPanel(
        title,
        description,
        formatY1,
        formatY2,
        expr1,
        legendFormat1,
        x,
        y,
        w,
        h,
        datasource='$datasource',
        legend_alignAsTable=false,
        legend_avg=false,
        legend_min=false,
        legend_max=false,
        legend_current=false,
        legend_values=false
      ) =
        u.graphPanelSchema(
          {},
          title,
          description,
          'null',
          false,
          formatY1,
          formatY2,
          null,
          null,
          0,
          1,
          datasource,
          legend_alignAsTable,
          legend_avg,
          legend_min,
          legend_max,
          legend_current,
          legend_values
        )
        .addTargets(
          [u.addTargetSchema(expr1, 1, 'time_series', legendFormat1)]
        ) + { gridPos: { x: x, y: y, w: w, h: h } };

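      // The overview dashboard joins RGW perf counters to ceph_rgw_metadata on
      // instance_id so series can be labelled by ceph_daemon/rgw_host, and adds
      // templates for the HAProxy ingress service and HTTP response code.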
      u.dashboardSchema(
        'RGW Overview',
        '',
        'WAkugZpiz',
        'now-1h',
        '15s',
        16,
        ['overview'],
        '',
        {
          refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
          time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
        }
      )
      .addAnnotation(
        u.addAnnotationSchema(
          1,
          '-- Grafana --',
          true,
          true,
          'rgba(0, 211, 255, 1)',
          'Annotations & Alerts',
          'dashboard'
        )
      )
      .addRequired(
        type='grafana', id='grafana', name='Grafana', version='5.0.0'
      )
      .addRequired(
        type='panel', id='graph', name='Graph', version='5.0.0'
      )
      .addTemplate(
        u.addTemplateSchema(
          'rgw_servers',
          '$datasource',
          'label_values(ceph_rgw_metadata, ceph_daemon)',
          1,
          true,
          1,
          '',
          ''
        )
      )
      .addTemplate(
        u.addTemplateSchema(
          'code',
          '$datasource',
          'label_values(haproxy_server_http_responses_total{instance=~"$ingress_service"}, code)',
          1,
          true,
          1,
          'HTTP Code',
          ''
        )
      )
      .addTemplate(
        u.addTemplateSchema(
          'ingress_service',
          '$datasource',
          'label_values(haproxy_server_status, instance)',
          1,
          true,
          1,
          'Ingress Service',
          ''
        )
      )
      .addTemplate(
        g.template.datasource('datasource',
          'prometheus',
          'default',
          label='Data Source')
      )
      .addPanels([
        u.addRowSchema(false,
          true,
          'RGW Overview - All Gateways') +
        {
          gridPos: { x: 0, y: 0, w: 24, h: 1 },
        },
        RgwOverviewPanel(
          'Average GET/PUT Latencies',
          '',
          's',
          'short',
          'rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata',
          'GET AVG',
          0,
          1,
          8,
          7
        ).addTargets(
          [
            u.addTargetSchema(
              'rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata',
              1,
              'time_series',
              'PUT AVG'
            ),
          ]
        ),
        RgwOverviewPanel(
          'Total Requests/sec by RGW Instance',
          '',
          'none',
          'short',
          'sum by (rgw_host) (label_replace(rate(ceph_rgw_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"))',
          '{{rgw_host}}',
          8,
          1,
          7,
          7
        ),
        RgwOverviewPanel(
          'GET Latencies by RGW Instance',
          'Latencies are shown stacked, without a yaxis to provide a visual indication of GET latency imbalance across RGW hosts',
          's',
          'short',
          'label_replace(\n rate(ceph_rgw_get_initial_lat_sum[30s]) /\n rate(ceph_rgw_get_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n"rgw_host", "$1", "ceph_daemon", "rgw.(.*)")',
          '{{rgw_host}}',
          15,
          1,
          6,
          7
        ),
        RgwOverviewPanel(
          'Bandwidth Consumed by Type',
          'Total bytes transferred in/out of all radosgw instances within the cluster',
          'bytes',
          'short',
          'sum(rate(ceph_rgw_get_b[30s]))',
          'GETs',
          0,
          8,
          8,
          6
        ).addTargets(
          [u.addTargetSchema('sum(rate(ceph_rgw_put_b[30s]))',
            1,
            'time_series',
            'PUTs')]
        ),
        RgwOverviewPanel(
          'Bandwidth by RGW Instance',
          'Total bytes transferred in/out through get/put operations, by radosgw instance',
          'bytes',
          'short',
          'label_replace(sum by (instance_id) (\n rate(ceph_rgw_get_b[30s]) + \n rate(ceph_rgw_put_b[30s])\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)")',
          '{{rgw_host}}',
          8,
          8,
          7,
          6
        ),
        RgwOverviewPanel(
          'PUT Latencies by RGW Instance',
          'Latencies are shown stacked, without a yaxis to provide a visual indication of PUT latency imbalance across RGW hosts',
          's',
          'short',
          'label_replace(\n rate(ceph_rgw_put_initial_lat_sum[30s]) /\n rate(ceph_rgw_put_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n"rgw_host", "$1", "ceph_daemon", "rgw.(.*)")',
          '{{rgw_host}}',
          15,
          8,
          6,
          6
        ),
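        // HAProxy / ingress panels: these rely on the haproxy_* series scraped
        // from the RGW ingress service selected by the $ingress_service template.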
        u.addRowSchema(
          false, true, 'RGW Overview - HAProxy Metrics'
        ) + { gridPos: { x: 0, y: 12, w: 9, h: 12 } },
        RgwOverviewPanel(
          'Total responses by HTTP code',
          '',
          'short',
          'short',
          'sum(irate(haproxy_frontend_http_responses_total{code=~"$code",instance=~"$ingress_service",proxy=~"frontend"}[5m])) by (code)',
          'Frontend {{ code }}',
          0,
          12,
          5,
          12,
          '$datasource',
          true,
          true,
          true,
          true,
          true,
          true
        )
        .addTargets(
          [u.addTargetSchema('sum(irate(haproxy_backend_http_responses_total{code=~"$code",instance=~"$ingress_service",proxy=~"backend"}[5m])) by (code)', 1, 'time_series', 'Backend {{ code }}')]
        )
        .addSeriesOverride([
          {
            alias: '/.*Back.*/',
            transform: 'negative-Y',
          },
          { alias: '/.*1.*/' },
          { alias: '/.*2.*/' },
          { alias: '/.*3.*/' },
          { alias: '/.*4.*/' },
          { alias: '/.*5.*/' },
          { alias: '/.*other.*/' },
        ]),
        RgwOverviewPanel(
          'Total requests / responses',
          '',
          'short',
          'short',
          'sum(irate(haproxy_frontend_http_requests_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)',
          'Requests',
          5,
          12,
          5,
          12,
          '$datasource',
          true,
          true,
          true,
          true,
          true,
          true
        )
        .addTargets(
          [
            u.addTargetSchema('sum(irate(haproxy_backend_response_errors_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Response errors'),
            u.addTargetSchema('sum(irate(haproxy_frontend_request_errors_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Requests errors'),
            u.addTargetSchema('sum(irate(haproxy_backend_redispatch_warnings_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Backend redispatch'),
            u.addTargetSchema('sum(irate(haproxy_backend_retry_warnings_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Backend retry'),
            u.addTargetSchema('sum(irate(haproxy_frontend_requests_denied_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Request denied'),
            u.addTargetSchema('sum(haproxy_backend_current_queue{proxy=~"backend",instance=~"$ingress_service"}) by (instance)', 2, 'time_series', 'Backend Queued'),
          ]
        )
        .addSeriesOverride([
          {
            alias: '/.*Response.*/',
            transform: 'negative-Y',
          },
          {
            alias: '/.*Backend.*/',
            transform: 'negative-Y',
          },
        ]),
        RgwOverviewPanel(
          'Total number of connections',
          '',
          'short',
          'short',
          'sum(irate(haproxy_frontend_connections_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)',
          'Front',
          10,
          12,
          5,
          12,
          '$datasource',
          true,
          true,
          true,
          true,
          true,
          true
        )
        .addTargets(
          [
            u.addTargetSchema('sum(irate(haproxy_backend_connection_attempts_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Back'),
            u.addTargetSchema('sum(irate(haproxy_backend_connection_errors_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Back errors'),
          ]
        )
        .addSeriesOverride([
          {
            alias: '/.*Back.*/',
            transform: 'negative-Y',
          },
        ]),
        RgwOverviewPanel(
          'Current total of incoming / outgoing bytes',
          '',
          'short',
          'short',
          'sum(irate(haproxy_frontend_bytes_in_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])*8) by (instance)',
          'IN Front',
          15,
          12,
          6,
          12,
          '$datasource',
          true,
          true,
          true,
          true,
          true,
          true
        )
        .addTargets(
          [
            u.addTargetSchema('sum(irate(haproxy_frontend_bytes_out_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'OUT Front'),
            u.addTargetSchema('sum(irate(haproxy_backend_bytes_in_total{proxy=~"backend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'IN Back'),
            u.addTargetSchema('sum(irate(haproxy_backend_bytes_out_total{proxy=~"backend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'OUT Back'),
          ]
        )
        .addSeriesOverride([
          {
            alias: '/.*OUT.*/',
            transform: 'negative-Y',
          },
        ]),
      ]),
    'radosgw-detail.json':
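      // Helper for the per-daemon detail panels: each panel plots two queries
      // (typically a GET and a PUT series), with optional alias colors.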
      local RgwDetailsPanel(aliasColors,
        title,
        description,
        formatY1,
        formatY2,
        expr1,
        expr2,
        legendFormat1,
        legendFormat2,
        x,
        y,
        w,
        h) =
        u.graphPanelSchema(aliasColors,
          title,
          description,
          'null',
          false,
          formatY1,
          formatY2,
          null,
          null,
          0,
          1,
          '$datasource')
        .addTargets(
          [u.addTargetSchema(expr1, 1, 'time_series', legendFormat1), u.addTargetSchema(expr2, 1, 'time_series', legendFormat2)]
        ) + { gridPos: { x: x, y: y, w: w, h: h } };

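      // Per-daemon dashboard filtered by the $rgw_servers template; the queries
      // join against ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"} to select one daemon.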
      u.dashboardSchema(
        'RGW Instance Detail',
        '',
        'x5ARzZtmk',
        'now-1h',
        '15s',
        16,
        ['overview'],
        '',
        {
          refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
          time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
        }
      )
      .addAnnotation(
        u.addAnnotationSchema(
          1,
          '-- Grafana --',
          true,
          true,
          'rgba(0, 211, 255, 1)',
          'Annotations & Alerts',
          'dashboard'
        )
      )
      .addRequired(
        type='grafana', id='grafana', name='Grafana', version='5.0.0'
      )
      .addRequired(
        type='panel',
        id='grafana-piechart-panel',
        name='Pie Chart',
        version='1.3.3'
      )
      .addRequired(
        type='panel', id='graph', name='Graph', version='5.0.0'
      )
      .addTemplate(
        g.template.datasource('datasource',
          'prometheus',
          'default',
          label='Data Source')
      )
      .addTemplate(
        u.addTemplateSchema('rgw_servers',
          '$datasource',
          'label_values(ceph_rgw_metadata, ceph_daemon)',
          1,
          true,
          1,
          '',
          '')
      )
      .addPanels([
        u.addRowSchema(false, true, 'RGW Host Detail : $rgw_servers') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
        RgwDetailsPanel(
          {},
          '$rgw_servers GET/PUT Latencies',
          '',
          's',
          'short',
          'sum by (instance_id) (rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
          'sum by (instance_id) (rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
          'GET {{ceph_daemon}}',
          'PUT {{ceph_daemon}}',
          0,
          1,
          6,
          8
        ),
        RgwDetailsPanel(
          {},
          'Bandwidth by HTTP Operation',
          '',
          'bytes',
          'short',
          'rate(ceph_rgw_get_b[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
          'rate(ceph_rgw_put_b[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
          'GETs {{ceph_daemon}}',
          'PUTs {{ceph_daemon}}',
          6,
          1,
          7,
          8
        ),
        RgwDetailsPanel(
          {
            GETs: '#7eb26d',
            Other: '#447ebc',
            PUTs: '#eab839',
            Requests: '#3f2b5b',
            'Requests Failed': '#bf1b00',
          },
          'HTTP Request Breakdown',
          '',
          'short',
          'short',
          'rate(ceph_rgw_failed_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
          'rate(ceph_rgw_get[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
          'Requests Failed {{ceph_daemon}}',
          'GETs {{ceph_daemon}}',
          13,
          1,
          7,
          8
        )
        .addTargets(
          [
            u.addTargetSchema(
              'rate(ceph_rgw_put[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
              1,
              'time_series',
              'PUTs {{ceph_daemon}}'
            ),
            u.addTargetSchema(
              '(\n rate(ceph_rgw_req[30s]) -\n (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s]))\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
              1,
              'time_series',
              'Other {{ceph_daemon}}'
            ),
          ]
        ),
        u.addPieChartSchema(
          {
            GETs: '#7eb26d',
            'Other (HEAD,POST,DELETE)': '#447ebc',
            PUTs: '#eab839',
            Requests: '#3f2b5b',
            Failures: '#bf1b00',
          }, '$datasource', '', 'Under graph', 'pie', 'Workload Breakdown', 'current'
        )
        .addTarget(u.addTargetSchema(
          'rate(ceph_rgw_failed_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
          1,
          'time_series',
          'Failures {{ceph_daemon}}'
        ))
        .addTarget(u.addTargetSchema(
          'rate(ceph_rgw_get[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
          1,
          'time_series',
          'GETs {{ceph_daemon}}'
        ))
        .addTarget(u.addTargetSchema(
          'rate(ceph_rgw_put[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
          1,
          'time_series',
          'PUTs {{ceph_daemon}}'
        ))
        .addTarget(u.addTargetSchema(
          '(\n rate(ceph_rgw_req[30s]) -\n (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s]))\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
          1,
          'time_series',
          'Other (DELETE,LIST) {{ceph_daemon}}'
        )) + { gridPos: { x: 20, y: 1, w: 4, h: 8 } },
      ]),
  },
}