local g = import 'grafonnet/grafana.libsonnet';
local u = import 'utils.libsonnet';

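// RADOS Gateway (RGW) dashboards for the ceph-mixin. This file emits three
// dashboards: radosgw-sync-overview (multisite sync traffic per source zone),
// radosgw-overview (cluster-wide RGW plus HAProxy/ingress metrics) and
// radosgw-detail (per-daemon breakdown driven by the $rgw_servers template).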
{
  grafanaDashboards+:: {
    'radosgw-sync-overview.json':
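      // Helper for the sync-overview panels: graphs the per-source_zone rate()
      // of one RGW multisite sync counter and pins the panel at the given
      // gridPos coordinates.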
      local RgwSyncOverviewPanel(title, formatY1, labelY1, rgwMetric, x, y, w, h) =
        u.graphPanelSchema({},
          title,
          '',
          'null as zero',
          true,
          formatY1,
          'short',
          labelY1,
          null,
          0,
          1,
          '$datasource')
        .addTargets(
          [u.addTargetSchema('sum by (source_zone) (rate(%s[30s]))' % rgwMetric,
            1,
            'time_series',
            '{{source_zone}}')]
        ) + { gridPos: { x: x, y: y, w: w, h: h } };

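      // Assemble the dashboard: schema, annotation, plugin requirements, the
      // rgw_servers/datasource templates, and one panel per sync metric.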
      u.dashboardSchema(
        'RGW Sync Overview',
        '',
        'rgw-sync-overview',
        'now-1h',
        '15s',
        16,
        ['overview'],
        '',
        {
          refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
          time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
        }
      )
      .addAnnotation(
        u.addAnnotationSchema(
          1,
          '-- Grafana --',
          true,
          true,
          'rgba(0, 211, 255, 1)',
          'Annotations & Alerts',
          'dashboard'
        )
      )
      .addRequired(
        type='grafana', id='grafana', name='Grafana', version='5.0.0'
      )
      .addRequired(
        type='panel', id='graph', name='Graph', version='5.0.0'
      )
      .addTemplate(
        u.addTemplateSchema('rgw_servers', '$datasource', 'label_values(ceph_rgw_metadata, ceph_daemon)', 1, true, 1, '', '')
      )
      .addTemplate(
        g.template.datasource('datasource', 'prometheus', 'default', label='Data Source')
      )
      .addPanels([
        RgwSyncOverviewPanel(
          'Replication (throughput) from Source Zone',
          'Bps',
          null,
          'ceph_data_sync_from_zone_fetch_bytes_sum',
          0,
          0,
          8,
          7
        ),
        RgwSyncOverviewPanel(
          'Replication (objects) from Source Zone',
          'short',
          'Objects/s',
          'ceph_data_sync_from_zone_fetch_bytes_count',
          8,
          0,
          8,
          7
        ),
        RgwSyncOverviewPanel(
          'Polling Request Latency from Source Zone',
          'ms',
          null,
          'ceph_data_sync_from_zone_poll_latency_sum',
          16,
          0,
          8,
          7
        ),
        RgwSyncOverviewPanel(
          'Unsuccessful Object Replications from Source Zone',
          'short',
          'Count/s',
          'ceph_data_sync_from_zone_fetch_errors',
          0,
          7,
          8,
          7
        ),
      ]),
    'radosgw-overview.json':
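      // Helper for the overview panels. One query/legend pair is wired up here;
      // callers chain .addTargets() for extra series, and the legend_* flags turn
      // on the table legend with avg/min/max/current values.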
      local RgwOverviewPanel(
        title,
        description,
        formatY1,
        formatY2,
        expr1,
        legendFormat1,
        x,
        y,
        w,
        h,
        datasource='$datasource',
        legend_alignAsTable=false,
        legend_avg=false,
        legend_min=false,
        legend_max=false,
        legend_current=false,
        legend_values=false
      ) =
        u.graphPanelSchema(
          {},
          title,
          description,
          'null',
          false,
          formatY1,
          formatY2,
          null,
          null,
          0,
          1,
          datasource,
          legend_alignAsTable,
          legend_avg,
          legend_min,
          legend_max,
          legend_current,
          legend_values
        )
        .addTargets(
          [u.addTargetSchema(expr1, 1, 'time_series', legendFormat1)]
        ) + { gridPos: { x: x, y: y, w: w, h: h } };

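      // The overview dashboard joins RGW perf counters to ceph_rgw_metadata on
      // instance_id so series can be labelled by ceph_daemon/rgw_host, and adds
      // templates for the HAProxy ingress service and HTTP response code.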
      u.dashboardSchema(
        'RGW Overview',
        '',
        'WAkugZpiz',
        'now-1h',
        '15s',
        16,
        ['overview'],
        '',
        {
          refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
          time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
        }
      )
      .addAnnotation(
        u.addAnnotationSchema(
          1,
          '-- Grafana --',
          true,
          true,
          'rgba(0, 211, 255, 1)',
          'Annotations & Alerts',
          'dashboard'
        )
      )
      .addRequired(
        type='grafana', id='grafana', name='Grafana', version='5.0.0'
      )
      .addRequired(
        type='panel', id='graph', name='Graph', version='5.0.0'
      )
      .addTemplate(
        u.addTemplateSchema(
          'rgw_servers',
          '$datasource',
          'label_values(ceph_rgw_metadata, ceph_daemon)',
          1,
          true,
          1,
          '',
          ''
        )
      )
      .addTemplate(
        u.addTemplateSchema(
          'code',
          '$datasource',
          'label_values(haproxy_server_http_responses_total{instance=~"$ingress_service"}, code)',
          1,
          true,
          1,
          'HTTP Code',
          ''
        )
      )
      .addTemplate(
        u.addTemplateSchema(
          'ingress_service',
          '$datasource',
          'label_values(haproxy_server_status, instance)',
          1,
          true,
          1,
          'Ingress Service',
          ''
        )
      )
      .addTemplate(
        g.template.datasource('datasource',
          'prometheus',
          'default',
          label='Data Source')
      )
      .addPanels([
        u.addRowSchema(false,
          true,
          'RGW Overview - All Gateways') +
        {
          gridPos: { x: 0, y: 0, w: 24, h: 1 },
        },
        RgwOverviewPanel(
          'Average GET/PUT Latencies',
          '',
          's',
          'short',
          'rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata',
          'GET AVG',
          0,
          1,
          8,
          7
        ).addTargets(
          [
            u.addTargetSchema(
              'rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata',
              1,
              'time_series',
              'PUT AVG'
            ),
          ]
        ),
        RgwOverviewPanel(
          'Total Requests/sec by RGW Instance',
          '',
          'none',
          'short',
          'sum by (rgw_host) (label_replace(rate(ceph_rgw_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)"))',
          '{{rgw_host}}',
          8,
          1,
          7,
          7
        ),
        RgwOverviewPanel(
          'GET Latencies by RGW Instance',
          'Latencies are shown stacked, without a yaxis to provide a visual indication of GET latency imbalance across RGW hosts',
          's',
          'short',
          'label_replace(\n rate(ceph_rgw_get_initial_lat_sum[30s]) /\n rate(ceph_rgw_get_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n"rgw_host", "$1", "ceph_daemon", "rgw.(.*)")',
          '{{rgw_host}}',
          15,
          1,
          6,
          7
        ),
        RgwOverviewPanel(
          'Bandwidth Consumed by Type',
          'Total bytes transferred in/out of all radosgw instances within the cluster',
          'bytes',
          'short',
          'sum(rate(ceph_rgw_get_b[30s]))',
          'GETs',
          0,
          8,
          8,
          6
        ).addTargets(
          [u.addTargetSchema('sum(rate(ceph_rgw_put_b[30s]))',
            1,
            'time_series',
            'PUTs')]
        ),
        RgwOverviewPanel(
          'Bandwidth by RGW Instance',
          'Total bytes transferred in/out through get/put operations, by radosgw instance',
          'bytes',
          'short',
          'label_replace(sum by (instance_id) (\n rate(ceph_rgw_get_b[30s]) + \n rate(ceph_rgw_put_b[30s])\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata, "rgw_host", "$1", "ceph_daemon", "rgw.(.*)")',
          '{{rgw_host}}',
          8,
          8,
          7,
          6
        ),
        RgwOverviewPanel(
          'PUT Latencies by RGW Instance',
          'Latencies are shown stacked, without a yaxis to provide a visual indication of PUT latency imbalance across RGW hosts',
          's',
          'short',
          'label_replace(\n rate(ceph_rgw_put_initial_lat_sum[30s]) /\n rate(ceph_rgw_put_initial_lat_count[30s]) *\n on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata,\n"rgw_host", "$1", "ceph_daemon", "rgw.(.*)")',
          '{{rgw_host}}',
          15,
          8,
          6,
          6
        ),
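        // HAProxy / ingress panels: these rely on the haproxy_* series scraped
        // from the RGW ingress service selected by the $ingress_service template.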
        u.addRowSchema(
          false, true, 'RGW Overview - HAProxy Metrics'
        ) + { gridPos: { x: 0, y: 12, w: 9, h: 12 } },
        RgwOverviewPanel(
          'Total responses by HTTP code',
          '',
          'short',
          'short',
          'sum(irate(haproxy_frontend_http_responses_total{code=~"$code",instance=~"$ingress_service",proxy=~"frontend"}[5m])) by (code)',
          'Frontend {{ code }}',
          0,
          12,
          5,
          12,
          '$datasource',
          true,
          true,
          true,
          true,
          true,
          true
        )
        .addTargets(
          [u.addTargetSchema('sum(irate(haproxy_backend_http_responses_total{code=~"$code",instance=~"$ingress_service",proxy=~"backend"}[5m])) by (code)', 1, 'time_series', 'Backend {{ code }}')]
        )
        .addSeriesOverride([
          {
            alias: '/.*Back.*/',
            transform: 'negative-Y',
          },
          { alias: '/.*1.*/' },
          { alias: '/.*2.*/' },
          { alias: '/.*3.*/' },
          { alias: '/.*4.*/' },
          { alias: '/.*5.*/' },
          { alias: '/.*other.*/' },
        ]),
        RgwOverviewPanel(
          'Total requests / responses',
          '',
          'short',
          'short',
          'sum(irate(haproxy_frontend_http_requests_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)',
          'Requests',
          5,
          12,
          5,
          12,
          '$datasource',
          true,
          true,
          true,
          true,
          true,
          true
        )
        .addTargets(
          [
            u.addTargetSchema('sum(irate(haproxy_backend_response_errors_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Response errors'),
            u.addTargetSchema('sum(irate(haproxy_frontend_request_errors_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Requests errors'),
            u.addTargetSchema('sum(irate(haproxy_backend_redispatch_warnings_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Backend redispatch'),
            u.addTargetSchema('sum(irate(haproxy_backend_retry_warnings_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Backend retry'),
            u.addTargetSchema('sum(irate(haproxy_frontend_requests_denied_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)', 2, 'time_series', 'Request denied'),
            u.addTargetSchema('sum(haproxy_backend_current_queue{proxy=~"backend",instance=~"$ingress_service"}) by (instance)', 2, 'time_series', 'Backend Queued'),
          ]
        )
        .addSeriesOverride([
          {
            alias: '/.*Response.*/',
            transform: 'negative-Y',
          },
          {
            alias: '/.*Backend.*/',
            transform: 'negative-Y',
          },
        ]),
        RgwOverviewPanel(
          'Total number of connections',
          '',
          'short',
          'short',
          'sum(irate(haproxy_frontend_connections_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])) by (instance)',
          'Front',
          10,
          12,
          5,
          12,
          '$datasource',
          true,
          true,
          true,
          true,
          true,
          true
        )
        .addTargets(
          [
            u.addTargetSchema('sum(irate(haproxy_backend_connection_attempts_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Back'),
            u.addTargetSchema('sum(irate(haproxy_backend_connection_errors_total{proxy=~"backend",instance=~"$ingress_service"}[5m])) by (instance)', 1, 'time_series', 'Back errors'),
          ]
        )
        .addSeriesOverride([
          {
            alias: '/.*Back.*/',
            transform: 'negative-Y',
          },
        ]),
        RgwOverviewPanel(
          'Current total of incoming / outgoing bytes',
          '',
          'short',
          'short',
          'sum(irate(haproxy_frontend_bytes_in_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])*8) by (instance)',
          'IN Front',
          15,
          12,
          6,
          12,
          '$datasource',
          true,
          true,
          true,
          true,
          true,
          true
        )
        .addTargets(
          [
            u.addTargetSchema('sum(irate(haproxy_frontend_bytes_out_total{proxy=~"frontend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'OUT Front'),
            u.addTargetSchema('sum(irate(haproxy_backend_bytes_in_total{proxy=~"backend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'IN Back'),
            u.addTargetSchema('sum(irate(haproxy_backend_bytes_out_total{proxy=~"backend",instance=~"$ingress_service"}[5m])*8) by (instance)', 2, 'time_series', 'OUT Back'),
          ]
        )
        .addSeriesOverride([
          {
            alias: '/.*OUT.*/',
            transform: 'negative-Y',
          },
        ]),
      ]),
    'radosgw-detail.json':
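      // Helper for the per-daemon detail panels: each panel plots two queries
      // (typically a GET and a PUT series), with optional alias colors.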
      local RgwDetailsPanel(aliasColors,
        title,
        description,
        formatY1,
        formatY2,
        expr1,
        expr2,
        legendFormat1,
        legendFormat2,
        x,
        y,
        w,
        h) =
        u.graphPanelSchema(aliasColors,
          title,
          description,
          'null',
          false,
          formatY1,
          formatY2,
          null,
          null,
          0,
          1,
          '$datasource')
        .addTargets(
          [u.addTargetSchema(expr1, 1, 'time_series', legendFormat1), u.addTargetSchema(expr2, 1, 'time_series', legendFormat2)]
        ) + { gridPos: { x: x, y: y, w: w, h: h } };

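      // Per-daemon dashboard filtered by the $rgw_servers template; the queries
      // join against ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"} to select one daemon.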
      u.dashboardSchema(
        'RGW Instance Detail',
        '',
        'x5ARzZtmk',
        'now-1h',
        '15s',
        16,
        ['overview'],
        '',
        {
          refresh_intervals: ['5s', '10s', '15s', '30s', '1m', '5m', '15m', '30m', '1h', '2h', '1d'],
          time_options: ['5m', '15m', '1h', '6h', '12h', '24h', '2d', '7d', '30d'],
        }
      )
      .addAnnotation(
        u.addAnnotationSchema(
          1,
          '-- Grafana --',
          true,
          true,
          'rgba(0, 211, 255, 1)',
          'Annotations & Alerts',
          'dashboard'
        )
      )
      .addRequired(
        type='grafana', id='grafana', name='Grafana', version='5.0.0'
      )
      .addRequired(
        type='panel',
        id='grafana-piechart-panel',
        name='Pie Chart',
        version='1.3.3'
      )
      .addRequired(
        type='panel', id='graph', name='Graph', version='5.0.0'
      )
      .addTemplate(
        g.template.datasource('datasource',
          'prometheus',
          'default',
          label='Data Source')
      )
      .addTemplate(
        u.addTemplateSchema('rgw_servers',
          '$datasource',
          'label_values(ceph_rgw_metadata, ceph_daemon)',
          1,
          true,
          1,
          '',
          '')
      )
      .addPanels([
        u.addRowSchema(false, true, 'RGW Host Detail : $rgw_servers') + { gridPos: { x: 0, y: 0, w: 24, h: 1 } },
        RgwDetailsPanel(
          {},
          '$rgw_servers GET/PUT Latencies',
          '',
          's',
          'short',
          'sum by (instance_id) (rate(ceph_rgw_get_initial_lat_sum[30s]) / rate(ceph_rgw_get_initial_lat_count[30s])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
          'sum by (instance_id) (rate(ceph_rgw_put_initial_lat_sum[30s]) / rate(ceph_rgw_put_initial_lat_count[30s])) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
          'GET {{ceph_daemon}}',
          'PUT {{ceph_daemon}}',
          0,
          1,
          6,
          8
        ),
        RgwDetailsPanel(
          {},
          'Bandwidth by HTTP Operation',
          '',
          'bytes',
          'short',
          'rate(ceph_rgw_get_b[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
          'rate(ceph_rgw_put_b[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
          'GETs {{ceph_daemon}}',
          'PUTs {{ceph_daemon}}',
          6,
          1,
          7,
          8
        ),
        RgwDetailsPanel(
          {
            GETs: '#7eb26d',
            Other: '#447ebc',
            PUTs: '#eab839',
            Requests: '#3f2b5b',
            'Requests Failed': '#bf1b00',
          },
          'HTTP Request Breakdown',
          '',
          'short',
          'short',
          'rate(ceph_rgw_failed_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
          'rate(ceph_rgw_get[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
          'Requests Failed {{ceph_daemon}}',
          'GETs {{ceph_daemon}}',
          13,
          1,
          7,
          8
        )
        .addTargets(
          [
            u.addTargetSchema(
              'rate(ceph_rgw_put[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
              1,
              'time_series',
              'PUTs {{ceph_daemon}}'
            ),
            u.addTargetSchema(
              '(\n rate(ceph_rgw_req[30s]) -\n (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s]))\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
              1,
              'time_series',
              'Other {{ceph_daemon}}'
            ),
          ]
        ),
        u.addPieChartSchema(
          {
            GETs: '#7eb26d',
            'Other (HEAD,POST,DELETE)': '#447ebc',
            PUTs: '#eab839',
            Requests: '#3f2b5b',
            Failures: '#bf1b00',
          }, '$datasource', '', 'Under graph', 'pie', 'Workload Breakdown', 'current'
        )
        .addTarget(u.addTargetSchema(
          'rate(ceph_rgw_failed_req[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
          1,
          'time_series',
          'Failures {{ceph_daemon}}'
        ))
        .addTarget(u.addTargetSchema(
          'rate(ceph_rgw_get[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
          1,
          'time_series',
          'GETs {{ceph_daemon}}'
        ))
        .addTarget(u.addTargetSchema(
          'rate(ceph_rgw_put[30s]) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
          1,
          'time_series',
          'PUTs {{ceph_daemon}}'
        ))
        .addTarget(u.addTargetSchema(
          '(\n rate(ceph_rgw_req[30s]) -\n (rate(ceph_rgw_get[30s]) + rate(ceph_rgw_put[30s]))\n) * on (instance_id) group_left (ceph_daemon) ceph_rgw_metadata{ceph_daemon=~"$rgw_servers"}',
          1,
          'time_series',
          'Other (DELETE,LIST) {{ceph_daemon}}'
        )) + { gridPos: { x: 20, y: 1, w: 4, h: 8 } },
      ]),
  },
}