]> git.proxmox.com Git - ceph.git/blob - ceph/src/pybind/mgr/cephadm/services/monitoring.py
f99c79e795575531149e31cc5f2a97c52f612003
[ceph.git] / ceph / src / pybind / mgr / cephadm / services / monitoring.py
1 import errno
2 import ipaddress
3 import logging
4 import os
5 import socket
6 from typing import List, Any, Tuple, Dict, Optional, cast
7 from urllib.parse import urlparse
8
9 from mgr_module import HandleCommandResult
10
11 from orchestrator import DaemonDescription
12 from ceph.deployment.service_spec import AlertManagerSpec, GrafanaSpec, ServiceSpec, SNMPGatewaySpec
13 from cephadm.services.cephadmservice import CephadmService, CephadmDaemonDeploySpec
14 from cephadm.services.ingress import IngressSpec
15 from mgr_util import verify_tls, ServerConfigException, create_self_signed_cert, build_url
16
17 logger = logging.getLogger(__name__)
18
19
class GrafanaService(CephadmService):
    """cephadm service handler for Grafana daemons.

    Generates grafana.ini, the ceph-dashboard datasource file and a
    per-host TLS cert/key pair for each deployed grafana daemon, and
    keeps the mgr dashboard module's grafana URL/credentials in sync.
    """

    TYPE = 'grafana'
    # grafana's default HTTP(S) port, used when the daemon spec carries no port
    DEFAULT_SERVICE_PORT = 3000

    def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
        """Populate ``final_config``/``deps`` on the deploy spec and return it."""
        assert self.TYPE == daemon_spec.daemon_type
        daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
        return daemon_spec

    def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
        """Build grafana's config files and dependency list.

        Returns a tuple of (config payload containing a ``files`` dict,
        sorted list of daemon names this grafana instance depends on).
        May issue mon commands as a side effect to configure the
        dashboard module.
        """
        assert self.TYPE == daemon_spec.daemon_type
        deps = []  # type: List[str]

        # every prometheus daemon becomes a grafana datasource and a dep
        prom_services = []  # type: List[str]
        for dd in self.mgr.cache.get_daemons_by_service('prometheus'):
            assert dd.hostname is not None
            # prefer the daemon's bound IP; fall back to the host's FQDN
            addr = dd.ip if dd.ip else self._inventory_get_fqdn(dd.hostname)
            port = dd.ports[0] if dd.ports else 9095
            prom_services.append(build_url(scheme='http', host=addr, port=port))

            deps.append(dd.name())

        # only the first loki daemon is used as the datasource, but all
        # loki daemons are tracked as deps
        daemons = self.mgr.cache.get_daemons_by_service('loki')
        loki_host = ''
        for i, dd in enumerate(daemons):
            assert dd.hostname is not None
            if i == 0:
                addr = dd.ip if dd.ip else self._inventory_get_fqdn(dd.hostname)
                loki_host = build_url(scheme='http', host=addr, port=3100)

            deps.append(dd.name())

        grafana_data_sources = self.mgr.template.render(
            'services/grafana/ceph-dashboard.yml.j2', {'hosts': prom_services, 'loki_host': loki_host})

        # per-host TLS cert/key, persisted in the mgr key/value store
        cert_path = f'{daemon_spec.host}/grafana_crt'
        key_path = f'{daemon_spec.host}/grafana_key'
        cert = self.mgr.get_store(cert_path)
        pkey = self.mgr.get_store(key_path)
        if cert and pkey:
            try:
                verify_tls(cert, pkey)
            except ServerConfigException as e:
                # stored material is unusable: discard and regenerate below
                logger.warning('Provided grafana TLS certificates invalid: %s', str(e))
                cert, pkey = None, None
        if not (cert and pkey):
            # no (valid) cert yet: create a self-signed pair and persist it
            cert, pkey = create_self_signed_cert('Ceph', daemon_spec.host)
            self.mgr.set_store(cert_path, cert)
            self.mgr.set_store(key_path, pkey)
            if 'dashboard' in self.mgr.get('mgr_map')['modules']:
                # self-signed cert -> the dashboard must not verify it
                self.mgr.check_mon_command({
                    'prefix': 'dashboard set-grafana-api-ssl-verify',
                    'value': 'false',
                })

        spec: GrafanaSpec = cast(
            GrafanaSpec, self.mgr.spec_store.active_specs[daemon_spec.service_name])
        grafana_ini = self.mgr.template.render(
            'services/grafana/grafana.ini.j2', {
                'initial_admin_password': spec.initial_admin_password,
                'http_port': daemon_spec.ports[0] if daemon_spec.ports else self.DEFAULT_SERVICE_PORT,
                'http_addr': daemon_spec.ip if daemon_spec.ip else ''
            })

        if 'dashboard' in self.mgr.get('mgr_map')['modules'] and spec.initial_admin_password:
            # keep the dashboard module's grafana credentials in sync
            self.mgr.check_mon_command(
                {'prefix': 'dashboard set-grafana-api-password'}, inbuf=spec.initial_admin_password)

        config_file = {
            'files': {
                "grafana.ini": grafana_ini,
                'provisioning/datasources/ceph-dashboard.yml': grafana_data_sources,
                'certs/cert_file': '# generated by cephadm\n%s' % cert,
                'certs/cert_key': '# generated by cephadm\n%s' % pkey,
            }
        }
        return config_file, sorted(deps)

    def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription:
        # Use the least-created one as the active daemon
        # NOTE(review): this returns the LAST list entry; confirm the list
        # ordering actually matches "least-created".
        if daemon_descrs:
            return daemon_descrs[-1]
        # if empty list provided, return empty Daemon Desc
        return DaemonDescription()

    def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
        """Point the mgr dashboard module at the active grafana daemon's URL."""
        # TODO: signed cert
        dd = self.get_active_daemon(daemon_descrs)
        assert dd.hostname is not None
        addr = dd.ip if dd.ip else self._inventory_get_fqdn(dd.hostname)
        port = dd.ports[0] if dd.ports else self.DEFAULT_SERVICE_PORT
        service_url = build_url(scheme='https', host=addr, port=port)
        self._set_service_url_on_dashboard(
            'Grafana',
            'dashboard get-grafana-api-url',
            'dashboard set-grafana-api-url',
            service_url
        )

    def pre_remove(self, daemon: DaemonDescription) -> None:
        """
        Called before grafana daemon is removed.
        """
        if daemon.hostname is not None:
            # delete cert/key entires for this grafana daemon
            cert_path = f'{daemon.hostname}/grafana_crt'
            key_path = f'{daemon.hostname}/grafana_key'
            self.mgr.set_store(cert_path, None)
            self.mgr.set_store(key_path, None)

    def ok_to_stop(self,
                   daemon_ids: List[str],
                   force: bool = False,
                   known: Optional[List[str]] = None) -> HandleCommandResult:
        """Refuse (EBUSY) to stop the last grafana daemon unless forced."""
        warn, warn_message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, 'Grafana', 1)
        if warn and not force:
            return HandleCommandResult(-errno.EBUSY, '', warn_message)
        return HandleCommandResult(0, warn_message, '')
138
139
class AlertmanagerService(CephadmService):
    """cephadm service handler for Alertmanager daemons.

    Renders alertmanager.yml (dashboard webhook receivers, SNMP gateway
    targets) and the peer list for clustered alertmanagers, and
    registers the service URL with the mgr dashboard module.
    """

    TYPE = 'alertmanager'
    # alertmanager's default API port, used when the daemon spec has no port
    DEFAULT_SERVICE_PORT = 9093

    def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
        """Populate ``final_config``/``deps`` on the deploy spec and return it."""
        assert self.TYPE == daemon_spec.daemon_type
        daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
        return daemon_spec

    def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
        """Build alertmanager's config files, peer list and dependency list.

        Returns (payload with ``files`` and ``peers`` keys, sorted dep
        names). Deps include all mgr, snmp-gateway and alertmanager
        daemons so a change in any of them triggers a reconfig.
        """
        assert self.TYPE == daemon_spec.daemon_type
        deps: List[str] = []
        default_webhook_urls: List[str] = []

        spec = cast(AlertManagerSpec, self.mgr.spec_store[daemon_spec.service_name].spec)
        # older specs may predate the 'secure' attribute; default to False
        try:
            secure = spec.secure
        except AttributeError:
            secure = False
        user_data = spec.user_data
        if 'default_webhook_urls' in user_data and isinstance(
                user_data['default_webhook_urls'], list):
            default_webhook_urls.extend(user_data['default_webhook_urls'])

        # dashboard(s)
        dashboard_urls: List[str] = []
        snmp_gateway_urls: List[str] = []
        mgr_map = self.mgr.get('mgr_map')
        port = None
        proto = None  # http: or https:
        url = mgr_map.get('services', {}).get('dashboard', None)
        if url:
            p_result = urlparse(url.rstrip('/'))
            hostname = socket.getfqdn(p_result.hostname)

            # getfqdn may hand back a bare IPv6 address; re-bracket it so
            # the URL below stays parseable
            try:
                ip = ipaddress.ip_address(hostname)
            except ValueError:
                pass
            else:
                if ip.version == 6:
                    hostname = f'[{hostname}]'

            dashboard_urls.append(
                f'{p_result.scheme}://{hostname}:{p_result.port}{p_result.path}')
            proto = p_result.scheme
            port = p_result.port
        # scan all mgrs to generate deps and to get standbys too.
        # assume that they are all on the same port as the active mgr.
        for dd in self.mgr.cache.get_daemons_by_service('mgr'):
            # we consider mgr a dep even if the dashboard is disabled
            # in order to be consistent with _calc_daemon_deps().
            deps.append(dd.name())
            if not port:
                continue
            if dd.daemon_id == self.mgr.get_mgr_id():
                continue
            assert dd.hostname is not None
            addr = self._inventory_get_fqdn(dd.hostname)
            dashboard_urls.append(build_url(scheme=proto, host=addr, port=port).rstrip('/'))

        # every snmp-gateway daemon gets its /alerts endpoint registered
        for dd in self.mgr.cache.get_daemons_by_service('snmp-gateway'):
            assert dd.hostname is not None
            assert dd.ports
            addr = dd.ip if dd.ip else self._inventory_get_fqdn(dd.hostname)
            deps.append(dd.name())

            snmp_gateway_urls.append(build_url(scheme='http', host=addr,
                                     port=dd.ports[0], path='/alerts'))

        context = {
            'dashboard_urls': dashboard_urls,
            'default_webhook_urls': default_webhook_urls,
            'snmp_gateway_urls': snmp_gateway_urls,
            'secure': secure,
        }
        yml = self.mgr.template.render('services/alertmanager/alertmanager.yml.j2', context)

        # peer list for alertmanager clustering; port 9094 is presumably
        # the cluster/gossip port -- confirm against the container setup.
        # lstrip('/') turns the scheme-less '//host:port' into 'host:port'
        peers = []
        port = 9094
        for dd in self.mgr.cache.get_daemons_by_service('alertmanager'):
            assert dd.hostname is not None
            deps.append(dd.name())
            addr = self._inventory_get_fqdn(dd.hostname)
            peers.append(build_url(host=addr, port=port).lstrip('/'))

        return {
            "files": {
                "alertmanager.yml": yml
            },
            "peers": peers
        }, sorted(deps)

    def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription:
        # TODO: if there are multiple daemons, who is the active one?
        if daemon_descrs:
            return daemon_descrs[0]
        # if empty list provided, return empty Daemon Desc
        return DaemonDescription()

    def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
        """Point the mgr dashboard module at the active alertmanager's URL."""
        dd = self.get_active_daemon(daemon_descrs)
        assert dd.hostname is not None
        addr = dd.ip if dd.ip else self._inventory_get_fqdn(dd.hostname)
        port = dd.ports[0] if dd.ports else self.DEFAULT_SERVICE_PORT
        service_url = build_url(scheme='http', host=addr, port=port)
        self._set_service_url_on_dashboard(
            'AlertManager',
            'dashboard get-alertmanager-api-host',
            'dashboard set-alertmanager-api-host',
            service_url
        )

    def ok_to_stop(self,
                   daemon_ids: List[str],
                   force: bool = False,
                   known: Optional[List[str]] = None) -> HandleCommandResult:
        """Refuse (EBUSY) to stop the last alertmanager unless forced."""
        warn, warn_message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, 'Alertmanager', 1)
        if warn and not force:
            return HandleCommandResult(-errno.EBUSY, '', warn_message)
        return HandleCommandResult(0, warn_message, '')
261
262
class PrometheusService(CephadmService):
    """cephadm service handler for Prometheus daemons.

    Assembles prometheus.yml with scrape targets for mgrs, node
    exporters, alertmanagers and ingress haproxies, and attaches the
    built-in plus custom alerting rules.
    """

    TYPE = 'prometheus'
    # prometheus' default port, used when the daemon spec carries no port
    DEFAULT_SERVICE_PORT = 9095
    # default port of the mgr prometheus module's exporter endpoint
    DEFAULT_MGR_PROMETHEUS_PORT = 9283

    def config(self, spec: ServiceSpec) -> None:
        # make sure module is enabled
        mgr_map = self.mgr.get('mgr_map')
        if 'prometheus' not in mgr_map.get('services', {}):
            self.mgr.check_mon_command({
                'prefix': 'mgr module enable',
                'module': 'prometheus'
            })
            # we shouldn't get here (mon will tell the mgr to respawn), but no
            # harm done if we do.

    def prepare_create(
            self,
            daemon_spec: CephadmDaemonDeploySpec,
    ) -> CephadmDaemonDeploySpec:
        """Populate ``final_config``/``deps`` on the deploy spec and return it."""
        assert self.TYPE == daemon_spec.daemon_type
        daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
        return daemon_spec

    def generate_config(
            self,
            daemon_spec: CephadmDaemonDeploySpec,
    ) -> Tuple[Dict[str, Any], List[str]]:
        """Build prometheus' config files and dependency list.

        Returns (payload with a ``files`` dict, sorted dep names). The
        mgr module port is part of the deps so a port change triggers a
        redeploy.
        """
        assert self.TYPE == daemon_spec.daemon_type
        deps = []  # type: List[str]

        # scrape mgrs
        mgr_scrape_list = []
        mgr_map = self.mgr.get('mgr_map')
        port = cast(int, self.mgr.get_module_option_ex(
            'prometheus', 'server_port', self.DEFAULT_MGR_PROMETHEUS_PORT))
        deps.append(str(port))
        t = mgr_map.get('services', {}).get('prometheus', None)
        if t:
            p_result = urlparse(t)
            # urlparse .hostname removes '[]' from the hostname in case
            # of ipv6 addresses so if this is the case then we just
            # append the brackets when building the final scrape endpoint
            if '[' in p_result.netloc and ']' in p_result.netloc:
                mgr_scrape_list.append(f"[{p_result.hostname}]:{port}")
            else:
                mgr_scrape_list.append(f"{p_result.hostname}:{port}")
        # scan all mgrs to generate deps and to get standbys too.
        # assume that they are all on the same port as the active mgr.
        for dd in self.mgr.cache.get_daemons_by_service('mgr'):
            # we consider the mgr a dep even if the prometheus module is
            # disabled in order to be consistent with _calc_daemon_deps().
            deps.append(dd.name())
            if not port:
                continue
            if dd.daemon_id == self.mgr.get_mgr_id():
                continue
            assert dd.hostname is not None
            addr = self._inventory_get_fqdn(dd.hostname)
            # build_url without a scheme yields '//host:port'; lstrip('/')
            # leaves the bare 'host:port' target form prometheus expects
            mgr_scrape_list.append(build_url(host=addr, port=port).lstrip('/'))

        # scrape node exporters
        nodes = []
        for dd in self.mgr.cache.get_daemons_by_service('node-exporter'):
            assert dd.hostname is not None
            deps.append(dd.name())
            addr = dd.ip if dd.ip else self._inventory_get_fqdn(dd.hostname)
            port = dd.ports[0] if dd.ports else 9100
            nodes.append({
                'hostname': dd.hostname,
                'url': build_url(host=addr, port=port).lstrip('/')
            })

        # scrape alert managers
        alertmgr_targets = []
        for dd in self.mgr.cache.get_daemons_by_service('alertmanager'):
            assert dd.hostname is not None
            deps.append(dd.name())
            addr = dd.ip if dd.ip else self._inventory_get_fqdn(dd.hostname)
            port = dd.ports[0] if dd.ports else 9093
            alertmgr_targets.append("'{}'".format(build_url(host=addr, port=port).lstrip('/')))

        # scrape haproxies
        haproxy_targets = []
        for dd in self.mgr.cache.get_daemons_by_type('ingress'):
            if dd.service_name() in self.mgr.spec_store:
                spec = cast(IngressSpec, self.mgr.spec_store[dd.service_name()].spec)
                assert dd.hostname is not None
                deps.append(dd.name())
                # only the haproxy half of an ingress service exposes stats
                if dd.daemon_type == 'haproxy':
                    addr = self._inventory_get_fqdn(dd.hostname)
                    haproxy_targets.append({
                        "url": f"'{build_url(host=addr, port=spec.monitor_port).lstrip('/')}'",
                        "service": dd.service_name(),
                    })

        # generate the prometheus configuration
        context = {
            'alertmgr_targets': alertmgr_targets,
            'mgr_scrape_list': mgr_scrape_list,
            'haproxy_targets': haproxy_targets,
            'nodes': nodes,
        }
        r = {
            'files': {
                'prometheus.yml':
                    self.mgr.template.render(
                        'services/prometheus/prometheus.yml.j2', context)
            }
        }

        # include alerts, if present in the container
        if os.path.exists(self.mgr.prometheus_alerts_path):
            with open(self.mgr.prometheus_alerts_path, 'r', encoding='utf-8') as f:
                alerts = f.read()
            r['files']['/etc/prometheus/alerting/ceph_alerts.yml'] = alerts

        # Include custom alerts if present in key value store. This enables the
        # users to add custom alerts. Write the file in any case, so that if the
        # content of the key value store changed, that file is overwritten
        # (emptied in case the value has been removed from the key value
        # store). This prevents the necessity to adapt `cephadm` binary to
        # remove the file.
        #
        # Don't use the template engine for it as
        #
        # 1. the alerts are always static and
        # 2. they are a template themselves for the Go template engine, which
        #    uses curly braces and escaping that is cumbersome and unnecessary
        #    for the user.
        #
        r['files']['/etc/prometheus/alerting/custom_alerts.yml'] = \
            self.mgr.get_store('services/prometheus/alerting/custom_alerts.yml', '')

        return r, sorted(deps)

    def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription:
        # TODO: if there are multiple daemons, who is the active one?
        if daemon_descrs:
            return daemon_descrs[0]
        # if empty list provided, return empty Daemon Desc
        return DaemonDescription()

    def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
        """Point the mgr dashboard module at the active prometheus' URL."""
        dd = self.get_active_daemon(daemon_descrs)
        assert dd.hostname is not None
        addr = dd.ip if dd.ip else self._inventory_get_fqdn(dd.hostname)
        port = dd.ports[0] if dd.ports else self.DEFAULT_SERVICE_PORT
        service_url = build_url(scheme='http', host=addr, port=port)
        self._set_service_url_on_dashboard(
            'Prometheus',
            'dashboard get-prometheus-api-host',
            'dashboard set-prometheus-api-host',
            service_url
        )

    def ok_to_stop(self,
                   daemon_ids: List[str],
                   force: bool = False,
                   known: Optional[List[str]] = None) -> HandleCommandResult:
        """Refuse (EBUSY) to stop the last prometheus daemon unless forced."""
        warn, warn_message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, 'Prometheus', 1)
        if warn and not force:
            return HandleCommandResult(-errno.EBUSY, '', warn_message)
        return HandleCommandResult(0, warn_message, '')
427
428
class NodeExporterService(CephadmService):
    """cephadm service handler for node-exporter daemons.

    node-exporter is stateless and per-host, so it needs no generated
    configuration and is always safe to stop.
    """

    TYPE = 'node-exporter'

    def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
        """Attach the (empty) generated config and deps to the deploy spec."""
        assert self.TYPE == daemon_spec.daemon_type
        daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
        return daemon_spec

    def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
        """node-exporter requires no config files and has no dependencies."""
        assert self.TYPE == daemon_spec.daemon_type
        return {}, []

    def ok_to_stop(self,
                   daemon_ids: List[str],
                   force: bool = False,
                   known: Optional[List[str]] = None) -> HandleCommandResult:
        # since node exporter runs on each host and cannot compromise data, no extra checks required
        names = ['%s.%s' % (self.TYPE, d_id) for d_id in daemon_ids]
        return HandleCommandResult(0, f'It is presumed safe to stop {names}', '')
449
450
class LokiService(CephadmService):
    """cephadm service handler for the loki log-aggregation daemon."""

    TYPE = 'loki'
    DEFAULT_SERVICE_PORT = 3100

    def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
        """Attach the generated config and deps to the deploy spec."""
        assert self.TYPE == daemon_spec.daemon_type
        daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
        return daemon_spec

    def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
        """Render the static loki.yml template; loki has no dependencies."""
        assert self.TYPE == daemon_spec.daemon_type
        deps: List[str] = []

        rendered = self.mgr.template.render('services/loki.yml.j2')
        return {'files': {'loki.yml': rendered}}, sorted(deps)
470
471
class PromtailService(CephadmService):
    """cephadm service handler for promtail daemons, which ship logs to loki."""

    TYPE = 'promtail'
    DEFAULT_SERVICE_PORT = 9080

    def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
        """Attach the generated config and deps to the deploy spec."""
        assert self.TYPE == daemon_spec.daemon_type
        daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
        return daemon_spec

    def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
        """Point promtail at the first loki daemon.

        Every loki daemon becomes a dependency so promtail is
        reconfigured when the loki deployment changes.
        """
        assert self.TYPE == daemon_spec.daemon_type
        deps: List[str] = []

        client_host = ''
        for idx, loki_dd in enumerate(self.mgr.cache.get_daemons_by_service('loki')):
            assert loki_dd.hostname is not None
            if idx == 0:
                # target the first loki daemon; prefer its bound IP over the FQDN
                client_host = loki_dd.ip if loki_dd.ip else self._inventory_get_fqdn(loki_dd.hostname)

            deps.append(loki_dd.name())

        rendered = self.mgr.template.render('services/promtail.yml.j2',
                                            {'client_hostname': client_host})
        return {'files': {'promtail.yml': rendered}}, sorted(deps)
504
505
class SNMPGatewayService(CephadmService):
    """cephadm service handler for the snmp-gateway daemon.

    Translates the SNMPGatewaySpec (destination, protocol version and
    credentials) into the daemon's configuration payload.
    """

    TYPE = 'snmp-gateway'

    def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
        """Attach the generated config and deps to the deploy spec."""
        assert self.TYPE == daemon_spec.daemon_type
        daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
        return daemon_spec

    def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
        """Build the snmp-gateway config dict from the service spec.

        V2c specs must carry a community string in their credentials;
        V3 specs must carry auth username/password plus an engine id,
        and may add privacy settings (authPriv).
        """
        assert self.TYPE == daemon_spec.daemon_type
        deps: List[str] = []

        svc_spec = cast(SNMPGatewaySpec, self.mgr.spec_store[daemon_spec.service_name].spec)
        config: Dict[str, Any] = {
            "destination": svc_spec.snmp_destination,
            "snmp_version": svc_spec.snmp_version,
        }

        if svc_spec.snmp_version == 'V2c':
            community = svc_spec.credentials.get('snmp_community', None)
            assert community is not None
            config["snmp_community"] = community
        else:
            # SNMP v3 settings can be either authNoPriv or authPriv
            username = svc_spec.credentials.get('snmp_v3_auth_username', None)
            password = svc_spec.credentials.get('snmp_v3_auth_password', None)
            assert username is not None
            assert password is not None
            assert svc_spec.engine_id is not None

            config["snmp_v3_auth_protocol"] = svc_spec.auth_protocol or 'SHA'
            config["snmp_v3_auth_username"] = username
            config["snmp_v3_auth_password"] = password
            config["snmp_v3_engine_id"] = svc_spec.engine_id

            # authPriv adds encryption
            if svc_spec.privacy_protocol:
                priv_password = svc_spec.credentials.get('snmp_v3_priv_password', None)
                assert priv_password is not None
                config["snmp_v3_priv_protocol"] = svc_spec.privacy_protocol
                config["snmp_v3_priv_password"] = priv_password

        logger.debug(
            f"Generated configuration for '{self.TYPE}' service. Dependencies={deps}")

        return config, sorted(deps)