]> git.proxmox.com Git - ceph.git/blame - ceph/src/pybind/mgr/cephadm/services/monitoring.py
import quincy beta 17.1.0
[ceph.git] / ceph / src / pybind / mgr / cephadm / services / monitoring.py
CommitLineData
f67539c2 1import errno
e306af50
TL
2import logging
3import os
f67539c2 4from typing import List, Any, Tuple, Dict, Optional, cast
a4b75251 5from urllib.parse import urlparse
f67539c2
TL
6
7from mgr_module import HandleCommandResult
e306af50
TL
8
9from orchestrator import DaemonDescription
20effc67 10from ceph.deployment.service_spec import AlertManagerSpec, GrafanaSpec, ServiceSpec, SNMPGatewaySpec
f67539c2
TL
11from cephadm.services.cephadmservice import CephadmService, CephadmDaemonDeploySpec
12from cephadm.services.ingress import IngressSpec
522d829b 13from mgr_util import verify_tls, ServerConfigException, create_self_signed_cert, build_url
e306af50
TL
14
15logger = logging.getLogger(__name__)
16
f6b5b4d7 17
class GrafanaService(CephadmService):
    """Deploy and configure Grafana containers for the Ceph dashboard.

    Renders ``grafana.ini`` and the Prometheus datasource file from
    templates, manages the TLS certificate pair persisted in the mgr
    key/value store, and registers the Grafana URL with the dashboard
    mgr module.
    """
    TYPE = 'grafana'
    # Port Grafana listens on when the service spec does not pin one.
    DEFAULT_SERVICE_PORT = 3000

    def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
        """Populate the daemon spec's final config and deps before deployment."""
        assert self.TYPE == daemon_spec.daemon_type
        daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
        return daemon_spec

    def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
        """Build the Grafana config payload.

        Returns a tuple of (config-files dict, sorted names of the daemons
        this config depends on).  A change in the deps list is what triggers
        a reconfig of the grafana daemon.
        """
        assert self.TYPE == daemon_spec.daemon_type
        deps = []  # type: List[str]

        # Every running prometheus daemon becomes a Grafana datasource
        # (and a dependency, so Grafana is reconfigured when they move).
        prom_services = []  # type: List[str]
        for dd in self.mgr.cache.get_daemons_by_service('prometheus'):
            assert dd.hostname is not None
            addr = dd.ip if dd.ip else self._inventory_get_addr(dd.hostname)
            port = dd.ports[0] if dd.ports else 9095

            prom_services.append(build_url(scheme='http', host=addr, port=port))

            deps.append(dd.name())
        grafana_data_sources = self.mgr.template.render(
            'services/grafana/ceph-dashboard.yml.j2', {'hosts': prom_services})

        # Reuse the TLS pair persisted in the mgr store if it is still
        # valid; otherwise fall back to a fresh self-signed certificate.
        cert = self.mgr.get_store('grafana_crt')
        pkey = self.mgr.get_store('grafana_key')
        if cert and pkey:
            try:
                verify_tls(cert, pkey)
            except ServerConfigException as e:
                logger.warning('Provided grafana TLS certificates invalid: %s', str(e))
                cert, pkey = None, None
        if not (cert and pkey):
            cert, pkey = create_self_signed_cert('Ceph', 'cephadm')
            self.mgr.set_store('grafana_crt', cert)
            self.mgr.set_store('grafana_key', pkey)
            # A self-signed cert cannot be verified by the dashboard, so
            # disable SSL verification of the Grafana API whenever we had
            # to generate one (only if the dashboard module is enabled).
            if 'dashboard' in self.mgr.get('mgr_map')['modules']:
                self.mgr.check_mon_command({
                    'prefix': 'dashboard set-grafana-api-ssl-verify',
                    'value': 'false',
                })

        spec: GrafanaSpec = cast(
            GrafanaSpec, self.mgr.spec_store.active_specs[daemon_spec.service_name])
        grafana_ini = self.mgr.template.render(
            'services/grafana/grafana.ini.j2', {
                'initial_admin_password': spec.initial_admin_password,
                'http_port': daemon_spec.ports[0] if daemon_spec.ports else self.DEFAULT_SERVICE_PORT,
                'http_addr': daemon_spec.ip if daemon_spec.ip else ''
            })

        config_file = {
            'files': {
                "grafana.ini": grafana_ini,
                'provisioning/datasources/ceph-dashboard.yml': grafana_data_sources,
                'certs/cert_file': '# generated by cephadm\n%s' % cert,
                'certs/cert_key': '# generated by cephadm\n%s' % pkey,
            }
        }
        return config_file, sorted(deps)

    def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription:
        """Pick the daemon the dashboard should talk to."""
        # Use the least-created one as the active daemon
        if daemon_descrs:
            return daemon_descrs[-1]
        # if empty list provided, return empty Daemon Desc
        return DaemonDescription()

    def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
        """Point the dashboard mgr module at the active Grafana instance."""
        # TODO: signed cert
        dd = self.get_active_daemon(daemon_descrs)
        assert dd.hostname is not None
        addr = dd.ip if dd.ip else self._inventory_get_addr(dd.hostname)
        port = dd.ports[0] if dd.ports else self.DEFAULT_SERVICE_PORT
        # Grafana is always served over https (see cert handling above).
        service_url = build_url(scheme='https', host=addr, port=port)
        self._set_service_url_on_dashboard(
            'Grafana',
            'dashboard get-grafana-api-url',
            'dashboard set-grafana-api-url',
            service_url
        )

    def ok_to_stop(self,
                   daemon_ids: List[str],
                   force: bool = False,
                   known: Optional[List[str]] = None) -> HandleCommandResult:
        """Refuse (EBUSY) to stop the last Grafana daemon unless forced."""
        warn, warn_message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, 'Grafana', 1)
        if warn and not force:
            return HandleCommandResult(-errno.EBUSY, '', warn_message)
        return HandleCommandResult(0, warn_message, '')
108
f6b5b4d7 109
class AlertmanagerService(CephadmService):
    """Deploys and configures Alertmanager for the Ceph monitoring stack."""

    TYPE = 'alertmanager'
    DEFAULT_SERVICE_PORT = 9093

    def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
        """Attach the rendered config and dependency list to the daemon spec."""
        assert self.TYPE == daemon_spec.daemon_type
        daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
        return daemon_spec

    def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
        """Render alertmanager.yml plus the peer list for HA clustering.

        Returns (config payload, sorted dependency daemon names).
        """
        assert self.TYPE == daemon_spec.daemon_type
        deps: List[str] = []

        # Optional extra webhook receivers supplied via the service spec.
        svc_spec = cast(AlertManagerSpec, self.mgr.spec_store[daemon_spec.service_name].spec)
        user_data = svc_spec.user_data
        default_webhook_urls: List[str] = []
        if isinstance(user_data.get('default_webhook_urls'), list):
            default_webhook_urls.extend(user_data['default_webhook_urls'])

        # dashboard(s): the active mgr's URL comes from the mgr map; standby
        # mgrs are assumed to serve on the same scheme/port.
        dashboard_urls: List[str] = []
        snmp_gateway_urls: List[str] = []
        mgr_map = self.mgr.get('mgr_map')
        proto = None  # http: or https:
        port = None
        url = mgr_map.get('services', {}).get('dashboard', None)
        if url:
            dashboard_urls.append(url)
            parsed = urlparse(url)
            proto, port = parsed.scheme, parsed.port
        for daemon in self.mgr.cache.get_daemons_by_service('mgr'):
            # mgr counts as a dep even with the dashboard disabled, to stay
            # consistent with _calc_daemon_deps().
            deps.append(daemon.name())
            if not port or daemon.daemon_id == self.mgr.get_mgr_id():
                continue
            assert daemon.hostname is not None
            mgr_addr = self.mgr.inventory.get_addr(daemon.hostname)
            dashboard_urls.append(build_url(scheme=proto, host=mgr_addr, port=port))

        for daemon in self.mgr.cache.get_daemons_by_service('snmp-gateway'):
            assert daemon.hostname is not None
            assert daemon.ports
            gw_addr = daemon.ip if daemon.ip else self._inventory_get_addr(daemon.hostname)
            deps.append(daemon.name())

            snmp_gateway_urls.append(
                build_url(scheme='http', host=gw_addr,
                          port=daemon.ports[0], path='/alerts'))

        yml = self.mgr.template.render(
            'services/alertmanager/alertmanager.yml.j2',
            {
                'dashboard_urls': dashboard_urls,
                'default_webhook_urls': default_webhook_urls,
                'snmp_gateway_urls': snmp_gateway_urls,
            })

        # Peer endpoints for alertmanager gossip clustering (mesh port 9094).
        peers = []
        for daemon in self.mgr.cache.get_daemons_by_service('alertmanager'):
            assert daemon.hostname is not None
            deps.append(daemon.name())
            peer_addr = self.mgr.inventory.get_addr(daemon.hostname)
            peers.append(build_url(host=peer_addr, port=9094).lstrip('/'))

        return {
            "files": {
                "alertmanager.yml": yml
            },
            "peers": peers
        }, sorted(deps)

    def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription:
        """Pick the daemon the dashboard should talk to (first one, if any)."""
        # TODO: if there are multiple daemons, who is the active one?
        return daemon_descrs[0] if daemon_descrs else DaemonDescription()

    def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
        """Register the active Alertmanager endpoint with the dashboard module."""
        active = self.get_active_daemon(daemon_descrs)
        assert active.hostname is not None
        host = active.ip if active.ip else self._inventory_get_addr(active.hostname)
        api_port = active.ports[0] if active.ports else self.DEFAULT_SERVICE_PORT
        self._set_service_url_on_dashboard(
            'AlertManager',
            'dashboard get-alertmanager-api-host',
            'dashboard set-alertmanager-api-host',
            build_url(scheme='http', host=host, port=api_port)
        )

    def ok_to_stop(self,
                   daemon_ids: List[str],
                   force: bool = False,
                   known: Optional[List[str]] = None) -> HandleCommandResult:
        """Refuse (EBUSY) to stop the last Alertmanager unless forced."""
        warn, warn_message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, 'Alertmanager', 1)
        if warn and not force:
            return HandleCommandResult(-errno.EBUSY, '', warn_message)
        return HandleCommandResult(0, warn_message, '')
215
e306af50
TL
216
class PrometheusService(CephadmService):
    """Deploy Prometheus and generate its scrape configuration.

    Scrape targets are derived from the daemons cephadm knows about:
    mgr prometheus-module endpoints, node-exporters, alertmanagers and
    haproxy (ingress) instances.
    """
    TYPE = 'prometheus'
    # Port Prometheus listens on when the service spec does not pin one.
    DEFAULT_SERVICE_PORT = 9095

    def config(self, spec: ServiceSpec) -> None:
        """Ensure the mgr 'prometheus' module is enabled before deploying."""
        # make sure module is enabled
        mgr_map = self.mgr.get('mgr_map')
        if 'prometheus' not in mgr_map.get('services', {}):
            self.mgr.check_mon_command({
                'prefix': 'mgr module enable',
                'module': 'prometheus'
            })
            # we shouldn't get here (mon will tell the mgr to respawn), but no
            # harm done if we do.

    def prepare_create(
            self,
            daemon_spec: CephadmDaemonDeploySpec,
    ) -> CephadmDaemonDeploySpec:
        """Populate the daemon spec's final config and deps before deployment."""
        assert self.TYPE == daemon_spec.daemon_type
        daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
        return daemon_spec

    def generate_config(
            self,
            daemon_spec: CephadmDaemonDeploySpec,
    ) -> Tuple[Dict[str, Any], List[str]]:
        """Render prometheus.yml (plus alert rules, if shipped) and deps.

        Returns (config-files dict, sorted names of daemons whose change
        requires a prometheus reconfig).
        """
        assert self.TYPE == daemon_spec.daemon_type
        deps = []  # type: List[str]

        # scrape mgrs
        mgr_scrape_list = []
        mgr_map = self.mgr.get('mgr_map')
        port = None
        t = mgr_map.get('services', {}).get('prometheus', None)
        if t:
            p_result = urlparse(t)
            # p_result.netloc is the host[:port] component of the URL; the
            # previous manual t.split('/')[2] computed the same thing but
            # breaks on malformed URLs with fewer than three '/' segments.
            mgr_scrape_list.append(p_result.netloc)
            port = p_result.port or 9283  # prometheus-module default port
        # scan all mgrs to generate deps and to get standbys too.
        # assume that they are all on the same port as the active mgr.
        for dd in self.mgr.cache.get_daemons_by_service('mgr'):
            # we consider the mgr a dep even if the prometheus module is
            # disabled in order to be consistent with _calc_daemon_deps().
            deps.append(dd.name())
            if not port:
                continue
            if dd.daemon_id == self.mgr.get_mgr_id():
                continue
            assert dd.hostname is not None
            addr = self.mgr.inventory.get_addr(dd.hostname)
            mgr_scrape_list.append(build_url(host=addr, port=port).lstrip('/'))

        # scrape node exporters
        nodes = []
        for dd in self.mgr.cache.get_daemons_by_service('node-exporter'):
            assert dd.hostname is not None
            deps.append(dd.name())
            addr = dd.ip if dd.ip else self.mgr.inventory.get_addr(dd.hostname)
            port = dd.ports[0] if dd.ports else 9100
            nodes.append({
                'hostname': dd.hostname,
                'url': build_url(host=addr, port=port).lstrip('/')
            })

        # scrape alert managers
        alertmgr_targets = []
        for dd in self.mgr.cache.get_daemons_by_service('alertmanager'):
            assert dd.hostname is not None
            deps.append(dd.name())
            addr = dd.ip if dd.ip else self.mgr.inventory.get_addr(dd.hostname)
            port = dd.ports[0] if dd.ports else 9093
            alertmgr_targets.append("'{}'".format(build_url(host=addr, port=port).lstrip('/')))

        # scrape haproxies
        haproxy_targets = []
        for dd in self.mgr.cache.get_daemons_by_type('ingress'):
            if dd.service_name() in self.mgr.spec_store:
                spec = cast(IngressSpec, self.mgr.spec_store[dd.service_name()].spec)
                assert dd.hostname is not None
                deps.append(dd.name())
                if dd.daemon_type == 'haproxy':
                    addr = self.mgr.inventory.get_addr(dd.hostname)
                    haproxy_targets.append({
                        "url": f"'{build_url(host=addr, port=spec.monitor_port).lstrip('/')}'",
                        "service": dd.service_name(),
                    })

        # generate the prometheus configuration
        context = {
            'alertmgr_targets': alertmgr_targets,
            'mgr_scrape_list': mgr_scrape_list,
            'haproxy_targets': haproxy_targets,
            'nodes': nodes,
        }
        r = {
            'files': {
                'prometheus.yml':
                    self.mgr.template.render(
                        'services/prometheus/prometheus.yml.j2', context)
            }
        }

        # include alerts, if present in the container
        if os.path.exists(self.mgr.prometheus_alerts_path):
            with open(self.mgr.prometheus_alerts_path, 'r', encoding='utf-8') as f:
                alerts = f.read()
            r['files']['/etc/prometheus/alerting/ceph_alerts.yml'] = alerts

        return r, sorted(deps)

    def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription:
        """Pick the daemon the dashboard should talk to."""
        # TODO: if there are multiple daemons, who is the active one?
        if daemon_descrs:
            return daemon_descrs[0]
        # if empty list provided, return empty Daemon Desc
        return DaemonDescription()

    def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
        """Register the active Prometheus endpoint with the dashboard module."""
        dd = self.get_active_daemon(daemon_descrs)
        assert dd.hostname is not None
        addr = dd.ip if dd.ip else self._inventory_get_addr(dd.hostname)
        port = dd.ports[0] if dd.ports else self.DEFAULT_SERVICE_PORT
        service_url = build_url(scheme='http', host=addr, port=port)
        self._set_service_url_on_dashboard(
            'Prometheus',
            'dashboard get-prometheus-api-host',
            'dashboard set-prometheus-api-host',
            service_url
        )

    def ok_to_stop(self,
                   daemon_ids: List[str],
                   force: bool = False,
                   known: Optional[List[str]] = None) -> HandleCommandResult:
        """Refuse (EBUSY) to stop the last Prometheus daemon unless forced."""
        warn, warn_message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, 'Prometheus', 1)
        if warn and not force:
            return HandleCommandResult(-errno.EBUSY, '', warn_message)
        return HandleCommandResult(0, warn_message, '')
357
f6b5b4d7 358
class NodeExporterService(CephadmService):
    """Deploys node-exporter; the daemon is self-contained and needs no config."""

    TYPE = 'node-exporter'

    def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
        """Attach the (empty) config and dependency list to the daemon spec."""
        assert self.TYPE == daemon_spec.daemon_type
        daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
        return daemon_spec

    def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
        """node-exporter requires no config files and depends on nothing."""
        assert self.TYPE == daemon_spec.daemon_type
        return {}, []

    def ok_to_stop(self,
                   daemon_ids: List[str],
                   force: bool = False,
                   known: Optional[List[str]] = None) -> HandleCommandResult:
        # since node exporter runs on each host and cannot compromise data, no extra checks required
        names = ['{}.{}'.format(self.TYPE, daemon_id) for daemon_id in daemon_ids]
        return HandleCommandResult(0, f'It is presumed safe to stop {names}', '')
20effc67
TL
379
380
class SNMPGatewayService(CephadmService):
    """Deploys the SNMP gateway, which forwards Prometheus alerts as SNMP traps."""

    TYPE = 'snmp-gateway'

    def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
        """Attach the generated config and dependency list to the daemon spec."""
        assert self.TYPE == daemon_spec.daemon_type
        daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
        return daemon_spec

    def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
        """Translate the service's SNMPGatewaySpec into the gateway config dict.

        V2c specs need only a community string; V3 specs need auth settings
        and, when a privacy protocol is configured, encryption settings too.
        Required credential fields are asserted present -- spec validation
        happens upstream.
        """
        assert self.TYPE == daemon_spec.daemon_type
        deps: List[str] = []

        spec = cast(SNMPGatewaySpec, self.mgr.spec_store[daemon_spec.service_name].spec)
        config = {
            "destination": spec.snmp_destination,
            "snmp_version": spec.snmp_version,
        }
        if spec.snmp_version == 'V2c':
            community = spec.credentials.get('snmp_community', None)
            assert community is not None
            config["snmp_community"] = community
        else:
            # SNMP v3 settings can be either authNoPriv or authPriv
            auth_username = spec.credentials.get('snmp_v3_auth_username', None)
            auth_password = spec.credentials.get('snmp_v3_auth_password', None)
            assert auth_username is not None
            assert auth_password is not None
            assert spec.engine_id is not None

            config.update({
                "snmp_v3_auth_protocol": spec.auth_protocol if spec.auth_protocol else 'SHA',
                "snmp_v3_auth_username": auth_username,
                "snmp_v3_auth_password": auth_password,
                "snmp_v3_engine_id": spec.engine_id,
            })
            # authPriv adds encryption
            if spec.privacy_protocol:
                priv_password = spec.credentials.get('snmp_v3_priv_password', None)
                assert priv_password is not None
                config["snmp_v3_priv_protocol"] = spec.privacy_protocol
                config["snmp_v3_priv_password"] = priv_password

        logger.debug(
            f"Generated configuration for '{self.TYPE}' service. Dependencies={deps}")

        return config, sorted(deps)