]> git.proxmox.com Git - ceph.git/blame_incremental - ceph/src/pybind/mgr/cephadm/services/monitoring.py
import quincy beta 17.1.0
[ceph.git] / ceph / src / pybind / mgr / cephadm / services / monitoring.py
... / ...
CommitLineData
1import errno
2import logging
3import os
4from typing import List, Any, Tuple, Dict, Optional, cast
5from urllib.parse import urlparse
6
7from mgr_module import HandleCommandResult
8
9from orchestrator import DaemonDescription
10from ceph.deployment.service_spec import AlertManagerSpec, GrafanaSpec, ServiceSpec, SNMPGatewaySpec
11from cephadm.services.cephadmservice import CephadmService, CephadmDaemonDeploySpec
12from cephadm.services.ingress import IngressSpec
13from mgr_util import verify_tls, ServerConfigException, create_self_signed_cert, build_url
14
15logger = logging.getLogger(__name__)
16
17
class GrafanaService(CephadmService):
    """Service handler for ``grafana`` daemons deployed by cephadm."""

    TYPE = 'grafana'
    DEFAULT_SERVICE_PORT = 3000

    def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
        """Store the generated config and dependency list on ``daemon_spec`` and return it."""
        assert self.TYPE == daemon_spec.daemon_type
        config, deps = self.generate_config(daemon_spec)
        daemon_spec.final_config = config
        daemon_spec.deps = deps
        return daemon_spec

    def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
        """Render the files that are deployed together with a grafana daemon.

        :param daemon_spec: deploy spec of the grafana daemon being configured
        :return: ``(config, deps)``. ``config['files']`` maps file names to the
            rendered ``grafana.ini``, the data source provisioning file and
            the TLS certificate/key. ``deps`` is the sorted list of names of
            all prometheus daemons.
        """
        assert self.TYPE == daemon_spec.daemon_type
        deps: List[str] = []

        # Every prometheus daemon becomes a data source URL and a dependency.
        prom_urls: List[str] = []
        for prom in self.mgr.cache.get_daemons_by_service('prometheus'):
            assert prom.hostname is not None
            prom_host = prom.ip or self._inventory_get_addr(prom.hostname)
            prom_port = prom.ports[0] if prom.ports else 9095
            prom_urls.append(build_url(scheme='http', host=prom_host, port=prom_port))
            deps.append(prom.name())

        grafana_data_sources = self.mgr.template.render(
            'services/grafana/ceph-dashboard.yml.j2', {'hosts': prom_urls})

        # Use the stored certificate/key pair when it validates; otherwise
        # fall through to generating a self-signed pair below.
        cert = self.mgr.get_store('grafana_crt')
        pkey = self.mgr.get_store('grafana_key')
        if cert and pkey:
            try:
                verify_tls(cert, pkey)
            except ServerConfigException as e:
                logger.warning('Provided grafana TLS certificates invalid: %s', str(e))
                cert = pkey = None

        if not cert or not pkey:
            cert, pkey = create_self_signed_cert('Ceph', 'cephadm')
            self.mgr.set_store('grafana_crt', cert)
            self.mgr.set_store('grafana_key', pkey)
            # A freshly generated certificate is self-signed, so the dashboard
            # (when that module is listed in the mgr map) is told not to
            # verify it.
            if 'dashboard' in self.mgr.get('mgr_map')['modules']:
                self.mgr.check_mon_command({
                    'prefix': 'dashboard set-grafana-api-ssl-verify',
                    'value': 'false',
                })

        spec: GrafanaSpec = cast(
            GrafanaSpec, self.mgr.spec_store.active_specs[daemon_spec.service_name])
        ini_context = {
            'initial_admin_password': spec.initial_admin_password,
            'http_port': daemon_spec.ports[0] if daemon_spec.ports else self.DEFAULT_SERVICE_PORT,
            'http_addr': daemon_spec.ip if daemon_spec.ip else '',
        }
        grafana_ini = self.mgr.template.render('services/grafana/grafana.ini.j2', ini_context)

        files = {
            'grafana.ini': grafana_ini,
            'provisioning/datasources/ceph-dashboard.yml': grafana_data_sources,
            'certs/cert_file': '# generated by cephadm\n%s' % cert,
            'certs/cert_key': '# generated by cephadm\n%s' % pkey,
        }
        return {'files': files}, sorted(deps)

    def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription:
        """Return the last entry of ``daemon_descrs``, or an empty description if the list is empty."""
        if not daemon_descrs:
            # if empty list provided, return empty Daemon Desc
            return DaemonDescription()
        return daemon_descrs[-1]

    def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
        """Pass the https URL of the active grafana daemon to the dashboard URL helper."""
        # TODO: signed cert
        active = self.get_active_daemon(daemon_descrs)
        assert active.hostname is not None
        host = active.ip or self._inventory_get_addr(active.hostname)
        port = active.ports[0] if active.ports else self.DEFAULT_SERVICE_PORT
        self._set_service_url_on_dashboard(
            'Grafana',
            'dashboard get-grafana-api-url',
            'dashboard set-grafana-api-url',
            build_url(scheme='https', host=host, port=port),
        )

    def ok_to_stop(self,
                   daemon_ids: List[str],
                   force: bool = False,
                   known: Optional[List[str]] = None) -> HandleCommandResult:
        """Report whether the given grafana daemons may be stopped.

        Returns ``-EBUSY`` with the warning text when
        ``_enough_daemons_to_stop`` raises a warning and ``force`` is not set;
        otherwise returns success, carrying the message as output.
        """
        warn, warn_message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, 'Grafana', 1)
        if force or not warn:
            return HandleCommandResult(0, warn_message, '')
        return HandleCommandResult(-errno.EBUSY, '', warn_message)
108
109
class AlertmanagerService(CephadmService):
    """Service handler for ``alertmanager`` daemons deployed by cephadm."""

    TYPE = 'alertmanager'
    DEFAULT_SERVICE_PORT = 9093

    def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
        """Store the generated config and dependency list on ``daemon_spec`` and return it."""
        assert self.TYPE == daemon_spec.daemon_type
        config, deps = self.generate_config(daemon_spec)
        daemon_spec.final_config = config
        daemon_spec.deps = deps
        return daemon_spec

    def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
        """Render ``alertmanager.yml`` and collect the peer list.

        :param daemon_spec: deploy spec of the alertmanager daemon being configured
        :return: ``(config, deps)``. ``config`` holds the rendered
            ``alertmanager.yml`` under ``files`` and the peer addresses under
            ``peers``. ``deps`` is the sorted list of names of all mgr,
            snmp-gateway and alertmanager daemons.
        """
        assert self.TYPE == daemon_spec.daemon_type
        deps: List[str] = []
        default_webhook_urls: List[str] = []

        spec = cast(AlertManagerSpec, self.mgr.spec_store[daemon_spec.service_name].spec)
        # Only a list-valued 'default_webhook_urls' entry is honoured.
        extra_webhooks = spec.user_data.get('default_webhook_urls')
        if isinstance(extra_webhooks, list):
            default_webhook_urls.extend(extra_webhooks)

        # dashboard(s)
        dashboard_urls: List[str] = []
        snmp_gateway_urls: List[str] = []
        mgr_map = self.mgr.get('mgr_map')
        dashboard_port = None
        dashboard_scheme = None  # http: or https:
        active_url = mgr_map.get('services', {}).get('dashboard', None)
        if active_url:
            dashboard_urls.append(active_url)
            parsed = urlparse(active_url)
            dashboard_scheme = parsed.scheme
            dashboard_port = parsed.port

        # scan all mgrs to generate deps and to get standbys too.
        # assume that they are all on the same port as the active mgr.
        for mgr_dd in self.mgr.cache.get_daemons_by_service('mgr'):
            # we consider mgr a dep even if the dashboard is disabled
            # in order to be consistent with _calc_daemon_deps().
            deps.append(mgr_dd.name())
            # Without a known port no standby URL can be built; the active
            # mgr is skipped because its URL was already added above.
            if not dashboard_port or mgr_dd.daemon_id == self.mgr.get_mgr_id():
                continue
            assert mgr_dd.hostname is not None
            mgr_host = self.mgr.inventory.get_addr(mgr_dd.hostname)
            dashboard_urls.append(
                build_url(scheme=dashboard_scheme, host=mgr_host, port=dashboard_port))

        for snmp_dd in self.mgr.cache.get_daemons_by_service('snmp-gateway'):
            assert snmp_dd.hostname is not None
            assert snmp_dd.ports
            snmp_host = snmp_dd.ip or self._inventory_get_addr(snmp_dd.hostname)
            deps.append(snmp_dd.name())
            snmp_gateway_urls.append(
                build_url(scheme='http', host=snmp_host, port=snmp_dd.ports[0], path='/alerts'))

        yml = self.mgr.template.render('services/alertmanager/alertmanager.yml.j2', {
            'dashboard_urls': dashboard_urls,
            'default_webhook_urls': default_webhook_urls,
            'snmp_gateway_urls': snmp_gateway_urls,
        })

        # Every alertmanager daemon is listed as a peer on port 9094; the
        # scheme-less URL is reduced to "host:port" by stripping the leading
        # slashes.
        peer_port = 9094
        peers = []
        for am_dd in self.mgr.cache.get_daemons_by_service('alertmanager'):
            assert am_dd.hostname is not None
            deps.append(am_dd.name())
            peer_host = self.mgr.inventory.get_addr(am_dd.hostname)
            peers.append(build_url(host=peer_host, port=peer_port).lstrip('/'))

        config = {
            'files': {'alertmanager.yml': yml},
            'peers': peers,
        }
        return config, sorted(deps)

    def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription:
        """Return the first entry of ``daemon_descrs``, or an empty description if the list is empty."""
        # TODO: if there are multiple daemons, who is the active one?
        if not daemon_descrs:
            # if empty list provided, return empty Daemon Desc
            return DaemonDescription()
        return daemon_descrs[0]

    def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
        """Pass the http URL of the active alertmanager daemon to the dashboard URL helper."""
        active = self.get_active_daemon(daemon_descrs)
        assert active.hostname is not None
        host = active.ip or self._inventory_get_addr(active.hostname)
        port = active.ports[0] if active.ports else self.DEFAULT_SERVICE_PORT
        self._set_service_url_on_dashboard(
            'AlertManager',
            'dashboard get-alertmanager-api-host',
            'dashboard set-alertmanager-api-host',
            build_url(scheme='http', host=host, port=port),
        )

    def ok_to_stop(self,
                   daemon_ids: List[str],
                   force: bool = False,
                   known: Optional[List[str]] = None) -> HandleCommandResult:
        """Report whether the given alertmanager daemons may be stopped.

        Returns ``-EBUSY`` with the warning text when
        ``_enough_daemons_to_stop`` raises a warning and ``force`` is not set;
        otherwise returns success, carrying the message as output.
        """
        warn, warn_message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, 'Alertmanager', 1)
        if force or not warn:
            return HandleCommandResult(0, warn_message, '')
        return HandleCommandResult(-errno.EBUSY, '', warn_message)
215
216
class PrometheusService(CephadmService):
    """Service handler for ``prometheus`` daemons deployed by cephadm."""

    TYPE = 'prometheus'
    DEFAULT_SERVICE_PORT = 9095

    def config(self, spec: ServiceSpec) -> None:
        """Enable the mgr ``prometheus`` module if the mgr map lists no such service."""
        # make sure module is enabled
        mgr_map = self.mgr.get('mgr_map')
        if 'prometheus' not in mgr_map.get('services', {}):
            self.mgr.check_mon_command({
                'prefix': 'mgr module enable',
                'module': 'prometheus'
            })
            # we shouldn't get here (mon will tell the mgr to respawn), but no
            # harm done if we do.

    def prepare_create(
            self,
            daemon_spec: CephadmDaemonDeploySpec,
    ) -> CephadmDaemonDeploySpec:
        """Store the generated config and dependency list on ``daemon_spec`` and return it."""
        assert self.TYPE == daemon_spec.daemon_type
        daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
        return daemon_spec

    def generate_config(
            self,
            daemon_spec: CephadmDaemonDeploySpec,
    ) -> Tuple[Dict[str, Any], List[str]]:
        """Render ``prometheus.yml`` with all scrape and alerting targets.

        :param daemon_spec: deploy spec of the prometheus daemon being configured
        :return: ``(config, deps)``. ``config['files']`` holds the rendered
            ``prometheus.yml`` and, when the alerts file exists on disk, its
            contents under ``/etc/prometheus/alerting/ceph_alerts.yml``.
            ``deps`` is the sorted list of names of all mgr, node-exporter,
            alertmanager and (spec-backed) ingress daemons.
        """
        assert self.TYPE == daemon_spec.daemon_type
        deps: List[str] = []

        # scrape mgrs
        mgr_scrape_list: List[str] = []
        mgr_map = self.mgr.get('mgr_map')
        mgr_port = None
        active_url = mgr_map.get('services', {}).get('prometheus', None)
        if active_url:
            parsed = urlparse(active_url)
            mgr_port = parsed.port or 9283
            active_target = parsed.netloc
            if parsed.port is None:
                # The published URL carries no port.  Standby targets below
                # fall back to the default port, so the active target must do
                # the same instead of being emitted without any port.
                active_target = '{}:{}'.format(active_target.rstrip(':'), mgr_port)
            mgr_scrape_list.append(active_target)

        # scan all mgrs to generate deps and to get standbys too.
        # assume that they are all on the same port as the active mgr.
        for mgr_dd in self.mgr.cache.get_daemons_by_service('mgr'):
            # we consider the mgr a dep even if the prometheus module is
            # disabled in order to be consistent with _calc_daemon_deps().
            deps.append(mgr_dd.name())
            if not mgr_port:
                continue
            if mgr_dd.daemon_id == self.mgr.get_mgr_id():
                # the active mgr was already added from its published URL
                continue
            assert mgr_dd.hostname is not None
            mgr_host = self.mgr.inventory.get_addr(mgr_dd.hostname)
            mgr_scrape_list.append(build_url(host=mgr_host, port=mgr_port).lstrip('/'))

        # scrape node exporters
        nodes = []
        for ne_dd in self.mgr.cache.get_daemons_by_service('node-exporter'):
            assert ne_dd.hostname is not None
            deps.append(ne_dd.name())
            ne_host = ne_dd.ip if ne_dd.ip else self.mgr.inventory.get_addr(ne_dd.hostname)
            ne_port = ne_dd.ports[0] if ne_dd.ports else 9100
            nodes.append({
                'hostname': ne_dd.hostname,
                'url': build_url(host=ne_host, port=ne_port).lstrip('/')
            })

        # scrape alert managers
        alertmgr_targets = []
        for am_dd in self.mgr.cache.get_daemons_by_service('alertmanager'):
            assert am_dd.hostname is not None
            deps.append(am_dd.name())
            am_host = am_dd.ip if am_dd.ip else self.mgr.inventory.get_addr(am_dd.hostname)
            am_port = am_dd.ports[0] if am_dd.ports else 9093
            alertmgr_targets.append(
                "'{}'".format(build_url(host=am_host, port=am_port).lstrip('/')))

        # scrape haproxies
        haproxy_targets = []
        for ing_dd in self.mgr.cache.get_daemons_by_type('ingress'):
            # daemons whose service has no stored spec are skipped entirely:
            # the monitor port can only be read from the spec
            if ing_dd.service_name() in self.mgr.spec_store:
                spec = cast(IngressSpec, self.mgr.spec_store[ing_dd.service_name()].spec)
                assert ing_dd.hostname is not None
                deps.append(ing_dd.name())
                if ing_dd.daemon_type == 'haproxy':
                    addr = self.mgr.inventory.get_addr(ing_dd.hostname)
                    haproxy_targets.append({
                        "url": f"'{build_url(host=addr, port=spec.monitor_port).lstrip('/')}'",
                        "service": ing_dd.service_name(),
                    })

        # generate the prometheus configuration
        context = {
            'alertmgr_targets': alertmgr_targets,
            'mgr_scrape_list': mgr_scrape_list,
            'haproxy_targets': haproxy_targets,
            'nodes': nodes,
        }
        r = {
            'files': {
                'prometheus.yml':
                    self.mgr.template.render(
                        'services/prometheus/prometheus.yml.j2', context)
            }
        }

        # include alerts, if present in the container
        if os.path.exists(self.mgr.prometheus_alerts_path):
            with open(self.mgr.prometheus_alerts_path, 'r', encoding='utf-8') as f:
                alerts = f.read()
            r['files']['/etc/prometheus/alerting/ceph_alerts.yml'] = alerts

        return r, sorted(deps)

    def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription:
        """Return the first entry of ``daemon_descrs``, or an empty description if the list is empty."""
        # TODO: if there are multiple daemons, who is the active one?
        if daemon_descrs:
            return daemon_descrs[0]
        # if empty list provided, return empty Daemon Desc
        return DaemonDescription()

    def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
        """Pass the http URL of the active prometheus daemon to the dashboard URL helper."""
        dd = self.get_active_daemon(daemon_descrs)
        assert dd.hostname is not None
        addr = dd.ip if dd.ip else self._inventory_get_addr(dd.hostname)
        port = dd.ports[0] if dd.ports else self.DEFAULT_SERVICE_PORT
        service_url = build_url(scheme='http', host=addr, port=port)
        self._set_service_url_on_dashboard(
            'Prometheus',
            'dashboard get-prometheus-api-host',
            'dashboard set-prometheus-api-host',
            service_url
        )

    def ok_to_stop(self,
                   daemon_ids: List[str],
                   force: bool = False,
                   known: Optional[List[str]] = None) -> HandleCommandResult:
        """Report whether the given prometheus daemons may be stopped.

        Returns ``-EBUSY`` with the warning text when
        ``_enough_daemons_to_stop`` raises a warning and ``force`` is not set;
        otherwise returns success, carrying the message as output.
        """
        warn, warn_message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, 'Prometheus', 1)
        if warn and not force:
            return HandleCommandResult(-errno.EBUSY, '', warn_message)
        return HandleCommandResult(0, warn_message, '')
357
358
class NodeExporterService(CephadmService):
    """Service handler for ``node-exporter`` daemons deployed by cephadm."""

    TYPE = 'node-exporter'

    def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
        """Store the generated config and dependency list on ``daemon_spec`` and return it."""
        assert self.TYPE == daemon_spec.daemon_type
        config, deps = self.generate_config(daemon_spec)
        daemon_spec.final_config = config
        daemon_spec.deps = deps
        return daemon_spec

    def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
        """Return an empty config and an empty dependency list."""
        assert self.TYPE == daemon_spec.daemon_type
        return {}, []

    def ok_to_stop(self,
                   daemon_ids: List[str],
                   force: bool = False,
                   known: Optional[List[str]] = None) -> HandleCommandResult:
        """Always report success, naming the daemons that would be stopped."""
        # since node exporter runs on each host and cannot compromise data, no extra checks required
        names = []
        for d_id in daemon_ids:
            names.append(f'{self.TYPE}.{d_id}')
        return HandleCommandResult(0, f'It is presumed safe to stop {names}', '')
379
380
class SNMPGatewayService(CephadmService):
    """Service handler for ``snmp-gateway`` daemons deployed by cephadm."""

    TYPE = 'snmp-gateway'

    def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
        """Store the generated config and dependency list on ``daemon_spec`` and return it."""
        assert self.TYPE == daemon_spec.daemon_type
        config, deps = self.generate_config(daemon_spec)
        daemon_spec.final_config = config
        daemon_spec.deps = deps
        return daemon_spec

    def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
        """Build the gateway settings from the stored service spec.

        :param daemon_spec: deploy spec of the snmp-gateway daemon being configured
        :return: ``(config, deps)``. ``config`` always carries the destination
            and SNMP version, plus either the community (``V2c``) or the v3
            auth settings, with the privacy settings added when the spec
            names a privacy protocol. ``deps`` is always empty.
        """
        assert self.TYPE == daemon_spec.daemon_type
        deps: List[str] = []

        spec = cast(SNMPGatewaySpec, self.mgr.spec_store[daemon_spec.service_name].spec)
        credentials = spec.credentials
        config = {
            "destination": spec.snmp_destination,
            "snmp_version": spec.snmp_version,
        }

        if spec.snmp_version == 'V2c':
            community = credentials.get('snmp_community')
            assert community is not None
            config["snmp_community"] = community
        else:
            # SNMP v3 settings can be either authNoPriv or authPriv
            auth_protocol = spec.auth_protocol or 'SHA'
            auth_username = credentials.get('snmp_v3_auth_username')
            auth_password = credentials.get('snmp_v3_auth_password')
            assert auth_username is not None
            assert auth_password is not None
            assert spec.engine_id is not None

            config["snmp_v3_auth_protocol"] = auth_protocol
            config["snmp_v3_auth_username"] = auth_username
            config["snmp_v3_auth_password"] = auth_password
            config["snmp_v3_engine_id"] = spec.engine_id

            # authPriv adds encryption
            if spec.privacy_protocol:
                priv_password = credentials.get('snmp_v3_priv_password')
                assert priv_password is not None
                config["snmp_v3_priv_protocol"] = spec.privacy_protocol
                config["snmp_v3_priv_password"] = priv_password

        logger.debug(
            f"Generated configuration for '{self.TYPE}' service. Dependencies={deps}")

        return config, sorted(deps)