]> git.proxmox.com Git - ceph.git/blame - ceph/src/pybind/mgr/cephadm/services/monitoring.py
bump version to 16.2.6-pve2
[ceph.git] / ceph / src / pybind / mgr / cephadm / services / monitoring.py
CommitLineData
f67539c2 1import errno
e306af50
TL
2import logging
3import os
f67539c2
TL
4from typing import List, Any, Tuple, Dict, Optional, cast
5
6from mgr_module import HandleCommandResult
e306af50
TL
7
8from orchestrator import DaemonDescription
522d829b 9from ceph.deployment.service_spec import AlertManagerSpec, ServiceSpec
f67539c2
TL
10from cephadm.services.cephadmservice import CephadmService, CephadmDaemonDeploySpec
11from cephadm.services.ingress import IngressSpec
522d829b 12from mgr_util import verify_tls, ServerConfigException, create_self_signed_cert, build_url
e306af50
TL
13
14logger = logging.getLogger(__name__)
15
f6b5b4d7 16
class GrafanaService(CephadmService):
    """cephadm service handler for ``grafana`` daemons.

    Builds the files deployed with a Grafana daemon (``grafana.ini``, the
    Prometheus data-source provisioning file and the TLS cert/key pair) and
    registers the Grafana URL with the dashboard.
    """

    TYPE = 'grafana'
    DEFAULT_SERVICE_PORT = 3000

    def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
        """Store the generated config and dependency list on ``daemon_spec``.

        :param daemon_spec: deploy spec; its ``daemon_type`` must equal ``TYPE``.
        :return: the same ``daemon_spec`` with ``final_config`` and ``deps`` set.
        """
        assert self.TYPE == daemon_spec.daemon_type
        final_config, deps = self.generate_config(daemon_spec)
        daemon_spec.final_config = final_config
        daemon_spec.deps = deps
        return daemon_spec

    def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
        """Render the Grafana configuration files.

        :param daemon_spec: deploy spec; its ``daemon_type`` must equal ``TYPE``.
        :return: ``(config, deps)`` where ``config`` is ``{'files': {...}}`` and
            ``deps`` is the sorted list of prometheus daemon names.
        """
        assert self.TYPE == daemon_spec.daemon_type
        deps: List[str] = []

        # Every known prometheus daemon becomes a data source and a dependency.
        prometheus_endpoints: List[str] = []
        for prom in self.mgr.cache.get_daemons_by_service('prometheus'):
            assert prom.hostname is not None
            host = prom.ip or self._inventory_get_addr(prom.hostname)
            prom_port = prom.ports[0] if prom.ports else 9095
            prometheus_endpoints.append(host + ':' + str(prom_port))
            deps.append(prom.name())
        grafana_data_sources = self.mgr.template.render(
            'services/grafana/ceph-dashboard.yml.j2',
            {'hosts': prometheus_endpoints},
        )

        # Reuse the stored cert/key pair when it validates; otherwise discard it.
        cert = self.mgr.get_store('grafana_crt')
        pkey = self.mgr.get_store('grafana_key')
        if cert and pkey:
            try:
                verify_tls(cert, pkey)
            except ServerConfigException as e:
                logger.warning('Provided grafana TLS certificates invalid: %s', str(e))
                cert, pkey = None, None
        if not cert or not pkey:
            # No usable pair: create a self-signed one and persist it.
            cert, pkey = create_self_signed_cert('Ceph', 'cephadm')
            self.mgr.set_store('grafana_crt', cert)
            self.mgr.set_store('grafana_key', pkey)
            # NOTE(review): this listing lost its indentation; the dashboard
            # ssl-verify toggle is assumed to belong to the self-signed branch.
            if 'dashboard' in self.mgr.get('mgr_map')['modules']:
                self.mgr.check_mon_command({
                    'prefix': 'dashboard set-grafana-api-ssl-verify',
                    'value': 'false',
                })

        if daemon_spec.ports:
            http_port = daemon_spec.ports[0]
        else:
            http_port = self.DEFAULT_SERVICE_PORT
        grafana_ini = self.mgr.template.render(
            'services/grafana/grafana.ini.j2',
            {
                'http_port': http_port,
                'http_addr': daemon_spec.ip if daemon_spec.ip else '',
            },
        )

        files = {
            "grafana.ini": grafana_ini,
            'provisioning/datasources/ceph-dashboard.yml': grafana_data_sources,
            'certs/cert_file': '# generated by cephadm\n%s' % cert,
            'certs/cert_key': '# generated by cephadm\n%s' % pkey,
        }
        return {'files': files}, sorted(deps)

    def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription:
        """Return the last entry of ``daemon_descrs``.

        An empty list yields a default-constructed ``DaemonDescription``.
        """
        if not daemon_descrs:
            return DaemonDescription()
        return daemon_descrs[-1]

    def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
        """Point the dashboard's Grafana API URL at the active daemon (https)."""
        # TODO: signed cert
        active = self.get_active_daemon(daemon_descrs)
        assert active.hostname is not None
        host = active.ip or self._inventory_get_addr(active.hostname)
        port = active.ports[0] if active.ports else self.DEFAULT_SERVICE_PORT
        service_url = build_url(scheme='https', host=host, port=port)
        self._set_service_url_on_dashboard(
            'Grafana',
            'dashboard get-grafana-api-url',
            'dashboard set-grafana-api-url',
            service_url,
        )

    def ok_to_stop(self,
                   daemon_ids: List[str],
                   force: bool = False,
                   known: Optional[List[str]] = None) -> HandleCommandResult:
        """Report whether the given Grafana daemons may be stopped.

        Returns ``-EBUSY`` with the warning text when
        ``_enough_daemons_to_stop`` warns and ``force`` is not set; otherwise
        returns success carrying the (possibly empty) warning message.
        """
        warn, warn_message = self._enough_daemons_to_stop(
            self.TYPE, daemon_ids, 'Grafana', 1)
        if force or not warn:
            return HandleCommandResult(0, warn_message, '')
        return HandleCommandResult(-errno.EBUSY, '', warn_message)
103
f6b5b4d7 104
class AlertmanagerService(CephadmService):
    """cephadm service handler for ``alertmanager`` daemons.

    Renders ``alertmanager.yml`` (dashboard and user-supplied webhook URLs),
    computes the list of alertmanager peers and registers the Alertmanager
    URL with the dashboard.
    """

    TYPE = 'alertmanager'
    DEFAULT_SERVICE_PORT = 9093

    def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
        """Store the generated config and dependency list on ``daemon_spec``.

        :param daemon_spec: deploy spec; its ``daemon_type`` must equal ``TYPE``.
        :return: the same ``daemon_spec`` with ``final_config`` and ``deps`` set.
        """
        assert self.TYPE == daemon_spec.daemon_type
        daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
        return daemon_spec

    def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
        """Render ``alertmanager.yml`` and collect the peer list.

        :param daemon_spec: deploy spec; its ``daemon_type`` must equal ``TYPE``.
        :return: ``(config, deps)`` where ``config`` holds ``files`` and
            ``peers`` and ``deps`` is the sorted list of mgr and alertmanager
            daemon names.
        """
        # Function-scope import so the block does not depend on a file-level
        # import being added elsewhere.
        from urllib.parse import urlparse

        assert self.TYPE == daemon_spec.daemon_type
        deps: List[str] = []
        default_webhook_urls: List[str] = []

        spec = cast(AlertManagerSpec, self.mgr.spec_store[daemon_spec.service_name].spec)
        user_data = spec.user_data
        if 'default_webhook_urls' in user_data and isinstance(
                user_data['default_webhook_urls'], list):
            default_webhook_urls.extend(user_data['default_webhook_urls'])

        # dashboard(s)
        dashboard_urls: List[str] = []
        mgr_map = self.mgr.get('mgr_map')
        port = None
        proto = None  # scheme plus trailing colon, e.g. 'http:' or 'https:'
        url = mgr_map.get('services', {}).get('dashboard', None)
        if url:
            dashboard_urls.append(url)
            # Parse with urlparse rather than splitting on '/' and ':': the
            # split-based form raised IndexError for a URL without an explicit
            # port and produced an empty port for IPv6-literal hosts.
            parsed = urlparse(url)
            proto = parsed.scheme + ':'
            try:
                port = parsed.port
            except ValueError:
                # Unparseable port: leave it unset so standbys are skipped.
                logger.warning('Unable to parse port from dashboard URL %s', url)
        # scan all mgrs to generate deps and to get standbys too.
        # assume that they are all on the same port as the active mgr.
        for dd in self.mgr.cache.get_daemons_by_service('mgr'):
            # we consider mgr a dep even if the dashboard is disabled
            # in order to be consistent with _calc_daemon_deps().
            deps.append(dd.name())
            if not port:
                continue
            if dd.daemon_id == self.mgr.get_mgr_id():
                # the active mgr's URL was already added above
                continue
            assert dd.hostname is not None
            addr = self.mgr.inventory.get_addr(dd.hostname)
            dashboard_urls.append('%s//%s:%s/' % (proto, addr.split(':')[0], port))

        context = {
            'dashboard_urls': dashboard_urls,
            'default_webhook_urls': default_webhook_urls
        }
        yml = self.mgr.template.render('services/alertmanager/alertmanager.yml.j2', context)

        # Peers use a fixed port that is unrelated to the dashboard port
        # above, hence the separate name.
        peers = []
        peer_port = '9094'
        for dd in self.mgr.cache.get_daemons_by_service('alertmanager'):
            assert dd.hostname is not None
            deps.append(dd.name())
            addr = self.mgr.inventory.get_addr(dd.hostname)
            peers.append(addr.split(':')[0] + ':' + peer_port)
        return {
            "files": {
                "alertmanager.yml": yml
            },
            "peers": peers
        }, sorted(deps)

    def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription:
        """Return the first entry of ``daemon_descrs``.

        An empty list yields a default-constructed ``DaemonDescription``.
        """
        # TODO: if there are multiple daemons, who is the active one?
        if daemon_descrs:
            return daemon_descrs[0]
        # if empty list provided, return empty Daemon Desc
        return DaemonDescription()

    def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
        """Point the dashboard's Alertmanager API host at the active daemon (http)."""
        dd = self.get_active_daemon(daemon_descrs)
        assert dd.hostname is not None
        addr = dd.ip if dd.ip else self._inventory_get_addr(dd.hostname)
        port = dd.ports[0] if dd.ports else self.DEFAULT_SERVICE_PORT
        service_url = 'http://{}:{}'.format(addr, port)
        self._set_service_url_on_dashboard(
            'AlertManager',
            'dashboard get-alertmanager-api-host',
            'dashboard set-alertmanager-api-host',
            service_url
        )

    def ok_to_stop(self,
                   daemon_ids: List[str],
                   force: bool = False,
                   known: Optional[List[str]] = None) -> HandleCommandResult:
        """Report whether the given Alertmanager daemons may be stopped.

        Returns ``-EBUSY`` with the warning text when
        ``_enough_daemons_to_stop`` warns and ``force`` is not set; otherwise
        returns success carrying the (possibly empty) warning message.
        """
        warn, warn_message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, 'Alertmanager', 1)
        if warn and not force:
            return HandleCommandResult(-errno.EBUSY, '', warn_message)
        return HandleCommandResult(0, warn_message, '')
198
e306af50
TL
199
class PrometheusService(CephadmService):
    """cephadm service handler for ``prometheus`` daemons.

    Enables the mgr ``prometheus`` module when needed, renders
    ``prometheus.yml`` with the scrape targets it can discover (mgrs, node
    exporters, alertmanagers, haproxies) and registers the Prometheus URL
    with the dashboard.
    """

    TYPE = 'prometheus'
    DEFAULT_SERVICE_PORT = 9095

    def config(self, spec: ServiceSpec) -> None:
        """Enable the mgr ``prometheus`` module if the mgr map does not list it."""
        # make sure module is enabled
        mgr_map = self.mgr.get('mgr_map')
        if 'prometheus' not in mgr_map.get('services', {}):
            self.mgr.check_mon_command({
                'prefix': 'mgr module enable',
                'module': 'prometheus'
            })
            # we shouldn't get here (mon will tell the mgr to respawn), but no
            # harm done if we do.

    def prepare_create(
            self,
            daemon_spec: CephadmDaemonDeploySpec,
    ) -> CephadmDaemonDeploySpec:
        """Store the generated config and dependency list on ``daemon_spec``.

        :param daemon_spec: deploy spec; its ``daemon_type`` must equal ``TYPE``.
        :return: the same ``daemon_spec`` with ``final_config`` and ``deps`` set.
        """
        assert self.TYPE == daemon_spec.daemon_type
        daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
        return daemon_spec

    def generate_config(
            self,
            daemon_spec: CephadmDaemonDeploySpec,
    ) -> Tuple[Dict[str, Any], List[str]]:
        """Render ``prometheus.yml`` and, when present, attach the alert rules.

        :param daemon_spec: deploy spec; its ``daemon_type`` must equal ``TYPE``.
        :return: ``(config, deps)`` where ``config`` is ``{'files': {...}}`` and
            ``deps`` is the sorted list of every daemon name used as a target.
        """
        # Function-scope import so the block does not depend on a file-level
        # import being added elsewhere.
        from urllib.parse import urlparse

        assert self.TYPE == daemon_spec.daemon_type
        deps = []  # type: List[str]

        # scrape mgrs
        mgr_scrape_list = []
        mgr_map = self.mgr.get('mgr_map')
        mgr_port = None
        mgr_url = mgr_map.get('services', {}).get('prometheus', None)
        if mgr_url:
            # Parse with urlparse rather than splitting on '/' and ':': the
            # split-based form produced an empty port for IPv6-literal hosts
            # (so standbys were never scraped) and raised IndexError on a
            # URL without '//'.
            parsed = urlparse(mgr_url)
            if parsed.netloc:
                mgr_scrape_list.append(parsed.netloc)
                mgr_port = '9283'  # used when the URL carries no explicit port
                try:
                    if parsed.port:
                        mgr_port = str(parsed.port)
                except ValueError:
                    logger.warning(
                        'Unable to parse port from mgr prometheus URL %s', mgr_url)
            else:
                logger.warning('Ignoring malformed mgr prometheus URL %s', mgr_url)
        # scan all mgrs to generate deps and to get standbys too.
        # assume that they are all on the same port as the active mgr.
        for dd in self.mgr.cache.get_daemons_by_service('mgr'):
            # we consider the mgr a dep even if the prometheus module is
            # disabled in order to be consistent with _calc_daemon_deps().
            deps.append(dd.name())
            if not mgr_port:
                continue
            if dd.daemon_id == self.mgr.get_mgr_id():
                # the active mgr was already added from the mgr map above
                continue
            assert dd.hostname is not None
            addr = self.mgr.inventory.get_addr(dd.hostname)
            mgr_scrape_list.append(addr.split(':')[0] + ':' + mgr_port)

        # scrape node exporters
        nodes = []
        for dd in self.mgr.cache.get_daemons_by_service('node-exporter'):
            assert dd.hostname is not None
            deps.append(dd.name())
            addr = dd.ip if dd.ip else self.mgr.inventory.get_addr(dd.hostname)
            exporter_port = str(dd.ports[0]) if dd.ports else '9100'
            nodes.append({
                'hostname': dd.hostname,
                'url': addr.split(':')[0] + ':' + exporter_port
            })

        # scrape alert managers
        alertmgr_targets = []
        for dd in self.mgr.cache.get_daemons_by_service('alertmanager'):
            assert dd.hostname is not None
            deps.append(dd.name())
            addr = dd.ip if dd.ip else self.mgr.inventory.get_addr(dd.hostname)
            alertmgr_port = str(dd.ports[0]) if dd.ports else '9093'
            alertmgr_targets.append("'{}:{}'".format(addr.split(':')[0], alertmgr_port))

        # scrape haproxies
        haproxy_targets = []
        for dd in self.mgr.cache.get_daemons_by_type('ingress'):
            # NOTE(review): this listing lost its indentation; the whole loop
            # body is assumed to be guarded by the spec_store membership test,
            # which also guarantees `spec` is bound before it is read.
            if dd.service_name() in self.mgr.spec_store:
                spec = cast(IngressSpec, self.mgr.spec_store[dd.service_name()].spec)
                assert dd.hostname is not None
                deps.append(dd.name())
                if dd.daemon_type == 'haproxy':
                    addr = self.mgr.inventory.get_addr(dd.hostname)
                    haproxy_targets.append({
                        "url": f"'{addr.split(':')[0]}:{spec.monitor_port}'",
                        "service": dd.service_name(),
                    })

        # generate the prometheus configuration
        context = {
            'alertmgr_targets': alertmgr_targets,
            'mgr_scrape_list': mgr_scrape_list,
            'haproxy_targets': haproxy_targets,
            'nodes': nodes,
        }
        r = {
            'files': {
                'prometheus.yml':
                    self.mgr.template.render(
                        'services/prometheus/prometheus.yml.j2', context)
            }
        }

        # include alerts, if present in the container
        if os.path.exists(self.mgr.prometheus_alerts_path):
            with open(self.mgr.prometheus_alerts_path, 'r', encoding='utf-8') as f:
                alerts = f.read()
            r['files']['/etc/prometheus/alerting/ceph_alerts.yml'] = alerts

        return r, sorted(deps)

    def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription:
        """Return the first entry of ``daemon_descrs``.

        An empty list yields a default-constructed ``DaemonDescription``.
        """
        # TODO: if there are multiple daemons, who is the active one?
        if daemon_descrs:
            return daemon_descrs[0]
        # if empty list provided, return empty Daemon Desc
        return DaemonDescription()

    def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
        """Point the dashboard's Prometheus API host at the active daemon (http)."""
        dd = self.get_active_daemon(daemon_descrs)
        assert dd.hostname is not None
        addr = dd.ip if dd.ip else self._inventory_get_addr(dd.hostname)
        port = dd.ports[0] if dd.ports else self.DEFAULT_SERVICE_PORT
        service_url = 'http://{}:{}'.format(addr, port)
        self._set_service_url_on_dashboard(
            'Prometheus',
            'dashboard get-prometheus-api-host',
            'dashboard set-prometheus-api-host',
            service_url
        )

    def ok_to_stop(self,
                   daemon_ids: List[str],
                   force: bool = False,
                   known: Optional[List[str]] = None) -> HandleCommandResult:
        """Report whether the given Prometheus daemons may be stopped.

        Returns ``-EBUSY`` with the warning text when
        ``_enough_daemons_to_stop`` warns and ``force`` is not set; otherwise
        returns success carrying the (possibly empty) warning message.
        """
        warn, warn_message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, 'Prometheus', 1)
        if warn and not force:
            return HandleCommandResult(-errno.EBUSY, '', warn_message)
        return HandleCommandResult(0, warn_message, '')
341
f6b5b4d7 342
class NodeExporterService(CephadmService):
    """cephadm service handler for ``node-exporter`` daemons.

    The daemon takes no generated configuration and has no dependencies.
    """

    TYPE = 'node-exporter'

    def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
        """Store the (empty) config and dependency list on ``daemon_spec``.

        :param daemon_spec: deploy spec; its ``daemon_type`` must equal ``TYPE``.
        :return: the same ``daemon_spec`` with ``final_config`` and ``deps`` set.
        """
        assert self.TYPE == daemon_spec.daemon_type
        final_config, deps = self.generate_config(daemon_spec)
        daemon_spec.final_config = final_config
        daemon_spec.deps = deps
        return daemon_spec

    def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
        """Return an empty config dict and an empty dependency list."""
        assert self.TYPE == daemon_spec.daemon_type
        return {}, []

    def ok_to_stop(self,
                   daemon_ids: List[str],
                   force: bool = False,
                   known: Optional[List[str]] = None) -> HandleCommandResult:
        """Always report that the given node-exporter daemons may be stopped.

        :param daemon_ids: ids of the daemons the caller wants to stop.
        :return: a success result whose stdout names the daemons.
        """
        # since node exporter runs on each host and cannot compromise data, no extra checks required
        names = []
        for d_id in daemon_ids:
            names.append(f'{self.TYPE}.{d_id}')
        out = f'It is presumed safe to stop {names}'
        return HandleCommandResult(0, out, '')