]> git.proxmox.com Git - ceph.git/blame - ceph/src/pybind/mgr/cephadm/services/monitoring.py
update dh_systemd restart patch for pacific
[ceph.git] / ceph / src / pybind / mgr / cephadm / services / monitoring.py
CommitLineData
f67539c2 1import errno
e306af50
TL
2import logging
3import os
f67539c2
TL
4from typing import List, Any, Tuple, Dict, Optional, cast
5
6from mgr_module import HandleCommandResult
e306af50
TL
7
8from orchestrator import DaemonDescription
f6b5b4d7 9from ceph.deployment.service_spec import AlertManagerSpec
f67539c2
TL
10from cephadm.services.cephadmservice import CephadmService, CephadmDaemonDeploySpec
11from cephadm.services.ingress import IngressSpec
e306af50
TL
12from mgr_util import verify_tls, ServerConfigException, create_self_signed_cert
13
14logger = logging.getLogger(__name__)
15
f6b5b4d7 16
class GrafanaService(CephadmService):
    """cephadm service handler for ``grafana`` daemons."""

    TYPE = 'grafana'
    DEFAULT_SERVICE_PORT = 3000

    def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
        """Attach the generated config and dependency list to ``daemon_spec``.

        :param daemon_spec: deploy spec whose ``daemon_type`` must equal ``TYPE``.
        :return: the same ``daemon_spec`` object, with ``final_config`` and
            ``deps`` set from :meth:`generate_config`.
        """
        assert self.TYPE == daemon_spec.daemon_type
        final_config, deps = self.generate_config(daemon_spec)
        daemon_spec.final_config = final_config
        daemon_spec.deps = deps
        return daemon_spec

    def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
        """Render the Grafana config files and collect the daemon's dependencies.

        :param daemon_spec: deploy spec whose ``daemon_type`` must equal ``TYPE``.
        :return: a ``({'files': {...}}, deps)`` tuple where ``deps`` is the
            sorted list of names of the known prometheus daemons.
        """
        assert self.TYPE == daemon_spec.daemon_type

        dependencies: List[str] = []
        prometheus_hosts: List[str] = []
        for prom in self.mgr.cache.get_daemons_by_service('prometheus'):
            assert prom.hostname is not None
            prometheus_hosts.append(prom.hostname)
            dependencies.append(prom.name())
        datasources_yml = self.mgr.template.render(
            'services/grafana/ceph-dashboard.yml.j2', {'hosts': prometheus_hosts})

        # Prefer the stored certificate/key pair, but only if it validates.
        cert = self.mgr.get_store('grafana_crt')
        pkey = self.mgr.get_store('grafana_key')
        if cert and pkey:
            try:
                verify_tls(cert, pkey)
            except ServerConfigException as e:
                logger.warning('Provided grafana TLS certificates invalid: %s', str(e))
                cert = pkey = None
        if not cert or not pkey:
            # No usable pair: create a self-signed one, persist it, and turn
            # off SSL verification for the dashboard's grafana API access.
            cert, pkey = create_self_signed_cert('Ceph', 'cephadm')
            self.mgr.set_store('grafana_crt', cert)
            self.mgr.set_store('grafana_key', pkey)
            self.mgr.check_mon_command({
                'prefix': 'dashboard set-grafana-api-ssl-verify',
                'value': 'false',
            })

        ini_text = self.mgr.template.render(
            'services/grafana/grafana.ini.j2', {'http_port': self.DEFAULT_SERVICE_PORT})

        files = {
            "grafana.ini": ini_text,
            'provisioning/datasources/ceph-dashboard.yml': datasources_yml,
            'certs/cert_file': f'# generated by cephadm\n{cert}',
            'certs/cert_key': f'# generated by cephadm\n{pkey}',
        }
        return {'files': files}, sorted(dependencies)

    def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription:
        """Return the last entry of ``daemon_descrs`` as the active daemon.

        An empty list yields a default-constructed ``DaemonDescription``.
        """
        return daemon_descrs[-1] if daemon_descrs else DaemonDescription()

    def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
        """Point the dashboard's Grafana API URL at the active daemon.

        The URL is built as ``https://<addr>:<DEFAULT_SERVICE_PORT>``.
        """
        # TODO: signed cert
        active = self.get_active_daemon(daemon_descrs)
        assert active.hostname is not None
        host_addr = self._inventory_get_addr(active.hostname)
        service_url = f'https://{host_addr}:{self.DEFAULT_SERVICE_PORT}'
        self._set_service_url_on_dashboard(
            'Grafana',
            'dashboard get-grafana-api-url',
            'dashboard set-grafana-api-url',
            service_url
        )

    def ok_to_stop(self,
                   daemon_ids: List[str],
                   force: bool = False,
                   known: Optional[List[str]] = None) -> HandleCommandResult:
        """Report whether the given Grafana daemons may be stopped.

        :param daemon_ids: ids of the daemons that are about to be stopped.
        :param force: when true, a warning does not block the stop.
        :param known: not used by this implementation.
        :return: ``-EBUSY`` with the warning as the error text when
            ``_enough_daemons_to_stop`` warns and ``force`` is false;
            otherwise return code 0 with the message as output.
        """
        warn, warn_message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, 'Grafana', 1)
        if force or not warn:
            return HandleCommandResult(0, warn_message, '')
        return HandleCommandResult(-errno.EBUSY, '', warn_message)
96
f6b5b4d7 97
class AlertmanagerService(CephadmService):
    """cephadm service handler for ``alertmanager`` daemons."""

    TYPE = 'alertmanager'
    DEFAULT_SERVICE_PORT = 9093

    def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
        """Attach the generated config and dependency list to ``daemon_spec``.

        :param daemon_spec: deploy spec whose ``daemon_type`` must equal ``TYPE``.
        :return: the same ``daemon_spec`` object, with ``final_config`` and
            ``deps`` set from :meth:`generate_config`.
        """
        assert self.TYPE == daemon_spec.daemon_type
        daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
        return daemon_spec

    def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
        """Render ``alertmanager.yml`` and compute the peer list and dependencies.

        :param daemon_spec: deploy spec whose ``daemon_type`` must equal ``TYPE``.
        :return: a ``({'files': {...}, 'peers': [...]}, deps)`` tuple where
            ``deps`` is the sorted list of mgr and alertmanager daemon names.
        """
        # Function-scope import: keeps this block self-contained without
        # touching the module-level import list.
        from urllib.parse import urlparse

        assert self.TYPE == daemon_spec.daemon_type
        deps: List[str] = []
        default_webhook_urls: List[str] = []

        spec = cast(AlertManagerSpec, self.mgr.spec_store[daemon_spec.service_name].spec)
        user_data = spec.user_data
        if 'default_webhook_urls' in user_data and isinstance(
                user_data['default_webhook_urls'], list):
            default_webhook_urls.extend(user_data['default_webhook_urls'])

        # dashboard(s)
        dashboard_urls: List[str] = []
        mgr_map = self.mgr.get('mgr_map')
        port = None
        proto = None  # URL scheme: http or https
        url = mgr_map.get('services', {}).get('dashboard', None)
        if url:
            dashboard_urls.append(url)
            # Parse the URL properly instead of splitting on ':' and '/':
            # plain splitting raises IndexError when the URL has no explicit
            # port and returns part of the address for bracketed IPv6 hosts.
            parsed = urlparse(url)
            proto = parsed.scheme
            try:
                port = parsed.port
            except ValueError as e:
                logger.warning('Unable to parse port from dashboard URL %s: %s', url, str(e))
                port = None
        # scan all mgrs to generate deps and to get standbys too.
        # assume that they are all on the same port as the active mgr.
        for dd in self.mgr.cache.get_daemons_by_service('mgr'):
            # we consider mgr a dep even if the dashboard is disabled
            # in order to be consistent with _calc_daemon_deps().
            deps.append(dd.name())
            if not port:
                continue
            if dd.daemon_id == self.mgr.get_mgr_id():
                continue
            assert dd.hostname is not None
            addr = self.mgr.inventory.get_addr(dd.hostname)
            dashboard_urls.append('%s://%s:%s/' % (proto, addr.split(':')[0], port))

        context = {
            'dashboard_urls': dashboard_urls,
            'default_webhook_urls': default_webhook_urls
        }
        yml = self.mgr.template.render('services/alertmanager/alertmanager.yml.j2', context)

        peers = []
        # Port appended to every entry of the 'peers' list; kept separate from
        # the dashboard port parsed above.
        peer_port = '9094'
        for dd in self.mgr.cache.get_daemons_by_service('alertmanager'):
            assert dd.hostname is not None
            deps.append(dd.name())
            addr = self.mgr.inventory.get_addr(dd.hostname)
            peers.append(addr.split(':')[0] + ':' + peer_port)
        return {
            "files": {
                "alertmanager.yml": yml
            },
            "peers": peers
        }, sorted(deps)

    def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription:
        """Return the first entry of ``daemon_descrs`` as the active daemon.

        An empty list yields a default-constructed ``DaemonDescription``.
        """
        # TODO: if there are multiple daemons, who is the active one?
        if daemon_descrs:
            return daemon_descrs[0]
        # if empty list provided, return empty Daemon Desc
        return DaemonDescription()

    def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
        """Point the dashboard's Alertmanager API host at the active daemon.

        The URL is built as ``http://<addr>:<DEFAULT_SERVICE_PORT>``.
        """
        dd = self.get_active_daemon(daemon_descrs)
        assert dd.hostname is not None
        service_url = 'http://{}:{}'.format(self._inventory_get_addr(dd.hostname),
                                            self.DEFAULT_SERVICE_PORT)
        self._set_service_url_on_dashboard(
            'AlertManager',
            'dashboard get-alertmanager-api-host',
            'dashboard set-alertmanager-api-host',
            service_url
        )

    def ok_to_stop(self,
                   daemon_ids: List[str],
                   force: bool = False,
                   known: Optional[List[str]] = None) -> HandleCommandResult:
        """Report whether the given Alertmanager daemons may be stopped.

        :param daemon_ids: ids of the daemons that are about to be stopped.
        :param force: when true, a warning does not block the stop.
        :param known: not used by this implementation.
        :return: ``-EBUSY`` with the warning as the error text when
            ``_enough_daemons_to_stop`` warns and ``force`` is false;
            otherwise return code 0 with the message as output.
        """
        warn, warn_message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, 'Alertmanager', 1)
        if warn and not force:
            return HandleCommandResult(-errno.EBUSY, '', warn_message)
        return HandleCommandResult(0, warn_message, '')
190
e306af50
TL
191
class PrometheusService(CephadmService):
    """cephadm service handler for ``prometheus`` daemons."""

    TYPE = 'prometheus'
    DEFAULT_SERVICE_PORT = 9095

    def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
        """Attach the generated config and dependency list to ``daemon_spec``.

        :param daemon_spec: deploy spec whose ``daemon_type`` must equal ``TYPE``.
        :return: the same ``daemon_spec`` object, with ``final_config`` and
            ``deps`` set from :meth:`generate_config`.
        """
        assert self.TYPE == daemon_spec.daemon_type
        daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
        return daemon_spec

    def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
        """Render ``prometheus.yml`` from the known scrape targets.

        Targets are collected from the mgr, node-exporter, alertmanager and
        ingress (haproxy) daemons held in the mgr's daemon cache.

        :param daemon_spec: deploy spec whose ``daemon_type`` must equal ``TYPE``.
        :return: a ``({'files': {...}}, deps)`` tuple where ``deps`` is the
            sorted list of names of the daemons the config depends on.
        """
        # Function-scope import: keeps this block self-contained without
        # touching the module-level import list.
        from urllib.parse import urlparse

        assert self.TYPE == daemon_spec.daemon_type
        deps = []  # type: List[str]

        # scrape mgrs
        mgr_scrape_list = []
        mgr_map = self.mgr.get('mgr_map')
        port = None
        t = mgr_map.get('services', {}).get('prometheus', None)
        if t:
            # Parse the URL properly: splitting on ':' yields an empty port
            # for bracketed IPv6 hosts, which silently dropped all standbys.
            parsed = urlparse(t)
            mgr_scrape_list.append(parsed.netloc)
            try:
                # fall back to 9283 when the URL carries no explicit port
                port = str(parsed.port or 9283)
            except ValueError as e:
                logger.warning('Unable to parse port from prometheus URL %s: %s', t, str(e))
                port = None
        # scan all mgrs to generate deps and to get standbys too.
        # assume that they are all on the same port as the active mgr.
        for dd in self.mgr.cache.get_daemons_by_service('mgr'):
            # we consider the mgr a dep even if the prometheus module is
            # disabled in order to be consistent with _calc_daemon_deps().
            deps.append(dd.name())
            if not port:
                continue
            if dd.daemon_id == self.mgr.get_mgr_id():
                continue
            assert dd.hostname is not None
            addr = self.mgr.inventory.get_addr(dd.hostname)
            mgr_scrape_list.append(addr.split(':')[0] + ':' + port)

        # scrape node exporters
        nodes = []
        for dd in self.mgr.cache.get_daemons_by_service('node-exporter'):
            assert dd.hostname is not None
            deps.append(dd.name())
            addr = self.mgr.inventory.get_addr(dd.hostname)
            nodes.append({
                'hostname': dd.hostname,
                'url': addr.split(':')[0] + ':9100'
            })

        # scrape alert managers
        alertmgr_targets = []
        for dd in self.mgr.cache.get_daemons_by_service('alertmanager'):
            assert dd.hostname is not None
            deps.append(dd.name())
            addr = self.mgr.inventory.get_addr(dd.hostname)
            alertmgr_targets.append("'{}:9093'".format(addr.split(':')[0]))

        # scrape haproxies
        haproxy_targets = []
        for dd in self.mgr.cache.get_daemons_by_type('ingress'):
            if dd.service_name() in self.mgr.spec_store:
                spec = cast(IngressSpec, self.mgr.spec_store[dd.service_name()].spec)
                assert dd.hostname is not None
                deps.append(dd.name())
                # only haproxy daemons become scrape targets; every ingress
                # daemon with a stored spec is still recorded as a dep above
                if dd.daemon_type == 'haproxy':
                    addr = self.mgr.inventory.get_addr(dd.hostname)
                    haproxy_targets.append({
                        "url": f"'{addr.split(':')[0]}:{spec.monitor_port}'",
                        "service": dd.service_name(),
                    })

        # generate the prometheus configuration
        context = {
            'alertmgr_targets': alertmgr_targets,
            'mgr_scrape_list': mgr_scrape_list,
            'haproxy_targets': haproxy_targets,
            'nodes': nodes,
        }
        r = {
            'files': {
                'prometheus.yml':
                    self.mgr.template.render(
                        'services/prometheus/prometheus.yml.j2', context)
            }
        }

        # include alerts, if present in the container
        if os.path.exists(self.mgr.prometheus_alerts_path):
            with open(self.mgr.prometheus_alerts_path, 'r', encoding='utf-8') as f:
                alerts = f.read()
            r['files']['/etc/prometheus/alerting/ceph_alerts.yml'] = alerts

        return r, sorted(deps)

    def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription:
        """Return the first entry of ``daemon_descrs`` as the active daemon.

        An empty list yields a default-constructed ``DaemonDescription``.
        """
        # TODO: if there are multiple daemons, who is the active one?
        if daemon_descrs:
            return daemon_descrs[0]
        # if empty list provided, return empty Daemon Desc
        return DaemonDescription()

    def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
        """Point the dashboard's Prometheus API host at the active daemon.

        The URL is built as ``http://<addr>:<DEFAULT_SERVICE_PORT>``.
        """
        dd = self.get_active_daemon(daemon_descrs)
        assert dd.hostname is not None
        service_url = 'http://{}:{}'.format(
            self._inventory_get_addr(dd.hostname), self.DEFAULT_SERVICE_PORT)
        self._set_service_url_on_dashboard(
            'Prometheus',
            'dashboard get-prometheus-api-host',
            'dashboard set-prometheus-api-host',
            service_url
        )

    def ok_to_stop(self,
                   daemon_ids: List[str],
                   force: bool = False,
                   known: Optional[List[str]] = None) -> HandleCommandResult:
        """Report whether the given Prometheus daemons may be stopped.

        :param daemon_ids: ids of the daemons that are about to be stopped.
        :param force: when true, a warning does not block the stop.
        :param known: not used by this implementation.
        :return: ``-EBUSY`` with the warning as the error text when
            ``_enough_daemons_to_stop`` warns and ``force`` is false;
            otherwise return code 0 with the message as output.
        """
        warn, warn_message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, 'Prometheus', 1)
        if warn and not force:
            return HandleCommandResult(-errno.EBUSY, '', warn_message)
        return HandleCommandResult(0, warn_message, '')
313
f6b5b4d7 314
class NodeExporterService(CephadmService):
    """cephadm service handler for ``node-exporter`` daemons."""

    TYPE = 'node-exporter'

    def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
        """Attach the generated config and dependency list to ``daemon_spec``.

        :param daemon_spec: deploy spec whose ``daemon_type`` must equal ``TYPE``.
        :return: the same ``daemon_spec`` object, with ``final_config`` and
            ``deps`` set from :meth:`generate_config`.
        """
        assert self.TYPE == daemon_spec.daemon_type
        final_config, deps = self.generate_config(daemon_spec)
        daemon_spec.final_config = final_config
        daemon_spec.deps = deps
        return daemon_spec

    def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
        """Return an empty config and an empty dependency list."""
        assert self.TYPE == daemon_spec.daemon_type
        return {}, []

    def ok_to_stop(self,
                   daemon_ids: List[str],
                   force: bool = False,
                   known: Optional[List[str]] = None) -> HandleCommandResult:
        """Always report that the given node-exporter daemons may be stopped.

        :param daemon_ids: ids of the daemons that are about to be stopped.
        :param force: not used by this implementation.
        :param known: not used by this implementation.
        :return: return code 0 with a message listing the daemon names.
        """
        # since node exporter runs on each host and cannot compromise data, no extra checks required
        names = ['{}.{}'.format(self.TYPE, daemon_id) for daemon_id in daemon_ids]
        message = 'It is presumed safe to stop {}'.format(names)
        return HandleCommandResult(0, message, '')