import errno
import logging
import os
from typing import List, Any, Tuple, Dict, Optional, cast

from mgr_module import HandleCommandResult

from orchestrator import DaemonDescription
from ceph.deployment.service_spec import AlertManagerSpec
from cephadm.services.cephadmservice import CephadmService, CephadmDaemonDeploySpec
from cephadm.services.ingress import IngressSpec
from mgr_util import verify_tls, ServerConfigException, create_self_signed_cert

logger = logging.getLogger(__name__)


class GrafanaService(CephadmService):
    TYPE = 'grafana'
    DEFAULT_SERVICE_PORT = 3000

    def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
        assert self.TYPE == daemon_spec.daemon_type
        daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
        return daemon_spec

    def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
        assert self.TYPE == daemon_spec.daemon_type
        deps = []  # type: List[str]

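        # each prometheus daemon becomes a grafana datasource target and a
        # dependency of this grafana daemon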
        prom_services = []  # type: List[str]
        for dd in self.mgr.cache.get_daemons_by_service('prometheus'):
            assert dd.hostname is not None
            addr = dd.ip if dd.ip else self._inventory_get_addr(dd.hostname)
            port = dd.ports[0] if dd.ports else 9095
            prom_services.append(addr + ':' + str(port))
            deps.append(dd.name())
        grafana_data_sources = self.mgr.template.render(
            'services/grafana/ceph-dashboard.yml.j2', {'hosts': prom_services})

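        # reuse the stored TLS cert/key if they still verify; otherwise
        # generate and persist a self-signed pair and tell the dashboard
        # not to verify grafana's certificate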
        cert = self.mgr.get_store('grafana_crt')
        pkey = self.mgr.get_store('grafana_key')
        if cert and pkey:
            try:
                verify_tls(cert, pkey)
            except ServerConfigException as e:
                logger.warning('Provided grafana TLS certificates invalid: %s', str(e))
                cert, pkey = None, None
        if not (cert and pkey):
            cert, pkey = create_self_signed_cert('Ceph', 'cephadm')
            self.mgr.set_store('grafana_crt', cert)
            self.mgr.set_store('grafana_key', pkey)
            self.mgr.check_mon_command({
                'prefix': 'dashboard set-grafana-api-ssl-verify',
                'value': 'false',
            })

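        # render grafana.ini with the bind address and HTTP port for this daemon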
        grafana_ini = self.mgr.template.render(
            'services/grafana/grafana.ini.j2', {
                'http_port': daemon_spec.ports[0] if daemon_spec.ports else self.DEFAULT_SERVICE_PORT,
                'http_addr': daemon_spec.ip if daemon_spec.ip else ''
            })

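        # ship grafana.ini, the datasource provisioning file, and the TLS
        # material as the daemon's config files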
        config_file = {
            'files': {
                "grafana.ini": grafana_ini,
                'provisioning/datasources/ceph-dashboard.yml': grafana_data_sources,
                'certs/cert_file': '# generated by cephadm\n%s' % cert,
                'certs/cert_key': '# generated by cephadm\n%s' % pkey,
            }
        }
        return config_file, sorted(deps)

    def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription:
        # Use the least-created one as the active daemon
        if daemon_descrs:
            return daemon_descrs[-1]
        # if empty list provided, return empty Daemon Desc
        return DaemonDescription()

    def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
        # TODO: signed cert
        dd = self.get_active_daemon(daemon_descrs)
        assert dd.hostname is not None
        addr = dd.ip if dd.ip else self._inventory_get_addr(dd.hostname)
        port = dd.ports[0] if dd.ports else self.DEFAULT_SERVICE_PORT
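        # register the grafana URL with the dashboard module (https, since a
        # certificate is always provisioned by generate_config)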
        service_url = 'https://{}:{}'.format(addr, port)
        self._set_service_url_on_dashboard(
            'Grafana',
            'dashboard get-grafana-api-url',
            'dashboard set-grafana-api-url',
            service_url
        )

    def ok_to_stop(self,
                   daemon_ids: List[str],
                   force: bool = False,
                   known: Optional[List[str]] = None) -> HandleCommandResult:
        warn, warn_message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, 'Grafana', 1)
        if warn and not force:
            return HandleCommandResult(-errno.EBUSY, '', warn_message)
        return HandleCommandResult(0, warn_message, '')


class AlertmanagerService(CephadmService):
    TYPE = 'alertmanager'
    DEFAULT_SERVICE_PORT = 9093

    def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
        assert self.TYPE == daemon_spec.daemon_type
        daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
        return daemon_spec

    def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
        assert self.TYPE == daemon_spec.daemon_type
        deps: List[str] = []
        default_webhook_urls: List[str] = []

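        # the service spec's user_data may carry extra settings, such as
        # default webhook URLs to include in alertmanager.yml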
        spec = cast(AlertManagerSpec, self.mgr.spec_store[daemon_spec.service_name].spec)
        user_data = spec.user_data
        if 'default_webhook_urls' in user_data and isinstance(
                user_data['default_webhook_urls'], list):
            default_webhook_urls.extend(user_data['default_webhook_urls'])

        # dashboard(s)
        dashboard_urls: List[str] = []
        mgr_map = self.mgr.get('mgr_map')
        port = None
        proto = None  # http: or https:
        url = mgr_map.get('services', {}).get('dashboard', None)
        if url:
            dashboard_urls.append(url)
            proto = url.split('/')[0]
            port = url.split('/')[2].split(':')[1]
        # scan all mgrs to generate deps and to get standbys too.
        # assume that they are all on the same port as the active mgr.
        for dd in self.mgr.cache.get_daemons_by_service('mgr'):
            # we consider mgr a dep even if the dashboard is disabled
            # in order to be consistent with _calc_daemon_deps().
            deps.append(dd.name())
            if not port:
                continue
            if dd.daemon_id == self.mgr.get_mgr_id():
                continue
            assert dd.hostname is not None
            addr = self.mgr.inventory.get_addr(dd.hostname)
            dashboard_urls.append('%s//%s:%s/' % (proto, addr.split(':')[0],
                                                  port))

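        # render alertmanager.yml from the dashboard URLs and default webhook
        # URLs collected above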
        context = {
            'dashboard_urls': dashboard_urls,
            'default_webhook_urls': default_webhook_urls
        }
        yml = self.mgr.template.render('services/alertmanager/alertmanager.yml.j2', context)

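        # every alertmanager daemon is listed as a peer of every other one,
        # using the default cluster (gossip) port 9094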
        peers = []
        port = '9094'
        for dd in self.mgr.cache.get_daemons_by_service('alertmanager'):
            assert dd.hostname is not None
            deps.append(dd.name())
            addr = self.mgr.inventory.get_addr(dd.hostname)
            peers.append(addr.split(':')[0] + ':' + port)
        return {
            "files": {
                "alertmanager.yml": yml
            },
            "peers": peers
        }, sorted(deps)

    def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription:
        # TODO: if there are multiple daemons, who is the active one?
        if daemon_descrs:
            return daemon_descrs[0]
        # if empty list provided, return empty Daemon Desc
        return DaemonDescription()

    def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
        dd = self.get_active_daemon(daemon_descrs)
        assert dd.hostname is not None
        addr = dd.ip if dd.ip else self._inventory_get_addr(dd.hostname)
        port = dd.ports[0] if dd.ports else self.DEFAULT_SERVICE_PORT
        service_url = 'http://{}:{}'.format(addr, port)
        self._set_service_url_on_dashboard(
            'AlertManager',
            'dashboard get-alertmanager-api-host',
            'dashboard set-alertmanager-api-host',
            service_url
        )

    def ok_to_stop(self,
                   daemon_ids: List[str],
                   force: bool = False,
                   known: Optional[List[str]] = None) -> HandleCommandResult:
        warn, warn_message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, 'Alertmanager', 1)
        if warn and not force:
            return HandleCommandResult(-errno.EBUSY, '', warn_message)
        return HandleCommandResult(0, warn_message, '')


class PrometheusService(CephadmService):
    TYPE = 'prometheus'
    DEFAULT_SERVICE_PORT = 9095

    def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
        assert self.TYPE == daemon_spec.daemon_type
        daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
        return daemon_spec

    def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
        assert self.TYPE == daemon_spec.daemon_type
        deps = []  # type: List[str]

        # scrape mgrs
        mgr_scrape_list = []
        mgr_map = self.mgr.get('mgr_map')
        port = None
        t = mgr_map.get('services', {}).get('prometheus', None)
        if t:
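            # the services map holds a full URL; element 2 of the '/'-split
            # is the host:port part, which is what prometheus scrapes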
            t = t.split('/')[2]
            mgr_scrape_list.append(t)
            port = '9283'
            if ':' in t:
                port = t.split(':')[1]
        # scan all mgrs to generate deps and to get standbys too.
        # assume that they are all on the same port as the active mgr.
        for dd in self.mgr.cache.get_daemons_by_service('mgr'):
            # we consider the mgr a dep even if the prometheus module is
            # disabled in order to be consistent with _calc_daemon_deps().
            deps.append(dd.name())
            if not port:
                continue
            if dd.daemon_id == self.mgr.get_mgr_id():
                continue
            assert dd.hostname is not None
            addr = self.mgr.inventory.get_addr(dd.hostname)
            mgr_scrape_list.append(addr.split(':')[0] + ':' + port)

        # scrape node exporters
        nodes = []
        for dd in self.mgr.cache.get_daemons_by_service('node-exporter'):
            assert dd.hostname is not None
            deps.append(dd.name())
            addr = dd.ip if dd.ip else self.mgr.inventory.get_addr(dd.hostname)
            port = str(dd.ports[0]) if dd.ports else '9100'
            nodes.append({
                'hostname': dd.hostname,
                'url': addr.split(':')[0] + ':' + port
            })

        # scrape alert managers
        alertmgr_targets = []
        for dd in self.mgr.cache.get_daemons_by_service('alertmanager'):
            assert dd.hostname is not None
            deps.append(dd.name())
            addr = dd.ip if dd.ip else self.mgr.inventory.get_addr(dd.hostname)
            port = str(dd.ports[0]) if dd.ports else '9093'
            alertmgr_targets.append("'{}:{}'".format(addr.split(':')[0], port))

        # scrape haproxies
        haproxy_targets = []
        for dd in self.mgr.cache.get_daemons_by_type('ingress'):
            if dd.service_name() in self.mgr.spec_store:
                spec = cast(IngressSpec, self.mgr.spec_store[dd.service_name()].spec)
                assert dd.hostname is not None
                deps.append(dd.name())
                if dd.daemon_type == 'haproxy':
                    addr = self.mgr.inventory.get_addr(dd.hostname)
                    haproxy_targets.append({
                        "url": f"'{addr.split(':')[0]}:{spec.monitor_port}'",
                        "service": dd.service_name(),
                    })

        # generate the prometheus configuration
        context = {
            'alertmgr_targets': alertmgr_targets,
            'mgr_scrape_list': mgr_scrape_list,
            'haproxy_targets': haproxy_targets,
            'nodes': nodes,
        }
        r = {
            'files': {
                'prometheus.yml':
                    self.mgr.template.render(
                        'services/prometheus/prometheus.yml.j2', context)
            }
        }

        # include alerts, if present in the container
        if os.path.exists(self.mgr.prometheus_alerts_path):
            with open(self.mgr.prometheus_alerts_path, 'r', encoding='utf-8') as f:
                alerts = f.read()
            r['files']['/etc/prometheus/alerting/ceph_alerts.yml'] = alerts

        return r, sorted(deps)

    def get_active_daemon(self, daemon_descrs: List[DaemonDescription]) -> DaemonDescription:
        # TODO: if there are multiple daemons, who is the active one?
        if daemon_descrs:
            return daemon_descrs[0]
        # if empty list provided, return empty Daemon Desc
        return DaemonDescription()

    def config_dashboard(self, daemon_descrs: List[DaemonDescription]) -> None:
        dd = self.get_active_daemon(daemon_descrs)
        assert dd.hostname is not None
        addr = dd.ip if dd.ip else self._inventory_get_addr(dd.hostname)
        port = dd.ports[0] if dd.ports else self.DEFAULT_SERVICE_PORT
        service_url = 'http://{}:{}'.format(addr, port)
        self._set_service_url_on_dashboard(
            'Prometheus',
            'dashboard get-prometheus-api-host',
            'dashboard set-prometheus-api-host',
            service_url
        )

    def ok_to_stop(self,
                   daemon_ids: List[str],
                   force: bool = False,
                   known: Optional[List[str]] = None) -> HandleCommandResult:
        warn, warn_message = self._enough_daemons_to_stop(self.TYPE, daemon_ids, 'Prometheus', 1)
        if warn and not force:
            return HandleCommandResult(-errno.EBUSY, '', warn_message)
        return HandleCommandResult(0, warn_message, '')


class NodeExporterService(CephadmService):
    TYPE = 'node-exporter'

    def prepare_create(self, daemon_spec: CephadmDaemonDeploySpec) -> CephadmDaemonDeploySpec:
        assert self.TYPE == daemon_spec.daemon_type
        daemon_spec.final_config, daemon_spec.deps = self.generate_config(daemon_spec)
        return daemon_spec

    def generate_config(self, daemon_spec: CephadmDaemonDeploySpec) -> Tuple[Dict[str, Any], List[str]]:
        assert self.TYPE == daemon_spec.daemon_type
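        # node-exporter needs no generated config files and has no dependencies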
        return {}, []

    def ok_to_stop(self,
                   daemon_ids: List[str],
                   force: bool = False,
                   known: Optional[List[str]] = None) -> HandleCommandResult:
        # since node exporter runs on each host and cannot compromise data, no extra checks required
        names = [f'{self.TYPE}.{d_id}' for d_id in daemon_ids]
        out = f'It is presumed safe to stop {names}'
        return HandleCommandResult(0, out, '')