]>
git.proxmox.com Git - ceph.git/blob - ceph/src/pybind/mgr/cephadm/services/monitoring.py
4 from typing
import List
, Any
, Tuple
, Dict
, Optional
, cast
6 from mgr_module
import HandleCommandResult
8 from orchestrator
import DaemonDescription
9 from ceph
.deployment
.service_spec
import AlertManagerSpec
, ServiceSpec
10 from cephadm
.services
.cephadmservice
import CephadmService
, CephadmDaemonDeploySpec
11 from cephadm
.services
.ingress
import IngressSpec
12 from mgr_util
import verify_tls
, ServerConfigException
, create_self_signed_cert
, build_url
14 logger
= logging
.getLogger(__name__
)
17 class GrafanaService(CephadmService
):
19 DEFAULT_SERVICE_PORT
= 3000
21 def prepare_create(self
, daemon_spec
: CephadmDaemonDeploySpec
) -> CephadmDaemonDeploySpec
:
22 assert self
.TYPE
== daemon_spec
.daemon_type
23 daemon_spec
.final_config
, daemon_spec
.deps
= self
.generate_config(daemon_spec
)
26 def generate_config(self
, daemon_spec
: CephadmDaemonDeploySpec
) -> Tuple
[Dict
[str, Any
], List
[str]]:
27 assert self
.TYPE
== daemon_spec
.daemon_type
28 deps
= [] # type: List[str]
30 prom_services
= [] # type: List[str]
31 for dd
in self
.mgr
.cache
.get_daemons_by_service('prometheus'):
32 assert dd
.hostname
is not None
33 addr
= dd
.ip
if dd
.ip
else self
._inventory
_get
_addr
(dd
.hostname
)
34 port
= dd
.ports
[0] if dd
.ports
else 9095
35 prom_services
.append(addr
+ ':' + str(port
))
36 deps
.append(dd
.name())
37 grafana_data_sources
= self
.mgr
.template
.render(
38 'services/grafana/ceph-dashboard.yml.j2', {'hosts': prom_services
})
40 cert
= self
.mgr
.get_store('grafana_crt')
41 pkey
= self
.mgr
.get_store('grafana_key')
44 verify_tls(cert
, pkey
)
45 except ServerConfigException
as e
:
46 logger
.warning('Provided grafana TLS certificates invalid: %s', str(e
))
47 cert
, pkey
= None, None
48 if not (cert
and pkey
):
49 cert
, pkey
= create_self_signed_cert('Ceph', 'cephadm')
50 self
.mgr
.set_store('grafana_crt', cert
)
51 self
.mgr
.set_store('grafana_key', pkey
)
52 if 'dashboard' in self
.mgr
.get('mgr_map')['modules']:
53 self
.mgr
.check_mon_command({
54 'prefix': 'dashboard set-grafana-api-ssl-verify',
58 grafana_ini
= self
.mgr
.template
.render(
59 'services/grafana/grafana.ini.j2', {
60 'http_port': daemon_spec
.ports
[0] if daemon_spec
.ports
else self
.DEFAULT_SERVICE_PORT
,
61 'http_addr': daemon_spec
.ip
if daemon_spec
.ip
else ''
66 "grafana.ini": grafana_ini
,
67 'provisioning/datasources/ceph-dashboard.yml': grafana_data_sources
,
68 'certs/cert_file': '# generated by cephadm\n%s' % cert
,
69 'certs/cert_key': '# generated by cephadm\n%s' % pkey
,
72 return config_file
, sorted(deps
)
74 def get_active_daemon(self
, daemon_descrs
: List
[DaemonDescription
]) -> DaemonDescription
:
75 # Use the least-created one as the active daemon
77 return daemon_descrs
[-1]
78 # if empty list provided, return empty Daemon Desc
79 return DaemonDescription()
81 def config_dashboard(self
, daemon_descrs
: List
[DaemonDescription
]) -> None:
83 dd
= self
.get_active_daemon(daemon_descrs
)
84 assert dd
.hostname
is not None
85 addr
= dd
.ip
if dd
.ip
else self
._inventory
_get
_addr
(dd
.hostname
)
86 port
= dd
.ports
[0] if dd
.ports
else self
.DEFAULT_SERVICE_PORT
87 service_url
= build_url(scheme
='https', host
=addr
, port
=port
)
88 self
._set
_service
_url
_on
_dashboard
(
90 'dashboard get-grafana-api-url',
91 'dashboard set-grafana-api-url',
96 daemon_ids
: List
[str],
98 known
: Optional
[List
[str]] = None) -> HandleCommandResult
:
99 warn
, warn_message
= self
._enough
_daemons
_to
_stop
(self
.TYPE
, daemon_ids
, 'Grafana', 1)
100 if warn
and not force
:
101 return HandleCommandResult(-errno
.EBUSY
, '', warn_message
)
102 return HandleCommandResult(0, warn_message
, '')
105 class AlertmanagerService(CephadmService
):
106 TYPE
= 'alertmanager'
107 DEFAULT_SERVICE_PORT
= 9093
109 def prepare_create(self
, daemon_spec
: CephadmDaemonDeploySpec
) -> CephadmDaemonDeploySpec
:
110 assert self
.TYPE
== daemon_spec
.daemon_type
111 daemon_spec
.final_config
, daemon_spec
.deps
= self
.generate_config(daemon_spec
)
114 def generate_config(self
, daemon_spec
: CephadmDaemonDeploySpec
) -> Tuple
[Dict
[str, Any
], List
[str]]:
115 assert self
.TYPE
== daemon_spec
.daemon_type
117 default_webhook_urls
: List
[str] = []
119 spec
= cast(AlertManagerSpec
, self
.mgr
.spec_store
[daemon_spec
.service_name
].spec
)
120 user_data
= spec
.user_data
121 if 'default_webhook_urls' in user_data
and isinstance(
122 user_data
['default_webhook_urls'], list):
123 default_webhook_urls
.extend(user_data
['default_webhook_urls'])
126 dashboard_urls
: List
[str] = []
127 mgr_map
= self
.mgr
.get('mgr_map')
129 proto
= None # http: or https:
130 url
= mgr_map
.get('services', {}).get('dashboard', None)
132 dashboard_urls
.append(url
)
133 proto
= url
.split('/')[0]
134 port
= url
.split('/')[2].split(':')[1]
135 # scan all mgrs to generate deps and to get standbys too.
136 # assume that they are all on the same port as the active mgr.
137 for dd
in self
.mgr
.cache
.get_daemons_by_service('mgr'):
138 # we consider mgr a dep even if the dashboard is disabled
139 # in order to be consistent with _calc_daemon_deps().
140 deps
.append(dd
.name())
143 if dd
.daemon_id
== self
.mgr
.get_mgr_id():
145 assert dd
.hostname
is not None
146 addr
= self
.mgr
.inventory
.get_addr(dd
.hostname
)
147 dashboard_urls
.append('%s//%s:%s/' % (proto
, addr
.split(':')[0],
151 'dashboard_urls': dashboard_urls
,
152 'default_webhook_urls': default_webhook_urls
154 yml
= self
.mgr
.template
.render('services/alertmanager/alertmanager.yml.j2', context
)
158 for dd
in self
.mgr
.cache
.get_daemons_by_service('alertmanager'):
159 assert dd
.hostname
is not None
160 deps
.append(dd
.name())
161 addr
= self
.mgr
.inventory
.get_addr(dd
.hostname
)
162 peers
.append(addr
.split(':')[0] + ':' + port
)
165 "alertmanager.yml": yml
170 def get_active_daemon(self
, daemon_descrs
: List
[DaemonDescription
]) -> DaemonDescription
:
171 # TODO: if there are multiple daemons, who is the active one?
173 return daemon_descrs
[0]
174 # if empty list provided, return empty Daemon Desc
175 return DaemonDescription()
177 def config_dashboard(self
, daemon_descrs
: List
[DaemonDescription
]) -> None:
178 dd
= self
.get_active_daemon(daemon_descrs
)
179 assert dd
.hostname
is not None
180 addr
= dd
.ip
if dd
.ip
else self
._inventory
_get
_addr
(dd
.hostname
)
181 port
= dd
.ports
[0] if dd
.ports
else self
.DEFAULT_SERVICE_PORT
182 service_url
= 'http://{}:{}'.format(addr
, port
)
183 self
._set
_service
_url
_on
_dashboard
(
185 'dashboard get-alertmanager-api-host',
186 'dashboard set-alertmanager-api-host',
191 daemon_ids
: List
[str],
193 known
: Optional
[List
[str]] = None) -> HandleCommandResult
:
194 warn
, warn_message
= self
._enough
_daemons
_to
_stop
(self
.TYPE
, daemon_ids
, 'Alertmanager', 1)
195 if warn
and not force
:
196 return HandleCommandResult(-errno
.EBUSY
, '', warn_message
)
197 return HandleCommandResult(0, warn_message
, '')
200 class PrometheusService(CephadmService
):
202 DEFAULT_SERVICE_PORT
= 9095
204 def config(self
, spec
: ServiceSpec
) -> None:
205 # make sure module is enabled
206 mgr_map
= self
.mgr
.get('mgr_map')
207 if 'prometheus' not in mgr_map
.get('services', {}):
208 self
.mgr
.check_mon_command({
209 'prefix': 'mgr module enable',
210 'module': 'prometheus'
212 # we shouldn't get here (mon will tell the mgr to respawn), but no
213 # harm done if we do.
217 daemon_spec
: CephadmDaemonDeploySpec
,
218 ) -> CephadmDaemonDeploySpec
:
219 assert self
.TYPE
== daemon_spec
.daemon_type
220 daemon_spec
.final_config
, daemon_spec
.deps
= self
.generate_config(daemon_spec
)
225 daemon_spec
: CephadmDaemonDeploySpec
,
226 ) -> Tuple
[Dict
[str, Any
], List
[str]]:
227 assert self
.TYPE
== daemon_spec
.daemon_type
228 deps
= [] # type: List[str]
232 mgr_map
= self
.mgr
.get('mgr_map')
234 t
= mgr_map
.get('services', {}).get('prometheus', None)
237 mgr_scrape_list
.append(t
)
240 port
= t
.split(':')[1]
241 # scan all mgrs to generate deps and to get standbys too.
242 # assume that they are all on the same port as the active mgr.
243 for dd
in self
.mgr
.cache
.get_daemons_by_service('mgr'):
244 # we consider the mgr a dep even if the prometheus module is
245 # disabled in order to be consistent with _calc_daemon_deps().
246 deps
.append(dd
.name())
249 if dd
.daemon_id
== self
.mgr
.get_mgr_id():
251 assert dd
.hostname
is not None
252 addr
= self
.mgr
.inventory
.get_addr(dd
.hostname
)
253 mgr_scrape_list
.append(addr
.split(':')[0] + ':' + port
)
255 # scrape node exporters
257 for dd
in self
.mgr
.cache
.get_daemons_by_service('node-exporter'):
258 assert dd
.hostname
is not None
259 deps
.append(dd
.name())
260 addr
= dd
.ip
if dd
.ip
else self
.mgr
.inventory
.get_addr(dd
.hostname
)
261 port
= str(dd
.ports
[0]) if dd
.ports
else '9100'
263 'hostname': dd
.hostname
,
264 'url': addr
.split(':')[0] + ':' + port
267 # scrape alert managers
268 alertmgr_targets
= []
269 for dd
in self
.mgr
.cache
.get_daemons_by_service('alertmanager'):
270 assert dd
.hostname
is not None
271 deps
.append(dd
.name())
272 addr
= dd
.ip
if dd
.ip
else self
.mgr
.inventory
.get_addr(dd
.hostname
)
273 port
= str(dd
.ports
[0]) if dd
.ports
else '9093'
274 alertmgr_targets
.append("'{}:{}'".format(addr
.split(':')[0], port
))
278 for dd
in self
.mgr
.cache
.get_daemons_by_type('ingress'):
279 if dd
.service_name() in self
.mgr
.spec_store
:
280 spec
= cast(IngressSpec
, self
.mgr
.spec_store
[dd
.service_name()].spec
)
281 assert dd
.hostname
is not None
282 deps
.append(dd
.name())
283 if dd
.daemon_type
== 'haproxy':
284 addr
= self
.mgr
.inventory
.get_addr(dd
.hostname
)
285 haproxy_targets
.append({
286 "url": f
"'{addr.split(':')[0]}:{spec.monitor_port}'",
287 "service": dd
.service_name(),
290 # generate the prometheus configuration
292 'alertmgr_targets': alertmgr_targets
,
293 'mgr_scrape_list': mgr_scrape_list
,
294 'haproxy_targets': haproxy_targets
,
300 self
.mgr
.template
.render(
301 'services/prometheus/prometheus.yml.j2', context
)
305 # include alerts, if present in the container
306 if os
.path
.exists(self
.mgr
.prometheus_alerts_path
):
307 with
open(self
.mgr
.prometheus_alerts_path
, 'r', encoding
='utf-8') as f
:
309 r
['files']['/etc/prometheus/alerting/ceph_alerts.yml'] = alerts
311 return r
, sorted(deps
)
313 def get_active_daemon(self
, daemon_descrs
: List
[DaemonDescription
]) -> DaemonDescription
:
314 # TODO: if there are multiple daemons, who is the active one?
316 return daemon_descrs
[0]
317 # if empty list provided, return empty Daemon Desc
318 return DaemonDescription()
320 def config_dashboard(self
, daemon_descrs
: List
[DaemonDescription
]) -> None:
321 dd
= self
.get_active_daemon(daemon_descrs
)
322 assert dd
.hostname
is not None
323 addr
= dd
.ip
if dd
.ip
else self
._inventory
_get
_addr
(dd
.hostname
)
324 port
= dd
.ports
[0] if dd
.ports
else self
.DEFAULT_SERVICE_PORT
325 service_url
= 'http://{}:{}'.format(addr
, port
)
326 self
._set
_service
_url
_on
_dashboard
(
328 'dashboard get-prometheus-api-host',
329 'dashboard set-prometheus-api-host',
334 daemon_ids
: List
[str],
336 known
: Optional
[List
[str]] = None) -> HandleCommandResult
:
337 warn
, warn_message
= self
._enough
_daemons
_to
_stop
(self
.TYPE
, daemon_ids
, 'Prometheus', 1)
338 if warn
and not force
:
339 return HandleCommandResult(-errno
.EBUSY
, '', warn_message
)
340 return HandleCommandResult(0, warn_message
, '')
343 class NodeExporterService(CephadmService
):
344 TYPE
= 'node-exporter'
346 def prepare_create(self
, daemon_spec
: CephadmDaemonDeploySpec
) -> CephadmDaemonDeploySpec
:
347 assert self
.TYPE
== daemon_spec
.daemon_type
348 daemon_spec
.final_config
, daemon_spec
.deps
= self
.generate_config(daemon_spec
)
351 def generate_config(self
, daemon_spec
: CephadmDaemonDeploySpec
) -> Tuple
[Dict
[str, Any
], List
[str]]:
352 assert self
.TYPE
== daemon_spec
.daemon_type
356 daemon_ids
: List
[str],
358 known
: Optional
[List
[str]] = None) -> HandleCommandResult
:
359 # since node exporter runs on each host and cannot compromise data, no extra checks required
360 names
= [f
'{self.TYPE}.{d_id}' for d_id
in daemon_ids
]
361 out
= f
'It is presumed safe to stop {names}'
362 return HandleCommandResult(0, out
, '')