]>
Commit | Line | Data |
---|---|---|
f67539c2 TL |
1 | import json |
2 | import ipaddress | |
3 | import logging | |
4 | ||
5 | from mgr_module import ServiceInfoT | |
6 | ||
7 | from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast, Tuple, Callable | |
8 | ||
9 | if TYPE_CHECKING: | |
10 | from cephadm.module import CephadmOrchestrator | |
11 | ||
12 | logger = logging.getLogger(__name__) | |
13 | ||
14 | ||
class HostFacts:
    """Container for the facts reported for a single host.

    Every public attribute starts as ``None`` and is filled in by
    :meth:`load_facts`.  ``_valid`` records whether the last payload handed
    to :meth:`load_facts` contained every public attribute.
    """

    def __init__(self) -> None:
        self.arch: Optional[str] = None
        self.bios_date: Optional[str] = None
        self.bios_version: Optional[str] = None
        self.cpu_cores: Optional[int] = None
        self.cpu_count: Optional[int] = None
        self.cpu_load: Optional[Dict[str, float]] = None
        self.cpu_model: Optional[str] = None
        self.cpu_threads: Optional[int] = None
        self.flash_capacity: Optional[str] = None
        self.flash_capacity_bytes: Optional[int] = None
        self.flash_count: Optional[int] = None
        self.flash_list: Optional[List[Dict[str, Any]]] = None
        self.hdd_capacity: Optional[str] = None
        self.hdd_capacity_bytes: Optional[int] = None
        self.hdd_count: Optional[int] = None
        self.hdd_list: Optional[List[Dict[str, Any]]] = None
        self.hostname: Optional[str] = None
        self.interfaces: Optional[Dict[str, Dict[str, Any]]] = None
        self.kernel: Optional[str] = None
        self.kernel_parameters: Optional[Dict[str, Any]] = None
        self.kernel_security: Optional[Dict[str, str]] = None
        self.memory_available_kb: Optional[int] = None
        self.memory_free_kb: Optional[int] = None
        self.memory_total_kb: Optional[int] = None
        self.model: Optional[str] = None
        self.nic_count: Optional[int] = None
        self.operating_system: Optional[str] = None
        self.subscribed: Optional[str] = None
        self.system_uptime: Optional[float] = None
        self.timestamp: Optional[float] = None
        self.vendor: Optional[str] = None
        self._valid = False

    def load_facts(self, json_data: Dict[str, Any]) -> None:
        """Populate the fact attributes from ``json_data``.

        The payload is accepted only when it is a dict that holds a key for
        every public attribute of this object; ``_valid`` is set accordingly.
        When the payload is rejected the attributes are left untouched.

        Only the declared fact fields are assigned.  Other keys in the
        payload are ignored, so a key that happens to match a method name or
        a private attribute (e.g. ``load_facts`` or ``_valid``) cannot
        overwrite it.
        """
        fact_names = [k for k in self.__dict__ if not k.startswith('_')]

        if not isinstance(json_data, dict) or any(k not in json_data for k in fact_names):
            self._valid = False
            return

        self._valid = True
        for k in fact_names:
            setattr(self, k, json_data[k])

    def subnet_to_nic(self, subnet: str) -> Optional[str]:
        """Return the name of the first interface with an address in ``subnet``.

        :param subnet: network in a form accepted by ``ipaddress.ip_network``
        :return: the interface name, or ``None`` when no interface matches
                 (including when no interfaces have been loaded)
        :raises ValueError: if ``subnet`` or an interface address can not be
                            parsed by the ``ipaddress`` module
        """
        # parse the subnet once instead of on every loop iteration
        network = ipaddress.ip_network(subnet)
        logger.debug(f"subnet {subnet} is IP version {network.version}")

        address_key = 'ipv4_address' if network.version == 4 else 'ipv6_address'
        # interfaces stays None until load_facts() accepted a payload
        interfaces = cast(Dict[str, Dict[str, Any]], self.interfaces) or {}
        for iface, settings in interfaces.items():
            addr = settings.get(address_key, '')
            # addresses carry a "/prefix" suffix; only the host part is compared
            if addr and ipaddress.ip_address(addr.split('/')[0]) in network:
                return iface
        return None
82 | ||
83 | ||
class SubnetLookup:
    """Track which hosts use which MTU and link speed on one subnet.

    ``mtu_map`` and ``speed_map`` map a setting value to the list of
    hostnames that reported that value on this subnet.
    """

    def __init__(self, subnet: str, hostname: str, mtu: str, speed: str):
        self.subnet = subnet
        self.mtu_map = {
            mtu: [hostname]
        }
        self.speed_map = {
            speed: [hostname]
        }

    @property
    def host_list(self) -> List[str]:
        """All hostnames recorded in ``mtu_map``, in map order."""
        hosts = []
        for mtu_hosts in self.mtu_map.values():
            hosts.extend(mtu_hosts)
        return hosts

    def update(self, hostname: str, mtu: str, speed: str) -> None:
        """Record ``hostname`` under the given MTU and speed.

        A host already listed for a value is left alone.  Previously a repeat
        registration replaced the whole list with just that host, dropping
        every other host recorded for the same value.
        """
        mtu_hosts = self.mtu_map.setdefault(mtu, [])
        if hostname not in mtu_hosts:
            mtu_hosts.append(hostname)

        speed_hosts = self.speed_map.setdefault(speed, [])
        if hostname not in speed_hosts:
            speed_hosts.append(hostname)

    def __repr__(self) -> str:
        return json.dumps({
            "subnet": self.subnet,
            "mtu_map": self.mtu_map,
            "speed_map": self.speed_map
        })
118 | ||
119 | ||
class CephadmCheckDefinition:
    """Describe one configuration check and the callable that implements it."""

    def __init__(self, mgr: "CephadmOrchestrator", healthcheck_name: str, description: str, name: str, func: Callable) -> None:
        self.mgr = mgr
        self.log = logger
        self.healthcheck_name = healthcheck_name
        self.description = description
        self.name = name
        self.func = func

    @property
    def status(self) -> str:
        """State of this check as stored under the ``config_checks`` key.

        Returns ``"Unknown"`` when the key holds nothing, ``"Unavailable"``
        when its content is not valid JSON, and ``'Missing'`` when the stored
        mapping has no entry for this check's ``name``.
        """
        # Read the store on every access, since the value could be set at the CLI
        stored = self.mgr.get_store('config_checks')
        if stored:
            try:
                states: Dict[str, str] = json.loads(stored)
            except json.JSONDecodeError:
                self.log.error("Unable to serialize the config_checks settings to JSON")
                return "Unavailable"
            return states.get(self.name, 'Missing')

        self.log.error(
            "config_checks setting is not defined - unable to determine healthcheck state")
        return "Unknown"

    def to_json(self) -> Dict[str, Any]:
        """Return a JSON-friendly summary of this check, including its status."""
        summary: Dict[str, Any] = {}
        summary["healthcheck_name"] = self.healthcheck_name
        summary["description"] = self.description
        summary["name"] = self.name
        summary["status"] = self.status
        # a check counts as valid only when a truthy callable is attached
        summary["valid"] = bool(getattr(self, 'func', None))
        return summary
155 | ||
156 | ||
class CephadmConfigChecks:
    """Run configuration consistency checks across the hosts known to ``mgr``.

    The checks are driven by the host facts held in ``mgr.cache.facts``.
    :meth:`run_checks` rebuilds a set of lookup maps from those facts and
    then calls every check that is not marked ``disabled`` in the
    ``config_checks`` store key.  Each check adds or removes its own entry in
    ``mgr.health_checks``.
    """

    def __init__(self, mgr: "CephadmOrchestrator"):
        self.mgr: "CephadmOrchestrator" = mgr
        self.health_checks: List[CephadmCheckDefinition] = [
            CephadmCheckDefinition(mgr, "CEPHADM_CHECK_KERNEL_LSM",
                                   "checks SELINUX/Apparmor profiles are consistent across cluster hosts",
                                   "kernel_security",
                                   self._check_kernel_lsm),
            CephadmCheckDefinition(mgr, "CEPHADM_CHECK_SUBSCRIPTION",
                                   "checks subscription states are consistent for all cluster hosts",
                                   "os_subscription",
                                   self._check_subscription),
            CephadmCheckDefinition(mgr, "CEPHADM_CHECK_PUBLIC_MEMBERSHIP",
                                   "check that all hosts have a NIC on the Ceph public_network",
                                   "public_network",
                                   self._check_public_network),
            CephadmCheckDefinition(mgr, "CEPHADM_CHECK_MTU",
                                   "check that OSD hosts share a common MTU setting",
                                   "osd_mtu_size",
                                   self._check_osd_mtu),
            CephadmCheckDefinition(mgr, "CEPHADM_CHECK_LINKSPEED",
                                   "check that OSD hosts share a common linkspeed",
                                   "osd_linkspeed",
                                   self._check_osd_linkspeed),
            CephadmCheckDefinition(mgr, "CEPHADM_CHECK_NETWORK_MISSING",
                                   "checks that the cluster/public networks defined exist on the Ceph hosts",
                                   "network_missing",
                                   self._check_network_missing),
            CephadmCheckDefinition(mgr, "CEPHADM_CHECK_CEPH_RELEASE",
                                   "check for Ceph version consistency - ceph daemons should be on the same release (unless upgrade is active)",
                                   "ceph_release",
                                   self._check_release_parity),
            CephadmCheckDefinition(mgr, "CEPHADM_CHECK_KERNEL_VERSION",
                                   "checks that the MAJ.MIN of the kernel on Ceph hosts is consistent",
                                   "kernel_version",
                                   self._check_kernel_version),
        ]
        self.log = logger
        self.host_facts: Dict[str, HostFacts] = {}
        self.subnet_lookup: Dict[str, SubnetLookup] = {}  # subnet CIDR -> SubnetLookup Object
        self.lsm_to_host: Dict[str, List[str]] = {}
        self.subscribed: Dict[str, List[str]] = {
            "yes": [],
            "no": [],
            "unknown": [],
        }
        self.host_to_role: Dict[str, List[str]] = {}
        self.kernel_to_hosts: Dict[str, List[str]] = {}

        self.public_network_list: List[str] = []
        self.cluster_network_list: List[str] = []
        self.health_check_raised = False
        self.active_checks: List[str] = []  # checks enabled and executed
        self.skipped_checks: List[str] = []  # checks enabled, but skipped due to a pre-req failure

        raw_checks = self.mgr.get_store('config_checks')
        if not raw_checks:
            # doesn't exist, so seed the checks
            self.seed_config_checks()
        else:
            # setting is there, so ensure there is an entry for each of the checks that
            # this module supports (account for upgrades/changes)
            try:
                config_checks = json.loads(raw_checks)
            except json.JSONDecodeError:
                self.log.error("Unable to serialize config_checks config. Reset to defaults")
                self.seed_config_checks()
            else:
                # Ensure the config_checks setting is consistent with this module
                from_config = set(config_checks.keys())
                from_module = set([c.name for c in self.health_checks])
                old_checks = from_config.difference(from_module)
                new_checks = from_module.difference(from_config)

                if old_checks:
                    self.log.debug(f"old checks being removed from config_checks: {old_checks}")
                    for i in old_checks:
                        del config_checks[i]
                if new_checks:
                    self.log.debug(f"new checks being added to config_checks: {new_checks}")
                    for i in new_checks:
                        config_checks[i] = 'enabled'

                if old_checks or new_checks:
                    self.log.info(
                        f"config_checks updated: {len(old_checks)} removed, {len(new_checks)} added")
                    self.mgr.set_store('config_checks', json.dumps(config_checks))
                else:
                    self.log.debug("config_checks match module definition")

    def lookup_check(self, key_value: str, key_name: str = 'name') -> Optional["CephadmCheckDefinition"]:
        """Return the first check whose ``key_name`` attribute equals ``key_value``."""
        for c in self.health_checks:
            if getattr(c, key_name) == key_value:
                return c
        return None

    @property
    def defined_checks(self) -> int:
        """Number of checks this module defines."""
        return len(self.health_checks)

    @property
    def active_checks_count(self) -> int:
        """Number of checks executed by the last :meth:`run_checks`."""
        return len(self.active_checks)

    def seed_config_checks(self) -> None:
        """Store a ``config_checks`` setting with every check ``enabled``."""
        defaults = {check.name: 'enabled' for check in self.health_checks}
        self.mgr.set_store('config_checks', json.dumps(defaults))

    @property
    def skipped_checks_count(self) -> int:
        """Number of checks skipped by the last :meth:`run_checks`."""
        return len(self.skipped_checks)

    def to_json(self) -> List[Dict[str, Any]]:
        """Return the JSON summary of every defined check."""
        return [check.to_json() for check in self.health_checks]

    @staticmethod
    def _split_networks(value: str) -> List[str]:
        """Split a comma separated network setting into clean entries.

        Whitespace around each entry is removed and empty entries are
        dropped, so ``"a, b"`` yields ``['a', 'b']`` and ``""`` yields ``[]``.
        """
        return [net.strip() for net in value.split(',') if net.strip()]

    def load_network_config(self) -> None:
        """Refresh the public/cluster network lists from ``config dump``."""
        ret, out, _err = self.mgr.check_mon_command({
            'prefix': 'config dump',
            'format': 'json'
        })
        assert ret == 0
        js = json.loads(out)
        for item in js:
            if item['name'] == "cluster_network":
                self.cluster_network_list = self._split_networks(item['value'])
            if item['name'] == "public_network":
                self.public_network_list = self._split_networks(item['value'])

        self.log.debug(f"public networks {self.public_network_list}")
        self.log.debug(f"cluster networks {self.cluster_network_list}")

    def _update_subnet(self, subnet: str, hostname: str, nic: Dict[str, Any]) -> None:
        """Record the NIC's mtu/speed for ``hostname`` under ``subnet``.

        NICs that report no (or a falsy) ``mtu`` or ``speed`` are ignored.
        """
        mtu = nic.get('mtu', None)
        speed = nic.get('speed', None)
        if not mtu or not speed:
            return

        this_subnet = self.subnet_lookup.get(subnet, None)
        if this_subnet:
            this_subnet.update(hostname, mtu, speed)
        else:
            self.subnet_lookup[subnet] = SubnetLookup(subnet, hostname, mtu, speed)

    def _update_subnet_lookups(self, hostname: str, devname: str, nic: Dict[str, Any]) -> None:
        """Register the IPv4 and IPv6 subnets of one NIC in ``subnet_lookup``.

        An address that can not be parsed is logged and skipped.  Both
        ``AddressValueError`` and ``NetmaskValueError`` are handled, so a bad
        prefix length no longer aborts the whole run; a NIC without an
        address key is treated like one with an empty address.
        """
        families = (
            ('ipv4_address', ipaddress.IPv4Interface),
            ('ipv6_address', ipaddress.IPv6Interface),
        )
        for address_key, interface_cls in families:
            address = nic.get(address_key)
            if not address:
                continue
            try:
                subnet = str(interface_cls(address).network)
            except (ipaddress.AddressValueError, ipaddress.NetmaskValueError) as e:
                self.log.exception(f"Invalid network on {hostname}, interface {devname} : {str(e)}")
            else:
                self._update_subnet(subnet, hostname, nic)

    def hosts_with_role(self, role: str) -> List[str]:
        """Return the hostnames whose role list contains ``role``."""
        return [hostname for hostname, roles in self.host_to_role.items() if role in roles]

    def reset(self) -> None:
        """Clear the lookup maps built by :meth:`_process_hosts`."""
        self.subnet_lookup.clear()
        self.lsm_to_host.clear()
        self.subscribed['yes'] = []
        self.subscribed['no'] = []
        self.subscribed['unknown'] = []
        self.host_to_role.clear()
        self.kernel_to_hosts.clear()

    def _get_majority(self, data: Dict[str, List[str]]) -> Tuple[str, int]:
        """Return ``(key, size)`` for the longest list in ``data``.

        The first key wins a tie; ``('', 0)`` is returned when every list is
        empty or ``data`` itself is empty.
        """
        assert isinstance(data, dict)

        majority_key = ''
        majority_count = 0
        for key in data:
            if len(data[key]) > majority_count:
                majority_count = len(data[key])
                majority_key = key
        return majority_key, majority_count

    @staticmethod
    def _majority_setting(setting_map: Dict[Any, List[str]], osd_hosts: Any) -> Optional[Any]:
        """Pick the setting shared by most OSD hosts on a subnet.

        :param setting_map: setting value -> hostnames using that value
        :param osd_hosts: iterable of OSD hostnames
        :return: ``None`` when every setting is used by every OSD host (there
                 is nothing to report), otherwise the setting that the fewest
                 OSD hosts are missing from.  The first such setting wins a
                 tie.
        """
        osd_set = set(osd_hosts)
        missing = {
            setting: osd_set.difference(hosts)
            for setting, hosts in setting_map.items()
        }
        if not any(missing.values()):
            return None
        # fewest OSD hosts missing == used by the most OSD hosts
        return min(missing, key=lambda setting: len(missing[setting]))

    @staticmethod
    def _is_faster(speed: Any, reference: Any) -> bool:
        """Return True when ``speed`` is greater than ``reference``.

        Values are compared numerically when both convert to a number, so
        that string speeds such as ``'10000'`` and ``'2500'`` are not ordered
        lexicographically.  Otherwise the plain ``>`` comparison is used.
        """
        try:
            return float(speed) > float(reference)
        except (TypeError, ValueError):
            return speed > reference

    def get_ceph_metadata(self) -> Dict[str, Optional[Dict[str, str]]]:
        """Build a map of service -> service metadata"""
        service_map: Dict[str, Optional[Dict[str, str]]] = {}

        for server in self.mgr.list_servers():
            for service in cast(List[ServiceInfoT], server.get('services', [])):
                if service:
                    service_map.update(
                        {
                            f"{service['type']}.{service['id']}":
                            self.mgr.get_metadata(service['type'], service['id'])
                        }
                    )
        return service_map

    def _check_kernel_lsm(self) -> None:
        """Warn when hosts report more than one kernel security description."""
        if len(self.lsm_to_host.keys()) > 1:

            majority_hosts_ptr, majority_hosts_count = self._get_majority(self.lsm_to_host)
            lsm_copy = self.lsm_to_host.copy()
            del lsm_copy[majority_hosts_ptr]
            details = []
            for lsm_key in lsm_copy.keys():
                for host in lsm_copy[lsm_key]:
                    details.append(
                        f"{host} has inconsistent KSM settings compared to the "
                        f"majority of hosts({majority_hosts_count}) in the cluster")
            host_sfx = 's' if len(details) > 1 else ''
            self.mgr.health_checks['CEPHADM_CHECK_KERNEL_LSM'] = {
                'severity': 'warning',
                'summary': f"Kernel Security Module (SELinux/AppArmor) is inconsistent for "
                           f"{len(details)} host{host_sfx}",
                'count': len(details),
                'detail': details,
            }
            self.health_check_raised = True
        else:
            self.mgr.health_checks.pop('CEPHADM_CHECK_KERNEL_LSM', None)

    def _check_subscription(self) -> None:
        """Warn when some hosts are subscribed and others are not."""
        if len(self.subscribed['yes']) > 0 and len(self.subscribed['no']) > 0:
            # inconsistent subscription states - CEPHADM_CHECK_SUBSCRIPTION
            details = []
            for host in self.subscribed['no']:
                details.append(f"{host} does not have an active subscription")
            self.mgr.health_checks['CEPHADM_CHECK_SUBSCRIPTION'] = {
                'severity': 'warning',
                'summary': f"Support subscriptions inactive on {len(details)} host(s)"
                           f"({len(self.subscribed['yes'])} subscriptions active)",
                'count': len(details),
                'detail': details,
            }
            self.health_check_raised = True
        else:
            self.mgr.health_checks.pop('CEPHADM_CHECK_SUBSCRIPTION', None)

    def _check_public_network(self) -> None:
        """Warn when only some hosts have an interface on a public network.

        Nothing is raised when no host at all is on a public network; in that
        case any warning left over from an earlier run is removed so that it
        does not linger with outdated details.
        """
        hosts_remaining: List[str] = list(self.mgr.cache.facts.keys())
        hosts_removed: List[str] = []
        self.log.debug(f"checking public network membership for: {hosts_remaining}")

        for p_net in self.public_network_list:
            self.log.debug(f"checking network {p_net}")
            subnet_data = self.subnet_lookup.get(p_net, None)
            self.log.debug(f"subnet data - {subnet_data}")

            if subnet_data:
                hosts_in_subnet = subnet_data.host_list
                for host in hosts_in_subnet:
                    if host in hosts_remaining:
                        hosts_remaining.remove(host)
                        hosts_removed.append(host)
                    elif host not in hosts_removed:
                        self.log.debug(f"host={host}, subnet={p_net}")
                        # no exception is active here, so log a plain error
                        self.log.error(
                            "Host listed for a subnet but not present in the host facts?")

        # Ideally all hosts will have been removed since they have an IP on at least
        # one of the public networks
        if hosts_remaining and len(hosts_remaining) != len(self.mgr.cache.facts):
            # public network is visible on some hosts
            details = [
                f"{host} does not have an interface on any public network" for host in hosts_remaining]

            self.mgr.health_checks['CEPHADM_CHECK_PUBLIC_MEMBERSHIP'] = {
                'severity': 'warning',
                'summary': f"Public network(s) is not directly accessible from {len(hosts_remaining)} "
                           "cluster hosts",
                'count': len(details),
                'detail': details,
            }
            self.health_check_raised = True
        else:
            self.mgr.health_checks.pop('CEPHADM_CHECK_PUBLIC_MEMBERSHIP', None)

    def _check_osd_mtu(self) -> None:
        """Warn about hosts whose MTU on an OSD network differs from the majority.

        The OSD networks are the cluster networks, falling back to the public
        networks when no cluster network is defined.
        """
        osd_hosts = set(self.hosts_with_role('osd'))
        osd_network_list = self.cluster_network_list or self.public_network_list
        mtu_errors: List[str] = []

        for osd_net in osd_network_list:
            subnet_data = self.subnet_lookup.get(osd_net, None)
            if not subnet_data:
                continue

            self.log.debug(f"processing mtu map : {json.dumps(subnet_data.mtu_map)}")
            mtu_ptr = self._majority_setting(subnet_data.mtu_map, osd_hosts)
            if mtu_ptr is None:
                continue

            self.log.debug("MTU problems detected")
            self.log.debug(f"most hosts using {mtu_ptr}")
            for bad_mtu, hosts in subnet_data.mtu_map.items():
                if bad_mtu == mtu_ptr:
                    continue
                for h in hosts:
                    host = HostFacts()
                    host.load_facts(self.mgr.cache.facts[h])
                    mtu_errors.append(
                        f"host {h}({host.subnet_to_nic(osd_net)}) is using MTU "
                        f"{bad_mtu} on {osd_net}, NICs on other hosts use {mtu_ptr}")

        if mtu_errors:
            self.mgr.health_checks['CEPHADM_CHECK_MTU'] = {
                'severity': 'warning',
                'summary': f"MTU setting inconsistent on osd network NICs on {len(mtu_errors)} host(s)",
                'count': len(mtu_errors),
                'detail': mtu_errors,
            }
            self.health_check_raised = True
        else:
            self.mgr.health_checks.pop('CEPHADM_CHECK_MTU', None)

    def _check_osd_linkspeed(self) -> None:
        """Warn about hosts whose link speed on an OSD network is below the majority.

        The OSD networks are the cluster networks, falling back to the public
        networks when no cluster network is defined.  Hosts that are faster
        than the majority are not reported.
        """
        osd_hosts = set(self.hosts_with_role('osd'))
        osd_network_list = self.cluster_network_list or self.public_network_list

        linkspeed_errors = []

        for osd_net in osd_network_list:
            subnet_data = self.subnet_lookup.get(osd_net, None)
            if not subnet_data:
                continue

            self.log.debug(f"processing subnet : {subnet_data}")
            speed_ptr = self._majority_setting(subnet_data.speed_map, osd_hosts)
            if speed_ptr is None:
                continue

            self.log.debug("linkspeed issue(s) detected")
            self.log.debug(f"most hosts using {speed_ptr}")
            for bad_speed, hosts in subnet_data.speed_map.items():
                if bad_speed == speed_ptr:
                    continue
                if self._is_faster(bad_speed, speed_ptr):
                    # skip speed is better than most...it can stay!
                    continue
                for h in hosts:
                    host = HostFacts()
                    host.load_facts(self.mgr.cache.facts[h])
                    linkspeed_errors.append(
                        f"host {h}({host.subnet_to_nic(osd_net)}) has linkspeed of "
                        f"{bad_speed} on {osd_net}, NICs on other hosts use {speed_ptr}")

        if linkspeed_errors:
            self.mgr.health_checks['CEPHADM_CHECK_LINKSPEED'] = {
                'severity': 'warning',
                'summary': "Link speed is inconsistent on osd network NICs for "
                           f"{len(linkspeed_errors)} host(s)",
                'count': len(linkspeed_errors),
                'detail': linkspeed_errors,
            }
            self.health_check_raised = True
        else:
            self.mgr.health_checks.pop('CEPHADM_CHECK_LINKSPEED', None)

    def _check_network_missing(self) -> None:
        """Warn about configured networks that no host has an interface on."""
        all_networks = self.public_network_list.copy()
        all_networks.extend(self.cluster_network_list)

        missing_networks = []
        for subnet in all_networks:
            subnet_data = self.subnet_lookup.get(subnet, None)

            if not subnet_data:
                missing_networks.append(f"{subnet} not found on any host in the cluster")
                self.log.warning(
                    f"Network {subnet} has been defined, but is not present on any host")

        if missing_networks:
            net_sfx = 's' if len(missing_networks) > 1 else ''
            self.mgr.health_checks['CEPHADM_CHECK_NETWORK_MISSING'] = {
                'severity': 'warning',
                'summary': f"Public/cluster network{net_sfx} defined, but can not be found on "
                           "any host",
                'count': len(missing_networks),
                'detail': missing_networks,
            }
            self.health_check_raised = True
        else:
            self.mgr.health_checks.pop('CEPHADM_CHECK_NETWORK_MISSING', None)

    def _check_release_parity(self) -> None:
        """Warn when services report more than one ``ceph_release``.

        The check is recorded as skipped while an upgrade is in progress.
        """
        upgrade_status = self.mgr.upgrade.upgrade_status()
        if upgrade_status.in_progress:
            # skip version consistency checks during an upgrade cycle
            self.skipped_checks.append('ceph_release')
            return

        services = self.get_ceph_metadata()
        self.log.debug(json.dumps(services))
        version_to_svcs: Dict[str, List[str]] = {}

        for svc in services:
            if services[svc]:
                metadata = cast(Dict[str, str], services[svc])
                v = metadata.get('ceph_release', '')
                if v in version_to_svcs:
                    version_to_svcs[v].append(svc)
                else:
                    version_to_svcs[v] = [svc]

        if len(version_to_svcs) > 1:
            majority_ptr, _majority_count = self._get_majority(version_to_svcs)
            ver_copy = version_to_svcs.copy()
            del ver_copy[majority_ptr]
            details = []
            for v in ver_copy:
                for svc in ver_copy[v]:
                    details.append(
                        f"{svc} is running {v} (majority of cluster is using {majority_ptr})")

            self.mgr.health_checks['CEPHADM_CHECK_CEPH_RELEASE'] = {
                'severity': 'warning',
                'summary': 'Ceph cluster running mixed ceph releases',
                'count': len(details),
                'detail': details,
            }
            self.health_check_raised = True
            self.log.warning(
                f"running with {len(version_to_svcs)} different ceph releases within this cluster")
        else:
            self.mgr.health_checks.pop('CEPHADM_CHECK_CEPH_RELEASE', None)

    def _check_kernel_version(self) -> None:
        """Warn when hosts run more than one MAJ.MIN kernel version."""
        if len(self.kernel_to_hosts.keys()) > 1:
            majority_hosts_ptr, majority_hosts_count = self._get_majority(self.kernel_to_hosts)
            kver_copy = self.kernel_to_hosts.copy()
            del kver_copy[majority_hosts_ptr]
            details = []
            for k in kver_copy:
                for h in kver_copy[k]:
                    details.append(
                        f"host {h} running kernel {k}, majority of hosts({majority_hosts_count}) "
                        f"running {majority_hosts_ptr}")

            self.log.warning("mixed kernel versions detected")
            self.mgr.health_checks['CEPHADM_CHECK_KERNEL_VERSION'] = {
                'severity': 'warning',
                'summary': f"{len(details)} host(s) running different kernel versions",
                'count': len(details),
                'detail': details,
            }
            self.health_check_raised = True
        else:
            self.mgr.health_checks.pop('CEPHADM_CHECK_KERNEL_VERSION', None)

    def _process_hosts(self) -> None:
        """Walk ``mgr.cache.facts`` once and fill the lookup maps.

        Hosts whose facts fail ``HostFacts.load_facts`` validation are
        skipped with a warning.
        """
        self.log.debug(f"processing data from {len(self.mgr.cache.facts)} hosts")
        for hostname in self.mgr.cache.facts:
            host = HostFacts()
            host.load_facts(self.mgr.cache.facts[hostname])
            if not host._valid:
                self.log.warning(f"skipping {hostname} - incompatible host facts")
                continue

            # validation only proves the key exists, the value may still be None
            kernel_lsm = cast(Dict[str, str], host.kernel_security) or {}
            lsm_desc = kernel_lsm.get('description', '')
            if lsm_desc:
                if lsm_desc in self.lsm_to_host:
                    self.lsm_to_host[lsm_desc].append(hostname)
                else:
                    self.lsm_to_host[lsm_desc] = [hostname]

            subscription_state = host.subscribed.lower() if host.subscribed else None
            if subscription_state:
                # any value other than the known states is filed under 'unknown'
                bucket = subscription_state if subscription_state in self.subscribed else 'unknown'
                self.subscribed[bucket].append(hostname)

            interfaces = cast(Dict[str, Dict[str, Any]], host.interfaces) or {}
            for name in interfaces.keys():
                if name in ['lo']:
                    continue
                self._update_subnet_lookups(hostname, name, interfaces[name])

            if host.kernel:
                kernel_maj_min = '.'.join(host.kernel.split('.')[0:2])
                if kernel_maj_min in self.kernel_to_hosts:
                    self.kernel_to_hosts[kernel_maj_min].append(hostname)
                else:
                    self.kernel_to_hosts[kernel_maj_min] = [hostname]
            else:
                self.log.warning(f"Host gather facts for {hostname} is missing kernel information")

            # NOTE: if daemondescription had systemd enabled state, we could check for systemd 'tampering'
            self.host_to_role[hostname] = list(self.mgr.cache.get_daemon_types(hostname))

    def run_checks(self) -> None:
        """Run every check that is not disabled and publish the health checks.

        Does nothing unless ``mgr.config_checks_enabled`` is exactly ``True``.
        """
        checks_enabled = self.mgr.config_checks_enabled
        if checks_enabled is not True:
            return

        self.reset()

        check_config: Dict[str, str] = {}
        checks_raw: Optional[str] = self.mgr.get_store('config_checks')
        if checks_raw:
            try:
                check_config.update(json.loads(checks_raw))
            except json.JSONDecodeError:
                self.log.exception(
                    "mgr/cephadm/config_checks is not JSON serializable - all checks will run")

        # build lookup "maps" by walking the host facts, once
        self._process_hosts()

        self.health_check_raised = False
        self.active_checks = []
        self.skipped_checks = []

        # process all healthchecks that are not explicitly disabled
        for health_check in self.health_checks:
            if check_config.get(health_check.name, '') != 'disabled':
                self.active_checks.append(health_check.name)
                health_check.func()

        self.mgr.set_health_checks(self.mgr.health_checks)