]> git.proxmox.com Git - ceph.git/blob - ceph/src/pybind/mgr/cephadm/configchecks.py
update source to Ceph Pacific 16.2.2
[ceph.git] / ceph / src / pybind / mgr / cephadm / configchecks.py
1 import json
2 import ipaddress
3 import logging
4
5 from mgr_module import ServiceInfoT
6
7 from typing import TYPE_CHECKING, Any, Dict, List, Optional, cast, Tuple, Callable
8
9 if TYPE_CHECKING:
10 from cephadm.module import CephadmOrchestrator
11
12 logger = logging.getLogger(__name__)
13
14
class HostFacts:
    """Per-host facts, populated from a dict via :meth:`load_facts`.

    Every public attribute starts out as ``None``. ``_valid`` records whether
    the most recent :meth:`load_facts` call supplied all of them.
    """

    def __init__(self) -> None:
        self.arch: Optional[str] = None
        self.bios_date: Optional[str] = None
        self.bios_version: Optional[str] = None
        self.cpu_cores: Optional[int] = None
        self.cpu_count: Optional[int] = None
        self.cpu_load: Optional[Dict[str, float]] = None
        self.cpu_model: Optional[str] = None
        self.cpu_threads: Optional[int] = None
        self.flash_capacity: Optional[str] = None
        self.flash_capacity_bytes: Optional[int] = None
        self.flash_count: Optional[int] = None
        self.flash_list: Optional[List[Dict[str, Any]]] = None
        self.hdd_capacity: Optional[str] = None
        self.hdd_capacity_bytes: Optional[int] = None
        self.hdd_count: Optional[int] = None
        self.hdd_list: Optional[List[Dict[str, Any]]] = None
        self.hostname: Optional[str] = None
        self.interfaces: Optional[Dict[str, Dict[str, Any]]] = None
        self.kernel: Optional[str] = None
        self.kernel_parameters: Optional[Dict[str, Any]] = None
        self.kernel_security: Optional[Dict[str, str]] = None
        self.memory_available_kb: Optional[int] = None
        self.memory_free_kb: Optional[int] = None
        self.memory_total_kb: Optional[int] = None
        self.model: Optional[str] = None
        self.nic_count: Optional[int] = None
        self.operating_system: Optional[str] = None
        self.subscribed: Optional[str] = None
        self.system_uptime: Optional[float] = None
        self.timestamp: Optional[float] = None
        self.vendor: Optional[str] = None
        self._valid = False

    def load_facts(self, json_data: Dict[str, Any]) -> None:
        """Copy the fact values from ``json_data`` onto this object.

        The data is accepted only when it is a dict containing a key for every
        public attribute; otherwise nothing is copied and ``_valid`` is False.
        Keys that are not public fact attributes are ignored, so input such as
        ``{"load_facts": ...}`` or ``{"_valid": ...}`` can never overwrite a
        method or private state.
        """
        if not isinstance(json_data, dict):
            self._valid = False
            return

        fields = [name for name in vars(self) if not name.startswith('_')]
        self._valid = all(name in json_data for name in fields)
        if self._valid:
            for name in fields:
                setattr(self, name, json_data[name])

    def subnet_to_nic(self, subnet: str) -> Optional[str]:
        """Return the first interface with an address inside ``subnet``.

        ``subnet`` is parsed with ``ipaddress.ip_network`` (so a malformed
        value raises ``ValueError``). Returns ``None`` when no interface
        matches or when no interface data has been loaded.
        """
        # parse once; the network does not change while we scan the NICs
        network = ipaddress.ip_network(subnet)
        logger.debug(f"subnet {subnet} is IP version {network.version}")
        addr_key = 'ipv4_address' if network.version == 4 else 'ipv6_address'

        for iface, details in (self.interfaces or {}).items():
            addr = details.get(addr_key, '')
            # addresses are stored with a prefix length, e.g. 10.0.0.5/24
            if addr and ipaddress.ip_address(addr.split('/')[0]) in network:
                return iface
        return None
82
83
class SubnetLookup:
    """Hosts seen on one subnet, grouped by NIC MTU and by NIC link speed."""

    def __init__(self, subnet: str, hostname: str, mtu: str, speed: str):
        self.subnet = subnet
        # MTU value -> hosts with a NIC on this subnet using that MTU
        self.mtu_map = {
            mtu: [hostname]
        }
        # link speed -> hosts with a NIC on this subnet running at that speed
        self.speed_map = {
            speed: [hostname]
        }

    @property
    def host_list(self) -> List[str]:
        """Every host recorded in ``mtu_map``, group by group."""
        hosts: List[str] = []
        for members in self.mtu_map.values():
            hosts.extend(members)
        return hosts

    def update(self, hostname: str, mtu: str, speed: str) -> None:
        """Record ``hostname`` under the given MTU and link speed.

        Recording the same host twice for the same value is a no-op. The
        previous implementation replaced the whole host list in that case,
        silently dropping every other host already recorded.
        """
        for mapping, key in ((self.mtu_map, mtu), (self.speed_map, speed)):
            members = mapping.setdefault(key, [])
            if hostname not in members:
                members.append(hostname)

    def __repr__(self) -> str:
        return json.dumps({
            "subnet": self.subnet,
            "mtu_map": self.mtu_map,
            "speed_map": self.speed_map
        })
118
119
class CephadmCheckDefinition:
    """One configuration check: its health-check code, name and callable."""

    def __init__(self, mgr: "CephadmOrchestrator", healthcheck_name: str, description: str, name: str, func: Callable) -> None:
        self.mgr = mgr
        self.log = logger
        self.healthcheck_name = healthcheck_name
        self.description = description
        self.name = name
        self.func = func

    @property
    def status(self) -> str:
        """State stored for this check under the 'config_checks' key.

        Returns "Unknown" when the key is unset, "Unavailable" when its value
        is not a JSON object, and "Missing" when the object has no entry for
        this check's ``name``.
        """
        # Issuing a get each time, since the value could be set at the CLI
        raw_states = self.mgr.get_store('config_checks')
        if not raw_states:
            self.log.error(
                "config_checks setting is not defined - unable to determine healthcheck state")
            return "Unknown"

        check_states: Any = None
        try:
            check_states = json.loads(raw_states)
        except json.JSONDecodeError:
            pass

        # valid JSON that is not an object (list, number, null, ...) is just
        # as unusable as undecodable text
        if not isinstance(check_states, dict):
            self.log.error("Unable to serialize the config_checks settings to JSON")
            return "Unavailable"

        return check_states.get(self.name, 'Missing')

    def to_json(self) -> Dict[str, Any]:
        """Return a JSON-serialisable description of this check."""
        return {
            "healthcheck_name": self.healthcheck_name,
            "description": self.description,
            "name": self.name,
            "status": self.status,
            "valid": bool(self.func)
        }
155
156
157 class CephadmConfigChecks:
def __init__(self, mgr: "CephadmOrchestrator"):
    """Define the supported checks and reconcile the stored 'config_checks'.

    The stored setting is seeded with defaults when it is absent, cannot be
    decoded, or decodes to something other than a JSON object. Otherwise
    entries for checks this module no longer defines are removed and newly
    defined checks are added as 'enabled'.
    """
    self.mgr: "CephadmOrchestrator" = mgr

    # (health check code, description, check name, implementation)
    definitions = [
        ("CEPHADM_CHECK_KERNEL_LSM",
         "checks SELINUX/Apparmor profiles are consistent across cluster hosts",
         "kernel_security",
         self._check_kernel_lsm),
        ("CEPHADM_CHECK_SUBSCRIPTION",
         "checks subscription states are consistent for all cluster hosts",
         "os_subscription",
         self._check_subscription),
        ("CEPHADM_CHECK_PUBLIC_MEMBERSHIP",
         "check that all hosts have a NIC on the Ceph public_netork",
         "public_network",
         self._check_public_network),
        ("CEPHADM_CHECK_MTU",
         "check that OSD hosts share a common MTU setting",
         "osd_mtu_size",
         self._check_osd_mtu),
        ("CEPHADM_CHECK_LINKSPEED",
         "check that OSD hosts share a common linkspeed",
         "osd_linkspeed",
         self._check_osd_linkspeed),
        ("CEPHADM_CHECK_NETWORK_MISSING",
         "checks that the cluster/public networks defined exist on the Ceph hosts",
         "network_missing",
         self._check_network_missing),
        ("CEPHADM_CHECK_CEPH_RELEASE",
         "check for Ceph version consistency - ceph daemons should be on the same release (unless upgrade is active)",
         "ceph_release",
         self._check_release_parity),
        ("CEPHADM_CHECK_KERNEL_VERSION",
         "checks that the MAJ.MIN of the kernel on Ceph hosts is consistent",
         "kernel_version",
         self._check_kernel_version),
    ]
    self.health_checks: List[CephadmCheckDefinition] = [
        CephadmCheckDefinition(mgr, healthcheck_name, description, name, func)
        for healthcheck_name, description, name, func in definitions
    ]
    self.log = logger
    self.host_facts: Dict[str, HostFacts] = {}
    self.subnet_lookup: Dict[str, SubnetLookup] = {}  # subnet CIDR -> SubnetLookup Object
    self.lsm_to_host: Dict[str, List[str]] = {}
    self.subscribed: Dict[str, List[str]] = {
        "yes": [],
        "no": [],
        "unknown": [],
    }
    self.host_to_role: Dict[str, List[str]] = {}
    self.kernel_to_hosts: Dict[str, List[str]] = {}

    self.public_network_list: List[str] = []
    self.cluster_network_list: List[str] = []
    self.health_check_raised = False
    self.active_checks: List[str] = []  # checks enabled and executed
    self.skipped_checks: List[str] = []  # checks enabled, but skipped due to a pre-req failure

    raw_checks = self.mgr.get_store('config_checks')
    if not raw_checks:
        # doesn't exist, so seed the checks
        self.seed_config_checks()
        return

    # setting is there, so ensure there is an entry for each of the checks that
    # this module supports (account for upgrades/changes)
    config_checks: Any = None
    try:
        config_checks = json.loads(raw_checks)
    except json.JSONDecodeError:
        pass

    # Valid JSON that is not an object (e.g. "[]" or "1") cannot be reconciled
    # either; treat it exactly like undecodable text instead of crashing on
    # .keys() below.
    if not isinstance(config_checks, dict):
        self.log.error("Unable to serialize config_checks config. Reset to defaults")
        self.seed_config_checks()
        return

    # Ensure the config_checks setting is consistent with this module
    from_config = set(config_checks)
    from_module = {c.name for c in self.health_checks}
    old_checks = from_config - from_module
    new_checks = from_module - from_config

    if old_checks:
        self.log.debug(f"old checks being removed from config_checks: {old_checks}")
        for stale in old_checks:
            del config_checks[stale]
    if new_checks:
        self.log.debug(f"new checks being added to config_checks: {new_checks}")
        for added in new_checks:
            config_checks[added] = 'enabled'

    if old_checks or new_checks:
        self.log.info(
            f"config_checks updated: {len(old_checks)} removed, {len(new_checks)} added")
        self.mgr.set_store('config_checks', json.dumps(config_checks))
    else:
        self.log.debug("config_checks match module definition")
246
def lookup_check(self, key_value: str, key_name: str = 'name') -> Optional["CephadmCheckDefinition"]:
    """Return the first defined check whose ``key_name`` attribute equals
    ``key_value``, or None when no check matches."""
    matches = (
        candidate for candidate in self.health_checks
        if getattr(candidate, key_name) == key_value
    )
    return next(matches, None)
253
@property
def defined_checks(self) -> int:
    """Number of checks held in ``health_checks``."""
    definitions = self.health_checks
    return len(definitions)
257
@property
def active_checks_count(self) -> int:
    """Number of entries currently held in ``active_checks``."""
    active = self.active_checks
    return len(active)
261
def seed_config_checks(self) -> None:
    """Store a 'config_checks' value that marks every defined check 'enabled'."""
    check_names = (definition.name for definition in self.health_checks)
    seeded = dict.fromkeys(check_names, 'enabled')
    self.mgr.set_store('config_checks', json.dumps(seeded))
265
@property
def skipped_checks_count(self) -> int:
    """Number of entries currently held in ``skipped_checks``."""
    skipped = self.skipped_checks
    return len(skipped)
269
def to_json(self) -> List[Dict[str, str]]:
    """Return the ``to_json()`` output of every defined check, in order."""
    serialised = []
    for definition in self.health_checks:
        serialised.append(definition.to_json())
    return serialised
272
def load_network_config(self) -> None:
    """Refresh the public/cluster network lists from 'config dump'.

    Each option value is split on commas; surrounding whitespace is removed
    from every entry and empty entries are dropped, so "a, b" yields
    ['a', 'b'] (an unstripped ' b' would never match a subnet key). A list
    is emptied when its option is not present in the dump, rather than
    keeping values from an earlier call.
    """
    ret, out, _err = self.mgr.check_mon_command({
        'prefix': 'config dump',
        'format': 'json'
    })
    # kept as an assert so the exception type seen by callers is unchanged
    assert ret == 0

    def _split_networks(value: str) -> List[str]:
        return [net.strip() for net in value.split(',') if net.strip()]

    cluster_networks: List[str] = []
    public_networks: List[str] = []
    for item in json.loads(out):
        # if an option appears more than once, the last entry wins
        if item['name'] == "cluster_network":
            cluster_networks = _split_networks(item['value'])
        elif item['name'] == "public_network":
            public_networks = _split_networks(item['value'])

    self.cluster_network_list = cluster_networks
    self.public_network_list = public_networks

    self.log.debug(f"public networks {self.public_network_list}")
    self.log.debug(f"cluster networks {self.cluster_network_list}")
288
def _update_subnet(self, subnet: str, hostname: str, nic: Dict[str, Any]) -> None:
    """Record ``hostname`` against ``subnet`` using the NIC's mtu and speed.

    Nothing is recorded when the NIC lacks a truthy 'mtu' or 'speed'.
    """
    mtu, speed = nic.get('mtu'), nic.get('speed')
    if not (mtu and speed):
        return

    known_subnet = self.subnet_lookup.get(subnet)
    if known_subnet:
        known_subnet.update(hostname, mtu, speed)
        return

    # first host seen on this subnet
    self.subnet_lookup[subnet] = SubnetLookup(subnet, hostname, mtu, speed)
300
def _update_subnet_lookups(self, hostname: str, devname: str, nic: Dict[str, Any]) -> None:
    """Register the NIC's IPv4 and IPv6 subnets for ``hostname``.

    A missing or empty address is skipped. An address that cannot be parsed
    (bad address *or* bad prefix length) is logged and skipped, so one
    malformed interface does not abort processing of the others.
    """
    families = (
        ('ipv4_address', ipaddress.IPv4Interface),
        ('ipv6_address', ipaddress.IPv6Interface),
    )
    for key, interface_cls in families:
        address = nic.get(key)
        if not address:
            continue
        try:
            subnet = str(interface_cls(address).network)
        except ValueError as e:
            # AddressValueError and NetmaskValueError both derive from ValueError
            self.log.exception(f"Invalid network on {hostname}, interface {devname} : {str(e)}")
            continue
        self._update_subnet(subnet, hostname, nic)
319
def hosts_with_role(self, role: str) -> List[str]:
    """Return the hosts whose entry in ``host_to_role`` contains ``role``."""
    return [
        hostname
        for hostname, assigned in self.host_to_role.items()
        if role in assigned
    ]
326
def reset(self) -> None:
    """Empty the per-run lookup maps and the three subscription buckets."""
    for lookup in (self.subnet_lookup, self.lsm_to_host):
        lookup.clear()
    for state in ('yes', 'no', 'unknown'):
        self.subscribed[state] = []
    for lookup in (self.host_to_role, self.kernel_to_hosts):
        lookup.clear()
335
def _get_majority(self, data: Dict[str, List[str]]) -> Tuple[str, int]:
    """Return ``(key, size)`` for the longest list in ``data``.

    On a tie the key seen first wins. ``('', 0)`` is returned when ``data``
    is empty or every list in it is empty.
    """
    assert isinstance(data, dict)

    best_key, best_size = '', 0
    for key, members in data.items():
        size = len(members)
        if size > best_size:
            best_key, best_size = key, size
    return best_key, best_size
346
def get_ceph_metadata(self) -> Dict[str, Optional[Dict[str, str]]]:
    """Build a map of "<type>.<id>" -> metadata for every listed service"""
    metadata_by_service: Dict[str, Optional[Dict[str, str]]] = {}

    for server in self.mgr.list_servers():
        services = cast(List[ServiceInfoT], server.get('services', []))
        for svc in services:
            if not svc:
                continue
            svc_type, svc_id = svc['type'], svc['id']
            metadata_by_service[f"{svc_type}.{svc_id}"] = self.mgr.get_metadata(svc_type, svc_id)
    return metadata_by_service
361
def _check_kernel_lsm(self) -> None:
    """Raise CEPHADM_CHECK_KERNEL_LSM when hosts report more than one kernel
    security description; clear it otherwise."""
    if len(self.lsm_to_host) <= 1:
        self.mgr.health_checks.pop('CEPHADM_CHECK_KERNEL_LSM', None)
        return

    majority_key, majority_size = self._get_majority(self.lsm_to_host)
    # everything except the majority group is reported
    outliers = dict(self.lsm_to_host)
    outliers.pop(majority_key)

    details = []
    for hosts in outliers.values():
        details.extend(
            f"{host} has inconsistent KSM settings compared to the "
            f"majority of hosts({majority_size}) in the cluster"
            for host in hosts
        )

    plural = 's' if len(details) > 1 else ''
    self.mgr.health_checks['CEPHADM_CHECK_KERNEL_LSM'] = {
        'severity': 'warning',
        'summary': f"Kernel Security Module (SELinux/AppArmor) is inconsistent for "
                   f"{len(details)} host{plural}",
        'count': len(details),
        'detail': details,
    }
    self.health_check_raised = True
385
def _check_subscription(self) -> None:
    """Raise CEPHADM_CHECK_SUBSCRIPTION when both the 'yes' and the 'no'
    subscription buckets contain hosts; clear it otherwise."""
    subscribed_hosts = self.subscribed['yes']
    unsubscribed_hosts = self.subscribed['no']

    if not (subscribed_hosts and unsubscribed_hosts):
        self.mgr.health_checks.pop('CEPHADM_CHECK_SUBSCRIPTION', None)
        return

    # inconsistent subscription states - CEPHADM_CHECK_SUBSCRIPTION
    details = [f"{host} does not have an active subscription" for host in unsubscribed_hosts]
    self.mgr.health_checks['CEPHADM_CHECK_SUBSCRIPTION'] = {
        'severity': 'warning',
        'summary': f"Support subscriptions inactive on {len(details)} host(s)"
                   f"({len(subscribed_hosts)} subscriptions active)",
        'count': len(details),
        'detail': details,
    }
    self.health_check_raised = True
402
def _check_public_network(self) -> None:
    """Raise CEPHADM_CHECK_PUBLIC_MEMBERSHIP for hosts on no public network.

    The warning is raised only when some, but not all, hosts were found on
    a public network. In every other case (all hosts found, or none found)
    the health check is cleared, so a warning from an earlier run cannot
    linger.
    """
    all_hosts: List[str] = list(self.mgr.cache.facts.keys())
    self.log.debug(f"checking public network membership for: {all_hosts}")

    known_hosts = set(all_hosts)
    reachable = set()  # hosts seen on at least one public network

    for p_net in self.public_network_list:
        self.log.debug(f"checking network {p_net}")
        subnet_data = self.subnet_lookup.get(p_net, None)
        self.log.debug(f"subnet data - {subnet_data}")

        if not subnet_data:
            continue

        for host in subnet_data.host_list:
            if host in known_hosts:
                reachable.add(host)
            else:
                self.log.debug(f"host={host}, subnet={p_net}")
                # not inside an exception handler, so log.error rather than
                # log.exception (which would append a bogus "NoneType: None")
                self.log.error(
                    "Host listed for a subnet but not present in the host facts?")

    # Ideally every host has an IP on at least one of the public networks
    hosts_remaining = [host for host in all_hosts if host not in reachable]

    if hosts_remaining and reachable:
        # public network is visible on some hosts
        details = [
            f"{host} does not have an interface on any public network" for host in hosts_remaining]

        self.mgr.health_checks['CEPHADM_CHECK_PUBLIC_MEMBERSHIP'] = {
            'severity': 'warning',
            'summary': f"Public network(s) is not directly accessible from {len(hosts_remaining)} "
                       "cluster hosts",
            'count': len(details),
            'detail': details,
        }
        self.health_check_raised = True
    else:
        self.mgr.health_checks.pop('CEPHADM_CHECK_PUBLIC_MEMBERSHIP', None)
443
def _check_osd_mtu(self) -> None:
    """Raise CEPHADM_CHECK_MTU when NICs on an OSD network disagree on MTU.

    The cluster networks are examined, or the public networks when no
    cluster network is set. For each subnet the reference MTU is the one
    used by the most OSD hosts (first seen wins a tie); hosts recorded under
    any other MTU on that subnet are reported.

    The earlier implementation never updated its running maximum and
    compared the number of hosts *missing* from each MTU group, so it ended
    up treating the last inconsistent group as the majority.
    """
    osd_hosts = set(self.hosts_with_role('osd'))
    osd_network_list = self.cluster_network_list or self.public_network_list
    mtu_errors: List[str] = []

    for osd_net in osd_network_list:
        subnet_data = self.subnet_lookup.get(osd_net, None)
        if not subnet_data:
            continue

        self.log.debug(f"processing mtu map : {json.dumps(subnet_data.mtu_map)}")

        # OSD hosts that are NOT using each MTU; fewest missing == most used
        missing_by_mtu = {
            mtu: osd_hosts.difference(host_list)
            for mtu, host_list in subnet_data.mtu_map.items()
        }
        if not any(missing_by_mtu.values()):
            # every MTU group already covers all OSD hosts
            continue

        majority_mtu = min(missing_by_mtu, key=lambda mtu: len(missing_by_mtu[mtu]))
        self.log.debug("MTU problems detected")
        self.log.debug(f"most hosts using {majority_mtu}")

        for bad_mtu, host_list in subnet_data.mtu_map.items():
            if bad_mtu == majority_mtu:
                continue
            for h in host_list:
                host = HostFacts()
                host.load_facts(self.mgr.cache.facts[h])
                mtu_errors.append(
                    f"host {h}({host.subnet_to_nic(osd_net)}) is using MTU "
                    f"{bad_mtu} on {osd_net}, NICs on other hosts use {majority_mtu}")

    if mtu_errors:
        self.mgr.health_checks['CEPHADM_CHECK_MTU'] = {
            'severity': 'warning',
            'summary': f"MTU setting inconsistent on osd network NICs on {len(mtu_errors)} host(s)",
            'count': len(mtu_errors),
            'detail': mtu_errors,
        }
        self.health_check_raised = True
    else:
        self.mgr.health_checks.pop('CEPHADM_CHECK_MTU', None)
491
def _check_osd_linkspeed(self) -> None:
    """Raise CEPHADM_CHECK_LINKSPEED for NICs slower than the common speed.

    The cluster networks are examined, or the public networks when no
    cluster network is set. For each subnet the reference speed is the one
    used by the most OSD hosts (first seen wins a tie). Hosts recorded at a
    different speed are reported unless that speed is faster than the
    reference.

    Speeds are compared numerically whenever both values convert to a
    number, so string values such as '2500' and '10000' are not ordered
    lexicographically.
    """
    def _is_faster(candidate: Any, baseline: Any) -> bool:
        try:
            return float(candidate) > float(baseline)
        except (TypeError, ValueError):
            # not numeric: fall back to the values' own ordering
            return candidate > baseline

    osd_hosts = set(self.hosts_with_role('osd'))
    osd_network_list = self.cluster_network_list or self.public_network_list

    linkspeed_errors: List[str] = []

    for osd_net in osd_network_list:
        subnet_data = self.subnet_lookup.get(osd_net, None)
        if not subnet_data:
            continue

        self.log.debug(f"processing subnet : {subnet_data}")

        # OSD hosts that are NOT at each speed; fewest missing == most used
        missing_by_speed = {
            speed: osd_hosts.difference(host_list)
            for speed, host_list in subnet_data.speed_map.items()
        }
        if not any(missing_by_speed.values()):
            # every speed group already covers all OSD hosts
            continue

        majority_speed = min(missing_by_speed, key=lambda speed: len(missing_by_speed[speed]))
        self.log.debug("linkspeed issue(s) detected")
        self.log.debug(f"most hosts using {majority_speed}")

        for bad_speed, host_list in subnet_data.speed_map.items():
            if bad_speed == majority_speed:
                continue
            if _is_faster(bad_speed, majority_speed):
                # skip speed is better than most...it can stay!
                continue
            for h in host_list:
                host = HostFacts()
                host.load_facts(self.mgr.cache.facts[h])
                linkspeed_errors.append(
                    f"host {h}({host.subnet_to_nic(osd_net)}) has linkspeed of "
                    f"{bad_speed} on {osd_net}, NICs on other hosts use {majority_speed}")

    if linkspeed_errors:
        self.mgr.health_checks['CEPHADM_CHECK_LINKSPEED'] = {
            'severity': 'warning',
            'summary': "Link speed is inconsistent on osd network NICs for "
                       f"{len(linkspeed_errors)} host(s)",
            'count': len(linkspeed_errors),
            'detail': linkspeed_errors,
        }
        self.health_check_raised = True
    else:
        self.mgr.health_checks.pop('CEPHADM_CHECK_LINKSPEED', None)
545
def _check_network_missing(self) -> None:
    """Raise CEPHADM_CHECK_NETWORK_MISSING for every configured public or
    cluster network that has no entry in ``subnet_lookup``; clear it when
    all of them are present."""
    all_networks = [*self.public_network_list, *self.cluster_network_list]

    missing_networks = []
    for subnet in all_networks:
        if self.subnet_lookup.get(subnet):
            continue
        missing_networks.append(f"{subnet} not found on any host in the cluster")
        self.log.warning(
            f"Network {subnet} has been defined, but is not present on any host")

    if not missing_networks:
        self.mgr.health_checks.pop('CEPHADM_CHECK_NETWORK_MISSING', None)
        return

    plural = 's' if len(missing_networks) > 1 else ''
    self.mgr.health_checks['CEPHADM_CHECK_NETWORK_MISSING'] = {
        'severity': 'warning',
        'summary': f"Public/cluster network{plural} defined, but can not be found on "
                   "any host",
        'count': len(missing_networks),
        'detail': missing_networks,
    }
    self.health_check_raised = True
571
def _check_release_parity(self) -> None:
    """Raise CEPHADM_CHECK_CEPH_RELEASE when service metadata reports more
    than one 'ceph_release'; clear it otherwise.

    While an upgrade is in progress the check is recorded in
    ``skipped_checks`` and nothing else is done.
    """
    if self.mgr.upgrade.upgrade_status().in_progress:
        # skip version consistency checks during an upgrade cycle
        self.skipped_checks.append('ceph_release')
        return

    services = self.get_ceph_metadata()
    self.log.debug(json.dumps(services))

    version_to_svcs: Dict[str, List[str]] = {}
    for svc, svc_metadata in services.items():
        if not svc_metadata:
            continue
        release = cast(Dict[str, str], svc_metadata).get('ceph_release', '')
        version_to_svcs.setdefault(release, []).append(svc)

    if len(version_to_svcs) <= 1:
        self.mgr.health_checks.pop('CEPHADM_CHECK_CEPH_RELEASE', None)
        return

    majority_ptr, _majority_count = self._get_majority(version_to_svcs)
    # everything not on the majority release is reported
    minority = dict(version_to_svcs)
    minority.pop(majority_ptr)

    details = [
        f"{svc} is running {release} (majority of cluster is using {majority_ptr})"
        for release, svcs in minority.items()
        for svc in svcs
    ]

    self.mgr.health_checks['CEPHADM_CHECK_CEPH_RELEASE'] = {
        'severity': 'warning',
        'summary': 'Ceph cluster running mixed ceph releases',
        'count': len(details),
        'detail': details,
    }
    self.health_check_raised = True
    self.log.warning(
        f"running with {len(version_to_svcs)} different ceph releases within this cluster")
613
def _check_kernel_version(self) -> None:
    """Raise CEPHADM_CHECK_KERNEL_VERSION when ``kernel_to_hosts`` holds more
    than one kernel version; clear it otherwise."""
    if len(self.kernel_to_hosts) <= 1:
        self.mgr.health_checks.pop('CEPHADM_CHECK_KERNEL_VERSION', None)
        return

    majority_kernel, majority_size = self._get_majority(self.kernel_to_hosts)
    # hosts on any kernel other than the majority one are reported
    outliers = dict(self.kernel_to_hosts)
    outliers.pop(majority_kernel)

    details = [
        f"host {hostname} running kernel {kernel}, majority of hosts({majority_size}) "
        f"running {majority_kernel}"
        for kernel, hostnames in outliers.items()
        for hostname in hostnames
    ]

    self.log.warning("mixed kernel versions detected")
    self.mgr.health_checks['CEPHADM_CHECK_KERNEL_VERSION'] = {
        'severity': 'warning',
        'summary': f"{len(details)} host(s) running different kernel versions",
        'count': len(details),
        'detail': details,
    }
    self.health_check_raised = True
636
def _process_hosts(self) -> None:
    """Walk the cached host facts once and fill the lookup maps.

    Hosts whose facts do not load as valid are skipped. For the others the
    LSM description, subscription state, NIC subnets, kernel MAJ.MIN and
    daemon types are recorded. A subscription state that has no bucket in
    ``subscribed`` is filed under 'unknown' instead of raising KeyError, and
    ``None`` values for kernel_security / interfaces are treated as empty.
    """
    self.log.debug(f"processing data from {len(self.mgr.cache.facts)} hosts")
    for hostname in self.mgr.cache.facts:
        host = HostFacts()
        host.load_facts(self.mgr.cache.facts[hostname])
        if not host._valid:
            self.log.warning(f"skipping {hostname} - incompatible host facts")
            continue

        # the key may be present with a null value
        kernel_lsm = cast(Dict[str, str], host.kernel_security or {})
        lsm_desc = kernel_lsm.get('description', '')
        if lsm_desc:
            self.lsm_to_host.setdefault(lsm_desc, []).append(hostname)

        subscription_state = host.subscribed.lower() if host.subscribed else None
        if subscription_state:
            if subscription_state not in self.subscribed:
                self.log.debug(
                    f"unrecognised subscription state '{subscription_state}' on {hostname}")
                subscription_state = 'unknown'
            self.subscribed.setdefault(subscription_state, []).append(hostname)

        interfaces = cast(Dict[str, Dict[str, Any]], host.interfaces or {})
        for name, nic in interfaces.items():
            if name in ['lo']:
                continue
            self._update_subnet_lookups(hostname, name, nic)

        if host.kernel:
            kernel_maj_min = '.'.join(host.kernel.split('.')[0:2])
            self.kernel_to_hosts.setdefault(kernel_maj_min, []).append(hostname)
        else:
            self.log.warning(f"Host gather facts for {hostname} is missing kernel information")

        # NOTE: if daemondescription had systemd enabled state, we could check for systemd 'tampering'
        self.host_to_role[hostname] = self.mgr.cache.get_daemon_types(hostname)
675
def run_checks(self) -> None:
    """Run every check that is not explicitly disabled and publish the result.

    Does nothing unless the 'config_checks_enabled' module option is True.
    When the stored 'config_checks' value cannot be decoded, or decodes to
    something other than a JSON object, it is ignored and all checks run.
    """
    checks_enabled = self.mgr.get_module_option('config_checks_enabled')
    if checks_enabled is not True:
        return

    self.reset()

    check_config: Dict[str, str] = {}
    checks_raw: Optional[str] = self.mgr.get_store('config_checks')
    if checks_raw:
        try:
            stored = json.loads(checks_raw)
        except json.JSONDecodeError:
            self.log.exception(
                "mgr/cephadm/config_checks is not JSON serializable - all checks will run")
        else:
            # dict.update() would raise on, or silently misread, a
            # non-object value such as 5 or ["ab"]
            if isinstance(stored, dict):
                check_config.update(stored)
            else:
                self.log.error(
                    "mgr/cephadm/config_checks is not a JSON object - all checks will run")

    # build lookup "maps" by walking the host facts, once
    self._process_hosts()

    self.health_check_raised = False
    self.active_checks = []
    self.skipped_checks = []

    # process all healthchecks that are not explicitly disabled
    for health_check in self.health_checks:
        if check_config.get(health_check.name, '') != 'disabled':
            self.active_checks.append(health_check.name)
            health_check.func()

    if self.health_check_raised:
        self.log.warning("CEPHADM checks have detected configuration anomalies")
    else:
        self.log.info(
            f"CEPHADM {self.active_checks_count}/{self.defined_checks} checks enabled "
            f"and executed ({self.skipped_checks_count} bypassed, "
            f"{self.defined_checks - self.active_checks_count} disabled). No issues detected")

    self.mgr.set_health_checks(self.mgr.health_checks)