]> git.proxmox.com Git - ceph.git/blob - ceph/src/cephadm/cephadm
buildsys: change download over to reef release
[ceph.git] / ceph / src / cephadm / cephadm
1 #!/usr/bin/python3
2
3 import asyncio
4 import asyncio.subprocess
5 import argparse
6 import datetime
7 import fcntl
8 import ipaddress
9 import io
10 import json
11 import logging
12 from logging.config import dictConfig
13 import os
14 import platform
15 import pwd
16 import random
17 import shlex
18 import shutil
19 import socket
20 import string
21 import subprocess
22 import sys
23 import tempfile
24 import time
25 import errno
26 import struct
27 import ssl
28 from enum import Enum
29 from typing import Dict, List, Tuple, Optional, Union, Any, NoReturn, Callable, IO, Sequence, TypeVar, cast, Set, Iterable, TextIO
30
31 import re
32 import uuid
33
34 from configparser import ConfigParser
35 from contextlib import redirect_stdout
36 from functools import wraps
37 from glob import glob
38 from io import StringIO
39 from threading import Thread, Event
40 from urllib.error import HTTPError, URLError
41 from urllib.request import urlopen, Request
42 from pathlib import Path
43
FuncT = TypeVar('FuncT', bound=Callable)

# Default container images -----------------------------------------------------
DEFAULT_IMAGE = 'quay.io/ceph/ceph:v17'
DEFAULT_IMAGE_IS_MASTER = False
DEFAULT_IMAGE_RELEASE = 'quincy'
DEFAULT_PROMETHEUS_IMAGE = 'quay.io/prometheus/prometheus:v2.33.4'
DEFAULT_LOKI_IMAGE = 'docker.io/grafana/loki:2.4.0'
DEFAULT_PROMTAIL_IMAGE = 'docker.io/grafana/promtail:2.4.0'
DEFAULT_NODE_EXPORTER_IMAGE = 'quay.io/prometheus/node-exporter:v1.3.1'
DEFAULT_ALERT_MANAGER_IMAGE = 'quay.io/prometheus/alertmanager:v0.23.0'
DEFAULT_GRAFANA_IMAGE = 'quay.io/ceph/ceph-grafana:8.3.5'
DEFAULT_HAPROXY_IMAGE = 'quay.io/ceph/haproxy:2.3'
DEFAULT_KEEPALIVED_IMAGE = 'quay.io/ceph/keepalived:2.1.5'
DEFAULT_SNMP_GATEWAY_IMAGE = 'docker.io/maxwo/snmp-notifier:v1.2.1'
DEFAULT_REGISTRY = 'docker.io'   # normalize unqualified digests to this
# ------------------------------------------------------------------------------

LATEST_STABLE_RELEASE = 'quincy'
# Well-known host directories used throughout this file
DATA_DIR = '/var/lib/ceph'
LOG_DIR = '/var/log/ceph'
LOCK_DIR = '/run/cephadm'
LOGROTATE_DIR = '/etc/logrotate.d'
SYSCTL_DIR = '/etc/sysctl.d'
UNIT_DIR = '/etc/systemd/system'
# Per-cluster config/keyring file names and their /etc/ceph defaults
CEPH_CONF_DIR = 'config'
CEPH_CONF = 'ceph.conf'
CEPH_PUBKEY = 'ceph.pub'
CEPH_KEYRING = 'ceph.client.admin.keyring'
CEPH_DEFAULT_CONF = f'/etc/ceph/{CEPH_CONF}'
CEPH_DEFAULT_KEYRING = f'/etc/ceph/{CEPH_KEYRING}'
CEPH_DEFAULT_PUBKEY = f'/etc/ceph/{CEPH_PUBKEY}'
# Directory permission modes
LOG_DIR_MODE = 0o770
DATA_DIR_MODE = 0o700
CONTAINER_INIT = True
# Podman version gates for specific features
MIN_PODMAN_VERSION = (2, 0, 2)
CGROUPS_SPLIT_PODMAN_VERSION = (2, 1, 0)
PIDS_LIMIT_UNLIMITED_PODMAN_VERSION = (3, 4, 1)
CUSTOM_PS1 = r'[ceph: \u@\h \W]\$ '
DEFAULT_TIMEOUT = None  # in seconds
DEFAULT_RETRY = 15
DATEFMT = '%Y-%m-%dT%H:%M:%S.%fZ'
QUIET_LOG_LEVEL = 9  # DEBUG is 10, so using 9 to be lower level than DEBUG

# Assigned by the logging setup code elsewhere in this file; None until then.
logger: logging.Logger = None  # type: ignore

"""
You can invoke cephadm in two ways:

1. The normal way, at the command line.

2. By piping the script to the python3 binary. In this latter case, you should
   prepend one or more lines to the beginning of the script.

   For arguments,

       injected_argv = [...]

   e.g.,

       injected_argv = ['ls']

   For reading stdin from the '--config-json -' argument,

       injected_stdin = '...'
"""
# Cache for stdin contents -- populated elsewhere in this file.
cached_stdin = None
111
112
113 ##################################
114
115
async def run_func(func: Callable, cmd: str) -> subprocess.CompletedProcess:
    """Invoke *func* with *cmd* and return whatever it produces.

    Declared async so that many invocations can be scheduled concurrently
    (see concurrent_tasks); the callable itself runs synchronously.
    """
    logger.debug(f'running function {func.__name__}, with parms: {cmd}')
    return func(cmd)
120
121
async def concurrent_tasks(func: Callable, cmd_list: List[str]) -> List[Any]:
    """Run *func* over every entry in *cmd_list* concurrently.

    Results are returned in the same order as *cmd_list*.
    """
    coros = (run_func(func, cmd) for cmd in cmd_list)
    return await asyncio.gather(*coros)
130
131
class EndPoint:
    """A network endpoint, rendered as 'ip:port'."""

    def __init__(self, ip: str, port: int) -> None:
        self.ip = ip      # host address
        self.port = port  # port number

    def __str__(self) -> str:
        return f'{self.ip}:{self.port}'

    def __repr__(self) -> str:
        # repr intentionally matches the string form
        return str(self)
144
145
class ContainerInfo:
    """Snapshot of a container's identity and image metadata.

    Two instances compare equal when every recorded field matches.
    """

    def __init__(self, container_id: str,
                 image_name: str,
                 image_id: str,
                 start: str,
                 version: str) -> None:
        self.container_id = container_id
        self.image_name = image_name
        self.image_id = image_id
        self.start = start
        self.version = version

    def _key(self) -> Tuple[str, str, str, str, str]:
        # All fields that participate in equality, as one tuple.
        return (self.container_id, self.image_name, self.image_id,
                self.start, self.version)

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, ContainerInfo):
            return NotImplemented
        return self._key() == other._key()
166
167
class BaseConfig:
    """Holds cephadm's global configuration knobs with their defaults."""

    def __init__(self) -> None:
        # container image / engine selection
        self.image: str = ''
        self.docker: bool = False
        self.container_init: bool = CONTAINER_INIT
        self.container_engine: Optional[ContainerEngine] = None

        # host directory layout
        self.data_dir: str = DATA_DIR
        self.log_dir: str = LOG_DIR
        self.logrotate_dir: str = LOGROTATE_DIR
        self.sysctl_dir: str = SYSCTL_DIR
        self.unit_dir: str = UNIT_DIR

        # runtime behavior
        self.verbose: bool = False
        self.timeout: Optional[int] = DEFAULT_TIMEOUT
        self.retry: int = DEFAULT_RETRY
        self.env: List[str] = []
        self.memory_request: Optional[int] = None
        self.memory_limit: Optional[int] = None
        self.log_to_journald: Optional[bool] = None

    def set_from_args(self, args: argparse.Namespace) -> None:
        """Copy onto self every attribute of *args* that BaseConfig defines."""
        for key, value in vars(args).items():
            if hasattr(self, key):
                setattr(self, key, value)
194
195
class CephadmContext:
    """Merged attribute view over a BaseConfig and the parsed CLI namespace.

    Reads and writes are proxied: the BaseConfig (_conf) is consulted
    first, then the argparse Namespace (_args), then the instance itself.
    """

    def __init__(self) -> None:
        # Write via __dict__ directly so our own __setattr__ (which consults
        # _conf/_args) is bypassed while those backing objects are created.
        self.__dict__['_args'] = None
        self.__dict__['_conf'] = BaseConfig()

    def set_args(self, args: argparse.Namespace) -> None:
        # Known config keys are copied onto _conf; the raw namespace is then
        # kept so unknown keys remain reachable through __getattr__.
        self._conf.set_from_args(args)
        self._args = args

    def has_function(self) -> bool:
        # argparse puts 'func' on the namespace when a subcommand matched.
        return 'func' in self._args

    def __contains__(self, name: str) -> bool:
        return hasattr(self, name)

    def __getattr__(self, name: str) -> Any:
        # Only reached when normal attribute lookup fails.
        # The '_conf'/'_args' __dict__ guards avoid infinite recursion
        # during unpickling or before __init__ has run.
        if '_conf' in self.__dict__ and hasattr(self._conf, name):
            return getattr(self._conf, name)
        elif '_args' in self.__dict__ and hasattr(self._args, name):
            return getattr(self._args, name)
        else:
            return super().__getattribute__(name)

    def __setattr__(self, name: str, value: Any) -> None:
        # Route the write to whichever backing object already has the
        # attribute; otherwise store it as a plain instance attribute.
        if hasattr(self._conf, name):
            setattr(self._conf, name, value)
        elif hasattr(self._args, name):
            setattr(self._args, name, value)
        else:
            super().__setattr__(name, value)
227
228
class ContainerEngine:
    """Base wrapper for a container engine binary (see Podman/Docker below)."""

    def __init__(self) -> None:
        # Resolve the engine executable via find_program (defined elsewhere
        # in this file); behavior when the binary is absent is presumably to
        # raise -- confirm against find_program.
        self.path = find_program(self.EXE)

    @classmethod
    @property
    def EXE(cls) -> str:
        # Abstract: subclasses shadow this with a plain string attribute.
        # NOTE(review): stacking classmethod over property works on
        # Python 3.9/3.10 but was deprecated in 3.11 and removed in 3.13 --
        # confirm the supported interpreter range for this script.
        raise NotImplementedError()

    def __str__(self) -> str:
        return f'{self.EXE} ({self.path})'
240
241
class Podman(ContainerEngine):
    """Podman engine wrapper; knows how to discover and cache its version."""

    EXE = 'podman'

    def __init__(self) -> None:
        super().__init__()
        # Version tuple, filled in lazily by get_version().
        self._version: Optional[Tuple[int, ...]] = None

    @property
    def version(self) -> Tuple[int, ...]:
        """Cached engine version; get_version() must have run first."""
        if self._version is None:
            raise RuntimeError('Please call `get_version` first')
        return self._version

    def get_version(self, ctx: CephadmContext) -> None:
        """Query `podman version` and cache the parsed version tuple."""
        out, _, _ = call_throws(ctx, [self.path, 'version', '--format', '{{.Client.Version}}'], verbosity=CallVerbosity.QUIET)
        self._version = _parse_podman_version(out)

    def __str__(self) -> str:
        dotted = '.'.join(str(part) for part in self.version)
        return f'{self.EXE} ({self.path}) version {dotted}'
262
263
class Docker(ContainerEngine):
    """Docker engine wrapper; path discovery comes from ContainerEngine."""
    EXE = 'docker'
266
267
CONTAINER_PREFERENCE = (Podman, Docker)  # prefer podman to docker


# During normal cephadm operations (cephadm ls, gather-facts, etc ) we use:
# stdout: for JSON output only
# stderr: for error, debug, info, etc
# Root-logger configuration fed to logging.config.dictConfig.
logging_config = {
    'version': 1,
    'disable_existing_loggers': True,
    'formatters': {
        'cephadm': {
            'format': '%(asctime)s %(thread)x %(levelname)s %(message)s'
        },
    },
    'handlers': {
        # INFO and above to the console (StreamHandler defaults to stderr).
        'console': {
            'level': 'INFO',
            'class': 'logging.StreamHandler',
        },
        # Full DEBUG trace to the cephadm log file; WatchedFileHandler copes
        # with logrotate moving the file out from under us.
        'log_file': {
            'level': 'DEBUG',
            'class': 'logging.handlers.WatchedFileHandler',
            'formatter': 'cephadm',
            'filename': '%s/cephadm.log' % LOG_DIR,
        }
    },
    'loggers': {
        '': {
            'level': 'DEBUG',
            'handlers': ['console', 'log_file'],
        }
    }
}
301
302
class ExcludeErrorsFilter(logging.Filter):
    """Logging filter that drops records at WARNING level or above.

    Used by the interactive logging config on the stdout handler so that
    warnings/errors appear only on the stderr handler.
    """

    def filter(self, record: logging.LogRecord) -> bool:
        """Only lets through log messages with log level below WARNING ."""
        keep = record.levelno < logging.WARNING
        return keep
307
308
# When cephadm is used as standard binary (bootstrap, rm-cluster, etc) we use:
# stdout: for debug and info
# stderr: for errors and warnings
# Same file handler as logging_config, but the console is split across two
# stream handlers so sub-WARNING records go to stdout and the rest to stderr.
interactive_logging_config = {
    'version': 1,
    'filters': {
        # Instantiated from the class object via the '()' dictConfig key.
        'exclude_errors': {
            '()': ExcludeErrorsFilter
        }
    },
    'disable_existing_loggers': True,
    'formatters': {
        'cephadm': {
            'format': '%(asctime)s %(thread)x %(levelname)s %(message)s'
        },
    },
    'handlers': {
        # INFO/DEBUG only -- the filter strips WARNING and above.
        'console_stdout': {
            'level': 'INFO',
            'class': 'logging.StreamHandler',
            'filters': ['exclude_errors'],
            'stream': sys.stdout
        },
        # WARNING and above go to stderr.
        'console_stderr': {
            'level': 'WARNING',
            'class': 'logging.StreamHandler',
            'stream': sys.stderr
        },
        # Full DEBUG trace to the cephadm log file.
        'log_file': {
            'level': 'DEBUG',
            'class': 'logging.handlers.WatchedFileHandler',
            'formatter': 'cephadm',
            'filename': '%s/cephadm.log' % LOG_DIR,
        }
    },
    'loggers': {
        '': {
            'level': 'DEBUG',
            'handlers': ['console_stdout', 'console_stderr', 'log_file'],
        }
    }
}
351
352
class termcolor:
    """ANSI escape sequences used to colorize terminal output."""
    yellow = '\033[93m'  # bright yellow foreground
    red = '\033[31m'     # red foreground
    end = '\033[0m'      # reset all attributes
357
358
class Error(Exception):
    """Base class for errors raised by cephadm."""
    pass
361
362
class TimeoutExpired(Error):
    """Error subtype indicating an operation timed out."""
    pass
365
366
class UnauthorizedRegistryError(Error):
    """Error subtype for container-registry authorization failures."""
    pass
369
370 ##################################
371
372
class Ceph(object):
    """Names of the Ceph daemon and gateway types cephadm manages."""
    daemons = ('mon', 'mgr', 'osd', 'mds', 'rgw', 'rbd-mirror',
               'crash', 'cephfs-mirror', 'ceph-exporter')
    gateways = ('iscsi', 'nfs')
377
378 ##################################
379
380
class OSD(object):
    """Host-level tuning associated with running OSD daemons."""

    @staticmethod
    def get_sysctl_settings() -> List[str]:
        """Return the sysctl.d lines installed on hosts that run OSDs."""
        settings = [
            '# allow a large number of OSDs',
            'fs.aio-max-nr = 1048576',
            'kernel.pid_max = 4194304',
        ]
        return settings
389
390
391 ##################################
392
393
class SNMPGateway:
    """Defines an SNMP gateway between Prometheus and SNMP monitoring Frameworks"""
    daemon_type = 'snmp-gateway'
    SUPPORTED_VERSIONS = ['V2c', 'V3']
    default_image = DEFAULT_SNMP_GATEWAY_IMAGE
    DEFAULT_PORT = 9464
    env_filename = 'snmp-gateway.conf'  # name of the env file holding secrets

    def __init__(self,
                 ctx: CephadmContext,
                 fsid: str,
                 daemon_id: Union[int, str],
                 config_json: Dict[str, Any],
                 image: Optional[str] = None) -> None:
        """Read gateway settings out of *config_json*; raises Error via validate()."""
        self.ctx = ctx
        self.fsid = fsid
        self.daemon_id = daemon_id
        self.image = image or SNMPGateway.default_image

        # ownership applied to files created for this daemon
        self.uid = config_json.get('uid', 0)
        self.gid = config_json.get('gid', 0)

        # '<ip>:<port>' of the target SNMP listener (required, see validate())
        self.destination = config_json.get('destination', '')
        self.snmp_version = config_json.get('snmp_version', 'V2c')
        self.snmp_community = config_json.get('snmp_community', 'public')
        self.log_level = config_json.get('log_level', 'info')
        # V3-only credential/protocol settings; empty strings for V2c
        self.snmp_v3_auth_username = config_json.get('snmp_v3_auth_username', '')
        self.snmp_v3_auth_password = config_json.get('snmp_v3_auth_password', '')
        self.snmp_v3_auth_protocol = config_json.get('snmp_v3_auth_protocol', '')
        self.snmp_v3_priv_protocol = config_json.get('snmp_v3_priv_protocol', '')
        self.snmp_v3_priv_password = config_json.get('snmp_v3_priv_password', '')
        self.snmp_v3_engine_id = config_json.get('snmp_v3_engine_id', '')

        self.validate()

    @classmethod
    def init(cls, ctx: CephadmContext, fsid: str,
             daemon_id: Union[int, str]) -> 'SNMPGateway':
        """Alternate constructor: build from the config-json carried on ctx."""
        assert ctx.config_json
        return cls(ctx, fsid, daemon_id,
                   get_parm(ctx.config_json), ctx.image)

    @staticmethod
    def get_version(ctx: CephadmContext, fsid: str, daemon_id: str) -> Optional[str]:
        """Return the version of the notifier from its http endpoint"""
        path = os.path.join(ctx.data_dir, fsid, f'snmp-gateway.{daemon_id}', 'unit.meta')
        try:
            with open(path, 'r') as env:
                metadata = json.loads(env.read())
        except (OSError, json.JSONDecodeError):
            # missing or unparseable unit.meta -> version unknown
            return None

        ports = metadata.get('ports', [])
        if not ports:
            return None

        try:
            with urlopen(f'http://127.0.0.1:{ports[0]}/') as r:
                html = r.read().decode('utf-8').split('\n')
        except (HTTPError, URLError):
            return None

        # Scrape the daemon's landing page: the version string is embedded
        # in a one-line <pre> block.
        for h in html:
            stripped = h.strip()
            if stripped.startswith(('<pre>', '<PRE>')) and \
               stripped.endswith(('</pre>', '</PRE>')):
                # <pre>(version=1.2.1, branch=HEAD, revision=7...
                return stripped.split(',')[0].split('version=')[1]

        return None

    @property
    def port(self) -> int:
        """First configured tcp port, or DEFAULT_PORT when none is set."""
        # NOTE(review): tcp_ports is .split() below, so it is presumably a
        # space-separated string of ports; the inner len() check is redundant
        # after the truthiness test above -- confirm.
        if not self.ctx.tcp_ports:
            return self.DEFAULT_PORT
        else:
            if len(self.ctx.tcp_ports) > 0:
                return int(self.ctx.tcp_ports.split()[0])
            else:
                return self.DEFAULT_PORT

    def get_daemon_args(self) -> List[str]:
        """Build the snmp-notifier command line from the stored settings."""
        v3_args = []
        base_args = [
            f'--web.listen-address=:{self.port}',
            f'--snmp.destination={self.destination}',
            f'--snmp.version={self.snmp_version}',
            f'--log.level={self.log_level}',
            '--snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl'
        ]

        if self.snmp_version == 'V3':
            # common auth settings
            v3_args.extend([
                '--snmp.authentication-enabled',
                f'--snmp.authentication-protocol={self.snmp_v3_auth_protocol}',
                f'--snmp.security-engine-id={self.snmp_v3_engine_id}'
            ])
            # authPriv setting is applied if we have a privacy protocol setting
            if self.snmp_v3_priv_protocol:
                v3_args.extend([
                    '--snmp.private-enabled',
                    f'--snmp.private-protocol={self.snmp_v3_priv_protocol}'
                ])

        return base_args + v3_args

    @property
    def data_dir(self) -> str:
        """Daemon data directory: <data_dir>/<fsid>/snmp-gateway.<id>."""
        return os.path.join(self.ctx.data_dir, self.ctx.fsid, f'{self.daemon_type}.{self.daemon_id}')

    @property
    def conf_file_path(self) -> str:
        """Full path of the env/secrets file inside data_dir."""
        return os.path.join(self.data_dir, self.env_filename)

    def create_daemon_conf(self) -> None:
        """Creates the environment file holding 'secrets' passed to the snmp-notifier daemon"""
        # Created 0600 via os.open since the file carries credentials.
        with open(os.open(self.conf_file_path, os.O_CREAT | os.O_WRONLY, 0o600), 'w') as f:
            if self.snmp_version == 'V2c':
                f.write(f'SNMP_NOTIFIER_COMMUNITY={self.snmp_community}\n')
            else:
                f.write(f'SNMP_NOTIFIER_AUTH_USERNAME={self.snmp_v3_auth_username}\n')
                f.write(f'SNMP_NOTIFIER_AUTH_PASSWORD={self.snmp_v3_auth_password}\n')
                if self.snmp_v3_priv_password:
                    f.write(f'SNMP_NOTIFIER_PRIV_PASSWORD={self.snmp_v3_priv_password}\n')

    def validate(self) -> None:
        """Validate the settings

        Raises:
            Error: if the fsid doesn't look like an fsid
            Error: if the snmp version is not supported
            Error: destination IP and port address missing
        """
        if not is_fsid(self.fsid):
            raise Error(f'not a valid fsid: {self.fsid}')

        if self.snmp_version not in SNMPGateway.SUPPORTED_VERSIONS:
            raise Error(f'not a valid snmp version: {self.snmp_version}')

        if not self.destination:
            raise Error('config is missing destination attribute(<ip>:<port>) of the target SNMP listener')
536
537
538 ##################################
class Monitoring(object):
    """Define the configs for the monitoring containers"""

    # Host ports used by each monitoring daemon type.
    port_map = {
        'prometheus': [9095],  # Avoid default 9090, due to conflict with cockpit UI
        'node-exporter': [9100],
        'grafana': [3000],
        'alertmanager': [9093, 9094],
        'loki': [3100],
        'promtail': [9080]
    }

    # Per-component defaults: image, resource hints, daemon args, and the
    # config files/args expected from config-json.
    components = {
        'prometheus': {
            'image': DEFAULT_PROMETHEUS_IMAGE,
            'cpus': '2',
            'memory': '4GB',
            'args': [
                '--config.file=/etc/prometheus/prometheus.yml',
                '--storage.tsdb.path=/prometheus',
            ],
            'config-json-files': [
                'prometheus.yml',
            ],
        },
        'loki': {
            'image': DEFAULT_LOKI_IMAGE,
            'cpus': '1',
            'memory': '1GB',
            'args': [
                '--config.file=/etc/loki/loki.yml',
            ],
            'config-json-files': [
                'loki.yml'
            ],
        },
        'promtail': {
            'image': DEFAULT_PROMTAIL_IMAGE,
            'cpus': '1',
            'memory': '1GB',
            'args': [
                '--config.file=/etc/promtail/promtail.yml',
            ],
            'config-json-files': [
                'promtail.yml',
            ],
        },
        'node-exporter': {
            'image': DEFAULT_NODE_EXPORTER_IMAGE,
            'cpus': '1',
            'memory': '1GB',
            'args': [
                '--no-collector.timex',
            ],
        },
        'grafana': {
            'image': DEFAULT_GRAFANA_IMAGE,
            'cpus': '2',
            'memory': '4GB',
            'args': [],
            'config-json-files': [
                'grafana.ini',
                'provisioning/datasources/ceph-dashboard.yml',
                'certs/cert_file',
                'certs/cert_key',
            ],
        },
        'alertmanager': {
            'image': DEFAULT_ALERT_MANAGER_IMAGE,
            'cpus': '2',
            'memory': '2GB',
            'args': [
                # second alertmanager port is the cluster/gossip listener
                '--cluster.listen-address=:{}'.format(port_map['alertmanager'][1]),
            ],
            'config-json-files': [
                'alertmanager.yml',
            ],
            'config-json-args': [
                'peers',
            ],
        },
    }  # type: ignore

    @staticmethod
    def get_version(ctx, container_id, daemon_type):
        # type: (CephadmContext, str, str) -> str
        """
        :param: daemon_type Either "prometheus", "alertmanager", "loki", "promtail" or "node-exporter"

        Runs `<binary> --version` inside the container and parses the
        '<cmd>, version X.Y.Z ...' line; returns '' when nothing matched.
        """
        assert daemon_type in ('prometheus', 'alertmanager', 'node-exporter', 'loki', 'promtail')
        cmd = daemon_type.replace('-', '_')
        code = -1
        err = ''
        out = ''
        version = ''
        if daemon_type == 'alertmanager':
            # The binary name differs between builds; try both.
            for cmd in ['alertmanager', 'prometheus-alertmanager']:
                out, err, code = call(ctx, [
                    ctx.container_engine.path, 'exec', container_id, cmd,
                    '--version'
                ], verbosity=CallVerbosity.QUIET)
                if code == 0:
                    break
            cmd = 'alertmanager'  # reset cmd for version extraction
        else:
            out, err, code = call(ctx, [
                ctx.container_engine.path, 'exec', container_id, cmd, '--version'
            ], verbosity=CallVerbosity.QUIET)
        if code == 0:
            # Some of these tools print '--version' on stderr, others stdout.
            if err.startswith('%s, version ' % cmd):
                version = err.split(' ')[2]
            elif out.startswith('%s, version ' % cmd):
                version = out.split(' ')[2]
        return version
653
654 ##################################
655
656
def populate_files(config_dir, config_files, uid, gid):
    # type: (str, Dict, int, int) -> None
    """create config files for different services"""
    for fname in config_files:
        dest = os.path.join(config_dir, fname)
        content = dict_get_join(config_files, fname)
        logger.info('Write file: %s' % (dest))
        with open(dest, 'w', encoding='utf-8') as f:
            # lock down ownership/permissions before writing any content
            os.fchown(f.fileno(), uid, gid)
            os.fchmod(f.fileno(), 0o600)
            f.write(content)
668
669
class NFSGanesha(object):
    """Defines a NFS-Ganesha container"""

    daemon_type = 'nfs'
    entrypoint = '/usr/bin/ganesha.nfsd'
    daemon_args = ['-F', '-L', 'STDERR']

    # files that must be supplied via config-json (see validate())
    required_files = ['ganesha.conf']

    port_map = {
        'nfs': 2049,
    }

    def __init__(self,
                 ctx,
                 fsid,
                 daemon_id,
                 config_json,
                 image=DEFAULT_IMAGE):
        # type: (CephadmContext, str, Union[int, str], Dict, str) -> None
        self.ctx = ctx
        self.fsid = fsid
        self.daemon_id = daemon_id
        self.image = image

        # config-json options
        self.pool = dict_get(config_json, 'pool', require=True)
        self.namespace = dict_get(config_json, 'namespace')
        self.userid = dict_get(config_json, 'userid')
        self.extra_args = dict_get(config_json, 'extra_args', [])
        self.files = dict_get(config_json, 'files', {})
        self.rgw = dict_get(config_json, 'rgw', {})

        # validate the supplied args
        self.validate()

    @classmethod
    def init(cls, ctx, fsid, daemon_id):
        # type: (CephadmContext, str, Union[int, str]) -> NFSGanesha
        """Alternate constructor: read options from ctx.config_json."""
        return cls(ctx, fsid, daemon_id, get_parm(ctx.config_json), ctx.image)

    def get_container_mounts(self, data_dir):
        # type: (str) -> Dict[str, str]
        """Map host paths under *data_dir* to in-container mount points."""
        mounts = dict()
        mounts[os.path.join(data_dir, 'config')] = '/etc/ceph/ceph.conf:z'
        mounts[os.path.join(data_dir, 'keyring')] = '/etc/ceph/keyring:z'
        mounts[os.path.join(data_dir, 'etc/ganesha')] = '/etc/ganesha:z'
        if self.rgw:
            cluster = self.rgw.get('cluster', 'ceph')
            rgw_user = self.rgw.get('user', 'admin')
            mounts[os.path.join(data_dir, 'keyring.rgw')] = \
                '/var/lib/ceph/radosgw/%s-%s/keyring:z' % (cluster, rgw_user)
        return mounts

    @staticmethod
    def get_container_envs():
        # type: () -> List[str]
        """Environment variables injected into the ganesha container."""
        envs = [
            'CEPH_CONF=%s' % (CEPH_DEFAULT_CONF)
        ]
        return envs

    @staticmethod
    def get_version(ctx, container_id):
        # type: (CephadmContext, str) -> Optional[str]
        """Parse the release number out of `ganesha.nfsd -v`, or None."""
        version = None
        out, err, code = call(ctx,
                              [ctx.container_engine.path, 'exec', container_id,
                               NFSGanesha.entrypoint, '-v'],
                              verbosity=CallVerbosity.QUIET)
        if code == 0:
            match = re.search(r'NFS-Ganesha Release\s*=\s*[V]*([\d.]+)', out)
            if match:
                version = match.group(1)
        return version

    def validate(self):
        # type: () -> None
        """Check identity, image and config-json contents; raises Error."""
        if not is_fsid(self.fsid):
            raise Error('not an fsid: %s' % self.fsid)
        if not self.daemon_id:
            raise Error('invalid daemon_id: %s' % self.daemon_id)
        if not self.image:
            raise Error('invalid image: %s' % self.image)

        # check for the required files
        if self.required_files:
            for fname in self.required_files:
                if fname not in self.files:
                    raise Error('required file missing from config-json: %s' % fname)

        # check for an RGW config
        if self.rgw:
            if not self.rgw.get('keyring'):
                raise Error('RGW keyring is missing')
            if not self.rgw.get('user'):
                raise Error('RGW user is missing')

    def get_daemon_name(self):
        # type: () -> str
        return '%s.%s' % (self.daemon_type, self.daemon_id)

    def get_container_name(self, desc=None):
        # type: (Optional[str]) -> str
        """Container name: 'ceph-<fsid>-nfs.<id>' plus optional '-<desc>'."""
        cname = 'ceph-%s-%s' % (self.fsid, self.get_daemon_name())
        if desc:
            cname = '%s-%s' % (cname, desc)
        return cname

    def get_daemon_args(self):
        # type: () -> List[str]
        # fixed args plus any extra_args from config-json
        return self.daemon_args + self.extra_args

    def create_daemon_dirs(self, data_dir, uid, gid):
        # type: (str, int, int) -> None
        """Create files under the container data dir"""
        if not os.path.isdir(data_dir):
            raise OSError('data_dir is not a directory: %s' % (data_dir))

        logger.info('Creating ganesha config...')

        # create the ganesha conf dir
        config_dir = os.path.join(data_dir, 'etc/ganesha')
        makedirs(config_dir, uid, gid, 0o755)

        # populate files from the config-json
        populate_files(config_dir, self.files, uid, gid)

        # write the RGW keyring
        if self.rgw:
            keyring_path = os.path.join(data_dir, 'keyring.rgw')
            with open(keyring_path, 'w') as f:
                os.fchmod(f.fileno(), 0o600)
                os.fchown(f.fileno(), uid, gid)
                f.write(self.rgw.get('keyring', ''))
805
806 ##################################
807
808
class CephIscsi(object):
    """Defines a Ceph-Iscsi container"""

    daemon_type = 'iscsi'
    entrypoint = '/usr/bin/rbd-target-api'

    # files that must be supplied via config-json (see validate())
    required_files = ['iscsi-gateway.cfg']

    def __init__(self,
                 ctx,
                 fsid,
                 daemon_id,
                 config_json,
                 image=DEFAULT_IMAGE):
        # type: (CephadmContext, str, Union[int, str], Dict, str) -> None
        self.ctx = ctx
        self.fsid = fsid
        self.daemon_id = daemon_id
        self.image = image

        # config-json options
        self.files = dict_get(config_json, 'files', {})

        # validate the supplied args
        self.validate()

    @classmethod
    def init(cls, ctx, fsid, daemon_id):
        # type: (CephadmContext, str, Union[int, str]) -> CephIscsi
        """Alternate constructor: read options from ctx.config_json."""
        return cls(ctx, fsid, daemon_id,
                   get_parm(ctx.config_json), ctx.image)

    @staticmethod
    def get_container_mounts(data_dir, log_dir):
        # type: (str, str) -> Dict[str, str]
        """Host paths mounted into the iscsi container."""
        mounts = dict()
        mounts[os.path.join(data_dir, 'config')] = '/etc/ceph/ceph.conf:z'
        mounts[os.path.join(data_dir, 'keyring')] = '/etc/ceph/keyring:z'
        mounts[os.path.join(data_dir, 'iscsi-gateway.cfg')] = '/etc/ceph/iscsi-gateway.cfg:z'
        mounts[os.path.join(data_dir, 'configfs')] = '/sys/kernel/config'
        mounts[log_dir] = '/var/log:z'
        mounts['/dev'] = '/dev'
        return mounts

    @staticmethod
    def get_container_binds():
        # type: () -> List[List[str]]
        """Extra bind mounts: the host's kernel modules, read-only."""
        binds = []
        lib_modules = ['type=bind',
                       'source=/lib/modules',
                       'destination=/lib/modules',
                       'ro=true']
        binds.append(lib_modules)
        return binds

    @staticmethod
    def get_version(ctx, container_id):
        # type: (CephadmContext, str) -> Optional[str]
        """Query the installed ceph_iscsi python package's version, or None."""
        version = None
        out, err, code = call(ctx,
                              [ctx.container_engine.path, 'exec', container_id,
                               '/usr/bin/python3', '-c', "import pkg_resources; print(pkg_resources.require('ceph_iscsi')[0].version)"],
                              verbosity=CallVerbosity.QUIET)
        if code == 0:
            version = out.strip()
        return version

    def validate(self):
        # type: () -> None
        """Check identity, image and required config-json files; raises Error."""
        if not is_fsid(self.fsid):
            raise Error('not an fsid: %s' % self.fsid)
        if not self.daemon_id:
            raise Error('invalid daemon_id: %s' % self.daemon_id)
        if not self.image:
            raise Error('invalid image: %s' % self.image)

        # check for the required files
        if self.required_files:
            for fname in self.required_files:
                if fname not in self.files:
                    raise Error('required file missing from config-json: %s' % fname)

    def get_daemon_name(self):
        # type: () -> str
        return '%s.%s' % (self.daemon_type, self.daemon_id)

    def get_container_name(self, desc=None):
        # type: (Optional[str]) -> str
        """Container name: 'ceph-<fsid>-iscsi.<id>' plus optional '-<desc>'."""
        cname = 'ceph-%s-%s' % (self.fsid, self.get_daemon_name())
        if desc:
            cname = '%s-%s' % (cname, desc)
        return cname

    def create_daemon_dirs(self, data_dir, uid, gid):
        # type: (str, int, int) -> None
        """Create files under the container data dir"""
        if not os.path.isdir(data_dir):
            raise OSError('data_dir is not a directory: %s' % (data_dir))

        logger.info('Creating ceph-iscsi config...')
        configfs_dir = os.path.join(data_dir, 'configfs')
        makedirs(configfs_dir, uid, gid, 0o755)

        # populate files from the config-json
        populate_files(data_dir, self.files, uid, gid)

    @staticmethod
    def configfs_mount_umount(data_dir, mount=True):
        # type: (str, bool) -> List[str]
        """Shell snippet (tokenized) that mounts or unmounts the configfs dir."""
        mount_path = os.path.join(data_dir, 'configfs')
        if mount:
            cmd = 'if ! grep -qs {0} /proc/mounts; then ' \
                  'mount -t configfs none {0}; fi'.format(mount_path)
        else:
            cmd = 'if grep -qs {0} /proc/mounts; then ' \
                  'umount {0}; fi'.format(mount_path)
        # NOTE(review): naive whitespace split -- would break if mount_path
        # ever contained spaces; confirm the data dir layout guarantees not.
        return cmd.split()

    def get_tcmu_runner_container(self):
        # type: () -> CephContainer
        """Sidecar container that runs tcmu-runner alongside the iscsi daemon."""
        tcmu_container = get_container(self.ctx, self.fsid, self.daemon_type, self.daemon_id)
        tcmu_container.entrypoint = '/usr/bin/tcmu-runner'
        tcmu_container.cname = self.get_container_name(desc='tcmu')
        # remove extra container args for tcmu container.
        # extra args could cause issue with forking service type
        tcmu_container.container_args = []
        set_pids_limit_unlimited(self.ctx, tcmu_container.container_args)
        return tcmu_container
937
938 ##################################
939
940
class CephExporter(object):
    """Defines a Ceph exporter container"""

    daemon_type = 'ceph-exporter'
    entrypoint = '/usr/bin/ceph-exporter'
    DEFAULT_PORT = 9926
    port_map = {
        'ceph-exporter': DEFAULT_PORT,
    }

    def __init__(self,
                 ctx: CephadmContext,
                 fsid: str, daemon_id: Union[int, str],
                 config_json: Dict[str, Any],
                 image: str = DEFAULT_IMAGE) -> None:
        """Read exporter settings from *config_json*; validates the sock dir."""
        self.ctx = ctx
        self.fsid = fsid
        self.daemon_id = daemon_id
        self.image = image

        # all tunables are optional, with defaults
        self.sock_dir = config_json.get('sock-dir', '/var/run/ceph/')
        self.addrs = config_json.get('addrs', socket.gethostbyname(socket.gethostname()))
        self.port = config_json.get('port', self.DEFAULT_PORT)
        self.prio_limit = config_json.get('prio-limit', 5)
        self.stats_period = config_json.get('stats-period', 5)

        self.validate()

    @classmethod
    def init(cls, ctx: CephadmContext, fsid: str,
             daemon_id: Union[int, str]) -> 'CephExporter':
        """Alternate constructor: read options from ctx.config_json."""
        return cls(ctx, fsid, daemon_id,
                   get_parm(ctx.config_json), ctx.image)

    @staticmethod
    def get_container_mounts() -> Dict[str, str]:
        """Host paths mounted into the exporter container."""
        return {'/var/run/ceph': '/var/run/ceph:z'}

    def get_daemon_args(self) -> List[str]:
        """Assemble the ceph-exporter command-line flags."""
        options = {
            'sock-dir': self.sock_dir,
            'addrs': self.addrs,
            'port': self.port,
            'prio-limit': self.prio_limit,
            'stats-period': self.stats_period,
        }
        return [f'--{name}={value}' for name, value in options.items()]

    def validate(self) -> None:
        """Fail early when the configured socket directory does not exist."""
        if not os.path.isdir(self.sock_dir):
            raise Error(f'Directory does not exist. Got: {self.sock_dir}')
994
995
996 ##################################
997
998
class HAproxy(object):
    """Defines an HAproxy container"""
    daemon_type = 'haproxy'
    # files that must be supplied via config-json (see validate())
    required_files = ['haproxy.cfg']
    default_image = DEFAULT_HAPROXY_IMAGE

    def __init__(self,
                 ctx: CephadmContext,
                 fsid: str, daemon_id: Union[int, str],
                 config_json: Dict, image: str) -> None:
        self.ctx = ctx
        self.fsid = fsid
        self.daemon_id = daemon_id
        self.image = image

        # config-json options
        self.files = dict_get(config_json, 'files', {})

        self.validate()

    @classmethod
    def init(cls, ctx: CephadmContext,
             fsid: str, daemon_id: Union[int, str]) -> 'HAproxy':
        """Alternate constructor: read options from ctx.config_json."""
        return cls(ctx, fsid, daemon_id, get_parm(ctx.config_json),
                   ctx.image)

    def create_daemon_dirs(self, data_dir: str, uid: int, gid: int) -> None:
        """Create files under the container data dir"""
        if not os.path.isdir(data_dir):
            raise OSError('data_dir is not a directory: %s' % (data_dir))

        # create additional directories in data dir for HAproxy to use
        if not os.path.isdir(os.path.join(data_dir, 'haproxy')):
            makedirs(os.path.join(data_dir, 'haproxy'), uid, gid, DATA_DIR_MODE)

        # config-json files are written into the 'haproxy' subdirectory
        data_dir = os.path.join(data_dir, 'haproxy')
        populate_files(data_dir, self.files, uid, gid)

    def get_daemon_args(self) -> List[str]:
        # arguments appended after the container entrypoint
        return ['haproxy', '-f', '/var/lib/haproxy/haproxy.cfg']

    def validate(self):
        # type: () -> None
        """Check identity, image and required config-json files; raises Error."""
        if not is_fsid(self.fsid):
            raise Error('not an fsid: %s' % self.fsid)
        if not self.daemon_id:
            raise Error('invalid daemon_id: %s' % self.daemon_id)
        if not self.image:
            raise Error('invalid image: %s' % self.image)

        # check for the required files
        if self.required_files:
            for fname in self.required_files:
                if fname not in self.files:
                    raise Error('required file missing from config-json: %s' % fname)

    def get_daemon_name(self):
        # type: () -> str
        return '%s.%s' % (self.daemon_type, self.daemon_id)

    def get_container_name(self, desc=None):
        # type: (Optional[str]) -> str
        """Container name: 'ceph-<fsid>-haproxy.<id>' plus optional '-<desc>'."""
        cname = 'ceph-%s-%s' % (self.fsid, self.get_daemon_name())
        if desc:
            cname = '%s-%s' % (cname, desc)
        return cname

    def extract_uid_gid_haproxy(self) -> Tuple[int, int]:
        # better directory for this?
        return extract_uid_gid(self.ctx, file_path='/var/lib')

    @staticmethod
    def get_container_mounts(data_dir: str) -> Dict[str, str]:
        """Mount <data_dir>/haproxy at /var/lib/haproxy in the container."""
        mounts = dict()
        mounts[os.path.join(data_dir, 'haproxy')] = '/var/lib/haproxy'
        return mounts

    @staticmethod
    def get_sysctl_settings() -> List[str]:
        """sysctl.d entries enabling IP forwarding and non-local binds."""
        return [
            '# IP forwarding and non-local bind',
            'net.ipv4.ip_forward = 1',
            'net.ipv4.ip_nonlocal_bind = 1',
        ]
1083
1084 ##################################
1085
1086
class Keepalived(object):
    """Defines a Keepalived container"""
    daemon_type = 'keepalived'
    required_files = ['keepalived.conf']
    default_image = DEFAULT_KEEPALIVED_IMAGE

    def __init__(self,
                 ctx: CephadmContext,
                 fsid: str, daemon_id: Union[int, str],
                 config_json: Dict, image: str) -> None:
        self.ctx = ctx
        self.fsid = fsid
        self.daemon_id = daemon_id
        self.image = image

        # config-json options
        self.files = dict_get(config_json, 'files', {})

        self.validate()

    @classmethod
    def init(cls, ctx: CephadmContext, fsid: str,
             daemon_id: Union[int, str]) -> 'Keepalived':
        """Build a Keepalived instance from the CLI context."""
        config = get_parm(ctx.config_json)
        return cls(ctx, fsid, daemon_id, config, ctx.image)

    def create_daemon_dirs(self, data_dir: str, uid: int, gid: int) -> None:
        """Create files under the container data dir"""
        if not os.path.isdir(data_dir):
            raise OSError('data_dir is not a directory: %s' % (data_dir))

        # create additional directories in data dir for keepalived to use
        keepalived_dir = os.path.join(data_dir, 'keepalived')
        if not os.path.isdir(keepalived_dir):
            makedirs(keepalived_dir, uid, gid, DATA_DIR_MODE)

        # populate files from the config-json (into data_dir itself,
        # not into the keepalived/ subdirectory)
        populate_files(data_dir, self.files, uid, gid)

    def validate(self):
        # type: () -> None
        """Raise Error unless fsid, daemon_id, image and required files are sane."""
        if not is_fsid(self.fsid):
            raise Error('not an fsid: %s' % self.fsid)
        if not self.daemon_id:
            raise Error('invalid daemon_id: %s' % self.daemon_id)
        if not self.image:
            raise Error('invalid image: %s' % self.image)

        # every name in required_files must appear in the config-json files
        for fname in self.required_files or []:
            if fname not in self.files:
                raise Error('required file missing from config-json: %s' % fname)

    def get_daemon_name(self):
        # type: () -> str
        """Return '<daemon_type>.<daemon_id>'."""
        return '{}.{}'.format(self.daemon_type, self.daemon_id)

    def get_container_name(self, desc=None):
        # type: (Optional[str]) -> str
        """Return the container name, optionally suffixed with desc."""
        base = 'ceph-{}-{}'.format(self.fsid, self.get_daemon_name())
        return '{}-{}'.format(base, desc) if desc else base

    @staticmethod
    def get_container_envs():
        # type: () -> List[str]
        """Environment needed by the keepalived image's entrypoint."""
        return [
            'KEEPALIVED_AUTOCONF=false',
            'KEEPALIVED_CONF=/etc/keepalived/keepalived.conf',
            'KEEPALIVED_CMD=/usr/sbin/keepalived -n -l -f /etc/keepalived/keepalived.conf',
            'KEEPALIVED_DEBUG=false'
        ]

    @staticmethod
    def get_sysctl_settings() -> List[str]:
        """sysctl knobs required to serve a virtual IP from this host."""
        return [
            '# IP forwarding and non-local bind',
            'net.ipv4.ip_forward = 1',
            'net.ipv4.ip_nonlocal_bind = 1',
        ]

    def extract_uid_gid_keepalived(self) -> Tuple[int, int]:
        # better directory for this?
        return extract_uid_gid(self.ctx, file_path='/var/lib')

    @staticmethod
    def get_container_mounts(data_dir: str) -> Dict[str, str]:
        """Bind the generated keepalived.conf into the container."""
        return {
            os.path.join(data_dir, 'keepalived.conf'):
                '/etc/keepalived/keepalived.conf',
        }
1179
1180 ##################################
1181
1182
1183 class CustomContainer(object):
1184 """Defines a custom container"""
1185 daemon_type = 'container'
1186
1187 def __init__(self,
1188 fsid: str, daemon_id: Union[int, str],
1189 config_json: Dict, image: str) -> None:
1190 self.fsid = fsid
1191 self.daemon_id = daemon_id
1192 self.image = image
1193
1194 # config-json options
1195 self.entrypoint = dict_get(config_json, 'entrypoint')
1196 self.uid = dict_get(config_json, 'uid', 65534) # nobody
1197 self.gid = dict_get(config_json, 'gid', 65534) # nobody
1198 self.volume_mounts = dict_get(config_json, 'volume_mounts', {})
1199 self.args = dict_get(config_json, 'args', [])
1200 self.envs = dict_get(config_json, 'envs', [])
1201 self.privileged = dict_get(config_json, 'privileged', False)
1202 self.bind_mounts = dict_get(config_json, 'bind_mounts', [])
1203 self.ports = dict_get(config_json, 'ports', [])
1204 self.dirs = dict_get(config_json, 'dirs', [])
1205 self.files = dict_get(config_json, 'files', {})
1206
1207 @classmethod
1208 def init(cls, ctx: CephadmContext,
1209 fsid: str, daemon_id: Union[int, str]) -> 'CustomContainer':
1210 return cls(fsid, daemon_id,
1211 get_parm(ctx.config_json), ctx.image)
1212
1213 def create_daemon_dirs(self, data_dir: str, uid: int, gid: int) -> None:
1214 """
1215 Create dirs/files below the container data directory.
1216 """
1217 logger.info('Creating custom container configuration '
1218 'dirs/files in {} ...'.format(data_dir))
1219
1220 if not os.path.isdir(data_dir):
1221 raise OSError('data_dir is not a directory: %s' % data_dir)
1222
1223 for dir_path in self.dirs:
1224 logger.info('Creating directory: {}'.format(dir_path))
1225 dir_path = os.path.join(data_dir, dir_path.strip('/'))
1226 makedirs(dir_path, uid, gid, 0o755)
1227
1228 for file_path in self.files:
1229 logger.info('Creating file: {}'.format(file_path))
1230 content = dict_get_join(self.files, file_path)
1231 file_path = os.path.join(data_dir, file_path.strip('/'))
1232 with open(file_path, 'w', encoding='utf-8') as f:
1233 os.fchown(f.fileno(), uid, gid)
1234 os.fchmod(f.fileno(), 0o600)
1235 f.write(content)
1236
1237 def get_daemon_args(self) -> List[str]:
1238 return []
1239
1240 def get_container_args(self) -> List[str]:
1241 return self.args
1242
1243 def get_container_envs(self) -> List[str]:
1244 return self.envs
1245
1246 def get_container_mounts(self, data_dir: str) -> Dict[str, str]:
1247 """
1248 Get the volume mounts. Relative source paths will be located below
1249 `/var/lib/ceph/<cluster-fsid>/<daemon-name>`.
1250
1251 Example:
1252 {
1253 /foo/conf: /conf
1254 foo/conf: /conf
1255 }
1256 becomes
1257 {
1258 /foo/conf: /conf
1259 /var/lib/ceph/<cluster-fsid>/<daemon-name>/foo/conf: /conf
1260 }
1261 """
1262 mounts = {}
1263 for source, destination in self.volume_mounts.items():
1264 source = os.path.join(data_dir, source)
1265 mounts[source] = destination
1266 return mounts
1267
1268 def get_container_binds(self, data_dir: str) -> List[List[str]]:
1269 """
1270 Get the bind mounts. Relative `source=...` paths will be located below
1271 `/var/lib/ceph/<cluster-fsid>/<daemon-name>`.
1272
1273 Example:
1274 [
1275 'type=bind',
1276 'source=lib/modules',
1277 'destination=/lib/modules',
1278 'ro=true'
1279 ]
1280 becomes
1281 [
1282 ...
1283 'source=/var/lib/ceph/<cluster-fsid>/<daemon-name>/lib/modules',
1284 ...
1285 ]
1286 """
1287 binds = self.bind_mounts.copy()
1288 for bind in binds:
1289 for index, value in enumerate(bind):
1290 match = re.match(r'^source=(.+)$', value)
1291 if match:
1292 bind[index] = 'source={}'.format(os.path.join(
1293 data_dir, match.group(1)))
1294 return binds
1295
1296 ##################################
1297
1298
def touch(file_path: str, uid: Optional[int] = None, gid: Optional[int] = None) -> None:
    """Create file_path if missing (or update its mtime), optionally chowning it.

    :param file_path: path of the file to create/touch
    :param uid: numeric owner to apply (only applied together with gid)
    :param gid: numeric group to apply (only applied together with uid)
    """
    Path(file_path).touch()
    # Compare against None explicitly: uid/gid 0 (root) is falsy but is a
    # perfectly valid owner, and must not be silently skipped.
    if uid is not None and gid is not None:
        os.chown(file_path, uid, gid)
1303
1304
1305 ##################################
1306
1307
def dict_get(d: Dict, key: str, default: Any = None, require: bool = False) -> Any:
    """
    Fetch *key* from dictionary *d*.

    :param d: The dictionary to process.
    :param key: The name of the key to get.
    :param default: Value returned when the key is absent (default ``None``).
    :param require: When ``True``, a missing key raises ``Error`` instead of
        returning the default.
    :return: The value stored under *key*, or *default*.
    :raises: :exc:`Error` if *key* is absent and *require* is ``True``.
    """
    if require and key not in d:
        raise Error('{} missing from dict'.format(key))
    return d.get(key, default)  # type: ignore
1325
1326 ##################################
1327
1328
def dict_get_join(d: Dict, key: str) -> Any:
    """
    Fetch *key* from *d*, flattening list values into a newline-joined string.

    :param d: The dictionary to process.
    :param key: The name of the key to get.
    :return: The stored value; a ``list`` is rendered as one string with a
        line break between the stringified elements.
    """
    value = d.get(key)
    if isinstance(value, list):
        return '\n'.join(map(str, value))
    return value
1343
1344 ##################################
1345
1346
def get_supported_daemons():
    # type: () -> List[str]
    """Return every daemon type this cephadm build knows how to deploy."""
    supported = list(Ceph.daemons)
    supported.extend(Monitoring.components)
    supported.extend([
        NFSGanesha.daemon_type,
        CephIscsi.daemon_type,
        CustomContainer.daemon_type,
        HAproxy.daemon_type,
        Keepalived.daemon_type,
        CephadmAgent.daemon_type,
        SNMPGateway.daemon_type,
    ])
    # daemon types must be unique
    assert len(supported) == len(set(supported))
    return supported
1360
1361 ##################################
1362
1363
class PortOccupiedError(Error):
    """Raised by attempt_bind() when the requested IP/port is already bound."""
    pass
1366
1367
def attempt_bind(ctx, s, address, port):
    # type: (CephadmContext, socket.socket, str, int) -> None
    """Probe whether address:port can be bound using socket s.

    Raises PortOccupiedError when the port is in use, Error for any other
    failure.  The socket is always closed afterwards — this is only a probe.
    """
    try:
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        s.bind((address, port))
    except OSError as e:
        if e.errno == errno.EADDRINUSE:
            msg = 'Cannot bind to IP %s port %d: %s' % (address, port, e)
            logger.warning(msg)
            raise PortOccupiedError(msg)
        raise Error(e)
    except Exception as e:
        raise Error(e)
    finally:
        s.close()
1384
1385
def port_in_use(ctx, port_num):
    # type: (CephadmContext, int) -> bool
    """Detect whether a port is in use on the local machine - IPv4 and IPv6"""
    logger.info('Verifying port %d ...' % port_num)

    def _occupied(af, address):
        # type: (socket.AddressFamily, str) -> bool
        try:
            probe = socket.socket(af, socket.SOCK_STREAM)
            attempt_bind(ctx, probe, address, port_num)
        except PortOccupiedError:
            return True
        except OSError as e:
            if e.errno not in (errno.EAFNOSUPPORT, errno.EADDRNOTAVAIL):
                raise
            # Ignore EAFNOSUPPORT and EADDRNOTAVAIL as two interfaces are
            # being tested here and one might be intentionally be disabled.
            # In that case no error should be raised.
            return False
        return False

    return any(_occupied(af, addr) for af, addr in (
        (socket.AF_INET, '0.0.0.0'),
        (socket.AF_INET6, '::'),
    ))
1410
1411
def check_ip_port(ctx, ep):
    # type: (CephadmContext, EndPoint) -> None
    """Verify ep.ip:ep.port can be bound, unless --skip-ping-check was given."""
    if ctx.skip_ping_check:
        return
    logger.info(f'Verifying IP {ep.ip} port {ep.port} ...')
    if is_ipv6(ep.ip):
        sock = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
        ip = unwrap_ipv6(ep.ip)
    else:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        ip = ep.ip
    attempt_bind(ctx, sock, ip, ep.port)
1423
1424 ##################################
1425
1426
1427 # this is an abbreviated version of
1428 # https://github.com/benediktschmitt/py-filelock/blob/master/filelock.py
1429 # that drops all of the compatibility (this is Unix/Linux only).
1430
class Timeout(TimeoutError):
    """
    Raised when the lock could not be acquired in *timeout*
    seconds.
    """

    def __init__(self, lock_file: str) -> None:
        #: The path of the file lock.
        self.lock_file = lock_file

    def __str__(self) -> str:
        return "The file lock '{}' could not be acquired.".format(self.lock_file)
1448
1449
1450 class _Acquire_ReturnProxy(object):
1451 def __init__(self, lock: 'FileLock') -> None:
1452 self.lock = lock
1453 return None
1454
1455 def __enter__(self) -> 'FileLock':
1456 return self.lock
1457
1458 def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
1459 self.lock.release()
1460 return None
1461
1462
class FileLock(object):
    """Advisory, reentrant flock(2)-based file lock (Unix/Linux only).

    Abbreviated from py-filelock (see comment above): acquire() calls may be
    nested within one process; the underlying flock is only dropped when the
    nesting counter returns to zero (or release(force=True) is used).
    """

    def __init__(self, ctx: CephadmContext, name: str, timeout: int = -1) -> None:
        # Lock files live under LOCK_DIR, one per name; the directory is
        # created lazily with owner-only permissions.
        if not os.path.exists(LOCK_DIR):
            os.mkdir(LOCK_DIR, 0o700)
        self._lock_file = os.path.join(LOCK_DIR, name + '.lock')
        self.ctx = ctx

        # The file descriptor for the *_lock_file* as it is returned by the
        # os.open() function.
        # This file lock is only NOT None, if the object currently holds the
        # lock.
        self._lock_file_fd: Optional[int] = None
        self.timeout = timeout
        # The lock counter is used for implementing the nested locking
        # mechanism. Whenever the lock is acquired, the counter is increased and
        # the lock is only released, when this value is 0 again.
        self._lock_counter = 0
        return None

    @property
    def is_locked(self) -> bool:
        # Holding the lock is equivalent to having the fd open.
        return self._lock_file_fd is not None

    def acquire(self, timeout: Optional[int] = None, poll_intervall: float = 0.05) -> _Acquire_ReturnProxy:
        """
        Acquires the file lock or fails with a :exc:`Timeout` error.
        .. code-block:: python
            # You can use this method in the context manager (recommended)
            with lock.acquire():
                pass
            # Or use an equivalent try-finally construct:
            lock.acquire()
            try:
                pass
            finally:
                lock.release()
        :arg float timeout:
            The maximum time waited for the file lock.
            If ``timeout < 0``, there is no timeout and this method will
            block until the lock could be acquired.
            If ``timeout`` is None, the default :attr:`~timeout` is used.
        :arg float poll_intervall:
            We check once in *poll_intervall* seconds if we can acquire the
            file lock.
        :raises Timeout:
            if the lock could not be acquired in *timeout* seconds.
        .. versionchanged:: 2.0.0
            This method returns now a *proxy* object instead of *self*,
            so that it can be used in a with statement without side effects.
        """

        # Use the default timeout, if no timeout is provided.
        if timeout is None:
            timeout = self.timeout

        # Increment the number right at the beginning.
        # We can still undo it, if something fails.
        self._lock_counter += 1

        lock_id = id(self)
        lock_filename = self._lock_file
        start_time = time.time()
        try:
            # Poll until the flock succeeds or the timeout elapses.
            while True:
                if not self.is_locked:
                    logger.log(QUIET_LOG_LEVEL, 'Acquiring lock %s on %s', lock_id,
                               lock_filename)
                    self._acquire()

                if self.is_locked:
                    logger.log(QUIET_LOG_LEVEL, 'Lock %s acquired on %s', lock_id,
                               lock_filename)
                    break
                elif timeout >= 0 and time.time() - start_time > timeout:
                    logger.warning('Timeout acquiring lock %s on %s', lock_id,
                                   lock_filename)
                    raise Timeout(self._lock_file)
                else:
                    logger.log(
                        QUIET_LOG_LEVEL,
                        'Lock %s not acquired on %s, waiting %s seconds ...',
                        lock_id, lock_filename, poll_intervall
                    )
                    time.sleep(poll_intervall)
        except Exception:
            # Something did go wrong, so decrement the counter.
            self._lock_counter = max(0, self._lock_counter - 1)

            raise
        return _Acquire_ReturnProxy(lock=self)

    def release(self, force: bool = False) -> None:
        """
        Releases the file lock.
        Please note, that the lock is only completly released, if the lock
        counter is 0.
        Also note, that the lock file itself is not automatically deleted.
        :arg bool force:
            If true, the lock counter is ignored and the lock is released in
            every case.
        """
        if self.is_locked:
            self._lock_counter -= 1

            if self._lock_counter == 0 or force:
                # lock_id = id(self)
                # lock_filename = self._lock_file

                # Can't log in shutdown:
                #   File "/usr/lib64/python3.9/logging/__init__.py", line 1175, in _open
                #   NameError: name 'open' is not defined
                # logger.debug('Releasing lock %s on %s', lock_id, lock_filename)
                self._release()
                self._lock_counter = 0
                # logger.debug('Lock %s released on %s', lock_id, lock_filename)

        return None

    def __enter__(self) -> 'FileLock':
        self.acquire()
        return self

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        self.release()
        return None

    def __del__(self) -> None:
        # Force-release on garbage collection so a dropped FileLock cannot
        # leave the flock held for the life of the process.
        self.release(force=True)
        return None

    def _acquire(self) -> None:
        # Non-blocking flock attempt; on failure the fd is closed and
        # is_locked stays False so acquire() keeps polling.
        open_mode = os.O_RDWR | os.O_CREAT | os.O_TRUNC
        fd = os.open(self._lock_file, open_mode)

        try:
            fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except (IOError, OSError):
            os.close(fd)
        else:
            self._lock_file_fd = fd
        return None

    def _release(self) -> None:
        # Do not remove the lockfile:
        #
        #   https://github.com/benediktschmitt/py-filelock/issues/31
        #   https://stackoverflow.com/questions/17708885/flock-removing-locked-file-without-race-condition
        fd = self._lock_file_fd
        self._lock_file_fd = None
        fcntl.flock(fd, fcntl.LOCK_UN)  # type: ignore
        os.close(fd)  # type: ignore
        return None
1615
1616
1617 ##################################
1618 # Popen wrappers, lifted from ceph-volume
1619
class CallVerbosity(Enum):
    """How loudly call() logs subprocess output.

    Each level maps to a (success, error) pair of log levels; QUIET uses
    the custom QUIET_LOG_LEVEL that is only surfaced when --verbose is
    passed.
    """

    # Normal Operation: None, Errors: None
    SILENT = 0
    # Normal Operation: QUIET, Error: QUIET
    QUIET = 1
    # Normal Operation: DEBUG, Error: DEBUG
    DEBUG = 2
    # Normal Operation: QUIET, Error: INFO
    QUIET_UNLESS_ERROR = 3
    # Normal Operation: DEBUG, Error: INFO
    VERBOSE_ON_FAILURE = 4
    # Normal Operation: INFO, Error: INFO
    VERBOSE = 5

    def _log_levels(self) -> Tuple[int, int]:
        """Return the (success, error) log-level pair for this verbosity."""
        table = {
            CallVerbosity.SILENT: (0, 0),
            CallVerbosity.QUIET: (QUIET_LOG_LEVEL, QUIET_LOG_LEVEL),
            CallVerbosity.DEBUG: (logging.DEBUG, logging.DEBUG),
            CallVerbosity.QUIET_UNLESS_ERROR: (QUIET_LOG_LEVEL, logging.INFO),
            CallVerbosity.VERBOSE_ON_FAILURE: (logging.DEBUG, logging.INFO),
            CallVerbosity.VERBOSE: (logging.INFO, logging.INFO),
        }
        return table[self]

    def success_log_level(self) -> int:
        return self._log_levels()[0]

    def error_log_level(self) -> int:
        return self._log_levels()[1]
1662
1663
# Python < 3.8 lacks asyncio's ThreadedChildWatcher; backport it so that
# create_subprocess_exec() can be used from non-main threads (see the note
# at the bottom of this block).
if sys.version_info < (3, 8):
    import itertools
    import threading
    import warnings
    from asyncio import events

    class ThreadedChildWatcher(asyncio.AbstractChildWatcher):
        """Threaded child watcher implementation.
        The watcher uses a thread per process
        for waiting for the process finish.
        It doesn't require subscription on POSIX signal
        but a thread creation is not free.
        The watcher has O(1) complexity, its performance doesn't depend
        on amount of spawn processes.
        """

        def __init__(self) -> None:
            self._pid_counter = itertools.count(0)
            # maps pid -> waiter thread
            self._threads: Dict[Any, Any] = {}

        def is_active(self) -> bool:
            return True

        def close(self) -> None:
            self._join_threads()

        def _join_threads(self) -> None:
            """Internal: Join all non-daemon threads"""
            threads = [thread for thread in list(self._threads.values())
                       if thread.is_alive() and not thread.daemon]
            for thread in threads:
                thread.join()

        def __enter__(self) -> Any:
            return self

        def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
            pass

        def __del__(self, _warn: Any = warnings.warn) -> None:
            # Warn (rather than raise) if child processes are still being
            # waited on when the watcher is garbage collected.
            threads = [thread for thread in list(self._threads.values())
                       if thread.is_alive()]
            if threads:
                _warn(f'{self.__class__} has registered but not finished child processes',
                      ResourceWarning,
                      source=self)

        def add_child_handler(self, pid: Any, callback: Any, *args: Any) -> None:
            # Spawn one daemon thread per child to waitpid() on it.
            loop = events.get_event_loop()
            thread = threading.Thread(target=self._do_waitpid,
                                      name=f'waitpid-{next(self._pid_counter)}',
                                      args=(loop, pid, callback, args),
                                      daemon=True)
            self._threads[pid] = thread
            thread.start()

        def remove_child_handler(self, pid: Any) -> bool:
            # asyncio never calls remove_child_handler() !!!
            # The method is no-op but is implemented because
            # abstract base class requires it
            return True

        def attach_loop(self, loop: Any) -> None:
            pass

        def _do_waitpid(self, loop: Any, expected_pid: Any, callback: Any, args: Any) -> None:
            assert expected_pid > 0

            try:
                pid, status = os.waitpid(expected_pid, 0)
            except ChildProcessError:
                # The child process is already reaped
                # (may happen if waitpid() is called elsewhere).
                pid = expected_pid
                returncode = 255
                logger.warning(
                    'Unknown child process pid %d, will report returncode 255',
                    pid)
            else:
                if os.WIFEXITED(status):
                    returncode = os.WEXITSTATUS(status)
                elif os.WIFSIGNALED(status):
                    returncode = -os.WTERMSIG(status)
                else:
                    raise ValueError(f'unknown wait status {status}')
                if loop.get_debug():
                    logger.debug('process %s exited with returncode %s',
                                 expected_pid, returncode)

            if loop.is_closed():
                logger.warning('Loop %r that handles pid %r is closed', loop, pid)
            else:
                # Deliver the exit notification back on the event loop thread.
                loop.call_soon_threadsafe(callback, pid, returncode, *args)

            self._threads.pop(expected_pid)

    # unlike SafeChildWatcher which handles SIGCHLD in the main thread,
    # ThreadedChildWatcher runs in a separated thread, hence allows us to
    # run create_subprocess_exec() in non-main thread, see
    # https://bugs.python.org/issue35621
    asyncio.set_child_watcher(ThreadedChildWatcher())
1765
1766
try:
    from asyncio import run as async_run  # type: ignore[attr-defined]
except ImportError:
    # Python 3.6 has no asyncio.run(); emulate it with a dedicated event
    # loop that is always torn down (asyncgens shut down, loop closed).
    def async_run(coro):  # type: ignore
        loop = asyncio.new_event_loop()
        try:
            asyncio.set_event_loop(loop)
            return loop.run_until_complete(coro)
        finally:
            try:
                loop.run_until_complete(loop.shutdown_asyncgens())
            finally:
                asyncio.set_event_loop(None)
                loop.close()
1781
1782
def call(ctx: CephadmContext,
         command: List[str],
         desc: Optional[str] = None,
         verbosity: CallVerbosity = CallVerbosity.VERBOSE_ON_FAILURE,
         timeout: Optional[int] = DEFAULT_TIMEOUT,
         **kwargs: Any) -> Tuple[str, str, int]:
    """
    Wrap subprocess.Popen to

    - log stdout/stderr to a logger,
    - decode utf-8
    - cleanly return out, err, returncode

    :param desc: prefix used in log lines; defaults to command[0]
    :param verbosity: selects the log level for success vs. failure output
    :param timeout: timeout in seconds
    :return: (stdout, stderr, returncode); returncode is 124 on timeout
    """

    prefix = command[0] if desc is None else desc
    if prefix:
        prefix += ': '
    timeout = timeout or ctx.timeout

    async def tee(reader: asyncio.StreamReader) -> str:
        # Drain a pipe line by line, decoding UTF-8, into one string.
        collected = StringIO()
        async for line in reader:
            message = line.decode('utf-8')
            collected.write(message)
        return collected.getvalue()

    async def run_with_timeout() -> Tuple[str, str, int]:
        process = await asyncio.create_subprocess_exec(
            *command,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            env=os.environ.copy())
        assert process.stdout
        assert process.stderr
        try:
            # Read both pipes concurrently (avoids deadlock on full pipes),
            # then wait for the process to exit, bounded by the timeout.
            stdout, stderr = await asyncio.gather(tee(process.stdout),
                                                  tee(process.stderr))
            returncode = await asyncio.wait_for(process.wait(), timeout)
        except asyncio.TimeoutError:
            logger.info(prefix + f'timeout after {timeout} seconds')
            # 124 is the conventional "timed out" exit code (as used by
            # the timeout(1) utility).
            return '', '', 124
        else:
            return stdout, stderr, returncode

    stdout, stderr, returncode = async_run(run_with_timeout())
    # On failure, escalate the log level per the requested verbosity.
    log_level = verbosity.success_log_level()
    if returncode != 0:
        log_level = verbosity.error_log_level()
        logger.log(log_level, f'Non-zero exit code {returncode} from {" ".join(command)}')
    for line in stdout.splitlines():
        logger.log(log_level, prefix + 'stdout ' + line)
    for line in stderr.splitlines():
        logger.log(log_level, prefix + 'stderr ' + line)
    return stdout, stderr, returncode
1839
1840
def call_throws(
        ctx: CephadmContext,
        command: List[str],
        desc: Optional[str] = None,
        verbosity: CallVerbosity = CallVerbosity.VERBOSE_ON_FAILURE,
        timeout: Optional[int] = DEFAULT_TIMEOUT,
        **kwargs: Any) -> Tuple[str, str, int]:
    """Like call(), but raise RuntimeError when the command exits non-zero."""
    out, err, ret = call(ctx, command, desc, verbosity, timeout, **kwargs)
    if not ret:
        return out, err, ret
    # Include a short, human-readable stdout/stderr in the exception if any.
    for s in (out, err):
        if s.strip() and len(s.splitlines()) <= 2:  # readable message?
            raise RuntimeError(f'Failed command: {" ".join(command)}: {s}')
    raise RuntimeError('Failed command: %s' % ' '.join(command))
1855
1856
def call_timeout(ctx, command, timeout):
    # type: (CephadmContext, List[str], int) -> int
    """Run command via subprocess.call; raise TimeoutExpired past timeout."""
    logger.debug('Running command (timeout=%s): %s'
                 % (timeout, ' '.join(command)))

    try:
        return subprocess.call(command, timeout=timeout, env=os.environ.copy())
    except subprocess.TimeoutExpired:
        msg = 'Command `%s` timed out after %s seconds' % (command, timeout)
        logger.debug(msg)
        raise TimeoutExpired(msg)
1872
1873 ##################################
1874
1875
def json_loads_retry(cli_func: Callable[[], str]) -> Any:
    """Run cli_func and JSON-decode its output, retrying on invalid JSON.

    Sleeps 1s, 4s and 4s between attempts; the final attempt may raise
    json.JSONDecodeError.
    """
    for sleep_secs in (1, 4, 4):
        try:
            return json.loads(cli_func())
        except json.JSONDecodeError:
            logger.debug('Invalid JSON. Retrying in %s seconds...' % sleep_secs)
            time.sleep(sleep_secs)
    return json.loads(cli_func())
1884
1885
def is_available(ctx, what, func):
    # type: (CephadmContext, str, Callable[[], bool]) -> None
    """
    Wait for a service to become available

    :param what: the name of the service
    :param func: the callable object that determines availability
    """
    retry = ctx.retry
    logger.info('Waiting for %s...' % what)
    attempt = 1
    while not func():
        if attempt > retry:
            raise Error('%s not available after %s tries'
                        % (what, retry))
        logger.info('%s not available, waiting (%s/%s)...'
                    % (what, attempt, retry))
        attempt += 1
        time.sleep(2)
    logger.info('%s is available'
                % what)
1911
1912
def read_config(fn):
    # type: (Optional[str]) -> ConfigParser
    """Parse the ini-style config at fn; return an empty parser if fn is None."""
    cp = ConfigParser()
    if fn:
        cp.read(fn)
    return cp
1919
1920
def pathify(p):
    # type: (str) -> str
    """Expand a leading ~ and return the absolute form of path p."""
    return os.path.abspath(os.path.expanduser(p))
1925
1926
def get_file_timestamp(fn):
    # type: (str) -> Optional[str]
    """Return fn's mtime rendered as a UTC DATEFMT string, or None on any error."""
    try:
        mtime = os.path.getmtime(fn)
        when = datetime.datetime.fromtimestamp(mtime, tz=datetime.timezone.utc)
        return when.strftime(DATEFMT)
    except Exception:
        # best-effort: a missing/unreadable file simply yields None
        return None
1936
1937
def try_convert_datetime(s):
    # type: (str) -> Optional[str]
    """Normalize a docker/podman timestamp to a UTC DATEFMT string.

    Returns None if the string cannot be parsed.  This is irritating
    because (1) podman and docker use different formats and (2) python's
    strptime can't parse either one directly.  Observed inputs:
      docker 18.09.7:  2020-03-03T09:21:43.636153304Z
      podman 1.7.0:    2020-03-03T15:52:30.136257504-06:00
                       2020-03-03 15:52:30.136257504 -0600 CST
    (podman even uses different formats for 'inspect' and
    'inspect --format {{.Created}}'!)
    """
    # strptime only accepts 6 fractional digits; truncate 9 -> 6.
    s = re.sub(r'(\.[\d]{6})[\d]*', r'\1', s)

    # replace trailing Z with -0000, since (on python 3.6.8) it won't parse
    if s.endswith('Z'):
        s = s[:-1] + '-0000'

    # cut off the redundant 'CST' part that strptime can't parse, if
    # present.
    s = ' '.join(s.split(' ')[0:3])

    # try parsing with several format strings
    for fmt in ('%Y-%m-%dT%H:%M:%S.%f%z',
                '%Y-%m-%d %H:%M:%S.%f %z'):
        try:
            # return timestamp normalized to UTC, rendered as DATEFMT.
            return datetime.datetime.strptime(s, fmt).astimezone(tz=datetime.timezone.utc).strftime(DATEFMT)
        except ValueError:
            pass
    return None
1977
1978
1979 def _parse_podman_version(version_str):
1980 # type: (str) -> Tuple[int, ...]
1981 def to_int(val: str, org_e: Optional[Exception] = None) -> int:
1982 if not val and org_e:
1983 raise org_e
1984 try:
1985 return int(val)
1986 except ValueError as e:
1987 return to_int(val[0:-1], org_e or e)
1988
1989 return tuple(map(to_int, version_str.split('.')))
1990
1991
def get_hostname():
    # type: () -> str
    """Return this host's (possibly fully-qualified) hostname."""
    return socket.gethostname()
1995
1996
def get_short_hostname():
    # type: () -> str
    """Return the hostname with any domain part stripped."""
    short, _sep, _domain = get_hostname().partition('.')
    return short
2000
2001
def get_fqdn():
    # type: () -> str
    """Return the FQDN, falling back to the plain hostname."""
    return socket.getfqdn() or socket.gethostname()
2005
2006
def get_arch():
    # type: () -> str
    """Return the machine architecture (e.g. 'x86_64')."""
    return platform.uname().machine
2010
2011
def generate_service_id():
    # type: () -> str
    """Return '<shorthost>.<6 random lowercase letters>'."""
    suffix = ''.join(random.choice(string.ascii_lowercase) for _ in range(6))
    return '{}.{}'.format(get_short_hostname(), suffix)
2016
2017
def generate_password():
    # type: () -> str
    """Return a 10-character random password of lowercase letters and digits."""
    alphabet = string.ascii_lowercase + string.digits
    return ''.join(random.choice(alphabet) for _ in range(10))
2022
2023
def normalize_container_id(i):
    # type: (str) -> str
    """Strip a leading 'sha256:' from a container id.

    docker adds the sha256: prefix, but AFAICS both docker (18.09.7 in
    bionic at least) and podman always use sha256, so leave off the
    prefix for consistency.
    """
    prefix = 'sha256:'
    if i.startswith(prefix):
        return i[len(prefix):]
    return i
2034
2035
def make_fsid():
    # type: () -> str
    """Return a freshly generated cluster fsid (uuid1 string)."""
    return str(uuid.uuid1())
2039
2040
def is_fsid(s):
    # type: (str) -> bool
    """Return True if s parses as a UUID, else False."""
    try:
        uuid.UUID(s)
        return True
    except ValueError:
        return False
2048
2049
def validate_fsid(func: FuncT) -> FuncT:
    """Decorator: reject a malformed ctx.fsid before invoking func."""
    @wraps(func)
    def _validate_fsid(ctx: CephadmContext) -> Any:
        if 'fsid' in ctx and ctx.fsid and not is_fsid(ctx.fsid):
            raise Error('not an fsid: %s' % ctx.fsid)
        return func(ctx)
    return cast(FuncT, _validate_fsid)
2058
2059
def infer_fsid(func: FuncT) -> FuncT:
    """
    If we only find a single fsid in /var/lib/ceph/*, use that
    """
    @infer_config
    @wraps(func)
    def _infer_fsid(ctx: CephadmContext) -> Any:
        if 'fsid' in ctx and ctx.fsid:
            logger.debug('Using specified fsid: %s' % ctx.fsid)
            return func(ctx)

        candidates = set()

        # the config file may name the fsid directly
        cp = read_config(ctx.config)
        if cp.has_option('global', 'fsid'):
            candidates.add(cp.get('global', 'fsid'))

        # otherwise collect fsids from deployed daemons, optionally
        # restricted to the daemon named on the command line
        for daemon in list_daemons(ctx, detail=False):
            if not is_fsid(daemon['fsid']):
                # 'unknown' fsid
                continue
            if 'name' not in ctx or not ctx.name or daemon['name'] == ctx.name:
                candidates.add(daemon['fsid'])

        fsids = sorted(candidates)
        if not fsids:
            # some commands do not always require an fsid
            pass
        elif len(fsids) == 1:
            logger.info('Inferring fsid %s' % fsids[0])
            ctx.fsid = fsids[0]
        else:
            raise Error('Cannot infer an fsid, one must be specified (using --fsid): %s' % fsids)
        return func(ctx)

    return cast(FuncT, _infer_fsid)
2101
2102
def infer_config(func: FuncT) -> FuncT:
    """
    Infer the cluster configuration using the following priority order:
     1- if the user has provided a custom conf file (-c option) use it
     2- otherwise if daemon --name has been provided use daemon conf
     3- otherwise find the mon daemon conf file and use it (if v1)
     4- otherwise if {ctx.data_dir}/{fsid}/{CEPH_CONF_DIR} dir exists use it
     5- finally: fallback to the default file /etc/ceph/ceph.conf
    """
    @wraps(func)
    def _infer_config(ctx: CephadmContext) -> Any:

        def config_path(daemon_type: str, daemon_name: str) -> str:
            # path of the 'config' file inside a daemon's data dir
            data_dir = get_data_dir(ctx.fsid, ctx.data_dir, daemon_type, daemon_name)
            return os.path.join(data_dir, 'config')

        def get_mon_daemon_name(fsid: str) -> Optional[str]:
            # first cephadm:v1-style mon of this cluster that has a config file
            daemon_list = list_daemons(ctx, detail=False)
            for daemon in daemon_list:
                if (
                    daemon.get('name', '').startswith('mon.')
                    and daemon.get('fsid', '') == fsid
                    and daemon.get('style', '') == 'cephadm:v1'
                    and os.path.exists(config_path('mon', daemon['name'].split('.', 1)[1]))
                ):
                    return daemon['name']
            return None

        ctx.config = ctx.config if 'config' in ctx else None
        # check if user has provided conf by using -c option
        if ctx.config and (ctx.config != CEPH_DEFAULT_CONF):
            logger.debug(f'Using specified config: {ctx.config}')
            return func(ctx)

        if 'fsid' in ctx and ctx.fsid:
            name = ctx.name if ('name' in ctx and ctx.name) else get_mon_daemon_name(ctx.fsid)
            if name is not None:
                # daemon name has been specified (or inferred from mon), let's use its conf
                ctx.config = config_path(name.split('.', 1)[0], name.split('.', 1)[1])
            else:
                # no daemon, in case the cluster has a config dir then use it
                ceph_conf = f'{ctx.data_dir}/{ctx.fsid}/{CEPH_CONF_DIR}/{CEPH_CONF}'
                if os.path.exists(ceph_conf):
                    ctx.config = ceph_conf

        if ctx.config:
            logger.info(f'Inferring config {ctx.config}')
        elif os.path.exists(CEPH_DEFAULT_CONF):
            # last resort: the system-wide default config, if present
            logger.debug(f'Using default config {CEPH_DEFAULT_CONF}')
            ctx.config = CEPH_DEFAULT_CONF
        return func(ctx)

    return cast(FuncT, _infer_config)
2156
2157
def _get_default_image(ctx: CephadmContext) -> str:
    """Return DEFAULT_IMAGE, warning loudly when it tracks a dev branch."""
    if DEFAULT_IMAGE_IS_MASTER:
        warn = """This is a development version of cephadm.
For information regarding the latest stable release:
https://docs.ceph.com/docs/{}/cephadm/install
""".format(LATEST_STABLE_RELEASE)
        # emit the banner line by line so each line gets colorized
        for line in warn.splitlines():
            logger.warning(f'{termcolor.yellow}{line}{termcolor.end}')
    return DEFAULT_IMAGE
2167
2168
def infer_image(func: FuncT) -> FuncT:
    """
    Decorator: fill in ctx.image when unset, preferring the CEPHADM_IMAGE
    environment variable, then a locally available ceph image, then the
    built-in default.
    """
    @wraps(func)
    def _infer_image(ctx: CephadmContext) -> Any:
        if not ctx.image:
            ctx.image = (
                os.environ.get('CEPHADM_IMAGE')
                or infer_local_ceph_image(ctx, ctx.container_engine.path)
                or _get_default_image(ctx)
            )
        return func(ctx)

    return cast(FuncT, _infer_image)
2184
2185
def default_image(func: FuncT) -> FuncT:
    """
    Decorator: choose a default container image for ctx.image when unset,
    based on the daemon type embedded in ctx.name, falling back to the
    CEPHADM_IMAGE environment variable and then the built-in default.
    """
    @wraps(func)
    def _default_image(ctx: CephadmContext) -> Any:
        if not ctx.image:
            if 'name' in ctx and ctx.name:
                type_ = ctx.name.split('.', 1)[0]
                if type_ in Monitoring.components:
                    ctx.image = Monitoring.components[type_]['image']
                # non-monitoring daemon types with their own default image
                per_type_default = {
                    'haproxy': HAproxy.default_image,
                    'keepalived': Keepalived.default_image,
                    SNMPGateway.daemon_type: SNMPGateway.default_image,
                }
                if type_ in per_type_default:
                    ctx.image = per_type_default[type_]
            if not ctx.image:
                ctx.image = os.environ.get('CEPHADM_IMAGE') or _get_default_image(ctx)

        return func(ctx)

    return cast(FuncT, _default_image)
2208
2209
def get_container_info(ctx: CephadmContext, daemon_filter: str, by_name: bool) -> Optional[ContainerInfo]:
    """
    Look up container stats for the first daemon matching the filter.

    :param ctx: Cephadm context
    :param daemon_filter: daemon name or type
    :param by_name: must be set to True if daemon name is provided
    :return: Container information or None
    """
    def daemon_name_or_type(daemon: Dict[str, str]) -> str:
        # compare either by full name ('mon.a') or just the type ('mon')
        return daemon['name'] if by_name else daemon['name'].split('.', 1)[0]

    if by_name and '.' not in daemon_filter:
        # a daemon name must be '<type>.<id>'
        logger.warning(f'Trying to get container info using invalid daemon name {daemon_filter}')
        return None
    daemons = list_daemons(ctx, detail=False)
    # only consider daemons belonging to the current cluster (ctx.fsid)
    matching_daemons = [d for d in daemons if daemon_name_or_type(d) == daemon_filter and d['fsid'] == ctx.fsid]
    if matching_daemons:
        # only the first match is inspected
        d_type, d_id = matching_daemons[0]['name'].split('.', 1)
        out, _, code = get_container_stats(ctx, ctx.container_engine.path, ctx.fsid, d_type, d_id)
        if not code:
            # stats output is a single comma-separated record
            (container_id, image_name, image_id, start, version) = out.strip().split(',')
            return ContainerInfo(container_id, image_name, image_id, start, version)
    return None
2232
2233
def infer_local_ceph_image(ctx: CephadmContext, container_path: str) -> Optional[str]:
    """
    Infer the local ceph image based on the following priority criteria:
    1- the image specified by --image arg (if provided).
    2- the same image as the daemon container specified by --name arg (if provided).
    3- image used by any ceph container running on the host. In this case we use daemon types.
    4- if no container is found then we use the most recent ceph image on the host.

    Note: any selected container must have the same fsid inferred previously.

    :return: The most recent local ceph image (already pulled)
    """
    # '|' special character is used to separate the output fields into:
    # - Repository@digest
    # - Image Id
    # - Image Tag
    # - Image creation date
    out, _, _ = call_throws(ctx,
                            [container_path, 'images',
                             '--filter', 'label=ceph=True',
                             '--filter', 'dangling=false',
                             '--format', '{{.Repository}}@{{.Digest}}|{{.ID}}|{{.Tag}}|{{.CreatedAt}}'])

    container_info = None
    # with a full daemon name we match that exact daemon; otherwise we try
    # each known ceph daemon type in turn
    daemon_name = ctx.name if ('name' in ctx and ctx.name and '.' in ctx.name) else None
    daemons_ls = [daemon_name] if daemon_name is not None else Ceph.daemons  # daemon types: 'mon', 'mgr', etc
    for daemon in daemons_ls:
        container_info = get_container_info(ctx, daemon, daemon_name is not None)
        if container_info is not None:
            logger.debug(f"Using container info for daemon '{daemon}'")
            break

    for image in out.splitlines():
        if image and not image.isspace():
            (digest, image_id, tag, created_date) = image.lstrip().split('|')
            if container_info is not None and image_id not in container_info.image_id:
                # skip images not matching the running container's image
                continue
            if digest and not digest.endswith('@'):
                # a digest ending in '@' means the image has no digest
                logger.info(f"Using ceph image with id '{image_id}' and tag '{tag}' created on {created_date}\n{digest}")
                return digest
    return None
2275
2276
def write_tmp(s, uid, gid):
    # type: (str, int, int) -> IO[str]
    """Write *s* to a new NamedTemporaryFile owned by uid:gid and return it.

    The file is removed when the returned handle is closed, so the caller
    must hold on to the handle for as long as the content is needed.
    """
    handle = tempfile.NamedTemporaryFile(mode='w', prefix='ceph-tmp')
    os.fchown(handle.fileno(), uid, gid)
    handle.write(s)
    handle.flush()
    return handle
2286
2287
def makedirs(dir, uid, gid, mode):
    # type: (str, int, int, int) -> None
    """Create *dir* if needed and force its ownership to uid:gid and its mode."""
    if os.path.exists(dir):
        os.chmod(dir, mode)
    else:
        os.makedirs(dir, mode=mode)
    os.chown(dir, uid, gid)
    # re-apply the mode: the one passed to makedirs is masked by umask
    os.chmod(dir, mode)
2296
2297
def get_data_dir(fsid, data_dir, t, n):
    # type: (str, str, str, Union[int, str]) -> str
    """Return the daemon data directory: <data_dir>/<fsid>/<type>.<id>."""
    return os.path.join(data_dir, fsid, '{}.{}'.format(t, n))
2301
2302
def get_log_dir(fsid, log_dir):
    # type: (str, str) -> str
    """Return the per-cluster log directory: <log_dir>/<fsid>."""
    path = os.path.join(log_dir, fsid)
    return path
2306
2307
def make_data_dir_base(fsid, data_dir, uid, gid):
    # type: (str, str, int, int) -> str
    """Ensure <data_dir>/<fsid> plus its crash/ and crash/posted/ dirs exist.

    Returns the base directory path.
    """
    data_dir_base = os.path.join(data_dir, fsid)
    # create the base dir and the crash hierarchy with the standard mode
    for subpath in ((), ('crash',), ('crash', 'posted')):
        makedirs(os.path.join(data_dir_base, *subpath), uid, gid, DATA_DIR_MODE)
    return data_dir_base
2316
2317
def make_data_dir(ctx, fsid, daemon_type, daemon_id, uid=None, gid=None):
    # type: (CephadmContext, str, str, Union[int, str], Optional[int], Optional[int]) -> str
    """Create (or fix ownership of) a daemon's data dir and return its path."""
    if uid is None or gid is None:
        (uid, gid) = extract_uid_gid(ctx)
    make_data_dir_base(fsid, ctx.data_dir, uid, gid)
    path = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
    makedirs(path, uid, gid, DATA_DIR_MODE)
    return path
2326
2327
def make_log_dir(ctx, fsid, uid=None, gid=None):
    # type: (CephadmContext, str, Optional[int], Optional[int]) -> str
    """Ensure the per-cluster log directory exists and return its path."""
    if uid is None or gid is None:
        (uid, gid) = extract_uid_gid(ctx)
    path = get_log_dir(fsid, ctx.log_dir)
    makedirs(path, uid, gid, LOG_DIR_MODE)
    return path
2335
2336
def make_var_run(ctx, fsid, uid, gid):
    # type: (CephadmContext, str, int, int) -> None
    """Create /var/run/ceph/<fsid> owned by uid:gid (mode 0770) via install(1)."""
    cmd = ['install', '-d', '-m0770', '-o', str(uid), '-g', str(gid),
           '/var/run/ceph/%s' % fsid]
    call_throws(ctx, cmd)
2341
2342
def copy_tree(ctx, src, dst, uid=None, gid=None):
    # type: (CephadmContext, List[str], str, Optional[int], Optional[int]) -> None
    """
    Copy each directory tree in *src* to *dst*, then chown the copies.

    Any pre-existing destination tree is removed first (copytree with
    dirs_exist_ok needs python 3.8, which we can't rely on).
    """
    if uid is None or gid is None:
        (uid, gid) = extract_uid_gid(ctx)

    for source in src:
        if os.path.isdir(dst):
            target = os.path.join(dst, os.path.basename(source))
        else:
            target = dst

        logger.debug('copy directory `%s` -> `%s`' % (source, target))
        shutil.rmtree(target, ignore_errors=True)
        shutil.copytree(source, target)  # dirs_exist_ok needs python 3.8

        for dirpath, _dirnames, filenames in os.walk(target):
            logger.debug('chown %s:%s `%s`' % (uid, gid, dirpath))
            os.chown(dirpath, uid, gid)
            for fname in filenames:
                logger.debug('chown %s:%s `%s`' % (uid, gid, fname))
                os.chown(os.path.join(dirpath, fname), uid, gid)
2366
2367
def copy_files(ctx, src, dst, uid=None, gid=None):
    # type: (CephadmContext, List[str], str, Optional[int], Optional[int]) -> None
    """
    Copy each file in *src* to *dst*, then chown the copies to uid:gid.
    """
    if uid is None or gid is None:
        (uid, gid) = extract_uid_gid(ctx)

    for source in src:
        # when dst is a directory, copy into it under the source basename
        target = os.path.join(dst, os.path.basename(source)) if os.path.isdir(dst) else dst

        logger.debug('copy file `%s` -> `%s`' % (source, target))
        shutil.copyfile(source, target)

        logger.debug('chown %s:%s `%s`' % (uid, gid, target))
        os.chown(target, uid, gid)
2386
2387
def move_files(ctx, src, dst, uid=None, gid=None):
    # type: (CephadmContext, List[str], str, Optional[int], Optional[int]) -> None
    """
    Move each file in *src* into *dst* (recreating symlinks rather than
    moving them), then chown the results to uid:gid.
    """
    if uid is None or gid is None:
        (uid, gid) = extract_uid_gid(ctx)

    for source in src:
        # when dst is a directory, move into it under the source basename
        target = os.path.join(dst, os.path.basename(source)) if os.path.isdir(dst) else dst

        if os.path.islink(source):
            # shutil.move() in py2 does not handle symlinks correctly
            link_target = os.readlink(source)
            logger.debug("symlink '%s' -> '%s'" % (target, link_target))
            os.symlink(link_target, target)
            os.unlink(source)
        else:
            logger.debug("move file '%s' -> '%s'" % (source, target))
            shutil.move(source, target)
        logger.debug('chown %s:%s `%s`' % (uid, gid, target))
        os.chown(target, uid, gid)
2412
2413
def recursive_chown(path: str, uid: int, gid: int) -> None:
    """Chown *path* and every directory and file below it to uid:gid."""
    for root, _dirs, files in os.walk(path):
        os.chown(root, uid, gid)
        for fname in files:
            os.chown(os.path.join(root, fname), uid, gid)
2419
2420
2421 # copied from distutils
# copied from distutils
def find_executable(executable: str, path: Optional[str] = None) -> Optional[str]:
    """Tries to find 'executable' in the directories listed in 'path'.
    A string listing directories separated by 'os.pathsep'; defaults to
    os.environ['PATH']. Returns the complete filename or None if not found.
    """
    _, ext = os.path.splitext(executable)
    if sys.platform == 'win32' and ext != '.exe':
        # windows only runs .exe files
        executable = executable + '.exe'

    if os.path.isfile(executable):
        return executable

    if path is None:
        path = os.environ.get('PATH', None)
        # PATH unset entirely: fall back to the system default search path
        if path is None:
            try:
                path = os.confstr('CS_PATH')
            except (AttributeError, ValueError):
                # os.confstr() or CS_PATH is not available
                path = os.defpath
        # bpo-35755: Don't use os.defpath if the PATH environment variable is
        # set to an empty string

    # PATH='' doesn't match, whereas PATH=':' looks in the current directory
    if not path:
        return None

    for directory in path.split(os.pathsep):
        candidate = os.path.join(directory, executable)
        if os.path.isfile(candidate):
            # the file exists, we have a shot at spawn working
            return candidate
    return None
2456
2457
def find_program(filename):
    # type: (str) -> str
    """Like find_executable(), but raise ValueError when not found."""
    located = find_executable(filename)
    if located is None:
        raise ValueError('%s not found' % filename)
    return located
2464
2465
def find_container_engine(ctx: CephadmContext) -> Optional[ContainerEngine]:
    """Pick a container engine: Docker when --docker was given, otherwise the
    first engine in CONTAINER_PREFERENCE whose binary can be located."""
    if ctx.docker:
        return Docker()
    for engine_cls in CONTAINER_PREFERENCE:
        try:
            return engine_cls()
        except Exception:
            # engine binary not usable on this host; try the next one
            pass
    return None
2476
2477
def check_container_engine(ctx: CephadmContext) -> ContainerEngine:
    """Validate ctx.container_engine, raising Error when it is missing or
    (for podman) too old; return the engine on success."""
    engine = ctx.container_engine
    if not isinstance(engine, CONTAINER_PREFERENCE):
        # See https://github.com/python/mypy/issues/8993
        exes: List[str] = [i.EXE for i in CONTAINER_PREFERENCE]  # type: ignore
        raise Error('No container engine binary found ({}). Try run `apt/dnf/yum/zypper install <container engine>`'.format(' or '.join(exes)))
    if isinstance(engine, Podman):
        engine.get_version(ctx)
        if engine.version < MIN_PODMAN_VERSION:
            raise Error('podman version %d.%d.%d or later is required' % MIN_PODMAN_VERSION)
    return engine
2489
2490
def get_unit_name(fsid, daemon_type, daemon_id=None):
    # type: (str, str, Optional[Union[int, str]]) -> str
    """Build the systemd unit name; accepts either a type or type + id."""
    if daemon_id is None:
        return 'ceph-%s@%s' % (fsid, daemon_type)
    return 'ceph-%s@%s.%s' % (fsid, daemon_type, daemon_id)
2498
2499
def get_unit_name_by_daemon_name(ctx: CephadmContext, fsid: str, name: str) -> str:
    """Return the systemd unit recorded for daemon *name* in cluster *fsid*."""
    daemon = get_daemon_description(ctx, fsid, name)
    if 'systemd_unit' not in daemon:
        raise Error('Failed to get unit name for {}'.format(daemon))
    return daemon['systemd_unit']
2506
2507
def check_unit(ctx, unit_name):
    # type: (CephadmContext, str) -> Tuple[bool, str, bool]
    """Query systemd for a unit's status.

    Returns (enabled, state, installed) where state is one of
    'running', 'stopped', 'error', or 'unknown'.
    """
    # NOTE: we ignore the exit code here because systemctl outputs
    # various exit codes based on the state of the service, but the
    # string result is more explicit (and sufficient).
    enabled = False
    installed = False
    try:
        out, err, code = call(ctx, ['systemctl', 'is-enabled', unit_name],
                              verbosity=CallVerbosity.QUIET)
        if code == 0:
            enabled = True
            installed = True
        elif 'disabled' in out:
            # unit file exists but is not enabled
            installed = True
    except Exception as e:
        logger.warning('unable to run systemctl: %s' % e)
        enabled = False
        installed = False

    state = 'unknown'
    try:
        out, err, code = call(ctx, ['systemctl', 'is-active', unit_name],
                              verbosity=CallVerbosity.QUIET)
        out = out.strip()
        # map systemd's active-state strings onto our simpler vocabulary
        if out in ['active']:
            state = 'running'
        elif out in ['inactive']:
            state = 'stopped'
        elif out in ['failed', 'auto-restart']:
            state = 'error'
        else:
            state = 'unknown'
    except Exception as e:
        logger.warning('unable to run systemctl: %s' % e)
        state = 'unknown'
    return (enabled, state, installed)
2545
2546
def check_units(ctx, units, enabler=None):
    # type: (CephadmContext, List[str], Optional[Packager]) -> bool
    """Return True as soon as one unit is enabled and running.

    When an *enabler* is given, any installed-but-not-running unit is
    enabled along the way.
    """
    for unit in units:
        enabled, state, installed = check_unit(ctx, unit)
        if enabled and state == 'running':
            logger.info('Unit %s is enabled and running' % unit)
            return True
        if enabler is not None and installed:
            logger.info('Enabling unit %s' % unit)
            enabler.enable_service(unit)
    return False
2559
2560
def is_container_running(ctx: CephadmContext, c: 'CephContainer') -> bool:
    """Return True if the daemon's container is currently running."""
    daemon_type = ctx.name.split('.', 1)[0]
    if daemon_type in ['agent', 'cephadm-exporter']:
        # these are non-containerized daemon types
        return False
    return bool(get_running_container_name(ctx, c))
2566
2567
def get_running_container_name(ctx: CephadmContext, c: 'CephContainer') -> Optional[str]:
    """Return whichever of the container's (current or legacy) names is in a
    'running' state, or None when neither is."""
    for candidate in (c.cname, c.old_cname):
        out, err, ret = call(ctx, [
            ctx.container_engine.path, 'container', 'inspect',
            '--format', '{{.State.Status}}', candidate
        ])
        if out.strip() == 'running':
            return candidate
    return None
2577
2578
def get_legacy_config_fsid(cluster, legacy_dir=None):
    # type: (str, Optional[str]) -> Optional[str]
    """Read the fsid from a legacy /etc/ceph/<cluster>.conf, if present."""
    config_file = '/etc/ceph/%s.conf' % cluster
    if legacy_dir is not None:
        config_file = os.path.abspath(legacy_dir + config_file)

    if not os.path.exists(config_file):
        return None
    config = read_config(config_file)
    if config.has_section('global') and config.has_option('global', 'fsid'):
        return config.get('global', 'fsid')
    return None
2590
2591
def get_legacy_daemon_fsid(ctx, cluster,
                           daemon_type, daemon_id, legacy_dir=None):
    # type: (CephadmContext, str, str, Union[int, str], Optional[str]) -> Optional[str]
    """Determine the fsid of a legacy (pre-cephadm) daemon.

    OSDs record their fsid in a per-daemon ceph_fsid file; for everything
    else (or when that file is unreadable) fall back to the cluster conf.
    """
    fsid = None
    if daemon_type == 'osd':
        fsid_file = os.path.join(ctx.data_dir, daemon_type,
                                 'ceph-%s' % daemon_id, 'ceph_fsid')
        if legacy_dir is not None:
            fsid_file = os.path.abspath(legacy_dir + fsid_file)
        try:
            with open(fsid_file, 'r') as f:
                fsid = f.read().strip()
        except IOError:
            pass
    return fsid or get_legacy_config_fsid(cluster, legacy_dir=legacy_dir)
2611
2612
def should_log_to_journald(ctx: CephadmContext) -> bool:
    """Log to journald when explicitly requested, otherwise by default only
    under a podman new enough to use cgroups=split."""
    if ctx.log_to_journald is not None:
        return ctx.log_to_journald
    engine = ctx.container_engine
    return isinstance(engine, Podman) and engine.version >= CGROUPS_SPLIT_PODMAN_VERSION
2618
2619
def get_daemon_args(ctx, fsid, daemon_type, daemon_id):
    # type: (CephadmContext, str, str, Union[int, str]) -> List[str]
    """Return the extra command-line arguments for one daemon's process,
    keyed off the daemon type (ceph daemons, monitoring stack, gateways,
    or custom containers)."""
    r = list()  # type: List[str]

    if daemon_type in Ceph.daemons and daemon_type not in ['crash', 'ceph-exporter']:
        # regular ceph daemons: drop privileges and route logs away from files
        r += [
            '--setuser', 'ceph',
            '--setgroup', 'ceph',
            '--default-log-to-file=false',
        ]
        log_to_journald = should_log_to_journald(ctx)
        if log_to_journald:
            r += [
                '--default-log-to-journald=true',
                '--default-log-to-stderr=false',
            ]
        else:
            r += [
                '--default-log-to-stderr=true',
                '--default-log-stderr-prefix=debug ',
            ]
        if daemon_type == 'mon':
            # the mon additionally has a cluster log with the same routing
            r += [
                '--default-mon-cluster-log-to-file=false',
            ]
            if log_to_journald:
                r += [
                    '--default-mon-cluster-log-to-journald=true',
                    '--default-mon-cluster-log-to-stderr=false',
                ]
            else:
                r += ['--default-mon-cluster-log-to-stderr=true']
    elif daemon_type in Monitoring.components:
        metadata = Monitoring.components[daemon_type]
        r += metadata.get('args', list())
        # set ip and port to bind to for nodeexporter,alertmanager,prometheus
        if daemon_type not in ['grafana', 'loki', 'promtail']:
            ip = ''
            port = Monitoring.port_map[daemon_type][0]
            # --meta-json may override the bind ip and port
            if 'meta_json' in ctx and ctx.meta_json:
                meta = json.loads(ctx.meta_json) or {}
                if 'ip' in meta and meta['ip']:
                    ip = meta['ip']
                if 'ports' in meta and meta['ports']:
                    port = meta['ports'][0]
            r += [f'--web.listen-address={ip}:{port}']
        if daemon_type == 'prometheus':
            config = get_parm(ctx.config_json)
            retention_time = config.get('retention_time', '15d')
            retention_size = config.get('retention_size', '0')  # default to disabled
            r += [f'--storage.tsdb.retention.time={retention_time}']
            r += [f'--storage.tsdb.retention.size={retention_size}']
            scheme = 'http'
            host = get_fqdn()
            r += [f'--web.external-url={scheme}://{host}:{port}']
        if daemon_type == 'alertmanager':
            config = get_parm(ctx.config_json)
            peers = config.get('peers', list())  # type: ignore
            for peer in peers:
                r += ['--cluster.peer={}'.format(peer)]
            # some alertmanager, by default, look elsewhere for a config
            r += ['--config.file=/etc/alertmanager/alertmanager.yml']
        if daemon_type == 'promtail':
            r += ['--config.expand-env']
        if daemon_type == 'node-exporter':
            # read host metrics through the bind-mounted host filesystems
            r += ['--path.procfs=/host/proc',
                  '--path.sysfs=/host/sys',
                  '--path.rootfs=/rootfs']
    elif daemon_type == NFSGanesha.daemon_type:
        nfs_ganesha = NFSGanesha.init(ctx, fsid, daemon_id)
        r += nfs_ganesha.get_daemon_args()
    elif daemon_type == CephExporter.daemon_type:
        ceph_exporter = CephExporter.init(ctx, fsid, daemon_id)
        r.extend(ceph_exporter.get_daemon_args())
    elif daemon_type == HAproxy.daemon_type:
        haproxy = HAproxy.init(ctx, fsid, daemon_id)
        r += haproxy.get_daemon_args()
    elif daemon_type == CustomContainer.daemon_type:
        cc = CustomContainer.init(ctx, fsid, daemon_id)
        r.extend(cc.get_daemon_args())
    elif daemon_type == SNMPGateway.daemon_type:
        sc = SNMPGateway.init(ctx, fsid, daemon_id)
        r.extend(sc.get_daemon_args())

    return r
2705
2706
def create_daemon_dirs(ctx, fsid, daemon_type, daemon_id, uid, gid,
                       config=None, keyring=None):
    # type: (CephadmContext, str, str, Union[int, str], int, int, Optional[str], Optional[str]) -> None
    """Create the on-disk layout for one daemon: its data dir, optional
    config/keyring files (mode 0600, owned uid:gid), and any
    daemon-type-specific directories and config files."""
    data_dir = make_data_dir(ctx, fsid, daemon_type, daemon_id, uid=uid, gid=gid)

    if daemon_type in Ceph.daemons:
        make_log_dir(ctx, fsid, uid=uid, gid=gid)

    if config:
        config_path = os.path.join(data_dir, 'config')
        with open(config_path, 'w') as f:
            os.fchown(f.fileno(), uid, gid)
            os.fchmod(f.fileno(), 0o600)
            f.write(config)

    if keyring:
        keyring_path = os.path.join(data_dir, 'keyring')
        with open(keyring_path, 'w') as f:
            os.fchmod(f.fileno(), 0o600)
            os.fchown(f.fileno(), uid, gid)
            f.write(keyring)

    if daemon_type in Monitoring.components.keys():
        config_json: Dict[str, Any] = dict()
        if 'config_json' in ctx:
            config_json = get_parm(ctx.config_json)

        # Set up directories specific to the monitoring component
        config_dir = ''
        data_dir_root = ''
        if daemon_type == 'prometheus':
            data_dir_root = get_data_dir(fsid, ctx.data_dir,
                                         daemon_type, daemon_id)
            config_dir = 'etc/prometheus'
            makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755)
            makedirs(os.path.join(data_dir_root, config_dir, 'alerting'), uid, gid, 0o755)
            makedirs(os.path.join(data_dir_root, 'data'), uid, gid, 0o755)
            recursive_chown(os.path.join(data_dir_root, 'etc'), uid, gid)
            recursive_chown(os.path.join(data_dir_root, 'data'), uid, gid)
        elif daemon_type == 'grafana':
            data_dir_root = get_data_dir(fsid, ctx.data_dir,
                                         daemon_type, daemon_id)
            config_dir = 'etc/grafana'
            makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755)
            makedirs(os.path.join(data_dir_root, config_dir, 'certs'), uid, gid, 0o755)
            makedirs(os.path.join(data_dir_root, config_dir, 'provisioning/datasources'), uid, gid, 0o755)
            makedirs(os.path.join(data_dir_root, 'data'), uid, gid, 0o755)
            # grafana needs its sqlite db file to exist and be writable
            touch(os.path.join(data_dir_root, 'data', 'grafana.db'), uid, gid)
        elif daemon_type == 'alertmanager':
            data_dir_root = get_data_dir(fsid, ctx.data_dir,
                                         daemon_type, daemon_id)
            config_dir = 'etc/alertmanager'
            makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755)
            makedirs(os.path.join(data_dir_root, config_dir, 'data'), uid, gid, 0o755)
        elif daemon_type == 'promtail':
            data_dir_root = get_data_dir(fsid, ctx.data_dir,
                                         daemon_type, daemon_id)
            config_dir = 'etc/promtail'
            makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755)
            makedirs(os.path.join(data_dir_root, 'data'), uid, gid, 0o755)
        elif daemon_type == 'loki':
            data_dir_root = get_data_dir(fsid, ctx.data_dir,
                                         daemon_type, daemon_id)
            config_dir = 'etc/loki'
            makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755)
            makedirs(os.path.join(data_dir_root, 'data'), uid, gid, 0o755)

        # populate the config directory for the component from the config-json
        if 'files' in config_json:
            for fname in config_json['files']:
                content = dict_get_join(config_json['files'], fname)
                if os.path.isabs(fname):
                    # absolute paths are written relative to the data dir root
                    fpath = os.path.join(data_dir_root, fname.lstrip(os.path.sep))
                else:
                    fpath = os.path.join(data_dir_root, config_dir, fname)
                with open(fpath, 'w', encoding='utf-8') as f:
                    os.fchown(f.fileno(), uid, gid)
                    os.fchmod(f.fileno(), 0o600)
                    f.write(content)

    elif daemon_type == NFSGanesha.daemon_type:
        nfs_ganesha = NFSGanesha.init(ctx, fsid, daemon_id)
        nfs_ganesha.create_daemon_dirs(data_dir, uid, gid)

    elif daemon_type == CephIscsi.daemon_type:
        ceph_iscsi = CephIscsi.init(ctx, fsid, daemon_id)
        ceph_iscsi.create_daemon_dirs(data_dir, uid, gid)

    elif daemon_type == HAproxy.daemon_type:
        haproxy = HAproxy.init(ctx, fsid, daemon_id)
        haproxy.create_daemon_dirs(data_dir, uid, gid)

    elif daemon_type == Keepalived.daemon_type:
        keepalived = Keepalived.init(ctx, fsid, daemon_id)
        keepalived.create_daemon_dirs(data_dir, uid, gid)

    elif daemon_type == CustomContainer.daemon_type:
        cc = CustomContainer.init(ctx, fsid, daemon_id)
        cc.create_daemon_dirs(data_dir, uid, gid)

    elif daemon_type == SNMPGateway.daemon_type:
        sg = SNMPGateway.init(ctx, fsid, daemon_id)
        sg.create_daemon_conf()

    # any custom config files from --config-json apply to every daemon type
    _write_custom_conf_files(ctx, daemon_type, str(daemon_id), fsid, uid, gid)
2812
2813
def _write_custom_conf_files(ctx: CephadmContext, daemon_type: str, daemon_id: str, fsid: str, uid: int, gid: int) -> None:
    """Write the custom config files from --config-json into the daemon's
    custom_config_files directory (mode 0600, owned uid:gid)."""
    # mostly its own function to make unit testing easier
    if 'config_json' not in ctx or not ctx.config_json:
        return
    config_json = get_custom_config_files(ctx.config_json)
    custom_config_dir = os.path.join(ctx.data_dir, fsid, 'custom_config_files',
                                     f'{daemon_type}.{daemon_id}')
    if not os.path.exists(custom_config_dir):
        makedirs(custom_config_dir, uid, gid, 0o755)
    for ccf in config_json['custom_config_files']:
        # entries missing either required field are silently skipped
        if not all(k in ccf for k in ('mount_path', 'content')):
            continue
        file_path = os.path.join(custom_config_dir, os.path.basename(ccf['mount_path']))
        with open(file_path, 'w+', encoding='utf-8') as f:
            os.fchown(f.fileno(), uid, gid)
            os.fchmod(f.fileno(), 0o600)
            f.write(ccf['content'])
2830
2831
def get_parm(option: str) -> Dict[str, str]:
    """Parse a config-json option into a dict, minus custom_config_files.

    custom_config_files is a special field used for mounting custom config
    files into daemon containers; it must be retrieved via
    get_custom_config_files() instead, so it is discarded here.
    """
    parsed = _get_config_json(option)
    parsed.pop('custom_config_files', None)
    return parsed
2840
2841
def get_custom_config_files(option: str) -> Dict[str, List[Dict[str, str]]]:
    """Extract just the custom_config_files entry from a config-json option,
    defaulting to an empty list when absent."""
    js = _get_config_json(option)
    return {'custom_config_files': js.get('custom_config_files', [])}
2848
2849
def _get_config_json(option: str) -> Dict[str, Any]:
    """Resolve a config-json *option* ('-' for stdin, inline JSON, or a
    file path) into a parsed dict. Raises Error on a missing file or
    invalid JSON; an empty option yields an empty dict."""
    if not option:
        return dict()

    global cached_stdin
    if option == '-':
        # stdin can only be read once per process; cache it so multiple
        # callers can each ask for '-'
        if cached_stdin is not None:
            j = cached_stdin
        else:
            j = sys.stdin.read()
            cached_stdin = j
    else:
        # inline json string
        if option[0] == '{' and option[-1] == '}':
            j = option
        # json file
        elif os.path.exists(option):
            with open(option, 'r') as f:
                j = f.read()
        else:
            raise Error('Config file {} not found'.format(option))

    try:
        js = json.loads(j)
    except ValueError as e:
        raise Error('Invalid JSON in {}: {}'.format(option, e))
    else:
        return js
2878
2879
def get_config_and_keyring(ctx):
    # type: (CephadmContext) -> Tuple[Optional[str], Optional[str]]
    """Gather the ceph.conf text and keyring text from ctx.

    Priority: both supplied via --config-json; otherwise the config comes
    from -c/--config and the keyring from --key (inline) or --keyring
    (file). Either result may be None.
    """
    config = keyring = None

    if 'config_json' in ctx and ctx.config_json:
        d = get_parm(ctx.config_json)
        config = d.get('config')
        keyring = d.get('keyring')
        # only short-circuit when the json supplied both pieces
        if config and keyring:
            return config, keyring

    if 'config' in ctx and ctx.config:
        try:
            with open(ctx.config, 'r') as f:
                config = f.read()
        except FileNotFoundError as e:
            raise Error(e)

    if 'key' in ctx and ctx.key:
        # synthesize a minimal keyring from the bare key
        keyring = '[%s]\n\tkey = %s\n' % (ctx.name, ctx.key)
    elif 'keyring' in ctx and ctx.keyring:
        try:
            with open(ctx.keyring, 'r') as f:
                keyring = f.read()
        except FileNotFoundError as e:
            raise Error(e)

    return config, keyring
2909
2910
def get_container_binds(ctx, fsid, daemon_type, daemon_id):
    # type: (CephadmContext, str, str, Union[int, str, None]) -> List[List[str]]
    """Return extra container bind-mount specs for the given daemon type."""
    if daemon_type == CephIscsi.daemon_type:
        return list(CephIscsi.get_container_binds())
    if daemon_type == CustomContainer.daemon_type:
        assert daemon_id
        cc = CustomContainer.init(ctx, fsid, daemon_id)
        data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
        return list(cc.get_container_binds(data_dir))
    return []
2924
2925
def get_container_mounts(ctx, fsid, daemon_type, daemon_id,
                         no_config=False):
    # type: (CephadmContext, str, str, Union[int, str, None], Optional[bool]) -> Dict[str, str]
    """Return the host-path -> container-path volume mounts for a daemon.

    Mount targets carry ':z' / ':Z' suffixes to request shared / private
    SELinux relabeling by the container engine.

    :param ctx: cephadm context (data_dir, log_dir, shared_ceph_folder, ...)
    :param fsid: cluster fsid (may be empty for fsid-less invocations)
    :param daemon_type: daemon type string (e.g. 'mon', 'osd', 'grafana')
    :param daemon_id: daemon id, or None when not daemon-specific
    :param no_config: if True, do not mount the config file at
        /etc/ceph/ceph.conf
    """
    mounts = dict()

    # cluster-wide run/log/crash dirs shared by all ceph daemons
    if daemon_type in Ceph.daemons:
        if fsid:
            run_path = os.path.join('/var/run/ceph', fsid)
            if os.path.exists(run_path):
                mounts[run_path] = '/var/run/ceph:z'
            log_dir = get_log_dir(fsid, ctx.log_dir)
            mounts[log_dir] = '/var/log/ceph:z'
            crash_dir = '/var/lib/ceph/%s/crash' % fsid
            if os.path.exists(crash_dir):
                mounts[crash_dir] = '/var/lib/ceph/crash:z'
            if daemon_type != 'crash' and should_log_to_journald(ctx):
                journald_sock_dir = '/run/systemd/journal'
                mounts[journald_sock_dir] = journald_sock_dir

    # per-daemon data dir, config and keyring
    if daemon_type in Ceph.daemons and daemon_id:
        data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
        if daemon_type == 'rgw':
            cdata_dir = '/var/lib/ceph/radosgw/ceph-rgw.%s' % (daemon_id)
        else:
            cdata_dir = '/var/lib/ceph/%s/ceph-%s' % (daemon_type, daemon_id)
        if daemon_type != 'crash':
            mounts[data_dir] = cdata_dir + ':z'
        if not no_config:
            mounts[data_dir + '/config'] = '/etc/ceph/ceph.conf:z'
        if daemon_type in ['rbd-mirror', 'cephfs-mirror', 'crash', 'ceph-exporter']:
            # these do not search for their keyrings in a data directory
            mounts[data_dir + '/keyring'] = '/etc/ceph/ceph.client.%s.%s.keyring' % (daemon_type, daemon_id)

    # device/system access needed by mon/osd/ceph-volume
    if daemon_type in ['mon', 'osd', 'clusterless-ceph-volume']:
        mounts['/dev'] = '/dev'  # FIXME: narrow this down?
        mounts['/run/udev'] = '/run/udev'
    if daemon_type in ['osd', 'clusterless-ceph-volume']:
        mounts['/sys'] = '/sys'  # for numa.cc, pick_address, cgroups, ...
        mounts['/run/lvm'] = '/run/lvm'
        mounts['/run/lock/lvm'] = '/run/lock/lvm'
    if daemon_type == 'osd':
        # selinux-policy in the container may not match the host.
        if HostFacts(ctx).selinux_enabled:
            selinux_folder = '/var/lib/ceph/%s/selinux' % fsid
            if not os.path.exists(selinux_folder):
                os.makedirs(selinux_folder, mode=0o755)
            mounts[selinux_folder] = '/sys/fs/selinux:ro'
        mounts['/'] = '/rootfs'

    # ctx may not define shared_ceph_folder for every command, hence the
    # AttributeError guard below
    try:
        if ctx.shared_ceph_folder:  # make easy manager modules/ceph-volume development
            ceph_folder = pathify(ctx.shared_ceph_folder)
            if os.path.exists(ceph_folder):
                mounts[ceph_folder + '/src/ceph-volume/ceph_volume'] = '/usr/lib/python3.6/site-packages/ceph_volume'
                mounts[ceph_folder + '/src/cephadm/cephadm'] = '/usr/sbin/cephadm'
                mounts[ceph_folder + '/src/pybind/mgr'] = '/usr/share/ceph/mgr'
                mounts[ceph_folder + '/src/python-common/ceph'] = '/usr/lib/python3.6/site-packages/ceph'
                mounts[ceph_folder + '/monitoring/ceph-mixin/dashboards_out'] = '/etc/grafana/dashboards/ceph-dashboard'
                mounts[ceph_folder + '/monitoring/ceph-mixin/prometheus_alerts.yml'] = '/etc/prometheus/ceph/ceph_default_alerts.yml'
            else:
                logger.error('{}{}{}'.format(termcolor.red,
                                             'Ceph shared source folder does not exist.',
                                             termcolor.end))
    except AttributeError:
        pass

    # monitoring stack: each component gets its own etc/data layout
    if daemon_type in Monitoring.components and daemon_id:
        data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
        log_dir = get_log_dir(fsid, ctx.log_dir)
        if daemon_type == 'prometheus':
            mounts[os.path.join(data_dir, 'etc/prometheus')] = '/etc/prometheus:Z'
            mounts[os.path.join(data_dir, 'data')] = '/prometheus:Z'
        elif daemon_type == 'loki':
            mounts[os.path.join(data_dir, 'etc/loki')] = '/etc/loki:Z'
            mounts[os.path.join(data_dir, 'data')] = '/loki:Z'
        elif daemon_type == 'promtail':
            mounts[os.path.join(data_dir, 'etc/promtail')] = '/etc/promtail:Z'
            mounts[log_dir] = '/var/log/ceph:z'
            mounts[os.path.join(data_dir, 'data')] = '/promtail:Z'
        elif daemon_type == 'node-exporter':
            mounts['/proc'] = '/host/proc:ro'
            mounts['/sys'] = '/host/sys:ro'
            mounts['/'] = '/rootfs:ro'
        elif daemon_type == 'grafana':
            mounts[os.path.join(data_dir, 'etc/grafana/grafana.ini')] = '/etc/grafana/grafana.ini:Z'
            mounts[os.path.join(data_dir, 'etc/grafana/provisioning/datasources')] = '/etc/grafana/provisioning/datasources:Z'
            mounts[os.path.join(data_dir, 'etc/grafana/certs')] = '/etc/grafana/certs:Z'
            mounts[os.path.join(data_dir, 'data/grafana.db')] = '/var/lib/grafana/grafana.db:Z'
        elif daemon_type == 'alertmanager':
            mounts[os.path.join(data_dir, 'etc/alertmanager')] = '/etc/alertmanager:Z'

    # gateway/sidecar daemon types delegate to their classes
    if daemon_type == NFSGanesha.daemon_type:
        assert daemon_id
        data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
        nfs_ganesha = NFSGanesha.init(ctx, fsid, daemon_id)
        mounts.update(nfs_ganesha.get_container_mounts(data_dir))

    if daemon_type == HAproxy.daemon_type:
        assert daemon_id
        data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
        mounts.update(HAproxy.get_container_mounts(data_dir))

    if daemon_type == CephIscsi.daemon_type:
        assert daemon_id
        data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
        log_dir = get_log_dir(fsid, ctx.log_dir)
        mounts.update(CephIscsi.get_container_mounts(data_dir, log_dir))

    if daemon_type == Keepalived.daemon_type:
        assert daemon_id
        data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
        mounts.update(Keepalived.get_container_mounts(data_dir))

    if daemon_type == CustomContainer.daemon_type:
        assert daemon_id
        cc = CustomContainer.init(ctx, fsid, daemon_id)
        data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
        mounts.update(cc.get_container_mounts(data_dir))

    return mounts
3046
3047
def get_ceph_volume_container(ctx: CephadmContext,
                              privileged: bool = True,
                              cname: str = '',
                              volume_mounts: Optional[Dict[str, str]] = None,
                              bind_mounts: Optional[List[List[str]]] = None,
                              args: Optional[List[str]] = None,
                              envs: Optional[List[str]] = None) -> 'CephContainer':
    """Build a CephContainer primed to run ceph-volume.

    :param ctx: cephadm context (supplies image and memory request/limit)
    :param privileged: run privileged (default True; ceph-volume needs
        device access)
    :param cname: container name
    :param volume_mounts: host-path -> container-path mounts
    :param bind_mounts: raw bind-mount argument lists
    :param args: arguments passed to the ceph-volume entrypoint
    :param envs: extra environment variables; the caller's list is copied,
        never mutated
    """
    # Fix mutable-default-argument bug: the previous `{}` / `[]` defaults
    # were shared across all calls. Normalize None here instead; callers
    # that passed these keywords explicitly see identical behavior.
    volume_mounts = volume_mounts if volume_mounts is not None else {}
    args = args if args is not None else []
    # Copy so we never append to a caller-owned list.
    envs = list(envs) if envs is not None else []
    envs.append('CEPH_VOLUME_SKIP_RESTORECON=yes')
    envs.append('CEPH_VOLUME_DEBUG=1')

    return CephContainer(
        ctx,
        image=ctx.image,
        entrypoint='/usr/sbin/ceph-volume',
        args=args,
        volume_mounts=volume_mounts,
        bind_mounts=bind_mounts,
        envs=envs,
        privileged=privileged,
        cname=cname,
        memory_request=ctx.memory_request,
        memory_limit=ctx.memory_limit,
    )
3073
3074
def set_pids_limit_unlimited(ctx: CephadmContext, container_args: List[str]) -> None:
    """Append a pids-limit flag that lifts the engine's default cap.

    The default pids-limit (Docker 4096 / Podman 2048) can be too low for
    daemons like iscsi (it caps the number of luns per target) or rgw
    (a rgw_thread_pool_size near the limit can crash the container).
    """
    engine = ctx.container_engine
    supports_minus_one = (
        isinstance(engine, Podman)
        and engine.version >= PIDS_LIMIT_UNLIMITED_PODMAN_VERSION
    )
    # newer podman spells "unlimited" as -1; older engines use 0
    flag = '--pids-limit=-1' if supports_minus_one else '--pids-limit=0'
    container_args.append(flag)
3087
3088
def get_container(ctx: CephadmContext,
                  fsid: str, daemon_type: str, daemon_id: Union[int, str],
                  privileged: bool = False,
                  ptrace: bool = False,
                  container_args: Optional[List[str]] = None) -> 'CephContainer':
    """Build the CephContainer that runs one daemon of the cluster.

    Chooses the entrypoint, container name, environment, extra engine
    arguments and privilege level per daemon type, then delegates to
    CephContainer.for_daemon with the computed volume/bind mounts.

    :param ctx: cephadm context (container engine, image, cli flags)
    :param fsid: cluster fsid
    :param daemon_type: e.g. 'mon', 'osd', 'rgw', a monitoring component, ...
    :param daemon_id: daemon identifier within its type
    :param privileged: request a privileged container (forced True below
        for mon/osd/iscsi regardless of this argument)
    :param ptrace: allow ptrace inside the container (debugging)
    :param container_args: extra args for the engine's run command; a new
        list is created when None
    """
    entrypoint: str = ''
    name: str = ''
    ceph_args: List[str] = []
    envs: List[str] = []
    host_network: bool = True

    if daemon_type in Ceph.daemons:
        envs.append('TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES=134217728')
    if container_args is None:
        container_args = []
    if daemon_type in Ceph.daemons or daemon_type in Ceph.gateways:
        set_pids_limit_unlimited(ctx, container_args)
    if daemon_type in ['mon', 'osd']:
        # mon and osd need privileged in order for libudev to query devices
        privileged = True
    if daemon_type == 'rgw':
        entrypoint = '/usr/bin/radosgw'
        name = 'client.rgw.%s' % daemon_id
    elif daemon_type == 'rbd-mirror':
        entrypoint = '/usr/bin/rbd-mirror'
        name = 'client.rbd-mirror.%s' % daemon_id
    elif daemon_type == 'cephfs-mirror':
        entrypoint = '/usr/bin/cephfs-mirror'
        name = 'client.cephfs-mirror.%s' % daemon_id
    elif daemon_type == 'crash':
        entrypoint = '/usr/bin/ceph-crash'
        name = 'client.crash.%s' % daemon_id
    elif daemon_type in ['mon', 'mgr', 'mds', 'osd']:
        entrypoint = '/usr/bin/ceph-' + daemon_type
        name = '%s.%s' % (daemon_type, daemon_id)
    elif daemon_type in Monitoring.components:
        entrypoint = ''
    elif daemon_type == NFSGanesha.daemon_type:
        entrypoint = NFSGanesha.entrypoint
        name = '%s.%s' % (daemon_type, daemon_id)
        envs.extend(NFSGanesha.get_container_envs())
    elif daemon_type == CephExporter.daemon_type:
        entrypoint = CephExporter.entrypoint
        name = 'client.ceph-exporter.%s' % daemon_id
    elif daemon_type == HAproxy.daemon_type:
        name = '%s.%s' % (daemon_type, daemon_id)
        container_args.extend(['--user=root'])  # haproxy 2.4 defaults to a different user
    elif daemon_type == Keepalived.daemon_type:
        name = '%s.%s' % (daemon_type, daemon_id)
        envs.extend(Keepalived.get_container_envs())
        container_args.extend(['--cap-add=NET_ADMIN', '--cap-add=NET_RAW'])
    elif daemon_type == CephIscsi.daemon_type:
        entrypoint = CephIscsi.entrypoint
        name = '%s.%s' % (daemon_type, daemon_id)
        # So the container can modprobe iscsi_target_mod and have write perms
        # to configfs we need to make this a privileged container.
        privileged = True
    elif daemon_type == CustomContainer.daemon_type:
        cc = CustomContainer.init(ctx, fsid, daemon_id)
        entrypoint = cc.entrypoint
        host_network = False
        envs.extend(cc.get_container_envs())
        container_args.extend(cc.get_container_args())

    if daemon_type in Monitoring.components:
        uid, gid = extract_uid_gid_monitoring(ctx, daemon_type)
        monitoring_args = [
            '--user',
            str(uid),
            # FIXME: disable cpu/memory limits for the time being (not supported
            # by ubuntu 18.04 kernel!)
        ]
        container_args.extend(monitoring_args)
        if daemon_type == 'node-exporter':
            # in order to support setting '--path.procfs=/host/proc','--path.sysfs=/host/sys',
            # '--path.rootfs=/rootfs' for node-exporter we need to disable selinux separation
            # between the node-exporter container and the host to avoid selinux denials
            container_args.extend(['--security-opt', 'label=disable'])
    elif daemon_type == 'crash':
        ceph_args = ['-n', name]
    elif daemon_type in Ceph.daemons:
        ceph_args = ['-n', name, '-f']
    elif daemon_type == SNMPGateway.daemon_type:
        sg = SNMPGateway.init(ctx, fsid, daemon_id)
        container_args.append(
            f'--env-file={sg.conf_file_path}'
        )

    # if using podman, set -d, --conmon-pidfile & --cidfile flags
    # so service can have Type=Forking
    if isinstance(ctx.container_engine, Podman):
        runtime_dir = '/run'
        container_args.extend([
            '-d', '--log-driver', 'journald',
            '--conmon-pidfile',
            runtime_dir + '/ceph-%s@%s.%s.service-pid' % (fsid, daemon_type, daemon_id),
            '--cidfile',
            runtime_dir + '/ceph-%s@%s.%s.service-cid' % (fsid, daemon_type, daemon_id),
        ])
        if ctx.container_engine.version >= CGROUPS_SPLIT_PODMAN_VERSION and not ctx.no_cgroups_split:
            container_args.append('--cgroups=split')

    return CephContainer.for_daemon(
        ctx,
        fsid=fsid,
        daemon_type=daemon_type,
        daemon_id=str(daemon_id),
        entrypoint=entrypoint,
        args=ceph_args + get_daemon_args(ctx, fsid, daemon_type, daemon_id),
        container_args=container_args,
        volume_mounts=get_container_mounts(ctx, fsid, daemon_type, daemon_id),
        bind_mounts=get_container_binds(ctx, fsid, daemon_type, daemon_id),
        envs=envs,
        privileged=privileged,
        ptrace=ptrace,
        host_network=host_network,
    )
3206
3207
def extract_uid_gid(ctx, img='', file_path='/var/lib/ceph'):
    # type: (CephadmContext, str, Union[str, List[str]]) -> Tuple[int, int]
    """Return the (uid, gid) that owns *file_path* inside the given image.

    file_path may be a single path or a list of candidate paths; the
    first one that `stat` succeeds on wins.  Raises Error when every
    candidate fails, and RuntimeError when no candidates were given.
    """
    img = img or ctx.image
    paths = [file_path] if isinstance(file_path, str) else file_path

    last_failure = None  # type: Optional[Tuple[str, RuntimeError]]
    for candidate in paths:
        try:
            out = CephContainer(
                ctx,
                image=img,
                entrypoint='stat',
                args=['-c', '%u %g', candidate]
            ).run(verbosity=CallVerbosity.QUIET_UNLESS_ERROR)
        except RuntimeError as e:
            # remember the most recent failure for the error message
            last_failure = (candidate, e)
            continue
        uid, gid = out.split(' ')
        return int(uid), int(gid)

    if last_failure:
        raise Error(f'Failed to extract uid/gid for path {last_failure[0]}: {last_failure[1]}')

    raise RuntimeError('uid/gid not found')
3237
3238
def deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid,
                  config=None, keyring=None,
                  osd_fsid=None,
                  reconfig=False,
                  ports=None):
    # type: (CephadmContext, str, str, Union[int, str], Optional[CephContainer], int, int, Optional[str], Optional[str], Optional[str], Optional[bool], Optional[List[int]]) -> None
    """Create (or reconfigure) a daemon's data dir and deploy its unit files.

    Fails early if a required TCP port is already in use (warning only for
    mgr).  For a brand-new mon, runs `ceph-mon --mkfs` with temporary
    config/keyring files; otherwise just creates dirs/config/keyring.
    Unless reconfig is set, the systemd units are (re)deployed; afterwards
    firewalld is updated, required ports are opened, and on reconfig
    non-ceph daemons are restarted to pick up the new config.
    """

    ports = ports or []
    if any([port_in_use(ctx, port) for port in ports]):
        if daemon_type == 'mgr':
            # non-fatal for mgr when we are in mgr_standby_modules=false, but we can't
            # tell whether that is the case here.
            logger.warning(
                f"ceph-mgr TCP port(s) {','.join(map(str, ports))} already in use"
            )
        else:
            raise Error("TCP Port(s) '{}' required for {} already in use".format(','.join(map(str, ports)), daemon_type))

    data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
    if reconfig and not os.path.exists(data_dir):
        raise Error('cannot reconfig, data path %s does not exist' % data_dir)
    if daemon_type == 'mon' and not os.path.exists(data_dir):
        # first-time mon deployment: mkfs the mon store
        assert config
        assert keyring
        # tmp keyring file
        tmp_keyring = write_tmp(keyring, uid, gid)

        # tmp config file
        tmp_config = write_tmp(config, uid, gid)

        # --mkfs
        create_daemon_dirs(ctx, fsid, daemon_type, daemon_id, uid, gid)
        mon_dir = get_data_dir(fsid, ctx.data_dir, 'mon', daemon_id)
        log_dir = get_log_dir(fsid, ctx.log_dir)
        CephContainer(
            ctx,
            image=ctx.image,
            entrypoint='/usr/bin/ceph-mon',
            args=[
                '--mkfs',
                '-i', str(daemon_id),
                '--fsid', fsid,
                '-c', '/tmp/config',
                '--keyring', '/tmp/keyring',
            ] + get_daemon_args(ctx, fsid, 'mon', daemon_id),
            volume_mounts={
                log_dir: '/var/log/ceph:z',
                mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % (daemon_id),
                tmp_keyring.name: '/tmp/keyring:z',
                tmp_config.name: '/tmp/config:z',
            },
        ).run()

        # write conf
        with open(mon_dir + '/config', 'w') as f:
            os.fchown(f.fileno(), uid, gid)
            os.fchmod(f.fileno(), 0o600)
            f.write(config)
    else:
        # dirs, conf, keyring
        create_daemon_dirs(
            ctx,
            fsid, daemon_type, daemon_id,
            uid, gid,
            config, keyring)

    if not reconfig:
        if daemon_type == CephadmAgent.daemon_type:
            # the agent has its own unit deployment path (no container)
            if ctx.config_json == '-':
                config_js = get_parm('-')
            else:
                config_js = get_parm(ctx.config_json)
            assert isinstance(config_js, dict)

            cephadm_agent = CephadmAgent(ctx, fsid, daemon_id)
            cephadm_agent.deploy_daemon_unit(config_js)
        else:
            if c:
                deploy_daemon_units(ctx, fsid, uid, gid, daemon_type, daemon_id,
                                    c, osd_fsid=osd_fsid, ports=ports)
            else:
                raise RuntimeError('attempting to deploy a daemon without a container image')

    # marker files: creation time and last-configured time of the deployment
    if not os.path.exists(data_dir + '/unit.created'):
        with open(data_dir + '/unit.created', 'w') as f:
            os.fchmod(f.fileno(), 0o600)
            os.fchown(f.fileno(), uid, gid)
            f.write('mtime is time the daemon deployment was created\n')

    with open(data_dir + '/unit.configured', 'w') as f:
        f.write('mtime is time we were last configured\n')
        os.fchmod(f.fileno(), 0o600)
        os.fchown(f.fileno(), uid, gid)

    update_firewalld(ctx, daemon_type)

    # Open ports explicitly required for the daemon
    if ports:
        fw = Firewalld(ctx)
        fw.open_ports(ports + fw.external_ports.get(daemon_type, []))
        fw.apply_rules()

    if reconfig and daemon_type not in Ceph.daemons:
        # ceph daemons do not need a restart; others (presumably) do to pick
        # up the new config
        call_throws(ctx, ['systemctl', 'reset-failed',
                          get_unit_name(fsid, daemon_type, daemon_id)])
        call_throws(ctx, ['systemctl', 'restart',
                          get_unit_name(fsid, daemon_type, daemon_id)])
3348
3349
def _write_container_cmd_to_bash(ctx, file_obj, container, comment=None, background=False):
    # type: (CephadmContext, IO[str], CephContainer, Optional[str], Optional[bool]) -> None
    """Emit the shell lines that clean up and then run *container*."""

    def write_ignoring_failure(cmd, quote=False):
        # type: (List[str], bool) -> None
        # '!' prefix so a failing cleanup command does not abort `set -e`
        words = [shlex.quote(a) for a in cmd] if quote else cmd
        file_obj.write('! ' + ' '.join(words) + ' 2> /dev/null\n')

    if comment:
        # Sometimes adding a comment, especially if there are multiple containers in one
        # unit file, makes it easier to read and grok.
        file_obj.write('# ' + comment + '\n')

    # Sometimes, adding `--rm` to a run_cmd doesn't work. Let's remove the container manually
    write_ignoring_failure(container.rm_cmd(old_cname=True))
    write_ignoring_failure(container.rm_cmd())
    # Sometimes, `podman rm` doesn't find the container. Then you'll have to add `--storage`
    if isinstance(ctx.container_engine, Podman):
        write_ignoring_failure(container.rm_cmd(storage=True), quote=True)
        write_ignoring_failure(container.rm_cmd(old_cname=True, storage=True), quote=True)

    # container run command
    run_line = ' '.join(shlex.quote(a) for a in container.run_cmd())
    if background:
        run_line += ' &'
    file_obj.write(run_line + '\n')
3374
3375
def clean_cgroup(ctx: CephadmContext, fsid: str, unit_name: str) -> None:
    """Remove leftover cgroup directories of a previously stopped unit.

    systemd may fail to clean up cgroups from a stopped unit, which makes
    the next "systemctl start" fail.
    See https://tracker.ceph.com/issues/50998
    """
    cgroup_root = Path('/sys/fs/cgroup')
    if not (cgroup_root / 'system.slice').exists():
        # Only unified cgroup is affected, skip if not the case
        return

    slice_name = 'system-ceph\\x2d{}.slice'.format(fsid.replace('-', '\\x2d'))
    unit_cg = cgroup_root / 'system.slice' / slice_name / f'{unit_name}.service'
    if not unit_cg.exists():
        return

    def remove_tree(node: Path) -> None:
        # depth-first rmdir; cgroup dirs contain only sub-cgroup dirs
        for child in node.iterdir():
            if child.is_dir():
                remove_tree(child)
        node.rmdir()

    try:
        remove_tree(unit_cg)
    except OSError:
        logger.warning(f'Failed to trim old cgroups {unit_cg}')
3399
3400
def deploy_daemon_units(
    ctx: CephadmContext,
    fsid: str,
    uid: int,
    gid: int,
    daemon_type: str,
    daemon_id: Union[int, str],
    c: 'CephContainer',
    enable: bool = True,
    start: bool = True,
    osd_fsid: Optional[str] = None,
    ports: Optional[List[int]] = None,
) -> None:
    """Write the unit.{run,meta,poststop,stop,image} files and systemd unit.

    Generates the shell scripts the systemd unit invokes (including any
    pre-start steps such as ceph-volume activate for OSDs or configfs
    mounting for iscsi), installs sysctl settings and the base ceph
    targets, then enables/starts the per-daemon service as requested.
    Files are written to a '.new' path and renamed into place so partial
    writes are never picked up.
    """
    # cmd

    def add_stop_actions(f: TextIO) -> None:
        # following generated script basically checks if the container exists
        # before stopping it. Exit code will be success either if it doesn't
        # exist or if it exists and is stopped successfully.
        container_exists = f'{ctx.container_engine.path} inspect %s &>/dev/null'
        f.write(f'! {container_exists % c.old_cname} || {" ".join(c.stop_cmd(old_cname=True))} \n')
        f.write(f'! {container_exists % c.cname} || {" ".join(c.stop_cmd())} \n')

    data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
    with open(data_dir + '/unit.run.new', 'w') as f, \
            open(data_dir + '/unit.meta.new', 'w') as metaf:
        f.write('set -e\n')

        if daemon_type in Ceph.daemons:
            # make sure the per-cluster run dir exists with the right ownership
            install_path = find_program('install')
            f.write('{install_path} -d -m0770 -o {uid} -g {gid} /var/run/ceph/{fsid}\n'.format(install_path=install_path, fsid=fsid, uid=uid, gid=gid))

        # pre-start cmd(s)
        if daemon_type == 'osd':
            # osds have a pre-start step
            assert osd_fsid
            simple_fn = os.path.join('/etc/ceph/osd',
                                     '%s-%s.json.adopted-by-cephadm' % (daemon_id, osd_fsid))
            if os.path.exists(simple_fn):
                f.write('# Simple OSDs need chown on startup:\n')
                for n in ['block', 'block.db', 'block.wal']:
                    p = os.path.join(data_dir, n)
                    f.write('[ ! -L {p} ] || chown {uid}:{gid} {p}\n'.format(p=p, uid=uid, gid=gid))
            else:
                # if ceph-volume does not support 'ceph-volume activate', we must
                # do 'ceph-volume lvm activate'.
                test_cv = get_ceph_volume_container(
                    ctx,
                    args=['activate', '--bad-option'],
                    volume_mounts=get_container_mounts(ctx, fsid, daemon_type, daemon_id),
                    bind_mounts=get_container_binds(ctx, fsid, daemon_type, daemon_id),
                    cname='ceph-%s-%s.%s-activate-test' % (fsid, daemon_type, daemon_id),
                )
                out, err, ret = call(ctx, test_cv.run_cmd(), verbosity=CallVerbosity.SILENT)
                # bad: ceph-volume: error: unrecognized arguments: activate --bad-option
                # good: ceph-volume: error: unrecognized arguments: --bad-option
                if 'unrecognized arguments: activate' in err:
                    # older ceph-volume without top-level activate or --no-tmpfs
                    cmd = [
                        'lvm', 'activate',
                        str(daemon_id), osd_fsid,
                        '--no-systemd',
                    ]
                else:
                    cmd = [
                        'activate',
                        '--osd-id', str(daemon_id),
                        '--osd-uuid', osd_fsid,
                        '--no-systemd',
                        '--no-tmpfs',
                    ]

                prestart = get_ceph_volume_container(
                    ctx,
                    args=cmd,
                    volume_mounts=get_container_mounts(ctx, fsid, daemon_type, daemon_id),
                    bind_mounts=get_container_binds(ctx, fsid, daemon_type, daemon_id),
                    cname='ceph-%s-%s.%s-activate' % (fsid, daemon_type, daemon_id),
                )
                _write_container_cmd_to_bash(ctx, f, prestart, 'LVM OSDs use ceph-volume lvm activate')
        elif daemon_type == CephIscsi.daemon_type:
            # iscsi needs configfs mounted and a tcmu-runner sidecar running
            f.write(' '.join(CephIscsi.configfs_mount_umount(data_dir, mount=True)) + '\n')
            ceph_iscsi = CephIscsi.init(ctx, fsid, daemon_id)
            tcmu_container = ceph_iscsi.get_tcmu_runner_container()
            _write_container_cmd_to_bash(ctx, f, tcmu_container, 'iscsi tcmu-runner container', background=True)

        _write_container_cmd_to_bash(ctx, f, c, '%s.%s' % (daemon_type, str(daemon_id)))

        # some metadata about the deploy
        meta: Dict[str, Any] = {}
        if 'meta_json' in ctx and ctx.meta_json:
            meta = json.loads(ctx.meta_json) or {}
        meta.update({
            'memory_request': int(ctx.memory_request) if ctx.memory_request else None,
            'memory_limit': int(ctx.memory_limit) if ctx.memory_limit else None,
        })
        if not meta.get('ports'):
            meta['ports'] = ports
        metaf.write(json.dumps(meta, indent=4) + '\n')

        os.fchmod(f.fileno(), 0o600)
        os.fchmod(metaf.fileno(), 0o600)
        os.rename(data_dir + '/unit.run.new',
                  data_dir + '/unit.run')
        os.rename(data_dir + '/unit.meta.new',
                  data_dir + '/unit.meta')

    # post-stop command(s)
    with open(data_dir + '/unit.poststop.new', 'w') as f:
        # this is a fallback to eventually stop any underlying container that was not stopped properly by unit.stop,
        # this could happen in very slow setups as described in the issue https://tracker.ceph.com/issues/58242.
        add_stop_actions(f)
        if daemon_type == 'osd':
            assert osd_fsid
            poststop = get_ceph_volume_container(
                ctx,
                args=[
                    'lvm', 'deactivate',
                    str(daemon_id), osd_fsid,
                ],
                volume_mounts=get_container_mounts(ctx, fsid, daemon_type, daemon_id),
                bind_mounts=get_container_binds(ctx, fsid, daemon_type, daemon_id),
                cname='ceph-%s-%s.%s-deactivate' % (fsid, daemon_type,
                                                    daemon_id),
            )
            _write_container_cmd_to_bash(ctx, f, poststop, 'deactivate osd')
        elif daemon_type == CephIscsi.daemon_type:
            # make sure we also stop the tcmu container
            ceph_iscsi = CephIscsi.init(ctx, fsid, daemon_id)
            tcmu_container = ceph_iscsi.get_tcmu_runner_container()
            f.write('! ' + ' '.join(tcmu_container.stop_cmd()) + '\n')
            f.write(' '.join(CephIscsi.configfs_mount_umount(data_dir, mount=False)) + '\n')
        os.fchmod(f.fileno(), 0o600)
        os.rename(data_dir + '/unit.poststop.new',
                  data_dir + '/unit.poststop')

    # stop command(s)
    with open(data_dir + '/unit.stop.new', 'w') as f:
        add_stop_actions(f)
        os.fchmod(f.fileno(), 0o600)
        os.rename(data_dir + '/unit.stop.new',
                  data_dir + '/unit.stop')

    if c:
        # record the image this daemon was deployed from
        with open(data_dir + '/unit.image.new', 'w') as f:
            f.write(c.image + '\n')
            os.fchmod(f.fileno(), 0o600)
            os.rename(data_dir + '/unit.image.new',
                      data_dir + '/unit.image')

    # sysctl
    install_sysctl(ctx, fsid, daemon_type)

    # systemd
    install_base_units(ctx, fsid)
    unit = get_unit_file(ctx, fsid)
    unit_file = 'ceph-%s@.service' % (fsid)
    with open(ctx.unit_dir + '/' + unit_file + '.new', 'w') as f:
        f.write(unit)
        os.rename(ctx.unit_dir + '/' + unit_file + '.new',
                  ctx.unit_dir + '/' + unit_file)
    call_throws(ctx, ['systemctl', 'daemon-reload'])

    unit_name = get_unit_name(fsid, daemon_type, daemon_id)
    call(ctx, ['systemctl', 'stop', unit_name],
         verbosity=CallVerbosity.DEBUG)
    call(ctx, ['systemctl', 'reset-failed', unit_name],
         verbosity=CallVerbosity.DEBUG)
    if enable:
        call_throws(ctx, ['systemctl', 'enable', unit_name])
    if start:
        clean_cgroup(ctx, fsid, unit_name)
        call_throws(ctx, ['systemctl', 'start', unit_name])
3574
3575
class Firewalld(object):
    """Thin wrapper around `firewall-cmd` for managing services and ports.

    Every operation degrades to a no-op (with a debug log) when firewalld
    is not installed, not enabled, or not running on the host.
    """

    # for specifying ports we should always open when opening
    # ports for a daemon of that type. Main use case is for ports
    # that we should open when deploying the daemon type but that
    # the daemon itself may not necessarily need to bind to the port.
    # This needs to be handed differently as we don't want to fail
    # deployment if the port cannot be bound to but we still want to
    # open the port in the firewall.
    external_ports: Dict[str, List[int]] = {
        'iscsi': [3260]  # 3260 is the well known iSCSI port
    }

    def __init__(self, ctx):
        # type: (CephadmContext) -> None
        self.ctx = ctx
        # checked once at construction; guards every other method
        self.available = self.check()

    def check(self):
        # type: () -> bool
        """Return True if firewall-cmd exists and firewalld.service is
        enabled and running.  Also caches the firewall-cmd path on
        self.cmd for later calls."""
        self.cmd = find_executable('firewall-cmd')
        if not self.cmd:
            logger.debug('firewalld does not appear to be present')
            return False
        (enabled, state, _) = check_unit(self.ctx, 'firewalld.service')
        if not enabled:
            logger.debug('firewalld.service is not enabled')
            return False
        if state != 'running':
            logger.debug('firewalld.service is not running')
            return False

        logger.info('firewalld ready')
        return True

    def enable_service_for(self, daemon_type):
        # type: (str) -> None
        """Permanently enable the firewalld service mapped to daemon_type
        (ceph-mon / ceph / nfs); no-op for unmapped types or when
        firewalld is unavailable."""
        if not self.available:
            logger.debug('Not possible to enable service <%s>. firewalld.service is not available' % daemon_type)
            return

        if daemon_type == 'mon':
            svc = 'ceph-mon'
        elif daemon_type in ['mgr', 'mds', 'osd']:
            svc = 'ceph'
        elif daemon_type == NFSGanesha.daemon_type:
            svc = 'nfs'
        else:
            return

        if not self.cmd:
            raise RuntimeError('command not defined')

        # query first so we only add the service when it is missing
        out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--query-service', svc], verbosity=CallVerbosity.DEBUG)
        if ret:
            logger.info('Enabling firewalld service %s in current zone...' % svc)
            out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--add-service', svc])
            if ret:
                raise RuntimeError(
                    'unable to add service %s to current zone: %s' % (svc, err))
        else:
            logger.debug('firewalld service %s is enabled in current zone' % svc)

    def open_ports(self, fw_ports):
        # type: (List[int]) -> None
        """Permanently open the given TCP ports (idempotent); no-op when
        firewalld is unavailable."""
        if not self.available:
            logger.debug('Not possible to open ports <%s>. firewalld.service is not available' % fw_ports)
            return

        if not self.cmd:
            raise RuntimeError('command not defined')

        for port in fw_ports:
            tcp_port = str(port) + '/tcp'
            out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--query-port', tcp_port], verbosity=CallVerbosity.DEBUG)
            if ret:
                logger.info('Enabling firewalld port %s in current zone...' % tcp_port)
                out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--add-port', tcp_port])
                if ret:
                    raise RuntimeError('unable to add port %s to current zone: %s' %
                                       (tcp_port, err))
            else:
                logger.debug('firewalld port %s is enabled in current zone' % tcp_port)

    def close_ports(self, fw_ports):
        # type: (List[int]) -> None
        """Permanently close the given TCP ports (idempotent); no-op when
        firewalld is unavailable."""
        if not self.available:
            logger.debug('Not possible to close ports <%s>. firewalld.service is not available' % fw_ports)
            return

        if not self.cmd:
            raise RuntimeError('command not defined')

        for port in fw_ports:
            tcp_port = str(port) + '/tcp'
            out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--query-port', tcp_port], verbosity=CallVerbosity.DEBUG)
            if not ret:
                logger.info('Disabling port %s in current zone...' % tcp_port)
                out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--remove-port', tcp_port])
                if ret:
                    raise RuntimeError('unable to remove port %s from current zone: %s' %
                                       (tcp_port, err))
                else:
                    logger.info(f'Port {tcp_port} disabled')
            else:
                logger.info(f'firewalld port {tcp_port} already closed')

    def apply_rules(self):
        # type: () -> None
        """Reload firewalld so permanent changes take effect; no-op when
        firewalld is unavailable."""
        if not self.available:
            return

        if not self.cmd:
            raise RuntimeError('command not defined')

        call_throws(self.ctx, [self.cmd, '--reload'])
3692
3693
def update_firewalld(ctx, daemon_type):
    # type: (CephadmContext, str) -> None
    """Enable the firewalld service for daemon_type unless --skip-firewalld."""
    skip_requested = 'skip_firewalld' in ctx and ctx.skip_firewalld
    if skip_requested:
        return
    firewall = Firewalld(ctx)
    firewall.enable_service_for(daemon_type)
    firewall.apply_rules()
3700
3701
def install_sysctl(ctx: CephadmContext, fsid: str, daemon_type: str) -> None:
    """Write the daemon type's sysctl settings (if any) and apply them.

    Produces /<sysctl_dir>/90-ceph-<fsid>-<daemon_type>.conf and runs
    `sysctl --system`; no-op for daemon types without settings.
    """
    settings: Optional[List] = None
    if daemon_type == 'osd':
        settings = OSD.get_sysctl_settings()
    elif daemon_type == 'haproxy':
        settings = HAproxy.get_sysctl_settings()
    elif daemon_type == 'keepalived':
        settings = Keepalived.get_sysctl_settings()

    if not settings:
        # nothing to apply for this daemon type
        return

    sysctl_dir = Path(ctx.sysctl_dir)
    sysctl_dir.mkdir(mode=0o755, exist_ok=True)
    conf = sysctl_dir.joinpath(f'90-ceph-{fsid}-{daemon_type}.conf')
    content = '\n'.join(['# created by cephadm', '', *settings, ''])
    with open(conf, 'w') as f:
        f.write(content)
    # make the kernel pick up the new settings
    call_throws(ctx, ['sysctl', '--system'])
3731
3732
def migrate_sysctl_dir(ctx: CephadmContext, fsid: str) -> None:
    """
    Cephadm once used '/usr/lib/sysctl.d' for storing sysctl configuration.
    This moves it to '/etc/sysctl.d'.
    """
    old_dir: str = '/usr/lib/sysctl.d'
    old_confs: List[str] = glob(f'{old_dir}/90-ceph-{fsid}-*.conf')
    if not old_confs:
        return

    remaining: int = len(old_confs)
    logger.info(f'Found sysctl {remaining} files in deprecated location {old_dir}. Starting Migration.')
    for conf in old_confs:
        try:
            shutil.move(conf, ctx.sysctl_dir)
            remaining -= 1
        except shutil.Error as err:
            if not str(err).endswith('already exists'):
                logger.warning(f'Could not move {conf} from {old_dir} to {ctx.sysctl_dir}: {err}')
                continue
            # destination already holds this file; just drop the old copy
            logger.warning(f'Destination file already exists. Deleting {conf}.')
            try:
                os.unlink(conf)
                remaining -= 1
            except OSError as del_err:
                logger.warning(f'Could not remove {conf}: {del_err}.')

    # Log successful migration
    if remaining == 0:
        logger.info(f'Successfully migrated sysctl config to {ctx.sysctl_dir}.')
        return

    # Log partially successful / unsuccessful migration
    total: int = len(old_confs)
    if remaining < total:
        status: str = f'partially successful (failed {remaining}/{total})'
    else:
        status = 'unsuccessful'
    logger.warning(f'Migration of sysctl configuration {status}. You may want to perform a migration manually.')
3772
3773
def install_base_units(ctx, fsid):
    # type: (CephadmContext, str) -> None
    """
    Set up ceph.target and ceph-$fsid.target units.

    Both unit files are written atomically (write to a ``.new`` file,
    then rename).  On first installation the targets are also enabled
    and started via systemctl.  Finally a logrotate config for the
    cluster is dropped, unless one already exists.
    """
    # global unit
    existed = os.path.exists(ctx.unit_dir + '/ceph.target')
    with open(ctx.unit_dir + '/ceph.target.new', 'w') as f:
        f.write('[Unit]\n'
                'Description=All Ceph clusters and services\n'
                '\n'
                '[Install]\n'
                'WantedBy=multi-user.target\n')
    os.rename(ctx.unit_dir + '/ceph.target.new',
              ctx.unit_dir + '/ceph.target')
    if not existed:
        # we disable before enable in case a different ceph.target
        # (from the traditional package) is present; while newer
        # systemd is smart enough to disable the old
        # (/lib/systemd/...) and enable the new (/etc/systemd/...),
        # some older versions of systemd error out with EEXIST.
        call_throws(ctx, ['systemctl', 'disable', 'ceph.target'])
        call_throws(ctx, ['systemctl', 'enable', 'ceph.target'])
        call_throws(ctx, ['systemctl', 'start', 'ceph.target'])

    # cluster unit
    existed = os.path.exists(ctx.unit_dir + '/ceph-%s.target' % fsid)
    with open(ctx.unit_dir + '/ceph-%s.target.new' % fsid, 'w') as f:
        f.write(
            '[Unit]\n'
            'Description=Ceph cluster {fsid}\n'
            'PartOf=ceph.target\n'
            'Before=ceph.target\n'
            '\n'
            '[Install]\n'
            'WantedBy=multi-user.target ceph.target\n'.format(
                fsid=fsid)
        )
    os.rename(ctx.unit_dir + '/ceph-%s.target.new' % fsid,
              ctx.unit_dir + '/ceph-%s.target' % fsid)
    if not existed:
        call_throws(ctx, ['systemctl', 'enable', 'ceph-%s.target' % fsid])
        call_throws(ctx, ['systemctl', 'start', 'ceph-%s.target' % fsid])

    # don't overwrite file in order to allow users to manipulate it
    if os.path.exists(ctx.logrotate_dir + f'/ceph-{fsid}'):
        return

    # logrotate for the cluster
    with open(ctx.logrotate_dir + '/ceph-%s' % fsid, 'w') as f:
        # This is a bit sloppy in that the killall/pkill will touch all ceph
        # daemons in all containers, but I don't see an elegant way to send
        # SIGHUP *just* to the daemons for this cluster.  (1) systemd kill -s
        # will get the signal to podman, but podman will exit.  (2) podman
        # kill will get the signal to the first child (bash), but that isn't
        # the ceph daemon.  This is simpler and should be harmless.
        # (This explanation was previously a stray triple-quoted string --
        # a no-op expression -- rather than a comment.)
        f.write("""# created by cephadm
/var/log/ceph/%s/*.log {
    rotate 7
    daily
    compress
    sharedscripts
    postrotate
        killall -q -1 ceph-mon ceph-mgr ceph-mds ceph-osd ceph-fuse radosgw rbd-mirror cephfs-mirror || pkill -1 -x 'ceph-mon|ceph-mgr|ceph-mds|ceph-osd|ceph-fuse|radosgw|rbd-mirror|cephfs-mirror' || true
    endscript
    missingok
    notifempty
    su root root
}
""" % fsid)
3846
3847
def get_unit_file(ctx, fsid):
    # type: (CephadmContext, str) -> str
    """Return the systemd unit file contents used for this cluster's daemons.

    The ``%i`` systemd specifier (daemon name) is left for systemd to fill
    in; podman gets extra forking/pidfile handling, docker gets an explicit
    dependency on docker.service.
    """
    engine_extras = ''
    if isinstance(ctx.container_engine, Podman):
        # podman detaches, so track the pid file and clean up any stale
        # pid/cid files before start and after stop
        engine_extras = ('ExecStartPre=-/bin/rm -f %t/%n-pid %t/%n-cid\n'
                         'ExecStopPost=-/bin/rm -f %t/%n-pid %t/%n-cid\n'
                         'Type=forking\n'
                         'PIDFile=%t/%n-pid\n')
        if ctx.container_engine.version >= CGROUPS_SPLIT_PODMAN_VERSION:
            engine_extras += 'Delegate=yes\n'

    uses_docker = isinstance(ctx.container_engine, Docker)

    template = """# generated by cephadm
[Unit]
Description=Ceph %i for {fsid}

# According to:
# http://www.freedesktop.org/wiki/Software/systemd/NetworkTarget
# these can be removed once ceph-mon will dynamically change network
# configuration.
After=network-online.target local-fs.target time-sync.target{docker_after}
Wants=network-online.target local-fs.target time-sync.target
{docker_requires}

PartOf=ceph-{fsid}.target
Before=ceph-{fsid}.target

[Service]
LimitNOFILE=1048576
LimitNPROC=1048576
EnvironmentFile=-/etc/environment
ExecStart=/bin/bash {data_dir}/{fsid}/%i/unit.run
ExecStop=-/bin/bash -c 'bash {data_dir}/{fsid}/%i/unit.stop'
ExecStopPost=-/bin/bash {data_dir}/{fsid}/%i/unit.poststop
KillMode=none
Restart=on-failure
RestartSec=10s
TimeoutStartSec=200
TimeoutStopSec=120
StartLimitInterval=30min
StartLimitBurst=5
{extra_args}
[Install]
WantedBy=ceph-{fsid}.target
"""
    return template.format(
        fsid=fsid,
        data_dir=ctx.data_dir,
        extra_args=engine_extras,
        # if docker, we depend on docker.service
        docker_after=' docker.service' if uses_docker else '',
        docker_requires='Requires=docker.service\n' if uses_docker else '')
3900
3901 ##################################
3902
3903
class CephContainer:
    """Assemble docker/podman command lines for a single container.

    Construction only records configuration; nothing talks to the
    container engine until ``run()`` (or a caller executes one of the
    ``*_cmd()`` argv builders).
    """

    def __init__(self,
                 ctx: CephadmContext,
                 image: str,
                 entrypoint: str,
                 args: Optional[List[str]] = None,
                 volume_mounts: Optional[Dict[str, str]] = None,
                 cname: str = '',
                 container_args: Optional[List[str]] = None,
                 envs: Optional[List[str]] = None,
                 privileged: bool = False,
                 ptrace: bool = False,
                 bind_mounts: Optional[List[List[str]]] = None,
                 init: Optional[bool] = None,
                 host_network: bool = True,
                 memory_request: Optional[str] = None,
                 memory_limit: Optional[str] = None,
                 ) -> None:
        self.ctx = ctx
        self.image = image
        self.entrypoint = entrypoint
        # the list/dict parameters previously defaulted to mutable [] / {}
        # (shared across calls); normalize None to fresh objects instead
        self.args = args if args is not None else []
        self.volume_mounts = volume_mounts if volume_mounts is not None else {}
        self._cname = cname
        self.container_args = container_args if container_args is not None else []
        self.envs = envs
        self.privileged = privileged
        self.ptrace = ptrace
        self.bind_mounts = bind_mounts if bind_mounts else []
        # NOTE: init=False falls back to ctx.container_init (truthiness
        # test); preserved as-is since callers may rely on it
        self.init = init if init else ctx.container_init
        self.host_network = host_network
        self.memory_request = memory_request
        self.memory_limit = memory_limit

    @classmethod
    def for_daemon(cls,
                   ctx: CephadmContext,
                   fsid: str,
                   daemon_type: str,
                   daemon_id: str,
                   entrypoint: str,
                   args: Optional[List[str]] = None,
                   volume_mounts: Optional[Dict[str, str]] = None,
                   container_args: Optional[List[str]] = None,
                   envs: Optional[List[str]] = None,
                   privileged: bool = False,
                   ptrace: bool = False,
                   bind_mounts: Optional[List[List[str]]] = None,
                   init: Optional[bool] = None,
                   host_network: bool = True,
                   memory_request: Optional[str] = None,
                   memory_limit: Optional[str] = None,
                   ) -> 'CephContainer':
        """Alternate constructor for a cluster daemon's container.

        Uses ctx.image and derives the canonical container name from
        fsid/daemon_type/daemon_id.
        """
        return cls(
            ctx,
            image=ctx.image,
            entrypoint=entrypoint,
            args=args,
            volume_mounts=volume_mounts,
            cname='ceph-%s-%s.%s' % (fsid, daemon_type, daemon_id),
            container_args=container_args,
            envs=envs,
            privileged=privileged,
            ptrace=ptrace,
            bind_mounts=bind_mounts,
            init=init,
            host_network=host_network,
            memory_request=memory_request,
            memory_limit=memory_limit,
        )

    @property
    def cname(self) -> str:
        """
        podman adds the current container name to the /etc/hosts
        file. Turns out, python's `socket.getfqdn()` differs from
        `hostname -f`, when we have the container names containing
        dots in it.:

        # podman run --name foo.bar.baz.com ceph/ceph /bin/bash
        [root@sebastians-laptop /]# cat /etc/hosts
        127.0.0.1   localhost
        ::1         localhost
        127.0.1.1   sebastians-laptop foo.bar.baz.com
        [root@sebastians-laptop /]# hostname -f
        sebastians-laptop
        [root@sebastians-laptop /]# python3 -c 'import socket; print(socket.getfqdn())'
        foo.bar.baz.com

        Fascinatingly, this doesn't happen when using dashes.
        """
        return self._cname.replace('.', '-')

    @cname.setter
    def cname(self, val: str) -> None:
        self._cname = val

    @property
    def old_cname(self) -> str:
        # the pre-sanitization (dotted) name; some existing containers
        # may still be running under it
        return self._cname

    def run_cmd(self) -> List[str]:
        """Return the full ``<engine> run ...`` argv for this container."""
        cmd_args: List[str] = [
            str(self.ctx.container_engine.path),
            'run',
            '--rm',
            '--ipc=host',
            # some containers (ahem, haproxy) override this, but we want a fast
            # shutdown always (and, more importantly, a successful exit even if we
            # fall back to SIGKILL).
            '--stop-signal=SIGTERM',
        ]

        if isinstance(self.ctx.container_engine, Podman):
            if os.path.exists('/etc/ceph/podman-auth.json'):
                cmd_args.append('--authfile=/etc/ceph/podman-auth.json')

        envs: List[str] = [
            '-e', 'CONTAINER_IMAGE=%s' % self.image,
            '-e', 'NODE_NAME=%s' % get_hostname(),
        ]
        vols: List[str] = []
        binds: List[str] = []

        if self.memory_request:
            # pass as KEY=VALUE: a bare value after '-e KEY' is parsed by the
            # engine as a positional argument, not as the variable's value
            # (all other env vars in this class use the KEY=VALUE form)
            cmd_args.extend(['-e', 'POD_MEMORY_REQUEST=%s' % str(self.memory_request)])
        if self.memory_limit:
            cmd_args.extend(['-e', 'POD_MEMORY_LIMIT=%s' % str(self.memory_limit)])
            cmd_args.extend(['--memory', str(self.memory_limit)])

        if self.host_network:
            cmd_args.append('--net=host')
        if self.entrypoint:
            cmd_args.extend(['--entrypoint', self.entrypoint])
        if self.privileged:
            cmd_args.extend([
                '--privileged',
                # let OSD etc read block devs that haven't been chowned
                '--group-add=disk'])
        if self.ptrace and not self.privileged:
            # if privileged, the SYS_PTRACE cap is already added
            # in addition, --cap-add and --privileged are mutually
            # exclusive since podman >= 2.0
            cmd_args.append('--cap-add=SYS_PTRACE')
        if self.init:
            cmd_args.append('--init')
            envs += ['-e', 'CEPH_USE_RANDOM_NONCE=1']
        if self.cname:
            cmd_args.extend(['--name', self.cname])
        if self.envs:
            for env in self.envs:
                envs.extend(['-e', env])

        vols = sum(
            [['-v', '%s:%s' % (host_dir, container_dir)]
             for host_dir, container_dir in self.volume_mounts.items()], [])
        binds = sum([['--mount', '{}'.format(','.join(bind))]
                     for bind in self.bind_mounts], [])

        return \
            cmd_args + self.container_args + \
            envs + vols + binds + \
            [self.image] + self.args  # type: ignore

    def shell_cmd(self, cmd: List[str]) -> List[str]:
        """Return an argv that runs ``cmd`` inside a fresh container (for
        interactive/shell use); ``cmd[0]`` becomes the entrypoint."""
        cmd_args: List[str] = [
            str(self.ctx.container_engine.path),
            'run',
            '--rm',
            '--ipc=host',
        ]
        envs: List[str] = [
            '-e', 'CONTAINER_IMAGE=%s' % self.image,
            '-e', 'NODE_NAME=%s' % get_hostname(),
        ]
        vols: List[str] = []
        binds: List[str] = []

        if self.host_network:
            cmd_args.append('--net=host')
        if self.ctx.no_hosts:
            cmd_args.append('--no-hosts')
        if self.privileged:
            cmd_args.extend([
                '--privileged',
                # let OSD etc read block devs that haven't been chowned
                '--group-add=disk',
            ])
        if self.init:
            cmd_args.append('--init')
            envs += ['-e', 'CEPH_USE_RANDOM_NONCE=1']
        if self.envs:
            for env in self.envs:
                envs.extend(['-e', env])

        vols = sum(
            [['-v', '%s:%s' % (host_dir, container_dir)]
             for host_dir, container_dir in self.volume_mounts.items()], [])
        binds = sum([['--mount', '{}'.format(','.join(bind))]
                     for bind in self.bind_mounts], [])

        return cmd_args + self.container_args + envs + vols + binds + [
            '--entrypoint', cmd[0],
            self.image,
        ] + cmd[1:]

    def exec_cmd(self, cmd):
        # type: (List[str]) -> List[str]
        """Return an argv that execs ``cmd`` inside the already-running
        container.

        :raises Error: if no matching container is currently running.
        """
        cname = get_running_container_name(self.ctx, self)
        if not cname:
            raise Error('unable to find container "{}"'.format(self.cname))
        return [
            str(self.ctx.container_engine.path),
            'exec',
        ] + self.container_args + [
            self.cname,
        ] + cmd

    def rm_cmd(self, old_cname: bool = False, storage: bool = False) -> List[str]:
        """Return an argv that force-removes this container (optionally by
        its legacy dotted name, optionally including engine storage)."""
        ret = [
            str(self.ctx.container_engine.path),
            'rm', '-f',
        ]
        if storage:
            ret.append('--storage')
        if old_cname:
            ret.append(self.old_cname)
        else:
            ret.append(self.cname)
        return ret

    def stop_cmd(self, old_cname: bool = False) -> List[str]:
        """Return an argv that stops this container."""
        ret = [
            str(self.ctx.container_engine.path),
            'stop', self.old_cname if old_cname else self.cname,
        ]
        return ret

    def run(self, timeout=DEFAULT_TIMEOUT, verbosity=CallVerbosity.VERBOSE_ON_FAILURE):
        # type: (Optional[int], CallVerbosity) -> str
        """Run the container to completion and return its stdout.

        :raises Error: (via call_throws) on non-zero exit.
        """
        out, _, _ = call_throws(self.ctx, self.run_cmd(),
                                desc=self.entrypoint, timeout=timeout, verbosity=verbosity)
        return out
4147
4148
4149 #####################################
4150
class MgrListener(Thread):
    """Thread that accepts mutual-TLS connections from the cephadm mgr
    module and applies length-prefixed JSON payloads (acks, new config
    files) to the agent.
    """

    def __init__(self, agent: 'CephadmAgent') -> None:
        self.agent = agent
        # set by shutdown(); checked at the top of each accept loop
        self.stop = False
        super(MgrListener, self).__init__(target=self.run)

    def run(self) -> None:
        """Accept connections and dispatch payloads until shutdown()."""
        listenSocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        listenSocket.bind(('0.0.0.0', int(self.agent.listener_port)))
        # time out accept() periodically so self.stop is re-checked
        listenSocket.settimeout(60)
        listenSocket.listen(1)
        # require the peer (mgr) to present a cert signed by our root cert
        ssl_ctx = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
        ssl_ctx.verify_mode = ssl.CERT_REQUIRED
        ssl_ctx.load_cert_chain(self.agent.listener_cert_path, self.agent.listener_key_path)
        ssl_ctx.load_verify_locations(self.agent.ca_path)
        secureListenSocket = ssl_ctx.wrap_socket(listenSocket, server_side=True)
        while not self.stop:
            try:
                try:
                    conn, _ = secureListenSocket.accept()
                except socket.timeout:
                    continue
                try:
                    # messages are prefixed with an up-to-10-byte ascii length
                    length: int = int(conn.recv(10).decode())
                except Exception as e:
                    err_str = f'Failed to extract length of payload from message: {e}'
                    conn.send(err_str.encode())
                    logger.error(err_str)
                    # without a valid length we cannot read the payload;
                    # previously this fell through and crashed on an unbound
                    # (or stale) `length`
                    continue
                while True:
                    payload = conn.recv(length).decode()
                    if not payload:
                        break
                    try:
                        data: Dict[Any, Any] = json.loads(payload)
                        self.handle_json_payload(data)
                    except Exception as e:
                        err_str = f'Failed to extract json payload from message: {e}'
                        conn.send(err_str.encode())
                        logger.error(err_str)
                    else:
                        conn.send(b'ACK')
                        if 'config' in data:
                            self.agent.wakeup()
                        self.agent.ls_gatherer.wakeup()
                        self.agent.volume_gatherer.wakeup()
                        logger.debug(f'Got mgr message {data}')
            except Exception as e:
                logger.error(f'Mgr Listener encountered exception: {e}')

    def shutdown(self) -> None:
        """Signal the listener loop to exit after its current iteration."""
        self.stop = True

    def handle_json_payload(self, data: Dict[Any, Any]) -> None:
        """Record the mgr's counter and, if present, install new config
        files, then re-read settings and wake the agent."""
        self.agent.ack = int(data['counter'])
        if 'config' in data:
            logger.info('Received new config from mgr')
            config = data['config']
            for filename in config:
                if filename in self.agent.required_files:
                    file_path = os.path.join(self.agent.daemon_dir, filename)
                    # write with 0600 perms, then rename for atomic replacement
                    with open(os.open(file_path + '.new', os.O_CREAT | os.O_WRONLY, 0o600), 'w') as f:
                        f.write(config[filename])
                    os.rename(file_path + '.new', file_path)
            self.agent.pull_conf_settings()
            self.agent.wakeup()
4216
4217
class CephadmAgent():
    """Per-host agent daemon.

    Periodically gathers host metadata (daemon list, ceph-volume
    inventory, networks, host facts) and POSTs it to the cephadm mgr
    module over HTTPS, while a MgrListener thread receives config/acks
    pushed back from the mgr.
    """

    daemon_type = 'agent'
    default_port = 8498
    loop_interval = 30  # seconds between metadata reports (overridden from config)
    stop = False

    # config files the mgr must supply for the agent to operate
    required_files = [
        'agent.json',
        'keyring',
        'root_cert.pem',
        'listener.crt',
        'listener.key',
    ]

    def __init__(self, ctx: CephadmContext, fsid: str, daemon_id: Union[int, str] = ''):
        self.ctx = ctx
        self.fsid = fsid
        self.daemon_id = daemon_id
        # first candidate port for the MgrListener; run() probes upward from here
        self.starting_port = 14873
        self.target_ip = ''
        self.target_port = ''
        self.host = ''
        self.daemon_dir = os.path.join(ctx.data_dir, self.fsid, f'{self.daemon_type}.{self.daemon_id}')
        self.config_path = os.path.join(self.daemon_dir, 'agent.json')
        self.keyring_path = os.path.join(self.daemon_dir, 'keyring')
        self.ca_path = os.path.join(self.daemon_dir, 'root_cert.pem')
        self.listener_cert_path = os.path.join(self.daemon_dir, 'listener.crt')
        self.listener_key_path = os.path.join(self.daemon_dir, 'listener.key')
        self.listener_port = ''
        # last counter value received from the mgr (see MgrListener)
        self.ack = 1
        # used to interrupt the sleep between reporting iterations
        self.event = Event()
        self.mgr_listener = MgrListener(self)
        self.ls_gatherer = AgentGatherer(self, lambda: self._get_ls(), 'Ls')
        self.volume_gatherer = AgentGatherer(self, lambda: self._ceph_volume(enhanced=False), 'Volume')
        self.device_enhanced_scan = False
        # ring buffer of the last 3 loop durations, used to adjust the sleep
        self.recent_iteration_run_times: List[float] = [0.0, 0.0, 0.0]
        self.recent_iteration_index: int = 0
        self.cached_ls_values: Dict[str, Dict[str, str]] = {}

    def validate(self, config: Dict[str, str] = {}) -> None:
        """Raise Error unless every required file is present in config.

        NOTE(review): mutable default argument; harmless here since the
        dict is only read, but worth cleaning up.
        """
        # check for the required files
        for fname in self.required_files:
            if fname not in config:
                raise Error('required file missing from config: %s' % fname)

    def deploy_daemon_unit(self, config: Dict[str, str] = {}) -> None:
        """Write the agent's config files, unit.run/unit.meta and systemd
        unit (all atomically, 0600), then (re)start it via systemctl."""
        if not config:
            raise Error('Agent needs a config')
        assert isinstance(config, dict)
        self.validate(config)

        # Create the required config files in the daemons dir, with restricted permissions
        for filename in config:
            if filename in self.required_files:
                file_path = os.path.join(self.daemon_dir, filename)
                with open(os.open(file_path + '.new', os.O_CREAT | os.O_WRONLY, 0o600), 'w') as f:
                    f.write(config[filename])
                    os.rename(file_path + '.new', file_path)

        unit_run_path = os.path.join(self.daemon_dir, 'unit.run')
        with open(os.open(unit_run_path + '.new', os.O_CREAT | os.O_WRONLY, 0o600), 'w') as f:
            f.write(self.unit_run())
            os.rename(unit_run_path + '.new', unit_run_path)

        meta: Dict[str, Any] = {}
        meta_file_path = os.path.join(self.daemon_dir, 'unit.meta')
        if 'meta_json' in self.ctx and self.ctx.meta_json:
            meta = json.loads(self.ctx.meta_json) or {}
        with open(os.open(meta_file_path + '.new', os.O_CREAT | os.O_WRONLY, 0o600), 'w') as f:
            f.write(json.dumps(meta, indent=4) + '\n')
            os.rename(meta_file_path + '.new', meta_file_path)

        unit_file_path = os.path.join(self.ctx.unit_dir, self.unit_name())
        with open(os.open(unit_file_path + '.new', os.O_CREAT | os.O_WRONLY, 0o600), 'w') as f:
            f.write(self.unit_file())
            os.rename(unit_file_path + '.new', unit_file_path)

        call_throws(self.ctx, ['systemctl', 'daemon-reload'])
        # best-effort stop/reset before (re-)enabling; failures are tolerated
        call(self.ctx, ['systemctl', 'stop', self.unit_name()],
             verbosity=CallVerbosity.DEBUG)
        call(self.ctx, ['systemctl', 'reset-failed', self.unit_name()],
             verbosity=CallVerbosity.DEBUG)
        call_throws(self.ctx, ['systemctl', 'enable', '--now', self.unit_name()])

    def unit_name(self) -> str:
        """Return the systemd unit name for this agent daemon."""
        return '{}.service'.format(get_unit_name(self.fsid, self.daemon_type, self.daemon_id))

    def unit_run(self) -> str:
        """Return the shell snippet that re-execs this cephadm binary in
        agent mode (run by the systemd unit)."""
        py3 = shutil.which('python3')
        binary_path = os.path.realpath(sys.argv[0])
        return ('set -e\n' + f'{py3} {binary_path} agent --fsid {self.fsid} --daemon-id {self.daemon_id} &\n')

    def unit_file(self) -> str:
        """Return the systemd unit file contents for the agent."""
        return """#generated by cephadm
[Unit]
Description=cephadm agent for cluster {fsid}

PartOf=ceph-{fsid}.target
Before=ceph-{fsid}.target

[Service]
Type=forking
ExecStart=/bin/bash {data_dir}/unit.run
Restart=on-failure
RestartSec=10s

[Install]
WantedBy=ceph-{fsid}.target
""".format(
            fsid=self.fsid,
            data_dir=self.daemon_dir
        )

    def shutdown(self) -> None:
        """Stop the main loop and the mgr listener thread."""
        self.stop = True
        if self.mgr_listener.is_alive():
            self.mgr_listener.shutdown()

    def wakeup(self) -> None:
        """Interrupt the sleep between reporting iterations."""
        self.event.set()

    def pull_conf_settings(self) -> None:
        """(Re)load agent.json and the keyring from disk.

        :raises Error: (after shutting down) if either file is missing
            or malformed.
        """
        try:
            with open(self.config_path, 'r') as f:
                config = json.load(f)
                self.target_ip = config['target_ip']
                self.target_port = config['target_port']
                self.loop_interval = int(config['refresh_period'])
                self.starting_port = int(config['listener_port'])
                self.host = config['host']
                # string flag ('true'/'false'), not a bool, per the check below
                use_lsm = config['device_enhanced_scan']
        except Exception as e:
            self.shutdown()
            raise Error(f'Failed to get agent target ip and port from config: {e}')

        try:
            with open(self.keyring_path, 'r') as f:
                self.keyring = f.read()
        except Exception as e:
            self.shutdown()
            raise Error(f'Failed to get agent keyring: {e}')

        assert self.target_ip and self.target_port

        self.device_enhanced_scan = False
        if use_lsm.lower() == 'true':
            self.device_enhanced_scan = True
        self.volume_gatherer.update_func(lambda: self._ceph_volume(enhanced=self.device_enhanced_scan))

    def run(self) -> None:
        """Main loop: pick a listener port, start helper threads, then
        repeatedly POST host metadata to the mgr every loop_interval
        seconds (minus the recent average iteration cost)."""
        self.pull_conf_settings()

        try:
            # probe upward from starting_port for a free port for MgrListener
            for _ in range(1001):
                if not port_in_use(self.ctx, self.starting_port):
                    self.listener_port = str(self.starting_port)
                    break
                self.starting_port += 1
            if not self.listener_port:
                raise Error(f'All 1000 ports starting at {str(self.starting_port - 1001)} taken.')
        except Exception as e:
            raise Error(f'Failed to pick port for agent to listen on: {e}')

        if not self.mgr_listener.is_alive():
            self.mgr_listener.start()

        if not self.ls_gatherer.is_alive():
            self.ls_gatherer.start()

        if not self.volume_gatherer.is_alive():
            self.volume_gatherer.start()

        # verify the mgr endpoint against the cluster root cert
        ssl_ctx = ssl.create_default_context()
        ssl_ctx.check_hostname = True
        ssl_ctx.verify_mode = ssl.CERT_REQUIRED
        ssl_ctx.load_verify_locations(self.ca_path)

        while not self.stop:
            start_time = time.monotonic()
            ack = self.ack

            # part of the networks info is returned as a set which is not JSON
            # serializable. The set must be converted to a list
            networks = list_networks(self.ctx)
            networks_list = {}
            for key in networks.keys():
                for k, v in networks[key].items():
                    networks_list[key] = {k: list(v)}

            # gatherer data is only included when its ack matches ours,
            # i.e. it reflects the mgr's current counter
            data = json.dumps({'host': self.host,
                               'ls': (self.ls_gatherer.data if self.ack == self.ls_gatherer.ack
                                      and self.ls_gatherer.data is not None else []),
                               'networks': networks_list,
                               'facts': HostFacts(self.ctx).dump(),
                               'volume': (self.volume_gatherer.data if self.ack == self.volume_gatherer.ack
                                          and self.volume_gatherer.data is not None else ''),
                               'ack': str(ack),
                               'keyring': self.keyring,
                               'port': self.listener_port})
            data = data.encode('ascii')

            url = f'https://{self.target_ip}:{self.target_port}/data'
            try:
                req = Request(url, data, {'Content-Type': 'application/json'})
                send_time = time.monotonic()
                with urlopen(req, context=ssl_ctx) as response:
                    response_str = response.read()
                    response_json = json.loads(response_str)
                    total_request_time = datetime.timedelta(seconds=(time.monotonic() - send_time)).total_seconds()
                    logger.info(f'Received mgr response: "{response_json["result"]}" {total_request_time} seconds after sending request.')
            except Exception as e:
                logger.error(f'Failed to send metadata to mgr: {e}')

            end_time = time.monotonic()
            run_time = datetime.timedelta(seconds=(end_time - start_time))
            self.recent_iteration_run_times[self.recent_iteration_index] = run_time.total_seconds()
            self.recent_iteration_index = (self.recent_iteration_index + 1) % 3
            run_time_average = sum(self.recent_iteration_run_times, 0.0) / len([t for t in self.recent_iteration_run_times if t])

            self.event.wait(max(self.loop_interval - int(run_time_average), 0))
            self.event.clear()

    def _ceph_volume(self, enhanced: bool = False) -> Tuple[str, bool]:
        """Run `ceph-volume inventory` in-process and return its stdout.

        :returns: (json output, False); the bool is the gatherer
            "changed" flag, always False here.
        :raises Exception: if ceph-volume produced no output.
        """
        self.ctx.command = 'inventory --format=json'.split()
        if enhanced:
            self.ctx.command.append('--with-lsm')
        self.ctx.fsid = self.fsid

        stream = io.StringIO()
        with redirect_stdout(stream):
            command_ceph_volume(self.ctx)

        stdout = stream.getvalue()

        if stdout:
            return (stdout, False)
        else:
            raise Exception('ceph-volume returned empty value')

    def _daemon_ls_subset(self) -> Dict[str, Dict[str, Any]]:
        # gets a subset of ls info quickly. The results of this will tell us if our
        # cached info is still good or if we need to run the full ls again.
        # for legacy containers, we just grab the full info. For cephadmv1 containers,
        # we only grab enabled, state, mem_usage and container id. If container id has
        # not changed for any daemon, we assume our cached info is good.
        daemons: Dict[str, Dict[str, Any]] = {}
        data_dir = self.ctx.data_dir
        seen_memusage = {}  # type: Dict[str, int]
        out, err, code = call(
            self.ctx,
            [self.ctx.container_engine.path, 'stats', '--format', '{{.ID}},{{.MemUsage}}', '--no-stream'],
            verbosity=CallVerbosity.DEBUG
        )
        seen_memusage_cid_len, seen_memusage = _parse_mem_usage(code, out)
        # we need a mapping from container names to ids. Later we will convert daemon
        # names to container names to get daemons container id to see if it has changed
        out, err, code = call(
            self.ctx,
            [self.ctx.container_engine.path, 'ps', '--format', '{{.ID}},{{.Names}}', '--no-trunc'],
            verbosity=CallVerbosity.DEBUG
        )
        name_id_mapping: Dict[str, str] = self._parse_container_id_name(code, out)
        for i in os.listdir(data_dir):
            if i in ['mon', 'osd', 'mds', 'mgr']:
                # pre-cephadm ("legacy") daemon layout: <type>/<cluster>-<id>
                daemon_type = i
                for j in os.listdir(os.path.join(data_dir, i)):
                    if '-' not in j:
                        continue
                    (cluster, daemon_id) = j.split('-', 1)
                    legacy_unit_name = 'ceph-%s@%s' % (daemon_type, daemon_id)
                    (enabled, state, _) = check_unit(self.ctx, legacy_unit_name)
                    daemons[f'{daemon_type}.{daemon_id}'] = {
                        'style': 'legacy',
                        'name': '%s.%s' % (daemon_type, daemon_id),
                        'fsid': self.ctx.fsid if self.ctx.fsid is not None else 'unknown',
                        'systemd_unit': legacy_unit_name,
                        'enabled': 'true' if enabled else 'false',
                        'state': state,
                    }
            elif is_fsid(i):
                # cephadm v1 layout: <fsid>/<type>.<id>
                fsid = str(i)  # convince mypy that fsid is a str here
                for j in os.listdir(os.path.join(data_dir, i)):
                    if '.' in j and os.path.isdir(os.path.join(data_dir, fsid, j)):
                        (daemon_type, daemon_id) = j.split('.', 1)
                        unit_name = get_unit_name(fsid, daemon_type, daemon_id)
                        (enabled, state, _) = check_unit(self.ctx, unit_name)
                        daemons[j] = {
                            'style': 'cephadm:v1',
                            'systemd_unit': unit_name,
                            'enabled': 'true' if enabled else 'false',
                            'state': state,
                        }
                        # try both the sanitized and legacy (dotted) names
                        c = CephContainer.for_daemon(self.ctx, self.ctx.fsid, daemon_type, daemon_id, 'bash')
                        container_id: Optional[str] = None
                        for name in (c.cname, c.old_cname):
                            if name in name_id_mapping:
                                container_id = name_id_mapping[name]
                                break
                        daemons[j]['container_id'] = container_id
                        if container_id:
                            daemons[j]['memory_usage'] = seen_memusage.get(container_id[0:seen_memusage_cid_len])
        return daemons

    def _parse_container_id_name(self, code: int, out: str) -> Dict[str, str]:
        # map container names to ids from ps output
        name_id_mapping = {}  # type: Dict[str, str]
        if not code:
            for line in out.splitlines():
                id, name = line.split(',')
                name_id_mapping[name] = id
        return name_id_mapping

    def _get_ls(self) -> Tuple[List[Dict[str, str]], bool]:
        """Return (daemon ls info, changed) -- a full `list_daemons` run
        only when the cached values look stale, else the updated cache."""
        if not self.cached_ls_values:
            logger.info('No cached ls output. Running full daemon ls')
            ls = list_daemons(self.ctx)
            for d in ls:
                self.cached_ls_values[d['name']] = d
            return (ls, True)
        else:
            ls_subset = self._daemon_ls_subset()
            need_full_ls = False
            state_change = False
            if set(self.cached_ls_values.keys()) != set(ls_subset.keys()):
                # case for a new daemon in ls or an old daemon no longer appearing.
                # If that happens we need a full ls
                logger.info('Change detected in state of daemons. Running full daemon ls')
                ls = list_daemons(self.ctx)
                for d in ls:
                    self.cached_ls_values[d['name']] = d
                return (ls, True)
            for daemon, info in self.cached_ls_values.items():
                if info['style'] == 'legacy':
                    # for legacy containers, ls_subset just grabs all the info
                    self.cached_ls_values[daemon] = ls_subset[daemon]
                else:
                    if info['container_id'] != ls_subset[daemon]['container_id']:
                        # case for container id having changed. We need full ls as
                        # info we didn't grab like version and start time could have changed
                        need_full_ls = True
                        break

                    # want to know if a daemons state change because in those cases we want
                    # to report back quicker
                    if (
                        self.cached_ls_values[daemon]['enabled'] != ls_subset[daemon]['enabled']
                        or self.cached_ls_values[daemon]['state'] != ls_subset[daemon]['state']
                    ):
                        state_change = True
                    # if we reach here, container id matched. Update the few values we do track
                    # from ls subset: state, enabled, memory_usage.
                    self.cached_ls_values[daemon]['enabled'] = ls_subset[daemon]['enabled']
                    self.cached_ls_values[daemon]['state'] = ls_subset[daemon]['state']
                    if 'memory_usage' in ls_subset[daemon]:
                        self.cached_ls_values[daemon]['memory_usage'] = ls_subset[daemon]['memory_usage']
            if need_full_ls:
                logger.info('Change detected in state of daemons. Running full daemon ls')
                ls = list_daemons(self.ctx)
                for d in ls:
                    self.cached_ls_values[d['name']] = d
                return (ls, True)
            else:
                ls = [info for daemon, info in self.cached_ls_values.items()]
                return (ls, state_change)
4583
4584
class AgentGatherer(Thread):
    """Background thread that periodically runs a data-gathering function
    for the agent and wakes the agent when the data changed or the mgr
    ack counter moved.
    """

    def __init__(self, agent: 'CephadmAgent', func: Callable, gatherer_type: str = 'Unnamed', initial_ack: int = 0) -> None:
        self.agent = agent
        # callable returning (data, changed: bool)
        self.func = func
        # only used to label log messages
        self.gatherer_type = gatherer_type
        self.ack = initial_ack
        self.event = Event()
        self.data: Any = None
        self.stop = False
        # ring buffer of the last 3 iteration durations; used to shorten
        # the sleep so iterations stay roughly loop_interval apart
        self.recent_iteration_run_times: List[float] = [0.0, 0.0, 0.0]
        self.recent_iteration_index: int = 0
        super(AgentGatherer, self).__init__(target=self.run)

    def run(self) -> None:
        """Gather data roughly every agent.loop_interval seconds until
        shutdown()."""
        while not self.stop:
            try:
                start_time = time.monotonic()

                ack = self.agent.ack
                change = False
                try:
                    self.data, change = self.func()
                except Exception as e:
                    logger.error(f'{self.gatherer_type} Gatherer encountered exception gathering data: {e}')
                    self.data = None
                if ack != self.ack or change:
                    self.ack = ack
                    self.agent.wakeup()

                end_time = time.monotonic()
                run_time = datetime.timedelta(seconds=(end_time - start_time))
                self.recent_iteration_run_times[self.recent_iteration_index] = run_time.total_seconds()
                self.recent_iteration_index = (self.recent_iteration_index + 1) % 3
                # average only over the non-zero samples; guard against
                # ZeroDivisionError when every recorded duration is 0.0
                # (previously that exception skipped the wait entirely and
                # put the loop into a busy spin)
                nonzero_times = [t for t in self.recent_iteration_run_times if t]
                if nonzero_times:
                    run_time_average = sum(nonzero_times, 0.0) / len(nonzero_times)
                else:
                    run_time_average = 0.0

                self.event.wait(max(self.agent.loop_interval - int(run_time_average), 0))
                self.event.clear()
            except Exception as e:
                logger.error(f'{self.gatherer_type} Gatherer encountered exception: {e}')

    def shutdown(self) -> None:
        """Signal the gathering loop to exit."""
        self.stop = True

    def wakeup(self) -> None:
        """Interrupt the sleep between gathering iterations."""
        self.event.set()

    def update_func(self, func: Callable) -> None:
        """Swap in a new gathering function (picked up next iteration)."""
        self.func = func
4633
4634
def command_agent(ctx: CephadmContext) -> None:
    """Entry point for `cephadm agent`: run the agent main loop.

    :raises Error: if the agent's daemon directory was never created
        (i.e. the agent was never deployed on this host).
    """
    agent = CephadmAgent(ctx, ctx.fsid, ctx.daemon_id)
    if os.path.isdir(agent.daemon_dir):
        agent.run()
    else:
        raise Error(f'Agent daemon directory {agent.daemon_dir} does not exist. Perhaps agent was never deployed?')
4642
4643
4644 ##################################
4645
4646
@infer_image
def command_version(ctx):
    # type: (CephadmContext) -> int
    """Print the ceph version reported by the container image."""
    container = CephContainer(ctx, ctx.image, 'ceph', ['--version'])
    out, _, ret = call(ctx, container.run_cmd(), desc=container.entrypoint)
    if ret == 0:
        print(out.strip())
    return ret
4655
4656 ##################################
4657
4658
@default_image
def command_pull(ctx):
    # type: (CephadmContext) -> int
    """Pull ctx.image from its registry, then print its inspect info."""
    try:
        _pull_image(ctx, ctx.image, ctx.insecure)
    except UnauthorizedRegistryError:
        # surface a friendlier hint than the raw engine error
        err_str = 'Failed to pull container image. Check that host(s) are logged into the registry'
        logger.debug(f'Pulling image for `command_pull` failed: {err_str}')
        raise Error(err_str)
    return command_inspect_image(ctx)
4670
4671
def _pull_image(ctx, image, insecure=False):
    # type: (CephadmContext, str, bool) -> None
    """Pull a container image, retrying known-transient failures.

    :raises UnauthorizedRegistryError: on registry auth failures.
    :raises Error: on any other failure, or once retries are exhausted.
    """
    logger.info('Pulling container image %s...' % image)

    # errors matching these patterns are retried rather than fatal
    transient_errors = [
        'error creating read-write layer with ID',
        'net/http: TLS handshake timeout',
        'Digest did not match, expected',
    ]

    cmd = [ctx.container_engine.path, 'pull', image]
    if isinstance(ctx.container_engine, Podman):
        if insecure:
            cmd.append('--tls-verify=false')

        if os.path.exists('/etc/ceph/podman-auth.json'):
            cmd.append('--authfile=/etc/ceph/podman-auth.json')
    cmd_str = ' '.join(cmd)

    # retry with increasing backoff on transient failures
    for sleep_secs in (1, 4, 25):
        out, err, ret = call(ctx, cmd, verbosity=CallVerbosity.QUIET_UNLESS_ERROR)
        if ret == 0:
            return

        if 'unauthorized' in err:
            raise UnauthorizedRegistryError()

        if all(pattern not in err for pattern in transient_errors):
            raise Error('Failed command: %s' % cmd_str)

        logger.info('`%s` failed transiently. Retrying. waiting %s seconds...' % (cmd_str, sleep_secs))
        time.sleep(sleep_secs)

    raise Error('Failed command: %s: maximum retries reached' % cmd_str)
4706
4707 ##################################
4708
4709
@infer_image
def command_inspect_image(ctx):
    # type: (CephadmContext) -> int
    """Print the image id, repo digests and ceph version of ctx.image as JSON.

    Returns 0 on success, errno.ENOENT when the image cannot be inspected.
    """
    inspect_cmd = [
        ctx.container_engine.path, 'inspect',
        '--format', '{{.ID}},{{.RepoDigests}}',
        ctx.image,
    ]
    out, _err, ret = call_throws(ctx, inspect_cmd)
    if ret != 0:
        return errno.ENOENT

    image_info = get_image_info_from_inspect(out.strip(), ctx.image)
    # add the ceph version as reported by the image itself
    image_info['ceph_version'] = CephContainer(ctx, ctx.image, 'ceph', ['--version']).run().strip()

    print(json.dumps(image_info, indent=4, sort_keys=True))
    return 0
4726
4727
def normalize_image_digest(digest: str) -> str:
    """Qualify known Ceph short image names with the default registry.

    Anything already qualified with a registry (or not a known Ceph short
    name) is returned unchanged.

    Normal case:
    >>> normalize_image_digest('ceph/ceph')
    'docker.io/ceph/ceph'

    No change:
    >>> normalize_image_digest('quay.ceph.io/ceph/ceph')
    'quay.ceph.io/ceph/ceph'

    >>> normalize_image_digest('docker.io/ubuntu')
    'docker.io/ubuntu'

    >>> normalize_image_digest('localhost/ceph')
    'localhost/ceph'
    """
    # NOTE: the doctests previously passed a second 'default_registry'
    # argument that this one-arg signature no longer accepts, so running
    # them raised TypeError; they now match the actual signature.
    known_shortnames = [
        'ceph/ceph',
        'ceph/daemon',
        'ceph/daemon-base',
    ]
    for image in known_shortnames:
        if digest.startswith(image):
            return f'{DEFAULT_REGISTRY}/{digest}'
    return digest
4753
4754
def get_image_info_from_inspect(out, image):
    # type: (str, str) -> Dict[str, Union[str, List[str]]]
    """Parse '{{.ID}},{{.RepoDigests}}' inspect output into an info dict.

    :param out: raw inspect output, e.g. 'sha256:abc,[repo@sha256:... ...]'
    :param image: image name, used only for the error message
    :returns: dict with 'image_id' and, when digests are present, a
        'repo_digests' list of normalized digests
    :raises Error: when the inspect output is empty
    """
    # Check for emptiness BEFORE unpacking: ''.split(',', 1) yields a single
    # element, so the unpack below would raise ValueError instead of the
    # intended Error for an empty result.
    if not out:
        raise Error('inspect {}: empty result'.format(image))
    image_id, digests = out.split(',', 1)
    r = {
        'image_id': normalize_container_id(image_id)
    }  # type: Dict[str, Union[str, List[str]]]
    if digests:
        # digests looks like '[d1 d2 ...]'; strip the brackets and split
        r['repo_digests'] = list(map(normalize_image_digest, digests[1: -1].split(' ')))
    return r
4766
4767 ##################################
4768
4769
def check_subnet(subnets: str) -> Tuple[int, List[int], str]:
    """Determine whether the given string is a valid subnet

    :param subnets: subnet string, a single definition or comma separated list of CIDR subnets
    :returns: return code, IP version list of the subnets and msg describing any errors validation errors
    """
    rc = 0
    versions: Set[int] = set()
    errors: List[str] = []
    for candidate in (s.strip() for s in subnets.split(',')):
        # require the explicit address/netmask form before trying to parse
        if re.search(r'\/\d+$', candidate) is None:
            rc = 1
            errors.append(f'{candidate} is not in CIDR format (address/netmask)')
            continue
        try:
            versions.add(ipaddress.ip_network(candidate).version)
        except ValueError as e:
            rc = 1
            errors.append(f'{candidate} invalid: {str(e)}')

    return rc, list(versions), ', '.join(errors)
4796
4797
def unwrap_ipv6(address):
    # type: (str) -> str
    """Strip the surrounding square brackets from a bracketed IPv6 literal."""
    is_bracketed = address.startswith('[') and address.endswith(']')
    return address[1: -1] if is_bracketed else address
4803
4804
def wrap_ipv6(address):
    # type: (str) -> str
    """Wrap a bare IPv6 literal in square brackets; leave anything else alone.

    We cannot assume it's already wrapped or even an IPv6 address: if it's
    already wrapped (or is a hostname) ip_address() will not parse it and we
    fall through to returning the input unchanged.
    """
    try:
        needs_wrap = ipaddress.ip_address(address).version == 6
    except ValueError:
        needs_wrap = False

    return f'[{address}]' if needs_wrap else address
4818
4819
def is_ipv6(address):
    # type: (str) -> bool
    """Return True if *address* (optionally bracket-wrapped) parses as IPv6."""
    # inline unwrap: drop surrounding brackets before parsing
    if address.startswith('[') and address.endswith(']'):
        address = address[1: -1]
    try:
        return ipaddress.ip_address(address).version == 6
    except ValueError:
        logger.warning('Address: {} is not a valid IP address'.format(address))
        return False
4828
4829
def ip_in_subnets(ip_addr: str, subnets: str) -> bool:
    """Return True when ip_addr falls inside any subnet of the comma separated list."""
    for subnet in (s.strip() for s in subnets.split(',')):
        candidate = unwrap_ipv6(ip_addr) if is_ipv6(ip_addr) else ip_addr
        if ipaddress.ip_address(candidate) in ipaddress.ip_network(subnet):
            return True
    return False
4838
4839
def parse_mon_addrv(addrv_arg: str) -> List[EndPoint]:
    """Parse mon-addrv param into a list of mon end points.

    :param addrv_arg: bracketed, comma separated ip:port entries, each
        optionally prefixed with v1:/v2:
    :raises Error: when the value is not bracketed or an entry lacks a port
    """
    r = re.compile(r':(\d+)$')
    addrv_args = []
    addr_arg = addrv_arg
    if addr_arg[0] != '[' or addr_arg[-1] != ']':
        # fixed typo in user-facing message: 'backets' -> 'brackets'
        raise Error(f'--mon-addrv value {addr_arg} must use square brackets')

    for addr in addr_arg[1: -1].split(','):
        hasport = r.findall(addr)
        if not hasport:
            raise Error(f'--mon-addrv value {addr_arg} must include port number')
        port_str = hasport[0]
        addr = re.sub(r'^v\d+:', '', addr)  # strip off v1: or v2: prefix
        base_ip = addr[0:-(len(port_str)) - 1]
        addrv_args.append(EndPoint(base_ip, int(port_str)))

    return addrv_args
4858
4859
def parse_mon_ip(mon_ip: str) -> List[EndPoint]:
    """Parse mon-ip param into a list of mon end points."""
    endpoints: List[EndPoint] = []
    port_match = re.search(r':(\d+)$', mon_ip)
    if port_match:
        port_str = port_match.group(1)
        base_ip = mon_ip[0:-(len(port_str)) - 1]
        endpoints.append(EndPoint(base_ip, int(port_str)))
    else:
        # No port provided: use fixed ports for ceph monitor
        endpoints.append(EndPoint(mon_ip, 3300))
        endpoints.append(EndPoint(mon_ip, 6789))

    return endpoints
4875
4876
def build_addrv_params(addrv: List[EndPoint]) -> str:
    """Convert mon end-points (ip:port) into the format: [v[1|2]:ip:port1]"""
    if len(addrv) > 2:
        raise Error('Detected a local mon-addrv list with more than 2 entries.')
    port_to_ver: Dict[int, str] = {6789: 'v1', 3300: 'v2'}
    entries: List[str] = []
    for ep in addrv:
        try:
            ver = port_to_ver[ep.port]
        except KeyError:
            ver = 'v2'  # default mon protocol version if port is not provided
            logger.warning(f'Using msgr2 protocol for unrecognized port {ep}')
        entries.append(f'{ver}:{ep.ip}:{ep.port}')

    return '[{0}]'.format(','.join(entries))
4893
4894
def get_public_net_from_cfg(ctx: CephadmContext) -> Optional[str]:
    """Get mon public network from configuration file.

    Returns None when the conf file does not define one; otherwise validates
    it (CIDR syntax, locally configured, compatible with --mon-ip/--mon-addrv)
    and returns the network string.
    """
    cp = read_config(ctx.config)
    if not cp.has_option('global', 'public_network'):
        return None

    # Ensure all public CIDR networks are valid
    public_network = cp.get('global', 'public_network').strip('"').strip("'")
    rc, _, err_msg = check_subnet(public_network)
    if rc:
        raise Error(f'Invalid public_network {public_network} parameter: {err_msg}')

    # Ensure at least one of the public CIDR networks is configured locally
    configured_subnets = set([x.strip() for x in public_network.split(',')])
    local_subnets = set(list_networks(ctx))
    missing_subnets = configured_subnets - local_subnets
    for net in missing_subnets:
        logger.warning(f'The public CIDR network {net} (from -c conf file) is not configured locally.')
    if missing_subnets == configured_subnets:
        raise Error(f'None of the public CIDR network(s) {configured_subnets} (from -c conf file) is configured locally.')

    # Ensure public_network is compatible with the provided mon-ip (or mon-addrv)
    if ctx.mon_ip:
        if not ip_in_subnets(ctx.mon_ip, public_network):
            raise Error(f'The provided --mon-ip {ctx.mon_ip} does not belong to any public_network(s) {public_network}')
    elif ctx.mon_addrv:
        for addrv in parse_mon_addrv(ctx.mon_addrv):
            if not ip_in_subnets(addrv.ip, public_network):
                raise Error(f'The provided --mon-addrv {addrv.ip} ip does not belong to any public_network(s) {public_network}')

    logger.debug(f'Using mon public network from configuration file {public_network}')
    return public_network
4931
4932
def infer_mon_network(ctx: CephadmContext, mon_eps: List[EndPoint]) -> Optional[str]:
    """Infer mon public network from local network.

    :param mon_eps: candidate mon end points (ip:port)
    :returns: comma separated list of the local CIDR networks that contain
        at least one mon IP
    :raises Error: when no local network contains any of the mon IPs
    """
    # Make sure IP is configured locally, and then figure out the CIDR network
    mon_networks = []
    for net, ifaces in list_networks(ctx).items():
        # build local_ips list for the specified network
        # NOTE: fixed annotation — these are ipaddress objects, not strings,
        # so the membership test below compares parsed addresses
        local_ips: List[Union[ipaddress.IPv4Address, ipaddress.IPv6Address]] = []
        for _, ls in ifaces.items():
            local_ips.extend([ipaddress.ip_address(ip) for ip in ls])

        # check if any of mon ips belong to this net
        for mon_ep in mon_eps:
            try:
                if ipaddress.ip_address(unwrap_ipv6(mon_ep.ip)) in local_ips:
                    mon_networks.append(net)
                    logger.info(f'Mon IP `{mon_ep.ip}` is in CIDR network `{net}`')
            except ValueError as e:
                logger.warning(f'Cannot infer CIDR network for mon IP `{mon_ep.ip}` : {e}')

    if not mon_networks:
        raise Error('Cannot infer CIDR network. Pass --skip-mon-network to configure it later')
    else:
        logger.debug(f'Inferred mon public CIDR from local network configuration {mon_networks}')

    mon_networks = list(set(mon_networks))  # remove duplicates
    return ','.join(mon_networks)
4959
4960
def prepare_mon_addresses(ctx: CephadmContext) -> Tuple[str, bool, Optional[str]]:
    """Get mon public network configuration.

    Returns (mon addrv string, ipv6 flag, mon network or None).
    """
    ipv6 = False
    endpoints: List[EndPoint] = []
    mon_addrv: str = ''  # i.e: [v2:192.168.100.1:3300,v1:192.168.100.1:6789]

    if ctx.mon_ip:
        ipv6 = is_ipv6(ctx.mon_ip)
        if ipv6:
            ctx.mon_ip = wrap_ipv6(ctx.mon_ip)
        endpoints = parse_mon_ip(ctx.mon_ip)
        mon_addrv = build_addrv_params(endpoints)
    elif ctx.mon_addrv:
        ipv6 = ctx.mon_addrv.count('[') > 1
        endpoints = parse_mon_addrv(ctx.mon_addrv)
        mon_addrv = ctx.mon_addrv
    else:
        raise Error('must specify --mon-ip or --mon-addrv')

    # verify each endpoint's IP is local and its port is free
    for end_point in endpoints:
        check_ip_port(ctx, end_point)

    logger.debug(f'Base mon IP(s) is {endpoints}, mon addrv is {mon_addrv}')
    mon_network = None
    if not ctx.skip_mon_network:
        mon_network = get_public_net_from_cfg(ctx) or infer_mon_network(ctx, endpoints)

    return (mon_addrv, ipv6, mon_network)
4990
4991
def prepare_cluster_network(ctx: CephadmContext) -> Tuple[str, bool]:
    """Determine and validate the cluster network (from CLI or conf file).

    The cluster network may not exist on this node, so all we can do is
    validate that the address given is valid ipv4 or ipv6 subnet.
    Returns (cluster network string, ipv6 flag).
    """
    ipv6_cluster_network = False
    cp = read_config(ctx.config)
    cluster_network = ctx.cluster_network
    if cluster_network is None and cp.has_option('global', 'cluster_network'):
        cluster_network = cp.get('global', 'cluster_network').strip('"').strip("'")

    if not cluster_network:
        logger.info('Internal network (--cluster-network) has not '
                    'been provided, OSD replication will default to '
                    'the public_network')
    else:
        cluster_nets = set([x.strip() for x in cluster_network.split(',')])
        local_subnets = set(list_networks(ctx))
        # warn (but do not fail) about subnets not present on this host
        for net in cluster_nets - local_subnets:
            logger.warning(f'The cluster CIDR network {net} is not configured locally.')

        rc, versions, err_msg = check_subnet(cluster_network)
        if rc:
            raise Error(f'Invalid --cluster-network parameter: {err_msg}')
        ipv6_cluster_network = 6 in versions

    return cluster_network, ipv6_cluster_network
5018
5019
def create_initial_keys(
    ctx: CephadmContext,
    uid: int, gid: int,
    mgr_id: str
) -> Tuple[str, str, str, Any, Any]:  # type: ignore
    """Generate mon/admin/mgr keys and write the bootstrap and admin keyrings.

    Returns (mon_key, mgr_key, admin_key, bootstrap_keyring, admin_keyring);
    the last two are temp-file objects owned by uid:gid.
    """
    _image = ctx.image

    # create some initial keys
    logger.info('Creating initial keys...')

    def _gen_key() -> str:
        # each key comes from ceph-authtool --gen-print-key run in the image
        return CephContainer(
            ctx,
            image=_image,
            entrypoint='/usr/bin/ceph-authtool',
            args=['--gen-print-key'],
        ).run().strip()

    mon_key = _gen_key()
    admin_key = _gen_key()
    mgr_key = _gen_key()

    keyring = ('[mon.]\n'
               '\tkey = %s\n'
               '\tcaps mon = allow *\n'
               '[client.admin]\n'
               '\tkey = %s\n'
               '\tcaps mon = allow *\n'
               '\tcaps mds = allow *\n'
               '\tcaps mgr = allow *\n'
               '\tcaps osd = allow *\n'
               '[mgr.%s]\n'
               '\tkey = %s\n'
               '\tcaps mon = profile mgr\n'
               '\tcaps mds = allow *\n'
               '\tcaps osd = allow *\n'
               % (mon_key, admin_key, mgr_id, mgr_key))

    admin_keyring = write_tmp('[client.admin]\n'
                              '\tkey = ' + admin_key + '\n',
                              uid, gid)

    # tmp keyring file
    bootstrap_keyring = write_tmp(keyring, uid, gid)
    return (mon_key, mgr_key, admin_key,
            bootstrap_keyring, admin_keyring)
5073
5074
def create_initial_monmap(
    ctx: CephadmContext,
    uid: int, gid: int,
    fsid: str,
    mon_id: str, mon_addr: str
) -> Any:
    """Build the initial monmap via monmaptool and chown it to the ceph user.

    Returns the temp file holding the monmap.
    """
    logger.info('Creating initial monmap...')
    monmap = write_tmp('', 0, 0)
    monmaptool_args = [
        '--create',
        '--clobber',
        '--fsid', fsid,
        '--addv', mon_id, mon_addr,
        '/tmp/monmap'
    ]
    out = CephContainer(
        ctx,
        image=ctx.image,
        entrypoint='/usr/bin/monmaptool',
        args=monmaptool_args,
        volume_mounts={monmap.name: '/tmp/monmap:z'},
    ).run()
    logger.debug(f'monmaptool for {mon_id} {mon_addr} on {out}')

    # pass monmap file to ceph user for use by ceph-mon --mkfs below
    os.fchown(monmap.fileno(), uid, gid)
    return monmap
5103
5104
def prepare_create_mon(
    ctx: CephadmContext,
    uid: int, gid: int,
    fsid: str, mon_id: str,
    bootstrap_keyring_path: str,
    monmap_path: str
) -> Tuple[str, str]:
    """Run `ceph-mon --mkfs` for the bootstrap mon.

    Returns (mon data dir, cluster log dir) on the host.
    """
    logger.info('Creating mon...')
    create_daemon_dirs(ctx, fsid, 'mon', mon_id, uid, gid)
    mon_dir = get_data_dir(fsid, ctx.data_dir, 'mon', mon_id)
    log_dir = get_log_dir(fsid, ctx.log_dir)
    mkfs_args = [
        '--mkfs',
        '-i', mon_id,
        '--fsid', fsid,
        '-c', '/dev/null',
        '--monmap', '/tmp/monmap',
        '--keyring', '/tmp/keyring',
    ] + get_daemon_args(ctx, fsid, 'mon', mon_id)
    # keyring and monmap are passed in via bind mounts
    mounts = {
        log_dir: '/var/log/ceph:z',
        mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % (mon_id),
        bootstrap_keyring_path: '/tmp/keyring:z',
        monmap_path: '/tmp/monmap:z',
    }
    out = CephContainer(
        ctx,
        image=ctx.image,
        entrypoint='/usr/bin/ceph-mon',
        args=mkfs_args,
        volume_mounts=mounts,
    ).run()
    logger.debug(f'create mon.{mon_id} on {out}')
    return (mon_dir, log_dir)
5137
5138
def create_mon(
    ctx: CephadmContext,
    uid: int, gid: int,
    fsid: str, mon_id: str
) -> None:
    """Deploy the bootstrap mon daemon (container plus systemd unit)."""
    mon_c = get_container(ctx, fsid, 'mon', mon_id)
    # record the service name in the deployment metadata
    ctx.meta_json = json.dumps({'service_name': 'mon'})
    deploy_daemon(ctx, fsid, 'mon', mon_id, mon_c, uid, gid,
                  config=None, keyring=None)
5148
5149
def wait_for_mon(
    ctx: CephadmContext,
    mon_id: str, mon_dir: str,
    admin_keyring_path: str, config_path: str
) -> None:
    """Block until the freshly created mon answers `ceph status`."""
    logger.info('Waiting for mon to start...')
    status_container = CephContainer(
        ctx,
        image=ctx.image,
        entrypoint='/usr/bin/ceph',
        args=['status'],
        volume_mounts={
            mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % (mon_id),
            admin_keyring_path: '/etc/ceph/ceph.client.admin.keyring:z',
            config_path: '/etc/ceph/ceph.conf:z',
        },
    )

    # wait for the service to become available
    def is_mon_available() -> bool:
        # a successful `ceph status` means the mon is up and serving
        timeout = ctx.timeout if ctx.timeout else 60  # seconds
        out, err, ret = call(ctx, status_container.run_cmd(),
                             desc=status_container.entrypoint,
                             timeout=timeout,
                             verbosity=CallVerbosity.QUIET_UNLESS_ERROR)
        return ret == 0

    is_available(ctx, 'mon', is_mon_available)
5180
5181
def create_mgr(
    ctx: CephadmContext,
    uid: int, gid: int,
    fsid: str, mgr_id: str, mgr_key: str,
    config: str, clifunc: Callable
) -> None:
    """Deploy the bootstrap mgr and wait until the mgrmap reports it available."""
    logger.info('Creating mgr...')
    mgr_keyring = '[mgr.%s]\n\tkey = %s\n' % (mgr_id, mgr_key)
    mgr_c = get_container(ctx, fsid, 'mgr', mgr_id)
    # Note:the default port used by the Prometheus node exporter is opened in fw
    ctx.meta_json = json.dumps({'service_name': 'mgr'})
    deploy_daemon(ctx, fsid, 'mgr', mgr_id, mgr_c, uid, gid,
                  config=config, keyring=mgr_keyring, ports=[9283])

    # wait for the service to become available
    logger.info('Waiting for mgr to start...')

    def is_mgr_available() -> bool:
        timeout = ctx.timeout if ctx.timeout else 60  # seconds
        try:
            out = clifunc(['status', '-f', 'json-pretty'],
                          timeout=timeout,
                          verbosity=CallVerbosity.QUIET_UNLESS_ERROR)
            status = json.loads(out)
            return status.get('mgrmap', {}).get('available', False)
        except Exception as e:
            logger.debug('status failed: %s' % e)
            return False

    is_available(ctx, 'mgr', is_mgr_available)
5212
5213
def prepare_ssh(
    ctx: CephadmContext,
    cli: Callable, wait_for_mgr_restart: Callable
) -> None:
    """Configure the cephadm mgr module's SSH identity and seed the cluster.

    Sets the SSH user and keys (provided or freshly generated) via the
    `cephadm` mgr module, adds the bootstrap host to the orchestrator, and
    applies the initial mon/mgr/crash/monitoring/logging service specs.

    NOTE(review): wait_for_mgr_restart is accepted but never used in this
    body — presumably kept for call-site symmetry; confirm before removing.
    """
    cli(['cephadm', 'set-user', ctx.ssh_user])

    if ctx.ssh_config:
        # user-supplied ssh config file, passed into the container via a mount
        logger.info('Using provided ssh config...')
        mounts = {
            pathify(ctx.ssh_config.name): '/tmp/cephadm-ssh-config:z',
        }
        cli(['cephadm', 'set-ssh-config', '-i', '/tmp/cephadm-ssh-config'], extra_mounts=mounts)

    if ctx.ssh_private_key and ctx.ssh_public_key:
        # user-supplied key pair, also mounted into the container
        logger.info('Using provided ssh keys...')
        mounts = {
            pathify(ctx.ssh_private_key.name): '/tmp/cephadm-ssh-key:z',
            pathify(ctx.ssh_public_key.name): '/tmp/cephadm-ssh-key.pub:z'
        }
        cli(['cephadm', 'set-priv-key', '-i', '/tmp/cephadm-ssh-key'], extra_mounts=mounts)
        cli(['cephadm', 'set-pub-key', '-i', '/tmp/cephadm-ssh-key.pub'], extra_mounts=mounts)
        ssh_pub = cli(['cephadm', 'get-pub-key'])
    else:
        # no keys given: have the mgr module generate a pair and save the
        # public half next to the other bootstrap output files
        logger.info('Generating ssh key...')
        cli(['cephadm', 'generate-key'])
        ssh_pub = cli(['cephadm', 'get-pub-key'])
        with open(ctx.output_pub_ssh_key, 'w') as f:
            f.write(ssh_pub)
        logger.info('Wrote public SSH key to %s' % ctx.output_pub_ssh_key)

    # install the public key in the ssh user's authorized_keys
    authorize_ssh_key(ssh_pub, ctx.ssh_user)

    host = get_hostname()
    logger.info('Adding host %s...' % host)
    try:
        args = ['orch', 'host', 'add', host]
        # pass the mon IP (unwrapped from IPv6 brackets) as the host address
        if ctx.mon_ip:
            args.append(unwrap_ipv6(ctx.mon_ip))
        elif ctx.mon_addrv:
            addrv_args = parse_mon_addrv(ctx.mon_addrv)
            args.append(unwrap_ipv6(addrv_args[0].ip))
        cli(args)
    except RuntimeError as e:
        raise Error('Failed to add host <%s>: %s' % (host, e))

    # apply initial mon/mgr specs (unmanaged when requested)
    for t in ['mon', 'mgr']:
        if not ctx.orphan_initial_daemons:
            logger.info('Deploying %s service with default placement...' % t)
            cli(['orch', 'apply', t])
        else:
            logger.info('Deploying unmanaged %s service...' % t)
            cli(['orch', 'apply', t, '--unmanaged'])

    if not ctx.orphan_initial_daemons:
        logger.info('Deploying crash service with default placement...')
        cli(['orch', 'apply', 'crash'])

    if not ctx.skip_monitoring_stack:
        # default monitoring stack services
        for t in ['ceph-exporter', 'prometheus', 'grafana', 'node-exporter', 'alertmanager']:
            logger.info('Deploying %s service with default placement...' % t)
            cli(['orch', 'apply', t])

    if ctx.with_centralized_logging:
        # opt-in centralized logging stack
        for t in ['loki', 'promtail']:
            logger.info('Deploying %s service with default placement...' % t)
            cli(['orch', 'apply', t])
5281
5282
def enable_cephadm_mgr_module(
    cli: Callable, wait_for_mgr_restart: Callable
) -> None:
    """Enable the cephadm mgr module and select it as the orchestrator backend.

    Calls wait_for_mgr_restart() between the two steps before pointing the
    orchestrator at the cephadm backend.
    """
    logger.info('Enabling cephadm module...')
    cli(['mgr', 'module', 'enable', 'cephadm'])
    wait_for_mgr_restart()
    logger.info('Setting orchestrator backend to cephadm...')
    cli(['orch', 'set', 'backend', 'cephadm'])
5292
5293
def prepare_dashboard(
    ctx: CephadmContext,
    uid: int, gid: int,
    cli: Callable, wait_for_mgr_restart: Callable
) -> None:
    """Enable the dashboard mgr module, install its TLS certificate, create
    the initial admin user and open the dashboard port in the firewall."""

    # Configure SSL port (cephadm only allows to configure dashboard SSL port)
    # if the user does not want to use SSL he can change this setting once the cluster is up
    cli(['config', 'set', 'mgr', 'mgr/dashboard/ssl_server_port', str(ctx.ssl_dashboard_port)])

    # configuring dashboard parameters
    logger.info('Enabling the dashboard module...')
    cli(['mgr', 'module', 'enable', 'dashboard'])
    wait_for_mgr_restart()

    # dashboard crt and key
    if ctx.dashboard_key and ctx.dashboard_crt:
        logger.info('Using provided dashboard certificate...')
        mounts = {
            pathify(ctx.dashboard_crt.name): '/tmp/dashboard.crt:z',
            pathify(ctx.dashboard_key.name): '/tmp/dashboard.key:z'
        }
        cli(['dashboard', 'set-ssl-certificate', '-i', '/tmp/dashboard.crt'], extra_mounts=mounts)
        cli(['dashboard', 'set-ssl-certificate-key', '-i', '/tmp/dashboard.key'], extra_mounts=mounts)
    else:
        logger.info('Generating a dashboard self-signed certificate...')
        cli(['dashboard', 'create-self-signed-cert'])

    logger.info('Creating initial admin user...')
    password = ctx.initial_dashboard_password or generate_password()
    # the password is handed over via a mounted temp file, never on the command line
    tmp_password_file = write_tmp(password, uid, gid)
    cmd = ['dashboard', 'ac-user-create', ctx.initial_dashboard_user, '-i', '/tmp/dashboard.pw', 'administrator', '--force-password']
    if not ctx.dashboard_password_noupdate:
        cmd.append('--pwd-update-required')
    cli(cmd, extra_mounts={pathify(tmp_password_file.name): '/tmp/dashboard.pw:z'})
    logger.info('Fetching dashboard port number...')
    out = cli(['config', 'get', 'mgr', 'mgr/dashboard/ssl_server_port'])
    port = int(out)

    # Open dashboard port
    if not ('skip_firewalld' in ctx and ctx.skip_firewalld):
        fw = Firewalld(ctx)
        fw.open_ports([port])
        fw.apply_rules()

    logger.info('Ceph Dashboard is now available at:\n\n'
                '\t URL: https://%s:%s/\n'
                '\t User: %s\n'
                '\tPassword: %s\n' % (
                    get_fqdn(), port,
                    ctx.initial_dashboard_user,
                    password))
5346
5347
def prepare_bootstrap_config(
    ctx: CephadmContext,
    fsid: str, mon_addr: str, image: str

) -> str:
    """Build the initial ceph.conf contents for bootstrap.

    Starts from the user-provided conf (if any), forces fsid/mon_host/
    container_image, applies hardening and optional single-host / log-to-file
    defaults, and returns the resulting config text.
    """
    cp = read_config(ctx.config)
    if not cp.has_section('global'):
        cp.add_section('global')
    cp.set('global', 'fsid', fsid)
    cp.set('global', 'mon_host', mon_addr)
    cp.set('global', 'container_image', image)

    if not cp.has_section('mon'):
        cp.add_section('mon')
    # harden new clusters unless the user already set the option
    # (either underscore or space spelling)
    if (
            not cp.has_option('mon', 'auth_allow_insecure_global_id_reclaim')
            and not cp.has_option('mon', 'auth allow insecure global id reclaim')
    ):
        cp.set('mon', 'auth_allow_insecure_global_id_reclaim', 'false')

    if ctx.single_host_defaults:
        logger.info('Adjusting default settings to suit single-host cluster...')
        # replicate across osds, not hosts
        if (
                not cp.has_option('global', 'osd_crush_chooseleaf_type')
                and not cp.has_option('global', 'osd crush chooseleaf type')
        ):
            cp.set('global', 'osd_crush_chooseleaf_type', '0')
        # replica 2x
        if (
                not cp.has_option('global', 'osd_pool_default_size')
                and not cp.has_option('global', 'osd pool default size')
        ):
            cp.set('global', 'osd_pool_default_size', '2')
        # disable mgr standby modules (so we can colocate multiple mgrs on one host)
        if not cp.has_section('mgr'):
            cp.add_section('mgr')
        if (
                not cp.has_option('mgr', 'mgr_standby_modules')
                and not cp.has_option('mgr', 'mgr standby modules')
        ):
            cp.set('mgr', 'mgr_standby_modules', 'false')
    if ctx.log_to_file:
        # route daemon and cluster logs to files instead of stderr/journald
        cp.set('global', 'log_to_file', 'true')
        cp.set('global', 'log_to_stderr', 'false')
        cp.set('global', 'log_to_journald', 'false')
        cp.set('global', 'mon_cluster_log_to_file', 'true')
        cp.set('global', 'mon_cluster_log_to_stderr', 'false')
        cp.set('global', 'mon_cluster_log_to_journald', 'false')

    cpf = StringIO()
    cp.write(cpf)
    config = cpf.getvalue()

    # log into the registry now so the image pull during bootstrap succeeds
    if ctx.registry_json or ctx.registry_url:
        command_registry_login(ctx)

    return config
5407
5408
def finish_bootstrap_config(
    ctx: CephadmContext,
    fsid: str,
    config: str,
    mon_id: str, mon_dir: str,
    mon_network: Optional[str], ipv6: bool,
    cli: Callable,
    cluster_network: Optional[str], ipv6_cluster_network: bool

) -> None:
    """Finalize the bootstrap ceph.conf.

    Unless disabled, assimilates the conf into the mon config db and
    regenerates a minimal conf (restarting the mon); then records the
    public/cluster networks and IPv6 binding, and writes the final config
    to ctx.output_config.
    """
    if not ctx.no_minimize_config:
        logger.info('Assimilating anything we can from ceph.conf...')
        cli([
            'config', 'assimilate-conf',
            '-i', '/var/lib/ceph/mon/ceph-%s/config' % mon_id
        ], {
            mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % mon_id
        })
        logger.info('Generating new minimal ceph.conf...')
        cli([
            'config', 'generate-minimal-conf',
            '-o', '/var/lib/ceph/mon/ceph-%s/config' % mon_id
        ], {
            mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % mon_id
        })
        # re-read our minimized config
        with open(mon_dir + '/config', 'r') as f:
            config = f.read()
        logger.info('Restarting the monitor...')
        call_throws(ctx, [
            'systemctl',
            'restart',
            get_unit_name(fsid, 'mon', mon_id)
        ])
    elif 'image' in ctx and ctx.image:
        # we still want to assimilate the given container image if provided
        cli(['config', 'set', 'global', 'container_image', f'{ctx.image}'])

    if mon_network:
        logger.info(f'Setting mon public_network to {mon_network}')
        cli(['config', 'set', 'mon', 'public_network', mon_network])

    if cluster_network:
        logger.info(f'Setting cluster_network to {cluster_network}')
        cli(['config', 'set', 'global', 'cluster_network', cluster_network])

    if ipv6 or ipv6_cluster_network:
        logger.info('Enabling IPv6 (ms_bind_ipv6) binding')
        cli(['config', 'set', 'global', 'ms_bind_ipv6', 'true'])

    with open(ctx.output_config, 'w') as f:
        f.write(config)
    logger.info('Wrote config to %s' % ctx.output_config)
    # removed a trailing no-op `pass` statement (dead code)
5463
5464
5465 def _extract_host_info_from_applied_spec(f: Iterable[str]) -> List[Dict[str, str]]:
5466 # overall goal of this function is to go through an applied spec and find
5467 # the hostname (and addr is provided) for each host spec in the applied spec.
5468 # Generally, we should be able to just pass the spec to the mgr module where
5469 # proper yaml parsing can happen, but for host specs in particular we want to
5470 # be able to distribute ssh keys, which requires finding the hostname (and addr
5471 # if possible) for each potential host spec in the applied spec.
5472
5473 specs: List[List[str]] = []
5474 current_spec: List[str] = []
5475 for line in f:
5476 if re.search(r'^---\s+', line):
5477 if current_spec:
5478 specs.append(current_spec)
5479 current_spec = []
5480 else:
5481 line = line.strip()
5482 if line:
5483 current_spec.append(line)
5484 if current_spec:
5485 specs.append(current_spec)
5486
5487 host_specs: List[List[str]] = []
5488 for spec in specs:
5489 for line in spec:
5490 if 'service_type' in line:
5491 try:
5492 _, type = line.split(':')
5493 type = type.strip()
5494 if type == 'host':
5495 host_specs.append(spec)
5496 except ValueError as e:
5497 spec_str = '\n'.join(spec)
5498 logger.error(f'Failed to pull service_type from spec:\n{spec_str}. Got error: {e}')
5499 break
5500 spec_str = '\n'.join(spec)
5501 logger.error(f'Failed to find service_type within spec:\n{spec_str}')
5502
5503 host_dicts = []
5504 for s in host_specs:
5505 host_dict = _extract_host_info_from_spec(s)
5506 # if host_dict is empty here, we failed to pull the hostname
5507 # for the host from the spec. This should have already been logged
5508 # so at this point we just don't want to include it in our output
5509 if host_dict:
5510 host_dicts.append(host_dict)
5511
5512 return host_dicts
5513
5514
5515 def _extract_host_info_from_spec(host_spec: List[str]) -> Dict[str, str]:
5516 # note:for our purposes here, we only really want the hostname
5517 # and address of the host from each of these specs in order to
5518 # be able to distribute ssh keys. We will later apply the spec
5519 # through the mgr module where proper yaml parsing can be done
5520 # The returned dicts from this function should only contain
5521 # one or two entries, one (required) for hostname, one (optional) for addr
5522 # {
5523 # hostname: <hostname>
5524 # addr: <ip-addr>
5525 # }
5526 # if we fail to find the hostname, an empty dict is returned
5527
5528 host_dict = {} # type: Dict[str, str]
5529 for line in host_spec:
5530 for field in ['hostname', 'addr']:
5531 if field in line:
5532 try:
5533 _, field_value = line.split(':')
5534 field_value = field_value.strip()
5535 host_dict[field] = field_value
5536 except ValueError as e:
5537 spec_str = '\n'.join(host_spec)
5538 logger.error(f'Error trying to pull {field} from host spec:\n{spec_str}. Got error: {e}')
5539
5540 if 'hostname' not in host_dict:
5541 spec_str = '\n'.join(host_spec)
5542 logger.error(f'Could not find hostname in host spec:\n{spec_str}')
5543 return {}
5544 return host_dict
5545
5546
def _distribute_ssh_keys(ctx: CephadmContext, host_info: Dict[str, str], bootstrap_hostname: str) -> int:
    """Copy the cluster ssh public key to a host from an applied spec.

    Returns 0 on success (or when the target is the bootstrap host itself),
    1 when ssh-copy-id fails.
    """
    # copy ssh key to hosts in host spec (used for apply spec)
    ssh_key = CEPH_DEFAULT_PUBKEY
    if ctx.ssh_public_key:
        ssh_key = ctx.ssh_public_key.name

    # the bootstrap host already has the key installed
    if bootstrap_hostname == host_info['hostname']:
        return 0

    addr = host_info.get('addr', host_info['hostname'])
    out, err, code = call(ctx, ['sudo', '-u', ctx.ssh_user, 'ssh-copy-id', '-f', '-i', ssh_key, '-o StrictHostKeyChecking=no', '%s@%s' % (ctx.ssh_user, addr)])
    if code:
        logger.error('\nCopying ssh key to host %s at address %s failed!\n' % (host_info['hostname'], addr))
        return 1
    logger.info('Added ssh key to host %s at address %s' % (host_info['hostname'], addr))
    return 0
5565
5566
def save_cluster_config(ctx: CephadmContext, uid: int, gid: int, fsid: str) -> None:
    """Save cluster configuration to the per fsid directory """
    def _copy(src: str, dst: str) -> None:
        # some outputs may be unset; skip silently in that case
        if src:
            shutil.copyfile(src, dst)

    conf_dir = f'{ctx.data_dir}/{fsid}/{CEPH_CONF_DIR}'
    makedirs(conf_dir, uid, gid, DATA_DIR_MODE)
    if not os.path.exists(conf_dir):
        logger.warning(f'Cannot create cluster configuration directory {conf_dir}')
        return

    logger.info(f'Saving cluster configuration to {conf_dir} directory')
    _copy(ctx.output_config, os.path.join(conf_dir, CEPH_CONF))
    _copy(ctx.output_keyring, os.path.join(conf_dir, CEPH_KEYRING))
    # ctx.output_pub_ssh_key may not exist if user has provided custom ssh keys
    if os.path.exists(ctx.output_pub_ssh_key):
        _copy(ctx.output_pub_ssh_key, os.path.join(conf_dir, CEPH_PUBKEY))
5584
5585
@default_image
def command_bootstrap(ctx):
    # type: (CephadmContext) -> int
    """Bootstrap a brand new Ceph cluster on this host.

    Creates the first mon and mgr, writes the admin conf/keyring/ssh-key
    output files, enables the cephadm mgr module, and optionally sets up
    ssh, the dashboard, registry credentials and an applied service spec.

    Returns ctx.error_code: 0 on success, -EINVAL if applying a spec failed.
    """

    ctx.error_code = 0

    # fill in default output locations for anything the caller didn't set
    if not ctx.output_config:
        ctx.output_config = os.path.join(ctx.output_dir, CEPH_CONF)
    if not ctx.output_keyring:
        ctx.output_keyring = os.path.join(ctx.output_dir, CEPH_KEYRING)
    if not ctx.output_pub_ssh_key:
        ctx.output_pub_ssh_key = os.path.join(ctx.output_dir, CEPH_PUBKEY)

    if bool(ctx.ssh_private_key) is not bool(ctx.ssh_public_key):
        raise Error('--ssh-private-key and --ssh-public-key must be provided together or not at all.')

    # refuse to reuse an fsid that already has a data dir on this host
    if ctx.fsid:
        data_dir_base = os.path.join(ctx.data_dir, ctx.fsid)
        if os.path.exists(data_dir_base):
            raise Error(f"A cluster with the same fsid '{ctx.fsid}' already exists.")
        else:
            logger.warning('Specifying an fsid for your cluster offers no advantages and may increase the likelihood of fsid conflicts.')

    # verify output files
    for f in [ctx.output_config, ctx.output_keyring,
              ctx.output_pub_ssh_key]:
        if not ctx.allow_overwrite:
            if os.path.exists(f):
                raise Error('%s already exists; delete or pass '
                            '--allow-overwrite to overwrite' % f)
        dirname = os.path.dirname(f)
        if dirname and not os.path.exists(dirname):
            fname = os.path.basename(f)
            logger.info(f'Creating directory {dirname} for {fname}')
            try:
                # use makedirs to create intermediate missing dirs
                os.makedirs(dirname, 0o755)
            except PermissionError:
                raise Error(f'Unable to create {dirname} due to permissions failure. Retry with root, or sudo or preallocate the directory.')

    # user-supplied conf is re-assimilated again later once a mgr exists
    (user_conf, _) = get_config_and_keyring(ctx)

    if ctx.ssh_user != 'root':
        check_ssh_connectivity(ctx)

    if not ctx.skip_prepare_host:
        command_prepare_host(ctx)
    else:
        logger.info('Skip prepare_host')

    # initial vars
    fsid = ctx.fsid or make_fsid()
    if not is_fsid(fsid):
        raise Error('not an fsid: %s' % fsid)
    logger.info('Cluster fsid: %s' % fsid)

    hostname = get_hostname()
    if '.' in hostname and not ctx.allow_fqdn_hostname:
        raise Error('hostname is a fully qualified domain name (%s); either fix (e.g., "sudo hostname %s" or similar) or pass --allow-fqdn-hostname' % (hostname, hostname.split('.')[0]))
    mon_id = ctx.mon_id or get_short_hostname()
    mgr_id = ctx.mgr_id or generate_service_id()

    # serialize against other cephadm invocations on this fsid
    lock = FileLock(ctx, fsid)
    lock.acquire()

    (addr_arg, ipv6, mon_network) = prepare_mon_addresses(ctx)
    cluster_network, ipv6_cluster_network = prepare_cluster_network(ctx)

    config = prepare_bootstrap_config(ctx, fsid, addr_arg, ctx.image)

    if not ctx.skip_pull:
        try:
            _pull_image(ctx, ctx.image)
        except UnauthorizedRegistryError:
            err_str = 'Failed to pull container image. Check that correct registry credentials are provided in bootstrap by --registry-url, --registry-username, --registry-password, or supply --registry-json with credentials'
            logger.debug(f'Pulling image for bootstrap on {hostname} failed: {err_str}')
            raise Error(err_str)

    image_ver = CephContainer(ctx, ctx.image, 'ceph', ['--version']).run().strip()
    logger.info(f'Ceph version: {image_ver}')

    # guard against deploying a container whose release doesn't match this
    # cephadm build (e.g. quincy cephadm with a pacific image)
    if not ctx.allow_mismatched_release:
        image_release = image_ver.split()[4]
        if image_release not in \
                [DEFAULT_IMAGE_RELEASE, LATEST_STABLE_RELEASE]:
            raise Error(
                f'Container release {image_release} != cephadm release {DEFAULT_IMAGE_RELEASE};'
                ' please use matching version of cephadm (pass --allow-mismatched-release to continue anyway)'
            )

    logger.info('Extracting ceph user uid/gid from container image...')
    (uid, gid) = extract_uid_gid(ctx)

    # create some initial keys
    (mon_key, mgr_key, admin_key, bootstrap_keyring, admin_keyring) = create_initial_keys(ctx, uid, gid, mgr_id)

    monmap = create_initial_monmap(ctx, uid, gid, fsid, mon_id, addr_arg)
    (mon_dir, log_dir) = prepare_create_mon(ctx, uid, gid, fsid, mon_id,
                                            bootstrap_keyring.name, monmap.name)

    # write the mon's private copy of the bootstrap config
    with open(mon_dir + '/config', 'w') as f:
        os.fchown(f.fileno(), uid, gid)
        os.fchmod(f.fileno(), 0o600)
        f.write(config)

    make_var_run(ctx, fsid, uid, gid)
    create_mon(ctx, uid, gid, fsid, mon_id)

    # config to issue various CLI commands
    tmp_config = write_tmp(config, uid, gid)

    # a CLI helper to reduce our typing
    # NOTE: the mutable default for extra_mounts is safe here — it is only
    # read (copied into `mounts`), never mutated.
    def cli(cmd, extra_mounts={}, timeout=DEFAULT_TIMEOUT, verbosity=CallVerbosity.VERBOSE_ON_FAILURE):
        # type: (List[str], Dict[str, str], Optional[int], CallVerbosity) -> str
        mounts = {
            log_dir: '/var/log/ceph:z',
            admin_keyring.name: '/etc/ceph/ceph.client.admin.keyring:z',
            tmp_config.name: '/etc/ceph/ceph.conf:z',
        }
        for k, v in extra_mounts.items():
            mounts[k] = v
        timeout = timeout or ctx.timeout
        return CephContainer(
            ctx,
            image=ctx.image,
            entrypoint='/usr/bin/ceph',
            args=cmd,
            volume_mounts=mounts,
        ).run(timeout=timeout, verbosity=verbosity)

    wait_for_mon(ctx, mon_id, mon_dir, admin_keyring.name, tmp_config.name)

    finish_bootstrap_config(ctx, fsid, config, mon_id, mon_dir,
                            mon_network, ipv6, cli,
                            cluster_network, ipv6_cluster_network)

    # output files
    with open(ctx.output_keyring, 'w') as f:
        os.fchmod(f.fileno(), 0o600)
        f.write('[client.admin]\n'
                '\tkey = ' + admin_key + '\n')
    logger.info('Wrote keyring to %s' % ctx.output_keyring)

    # create mgr
    create_mgr(ctx, uid, gid, fsid, mgr_id, mgr_key, config, cli)

    if user_conf:
        # user given config settings were already assimilated earlier
        # but if the given settings contained any attributes in
        # the mgr (e.g. mgr/cephadm/container_image_prometheus)
        # they don't seem to be stored if there isn't a mgr yet.
        # Since re-assimilating the same conf settings should be
        # idempotent we can just do it again here.
        with tempfile.NamedTemporaryFile(buffering=0) as tmp:
            tmp.write(user_conf.encode('utf-8'))
            cli(['config', 'assimilate-conf',
                 '-i', '/var/lib/ceph/user.conf'],
                {tmp.name: '/var/lib/ceph/user.conf:z'})

    # wait for mgr to restart (after enabling a module)
    def wait_for_mgr_restart() -> None:
        # first get latest mgrmap epoch from the mon. try newer 'mgr
        # stat' command first, then fall back to 'mgr dump' if
        # necessary
        try:
            j = json_loads_retry(lambda: cli(['mgr', 'stat'], verbosity=CallVerbosity.QUIET_UNLESS_ERROR))
        except Exception:
            j = json_loads_retry(lambda: cli(['mgr', 'dump'], verbosity=CallVerbosity.QUIET_UNLESS_ERROR))
        epoch = j['epoch']

        # wait for mgr to have it
        logger.info('Waiting for the mgr to restart...')

        def mgr_has_latest_epoch():
            # type: () -> bool
            try:
                out = cli(['tell', 'mgr', 'mgr_status'])
                j = json.loads(out)
                return j['mgrmap_epoch'] >= epoch
            except Exception as e:
                logger.debug('tell mgr mgr_status failed: %s' % e)
                return False
        is_available(ctx, 'mgr epoch %d' % epoch, mgr_has_latest_epoch)

    enable_cephadm_mgr_module(cli, wait_for_mgr_restart)

    # ssh
    if not ctx.skip_ssh:
        prepare_ssh(ctx, cli, wait_for_mgr_restart)

    # store registry credentials so the orchestrator can log in on other hosts
    if ctx.registry_url and ctx.registry_username and ctx.registry_password:
        registry_credentials = {'url': ctx.registry_url, 'username': ctx.registry_username, 'password': ctx.registry_password}
        cli(['config-key', 'set', 'mgr/cephadm/registry_credentials', json.dumps(registry_credentials)])

    cli(['config', 'set', 'mgr', 'mgr/cephadm/container_init', str(ctx.container_init), '--force'])

    if not ctx.skip_dashboard:
        prepare_dashboard(ctx, uid, gid, cli, wait_for_mgr_restart)

    if ctx.output_config == CEPH_DEFAULT_CONF and not ctx.skip_admin_label and not ctx.no_minimize_config:
        logger.info('Enabling client.admin keyring and conf on hosts with "admin" label')
        try:
            cli(['orch', 'client-keyring', 'set', 'client.admin', 'label:_admin'])
            cli(['orch', 'host', 'label', 'add', get_hostname(), '_admin'])
        except Exception:
            # older clusters don't have the client-keyring orch command
            logger.info('Unable to set up "admin" label; assuming older version of Ceph')

    if ctx.apply_spec:
        logger.info('Applying %s to cluster' % ctx.apply_spec)
        # copy ssh key to hosts in spec file
        with open(ctx.apply_spec) as f:
            host_dicts = _extract_host_info_from_applied_spec(f)
            for h in host_dicts:
                _distribute_ssh_keys(ctx, h, hostname)

        mounts = {}
        mounts[pathify(ctx.apply_spec)] = '/tmp/spec.yml:ro'
        try:
            out = cli(['orch', 'apply', '-i', '/tmp/spec.yml'], extra_mounts=mounts)
            logger.info(out)
        except Exception:
            # a failed spec apply is non-fatal; surface it via the exit code
            ctx.error_code = -errno.EINVAL
            logger.info('\nApplying %s to cluster failed!\n' % ctx.apply_spec)

    save_cluster_config(ctx, uid, gid, fsid)

    # enable autotune for osd_memory_target
    logger.info('Enabling autotune for osd_memory_target')
    cli(['config', 'set', 'osd', 'osd_memory_target_autotune', 'true'])

    # Notify the Dashboard to show the 'Expand cluster' page on first log in.
    cli(['config-key', 'set', 'mgr/dashboard/cluster/status', 'INSTALLED'])

    logger.info('You can access the Ceph CLI as following in case of multi-cluster or non-default config:\n\n'
                '\tsudo %s shell --fsid %s -c %s -k %s\n' % (
                    sys.argv[0],
                    fsid,
                    ctx.output_config,
                    ctx.output_keyring))

    logger.info('Or, if you are only running a single cluster on this host:\n\n\tsudo %s shell \n' % (sys.argv[0]))

    logger.info('Please consider enabling telemetry to help improve Ceph:\n\n'
                '\tceph telemetry on\n\n'
                'For more information see:\n\n'
                '\thttps://docs.ceph.com/docs/master/mgr/telemetry/\n')
    logger.info('Bootstrap complete.')
    return ctx.error_code
5834
5835 ##################################
5836
5837
def command_registry_login(ctx: CephadmContext) -> int:
    """Log in to a custom container registry using either a credentials
    JSON file or explicit --registry-* arguments. Returns 0 on success."""
    if ctx.registry_json:
        logger.info('Pulling custom registry login info from %s.' % ctx.registry_json)
        creds = get_parm(ctx.registry_json)
        # all three fields must be present and non-empty
        if not (creds.get('url') and creds.get('username') and creds.get('password')):
            raise Error('json provided for custom registry login did not include all necessary fields. '
                        'Please setup json file as\n'
                        '{\n'
                        ' "url": "REGISTRY_URL",\n'
                        ' "username": "REGISTRY_USERNAME",\n'
                        ' "password": "REGISTRY_PASSWORD"\n'
                        '}\n')
        # stash the parsed credentials back on the context for later use
        ctx.registry_url = creds.get('url')
        ctx.registry_username = creds.get('username')
        ctx.registry_password = creds.get('password')
        registry_login(ctx, ctx.registry_url, ctx.registry_username, ctx.registry_password)
    elif ctx.registry_url and ctx.registry_username and ctx.registry_password:
        registry_login(ctx, ctx.registry_url, ctx.registry_username, ctx.registry_password)
    else:
        raise Error('Invalid custom registry arguments received. To login to a custom registry include '
                    '--registry-url, --registry-username and --registry-password '
                    'options or --registry-json option')
    return 0
5862
5863
def registry_login(ctx: CephadmContext, url: Optional[str], username: Optional[str], password: Optional[str]) -> None:
    """Run '<engine> login' against a custom registry; any failure is
    re-raised as a cephadm Error."""
    logger.info('Logging into custom registry.')
    try:
        engine = ctx.container_engine
        using_podman = isinstance(engine, Podman)
        cmd = [engine.path, 'login', '-u', username, '-p', password, url]
        if using_podman:
            # pin podman's credential store to a known path
            cmd.append('--authfile=/etc/ceph/podman-auth.json')
        out, _, _ = call_throws(ctx, cmd)
        if using_podman:
            # the auth file holds the plaintext-equivalent token; lock it down
            os.chmod('/etc/ceph/podman-auth.json', 0o600)
    except Exception:
        raise Error('Failed to login to custom registry @ %s as %s with given password' % (ctx.registry_url, ctx.registry_username))
5878
5879 ##################################
5880
5881
def extract_uid_gid_monitoring(ctx, daemon_type):
    # type: (CephadmContext, str) -> Tuple[int, int]
    """Return the (uid, gid) a monitoring daemon's container runs as,
    probed from the ownership of a well-known path inside its image."""
    # node-exporter runs as the fixed 'nobody' user; no probe needed
    if daemon_type == 'node-exporter':
        return 65534, 65534

    # path(s) inside each image whose owner identifies the service user
    probe_paths = {
        'prometheus': '/etc/prometheus',
        'grafana': '/var/lib/grafana',
        'loki': '/etc/loki',
        'promtail': '/etc/promtail',
        'alertmanager': ['/etc/alertmanager', '/etc/prometheus'],
    }
    try:
        path = probe_paths[daemon_type]
    except KeyError:
        raise Error('{} not implemented yet'.format(daemon_type))
    uid, gid = extract_uid_gid(ctx, file_path=path)
    return uid, gid
5900
5901
def get_deployment_container(ctx: CephadmContext,
                             fsid: str, daemon_type: str, daemon_id: Union[int, str],
                             privileged: bool = False,
                             ptrace: bool = False,
                             container_args: Optional[List[str]] = None) -> 'CephContainer':
    """Build the container for a `cephadm deploy`: same as get_container()
    plus any extra container/entrypoint args and custom config-file mounts
    carried on the context."""
    ctr = get_container(ctx, fsid, daemon_type, daemon_id, privileged, ptrace, container_args)

    if 'extra_container_args' in ctx and ctx.extra_container_args:
        ctr.container_args.extend(ctx.extra_container_args)
    if 'extra_entrypoint_args' in ctx and ctx.extra_entrypoint_args:
        ctr.args.extend(ctx.extra_entrypoint_args)

    if 'config_json' in ctx and ctx.config_json:
        conf_files = get_custom_config_files(ctx.config_json)
        mandatory_keys = ['mount_path', 'content']
        for conf in conf_files['custom_config_files']:
            # silently skip entries missing either required key
            if not all(k in conf for k in mandatory_keys):
                continue
            mount_path = conf['mount_path']
            # files are staged under the daemon's custom_config_files dir
            host_path = os.path.join(
                ctx.data_dir,
                fsid,
                'custom_config_files',
                f'{daemon_type}.{daemon_id}',
                os.path.basename(mount_path)
            )
            ctr.volume_mounts[host_path] = mount_path
    return ctr
5929
5930
@default_image
def command_deploy(ctx):
    # type: (CephadmContext) -> None
    """Deploy (or redeploy/reconfig) a single daemon of any supported type
    on this host, dispatching on the daemon type parsed from ctx.name
    ('<type>.<id>').
    """
    daemon_type, daemon_id = ctx.name.split('.', 1)

    # serialize against other cephadm invocations on this fsid
    lock = FileLock(ctx, ctx.fsid)
    lock.acquire()

    if daemon_type not in get_supported_daemons():
        raise Error('daemon type %s not recognized' % daemon_type)

    # a daemon whose unit or container is already running is a redeploy
    redeploy = False
    unit_name = get_unit_name(ctx.fsid, daemon_type, daemon_id)
    (_, state, _) = check_unit(ctx, unit_name)
    if state == 'running' or is_container_running(ctx, CephContainer.for_daemon(ctx, ctx.fsid, daemon_type, daemon_id, 'bash')):
        redeploy = True

    if ctx.reconfig:
        logger.info('%s daemon %s ...' % ('Reconfig', ctx.name))
    elif redeploy:
        logger.info('%s daemon %s ...' % ('Redeploy', ctx.name))
    else:
        logger.info('%s daemon %s ...' % ('Deploy', ctx.name))

    # Migrate sysctl conf files from /usr/lib to /etc
    migrate_sysctl_dir(ctx, ctx.fsid)

    # Get and check ports explicitly required to be opened
    daemon_ports = []  # type: List[int]

    # only check port in use if not reconfig or redeploy since service
    # we are redeploying/reconfiguring will already be using the port
    if not ctx.reconfig and not redeploy:
        if ctx.tcp_ports:
            daemon_ports = list(map(int, ctx.tcp_ports.split()))

    # --- per-daemon-type deployment ---
    if daemon_type in Ceph.daemons:
        config, keyring = get_config_and_keyring(ctx)
        uid, gid = extract_uid_gid(ctx)
        make_var_run(ctx, ctx.fsid, uid, gid)

        c = get_deployment_container(ctx, ctx.fsid, daemon_type, daemon_id,
                                     ptrace=ctx.allow_ptrace)
        deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
                      config=config, keyring=keyring,
                      osd_fsid=ctx.osd_fsid,
                      reconfig=ctx.reconfig,
                      ports=daemon_ports)

    elif daemon_type in Monitoring.components:
        # monitoring daemon - prometheus, grafana, alertmanager, node-exporter
        # Default Checks
        # make sure provided config-json is sufficient
        config = get_parm(ctx.config_json)  # type: ignore
        required_files = Monitoring.components[daemon_type].get('config-json-files', list())
        required_args = Monitoring.components[daemon_type].get('config-json-args', list())
        if required_files:
            if not config or not all(c in config.get('files', {}).keys() for c in required_files):  # type: ignore
                raise Error('{} deployment requires config-json which must '
                            'contain file content for {}'.format(daemon_type.capitalize(), ', '.join(required_files)))
        if required_args:
            if not config or not all(c in config.keys() for c in required_args):  # type: ignore
                raise Error('{} deployment requires config-json which must '
                            'contain arg for {}'.format(daemon_type.capitalize(), ', '.join(required_args)))

        uid, gid = extract_uid_gid_monitoring(ctx, daemon_type)
        c = get_deployment_container(ctx, ctx.fsid, daemon_type, daemon_id)
        deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
                      reconfig=ctx.reconfig,
                      ports=daemon_ports)

    elif daemon_type == NFSGanesha.daemon_type:
        # fall back to the default ganesha port map if none were requested
        if not ctx.reconfig and not redeploy and not daemon_ports:
            daemon_ports = list(NFSGanesha.port_map.values())

        config, keyring = get_config_and_keyring(ctx)
        # TODO: extract ganesha uid/gid (997, 994) ?
        uid, gid = extract_uid_gid(ctx)
        c = get_deployment_container(ctx, ctx.fsid, daemon_type, daemon_id)
        deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
                      config=config, keyring=keyring,
                      reconfig=ctx.reconfig,
                      ports=daemon_ports)

    elif daemon_type == CephIscsi.daemon_type:
        config, keyring = get_config_and_keyring(ctx)
        uid, gid = extract_uid_gid(ctx)
        c = get_deployment_container(ctx, ctx.fsid, daemon_type, daemon_id)
        deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
                      config=config, keyring=keyring,
                      reconfig=ctx.reconfig,
                      ports=daemon_ports)

    elif daemon_type == HAproxy.daemon_type:
        haproxy = HAproxy.init(ctx, ctx.fsid, daemon_id)
        uid, gid = haproxy.extract_uid_gid_haproxy()
        c = get_deployment_container(ctx, ctx.fsid, daemon_type, daemon_id)
        deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
                      reconfig=ctx.reconfig,
                      ports=daemon_ports)

    elif daemon_type == Keepalived.daemon_type:
        keepalived = Keepalived.init(ctx, ctx.fsid, daemon_id)
        uid, gid = keepalived.extract_uid_gid_keepalived()
        c = get_deployment_container(ctx, ctx.fsid, daemon_type, daemon_id)
        deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
                      reconfig=ctx.reconfig,
                      ports=daemon_ports)

    elif daemon_type == CustomContainer.daemon_type:
        cc = CustomContainer.init(ctx, ctx.fsid, daemon_id)
        if not ctx.reconfig and not redeploy:
            daemon_ports.extend(cc.ports)
        c = get_deployment_container(ctx, ctx.fsid, daemon_type, daemon_id,
                                     privileged=cc.privileged,
                                     ptrace=ctx.allow_ptrace)
        deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c,
                      uid=cc.uid, gid=cc.gid, config=None,
                      keyring=None, reconfig=ctx.reconfig,
                      ports=daemon_ports)

    elif daemon_type == CephadmAgent.daemon_type:
        # get current user gid and uid
        uid = os.getuid()
        gid = os.getgid()
        # the agent runs on the host, not in a container
        deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, None,
                      uid, gid, ports=daemon_ports)

    elif daemon_type == SNMPGateway.daemon_type:
        sc = SNMPGateway.init(ctx, ctx.fsid, daemon_id)
        c = get_deployment_container(ctx, ctx.fsid, daemon_type, daemon_id)
        deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c,
                      sc.uid, sc.gid,
                      ports=daemon_ports)

    else:
        raise Error('daemon type {} not implemented in command_deploy function'
                    .format(daemon_type))
6069
6070 ##################################
6071
6072
@infer_image
def command_run(ctx):
    # type: (CephadmContext) -> int
    """Run a daemon's container in the foreground; returns its exit code."""
    daemon_type, daemon_id = ctx.name.split('.', 1)
    container = get_container(ctx, ctx.fsid, daemon_type, daemon_id)
    return call_timeout(ctx, container.run_cmd(), ctx.timeout)
6080
6081 ##################################
6082
6083
@infer_fsid
@infer_config
@infer_image
@validate_fsid
def command_shell(ctx):
    # type: (CephadmContext) -> int
    """Launch an interactive shell (or run ctx.command) inside a ceph
    container with the cluster's conf/keyring and daemon mounts available.
    Returns the shell's exit code.
    """
    cp = read_config(ctx.config)
    if cp.has_option('global', 'fsid') and \
            cp.get('global', 'fsid') != ctx.fsid:
        raise Error('fsid does not match ceph.conf')

    # pick which daemon's mounts to replicate inside the shell
    if ctx.name:
        if '.' in ctx.name:
            (daemon_type, daemon_id) = ctx.name.split('.', 1)
        else:
            daemon_type = ctx.name
            daemon_id = None
    else:
        daemon_type = 'osd'  # get the most mounts
        daemon_id = None

    if ctx.fsid and daemon_type in Ceph.daemons:
        make_log_dir(ctx, ctx.fsid)

    if daemon_id and not ctx.fsid:
        raise Error('must pass --fsid to specify cluster')

    # in case a dedicated keyring for the specified fsid is found we use it.
    # Otherwise, use /etc/ceph files by default, if present.  We do this instead of
    # making these defaults in the arg parser because we don't want an error
    # if they don't exist.
    if not ctx.keyring:
        keyring_file = f'{ctx.data_dir}/{ctx.fsid}/{CEPH_CONF_DIR}/{CEPH_KEYRING}'
        if os.path.exists(keyring_file):
            ctx.keyring = keyring_file
        elif os.path.exists(CEPH_DEFAULT_KEYRING):
            ctx.keyring = CEPH_DEFAULT_KEYRING

    container_args: List[str] = ['-i']
    mounts = get_container_mounts(ctx, ctx.fsid, daemon_type, daemon_id,
                                  no_config=True if ctx.config else False)
    binds = get_container_binds(ctx, ctx.fsid, daemon_type, daemon_id)
    if ctx.config:
        mounts[pathify(ctx.config)] = '/etc/ceph/ceph.conf:z'
    if ctx.keyring:
        mounts[pathify(ctx.keyring)] = '/etc/ceph/ceph.keyring:z'
    # user-requested extra mounts: 'src[:dst[:options]]'
    if ctx.mount:
        for _mount in ctx.mount:
            split_src_dst = _mount.split(':')
            mount = pathify(split_src_dst[0])
            filename = os.path.basename(split_src_dst[0])
            if len(split_src_dst) > 1:
                dst = split_src_dst[1]
                if len(split_src_dst) == 3:
                    dst = '{}:{}'.format(dst, split_src_dst[2])
                mounts[mount] = dst
            else:
                # no destination given: expose under /mnt by basename
                mounts[mount] = '/mnt/{}'.format(filename)
    if ctx.command:
        command = ctx.command
    else:
        # interactive shell: allocate a tty and a recognizable prompt
        command = ['bash']
        container_args += [
            '-t',
            '-e', 'LANG=C',
            '-e', 'PS1=%s' % CUSTOM_PS1,
        ]
    if ctx.fsid:
        # persist a root home dir per cluster so shell history etc. survives
        home = os.path.join(ctx.data_dir, ctx.fsid, 'home')
        if not os.path.exists(home):
            logger.debug('Creating root home at %s' % home)
            makedirs(home, 0, 0, 0o660)
        if os.path.exists('/etc/skel'):
            for f in os.listdir('/etc/skel'):
                if f.startswith('.bash'):
                    shutil.copyfile(os.path.join('/etc/skel', f),
                                    os.path.join(home, f))
        mounts[home] = '/root'

    for i in ctx.volume:
        a, b = i.split(':', 1)
        mounts[a] = b

    c = CephContainer(
        ctx,
        image=ctx.image,
        entrypoint='doesnotmatter',
        args=[],
        container_args=container_args,
        volume_mounts=mounts,
        bind_mounts=binds,
        envs=ctx.env,
        privileged=True)
    command = c.shell_cmd(command)

    return call_timeout(ctx, command, ctx.timeout)
6180
6181 ##################################
6182
6183
@infer_fsid
def command_enter(ctx):
    # type: (CephadmContext) -> int
    """Exec a command (default: an interactive 'sh') inside a running
    daemon's container. Returns the exec'd command's exit code."""
    if not ctx.fsid:
        raise Error('must pass --fsid to specify cluster')
    daemon_type, daemon_id = ctx.name.split('.', 1)

    container_args = ['-i']  # type: List[str]
    if ctx.command:
        command = ctx.command
    else:
        # interactive use: allocate a tty and set a recognizable prompt
        command = ['sh']
        container_args.extend([
            '-t',
            '-e', 'LANG=C',
            '-e', 'PS1=%s' % CUSTOM_PS1,
        ])

    ctr = CephContainer(
        ctx,
        image=ctx.image,
        entrypoint='doesnotmatter',
        container_args=container_args,
        cname='ceph-%s-%s.%s' % (ctx.fsid, daemon_type, daemon_id),
    )
    return call_timeout(ctx, ctr.exec_cmd(command), ctx.timeout)
6209
6210 ##################################
6211
6212
@infer_fsid
@infer_image
@validate_fsid
def command_ceph_volume(ctx):
    # type: (CephadmContext) -> None
    """Run ceph-volume inside a container with the osd mounts plus any
    supplied conf/keyring, printing its output on success."""
    cp = read_config(ctx.config)
    if cp.has_option('global', 'fsid') and \
            cp.get('global', 'fsid') != ctx.fsid:
        raise Error('fsid does not match ceph.conf')

    if ctx.fsid:
        make_log_dir(ctx, ctx.fsid)

    # serialize against other cephadm invocations on this fsid
    lock = FileLock(ctx, ctx.fsid)
    lock.acquire()

    (uid, gid) = (0, 0)  # ceph-volume runs as root
    mounts = get_container_mounts(ctx, ctx.fsid, 'osd', None)

    (config, keyring) = get_config_and_keyring(ctx)

    # keep the NamedTemporaryFile objects alive in locals so the files
    # survive until the container run below finishes
    tmp_config = None
    tmp_keyring = None
    if config:
        tmp_config = write_tmp(config, uid, gid)
        mounts[tmp_config.name] = '/etc/ceph/ceph.conf:z'
    if keyring:
        tmp_keyring = write_tmp(keyring, uid, gid)
        mounts[tmp_keyring.name] = '/var/lib/ceph/bootstrap-osd/ceph.keyring:z'

    ctr = get_ceph_volume_container(
        ctx,
        envs=ctx.env,
        args=ctx.command,
        volume_mounts=mounts,
    )

    out, err, code = call_throws(ctx, ctr.run_cmd(), verbosity=CallVerbosity.QUIET_UNLESS_ERROR)
    if not code:
        print(out)
6257
6258 ##################################
6259
6260
@infer_fsid
def command_unit(ctx):
    # type: (CephadmContext) -> int
    """Pass a systemctl verb (ctx.command) through to the daemon's systemd
    unit; returns systemctl's exit code."""
    if not ctx.fsid:
        raise Error('must pass --fsid to specify cluster')

    unit = get_unit_name_by_daemon_name(ctx, ctx.fsid, ctx.name)

    _, _, rc = call(
        ctx,
        ['systemctl', ctx.command, unit],
        verbosity=CallVerbosity.VERBOSE,
        desc=''
    )
    return rc
6276
6277 ##################################
6278
6279
@infer_fsid
def command_logs(ctx):
    # type: (CephadmContext) -> None
    """Show journalctl logs for a daemon's systemd unit, forwarding any
    extra arguments (e.g. -f, -n) to journalctl."""
    if not ctx.fsid:
        raise Error('must pass --fsid to specify cluster')

    unit_name = get_unit_name_by_daemon_name(ctx, ctx.fsid, ctx.name)

    cmd = [find_program('journalctl'), '-u', unit_name]
    if ctx.command:
        cmd.extend(ctx.command)

    # call this directly, without our wrapper, so that we get an unmolested
    # stdout with logger prefixing.
    logger.debug('Running command: %s' % ' '.join(cmd))
    subprocess.call(cmd, env=os.environ.copy())  # type: ignore
6297
6298 ##################################
6299
6300
def list_networks(ctx):
    # type: (CephadmContext) -> Dict[str,Dict[str, Set[str]]]
    """Return {network: {interface: {addresses}}} for this host's IPv4 and
    IPv6 networks combined."""
    # sadly, 18.04's iproute2 4.15.0-2ubun doesn't support the -j flag,
    # so we'll need to use a regex to parse 'ip' command output.
    #
    # out, _, _ = call_throws(['ip', '-j', 'route', 'ls'])
    # j = json.loads(out)
    # for x in j:
    networks = _list_ipv4_networks(ctx)
    networks.update(_list_ipv6_networks(ctx))
    return networks
6313
6314
def _list_ipv4_networks(ctx: CephadmContext) -> Dict[str, Dict[str, Set[str]]]:
    """Collect IPv4 networks by parsing 'ip route ls' output."""
    ip_cmd: Optional[str] = find_executable('ip')
    if ip_cmd is None:
        raise FileNotFoundError("unable to find 'ip' command")
    out, _, _ = call_throws(ctx, [ip_cmd, 'route', 'ls'], verbosity=CallVerbosity.QUIET_UNLESS_ERROR)
    return _parse_ipv4_route(out)
6321
6322
6323 def _parse_ipv4_route(out: str) -> Dict[str, Dict[str, Set[str]]]:
6324 r = {} # type: Dict[str, Dict[str, Set[str]]]
6325 p = re.compile(r'^(\S+) (?:via \S+)? ?dev (\S+) (.*)scope link (.*)src (\S+)')
6326 for line in out.splitlines():
6327 m = p.findall(line)
6328 if not m:
6329 continue
6330 net = m[0][0]
6331 if '/' not in net: # aggregate /32 mask for single host sub-networks
6332 net += '/32'
6333 iface = m[0][1]
6334 ip = m[0][4]
6335 if net not in r:
6336 r[net] = {}
6337 if iface not in r[net]:
6338 r[net][iface] = set()
6339 r[net][iface].add(ip)
6340 return r
6341
6342
def _list_ipv6_networks(ctx: CephadmContext) -> Dict[str, Dict[str, Set[str]]]:
    """Collect IPv6 networks by parsing 'ip -6 route ls' and 'ip -6 addr ls'."""
    ip_cmd: Optional[str] = find_executable('ip')
    if ip_cmd is None:
        raise FileNotFoundError("unable to find 'ip' command")
    routes, _, _ = call_throws(ctx, [ip_cmd, '-6', 'route', 'ls'], verbosity=CallVerbosity.QUIET_UNLESS_ERROR)
    ips, _, _ = call_throws(ctx, [ip_cmd, '-6', 'addr', 'ls'], verbosity=CallVerbosity.QUIET_UNLESS_ERROR)
    return _parse_ipv6_route(routes, ips)
6350
6351
6352 def _parse_ipv6_route(routes: str, ips: str) -> Dict[str, Dict[str, Set[str]]]:
6353 r = {} # type: Dict[str, Dict[str, Set[str]]]
6354 route_p = re.compile(r'^(\S+) dev (\S+) proto (\S+) metric (\S+) .*pref (\S+)$')
6355 ip_p = re.compile(r'^\s+inet6 (\S+)/(.*)scope (.*)$')
6356 iface_p = re.compile(r'^(\d+): (\S+): (.*)$')
6357 for line in routes.splitlines():
6358 m = route_p.findall(line)
6359 if not m or m[0][0].lower() == 'default':
6360 continue
6361 net = m[0][0]
6362 if '/' not in net: # aggregate /128 mask for single host sub-networks
6363 net += '/128'
6364 iface = m[0][1]
6365 if iface == 'lo': # skip loopback devices
6366 continue
6367 if net not in r:
6368 r[net] = {}
6369 if iface not in r[net]:
6370 r[net][iface] = set()
6371
6372 iface = None
6373 for line in ips.splitlines():
6374 m = ip_p.findall(line)
6375 if not m:
6376 m = iface_p.findall(line)
6377 if m:
6378 # drop @... suffix, if present
6379 iface = m[0][1].split('@')[0]
6380 continue
6381 ip = m[0][0]
6382 # find the network it belongs to
6383 net = [n for n in r.keys()
6384 if ipaddress.ip_address(ip) in ipaddress.ip_network(n)]
6385 if net and iface in r[net[0]]:
6386 assert iface
6387 r[net[0]][iface].add(ip)
6388
6389 return r
6390
6391
def command_list_networks(ctx):
    # type: (CephadmContext) -> None
    """Print this host's networks/interfaces/addresses as indented JSON."""
    def serialize_sets(obj: Any) -> Any:
        # json cannot encode sets; emit them as lists
        return list(obj) if isinstance(obj, set) else obj

    networks = list_networks(ctx)
    print(json.dumps(networks, indent=4, default=serialize_sets))
6400
6401 ##################################
6402
6403
def command_ls(ctx):
    # type: (CephadmContext) -> None
    """Print all daemons on this host (legacy and cephadm-style) as JSON."""
    daemons = list_daemons(ctx, detail=not ctx.no_detail,
                           legacy_dir=ctx.legacy_dir)
    print(json.dumps(daemons, indent=4))
6409
6410
def with_units_to_int(v: str) -> int:
    """Convert a size string such as '123', '1.5G', '512MiB' or '2KB'
    into a byte count (binary multiples: K=1024, M=1024^2, ...)."""
    # strip an optional byte suffix first ('iB' before the bare 'B')
    if v.endswith('iB'):
        v = v[:-2]
    elif v.endswith('B'):
        v = v[:-1]
    # case-insensitive unit prefix; absent prefix means plain bytes
    units = {'K': 1024, 'M': 1024 ** 2, 'G': 1024 ** 3, 'T': 1024 ** 4}
    mult = units.get(v[-1].upper(), 1)
    if mult != 1:
        v = v[:-1]
    return int(float(v) * mult)
6430
6431
def list_daemons(ctx, detail=True, legacy_dir=None):
    # type: (CephadmContext, bool, Optional[str]) -> List[Dict[str, str]]
    """Enumerate all ceph daemons deployed on this host.

    Scans ctx.data_dir for both legacy daemons (type dirs 'mon'/'osd'/
    'mds'/'mgr' containing '<cluster>-<id>' entries) and cephadm-style
    daemons (fsid-named dirs containing '<type>.<id>' entries), returning
    one dict per daemon with at least style/name/fsid/systemd_unit.
    When `detail` is True each record is augmented with systemd unit
    state, container id/image/digests, per-container memory and cpu
    usage (via one `<engine> stats` call up front), start/created/
    deployed/configured timestamps, and a best-effort software version.
    `legacy_dir` re-roots the scan (used when adopting from a chroot).
    """
    host_version: Optional[str] = None
    ls = []
    container_path = ctx.container_engine.path

    data_dir = ctx.data_dir
    if legacy_dir is not None:
        data_dir = os.path.abspath(legacy_dir + data_dir)

    # keep track of ceph versions we see
    seen_versions = {}  # type: Dict[str, Optional[str]]

    # keep track of image digests
    seen_digests = {}  # type: Dict[str, List[str]]

    # keep track of memory and cpu usage we've seen
    seen_memusage = {}  # type: Dict[str, int]
    seen_cpuperc = {}  # type: Dict[str, str]
    # one stats snapshot for all containers; avoids a stats call per daemon
    out, err, code = call(
        ctx,
        [container_path, 'stats', '--format', '{{.ID}},{{.MemUsage}}', '--no-stream'],
        verbosity=CallVerbosity.QUIET
    )
    seen_memusage_cid_len, seen_memusage = _parse_mem_usage(code, out)

    out, err, code = call(
        ctx,
        [container_path, 'stats', '--format', '{{.ID}},{{.CPUPerc}}', '--no-stream'],
        verbosity=CallVerbosity.QUIET
    )
    seen_cpuperc_cid_len, seen_cpuperc = _parse_cpu_perc(code, out)

    # /var/lib/ceph
    if os.path.exists(data_dir):
        for i in os.listdir(data_dir):
            if i in ['mon', 'osd', 'mds', 'mgr']:
                # legacy layout: /var/lib/ceph/<type>/<cluster>-<id>
                daemon_type = i
                for j in os.listdir(os.path.join(data_dir, i)):
                    if '-' not in j:
                        continue
                    (cluster, daemon_id) = j.split('-', 1)
                    fsid = get_legacy_daemon_fsid(ctx,
                                                  cluster, daemon_type, daemon_id,
                                                  legacy_dir=legacy_dir)
                    legacy_unit_name = 'ceph-%s@%s' % (daemon_type, daemon_id)
                    val: Dict[str, Any] = {
                        'style': 'legacy',
                        'name': '%s.%s' % (daemon_type, daemon_id),
                        'fsid': fsid if fsid is not None else 'unknown',
                        'systemd_unit': legacy_unit_name,
                    }
                    if detail:
                        (val['enabled'], val['state'], _) = check_unit(ctx, legacy_unit_name)
                        # legacy daemons run the host's ceph; query it once
                        if not host_version:
                            try:
                                out, err, code = call(ctx,
                                                      ['ceph', '-v'],
                                                      verbosity=CallVerbosity.QUIET)
                                if not code and out.startswith('ceph version '):
                                    host_version = out.split(' ')[2]
                            except Exception:
                                pass
                        val['host_version'] = host_version
                    ls.append(val)
            elif is_fsid(i):
                # cephadm layout: /var/lib/ceph/<fsid>/<type>.<id>
                fsid = str(i)  # convince mypy that fsid is a str here
                for j in os.listdir(os.path.join(data_dir, i)):
                    if '.' in j and os.path.isdir(os.path.join(data_dir, fsid, j)):
                        name = j
                        (daemon_type, daemon_id) = j.split('.', 1)
                        unit_name = get_unit_name(fsid,
                                                  daemon_type,
                                                  daemon_id)
                    else:
                        continue
                    val = {
                        'style': 'cephadm:v1',
                        'name': name,
                        'fsid': fsid,
                        'systemd_unit': unit_name,
                    }
                    if detail:
                        # get container id
                        (val['enabled'], val['state'], _) = check_unit(ctx, unit_name)
                        container_id = None
                        image_name = None
                        image_id = None
                        image_digests = None
                        version = None
                        start_stamp = None

                        out, err, code = get_container_stats(ctx, container_path, fsid, daemon_type, daemon_id)
                        if not code:
                            # inspect output is a fixed comma-separated tuple
                            (container_id, image_name, image_id, start,
                             version) = out.strip().split(',')
                            image_id = normalize_container_id(image_id)
                            daemon_type = name.split('.', 1)[0]
                            start_stamp = try_convert_datetime(start)

                            # collect digests for this image id
                            image_digests = seen_digests.get(image_id)
                            if not image_digests:
                                out, err, code = call(
                                    ctx,
                                    [
                                        container_path, 'image', 'inspect', image_id,
                                        '--format', '{{.RepoDigests}}',
                                    ],
                                    verbosity=CallVerbosity.QUIET)
                                if not code:
                                    # output is '[digest digest ...]'; strip brackets
                                    image_digests = list(set(map(
                                        normalize_image_digest,
                                        out.strip()[1:-1].split(' '))))
                                    seen_digests[image_id] = image_digests

                            # identify software version inside the container (if we can)
                            if not version or '.' not in version:
                                version = seen_versions.get(image_id, None)
                            if daemon_type == NFSGanesha.daemon_type:
                                version = NFSGanesha.get_version(ctx, container_id)
                            if daemon_type == CephIscsi.daemon_type:
                                version = CephIscsi.get_version(ctx, container_id)
                            elif not version:
                                if daemon_type in Ceph.daemons:
                                    out, err, code = call(ctx,
                                                          [container_path, 'exec', container_id,
                                                           'ceph', '-v'],
                                                          verbosity=CallVerbosity.QUIET)
                                    if not code and \
                                            out.startswith('ceph version '):
                                        version = out.split(' ')[2]
                                        seen_versions[image_id] = version
                                elif daemon_type == 'grafana':
                                    out, err, code = call(ctx,
                                                          [container_path, 'exec', container_id,
                                                           'grafana-server', '-v'],
                                                          verbosity=CallVerbosity.QUIET)
                                    if not code and \
                                            out.startswith('Version '):
                                        version = out.split(' ')[1]
                                        seen_versions[image_id] = version
                                elif daemon_type in ['prometheus',
                                                     'alertmanager',
                                                     'node-exporter',
                                                     'loki',
                                                     'promtail']:
                                    version = Monitoring.get_version(ctx, container_id, daemon_type)
                                    seen_versions[image_id] = version
                                elif daemon_type == 'haproxy':
                                    out, err, code = call(ctx,
                                                          [container_path, 'exec', container_id,
                                                           'haproxy', '-v'],
                                                          verbosity=CallVerbosity.QUIET)
                                    if not code and \
                                            out.startswith('HA-Proxy version '):
                                        version = out.split(' ')[2]
                                        seen_versions[image_id] = version
                                elif daemon_type == 'keepalived':
                                    out, err, code = call(ctx,
                                                          [container_path, 'exec', container_id,
                                                           'keepalived', '--version'],
                                                          verbosity=CallVerbosity.QUIET)
                                    # keepalived prints its version on stderr
                                    if not code and \
                                            err.startswith('Keepalived '):
                                        version = err.split(' ')[1]
                                        if version[0] == 'v':
                                            version = version[1:]
                                        seen_versions[image_id] = version
                                elif daemon_type == CustomContainer.daemon_type:
                                    # Because a custom container can contain
                                    # everything, we do not know which command
                                    # to execute to get the version.
                                    pass
                                elif daemon_type == SNMPGateway.daemon_type:
                                    version = SNMPGateway.get_version(ctx, fsid, daemon_id)
                                    seen_versions[image_id] = version
                                else:
                                    logger.warning('version for unknown daemon type %s' % daemon_type)
                        else:
                            # container not running/inspectable; fall back to
                            # the image name recorded at deploy time
                            vfile = os.path.join(data_dir, fsid, j, 'unit.image')  # type: ignore
                            try:
                                with open(vfile, 'r') as f:
                                    image_name = f.read().strip() or None
                            except IOError:
                                pass

                        # unit.meta?
                        mfile = os.path.join(data_dir, fsid, j, 'unit.meta')  # type: ignore
                        try:
                            with open(mfile, 'r') as f:
                                meta = json.loads(f.read())
                            val.update(meta)
                        except IOError:
                            pass

                        val['container_id'] = container_id
                        val['container_image_name'] = image_name
                        val['container_image_id'] = image_id
                        val['container_image_digests'] = image_digests
                        if container_id:
                            # stats reports truncated ids; truncate ours to match
                            val['memory_usage'] = seen_memusage.get(container_id[0:seen_memusage_cid_len])
                            val['cpu_percentage'] = seen_cpuperc.get(container_id[0:seen_cpuperc_cid_len])
                        val['version'] = version
                        val['started'] = start_stamp
                        val['created'] = get_file_timestamp(
                            os.path.join(data_dir, fsid, j, 'unit.created')
                        )
                        val['deployed'] = get_file_timestamp(
                            os.path.join(data_dir, fsid, j, 'unit.image'))
                        val['configured'] = get_file_timestamp(
                            os.path.join(data_dir, fsid, j, 'unit.configured'))
                    ls.append(val)

    return ls
6647
6648
def _parse_mem_usage(code: int, out: str) -> Tuple[int, Dict[str, int]]:
    """Parse `<engine> stats --format {{.ID}},{{.MemUsage}}` output.

    Each line looks like '<cid>,<used> / <limit>'. Returns the length of
    the container ids seen (so callers can truncate full ids to match)
    and a map of container id -> bytes used. Returns (0, {}) when the
    stats call failed (code != 0).
    """
    seen_memusage = {}  # type: Dict[str, int]
    seen_memusage_cid_len = 0
    if not code:
        for line in out.splitlines():
            try:
                # FIX: the splits now live inside the try; previously a
                # malformed line (missing ',' or ' / ') raised an uncaught
                # ValueError instead of being logged and skipped
                (cid, usage) = line.split(',')
                (used, limit) = usage.split(' / ')
                seen_memusage[cid] = with_units_to_int(used)
                if not seen_memusage_cid_len:
                    seen_memusage_cid_len = len(cid)
            except ValueError:
                logger.info('unable to parse memory usage line\n>{}'.format(line))
    return seen_memusage_cid_len, seen_memusage
6665
6666
def _parse_cpu_perc(code: int, out: str) -> Tuple[int, Dict[str, str]]:
    """Parse `<engine> stats --format {{.ID}},{{.CPUPerc}}` output.

    Each line looks like '<cid>,<perc>' (e.g. 'abc123,2.31%'). Returns
    the length of the container ids seen and a map of container id ->
    cpu percentage string. Returns (0, {}) when the stats call failed
    (code != 0).
    """
    seen_cpuperc = {}  # type: Dict[str, str]
    seen_cpuperc_cid_len = 0
    if not code:
        for line in out.splitlines():
            try:
                # FIX: the split now lives inside the try; previously a
                # line without a comma raised an uncaught ValueError and
                # the except below was unreachable
                (cid, cpuperc) = line.split(',')
                seen_cpuperc[cid] = cpuperc
                if not seen_cpuperc_cid_len:
                    seen_cpuperc_cid_len = len(cid)
            except ValueError:
                logger.info('unable to parse cpu percentage line\n>{}'.format(line))
    return seen_cpuperc_cid_len, seen_cpuperc
6681
6682
def get_daemon_description(ctx, fsid, name, detail=False, legacy_dir=None):
    # type: (CephadmContext, str, str, bool, Optional[str]) -> Dict[str, str]
    """Return the `ls`-style record for one daemon of one cluster.

    Scans the full daemon listing and returns the first entry whose fsid
    and name both match; raises Error when no such daemon exists.
    """
    candidates = (
        entry for entry in list_daemons(ctx, detail=detail, legacy_dir=legacy_dir)
        if entry['fsid'] == fsid and entry['name'] == name
    )
    for entry in candidates:
        return entry
    raise Error('Daemon not found: {}. See `cephadm ls`'.format(name))
6693
6694
def get_container_stats(ctx: CephadmContext, container_path: str, fsid: str, daemon_type: str, daemon_id: str) -> Tuple[str, str, int]:
    """Inspect the daemon's container, trying its current then legacy name.

    Returns the (out, err, returncode) triple of the last inspect call;
    on success `out` is a comma-separated id/image/image-id/created/
    version string.
    """
    c = CephContainer.for_daemon(ctx, fsid, daemon_type, daemon_id, 'bash')
    fmt = '{{.Id}},{{.Config.Image}},{{.Image}},{{.Created}},{{index .Config.Labels "io.ceph.version"}}'
    out, err, code = '', '', -1
    # the container may exist under either the new or the old naming scheme
    for cname in (c.cname, c.old_cname):
        out, err, code = call(
            ctx,
            [container_path, 'inspect', '--format', fmt, cname],
            verbosity=CallVerbosity.QUIET)
        if not code:
            break
    return out, err, code
6708
6709 ##################################
6710
6711
@default_image
def command_adopt(ctx):
    # type: (CephadmContext) -> None
    """Adopt a single legacy-deployed daemon into cephadm management."""

    if not ctx.skip_pull:
        try:
            _pull_image(ctx, ctx.image)
        except UnauthorizedRegistryError:
            err_str = 'Failed to pull container image. Host may not be logged into container registry. Try `cephadm registry-login --registry-url <url> --registry-username <username> --registry-password <password>` or supply login info via a json file with `cephadm registry-login --registry-json <file>`'
            logger.debug(f'Pulling image for `command_adopt` failed: {err_str}')
            raise Error(err_str)

    daemon_type, daemon_id = ctx.name.split('.', 1)

    # only legacy-style daemons can be adopted
    if ctx.style != 'legacy':
        raise Error('adoption of style %s not implemented' % ctx.style)

    # determine the cluster fsid and take the per-cluster lock
    fsid = get_legacy_daemon_fsid(ctx,
                                  ctx.cluster,
                                  daemon_type,
                                  daemon_id,
                                  legacy_dir=ctx.legacy_dir)
    if not fsid:
        raise Error('could not detect legacy fsid; set fsid in ceph.conf')
    lock = FileLock(ctx, fsid)
    lock.acquire()

    # dispatch to the matching adoption routine
    if daemon_type in Ceph.daemons:
        command_adopt_ceph(ctx, daemon_type, daemon_id, fsid)
        return
    if daemon_type == 'node-exporter':
        raise Error('adoption of node-exporter not implemented')
    monitoring_adopters = {
        'prometheus': command_adopt_prometheus,
        'grafana': command_adopt_grafana,
        'alertmanager': command_adopt_alertmanager,
    }
    adopter = monitoring_adopters.get(daemon_type)
    if adopter is None:
        raise Error('daemon type %s not recognized' % daemon_type)
    adopter(ctx, daemon_id, fsid)
6754
6755
class AdoptOsd(object):
    """Probes a legacy (pre-cephadm) OSD to discover its per-OSD fsid and
    objectstore type, trying in turn: a live mounted data dir, ceph-volume
    LVM metadata, and ceph-volume 'simple' JSON files. Also supports
    rewriting the LVM cluster-name tag for non-'ceph'-named clusters.
    """

    def __init__(self, ctx, osd_data_dir, osd_id):
        # type: (CephadmContext, str, str) -> None
        self.ctx = ctx
        # legacy OSD data directory, e.g. /var/lib/ceph/osd/<cluster>-<id>
        self.osd_data_dir = osd_data_dir
        self.osd_id = osd_id

    def check_online_osd(self):
        # type: () -> Tuple[Optional[str], Optional[str]]
        """Read fsid and objectstore type from a mounted OSD data dir.

        Returns (osd_fsid, osd_type); either may be None when the
        corresponding file is missing or unreadable.
        """
        osd_fsid, osd_type = None, None

        path = os.path.join(self.osd_data_dir, 'fsid')
        try:
            with open(path, 'r') as f:
                osd_fsid = f.read().strip()
            logger.info('Found online OSD at %s' % path)
        except IOError:
            logger.info('Unable to read OSD fsid from %s' % path)
        if os.path.exists(os.path.join(self.osd_data_dir, 'type')):
            with open(os.path.join(self.osd_data_dir, 'type')) as f:
                osd_type = f.read().strip()
        else:
            logger.info('"type" file missing for OSD data dir')

        return osd_fsid, osd_type

    def check_offline_lvm_osd(self):
        # type: () -> Tuple[Optional[str], Optional[str]]
        """Look the OSD up in `ceph-volume lvm list --format=json`.

        Derives osd_fsid from the LV tags and infers bluestore/filestore
        from the per-device 'ceph.type' tag. Returns (None, None) when
        the OSD is not found or the output cannot be parsed.
        """
        osd_fsid, osd_type = None, None

        c = get_ceph_volume_container(
            self.ctx,
            args=['lvm', 'list', '--format=json'],
        )
        out, err, code = call_throws(self.ctx, c.run_cmd())
        if not code:
            try:
                js = json.loads(out)
                if self.osd_id in js:
                    logger.info('Found offline LVM OSD {}'.format(self.osd_id))
                    osd_fsid = js[self.osd_id][0]['tags']['ceph.osd_fsid']
                    for device in js[self.osd_id]:
                        # 'block' implies bluestore, 'data' implies filestore
                        if device['tags']['ceph.type'] == 'block':
                            osd_type = 'bluestore'
                            break
                        if device['tags']['ceph.type'] == 'data':
                            osd_type = 'filestore'
                            break
            except ValueError as e:
                logger.info('Invalid JSON in ceph-volume lvm list: {}'.format(e))

        return osd_fsid, osd_type

    def check_offline_simple_osd(self):
        # type: () -> Tuple[Optional[str], Optional[str]]
        """Look the OSD up in a ceph-volume 'simple' scan JSON file.

        Expects exactly one /etc/ceph/osd/<id>-<uuid>.json match. For
        non-filestore OSDs the data device is mounted onto the data dir
        so the subsequent adopt can move files out of it.
        """
        osd_fsid, osd_type = None, None

        osd_file = glob('/etc/ceph/osd/{}-[a-f0-9-]*.json'.format(self.osd_id))
        if len(osd_file) == 1:
            with open(osd_file[0], 'r') as f:
                try:
                    js = json.loads(f.read())
                    logger.info('Found offline simple OSD {}'.format(self.osd_id))
                    osd_fsid = js['fsid']
                    osd_type = js['type']
                    if osd_type != 'filestore':
                        # need this to be mounted for the adopt to work, as it
                        # needs to move files from this directory
                        call_throws(self.ctx, ['mount', js['data']['path'], self.osd_data_dir])
                except ValueError as e:
                    logger.info('Invalid JSON in {}: {}'.format(osd_file, e))

        return osd_fsid, osd_type

    def change_cluster_name(self) -> None:
        """Rewrite the LVM 'ceph.cluster_name' tag from the legacy cluster
        name to 'ceph' on this OSD's block device volume group.

        Best-effort: failures are logged, not raised to the caller.
        """
        logger.info('Attempting to convert osd cluster name to ceph . . .')
        c = get_ceph_volume_container(
            self.ctx,
            args=['lvm', 'list', '{}'.format(self.osd_id), '--format=json'],
        )
        out, err, code = call_throws(self.ctx, c.run_cmd())
        if code:
            raise Exception(f'Failed to get list of LVs: {err}\nceph-volume failed with rc {code}')
        try:
            js = json.loads(out)
            if not js:
                raise RuntimeError(f'Failed to find osd.{self.osd_id}')
            device: Optional[Dict[Any, Any]] = None
            for d in js[self.osd_id]:
                if d['type'] == 'block':
                    device = d
                    break
            if not device:
                raise RuntimeError(f'Failed to find block device for osd.{self.osd_id}')
            vg = device['vg_name']
            out, err, code = call_throws(self.ctx, ['lvchange', '--deltag', f'ceph.cluster_name={self.ctx.cluster}', vg])
            if code:
                raise RuntimeError(f"Can't delete tag ceph.cluster_name={self.ctx.cluster} on osd.{self.osd_id}.\nlvchange failed with rc {code}")
            out, err, code = call_throws(self.ctx, ['lvchange', '--addtag', 'ceph.cluster_name=ceph', vg])
            if code:
                raise RuntimeError(f"Can't add tag ceph.cluster_name=ceph on osd.{self.osd_id}.\nlvchange failed with rc {code}")
            logger.info('Successfully converted osd cluster name')
        # NOTE(review): RuntimeError already subclasses Exception, so the
        # tuple is redundant (harmless); everything above is swallowed here
        except (Exception, RuntimeError) as e:
            logger.info(f'Failed to convert osd cluster name: {e}')
6861
6862
def command_adopt_ceph(ctx, daemon_type, daemon_id, fsid):
    # type: (CephadmContext, str, str, str) -> None
    """Adopt one legacy ceph daemon (mon/mgr/mds/osd/...) into cephadm.

    Stops and disables the legacy systemd unit, moves its data and logs
    into the cephadm layout under /var/lib/ceph/<fsid>/, fixes ownership,
    copies the cluster config, and deploys a containerized systemd unit
    (started again iff the legacy unit was running or --force-start).
    """

    (uid, gid) = extract_uid_gid(ctx)

    data_dir_src = ('/var/lib/ceph/%s/%s-%s' %
                    (daemon_type, ctx.cluster, daemon_id))
    data_dir_src = os.path.abspath(ctx.legacy_dir + data_dir_src)

    if not os.path.exists(data_dir_src):
        raise Error("{}.{} data directory '{}' does not exist. "
                    'Incorrect ID specified, or daemon already adopted?'.format(
                        daemon_type, daemon_id, data_dir_src))

    osd_fsid = None
    if daemon_type == 'osd':
        # an OSD additionally needs its per-OSD fsid and objectstore type;
        # try online, then offline LVM, then offline 'simple' detection
        adopt_osd = AdoptOsd(ctx, data_dir_src, daemon_id)
        osd_fsid, osd_type = adopt_osd.check_online_osd()
        if not osd_fsid:
            osd_fsid, osd_type = adopt_osd.check_offline_lvm_osd()
        if not osd_fsid:
            osd_fsid, osd_type = adopt_osd.check_offline_simple_osd()
        if not osd_fsid:
            raise Error('Unable to find OSD {}'.format(daemon_id))
        elif ctx.cluster != 'ceph':
            # rewrite the LVM tags so the device belongs to cluster 'ceph'
            adopt_osd.change_cluster_name()
        logger.info('objectstore_type is %s' % osd_type)
        assert osd_type
        if osd_type == 'filestore':
            raise Error('FileStore is not supported by cephadm')

    # NOTE: implicit assumption here that the units correspond to the
    # cluster we are adopting based on the /etc/{defaults,sysconfig}/ceph
    # CLUSTER field.
    unit_name = 'ceph-%s@%s' % (daemon_type, daemon_id)
    (enabled, state, _) = check_unit(ctx, unit_name)
    if state == 'running':
        logger.info('Stopping old systemd unit %s...' % unit_name)
        call_throws(ctx, ['systemctl', 'stop', unit_name])
    if enabled:
        logger.info('Disabling old systemd unit %s...' % unit_name)
        call_throws(ctx, ['systemctl', 'disable', unit_name])

    # data
    logger.info('Moving data...')
    data_dir_dst = make_data_dir(ctx, fsid, daemon_type, daemon_id,
                                 uid=uid, gid=gid)
    move_files(ctx, glob(os.path.join(data_dir_src, '*')),
               data_dir_dst,
               uid=uid, gid=gid)
    logger.debug('Remove dir `%s`' % (data_dir_src))
    if os.path.ismount(data_dir_src):
        call_throws(ctx, ['umount', data_dir_src])
    os.rmdir(data_dir_src)

    logger.info('Chowning content...')
    call_throws(ctx, ['chown', '-c', '-R', '%d.%d' % (uid, gid), data_dir_dst])

    if daemon_type == 'mon':
        # rename *.ldb -> *.sst, in case they are coming from ubuntu
        store = os.path.join(data_dir_dst, 'store.db')
        num_renamed = 0
        if os.path.exists(store):
            for oldf in os.listdir(store):
                if oldf.endswith('.ldb'):
                    newf = oldf.replace('.ldb', '.sst')
                    oldp = os.path.join(store, oldf)
                    newp = os.path.join(store, newf)
                    logger.debug('Renaming %s -> %s' % (oldp, newp))
                    os.rename(oldp, newp)
                    # FIX: the counter was never incremented, so the
                    # summary log below could never fire
                    num_renamed += 1
            if num_renamed:
                logger.info('Renamed %d leveldb *.ldb files to *.sst',
                            num_renamed)
    if daemon_type == 'osd':
        for n in ['block', 'block.db', 'block.wal']:
            p = os.path.join(data_dir_dst, n)
            if os.path.exists(p):
                logger.info('Chowning %s...' % p)
                os.chown(p, uid, gid)
        # disable the ceph-volume 'simple' mode files on the host
        simple_fn = os.path.join('/etc/ceph/osd',
                                 '%s-%s.json' % (daemon_id, osd_fsid))
        if os.path.exists(simple_fn):
            new_fn = simple_fn + '.adopted-by-cephadm'
            logger.info('Renaming %s -> %s', simple_fn, new_fn)
            os.rename(simple_fn, new_fn)
            logger.info('Disabling host unit ceph-volume@ simple unit...')
            call(ctx, ['systemctl', 'disable',
                       'ceph-volume@simple-%s-%s.service' % (daemon_id, osd_fsid)])
        else:
            # assume this is an 'lvm' c-v for now, but don't error
            # out if it's not.
            logger.info('Disabling host unit ceph-volume@ lvm unit...')
            call(ctx, ['systemctl', 'disable',
                       'ceph-volume@lvm-%s-%s.service' % (daemon_id, osd_fsid)])

    # config
    config_src = '/etc/ceph/%s.conf' % (ctx.cluster)
    config_src = os.path.abspath(ctx.legacy_dir + config_src)
    config_dst = os.path.join(data_dir_dst, 'config')
    copy_files(ctx, [config_src], config_dst, uid=uid, gid=gid)

    # logs
    logger.info('Moving logs...')
    log_dir_src = ('/var/log/ceph/%s-%s.%s.log*' %
                   (ctx.cluster, daemon_type, daemon_id))
    log_dir_src = os.path.abspath(ctx.legacy_dir + log_dir_src)
    log_dir_dst = make_log_dir(ctx, fsid, uid=uid, gid=gid)
    move_files(ctx, glob(log_dir_src),
               log_dir_dst,
               uid=uid, gid=gid)

    logger.info('Creating new units...')
    make_var_run(ctx, fsid, uid, gid)
    c = get_container(ctx, fsid, daemon_type, daemon_id)
    deploy_daemon_units(ctx, fsid, uid, gid, daemon_type, daemon_id, c,
                        enable=True,  # unconditionally enable the new unit
                        start=(state == 'running' or ctx.force_start),
                        osd_fsid=osd_fsid)
    update_firewalld(ctx, daemon_type)
6983
6984
def command_adopt_prometheus(ctx, daemon_id, fsid):
    # type: (CephadmContext, str, str) -> None
    """Adopt a legacy prometheus: stop it, migrate its scrape config and
    metrics into the cephadm data dir, then deploy a containerized unit."""
    daemon_type = 'prometheus'
    uid, gid = extract_uid_gid_monitoring(ctx, daemon_type)

    _stop_and_disable(ctx, 'prometheus')

    data_dir_dst = make_data_dir(ctx, fsid, daemon_type, daemon_id,
                                 uid=uid, gid=gid)

    # migrate the scrape configuration
    config_src = os.path.abspath(ctx.legacy_dir + '/etc/prometheus/prometheus.yml')
    config_dst = os.path.join(data_dir_dst, 'etc/prometheus')
    makedirs(config_dst, uid, gid, 0o755)
    copy_files(ctx, [config_src], config_dst, uid=uid, gid=gid)

    # migrate the time-series data
    data_src = os.path.abspath(ctx.legacy_dir + '/var/lib/prometheus/metrics/')
    data_dst = os.path.join(data_dir_dst, 'data')
    copy_tree(ctx, [data_src], data_dst, uid=uid, gid=gid)

    make_var_run(ctx, fsid, uid, gid)
    c = get_container(ctx, fsid, daemon_type, daemon_id)
    deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid)
    update_firewalld(ctx, daemon_type)
7012
7013
def command_adopt_grafana(ctx, daemon_id, fsid):
    # type: (CephadmContext, str, str) -> None
    """Adopt a legacy grafana-server: stop it, migrate its config,
    provisioning tree, TLS material and data dir, then deploy a
    containerized unit."""

    daemon_type = 'grafana'
    uid, gid = extract_uid_gid_monitoring(ctx, daemon_type)

    _stop_and_disable(ctx, 'grafana-server')

    data_dir_dst = make_data_dir(ctx, fsid, daemon_type, daemon_id,
                                 uid=uid, gid=gid)

    # main config file
    config_src = os.path.abspath(ctx.legacy_dir + '/etc/grafana/grafana.ini')
    config_dst = os.path.join(data_dir_dst, 'etc/grafana')
    makedirs(config_dst, uid, gid, 0o755)
    copy_files(ctx, [config_src], config_dst, uid=uid, gid=gid)

    # provisioning tree (datasources, dashboards)
    prov_src = os.path.abspath(ctx.legacy_dir + '/etc/grafana/provisioning/')
    prov_dst = os.path.join(data_dir_dst, 'etc/grafana')
    copy_tree(ctx, [prov_src], prov_dst, uid=uid, gid=gid)

    # TLS cert/key pair, if the legacy install had one
    cert = '/etc/grafana/grafana.crt'
    key = '/etc/grafana/grafana.key'
    if os.path.exists(cert) and os.path.exists(key):
        makedirs(os.path.join(data_dir_dst, 'etc/grafana/certs'), uid, gid, 0o755)

        cert_src = os.path.abspath(ctx.legacy_dir + cert)
        cert_dst = os.path.join(data_dir_dst, 'etc/grafana/certs/cert_file')
        copy_files(ctx, [cert_src], cert_dst, uid=uid, gid=gid)

        key_src = os.path.abspath(ctx.legacy_dir + key)
        key_dst = os.path.join(data_dir_dst, 'etc/grafana/certs/cert_key')
        copy_files(ctx, [key_src], key_dst, uid=uid, gid=gid)

        # point grafana.ini at the relocated cert/key
        _adjust_grafana_ini(os.path.join(config_dst, 'grafana.ini'))
    else:
        logger.debug('Skipping ssl, missing cert {} or key {}'.format(cert, key))

    # data dir - possible custom dashboards/plugins
    data_src = os.path.abspath(ctx.legacy_dir + '/var/lib/grafana/')
    data_dst = os.path.join(data_dir_dst, 'data')
    copy_tree(ctx, [data_src], data_dst, uid=uid, gid=gid)

    make_var_run(ctx, fsid, uid, gid)
    c = get_container(ctx, fsid, daemon_type, daemon_id)
    deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid)
    update_firewalld(ctx, daemon_type)
7066
7067
def command_adopt_alertmanager(ctx, daemon_id, fsid):
    # type: (CephadmContext, str, str) -> None
    """Adopt a legacy prometheus-alertmanager: stop it, migrate its config
    and runtime data, then deploy a containerized unit."""

    daemon_type = 'alertmanager'
    uid, gid = extract_uid_gid_monitoring(ctx, daemon_type)

    _stop_and_disable(ctx, 'prometheus-alertmanager')

    data_dir_dst = make_data_dir(ctx, fsid, daemon_type, daemon_id,
                                 uid=uid, gid=gid)

    # migrate the alertmanager configuration
    config_src = os.path.abspath(ctx.legacy_dir + '/etc/prometheus/alertmanager.yml')
    config_dst = os.path.join(data_dir_dst, 'etc/alertmanager')
    makedirs(config_dst, uid, gid, 0o755)
    copy_files(ctx, [config_src], config_dst, uid=uid, gid=gid)

    # migrate runtime data (silences, notification log)
    data_src = os.path.abspath(ctx.legacy_dir + '/var/lib/prometheus/alertmanager/')
    data_dst = os.path.join(data_dir_dst, 'etc/alertmanager/data')
    copy_tree(ctx, [data_src], data_dst, uid=uid, gid=gid)

    make_var_run(ctx, fsid, uid, gid)
    c = get_container(ctx, fsid, daemon_type, daemon_id)
    deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid)
    update_firewalld(ctx, daemon_type)
7096
7097
def _adjust_grafana_ini(filename):
    # type: (str) -> None
    """Rewrite cert_file/cert_key in the [server] section of a grafana.ini
    so they point at the paths used inside the cephadm container.

    The file is rewritten line-by-line (ConfigParser would drop comments)
    into '<filename>.new', which then replaces the original via rename.
    """
    try:
        with open(filename, 'r') as src:
            content = src.readlines()
        tmp_name = '{}.new'.format(filename)
        with open(tmp_name, 'w') as dst:
            in_server = False
            for line in content:
                # any section header ends [server]; re-enter only on [server]
                if line.startswith('['):
                    in_server = line.startswith('[server]')
                if in_server:
                    line = re.sub(r'^cert_file.*',
                                  'cert_file = /etc/grafana/certs/cert_file', line)
                    line = re.sub(r'^cert_key.*',
                                  'cert_key = /etc/grafana/certs/cert_key', line)
                dst.write(line)
        os.rename(tmp_name, filename)
    except OSError as err:
        raise Error('Cannot update {}: {}'.format(filename, err))
7122
7123
def _stop_and_disable(ctx, unit_name):
    # type: (CephadmContext, str) -> None
    """Stop the unit if it is running, and disable it if it is enabled."""

    enabled, state, _ = check_unit(ctx, unit_name)
    actions = []
    if state == 'running':
        actions.append(('Stopping', 'stop'))
    if enabled:
        actions.append(('Disabling', 'disable'))
    for verb, subcmd in actions:
        logger.info('%s old systemd unit %s...' % (verb, unit_name))
        call_throws(ctx, ['systemctl', subcmd, unit_name])
7134
7135 ##################################
7136
7137
def command_rm_daemon(ctx):
    # type: (CephadmContext) -> None
    """Remove one daemon instance (systemd unit + data dir) from this host.

    mon/osd removal requires --force. For mon/osd/prometheus the data dir
    is renamed into <data_dir>/<fsid>/removed/ rather than deleted, unless
    --force-delete-data is given. Any tcp ports the daemon held are closed
    via firewalld (best-effort).
    """
    lock = FileLock(ctx, ctx.fsid)
    lock.acquire()

    (daemon_type, daemon_id) = ctx.name.split('.', 1)
    unit_name = get_unit_name_by_daemon_name(ctx, ctx.fsid, ctx.name)

    if daemon_type in ['mon', 'osd'] and not ctx.force:
        raise Error('must pass --force to proceed: '
                    'this command may destroy precious data!')

    # stop + disable the unit; reset-failed clears any restart/failure state
    call(ctx, ['systemctl', 'stop', unit_name],
         verbosity=CallVerbosity.DEBUG)
    call(ctx, ['systemctl', 'reset-failed', unit_name],
         verbosity=CallVerbosity.DEBUG)
    call(ctx, ['systemctl', 'disable', unit_name],
         verbosity=CallVerbosity.DEBUG)
    data_dir = get_data_dir(ctx.fsid, ctx.data_dir, daemon_type, daemon_id)
    if daemon_type in ['mon', 'osd', 'prometheus'] and \
            not ctx.force_delete_data:
        # rename it out of the way -- do not delete
        backup_dir = os.path.join(ctx.data_dir, ctx.fsid, 'removed')
        if not os.path.exists(backup_dir):
            makedirs(backup_dir, 0, 0, DATA_DIR_MODE)
        # timestamped name so repeated removals of the same daemon never collide
        dirname = '%s.%s_%s' % (daemon_type, daemon_id,
                                datetime.datetime.utcnow().strftime(DATEFMT))
        os.rename(data_dir,
                  os.path.join(backup_dir, dirname))
    else:
        call_throws(ctx, ['rm', '-rf', data_dir])

    if 'tcp_ports' in ctx and ctx.tcp_ports is not None:
        ports: List[int] = [int(p) for p in ctx.tcp_ports.split()]
        try:
            fw = Firewalld(ctx)
            fw.close_ports(ports)
            fw.apply_rules()
        except RuntimeError as e:
            # in case we cannot close the ports we will remove
            # the daemon but keep them open.
            logger.warning(f' Error when trying to close ports: {e}')
7180
7181
7182 ##################################
7183
7184
def _zap(ctx: CephadmContext, what: str) -> None:
    """Destroy all ceph LVM state on the given device via `ceph-volume lvm zap`."""
    mounts = get_container_mounts(ctx, ctx.fsid, 'clusterless-ceph-volume', None)
    zap_container = get_ceph_volume_container(
        ctx,
        args=['lvm', 'zap', '--destroy', what],
        volume_mounts=mounts,
        envs=ctx.env,
    )
    logger.info(f'Zapping {what}...')
    call_throws(ctx, zap_container.run_cmd())
7193
7194
@infer_image
def _zap_osds(ctx: CephadmContext) -> None:
    """Zap every device whose LVs all belong to this cluster.

    Assumes the per-fsid lock is already held by the caller.
    """
    # enumerate devices and their LVs as ceph-volume sees them
    mounts = get_container_mounts(ctx, ctx.fsid, 'clusterless-ceph-volume', None)
    inventory_container = get_ceph_volume_container(
        ctx,
        args=['inventory', '--format', 'json'],
        volume_mounts=mounts,
        envs=ctx.env)
    out, err, code = call_throws(ctx, inventory_container.run_cmd())
    if code:
        raise Error('failed to list osd inventory')
    try:
        inventory = json.loads(out)
    except ValueError as e:
        raise Error(f'Invalid JSON in ceph-volume inventory: {e}')

    for dev in inventory:
        matches = [lv.get('cluster_fsid') == ctx.fsid and dev.get('ceph_device')
                   for lv in dev.get('lvs', [])]
        if not any(matches):
            continue
        if all(matches):
            # every LV on the device belongs to us: zap the whole device
            _zap(ctx, dev.get('path'))
        else:
            lv_names = [lv['name'] for lv in dev.get('lvs', [])]
            # TODO: we need to map the lv_names back to device paths (the vg
            # id isn't part of the output here!)
            logger.warning(f'Not zapping LVs (not implemented): {lv_names}')
7222
7223
def command_zap_osds(ctx: CephadmContext) -> None:
    """Entry point for `cephadm zap-osds`.

    Refuses to run without --force, then zaps this cluster's OSD devices
    while holding the per-fsid lock.
    """
    if not ctx.force:
        raise Error('must pass --force to proceed: '
                    'this command may destroy precious data!')

    fsid_lock = FileLock(ctx, ctx.fsid)
    fsid_lock.acquire()

    _zap_osds(ctx)
7233
7234 ##################################
7235
7236
def get_ceph_cluster_count(ctx: CephadmContext) -> int:
    """Count the clusters on this host (fsid-named entries in ctx.data_dir)."""
    return sum(1 for entry in os.listdir(ctx.data_dir) if is_fsid(entry))
7239
7240
def command_rm_cluster(ctx):
    # type: (CephadmContext) -> None
    """Remove all traces of one cluster (units, data, logs, config) from this host.

    Requires --force. Optionally zaps OSD devices (--zap-osds) and can keep
    logs (--keep-logs). Host-wide shared files (ceph.target, the cephadm
    logrotate config, cephadm logs, and /etc/ceph files naming this fsid)
    are removed only when this was the last cluster on the host.
    """
    if not ctx.force:
        raise Error('must pass --force to proceed: '
                    'this command may destroy precious data!')

    lock = FileLock(ctx, ctx.fsid)
    lock.acquire()

    def disable_systemd_service(unit_name: str) -> None:
        # stop, clear failure state, and disable; failures are ignored (call)
        call(ctx, ['systemctl', 'stop', unit_name],
             verbosity=CallVerbosity.DEBUG)
        call(ctx, ['systemctl', 'reset-failed', unit_name],
             verbosity=CallVerbosity.DEBUG)
        call(ctx, ['systemctl', 'disable', unit_name],
             verbosity=CallVerbosity.DEBUG)

    # stop + disable individual daemon units
    for d in list_daemons(ctx, detail=False):
        if d['fsid'] != ctx.fsid:
            continue
        if d['style'] != 'cephadm:v1':
            continue
        disable_systemd_service(get_unit_name(ctx.fsid, d['name']))

    # cluster units
    for unit_name in ['ceph-%s.target' % ctx.fsid]:
        disable_systemd_service(unit_name)

    # systemd-escaped name of this cluster's cgroup slice ('-' -> '\x2d')
    slice_name = 'system-ceph\\x2d{}.slice'.format(ctx.fsid.replace('-', '\\x2d'))
    call(ctx, ['systemctl', 'stop', slice_name],
         verbosity=CallVerbosity.DEBUG)

    # osds?
    if ctx.zap_osds:
        _zap_osds(ctx)

    # rm units
    call_throws(ctx, ['rm', '-f', ctx.unit_dir
                      + '/ceph-%s@.service' % ctx.fsid])
    call_throws(ctx, ['rm', '-f', ctx.unit_dir
                      + '/ceph-%s.target' % ctx.fsid])
    call_throws(ctx, ['rm', '-rf',
                      ctx.unit_dir + '/ceph-%s.target.wants' % ctx.fsid])
    # rm data
    call_throws(ctx, ['rm', '-rf', ctx.data_dir + '/' + ctx.fsid])

    if not ctx.keep_logs:
        # rm logs
        call_throws(ctx, ['rm', '-rf', ctx.log_dir + '/' + ctx.fsid])
        # NOTE(review): if call_throws runs rm without a shell, the '*'
        # patterns here are passed literally and match nothing — confirm
        call_throws(ctx, ['rm', '-rf', ctx.log_dir
                          + '/*.wants/ceph-%s@*' % ctx.fsid])

    # rm logrotate config
    call_throws(ctx, ['rm', '-f', ctx.logrotate_dir + '/ceph-%s' % ctx.fsid])

    # if last cluster on host remove shared files
    if get_ceph_cluster_count(ctx) == 0:
        disable_systemd_service('ceph.target')

        # rm shared ceph target files
        call_throws(ctx, ['rm', '-f', ctx.unit_dir + '/multi-user.target.wants/ceph.target'])
        call_throws(ctx, ['rm', '-f', ctx.unit_dir + '/ceph.target'])

        # rm cephadm logrotate config
        call_throws(ctx, ['rm', '-f', ctx.logrotate_dir + '/cephadm'])

        if not ctx.keep_logs:
            # remove all cephadm logs
            for fname in glob(f'{ctx.log_dir}/cephadm.log*'):
                os.remove(fname)

    # rm sysctl settings
    sysctl_dirs: List[Path] = [Path(ctx.sysctl_dir), Path('/usr/lib/sysctl.d')]

    for sysctl_dir in sysctl_dirs:
        for p in sysctl_dir.glob(f'90-ceph-{ctx.fsid}-*.conf'):
            p.unlink()

    # cleanup remaining ceph directories
    ceph_dirs = [f'/run/ceph/{ctx.fsid}', f'/tmp/var/lib/ceph/{ctx.fsid}', f'/var/run/ceph/{ctx.fsid}']
    for dd in ceph_dirs:
        shutil.rmtree(dd, ignore_errors=True)

    # clean up config, keyring, and pub key files
    files = [CEPH_DEFAULT_CONF, CEPH_DEFAULT_PUBKEY, CEPH_DEFAULT_KEYRING]
    if os.path.exists(files[0]):
        # only delete /etc/ceph files that actually reference this fsid
        valid_fsid = False
        with open(files[0]) as f:
            if ctx.fsid in f.read():
                valid_fsid = True
        if valid_fsid:
            # rm configuration files on /etc/ceph
            for n in range(0, len(files)):
                if os.path.exists(files[n]):
                    os.remove(files[n])
7337
7338 ##################################
7339
7340
def check_time_sync(ctx, enabler=None):
    # type: (CephadmContext, Optional[Packager]) -> bool
    """Return True when any known NTP/time-sync systemd unit is active."""
    candidates = [
        'chrony.service',  # 18.04 (at least)
        'chronyd.service',  # el / opensuse
        'systemd-timesyncd.service',
        'ntpd.service',  # el7 (at least)
        'ntp.service',  # 18.04 (at least)
        'ntpsec.service',  # 20.04 (at least) / buster
        'openntpd.service',  # ubuntu / debian
    ]
    if check_units(ctx, candidates, enabler):
        return True
    logger.warning('No time sync service is running; checked for %s' % candidates)
    return False
7356
7357
def command_check_host(ctx: CephadmContext) -> None:
    """Verify host prerequisites: container engine, required binaries,
    active time synchronization, and (optionally) the expected hostname.

    Collects all problems and raises a single Error listing them.
    """
    errors = []

    # container engine
    try:
        engine = check_container_engine(ctx)
        logger.info(f'{engine} is present')
    except Error as e:
        errors.append(str(e))

    # required host binaries
    for binary in ['systemctl', 'lvcreate']:
        try:
            find_program(binary)
            logger.info('%s is present' % binary)
        except ValueError:
            errors.append('%s binary does not appear to be installed' % binary)

    # check for configured+running chronyd or ntp
    if not check_time_sync(ctx):
        errors.append('No time synchronization is active')

    if 'expect_hostname' in ctx and ctx.expect_hostname:
        if get_hostname().lower() != ctx.expect_hostname.lower():
            errors.append('hostname "%s" does not match expected hostname "%s"' % (
                get_hostname(), ctx.expect_hostname))
        else:
            logger.info('Hostname "%s" matches what is expected.',
                        ctx.expect_hostname)

    if errors:
        raise Error('\nERROR: '.join(errors))

    logger.info('Host looks OK')
7391
7392 ##################################
7393
7394
def get_ssh_vars(ssh_user: str) -> Tuple[int, int, str]:
    """Look up the uid, gid and ~/.ssh directory for ``ssh_user``.

    :raises Error: when the user does not exist on this host
    """
    try:
        entry = pwd.getpwnam(ssh_user)
    except KeyError:
        raise Error('Cannot find uid/gid for ssh-user: %s' % (ssh_user))
    return entry.pw_uid, entry.pw_gid, os.path.join(entry.pw_dir, '.ssh')
7405
7406
def authorize_ssh_key(ssh_pub_key: str, ssh_user: str) -> bool:
    """Add ``ssh_pub_key`` to the user's authorized_keys file.

    Returns True when the key was appended, False when it was already
    present.  Raises Error for an empty key or an unknown user.
    """

    def _already_authorized(path: str, key: str) -> bool:
        # whole-line comparison so a key that is a prefix of another never matches
        if not os.path.exists(path):
            return False
        with open(path) as fh:
            return any(entry.strip() == key.strip() for entry in fh.readlines())

    logger.info(f'Adding key to {ssh_user}@localhost authorized_keys...')
    if ssh_pub_key is None or ssh_pub_key.isspace():
        raise Error('Trying to authorize an empty ssh key')

    ssh_pub_key = ssh_pub_key.strip()
    ssh_uid, ssh_gid, ssh_dir = get_ssh_vars(ssh_user)
    if not os.path.exists(ssh_dir):
        makedirs(ssh_dir, ssh_uid, ssh_gid, 0o700)

    auth_keys_file = '%s/authorized_keys' % ssh_dir
    if _already_authorized(auth_keys_file, ssh_pub_key):
        logger.info(f'key already in {ssh_user}@localhost authorized_keys...')
        return False

    # an existing file that does not end in a newline needs one before we
    # append our key, or the two keys would run together on one line
    needs_newline = False
    if os.path.exists(auth_keys_file):
        with open(auth_keys_file, 'r') as fh:
            fh.seek(0, os.SEEK_END)
            if fh.tell() > 0:
                fh.seek(fh.tell() - 1, os.SEEK_SET)  # inspect the final character
                needs_newline = fh.read() != '\n'

    with open(auth_keys_file, 'a') as fh:
        os.fchown(fh.fileno(), ssh_uid, ssh_gid)  # just in case we created it
        os.fchmod(fh.fileno(), 0o600)  # just in case we created it
        if needs_newline:
            fh.write('\n')
        fh.write(ssh_pub_key + '\n')

    return True
7451
7452
def revoke_ssh_key(key: str, ssh_user: str) -> None:
    """Revoke the public key authorization for the ssh user.

    The authorized_keys file is rewritten to a temp file and atomically
    moved into place, so a crash mid-write cannot truncate it.  A warning
    is logged (and nothing changes) when the key is not present.

    :param key: public key line to remove (compared whole-line, stripped)
    :param ssh_user: local user whose authorized_keys file is edited
    :raises Error: when the user does not exist on this host
    """
    ssh_uid, ssh_gid, ssh_dir = get_ssh_vars(ssh_user)
    auth_keys_file = '%s/authorized_keys' % ssh_dir
    deleted = False
    if os.path.exists(auth_keys_file):
        with open(auth_keys_file, 'r') as f:
            lines = f.readlines()
        # mkstemp returns an open fd; wrap it with fdopen so it is closed
        # (previously the fd was discarded and leaked)
        fd, filename = tempfile.mkstemp()
        with os.fdopen(fd, 'w') as f:
            os.fchown(f.fileno(), ssh_uid, ssh_gid)
            os.fchmod(f.fileno(), 0o600)  # secure access to the keys file
            for line in lines:
                if line.strip() == key.strip():
                    deleted = True
                else:
                    f.write(line)

        if deleted:
            shutil.move(filename, auth_keys_file)
        else:
            # don't leave the unused temp copy behind
            os.unlink(filename)
            logger.warning('Cannot find the ssh key to be deleted')
7475
7476
def check_ssh_connectivity(ctx: CephadmContext) -> None:
    """Verify passwordless-sudo ssh access to this host for ctx.ssh_user.

    Uses the keys/config supplied on the command line when present,
    otherwise generates a throwaway RSA keypair (removed afterwards —
    previously these files were left behind in /tmp).  Silently skips when
    ssh tooling is not installed.

    :raises Error: when the ssh probe exits non-zero, with guidance text
    """

    def cmd_is_available(cmd: str) -> bool:
        if shutil.which(cmd) is None:
            logger.warning(f'Command not found: {cmd}')
            return False
        return True

    if not cmd_is_available('ssh') or not cmd_is_available('ssh-keygen'):
        logger.warning('Cannot check ssh connectivity. Skipping...')
        return

    logger.info('Verifying ssh connectivity ...')
    temp_keys = False
    if ctx.ssh_private_key and ctx.ssh_public_key:
        # let's use the keys provided by the user
        ssh_priv_key_path = pathify(ctx.ssh_private_key.name)
        ssh_pub_key_path = pathify(ctx.ssh_public_key.name)
    else:
        # no custom keys, let's generate some random keys just for this check
        temp_keys = True
        ssh_priv_key_path = f'/tmp/ssh_key_{uuid.uuid1()}'
        ssh_pub_key_path = f'{ssh_priv_key_path}.pub'
        ssh_key_gen_cmd = ['ssh-keygen', '-q', '-t', 'rsa', '-N', '', '-C', '', '-f', ssh_priv_key_path]
        _, _, code = call(ctx, ssh_key_gen_cmd)
        if code != 0:
            logger.warning('Cannot generate keys to check ssh connectivity.')
            return

    try:
        with open(ssh_pub_key_path, 'r') as f:
            key = f.read().strip()
        new_key = authorize_ssh_key(key, ctx.ssh_user)
        ssh_cfg_file_arg = ['-F', pathify(ctx.ssh_config.name)] if ctx.ssh_config else []
        _, _, code = call(ctx, ['ssh', '-o StrictHostKeyChecking=no',
                                *ssh_cfg_file_arg, '-i', ssh_priv_key_path,
                                '-o PasswordAuthentication=no',
                                f'{ctx.ssh_user}@{get_hostname()}',
                                'sudo echo'])

        # we only remove the key if it's a new one. In case the user has provided
        # some already existing key then we don't alter authorized_keys file
        if new_key:
            revoke_ssh_key(key, ctx.ssh_user)
    finally:
        # remove the keypair we generated purely for this probe, even when
        # the probe itself raised
        if temp_keys:
            for path in (ssh_priv_key_path, ssh_pub_key_path):
                try:
                    os.unlink(path)
                except OSError:
                    pass

    pub_key_msg = '- The public key file configured by --ssh-public-key is valid\n' if ctx.ssh_public_key else ''
    prv_key_msg = '- The private key file configured by --ssh-private-key is valid\n' if ctx.ssh_private_key else ''
    ssh_cfg_msg = '- The ssh configuration file configured by --ssh-config is valid\n' if ctx.ssh_config else ''
    err_msg = f"""
** Please verify your user's ssh configuration and make sure:
- User {ctx.ssh_user} must have passwordless sudo access
{pub_key_msg}{prv_key_msg}{ssh_cfg_msg}
"""
    if code != 0:
        raise Error(err_msg)
7529
7530
def command_prepare_host(ctx: CephadmContext) -> None:
    """Install any missing host prerequisites, then re-run the host check.

    Installs podman (or docker), lvm2 and chrony on demand via the distro
    packager, and adjusts /etc/hostname when --expect-hostname disagrees.
    """
    logger.info('Verifying podman|docker is present...')
    pkg = None

    try:
        check_container_engine(ctx)
    except Error as e:
        logger.warning(str(e))
        pkg = pkg or create_packager(ctx)
        pkg.install_podman()

    logger.info('Verifying lvm2 is present...')
    if not find_executable('lvcreate'):
        pkg = pkg or create_packager(ctx)
        pkg.install(['lvm2'])

    logger.info('Verifying time synchronization is in place...')
    if not check_time_sync(ctx):
        pkg = pkg or create_packager(ctx)
        pkg.install(['chrony'])
        # check again, and this time try to enable the service
        check_time_sync(ctx, enabler=pkg)

    if 'expect_hostname' in ctx and ctx.expect_hostname and ctx.expect_hostname != get_hostname():
        logger.warning('Adjusting hostname from %s -> %s...' % (get_hostname(), ctx.expect_hostname))
        call_throws(ctx, ['hostname', ctx.expect_hostname])
        with open('/etc/hostname', 'w') as f:
            f.write(ctx.expect_hostname + '\n')

    logger.info('Repeating the final host check...')
    command_check_host(ctx)
7565
7566 ##################################
7567
7568
class CustomValidation(argparse.Action):
    """argparse action validating '<daemon_type>.<daemon_id>' style names."""

    def _check_name(self, values: str) -> None:
        # a name must split into a supported type and an id on the first '.'
        try:
            daemon_type, daemon_id = values.split('.', 1)
        except ValueError:
            raise argparse.ArgumentError(
                self,
                'must be of the format <type>.<id>. For example, osd.1 or prometheus.myhost.com')

        supported = get_supported_daemons()
        if daemon_type not in supported:
            raise argparse.ArgumentError(
                self,
                'name must declare the type of daemon e.g. '
                '{}'.format(', '.join(supported)))

    def __call__(self, parser: argparse.ArgumentParser, namespace: argparse.Namespace, values: Union[str, Sequence[Any], None],
                 option_string: Optional[str] = None) -> None:
        assert isinstance(values, str)
        # only '--name' style values get the <type>.<id> validation
        if self.dest == 'name':
            self._check_name(values)
        setattr(namespace, self.dest, values)
7590
7591 ##################################
7592
7593
def get_distro():
    # type: () -> Tuple[Optional[str], Optional[str], Optional[str]]
    """Parse /etc/os-release into (ID, VERSION_ID, VERSION_CODENAME).

    Values are lower-cased; any field missing from the file is returned
    as None.
    """
    distro = None
    distro_version = None
    distro_codename = None
    with open('/etc/os-release', 'r') as f:
        for line in f.readlines():
            line = line.strip()
            if '=' not in line or line.startswith('#'):
                continue
            (var, val) = line.split('=', 1)
            # strip one pair of surrounding quotes; the length guard avoids
            # an IndexError on an empty value (e.g. a bare 'ID=' line)
            if len(val) >= 2 and val[0] == '"' and val[-1] == '"':
                val = val[1:-1]
            if var == 'ID':
                distro = val.lower()
            elif var == 'VERSION_ID':
                distro_version = val.lower()
            elif var == 'VERSION_CODENAME':
                distro_codename = val.lower()
    return distro, distro_version, distro_codename
7614
7615
class Packager(object):
    """Base class for distro package-manager helpers (repo setup + installs).

    At most one of (stable, version, branch[+commit]) selects which ceph
    repo to configure; all may be unset (e.g. for rm-repo / install).
    """

    def __init__(self, ctx: CephadmContext,
                 stable: Optional[str] = None, version: Optional[str] = None,
                 branch: Optional[str] = None, commit: Optional[str] = None):
        # accept exactly one selector (or none); a commit requires a branch
        assert \
            (stable and not version and not branch and not commit) or \
            (not stable and version and not branch and not commit) or \
            (not stable and not version and branch) or \
            (not stable and not version and not branch and not commit)
        self.ctx = ctx
        self.stable = stable
        self.version = version
        self.branch = branch
        self.commit = commit

    def validate(self) -> None:
        """Validate parameters before writing any state to disk."""
        pass

    def add_repo(self) -> None:
        raise NotImplementedError

    def rm_repo(self) -> None:
        raise NotImplementedError

    def install(self, ls: List[str]) -> None:
        raise NotImplementedError

    def install_podman(self) -> None:
        raise NotImplementedError

    def query_shaman(self, distro: str, distro_version: Any, branch: Optional[str], commit: Optional[str]) -> str:
        """Resolve a dev build's repo file via shaman, then fetch it from chacra."""
        logger.info('Fetching repo metadata from shaman and chacra...')
        shaman_url = 'https://shaman.ceph.com/api/repos/ceph/{branch}/{sha1}/{distro}/{distro_version}/repo/?arch={arch}'.format(
            distro=distro,
            distro_version=distro_version,
            branch=branch,
            sha1=commit or 'latest',
            arch=get_arch()
        )
        try:
            shaman_response = urlopen(shaman_url)
        except HTTPError as err:
            logger.error('repository not found in shaman (might not be available yet)')
            raise Error('%s, failed to fetch %s' % (err, shaman_url))
        chacra_url = ''
        try:
            # shaman redirects to the chacra location of the actual repo file
            chacra_url = shaman_response.geturl()
            chacra_response = urlopen(chacra_url)
        except HTTPError as err:
            logger.error('repository not found in chacra (might not be available yet)')
            raise Error('%s, failed to fetch %s' % (err, chacra_url))
        return chacra_response.read().decode('utf-8')

    def repo_gpgkey(self) -> Tuple[str, str]:
        """Return (gpg key url, short name) matching the configured repo type."""
        if self.ctx.gpg_url:
            return self.ctx.gpg_url, 'manual'
        if self.stable or self.version:
            return 'https://download.ceph.com/keys/release.gpg', 'release'
        return 'https://download.ceph.com/keys/autobuild.gpg', 'autobuild'

    def enable_service(self, service: str) -> None:
        """
        Start and enable the service (typically using systemd).
        """
        call_throws(self.ctx, ['systemctl', 'enable', '--now', service])
7684
7685
class Apt(Packager):
    """Packager implementation for apt based distros (ubuntu, debian).

    Writes the ceph repo to /etc/apt/sources.list.d and its GPG key to
    /etc/apt/trusted.gpg.d; on ubuntu it can also configure the kubic
    repo to obtain podman.
    """

    # os-release ID -> name used in download.ceph.com / shaman urls
    DISTRO_NAMES = {
        'ubuntu': 'ubuntu',
        'debian': 'debian',
    }

    def __init__(self, ctx: CephadmContext,
                 stable: Optional[str], version: Optional[str], branch: Optional[str], commit: Optional[str],
                 distro: Optional[str], distro_version: Optional[str], distro_codename: Optional[str]) -> None:
        super(Apt, self).__init__(ctx, stable=stable, version=version,
                                  branch=branch, commit=commit)
        assert distro
        self.ctx = ctx
        self.distro = self.DISTRO_NAMES[distro]
        self.distro_codename = distro_codename
        self.distro_version = distro_version

    def repo_path(self) -> str:
        """Path of the apt sources file cephadm manages."""
        return '/etc/apt/sources.list.d/ceph.list'

    def add_repo(self) -> None:
        """Install the repo GPG key and sources file, then apt-get update.

        Key installation must precede the repo file so the subsequent
        update can verify the repo.
        """

        url, name = self.repo_gpgkey()
        logger.info('Installing repo GPG key from %s...' % url)
        try:
            response = urlopen(url)
        except HTTPError as err:
            logger.error('failed to fetch GPG repo key from %s: %s' % (
                url, err))
            raise Error('failed to fetch GPG key')
        key = response.read()
        with open('/etc/apt/trusted.gpg.d/ceph.%s.gpg' % name, 'wb') as f:
            f.write(key)

        # release/version repos come from download.ceph.com; dev builds
        # are resolved through shaman/chacra instead
        if self.version:
            content = 'deb %s/debian-%s/ %s main\n' % (
                self.ctx.repo_url, self.version, self.distro_codename)
        elif self.stable:
            content = 'deb %s/debian-%s/ %s main\n' % (
                self.ctx.repo_url, self.stable, self.distro_codename)
        else:
            content = self.query_shaman(self.distro, self.distro_codename, self.branch,
                                        self.commit)

        logger.info('Installing repo file at %s...' % self.repo_path())
        with open(self.repo_path(), 'w') as f:
            f.write(content)

        self.update()

    def rm_repo(self) -> None:
        """Remove any cephadm-installed GPG keys and repo files."""
        # any of the three key names may have been installed by add_repo
        for name in ['autobuild', 'release', 'manual']:
            p = '/etc/apt/trusted.gpg.d/ceph.%s.gpg' % name
            if os.path.exists(p):
                logger.info('Removing repo GPG key %s...' % p)
                os.unlink(p)
        if os.path.exists(self.repo_path()):
            logger.info('Removing repo at %s...' % self.repo_path())
            os.unlink(self.repo_path())

        # the kubic repo is only ever added on ubuntu (see install_podman)
        if self.distro == 'ubuntu':
            self.rm_kubic_repo()

    def install(self, ls: List[str]) -> None:
        """Install the given packages with apt-get."""
        logger.info('Installing packages %s...' % ls)
        call_throws(self.ctx, ['apt-get', 'install', '-y'] + ls)

    def update(self) -> None:
        """Refresh the apt package index."""
        logger.info('Updating package list...')
        call_throws(self.ctx, ['apt-get', 'update'])

    def install_podman(self) -> None:
        """Install podman (via the kubic repo on ubuntu), or fall back to docker."""
        if self.distro == 'ubuntu':
            logger.info('Setting up repo for podman...')
            self.add_kubic_repo()
            self.update()

        logger.info('Attempting podman install...')
        try:
            self.install(['podman'])
        except Error:
            logger.info('Podman did not work. Falling back to docker...')
            self.install(['docker.io'])

    def kubic_repo_url(self) -> str:
        """Base url of the openSUSE kubic repo providing podman for ubuntu."""
        return 'https://download.opensuse.org/repositories/devel:/kubic:/' \
               'libcontainers:/stable/xUbuntu_%s/' % self.distro_version

    def kubic_repo_path(self) -> str:
        """Path of the kubic apt sources file."""
        return '/etc/apt/sources.list.d/devel:kubic:libcontainers:stable.list'

    # NOTE: 'kubric' below is a historical typo kept for compatibility
    def kubric_repo_gpgkey_url(self) -> str:
        return '%s/Release.key' % self.kubic_repo_url()

    def kubric_repo_gpgkey_path(self) -> str:
        return '/etc/apt/trusted.gpg.d/kubic.release.gpg'

    def add_kubic_repo(self) -> None:
        """Fetch the kubic GPG key, register it with apt-key, write the repo file."""
        url = self.kubric_repo_gpgkey_url()
        logger.info('Installing repo GPG key from %s...' % url)
        try:
            response = urlopen(url)
        except HTTPError as err:
            logger.error('failed to fetch GPG repo key from %s: %s' % (
                url, err))
            raise Error('failed to fetch GPG key')
        key = response.read().decode('utf-8')
        # apt-key needs the key on disk; write_tmp gives a root-owned temp file
        tmp_key = write_tmp(key, 0, 0)
        keyring = self.kubric_repo_gpgkey_path()
        call_throws(self.ctx, ['apt-key', '--keyring', keyring, 'add', tmp_key.name])

        logger.info('Installing repo file at %s...' % self.kubic_repo_path())
        content = 'deb %s /\n' % self.kubic_repo_url()
        with open(self.kubic_repo_path(), 'w') as f:
            f.write(content)

    def rm_kubic_repo(self) -> None:
        """Remove the kubic keyring and repo file, if present."""
        keyring = self.kubric_repo_gpgkey_path()
        if os.path.exists(keyring):
            logger.info('Removing repo GPG key %s...' % keyring)
            os.unlink(keyring)

        p = self.kubic_repo_path()
        if os.path.exists(p):
            logger.info('Removing repo at %s...' % p)
            os.unlink(p)
7812
7813
class YumDnf(Packager):
    """Packager implementation for rpm based distros using yum, dnf or tdnf.

    Covers the el family (centos/rhel/scientific/rocky/alma/ol), fedora
    and mariner.
    """

    # os-release ID -> (shaman distro name, repo code prefix)
    DISTRO_NAMES = {
        'centos': ('centos', 'el'),
        'rhel': ('centos', 'el'),
        'scientific': ('centos', 'el'),
        'rocky': ('centos', 'el'),
        'almalinux': ('centos', 'el'),
        'ol': ('centos', 'el'),
        'fedora': ('fedora', 'fc'),
        'mariner': ('mariner', 'cm'),
    }

    def __init__(self, ctx: CephadmContext,
                 stable: Optional[str], version: Optional[str], branch: Optional[str], commit: Optional[str],
                 distro: Optional[str], distro_version: Optional[str]) -> None:
        super(YumDnf, self).__init__(ctx, stable=stable, version=version,
                                     branch=branch, commit=commit)
        assert distro
        assert distro_version
        self.ctx = ctx
        self.major = int(distro_version.split('.')[0])
        self.distro_normalized = self.DISTRO_NAMES[distro][0]
        distro_prefix = self.DISTRO_NAMES[distro][1]
        self.distro_code = distro_prefix + str(self.major)
        # Pick the package tool shipped with this distro generation.  The
        # comparison must use the bare prefix: the previous code compared
        # self.distro_code (e.g. 'el8') against 'fc'/'el'/'cm', which could
        # never match, so dnf/tdnf were never selected.
        if (distro_prefix == 'fc' and self.major >= 30) or \
           (distro_prefix == 'el' and self.major >= 8):
            self.tool = 'dnf'
        elif distro_prefix == 'cm':
            self.tool = 'tdnf'
        else:
            self.tool = 'yum'

    def custom_repo(self, **kw: Any) -> str:
        """
        Repo files need special care in that a whole line should not be present
        if there is no value for it. Because we were using `format()` we could
        not conditionally add a line for a repo file. So the end result would
        contain a key with a missing value (say if we were passing `None`).

        For example, it could look like::

            [ceph repo]
            name= ceph repo
            proxy=
            gpgcheck=

        Which breaks. This function allows us to conditionally add lines,
        preserving an order and be more careful.

        Previously, and for historical purposes, this is how the template used
        to look::

            custom_repo =
            [{repo_name}]
            name={name}
            baseurl={baseurl}
            enabled={enabled}
            gpgcheck={gpgcheck}
            type={_type}
            gpgkey={gpgkey}
            proxy={proxy}

        """
        lines = []

        # by using tuples (vs a dict) we preserve the order of what we want to
        # return, like starting with a [repo name]
        tmpl = (
            ('reponame', '[%s]'),
            ('name', 'name=%s'),
            ('baseurl', 'baseurl=%s'),
            ('enabled', 'enabled=%s'),
            ('gpgcheck', 'gpgcheck=%s'),
            ('_type', 'type=%s'),
            ('gpgkey', 'gpgkey=%s'),
            ('proxy', 'proxy=%s'),
            ('priority', 'priority=%s'),
        )

        for line in tmpl:
            tmpl_key, tmpl_value = line  # key values from tmpl

            # ensure that there is an actual value (not None nor empty string)
            if tmpl_key in kw and kw.get(tmpl_key) not in (None, ''):
                lines.append(tmpl_value % kw.get(tmpl_key))

        return '\n'.join(lines)

    def repo_path(self) -> str:
        """Path of the yum/dnf repo file cephadm manages."""
        return '/etc/yum.repos.d/ceph.repo'

    def repo_baseurl(self) -> str:
        """Base url of the ceph rpm repo for the selected release or version."""
        assert self.stable or self.version
        if self.version:
            return '%s/rpm-%s/%s' % (self.ctx.repo_url, self.version,
                                     self.distro_code)
        else:
            return '%s/rpm-%s/%s' % (self.ctx.repo_url, self.stable,
                                     self.distro_code)

    def validate(self) -> None:
        """Reject distro/release combinations ceph does not publish rpms for."""
        if self.distro_code.startswith('fc'):
            raise Error('Ceph team does not build Fedora specific packages and therefore cannot add repos for this distro')
        if self.distro_code == 'el7':
            # ceph release names are alphabetical, so a lexical comparison
            # orders releases correctly here
            if self.stable and self.stable >= 'pacific':
                raise Error('Ceph does not support pacific or later for this version of this linux distro and therefore cannot add a repo for it')
            if self.version:
                # compare the major version numerically: the old lexical
                # comparison treated e.g. '9' >= '16' as True
                major = self.version.split('.')[0]
                if major.isdigit() and int(major) >= 16:
                    raise Error('Ceph does not support 16.y.z or later for this version of this linux distro and therefore cannot add a repo for it')

        if self.stable or self.version:
            # we know that yum & dnf require there to be a
            # $base_url/$arch/repodata/repomd.xml so we can test if this URL
            # is gettable in order to validate the inputs
            test_url = self.repo_baseurl() + '/noarch/repodata/repomd.xml'
            try:
                urlopen(test_url)
            except HTTPError as err:
                logger.error('unable to fetch repo metadata: %r', err)
                raise Error('failed to fetch repository metadata. please check'
                            ' the provided parameters are correct and try again')

    def add_repo(self) -> None:
        """Write the ceph repo file (and enable EPEL on the el family)."""
        if self.stable or self.version:
            content = ''
            for n, t in {
                    'Ceph': '$basearch',
                    'Ceph-noarch': 'noarch',
                    'Ceph-source': 'SRPMS'}.items():
                content += '[%s]\n' % (n)
                content += self.custom_repo(
                    name='Ceph %s' % t,
                    baseurl=self.repo_baseurl() + '/' + t,
                    enabled=1,
                    gpgcheck=1,
                    gpgkey=self.repo_gpgkey()[0],
                )
                content += '\n\n'
        else:
            # dev builds are resolved through shaman/chacra
            content = self.query_shaman(self.distro_normalized, self.major,
                                        self.branch,
                                        self.commit)

        logger.info('Writing repo to %s...' % self.repo_path())
        with open(self.repo_path(), 'w') as f:
            f.write(content)

        if self.distro_code.startswith('el'):
            logger.info('Enabling EPEL...')
            call_throws(self.ctx, [self.tool, 'install', '-y', 'epel-release'])

    def rm_repo(self) -> None:
        """Remove the cephadm-managed repo file, if present."""
        if os.path.exists(self.repo_path()):
            os.unlink(self.repo_path())

    def install(self, ls: List[str]) -> None:
        """Install the given packages with the distro's tool."""
        logger.info('Installing packages %s...' % ls)
        call_throws(self.ctx, [self.tool, 'install', '-y'] + ls)

    def install_podman(self) -> None:
        """Install podman from the distro repos."""
        self.install(['podman'])
7973
7974
class Zypper(Packager):
    """Packager implementation for SUSE distros using zypper."""

    DISTRO_NAMES = [
        'sles',
        'opensuse-tumbleweed',
        'opensuse-leap'
    ]

    def __init__(self, ctx: CephadmContext,
                 stable: Optional[str], version: Optional[str], branch: Optional[str], commit: Optional[str],
                 distro: Optional[str], distro_version: Optional[str]) -> None:
        super(Zypper, self).__init__(ctx, stable=stable, version=version,
                                     branch=branch, commit=commit)
        assert distro is not None
        self.ctx = ctx
        self.tool = 'zypper'
        # repos are published under 'opensuse'; leap uses its real version
        # number while tumbleweed keeps the 15.1 default
        self.distro = 'opensuse'
        self.distro_version = '15.1'
        if 'tumbleweed' not in distro and distro_version is not None:
            self.distro_version = distro_version

    def custom_repo(self, **kw: Any) -> str:
        """
        See YumDnf for format explanation.
        """
        lines = []

        # by using tuples (vs a dict) we preserve the order of what we want to
        # return, like starting with a [repo name]
        tmpl = (
            ('reponame', '[%s]'),
            ('name', 'name=%s'),
            ('baseurl', 'baseurl=%s'),
            ('enabled', 'enabled=%s'),
            ('gpgcheck', 'gpgcheck=%s'),
            ('_type', 'type=%s'),
            ('gpgkey', 'gpgkey=%s'),
            ('proxy', 'proxy=%s'),
            ('priority', 'priority=%s'),
        )

        for line in tmpl:
            tmpl_key, tmpl_value = line  # key values from tmpl

            # ensure that there is an actual value (not None nor empty string)
            if tmpl_key in kw and kw.get(tmpl_key) not in (None, ''):
                lines.append(tmpl_value % kw.get(tmpl_key))

        return '\n'.join(lines)

    def repo_path(self) -> str:
        """Path of the zypper repo file cephadm manages."""
        return '/etc/zypp/repos.d/ceph.repo'

    def repo_baseurl(self) -> str:
        """Base url of the ceph rpm repo for the selected release or version.

        The version branch previously used self.stable by mistake, which is
        None for --version installs and produced an 'rpm-None' url.
        """
        assert self.stable or self.version
        if self.version:
            return '%s/rpm-%s/%s' % (self.ctx.repo_url,
                                     self.version, self.distro)
        else:
            return '%s/rpm-%s/%s' % (self.ctx.repo_url,
                                     self.stable, self.distro)

    def add_repo(self) -> None:
        """Write the ceph repo file for a release/version or a dev build."""
        if self.stable or self.version:
            content = ''
            for n, t in {
                    'Ceph': '$basearch',
                    'Ceph-noarch': 'noarch',
                    'Ceph-source': 'SRPMS'}.items():
                content += '[%s]\n' % (n)
                content += self.custom_repo(
                    name='Ceph %s' % t,
                    baseurl=self.repo_baseurl() + '/' + t,
                    enabled=1,
                    gpgcheck=1,
                    gpgkey=self.repo_gpgkey()[0],
                )
                content += '\n\n'
        else:
            # dev builds are resolved through shaman/chacra
            content = self.query_shaman(self.distro, self.distro_version,
                                        self.branch,
                                        self.commit)

        logger.info('Writing repo to %s...' % self.repo_path())
        with open(self.repo_path(), 'w') as f:
            f.write(content)

    def rm_repo(self) -> None:
        """Remove the cephadm-managed repo file, if present."""
        if os.path.exists(self.repo_path()):
            os.unlink(self.repo_path())

    def install(self, ls: List[str]) -> None:
        """Install the given packages with zypper."""
        logger.info('Installing packages %s...' % ls)
        call_throws(self.ctx, [self.tool, 'in', '-y'] + ls)

    def install_podman(self) -> None:
        """Install podman from the distro repos."""
        self.install(['podman'])
8071
8072
def create_packager(ctx: CephadmContext,
                    stable: Optional[str] = None, version: Optional[str] = None,
                    branch: Optional[str] = None, commit: Optional[str] = None) -> Packager:
    """Instantiate the Packager subclass matching the host's distro.

    :raises Error: when the detected distro is not supported
    """
    distro, distro_version, distro_codename = get_distro()
    selectors = dict(stable=stable, version=version, branch=branch, commit=commit)
    if distro in YumDnf.DISTRO_NAMES:
        return YumDnf(ctx, distro=distro, distro_version=distro_version, **selectors)
    if distro in Apt.DISTRO_NAMES:
        return Apt(ctx, distro=distro, distro_version=distro_version,
                   distro_codename=distro_codename, **selectors)
    if distro in Zypper.DISTRO_NAMES:
        return Zypper(ctx, distro=distro, distro_version=distro_version, **selectors)
    raise Error('Distro %s version %s not supported' % (distro, distro_version))
8091
8092
def command_add_repo(ctx: CephadmContext) -> None:
    """Configure a ceph package repo for a release, version or dev build."""
    if ctx.version and ctx.release:
        raise Error('you can specify either --release or --version but not both')
    if not any([ctx.version, ctx.release, ctx.dev, ctx.dev_commit]):
        raise Error('please supply a --release, --version, --dev or --dev-commit argument')
    if ctx.version:
        # a version must have exactly three dotted components
        if len(ctx.version.split('.')) != 3:
            raise Error('version must be in the form x.y.z (e.g., 15.2.0)')
    if ctx.release:
        # normalize case so 'Pacific' and 'pacific' behave identically
        ctx.release = ctx.release.lower()

    pkg = create_packager(ctx, stable=ctx.release,
                          version=ctx.version,
                          branch=ctx.dev,
                          commit=ctx.dev_commit)
    pkg.validate()
    pkg.add_repo()
    logger.info('Completed adding repo.')
8114
8115
def command_rm_repo(ctx: CephadmContext) -> None:
    """Remove any cephadm-managed package repo configuration."""
    create_packager(ctx).rm_repo()
8119
8120
def command_install(ctx: CephadmContext) -> None:
    """Install the requested packages via the distro's package manager."""
    create_packager(ctx).install(ctx.packages)
8124
8125
def command_rescan_disks(ctx: CephadmContext) -> str:
    """Ask every compatible SCSI HBA to rescan for devices; return a summary."""

    def probe_hba(scan_path: str) -> None:
        """Tell the adapter to rescan"""
        with open(scan_path, 'w') as f:
            f.write('- - -')

    cmd = ctx.func.__name__.replace('command_', '')
    logger.info(f'{cmd}: starting')
    start = time.time()

    all_scan_files = glob('/sys/class/scsi_host/*/scan')
    scan_files = []
    skipped = []
    for scan_path in all_scan_files:
        adapter_dir = os.path.dirname(scan_path)
        adapter_name = os.path.basename(adapter_dir)
        proc_name = read_file([os.path.join(adapter_dir, 'proc_name')])
        # usb-storage and unknown adapters don't honor the '- - -' rescan
        if proc_name in ['unknown', 'usb-storage']:
            skipped.append(os.path.basename(scan_path))
            logger.info(f'{cmd}: rescan skipping incompatible host adapter {adapter_name} : {proc_name}')
        else:
            scan_files.append(scan_path)

    if not scan_files:
        logger.info(f'{cmd}: no compatible HBAs found')
        return 'Ok. No compatible HBAs found'

    # issue the rescans concurrently; a truthy response marks a failure
    responses = async_run(concurrent_tasks(probe_hba, scan_files))
    failures = [r for r in responses if r]

    logger.info(f'{cmd}: Complete. {len(scan_files)} adapters rescanned, {len(failures)} failures, {len(skipped)} skipped')

    elapsed = time.time() - start
    if failures:
        plural = 's' if len(failures) > 1 else ''
        if len(failures) == len(scan_files):
            return f'Failed. All {len(scan_files)} rescan requests failed'
        return f'Partial. {len(scan_files) - len(failures)} successful, {len(failures)} failure{plural} against: {", ".join(failures)}'

    return f'Ok. {len(all_scan_files)} adapters detected: {len(scan_files)} rescanned, {len(skipped)} skipped, {len(failures)} failed ({elapsed:.2f}s)'
8168
8169 ##################################
8170
8171
def get_ipv4_address(ifname):
    # type: (str) -> str
    """Return 'addr/prefixlen' for the interface, or '' if it has no IPv4."""
    SIOCGIFADDR = 35093  # 0x8915
    SIOCGIFNETMASK = 35099  # 0x891b

    def _query(sock, request):
        # type: (socket.socket, int) -> str
        # kernel interface names are at most 15 chars + NUL
        packed = struct.pack('256s', bytes(ifname[:15], 'utf-8'))
        raw = fcntl.ioctl(sock.fileno(), request, packed)
        return socket.inet_ntop(socket.AF_INET, raw[20:24])

    s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    try:
        addr = _query(s, SIOCGIFADDR)
        dq_mask = _query(s, SIOCGIFNETMASK)
    except OSError:
        # interface does not have an ipv4 address
        return ''

    # convert the dotted-quad netmask to a prefix length by counting set bits
    prefix_len = sum(bin(int(octet)).count('1') for octet in dq_mask.split('.'))
    return '{}/{}'.format(addr, prefix_len)
8194
8195
def get_ipv6_address(ifname):
    # type: (str) -> str
    """Return 'addr/scope' for the interface's first IPv6 address, or ''."""
    if not os.path.exists('/proc/net/if_inet6'):
        return ''

    # based on docs @ https://www.tldp.org/HOWTO/Linux+IPv6-HOWTO/ch11s04.html
    # field 0 is ipv6, field 2 is scope
    for entry in read_file(['/proc/net/if_inet6']).splitlines():
        fields = entry.split()
        if fields[-1] != ifname:
            continue
        raw_addr = fields[0]
        # re-insert the ':' separators every 4 hex digits
        grouped = ':'.join(raw_addr[i:i + 4] for i in range(0, len(raw_addr), 4))
        # apply naming rules using ipaddress module
        addr = ipaddress.ip_address(grouped)
        return '{}/{}'.format(str(addr), int('0x{}'.format(fields[2]), 16))
    return ''
8214
8215
def bytes_to_human(num, mode='decimal'):
    # type: (float, str) -> str
    """Convert a byte count into its human-readable form.

    :param num: number, in bytes, to convert
    :param mode: 'binary' for powers of 1024; anything else means decimal
        (powers of 1000, the default)
    :returns: string representing the bytes value in a more readable format
    """
    if mode == 'binary':
        units = ['', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB']
        step = 1024.0
        top_unit = 'YiB'
    else:
        units = ['', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB']
        step = 1000.0
        top_unit = 'YB'

    for suffix in units:
        if abs(num) < step:
            return '%3.1f%s' % (num, suffix)
        num /= step
    return '%.1f%s' % (num, top_unit)
8238
8239
def read_file(path_list, file_name=''):
    # type: (List[str], str) -> str
    """Return the stripped content of the first file found in ``path_list``.

    :param path_list: list of file paths to search
    :param file_name: optional file_name to be applied to a file path
    :returns: content of the file or 'Unknown'
    """
    for base in path_list:
        candidate = os.path.join(base, file_name) if file_name else base
        if not os.path.exists(candidate):
            continue
        with open(candidate, 'r') as f:
            try:
                return f.read().strip()
            except OSError:
                # sysfs may populate the file, but for devices like
                # virtio reads can fail
                return 'Unknown'
    return 'Unknown'
8264
8265 ##################################
8266
8267
8268 class HostFacts():
8269 _dmi_path_list = ['/sys/class/dmi/id']
8270 _nic_path_list = ['/sys/class/net']
8271 _apparmor_path_list = ['/etc/apparmor']
8272 _disk_vendor_workarounds = {
8273 '0x1af4': 'Virtio Block Device'
8274 }
8275 _excluded_block_devices = ('sr', 'zram', 'dm-')
8276
    def __init__(self, ctx: CephadmContext):
        """Eagerly gather CPU, sysctl and NIC facts for this host."""
        self.ctx: CephadmContext = ctx
        # defaults; refined below by _get_cpuinfo() / _process_nics()
        self.cpu_model: str = 'Unknown'
        self.sysctl_options: Dict[str, str] = self._populate_sysctl_options()
        self.cpu_count: int = 0
        self.cpu_cores: int = 0
        self.cpu_threads: int = 0
        self.interfaces: Dict[str, Any] = {}

        self._meminfo: List[str] = read_file(['/proc/meminfo']).splitlines()
        self._get_cpuinfo()
        self._process_nics()
        self.arch: str = platform.processor()
        self.kernel: str = platform.release()
8291
8292 def _populate_sysctl_options(self) -> Dict[str, str]:
8293 sysctl_options = {}
8294 out, _, _ = call_throws(self.ctx, ['sysctl', '-a'], verbosity=CallVerbosity.QUIET_UNLESS_ERROR)
8295 if out:
8296 for line in out.splitlines():
8297 option, value = line.split('=')
8298 sysctl_options[option.strip()] = value.strip()
8299 return sysctl_options
8300
8301 def _get_cpuinfo(self):
8302 # type: () -> None
8303 """Determine cpu information via /proc/cpuinfo"""
8304 raw = read_file(['/proc/cpuinfo'])
8305 output = raw.splitlines()
8306 cpu_set = set()
8307
8308 for line in output:
8309 field = [f.strip() for f in line.split(':')]
8310 if 'model name' in line:
8311 self.cpu_model = field[1]
8312 if 'physical id' in line:
8313 cpu_set.add(field[1])
8314 if 'siblings' in line:
8315 self.cpu_threads = int(field[1].strip())
8316 if 'cpu cores' in line:
8317 self.cpu_cores = int(field[1].strip())
8318 pass
8319 self.cpu_count = len(cpu_set)
8320
8321 def _get_block_devs(self):
8322 # type: () -> List[str]
8323 """Determine the list of block devices by looking at /sys/block"""
8324 return [dev for dev in os.listdir('/sys/block')
8325 if not dev.startswith(HostFacts._excluded_block_devices)]
8326
8327 def _get_devs_by_type(self, rota='0'):
8328 # type: (str) -> List[str]
8329 """Filter block devices by a given rotational attribute (0=flash, 1=spinner)"""
8330 devs = list()
8331 for blk_dev in self._get_block_devs():
8332 rot_path = '/sys/block/{}/queue/rotational'.format(blk_dev)
8333 rot_value = read_file([rot_path])
8334 if rot_value == rota:
8335 devs.append(blk_dev)
8336 return devs
8337
8338 @property
8339 def operating_system(self):
8340 # type: () -> str
8341 """Determine OS version"""
8342 raw_info = read_file(['/etc/os-release'])
8343 os_release = raw_info.splitlines()
8344 rel_str = 'Unknown'
8345 rel_dict = dict()
8346
8347 for line in os_release:
8348 if '=' in line:
8349 var_name, var_value = line.split('=')
8350 rel_dict[var_name] = var_value.strip('"')
8351
8352 # Would normally use PRETTY_NAME, but NAME and VERSION are more
8353 # consistent
8354 if all(_v in rel_dict for _v in ['NAME', 'VERSION']):
8355 rel_str = '{} {}'.format(rel_dict['NAME'], rel_dict['VERSION'])
8356 return rel_str
8357
8358 @property
8359 def hostname(self):
8360 # type: () -> str
8361 """Return the hostname"""
8362 return platform.node()
8363
8364 @property
8365 def subscribed(self):
8366 # type: () -> str
8367 """Highlevel check to see if the host is subscribed to receive updates/support"""
8368 def _red_hat():
8369 # type: () -> str
8370 # RHEL 7 and RHEL 8
8371 entitlements_dir = '/etc/pki/entitlement'
8372 if os.path.exists(entitlements_dir):
8373 pems = glob('{}/*.pem'.format(entitlements_dir))
8374 if len(pems) >= 2:
8375 return 'Yes'
8376
8377 return 'No'
8378
8379 os_name = self.operating_system
8380 if os_name.upper().startswith('RED HAT'):
8381 return _red_hat()
8382
8383 return 'Unknown'
8384
8385 @property
8386 def hdd_count(self):
8387 # type: () -> int
8388 """Return a count of HDDs (spinners)"""
8389 return len(self._get_devs_by_type(rota='1'))
8390
8391 def _get_capacity(self, dev):
8392 # type: (str) -> int
8393 """Determine the size of a given device
8394
8395 The kernel always bases device size calculations based on a 512 byte
8396 sector. For more information see
8397 https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/tree/include/linux/types.h?h=v5.15.63#n120
8398 """
8399 size_path = os.path.join('/sys/block', dev, 'size')
8400 size_blocks = int(read_file([size_path]))
8401 return size_blocks * 512
8402
8403 def _get_capacity_by_type(self, rota='0'):
8404 # type: (str) -> int
8405 """Return the total capacity of a category of device (flash or hdd)"""
8406 devs = self._get_devs_by_type(rota=rota)
8407 capacity = 0
8408 for dev in devs:
8409 capacity += self._get_capacity(dev)
8410 return capacity
8411
8412 def _dev_list(self, dev_list):
8413 # type: (List[str]) -> List[Dict[str, object]]
8414 """Return a 'pretty' name list for each device in the `dev_list`"""
8415 disk_list = list()
8416
8417 for dev in dev_list:
8418 disk_model = read_file(['/sys/block/{}/device/model'.format(dev)]).strip()
8419 disk_rev = read_file(['/sys/block/{}/device/rev'.format(dev)]).strip()
8420 disk_wwid = read_file(['/sys/block/{}/device/wwid'.format(dev)]).strip()
8421 vendor = read_file(['/sys/block/{}/device/vendor'.format(dev)]).strip()
8422 disk_vendor = HostFacts._disk_vendor_workarounds.get(vendor, vendor)
8423 disk_size_bytes = self._get_capacity(dev)
8424 disk_list.append({
8425 'description': '{} {} ({})'.format(disk_vendor, disk_model, bytes_to_human(disk_size_bytes)),
8426 'vendor': disk_vendor,
8427 'model': disk_model,
8428 'rev': disk_rev,
8429 'wwid': disk_wwid,
8430 'dev_name': dev,
8431 'disk_size_bytes': disk_size_bytes,
8432 })
8433 return disk_list
8434
8435 @property
8436 def hdd_list(self):
8437 # type: () -> List[Dict[str, object]]
8438 """Return a list of devices that are HDDs (spinners)"""
8439 devs = self._get_devs_by_type(rota='1')
8440 return self._dev_list(devs)
8441
8442 @property
8443 def flash_list(self):
8444 # type: () -> List[Dict[str, object]]
8445 """Return a list of devices that are flash based (SSD, NVMe)"""
8446 devs = self._get_devs_by_type(rota='0')
8447 return self._dev_list(devs)
8448
8449 @property
8450 def hdd_capacity_bytes(self):
8451 # type: () -> int
8452 """Return the total capacity for all HDD devices (bytes)"""
8453 return self._get_capacity_by_type(rota='1')
8454
8455 @property
8456 def hdd_capacity(self):
8457 # type: () -> str
8458 """Return the total capacity for all HDD devices (human readable format)"""
8459 return bytes_to_human(self.hdd_capacity_bytes)
8460
8461 @property
8462 def cpu_load(self):
8463 # type: () -> Dict[str, float]
8464 """Return the cpu load average data for the host"""
8465 raw = read_file(['/proc/loadavg']).strip()
8466 data = raw.split()
8467 return {
8468 '1min': float(data[0]),
8469 '5min': float(data[1]),
8470 '15min': float(data[2]),
8471 }
8472
8473 @property
8474 def flash_count(self):
8475 # type: () -> int
8476 """Return the number of flash devices in the system (SSD, NVMe)"""
8477 return len(self._get_devs_by_type(rota='0'))
8478
8479 @property
8480 def flash_capacity_bytes(self):
8481 # type: () -> int
8482 """Return the total capacity for all flash devices (bytes)"""
8483 return self._get_capacity_by_type(rota='0')
8484
8485 @property
8486 def flash_capacity(self):
8487 # type: () -> str
8488 """Return the total capacity for all Flash devices (human readable format)"""
8489 return bytes_to_human(self.flash_capacity_bytes)
8490
8491 def _process_nics(self):
8492 # type: () -> None
8493 """Look at the NIC devices and extract network related metadata"""
8494 # from https://github.com/torvalds/linux/blob/master/include/uapi/linux/if_arp.h
8495 hw_lookup = {
8496 '1': 'ethernet',
8497 '32': 'infiniband',
8498 '772': 'loopback',
8499 }
8500
8501 for nic_path in HostFacts._nic_path_list:
8502 if not os.path.exists(nic_path):
8503 continue
8504 for iface in os.listdir(nic_path):
8505
8506 if os.path.exists(os.path.join(nic_path, iface, 'bridge')):
8507 nic_type = 'bridge'
8508 elif os.path.exists(os.path.join(nic_path, iface, 'bonding')):
8509 nic_type = 'bonding'
8510 else:
8511 nic_type = hw_lookup.get(read_file([os.path.join(nic_path, iface, 'type')]), 'Unknown')
8512
8513 if nic_type == 'loopback': # skip loopback devices
8514 continue
8515
8516 lower_devs_list = [os.path.basename(link.replace('lower_', '')) for link in glob(os.path.join(nic_path, iface, 'lower_*'))]
8517 upper_devs_list = [os.path.basename(link.replace('upper_', '')) for link in glob(os.path.join(nic_path, iface, 'upper_*'))]
8518
8519 try:
8520 mtu = int(read_file([os.path.join(nic_path, iface, 'mtu')]))
8521 except ValueError:
8522 mtu = 0
8523
8524 operstate = read_file([os.path.join(nic_path, iface, 'operstate')])
8525 try:
8526 speed = int(read_file([os.path.join(nic_path, iface, 'speed')]))
8527 except (OSError, ValueError):
8528 # OSError : device doesn't support the ethtool get_link_ksettings
8529 # ValueError : raised when the read fails, and returns Unknown
8530 #
8531 # Either way, we show a -1 when speed isn't available
8532 speed = -1
8533
8534 dev_link = os.path.join(nic_path, iface, 'device')
8535 if os.path.exists(dev_link):
8536 iftype = 'physical'
8537 driver_path = os.path.join(dev_link, 'driver')
8538 if os.path.exists(driver_path):
8539 driver = os.path.basename(os.path.realpath(driver_path))
8540 else:
8541 driver = 'Unknown'
8542
8543 else:
8544 iftype = 'logical'
8545 driver = ''
8546
8547 self.interfaces[iface] = {
8548 'mtu': mtu,
8549 'upper_devs_list': upper_devs_list,
8550 'lower_devs_list': lower_devs_list,
8551 'operstate': operstate,
8552 'iftype': iftype,
8553 'nic_type': nic_type,
8554 'driver': driver,
8555 'speed': speed,
8556 'ipv4_address': get_ipv4_address(iface),
8557 'ipv6_address': get_ipv6_address(iface),
8558 }
8559
8560 @property
8561 def nic_count(self):
8562 # type: () -> int
8563 """Return a total count of all physical NICs detected in the host"""
8564 phys_devs = []
8565 for iface in self.interfaces:
8566 if self.interfaces[iface]['iftype'] == 'physical':
8567 phys_devs.append(iface)
8568 return len(phys_devs)
8569
8570 def _get_mem_data(self, field_name):
8571 # type: (str) -> int
8572 for line in self._meminfo:
8573 if line.startswith(field_name):
8574 _d = line.split()
8575 return int(_d[1])
8576 return 0
8577
8578 @property
8579 def memory_total_kb(self):
8580 # type: () -> int
8581 """Determine the memory installed (kb)"""
8582 return self._get_mem_data('MemTotal')
8583
8584 @property
8585 def memory_free_kb(self):
8586 # type: () -> int
8587 """Determine the memory free (not cache, immediately usable)"""
8588 return self._get_mem_data('MemFree')
8589
8590 @property
8591 def memory_available_kb(self):
8592 # type: () -> int
8593 """Determine the memory available to new applications without swapping"""
8594 return self._get_mem_data('MemAvailable')
8595
8596 @property
8597 def vendor(self):
8598 # type: () -> str
8599 """Determine server vendor from DMI data in sysfs"""
8600 return read_file(HostFacts._dmi_path_list, 'sys_vendor')
8601
8602 @property
8603 def model(self):
8604 # type: () -> str
8605 """Determine server model information from DMI data in sysfs"""
8606 family = read_file(HostFacts._dmi_path_list, 'product_family')
8607 product = read_file(HostFacts._dmi_path_list, 'product_name')
8608 if family == 'Unknown' and product:
8609 return '{}'.format(product)
8610
8611 return '{} ({})'.format(family, product)
8612
8613 @property
8614 def bios_version(self):
8615 # type: () -> str
8616 """Determine server BIOS version from DMI data in sysfs"""
8617 return read_file(HostFacts._dmi_path_list, 'bios_version')
8618
8619 @property
8620 def bios_date(self):
8621 # type: () -> str
8622 """Determine server BIOS date from DMI data in sysfs"""
8623 return read_file(HostFacts._dmi_path_list, 'bios_date')
8624
8625 @property
8626 def timestamp(self):
8627 # type: () -> float
8628 """Return the current time as Epoch seconds"""
8629 return time.time()
8630
8631 @property
8632 def system_uptime(self):
8633 # type: () -> float
8634 """Return the system uptime (in secs)"""
8635 raw_time = read_file(['/proc/uptime'])
8636 up_secs, _ = raw_time.split()
8637 return float(up_secs)
8638
8639 @property
8640 def kernel_security(self):
8641 # type: () -> Dict[str, str]
8642 """Determine the security features enabled in the kernel - SELinux, AppArmor"""
8643 def _fetch_selinux() -> Dict[str, str]:
8644 """Get the selinux status"""
8645 security = {}
8646 try:
8647 out, err, code = call(self.ctx, ['sestatus'],
8648 verbosity=CallVerbosity.QUIET)
8649 security['type'] = 'SELinux'
8650 status, mode, policy = '', '', ''
8651 for line in out.split('\n'):
8652 if line.startswith('SELinux status:'):
8653 k, v = line.split(':')
8654 status = v.strip()
8655 elif line.startswith('Current mode:'):
8656 k, v = line.split(':')
8657 mode = v.strip()
8658 elif line.startswith('Loaded policy name:'):
8659 k, v = line.split(':')
8660 policy = v.strip()
8661 if status == 'disabled':
8662 security['description'] = 'SELinux: Disabled'
8663 else:
8664 security['description'] = 'SELinux: Enabled({}, {})'.format(mode, policy)
8665 except Exception as e:
8666 logger.info('unable to get selinux status: %s' % e)
8667 return security
8668
8669 def _fetch_apparmor() -> Dict[str, str]:
8670 """Read the apparmor profiles directly, returning an overview of AppArmor status"""
8671 security = {}
8672 for apparmor_path in HostFacts._apparmor_path_list:
8673 if os.path.exists(apparmor_path):
8674 security['type'] = 'AppArmor'
8675 security['description'] = 'AppArmor: Enabled'
8676 try:
8677 profiles = read_file(['/sys/kernel/security/apparmor/profiles'])
8678 if len(profiles) == 0:
8679 return {}
8680 except OSError:
8681 pass
8682 else:
8683 summary = {} # type: Dict[str, int]
8684 for line in profiles.split('\n'):
8685 item, mode = line.split(' ')
8686 mode = mode.strip('()')
8687 if mode in summary:
8688 summary[mode] += 1
8689 else:
8690 summary[mode] = 0
8691 summary_str = ','.join(['{} {}'.format(v, k) for k, v in summary.items()])
8692 security = {**security, **summary} # type: ignore
8693 security['description'] += '({})'.format(summary_str)
8694
8695 return security
8696 return {}
8697
8698 ret = {}
8699 if os.path.exists('/sys/kernel/security/lsm'):
8700 lsm = read_file(['/sys/kernel/security/lsm']).strip()
8701 if 'selinux' in lsm:
8702 ret = _fetch_selinux()
8703 elif 'apparmor' in lsm:
8704 ret = _fetch_apparmor()
8705 else:
8706 return {
8707 'type': 'Unknown',
8708 'description': 'Linux Security Module framework is active, but is not using SELinux or AppArmor'
8709 }
8710
8711 if ret:
8712 return ret
8713
8714 return {
8715 'type': 'None',
8716 'description': 'Linux Security Module framework is not available'
8717 }
8718
8719 @property
8720 def selinux_enabled(self) -> bool:
8721 return (self.kernel_security['type'] == 'SELinux') and \
8722 (self.kernel_security['description'] != 'SELinux: Disabled')
8723
8724 @property
8725 def kernel_parameters(self):
8726 # type: () -> Dict[str, str]
8727 """Get kernel parameters required/used in Ceph clusters"""
8728
8729 k_param = {}
8730 out, _, _ = call_throws(self.ctx, ['sysctl', '-a'], verbosity=CallVerbosity.SILENT)
8731 if out:
8732 param_list = out.split('\n')
8733 param_dict = {param.split(' = ')[0]: param.split(' = ')[-1] for param in param_list}
8734
8735 # return only desired parameters
8736 if 'net.ipv4.ip_nonlocal_bind' in param_dict:
8737 k_param['net.ipv4.ip_nonlocal_bind'] = param_dict['net.ipv4.ip_nonlocal_bind']
8738
8739 return k_param
8740
8741 @staticmethod
8742 def _process_net_data(tcp_file: str, protocol: str = 'tcp') -> List[int]:
8743 listening_ports = []
8744 # Connections state documentation
8745 # tcp - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/net/tcp_states.h
8746 # udp - uses 07 (TCP_CLOSE or UNCONN, since udp is stateless. test with netcat -ul <port>)
8747 listening_state = {
8748 'tcp': '0A',
8749 'udp': '07'
8750 }
8751
8752 if protocol not in listening_state.keys():
8753 return []
8754
8755 if os.path.exists(tcp_file):
8756 with open(tcp_file) as f:
8757 tcp_data = f.readlines()[1:]
8758
8759 for con in tcp_data:
8760 con_info = con.strip().split()
8761 if con_info[3] == listening_state[protocol]:
8762 local_port = int(con_info[1].split(':')[1], 16)
8763 listening_ports.append(local_port)
8764
8765 return listening_ports
8766
8767 @property
8768 def tcp_ports_used(self) -> List[int]:
8769 return HostFacts._process_net_data('/proc/net/tcp')
8770
8771 @property
8772 def tcp6_ports_used(self) -> List[int]:
8773 return HostFacts._process_net_data('/proc/net/tcp6')
8774
8775 @property
8776 def udp_ports_used(self) -> List[int]:
8777 return HostFacts._process_net_data('/proc/net/udp', 'udp')
8778
8779 @property
8780 def udp6_ports_used(self) -> List[int]:
8781 return HostFacts._process_net_data('/proc/net/udp6', 'udp')
8782
8783 def dump(self):
8784 # type: () -> str
8785 """Return the attributes of this HostFacts object as json"""
8786 data = {
8787 k: getattr(self, k) for k in dir(self)
8788 if not k.startswith('_')
8789 and isinstance(getattr(self, k), (float, int, str, list, dict, tuple))
8790 }
8791 return json.dumps(data, indent=2, sort_keys=True)
8792
8793 ##################################
8794
8795
def command_gather_facts(ctx: CephadmContext) -> None:
    """Collect host-related metadata and print it as JSON to stdout."""
    print(HostFacts(ctx).dump())
8800
8801
8802 ##################################
8803
8804
def systemd_target_state(ctx: CephadmContext, target_name: str, subsystem: str = 'ceph') -> bool:
    """Report whether `target_name` is enabled under <subsystem>.target.

    systemd represents an enabled unit as a symlink inside the target's
    .wants directory, so a simple existence check suffices.
    """
    # TODO: UNITTEST
    wants_dir = os.path.join(ctx.unit_dir, f'{subsystem}.target.wants')
    return os.path.exists(os.path.join(wants_dir, target_name))
8814
8815
def target_exists(ctx: CephadmContext) -> bool:
    """Return True when the global ceph.target unit file is present."""
    ceph_target = os.path.join(ctx.unit_dir, 'ceph.target')
    return os.path.exists(ceph_target)
8818
8819
@infer_fsid
def command_maintenance(ctx: CephadmContext) -> str:
    """Enter or exit host maintenance mode for a given cluster.

    Maintenance is implemented by disabling+stopping (enter) or
    enabling+starting (exit) the cluster's ceph-<fsid>.target systemd unit.
    Returns a human readable status string prefixed with 'success',
    'failed' or 'skipped'.
    NOTE(review): callers appear to parse these strings — presumably the
    mgr/cephadm module matches on the prefix; confirm before altering any
    return value.
    """
    if not ctx.fsid:
        raise Error('failed - must pass --fsid to specify cluster')

    target = f'ceph-{ctx.fsid}.target'

    if ctx.maintenance_action.lower() == 'enter':
        logger.info('Requested to place host into maintenance')
        if systemd_target_state(ctx, target):
            # target is currently enabled: disable it, then stop it
            _out, _err, code = call(ctx,
                                    ['systemctl', 'disable', target],
                                    verbosity=CallVerbosity.DEBUG)
            if code:
                logger.error(f'Failed to disable the {target} target')
                return 'failed - to disable the target'
            else:
                # stopping a target waits by default
                _out, _err, code = call(ctx,
                                        ['systemctl', 'stop', target],
                                        verbosity=CallVerbosity.DEBUG)
                if code:
                    logger.error(f'Failed to stop the {target} target')
                    # NOTE(review): message says 'disable' although the stop
                    # failed — kept as-is since callers may match this string
                    return 'failed - to disable the target'
                else:
                    return f'success - systemd target {target} disabled'

        else:
            return 'skipped - target already disabled'

    else:
        logger.info('Requested to exit maintenance state')
        # if we've never deployed a daemon on this host there will be no systemd
        # target to disable so attempting a disable will fail. We still need to
        # return success here or host will be permanently stuck in maintenance mode
        # as no daemons can be deployed so no systemd target will ever exist to disable.
        if not target_exists(ctx):
            return 'skipped - systemd target not present on this host. Host removed from maintenance mode.'
        # exit maintenance request
        if not systemd_target_state(ctx, target):
            _out, _err, code = call(ctx,
                                    ['systemctl', 'enable', target],
                                    verbosity=CallVerbosity.DEBUG)
            if code:
                logger.error(f'Failed to enable the {target} target')
                return 'failed - unable to enable the target'
            else:
                # starting a target waits by default
                _out, _err, code = call(ctx,
                                        ['systemctl', 'start', target],
                                        verbosity=CallVerbosity.DEBUG)
                if code:
                    logger.error(f'Failed to start the {target} target')
                    return 'failed - unable to start the target'
                else:
                    return f'success - systemd target {target} enabled and started'
        # target was already enabled: report the same success string
        return f'success - systemd target {target} enabled and started'
8877
8878 ##################################
8879
8880
8881 def _get_parser():
8882 # type: () -> argparse.ArgumentParser
8883 parser = argparse.ArgumentParser(
8884 description='Bootstrap Ceph daemons with systemd and containers.',
8885 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
8886 parser.add_argument(
8887 '--image',
8888 help='container image. Can also be set via the "CEPHADM_IMAGE" '
8889 'env var')
8890 parser.add_argument(
8891 '--docker',
8892 action='store_true',
8893 help='use docker instead of podman')
8894 parser.add_argument(
8895 '--data-dir',
8896 default=DATA_DIR,
8897 help='base directory for daemon data')
8898 parser.add_argument(
8899 '--log-dir',
8900 default=LOG_DIR,
8901 help='base directory for daemon logs')
8902 parser.add_argument(
8903 '--logrotate-dir',
8904 default=LOGROTATE_DIR,
8905 help='location of logrotate configuration files')
8906 parser.add_argument(
8907 '--sysctl-dir',
8908 default=SYSCTL_DIR,
8909 help='location of sysctl configuration files')
8910 parser.add_argument(
8911 '--unit-dir',
8912 default=UNIT_DIR,
8913 help='base directory for systemd units')
8914 parser.add_argument(
8915 '--verbose', '-v',
8916 action='store_true',
8917 help='Show debug-level log messages')
8918 parser.add_argument(
8919 '--timeout',
8920 type=int,
8921 default=DEFAULT_TIMEOUT,
8922 help='timeout in seconds')
8923 parser.add_argument(
8924 '--retry',
8925 type=int,
8926 default=DEFAULT_RETRY,
8927 help='max number of retries')
8928 parser.add_argument(
8929 '--env', '-e',
8930 action='append',
8931 default=[],
8932 help='set environment variable')
8933 parser.add_argument(
8934 '--no-container-init',
8935 action='store_true',
8936 default=not CONTAINER_INIT,
8937 help='Do not run podman/docker with `--init`')
8938 parser.add_argument(
8939 '--no-cgroups-split',
8940 action='store_true',
8941 default=False,
8942 help='Do not run containers with --cgroups=split (currently only relevant when using podman)')
8943
8944 subparsers = parser.add_subparsers(help='sub-command')
8945
8946 parser_version = subparsers.add_parser(
8947 'version', help='get ceph version from container')
8948 parser_version.set_defaults(func=command_version)
8949
8950 parser_pull = subparsers.add_parser(
8951 'pull', help='pull the default container image')
8952 parser_pull.set_defaults(func=command_pull)
8953 parser_pull.add_argument(
8954 '--insecure',
8955 action='store_true',
8956 help=argparse.SUPPRESS,
8957 )
8958
8959 parser_inspect_image = subparsers.add_parser(
8960 'inspect-image', help='inspect local container image')
8961 parser_inspect_image.set_defaults(func=command_inspect_image)
8962
8963 parser_ls = subparsers.add_parser(
8964 'ls', help='list daemon instances on this host')
8965 parser_ls.set_defaults(func=command_ls)
8966 parser_ls.add_argument(
8967 '--no-detail',
8968 action='store_true',
8969 help='Do not include daemon status')
8970 parser_ls.add_argument(
8971 '--legacy-dir',
8972 default='/',
8973 help='base directory for legacy daemon data')
8974
8975 parser_list_networks = subparsers.add_parser(
8976 'list-networks', help='list IP networks')
8977 parser_list_networks.set_defaults(func=command_list_networks)
8978
8979 parser_adopt = subparsers.add_parser(
8980 'adopt', help='adopt daemon deployed with a different tool')
8981 parser_adopt.set_defaults(func=command_adopt)
8982 parser_adopt.add_argument(
8983 '--name', '-n',
8984 required=True,
8985 help='daemon name (type.id)')
8986 parser_adopt.add_argument(
8987 '--style',
8988 required=True,
8989 help='deployment style (legacy, ...)')
8990 parser_adopt.add_argument(
8991 '--cluster',
8992 default='ceph',
8993 help='cluster name')
8994 parser_adopt.add_argument(
8995 '--legacy-dir',
8996 default='/',
8997 help='base directory for legacy daemon data')
8998 parser_adopt.add_argument(
8999 '--config-json',
9000 help='Additional configuration information in JSON format')
9001 parser_adopt.add_argument(
9002 '--skip-firewalld',
9003 action='store_true',
9004 help='Do not configure firewalld')
9005 parser_adopt.add_argument(
9006 '--skip-pull',
9007 action='store_true',
9008 help='do not pull the default image before adopting')
9009 parser_adopt.add_argument(
9010 '--force-start',
9011 action='store_true',
9012 help='start newly adoped daemon, even if it was not running previously')
9013 parser_adopt.add_argument(
9014 '--container-init',
9015 action='store_true',
9016 default=CONTAINER_INIT,
9017 help=argparse.SUPPRESS)
9018
9019 parser_rm_daemon = subparsers.add_parser(
9020 'rm-daemon', help='remove daemon instance')
9021 parser_rm_daemon.set_defaults(func=command_rm_daemon)
9022 parser_rm_daemon.add_argument(
9023 '--name', '-n',
9024 required=True,
9025 action=CustomValidation,
9026 help='daemon name (type.id)')
9027 parser_rm_daemon.add_argument(
9028 '--tcp-ports',
9029 help='List of tcp ports to close in the host firewall')
9030 parser_rm_daemon.add_argument(
9031 '--fsid',
9032 required=True,
9033 help='cluster FSID')
9034 parser_rm_daemon.add_argument(
9035 '--force',
9036 action='store_true',
9037 help='proceed, even though this may destroy valuable data')
9038 parser_rm_daemon.add_argument(
9039 '--force-delete-data',
9040 action='store_true',
9041 help='delete valuable daemon data instead of making a backup')
9042
9043 parser_rm_cluster = subparsers.add_parser(
9044 'rm-cluster', help='remove all daemons for a cluster')
9045 parser_rm_cluster.set_defaults(func=command_rm_cluster)
9046 parser_rm_cluster.add_argument(
9047 '--fsid',
9048 required=True,
9049 help='cluster FSID')
9050 parser_rm_cluster.add_argument(
9051 '--force',
9052 action='store_true',
9053 help='proceed, even though this may destroy valuable data')
9054 parser_rm_cluster.add_argument(
9055 '--keep-logs',
9056 action='store_true',
9057 help='do not remove log files')
9058 parser_rm_cluster.add_argument(
9059 '--zap-osds',
9060 action='store_true',
9061 help='zap OSD devices for this cluster')
9062
9063 parser_run = subparsers.add_parser(
9064 'run', help='run a ceph daemon, in a container, in the foreground')
9065 parser_run.set_defaults(func=command_run)
9066 parser_run.add_argument(
9067 '--name', '-n',
9068 required=True,
9069 help='daemon name (type.id)')
9070 parser_run.add_argument(
9071 '--fsid',
9072 required=True,
9073 help='cluster FSID')
9074
9075 parser_shell = subparsers.add_parser(
9076 'shell', help='run an interactive shell inside a daemon container')
9077 parser_shell.set_defaults(func=command_shell)
9078 parser_shell.add_argument(
9079 '--shared_ceph_folder',
9080 metavar='CEPH_SOURCE_FOLDER',
9081 help='Development mode. Several folders in containers are volumes mapped to different sub-folders in the ceph source folder')
9082 parser_shell.add_argument(
9083 '--fsid',
9084 help='cluster FSID')
9085 parser_shell.add_argument(
9086 '--name', '-n',
9087 help='daemon name (type.id)')
9088 parser_shell.add_argument(
9089 '--config', '-c',
9090 help='ceph.conf to pass through to the container')
9091 parser_shell.add_argument(
9092 '--keyring', '-k',
9093 help='ceph.keyring to pass through to the container')
9094 parser_shell.add_argument(
9095 '--mount', '-m',
9096 help=('mount a file or directory in the container. '
9097 'Support multiple mounts. '
9098 'ie: `--mount /foo /bar:/bar`. '
9099 'When no destination is passed, default is /mnt'),
9100 nargs='+')
9101 parser_shell.add_argument(
9102 '--env', '-e',
9103 action='append',
9104 default=[],
9105 help='set environment variable')
9106 parser_shell.add_argument(
9107 '--volume', '-v',
9108 action='append',
9109 default=[],
9110 help='set environment variable')
9111 parser_shell.add_argument(
9112 'command', nargs=argparse.REMAINDER,
9113 help='command (optional)')
9114 parser_shell.add_argument(
9115 '--no-hosts',
9116 action='store_true',
9117 help='dont pass /etc/hosts through to the container')
9118
9119 parser_enter = subparsers.add_parser(
9120 'enter', help='run an interactive shell inside a running daemon container')
9121 parser_enter.set_defaults(func=command_enter)
9122 parser_enter.add_argument(
9123 '--fsid',
9124 help='cluster FSID')
9125 parser_enter.add_argument(
9126 '--name', '-n',
9127 required=True,
9128 help='daemon name (type.id)')
9129 parser_enter.add_argument(
9130 'command', nargs=argparse.REMAINDER,
9131 help='command')
9132
9133 parser_ceph_volume = subparsers.add_parser(
9134 'ceph-volume', help='run ceph-volume inside a container')
9135 parser_ceph_volume.set_defaults(func=command_ceph_volume)
9136 parser_ceph_volume.add_argument(
9137 '--shared_ceph_folder',
9138 metavar='CEPH_SOURCE_FOLDER',
9139 help='Development mode. Several folders in containers are volumes mapped to different sub-folders in the ceph source folder')
9140 parser_ceph_volume.add_argument(
9141 '--fsid',
9142 help='cluster FSID')
9143 parser_ceph_volume.add_argument(
9144 '--config-json',
9145 help='JSON file with config and (client.bootstrap-osd) key')
9146 parser_ceph_volume.add_argument(
9147 '--config', '-c',
9148 help='ceph conf file')
9149 parser_ceph_volume.add_argument(
9150 '--keyring', '-k',
9151 help='ceph.keyring to pass through to the container')
9152 parser_ceph_volume.add_argument(
9153 'command', nargs=argparse.REMAINDER,
9154 help='command')
9155
9156 parser_zap_osds = subparsers.add_parser(
9157 'zap-osds', help='zap all OSDs associated with a particular fsid')
9158 parser_zap_osds.set_defaults(func=command_zap_osds)
9159 parser_zap_osds.add_argument(
9160 '--fsid',
9161 required=True,
9162 help='cluster FSID')
9163 parser_zap_osds.add_argument(
9164 '--force',
9165 action='store_true',
9166 help='proceed, even though this may destroy valuable data')
9167
9168 parser_unit = subparsers.add_parser(
9169 'unit', help="operate on the daemon's systemd unit")
9170 parser_unit.set_defaults(func=command_unit)
9171 parser_unit.add_argument(
9172 'command',
9173 help='systemd command (start, stop, restart, enable, disable, ...)')
9174 parser_unit.add_argument(
9175 '--fsid',
9176 help='cluster FSID')
9177 parser_unit.add_argument(
9178 '--name', '-n',
9179 required=True,
9180 help='daemon name (type.id)')
9181
9182 parser_logs = subparsers.add_parser(
9183 'logs', help='print journald logs for a daemon container')
9184 parser_logs.set_defaults(func=command_logs)
9185 parser_logs.add_argument(
9186 '--fsid',
9187 help='cluster FSID')
9188 parser_logs.add_argument(
9189 '--name', '-n',
9190 required=True,
9191 help='daemon name (type.id)')
9192 parser_logs.add_argument(
9193 'command', nargs='*',
9194 help='additional journalctl args')
9195
9196 parser_bootstrap = subparsers.add_parser(
9197 'bootstrap', help='bootstrap a cluster (mon + mgr daemons)')
9198 parser_bootstrap.set_defaults(func=command_bootstrap)
9199 parser_bootstrap.add_argument(
9200 '--config', '-c',
9201 help='ceph conf file to incorporate')
9202 parser_bootstrap.add_argument(
9203 '--mon-id',
9204 required=False,
9205 help='mon id (default: local hostname)')
9206 group = parser_bootstrap.add_mutually_exclusive_group()
9207 group.add_argument(
9208 '--mon-addrv',
9209 help='mon IPs (e.g., [v2:localipaddr:3300,v1:localipaddr:6789])')
9210 group.add_argument(
9211 '--mon-ip',
9212 help='mon IP')
9213 parser_bootstrap.add_argument(
9214 '--mgr-id',
9215 required=False,
9216 help='mgr id (default: randomly generated)')
9217 parser_bootstrap.add_argument(
9218 '--fsid',
9219 help='cluster FSID')
9220 parser_bootstrap.add_argument(
9221 '--output-dir',
9222 default='/etc/ceph',
9223 help='directory to write config, keyring, and pub key files')
9224 parser_bootstrap.add_argument(
9225 '--output-keyring',
9226 help='location to write keyring file with new cluster admin and mon keys')
9227 parser_bootstrap.add_argument(
9228 '--output-config',
9229 help='location to write conf file to connect to new cluster')
9230 parser_bootstrap.add_argument(
9231 '--output-pub-ssh-key',
9232 help="location to write the cluster's public SSH key")
9233 parser_bootstrap.add_argument(
9234 '--skip-admin-label',
9235 action='store_true',
9236 help='do not create admin label for ceph.conf and client.admin keyring distribution')
9237 parser_bootstrap.add_argument(
9238 '--skip-ssh',
9239 action='store_true',
9240 help='skip setup of ssh key on local host')
9241 parser_bootstrap.add_argument(
9242 '--initial-dashboard-user',
9243 default='admin',
9244 help='Initial user for the dashboard')
9245 parser_bootstrap.add_argument(
9246 '--initial-dashboard-password',
9247 help='Initial password for the initial dashboard user')
9248 parser_bootstrap.add_argument(
9249 '--ssl-dashboard-port',
9250 type=int,
9251 default=8443,
9252 help='Port number used to connect with dashboard using SSL')
9253 parser_bootstrap.add_argument(
9254 '--dashboard-key',
9255 type=argparse.FileType('r'),
9256 help='Dashboard key')
9257 parser_bootstrap.add_argument(
9258 '--dashboard-crt',
9259 type=argparse.FileType('r'),
9260 help='Dashboard certificate')
9261
9262 parser_bootstrap.add_argument(
9263 '--ssh-config',
9264 type=argparse.FileType('r'),
9265 help='SSH config')
9266 parser_bootstrap.add_argument(
9267 '--ssh-private-key',
9268 type=argparse.FileType('r'),
9269 help='SSH private key')
9270 parser_bootstrap.add_argument(
9271 '--ssh-public-key',
9272 type=argparse.FileType('r'),
9273 help='SSH public key')
9274 parser_bootstrap.add_argument(
9275 '--ssh-user',
9276 default='root',
9277 help='set user for SSHing to cluster hosts, passwordless sudo will be needed for non-root users')
9278 parser_bootstrap.add_argument(
9279 '--skip-mon-network',
9280 action='store_true',
9281 help='set mon public_network based on bootstrap mon ip')
9282 parser_bootstrap.add_argument(
9283 '--skip-dashboard',
9284 action='store_true',
9285 help='do not enable the Ceph Dashboard')
9286 parser_bootstrap.add_argument(
9287 '--dashboard-password-noupdate',
9288 action='store_true',
9289 help='stop forced dashboard password change')
9290 parser_bootstrap.add_argument(
9291 '--no-minimize-config',
9292 action='store_true',
9293 help='do not assimilate and minimize the config file')
9294 parser_bootstrap.add_argument(
9295 '--skip-ping-check',
9296 action='store_true',
9297 help='do not verify that mon IP is pingable')
9298 parser_bootstrap.add_argument(
9299 '--skip-pull',
9300 action='store_true',
9301 help='do not pull the default image before bootstrapping')
9302 parser_bootstrap.add_argument(
9303 '--skip-firewalld',
9304 action='store_true',
9305 help='Do not configure firewalld')
9306 parser_bootstrap.add_argument(
9307 '--allow-overwrite',
9308 action='store_true',
9309 help='allow overwrite of existing --output-* config/keyring/ssh files')
9310 parser_bootstrap.add_argument(
9311 '--allow-fqdn-hostname',
9312 action='store_true',
9313 help='allow hostname that is fully-qualified (contains ".")')
9314 parser_bootstrap.add_argument(
9315 '--allow-mismatched-release',
9316 action='store_true',
9317 help="allow bootstrap of ceph that doesn't match this version of cephadm")
9318 parser_bootstrap.add_argument(
9319 '--skip-prepare-host',
9320 action='store_true',
9321 help='Do not prepare host')
9322 parser_bootstrap.add_argument(
9323 '--orphan-initial-daemons',
9324 action='store_true',
9325 help='Set mon and mgr service to `unmanaged`, Do not create the crash service')
9326 parser_bootstrap.add_argument(
9327 '--skip-monitoring-stack',
9328 action='store_true',
9329 help='Do not automatically provision monitoring stack (prometheus, grafana, alertmanager, node-exporter)')
9330 parser_bootstrap.add_argument(
9331 '--with-centralized-logging',
9332 action='store_true',
9333 help='Automatically provision centralized logging (promtail, loki)')
9334 parser_bootstrap.add_argument(
9335 '--apply-spec',
9336 help='Apply cluster spec after bootstrap (copy ssh key, add hosts and apply services)')
9337 parser_bootstrap.add_argument(
9338 '--shared_ceph_folder',
9339 metavar='CEPH_SOURCE_FOLDER',
9340 help='Development mode. Several folders in containers are volumes mapped to different sub-folders in the ceph source folder')
9341
9342 parser_bootstrap.add_argument(
9343 '--registry-url',
9344 help='url for custom registry')
9345 parser_bootstrap.add_argument(
9346 '--registry-username',
9347 help='username for custom registry')
9348 parser_bootstrap.add_argument(
9349 '--registry-password',
9350 help='password for custom registry')
9351 parser_bootstrap.add_argument(
9352 '--registry-json',
9353 help='json file with custom registry login info (URL, Username, Password)')
9354 parser_bootstrap.add_argument(
9355 '--container-init',
9356 action='store_true',
9357 default=CONTAINER_INIT,
9358 help=argparse.SUPPRESS)
9359 parser_bootstrap.add_argument(
9360 '--cluster-network',
9361 help='subnet to use for cluster replication, recovery and heartbeats (in CIDR notation network/mask)')
9362 parser_bootstrap.add_argument(
9363 '--single-host-defaults',
9364 action='store_true',
9365 help='adjust configuration defaults to suit a single-host cluster')
9366 parser_bootstrap.add_argument(
9367 '--log-to-file',
9368 action='store_true',
9369 help='configure cluster to log to traditional log files in /var/log/ceph/$fsid')
9370
9371 parser_deploy = subparsers.add_parser(
9372 'deploy', help='deploy a daemon')
9373 parser_deploy.set_defaults(func=command_deploy)
9374 parser_deploy.add_argument(
9375 '--name',
9376 required=True,
9377 action=CustomValidation,
9378 help='daemon name (type.id)')
9379 parser_deploy.add_argument(
9380 '--fsid',
9381 required=True,
9382 help='cluster FSID')
9383 parser_deploy.add_argument(
9384 '--config', '-c',
9385 help='config file for new daemon')
9386 parser_deploy.add_argument(
9387 '--config-json',
9388 help='Additional configuration information in JSON format')
9389 parser_deploy.add_argument(
9390 '--keyring',
9391 help='keyring for new daemon')
9392 parser_deploy.add_argument(
9393 '--key',
9394 help='key for new daemon')
9395 parser_deploy.add_argument(
9396 '--osd-fsid',
9397 help='OSD uuid, if creating an OSD container')
9398 parser_deploy.add_argument(
9399 '--skip-firewalld',
9400 action='store_true',
9401 help='Do not configure firewalld')
9402 parser_deploy.add_argument(
9403 '--tcp-ports',
9404 help='List of tcp ports to open in the host firewall')
9405 parser_deploy.add_argument(
9406 '--reconfig',
9407 action='store_true',
9408 help='Reconfigure a previously deployed daemon')
9409 parser_deploy.add_argument(
9410 '--allow-ptrace',
9411 action='store_true',
9412 help='Allow SYS_PTRACE on daemon container')
9413 parser_deploy.add_argument(
9414 '--container-init',
9415 action='store_true',
9416 default=CONTAINER_INIT,
9417 help=argparse.SUPPRESS)
9418 parser_deploy.add_argument(
9419 '--memory-request',
9420 help='Container memory request/target'
9421 )
9422 parser_deploy.add_argument(
9423 '--memory-limit',
9424 help='Container memory hard limit'
9425 )
9426 parser_deploy.add_argument(
9427 '--meta-json',
9428 help='JSON dict of additional metadata'
9429 )
9430 parser_deploy.add_argument(
9431 '--extra-container-args',
9432 action='append',
9433 default=[],
9434 help='Additional container arguments to apply to deamon'
9435 )
9436 parser_deploy.add_argument(
9437 '--extra-entrypoint-args',
9438 action='append',
9439 default=[],
9440 help='Additional entrypoint arguments to apply to deamon'
9441 )
9442
9443 parser_check_host = subparsers.add_parser(
9444 'check-host', help='check host configuration')
9445 parser_check_host.set_defaults(func=command_check_host)
9446 parser_check_host.add_argument(
9447 '--expect-hostname',
9448 help='Check that hostname matches an expected value')
9449
9450 parser_prepare_host = subparsers.add_parser(
9451 'prepare-host', help='prepare a host for cephadm use')
9452 parser_prepare_host.set_defaults(func=command_prepare_host)
9453 parser_prepare_host.add_argument(
9454 '--expect-hostname',
9455 help='Set hostname')
9456
9457 parser_add_repo = subparsers.add_parser(
9458 'add-repo', help='configure package repository')
9459 parser_add_repo.set_defaults(func=command_add_repo)
9460 parser_add_repo.add_argument(
9461 '--release',
9462 help='use latest version of a named release (e.g., {})'.format(LATEST_STABLE_RELEASE))
9463 parser_add_repo.add_argument(
9464 '--version',
9465 help='use specific upstream version (x.y.z)')
9466 parser_add_repo.add_argument(
9467 '--dev',
9468 help='use specified bleeding edge build from git branch or tag')
9469 parser_add_repo.add_argument(
9470 '--dev-commit',
9471 help='use specified bleeding edge build from git commit')
9472 parser_add_repo.add_argument(
9473 '--gpg-url',
9474 help='specify alternative GPG key location')
9475 parser_add_repo.add_argument(
9476 '--repo-url',
9477 default='https://download.ceph.com',
9478 help='specify alternative repo location')
9479 # TODO: proxy?
9480
9481 parser_rm_repo = subparsers.add_parser(
9482 'rm-repo', help='remove package repository configuration')
9483 parser_rm_repo.set_defaults(func=command_rm_repo)
9484
9485 parser_install = subparsers.add_parser(
9486 'install', help='install ceph package(s)')
9487 parser_install.set_defaults(func=command_install)
9488 parser_install.add_argument(
9489 'packages', nargs='*',
9490 default=['cephadm'],
9491 help='packages')
9492
9493 parser_registry_login = subparsers.add_parser(
9494 'registry-login', help='log host into authenticated registry')
9495 parser_registry_login.set_defaults(func=command_registry_login)
9496 parser_registry_login.add_argument(
9497 '--registry-url',
9498 help='url for custom registry')
9499 parser_registry_login.add_argument(
9500 '--registry-username',
9501 help='username for custom registry')
9502 parser_registry_login.add_argument(
9503 '--registry-password',
9504 help='password for custom registry')
9505 parser_registry_login.add_argument(
9506 '--registry-json',
9507 help='json file with custom registry login info (URL, Username, Password)')
9508 parser_registry_login.add_argument(
9509 '--fsid',
9510 help='cluster FSID')
9511
9512 parser_gather_facts = subparsers.add_parser(
9513 'gather-facts', help='gather and return host related information (JSON format)')
9514 parser_gather_facts.set_defaults(func=command_gather_facts)
9515
9516 parser_maintenance = subparsers.add_parser(
9517 'host-maintenance', help='Manage the maintenance state of a host')
9518 parser_maintenance.add_argument(
9519 '--fsid',
9520 help='cluster FSID')
9521 parser_maintenance.add_argument(
9522 'maintenance_action',
9523 type=str,
9524 choices=['enter', 'exit'],
9525 help='Maintenance action - enter maintenance, or exit maintenance')
9526 parser_maintenance.set_defaults(func=command_maintenance)
9527
9528 parser_agent = subparsers.add_parser(
9529 'agent', help='start cephadm agent')
9530 parser_agent.set_defaults(func=command_agent)
9531 parser_agent.add_argument(
9532 '--fsid',
9533 required=True,
9534 help='cluster FSID')
9535 parser_agent.add_argument(
9536 '--daemon-id',
9537 help='daemon id for agent')
9538
9539 parser_disk_rescan = subparsers.add_parser(
9540 'disk-rescan', help='rescan all HBAs to detect new/removed devices')
9541 parser_disk_rescan.set_defaults(func=command_rescan_disks)
9542
9543 return parser
9544
9545
def _parse_args(av: List[str]) -> argparse.Namespace:
    """Parse the cephadm command line into an argparse Namespace.

    Strips a leading ``--`` separator from a captured ``command`` list and
    reconciles the deprecated ``--container-init`` flag with
    ``--no-container-init`` so the two stay mutually exclusive.
    """
    parser = _get_parser()
    args = parser.parse_args(av)

    # Drop a leading '--' that argparse may capture into the command list.
    command = getattr(args, 'command', None)
    if command and command[0] == '--':
        command.pop(0)

    # workaround argparse to deprecate the subparser `--container-init` flag
    # container_init and no_container_init must always be mutually exclusive
    init_flag, no_init_flag = '--container-init', '--no-container-init'
    if init_flag in av and no_init_flag in av:
        parser.error('argument %s: not allowed with argument %s'
                     % (init_flag, no_init_flag))
    if init_flag in av:
        args.no_container_init = not args.container_init
    else:
        args.container_init = not args.no_container_init
    assert args.container_init is not args.no_container_init

    return args
9565
9566
def cephadm_init_ctx(args: List[str]) -> CephadmContext:
    """Build a CephadmContext populated from the given argument list."""
    context = CephadmContext()
    context.set_args(_parse_args(args))
    return context
9571
9572
def cephadm_init_logging(ctx: CephadmContext, args: List[str]) -> None:
    """Configure the logging for cephadm as well as updating the system
    to have the expected log dir and logrotate configuration.

    :param ctx: cephadm context (provides ``logrotate_dir`` and ``verbose``)
    :param args: the raw CLI argument list, used to pick a logging profile
                 and echoed into the debug log
    """
    # Register the custom QUIET level so handlers/configs can refer to it.
    logging.addLevelName(QUIET_LOG_LEVEL, 'QUIET')
    global logger
    if not os.path.exists(LOG_DIR):
        os.makedirs(LOG_DIR)
    # bootstrap/rm-cluster are interactive commands: use the config that
    # favors console-friendly output; everything else gets the default.
    operations = ['bootstrap', 'rm-cluster']
    if any(op in args for op in operations):
        dictConfig(interactive_logging_config)
    else:
        dictConfig(logging_config)

    # Rebind the module-level root logger after dictConfig replaced handlers.
    logger = logging.getLogger()
    logger.setLevel(QUIET_LOG_LEVEL)

    # Install a logrotate policy for cephadm.log once; never overwrite an
    # existing (possibly admin-edited) file.
    if not os.path.exists(ctx.logrotate_dir + '/cephadm'):
        with open(ctx.logrotate_dir + '/cephadm', 'w') as f:
            f.write("""# created by cephadm
/var/log/ceph/cephadm.log {
    rotate 7
    daily
    compress
    missingok
    notifempty
    su root root
}
""")

    # With --verbose, lower the known handlers to the QUIET level so all
    # messages reach the console and log file.
    if ctx.verbose:
        for handler in logger.handlers:
            if handler.name in ['console', 'log_file', 'console_stdout']:
                handler.setLevel(QUIET_LOG_LEVEL)
    logger.debug('%s\ncephadm %s' % ('-' * 80, args))
9608
9609
def cephadm_require_root() -> None:
    """Exit if the process is not running as root."""
    if os.geteuid() == 0:
        return
    sys.stderr.write('ERROR: cephadm should be run as root\n')
    sys.exit(1)
9615
9616
def main() -> None:
    """Entry point: parse argv, set up logging, and dispatch the command."""
    av: List[str] = sys.argv[1:]

    ctx = cephadm_init_ctx(av)
    if not ctx.has_function():
        sys.stderr.write('No command specified; pass -h or --help for usage\n')
        sys.exit(1)

    cephadm_require_root()
    cephadm_init_logging(ctx, av)

    # These commands manage host packaging/repos and can run without a
    # working container engine; every other command requires one.
    engine_optional = (
        command_check_host,
        command_prepare_host,
        command_add_repo,
        command_rm_repo,
        command_install,
    )
    try:
        # podman or docker?
        ctx.container_engine = find_container_engine(ctx)
        if ctx.func not in engine_optional:
            check_container_engine(ctx)
        # command handler
        r = ctx.func(ctx)
    except Error as e:
        if ctx.verbose:
            raise
        logger.error('ERROR: %s' % e)
        sys.exit(1)
    sys.exit(r or 0)
9650
9651
if __name__ == '__main__':
    # Run the cephadm CLI when invoked directly as a script.
    main()