#!/usr/bin/python3

import asyncio
import asyncio.subprocess
import argparse
import datetime
import fcntl
import ipaddress
import io
import json
import logging
from logging.config import dictConfig
import os
import platform
import pwd
import random
import shlex
import shutil
import socket
import string
import subprocess
import sys
import tempfile
import time
import errno
import struct
import ssl
from enum import Enum
from typing import Dict, List, Tuple, Optional, Union, Any, NoReturn, Callable, IO, Sequence, TypeVar, cast, Set, Iterable

import re
import uuid

from configparser import ConfigParser
from contextlib import redirect_stdout
from functools import wraps
from glob import glob
from io import StringIO
from threading import Thread, Event
from urllib.error import HTTPError, URLError
from urllib.request import urlopen, Request
from pathlib import Path

FuncT = TypeVar('FuncT', bound=Callable)

# Default container images -----------------------------------------------------
DEFAULT_IMAGE = 'quay.io/ceph/ceph:v17'
DEFAULT_IMAGE_IS_MASTER = False
DEFAULT_IMAGE_RELEASE = 'quincy'
DEFAULT_PROMETHEUS_IMAGE = 'quay.io/prometheus/prometheus:v2.33.4'
DEFAULT_LOKI_IMAGE = 'docker.io/grafana/loki:2.4.0'
DEFAULT_PROMTAIL_IMAGE = 'docker.io/grafana/promtail:2.4.0'
DEFAULT_NODE_EXPORTER_IMAGE = 'quay.io/prometheus/node-exporter:v1.3.1'
DEFAULT_ALERT_MANAGER_IMAGE = 'quay.io/prometheus/alertmanager:v0.23.0'
DEFAULT_GRAFANA_IMAGE = 'quay.io/ceph/ceph-grafana:8.3.5'
DEFAULT_HAPROXY_IMAGE = 'quay.io/ceph/haproxy:2.3'
DEFAULT_KEEPALIVED_IMAGE = 'quay.io/ceph/keepalived:2.1.5'
DEFAULT_SNMP_GATEWAY_IMAGE = 'docker.io/maxwo/snmp-notifier:v1.2.1'
DEFAULT_REGISTRY = 'docker.io'   # normalize unqualified digests to this
# ------------------------------------------------------------------------------

LATEST_STABLE_RELEASE = 'quincy'
DATA_DIR = '/var/lib/ceph'
LOG_DIR = '/var/log/ceph'
LOCK_DIR = '/run/cephadm'
LOGROTATE_DIR = '/etc/logrotate.d'
SYSCTL_DIR = '/etc/sysctl.d'
UNIT_DIR = '/etc/systemd/system'
CEPH_CONF_DIR = 'config'
CEPH_CONF = 'ceph.conf'
CEPH_PUBKEY = 'ceph.pub'
CEPH_KEYRING = 'ceph.client.admin.keyring'
CEPH_DEFAULT_CONF = f'/etc/ceph/{CEPH_CONF}'
CEPH_DEFAULT_KEYRING = f'/etc/ceph/{CEPH_KEYRING}'
CEPH_DEFAULT_PUBKEY = f'/etc/ceph/{CEPH_PUBKEY}'
LOG_DIR_MODE = 0o770
DATA_DIR_MODE = 0o700
CONTAINER_INIT = True
MIN_PODMAN_VERSION = (2, 0, 2)
CGROUPS_SPLIT_PODMAN_VERSION = (2, 1, 0)
CUSTOM_PS1 = r'[ceph: \u@\h \W]\$ '
DEFAULT_TIMEOUT = None  # in seconds
DEFAULT_RETRY = 15
DATEFMT = '%Y-%m-%dT%H:%M:%S.%fZ'
QUIET_LOG_LEVEL = 9  # DEBUG is 10, so using 9 to be lower level than DEBUG

logger: logging.Logger = None  # type: ignore

"""
You can invoke cephadm in two ways:

1. The normal way, at the command line.

2. By piping the script to the python3 binary. In this latter case, you should
   prepend one or more lines to the beginning of the script.

   For arguments,

       injected_argv = [...]

   e.g.,

       injected_argv = ['ls']

   For reading stdin from the '--config-json -' argument,

       injected_stdin = '...'
"""
cached_stdin = None


##################################


async def run_func(func: Callable, cmd: str) -> subprocess.CompletedProcess:
    logger.debug(f'running function {func.__name__}, with params: {cmd}')
    response = func(cmd)
    return response


async def concurrent_tasks(func: Callable, cmd_list: List[str]) -> List[Any]:
    tasks = []
    for cmd in cmd_list:
        tasks.append(run_func(func, cmd))

    data = await asyncio.gather(*tasks)

    return data
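# A minimal usage sketch (illustrative; `check_unit` is a hypothetical
# callable here, and `async_run` is the event-loop wrapper defined later
# in this file):
#
#     results = async_run(concurrent_tasks(check_unit, ['cmd1', 'cmd2']))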


class EndPoint:
    """EndPoint representing an ip:port format"""

    def __init__(self, ip: str, port: int) -> None:
        self.ip = ip
        self.port = port

    def __str__(self) -> str:
        return f'{self.ip}:{self.port}'

    def __repr__(self) -> str:
        return f'{self.ip}:{self.port}'


class ContainerInfo:
    def __init__(self, container_id: str,
                 image_name: str,
                 image_id: str,
                 start: str,
                 version: str) -> None:
        self.container_id = container_id
        self.image_name = image_name
        self.image_id = image_id
        self.start = start
        self.version = version

    def __eq__(self, other: Any) -> bool:
        if not isinstance(other, ContainerInfo):
            return NotImplemented
        return (self.container_id == other.container_id
                and self.image_name == other.image_name
                and self.image_id == other.image_id
                and self.start == other.start
                and self.version == other.version)


class BaseConfig:

    def __init__(self) -> None:
        self.image: str = ''
        self.docker: bool = False
        self.data_dir: str = DATA_DIR
        self.log_dir: str = LOG_DIR
        self.logrotate_dir: str = LOGROTATE_DIR
        self.sysctl_dir: str = SYSCTL_DIR
        self.unit_dir: str = UNIT_DIR
        self.verbose: bool = False
        self.timeout: Optional[int] = DEFAULT_TIMEOUT
        self.retry: int = DEFAULT_RETRY
        self.env: List[str] = []
        self.memory_request: Optional[int] = None
        self.memory_limit: Optional[int] = None
        self.log_to_journald: Optional[bool] = None

        self.container_init: bool = CONTAINER_INIT
        self.container_engine: Optional[ContainerEngine] = None

    def set_from_args(self, args: argparse.Namespace) -> None:
        argdict: Dict[str, Any] = vars(args)
        for k, v in argdict.items():
            if hasattr(self, k):
                setattr(self, k, v)
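# A small illustrative example of set_from_args(): only attributes that
# already exist on BaseConfig are copied from the parsed arguments.
#
#     cfg = BaseConfig()
#     cfg.set_from_args(argparse.Namespace(verbose=True, unrelated=1))
#     assert cfg.verbose is True
#     assert not hasattr(cfg, 'unrelated')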


class CephadmContext:

    def __init__(self) -> None:
        self.__dict__['_args'] = None
        self.__dict__['_conf'] = BaseConfig()

    def set_args(self, args: argparse.Namespace) -> None:
        self._conf.set_from_args(args)
        self._args = args

    def has_function(self) -> bool:
        return 'func' in self._args

    def __contains__(self, name: str) -> bool:
        return hasattr(self, name)

    def __getattr__(self, name: str) -> Any:
        if '_conf' in self.__dict__ and hasattr(self._conf, name):
            return getattr(self._conf, name)
        elif '_args' in self.__dict__ and hasattr(self._args, name):
            return getattr(self._args, name)
        else:
            return super().__getattribute__(name)

    def __setattr__(self, name: str, value: Any) -> None:
        if hasattr(self._conf, name):
            setattr(self._conf, name, value)
        elif hasattr(self._args, name):
            setattr(self._args, name, value)
        else:
            super().__setattr__(name, value)
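# Attribute lookups on CephadmContext fall through to the BaseConfig first
# and then to the parsed argparse.Namespace (illustrative sketch):
#
#     ctx = CephadmContext()
#     ctx.set_args(argparse.Namespace(fsid='...'))
#     ctx.timeout   # resolved from BaseConfig (DEFAULT_TIMEOUT)
#     ctx.fsid      # resolved from the argparse.Namespace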


class ContainerEngine:
    def __init__(self) -> None:
        self.path = find_program(self.EXE)

    @classmethod
    @property
    def EXE(cls) -> str:
        raise NotImplementedError()

    def __str__(self) -> str:
        return f'{self.EXE} ({self.path})'


class Podman(ContainerEngine):
    EXE = 'podman'

    def __init__(self) -> None:
        super().__init__()
        self._version: Optional[Tuple[int, ...]] = None

    @property
    def version(self) -> Tuple[int, ...]:
        if self._version is None:
            raise RuntimeError('Please call `get_version` first')
        return self._version

    def get_version(self, ctx: CephadmContext) -> None:
        out, _, _ = call_throws(ctx, [self.path, 'version', '--format', '{{.Client.Version}}'], verbosity=CallVerbosity.QUIET)
        self._version = _parse_podman_version(out)

    def __str__(self) -> str:
        version = '.'.join(map(str, self.version))
        return f'{self.EXE} ({self.path}) version {version}'


class Docker(ContainerEngine):
    EXE = 'docker'


CONTAINER_PREFERENCE = (Podman, Docker)  # prefer podman to docker


# During normal cephadm operations (cephadm ls, gather-facts, etc.) we use:
# stdout: for JSON output only
# stderr: for error, debug, info, etc
logging_config = {
    'version': 1,
    'disable_existing_loggers': True,
    'formatters': {
        'cephadm': {
            'format': '%(asctime)s %(thread)x %(levelname)s %(message)s'
        },
    },
    'handlers': {
        'console': {
            'level': 'INFO',
            'class': 'logging.StreamHandler',
        },
        'log_file': {
            'level': 'DEBUG',
            'class': 'logging.handlers.WatchedFileHandler',
            'formatter': 'cephadm',
            'filename': '%s/cephadm.log' % LOG_DIR,
        }
    },
    'loggers': {
        '': {
            'level': 'DEBUG',
            'handlers': ['console', 'log_file'],
        }
    }
}


class ExcludeErrorsFilter(logging.Filter):
    def filter(self, record: logging.LogRecord) -> bool:
        """Only lets through log messages with log level below WARNING."""
        return record.levelno < logging.WARNING


# When cephadm is used as standard binary (bootstrap, rm-cluster, etc) we use:
# stdout: for debug and info
# stderr: for errors and warnings
interactive_logging_config = {
    'version': 1,
    'filters': {
        'exclude_errors': {
            '()': ExcludeErrorsFilter
        }
    },
    'disable_existing_loggers': True,
    'formatters': {
        'cephadm': {
            'format': '%(asctime)s %(thread)x %(levelname)s %(message)s'
        },
    },
    'handlers': {
        'console_stdout': {
            'level': 'INFO',
            'class': 'logging.StreamHandler',
            'filters': ['exclude_errors'],
            'stream': sys.stdout
        },
        'console_stderr': {
            'level': 'WARNING',
            'class': 'logging.StreamHandler',
            'stream': sys.stderr
        },
        'log_file': {
            'level': 'DEBUG',
            'class': 'logging.handlers.WatchedFileHandler',
            'formatter': 'cephadm',
            'filename': '%s/cephadm.log' % LOG_DIR,
        }
    },
    'loggers': {
        '': {
            'level': 'DEBUG',
            'handlers': ['console_stdout', 'console_stderr', 'log_file'],
        }
    }
}


class termcolor:
    yellow = '\033[93m'
    red = '\033[31m'
    end = '\033[0m'


class Error(Exception):
    pass


class TimeoutExpired(Error):
    pass


class UnauthorizedRegistryError(Error):
    pass

##################################


class Ceph(object):
    daemons = ('mon', 'mgr', 'osd', 'mds', 'rgw', 'rbd-mirror',
               'crash', 'cephfs-mirror')

##################################


class OSD(object):
    @staticmethod
    def get_sysctl_settings() -> List[str]:
        return [
            '# allow a large number of OSDs',
            'fs.aio-max-nr = 1048576',
            'kernel.pid_max = 4194304',
        ]


##################################


class SNMPGateway:
    """Defines an SNMP gateway between Prometheus and SNMP monitoring frameworks"""
    daemon_type = 'snmp-gateway'
    SUPPORTED_VERSIONS = ['V2c', 'V3']
    default_image = DEFAULT_SNMP_GATEWAY_IMAGE
    DEFAULT_PORT = 9464
    env_filename = 'snmp-gateway.conf'

    def __init__(self,
                 ctx: CephadmContext,
                 fsid: str,
                 daemon_id: Union[int, str],
                 config_json: Dict[str, Any],
                 image: Optional[str] = None) -> None:
        self.ctx = ctx
        self.fsid = fsid
        self.daemon_id = daemon_id
        self.image = image or SNMPGateway.default_image

        self.uid = config_json.get('uid', 0)
        self.gid = config_json.get('gid', 0)

        self.destination = config_json.get('destination', '')
        self.snmp_version = config_json.get('snmp_version', 'V2c')
        self.snmp_community = config_json.get('snmp_community', 'public')
        self.log_level = config_json.get('log_level', 'info')
        self.snmp_v3_auth_username = config_json.get('snmp_v3_auth_username', '')
        self.snmp_v3_auth_password = config_json.get('snmp_v3_auth_password', '')
        self.snmp_v3_auth_protocol = config_json.get('snmp_v3_auth_protocol', '')
        self.snmp_v3_priv_protocol = config_json.get('snmp_v3_priv_protocol', '')
        self.snmp_v3_priv_password = config_json.get('snmp_v3_priv_password', '')
        self.snmp_v3_engine_id = config_json.get('snmp_v3_engine_id', '')

        self.validate()

    @classmethod
    def init(cls, ctx: CephadmContext, fsid: str,
             daemon_id: Union[int, str]) -> 'SNMPGateway':
        assert ctx.config_json
        return cls(ctx, fsid, daemon_id,
                   get_parm(ctx.config_json), ctx.image)

    @staticmethod
    def get_version(ctx: CephadmContext, fsid: str, daemon_id: str) -> Optional[str]:
        """Return the version of the notifier from its http endpoint"""
        path = os.path.join(ctx.data_dir, fsid, f'snmp-gateway.{daemon_id}', 'unit.meta')
        try:
            with open(path, 'r') as env:
                metadata = json.loads(env.read())
        except (OSError, json.JSONDecodeError):
            return None

        ports = metadata.get('ports', [])
        if not ports:
            return None

        try:
            with urlopen(f'http://127.0.0.1:{ports[0]}/') as r:
                html = r.read().decode('utf-8').split('\n')
        except (HTTPError, URLError):
            return None

        for h in html:
            stripped = h.strip()
            if stripped.startswith(('<pre>', '<PRE>')) and \
               stripped.endswith(('</pre>', '</PRE>')):
                # <pre>(version=1.2.1, branch=HEAD, revision=7...
                return stripped.split(',')[0].split('version=')[1]

        return None

    @property
    def port(self) -> int:
        if not self.ctx.tcp_ports:
            return self.DEFAULT_PORT
        else:
            if len(self.ctx.tcp_ports) > 0:
                return int(self.ctx.tcp_ports.split()[0])
            else:
                return self.DEFAULT_PORT

    def get_daemon_args(self) -> List[str]:
        v3_args = []
        base_args = [
            f'--web.listen-address=:{self.port}',
            f'--snmp.destination={self.destination}',
            f'--snmp.version={self.snmp_version}',
            f'--log.level={self.log_level}',
            '--snmp.trap-description-template=/etc/snmp_notifier/description-template.tpl'
        ]

        if self.snmp_version == 'V3':
            # common auth settings
            v3_args.extend([
                '--snmp.authentication-enabled',
                f'--snmp.authentication-protocol={self.snmp_v3_auth_protocol}',
                f'--snmp.security-engine-id={self.snmp_v3_engine_id}'
            ])
            # authPriv setting is applied if we have a privacy protocol setting
            if self.snmp_v3_priv_protocol:
                v3_args.extend([
                    '--snmp.private-enabled',
                    f'--snmp.private-protocol={self.snmp_v3_priv_protocol}'
                ])

        return base_args + v3_args

    @property
    def data_dir(self) -> str:
        return os.path.join(self.ctx.data_dir, self.ctx.fsid, f'{self.daemon_type}.{self.daemon_id}')

    @property
    def conf_file_path(self) -> str:
        return os.path.join(self.data_dir, self.env_filename)

    def create_daemon_conf(self) -> None:
        """Creates the environment file holding 'secrets' passed to the snmp-notifier daemon"""
        with open(os.open(self.conf_file_path, os.O_CREAT | os.O_WRONLY, 0o600), 'w') as f:
            if self.snmp_version == 'V2c':
                f.write(f'SNMP_NOTIFIER_COMMUNITY={self.snmp_community}\n')
            else:
                f.write(f'SNMP_NOTIFIER_AUTH_USERNAME={self.snmp_v3_auth_username}\n')
                f.write(f'SNMP_NOTIFIER_AUTH_PASSWORD={self.snmp_v3_auth_password}\n')
                if self.snmp_v3_priv_password:
                    f.write(f'SNMP_NOTIFIER_PRIV_PASSWORD={self.snmp_v3_priv_password}\n')

    def validate(self) -> None:
        """Validate the settings

        Raises:
            Error: if the fsid doesn't look like an fsid
            Error: if the snmp version is not supported
            Error: destination IP and port address missing
        """
        if not is_fsid(self.fsid):
            raise Error(f'not a valid fsid: {self.fsid}')

        if self.snmp_version not in SNMPGateway.SUPPORTED_VERSIONS:
            raise Error(f'not a valid snmp version: {self.snmp_version}')

        if not self.destination:
            raise Error('config is missing destination attribute (<ip>:<port>) of the target SNMP listener')


##################################
class Monitoring(object):
    """Define the configs for the monitoring containers"""

    port_map = {
        'prometheus': [9095],  # Avoid default 9090, due to conflict with cockpit UI
        'node-exporter': [9100],
        'grafana': [3000],
        'alertmanager': [9093, 9094],
        'loki': [3100],
        'promtail': [9080]
    }

    components = {
        'prometheus': {
            'image': DEFAULT_PROMETHEUS_IMAGE,
            'cpus': '2',
            'memory': '4GB',
            'args': [
                '--config.file=/etc/prometheus/prometheus.yml',
                '--storage.tsdb.path=/prometheus',
            ],
            'config-json-files': [
                'prometheus.yml',
            ],
        },
        'loki': {
            'image': DEFAULT_LOKI_IMAGE,
            'cpus': '1',
            'memory': '1GB',
            'args': [
                '--config.file=/etc/loki/loki.yml',
            ],
            'config-json-files': [
                'loki.yml'
            ],
        },
        'promtail': {
            'image': DEFAULT_PROMTAIL_IMAGE,
            'cpus': '1',
            'memory': '1GB',
            'args': [
                '--config.file=/etc/promtail/promtail.yml',
            ],
            'config-json-files': [
                'promtail.yml',
            ],
        },
        'node-exporter': {
            'image': DEFAULT_NODE_EXPORTER_IMAGE,
            'cpus': '1',
            'memory': '1GB',
            'args': [
                '--no-collector.timex',
            ],
        },
        'grafana': {
            'image': DEFAULT_GRAFANA_IMAGE,
            'cpus': '2',
            'memory': '4GB',
            'args': [],
            'config-json-files': [
                'grafana.ini',
                'provisioning/datasources/ceph-dashboard.yml',
                'certs/cert_file',
                'certs/cert_key',
            ],
        },
        'alertmanager': {
            'image': DEFAULT_ALERT_MANAGER_IMAGE,
            'cpus': '2',
            'memory': '2GB',
            'args': [
                '--cluster.listen-address=:{}'.format(port_map['alertmanager'][1]),
            ],
            'config-json-files': [
                'alertmanager.yml',
            ],
            'config-json-args': [
                'peers',
            ],
        },
    }  # type: ignore

    @staticmethod
    def get_version(ctx, container_id, daemon_type):
        # type: (CephadmContext, str, str) -> str
        """
        :param daemon_type: either "prometheus", "alertmanager", "loki", "promtail" or "node-exporter"
        """
        assert daemon_type in ('prometheus', 'alertmanager', 'node-exporter', 'loki', 'promtail')
        cmd = daemon_type.replace('-', '_')
        code = -1
        err = ''
        version = ''
        if daemon_type == 'alertmanager':
            for cmd in ['alertmanager', 'prometheus-alertmanager']:
                _, err, code = call(ctx, [
                    ctx.container_engine.path, 'exec', container_id, cmd,
                    '--version'
                ], verbosity=CallVerbosity.QUIET)
                if code == 0:
                    break
            cmd = 'alertmanager'  # reset cmd for version extraction
        else:
            _, err, code = call(ctx, [
                ctx.container_engine.path, 'exec', container_id, cmd, '--version'
            ], verbosity=CallVerbosity.QUIET)
        if code == 0 and \
                err.startswith('%s, version ' % cmd):
            version = err.split(' ')[2]
        return version

##################################


def populate_files(config_dir, config_files, uid, gid):
    # type: (str, Dict, int, int) -> None
    """create config files for different services"""
    for fname in config_files:
        config_file = os.path.join(config_dir, fname)
        config_content = dict_get_join(config_files, fname)
        logger.info('Write file: %s' % (config_file))
        with open(config_file, 'w', encoding='utf-8') as f:
            os.fchown(f.fileno(), uid, gid)
            os.fchmod(f.fileno(), 0o600)
            f.write(config_content)
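# An illustrative call (paths and contents are hypothetical); list values
# are joined with line breaks by dict_get_join() below:
#
#     populate_files('/tmp/conf', {'a.yml': 'x: 1', 'b.yml': ['l1', 'l2']}, 0, 0)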


class NFSGanesha(object):
    """Defines an NFS-Ganesha container"""

    daemon_type = 'nfs'
    entrypoint = '/usr/bin/ganesha.nfsd'
    daemon_args = ['-F', '-L', 'STDERR']

    required_files = ['ganesha.conf']

    port_map = {
        'nfs': 2049,
    }

    def __init__(self,
                 ctx,
                 fsid,
                 daemon_id,
                 config_json,
                 image=DEFAULT_IMAGE):
        # type: (CephadmContext, str, Union[int, str], Dict, str) -> None
        self.ctx = ctx
        self.fsid = fsid
        self.daemon_id = daemon_id
        self.image = image

        # config-json options
        self.pool = dict_get(config_json, 'pool', require=True)
        self.namespace = dict_get(config_json, 'namespace')
        self.userid = dict_get(config_json, 'userid')
        self.extra_args = dict_get(config_json, 'extra_args', [])
        self.files = dict_get(config_json, 'files', {})
        self.rgw = dict_get(config_json, 'rgw', {})

        # validate the supplied args
        self.validate()

    @classmethod
    def init(cls, ctx, fsid, daemon_id):
        # type: (CephadmContext, str, Union[int, str]) -> NFSGanesha
        return cls(ctx, fsid, daemon_id, get_parm(ctx.config_json), ctx.image)

    def get_container_mounts(self, data_dir):
        # type: (str) -> Dict[str, str]
        mounts = dict()
        mounts[os.path.join(data_dir, 'config')] = '/etc/ceph/ceph.conf:z'
        mounts[os.path.join(data_dir, 'keyring')] = '/etc/ceph/keyring:z'
        mounts[os.path.join(data_dir, 'etc/ganesha')] = '/etc/ganesha:z'
        if self.rgw:
            cluster = self.rgw.get('cluster', 'ceph')
            rgw_user = self.rgw.get('user', 'admin')
            mounts[os.path.join(data_dir, 'keyring.rgw')] = \
                '/var/lib/ceph/radosgw/%s-%s/keyring:z' % (cluster, rgw_user)
        return mounts

    @staticmethod
    def get_container_envs():
        # type: () -> List[str]
        envs = [
            'CEPH_CONF=%s' % (CEPH_DEFAULT_CONF)
        ]
        return envs

    @staticmethod
    def get_version(ctx, container_id):
        # type: (CephadmContext, str) -> Optional[str]
        version = None
        out, err, code = call(ctx,
                              [ctx.container_engine.path, 'exec', container_id,
                               NFSGanesha.entrypoint, '-v'],
                              verbosity=CallVerbosity.QUIET)
        if code == 0:
            match = re.search(r'NFS-Ganesha Release\s*=\s*[V]*([\d.]+)', out)
            if match:
                version = match.group(1)
        return version

    def validate(self):
        # type: () -> None
        if not is_fsid(self.fsid):
            raise Error('not an fsid: %s' % self.fsid)
        if not self.daemon_id:
            raise Error('invalid daemon_id: %s' % self.daemon_id)
        if not self.image:
            raise Error('invalid image: %s' % self.image)

        # check for the required files
        if self.required_files:
            for fname in self.required_files:
                if fname not in self.files:
                    raise Error('required file missing from config-json: %s' % fname)

        # check for an RGW config
        if self.rgw:
            if not self.rgw.get('keyring'):
                raise Error('RGW keyring is missing')
            if not self.rgw.get('user'):
                raise Error('RGW user is missing')

    def get_daemon_name(self):
        # type: () -> str
        return '%s.%s' % (self.daemon_type, self.daemon_id)

    def get_container_name(self, desc=None):
        # type: (Optional[str]) -> str
        cname = 'ceph-%s-%s' % (self.fsid, self.get_daemon_name())
        if desc:
            cname = '%s-%s' % (cname, desc)
        return cname

    def get_daemon_args(self):
        # type: () -> List[str]
        return self.daemon_args + self.extra_args

    def create_daemon_dirs(self, data_dir, uid, gid):
        # type: (str, int, int) -> None
        """Create files under the container data dir"""
        if not os.path.isdir(data_dir):
            raise OSError('data_dir is not a directory: %s' % (data_dir))

        logger.info('Creating ganesha config...')

        # create the ganesha conf dir
        config_dir = os.path.join(data_dir, 'etc/ganesha')
        makedirs(config_dir, uid, gid, 0o755)

        # populate files from the config-json
        populate_files(config_dir, self.files, uid, gid)

        # write the RGW keyring
        if self.rgw:
            keyring_path = os.path.join(data_dir, 'keyring.rgw')
            with open(keyring_path, 'w') as f:
                os.fchmod(f.fileno(), 0o600)
                os.fchown(f.fileno(), uid, gid)
                f.write(self.rgw.get('keyring', ''))

##################################


class CephIscsi(object):
    """Defines a Ceph-Iscsi container"""

    daemon_type = 'iscsi'
    entrypoint = '/usr/bin/rbd-target-api'

    required_files = ['iscsi-gateway.cfg']

    def __init__(self,
                 ctx,
                 fsid,
                 daemon_id,
                 config_json,
                 image=DEFAULT_IMAGE):
        # type: (CephadmContext, str, Union[int, str], Dict, str) -> None
        self.ctx = ctx
        self.fsid = fsid
        self.daemon_id = daemon_id
        self.image = image

        # config-json options
        self.files = dict_get(config_json, 'files', {})

        # validate the supplied args
        self.validate()

    @classmethod
    def init(cls, ctx, fsid, daemon_id):
        # type: (CephadmContext, str, Union[int, str]) -> CephIscsi
        return cls(ctx, fsid, daemon_id,
                   get_parm(ctx.config_json), ctx.image)

    @staticmethod
    def get_container_mounts(data_dir, log_dir):
        # type: (str, str) -> Dict[str, str]
        mounts = dict()
        mounts[os.path.join(data_dir, 'config')] = '/etc/ceph/ceph.conf:z'
        mounts[os.path.join(data_dir, 'keyring')] = '/etc/ceph/keyring:z'
        mounts[os.path.join(data_dir, 'iscsi-gateway.cfg')] = '/etc/ceph/iscsi-gateway.cfg:z'
        mounts[os.path.join(data_dir, 'configfs')] = '/sys/kernel/config'
        mounts[log_dir] = '/var/log:z'
        mounts['/dev'] = '/dev'
        return mounts

    @staticmethod
    def get_container_binds():
        # type: () -> List[List[str]]
        binds = []
        lib_modules = ['type=bind',
                       'source=/lib/modules',
                       'destination=/lib/modules',
                       'ro=true']
        binds.append(lib_modules)
        return binds

    @staticmethod
    def get_version(ctx, container_id):
        # type: (CephadmContext, str) -> Optional[str]
        version = None
        out, err, code = call(ctx,
                              [ctx.container_engine.path, 'exec', container_id,
                               '/usr/bin/python3', '-c', "import pkg_resources; print(pkg_resources.require('ceph_iscsi')[0].version)"],
                              verbosity=CallVerbosity.QUIET)
        if code == 0:
            version = out.strip()
        return version

    def validate(self):
        # type: () -> None
        if not is_fsid(self.fsid):
            raise Error('not an fsid: %s' % self.fsid)
        if not self.daemon_id:
            raise Error('invalid daemon_id: %s' % self.daemon_id)
        if not self.image:
            raise Error('invalid image: %s' % self.image)

        # check for the required files
        if self.required_files:
            for fname in self.required_files:
                if fname not in self.files:
                    raise Error('required file missing from config-json: %s' % fname)

    def get_daemon_name(self):
        # type: () -> str
        return '%s.%s' % (self.daemon_type, self.daemon_id)

    def get_container_name(self, desc=None):
        # type: (Optional[str]) -> str
        cname = 'ceph-%s-%s' % (self.fsid, self.get_daemon_name())
        if desc:
            cname = '%s-%s' % (cname, desc)
        return cname

    def create_daemon_dirs(self, data_dir, uid, gid):
        # type: (str, int, int) -> None
        """Create files under the container data dir"""
        if not os.path.isdir(data_dir):
            raise OSError('data_dir is not a directory: %s' % (data_dir))

        logger.info('Creating ceph-iscsi config...')
        configfs_dir = os.path.join(data_dir, 'configfs')
        makedirs(configfs_dir, uid, gid, 0o755)

        # populate files from the config-json
        populate_files(data_dir, self.files, uid, gid)

    @staticmethod
    def configfs_mount_umount(data_dir, mount=True):
        # type: (str, bool) -> List[str]
        mount_path = os.path.join(data_dir, 'configfs')
        if mount:
            cmd = 'if ! grep -qs {0} /proc/mounts; then ' \
                  'mount -t configfs none {0}; fi'.format(mount_path)
        else:
            cmd = 'if grep -qs {0} /proc/mounts; then ' \
                  'umount {0}; fi'.format(mount_path)
        return cmd.split()

    def get_tcmu_runner_container(self):
        # type: () -> CephContainer
        tcmu_container = get_container(self.ctx, self.fsid, self.daemon_type, self.daemon_id)
        tcmu_container.entrypoint = '/usr/bin/tcmu-runner'
        tcmu_container.cname = self.get_container_name(desc='tcmu')
        # remove extra container args for tcmu container.
        # extra args could cause issue with forking service type
        tcmu_container.container_args = []
        return tcmu_container

##################################


class HAproxy(object):
    """Defines an HAproxy container"""
    daemon_type = 'haproxy'
    required_files = ['haproxy.cfg']
    default_image = DEFAULT_HAPROXY_IMAGE

    def __init__(self,
                 ctx: CephadmContext,
                 fsid: str, daemon_id: Union[int, str],
                 config_json: Dict, image: str) -> None:
        self.ctx = ctx
        self.fsid = fsid
        self.daemon_id = daemon_id
        self.image = image

        # config-json options
        self.files = dict_get(config_json, 'files', {})

        self.validate()

    @classmethod
    def init(cls, ctx: CephadmContext,
             fsid: str, daemon_id: Union[int, str]) -> 'HAproxy':
        return cls(ctx, fsid, daemon_id, get_parm(ctx.config_json),
                   ctx.image)

    def create_daemon_dirs(self, data_dir: str, uid: int, gid: int) -> None:
        """Create files under the container data dir"""
        if not os.path.isdir(data_dir):
            raise OSError('data_dir is not a directory: %s' % (data_dir))

        # create additional directories in data dir for HAproxy to use
        if not os.path.isdir(os.path.join(data_dir, 'haproxy')):
            makedirs(os.path.join(data_dir, 'haproxy'), uid, gid, DATA_DIR_MODE)

        data_dir = os.path.join(data_dir, 'haproxy')
        populate_files(data_dir, self.files, uid, gid)

    def get_daemon_args(self) -> List[str]:
        return ['haproxy', '-f', '/var/lib/haproxy/haproxy.cfg']

    def validate(self):
        # type: () -> None
        if not is_fsid(self.fsid):
            raise Error('not an fsid: %s' % self.fsid)
        if not self.daemon_id:
            raise Error('invalid daemon_id: %s' % self.daemon_id)
        if not self.image:
            raise Error('invalid image: %s' % self.image)

        # check for the required files
        if self.required_files:
            for fname in self.required_files:
                if fname not in self.files:
                    raise Error('required file missing from config-json: %s' % fname)

    def get_daemon_name(self):
        # type: () -> str
        return '%s.%s' % (self.daemon_type, self.daemon_id)

    def get_container_name(self, desc=None):
        # type: (Optional[str]) -> str
        cname = 'ceph-%s-%s' % (self.fsid, self.get_daemon_name())
        if desc:
            cname = '%s-%s' % (cname, desc)
        return cname

    def extract_uid_gid_haproxy(self) -> Tuple[int, int]:
        # better directory for this?
        return extract_uid_gid(self.ctx, file_path='/var/lib')

    @staticmethod
    def get_container_mounts(data_dir: str) -> Dict[str, str]:
        mounts = dict()
        mounts[os.path.join(data_dir, 'haproxy')] = '/var/lib/haproxy'
        return mounts

    @staticmethod
    def get_sysctl_settings() -> List[str]:
        return [
            '# IP forwarding',
            'net.ipv4.ip_forward = 1',
        ]

##################################


class Keepalived(object):
    """Defines a Keepalived container"""
    daemon_type = 'keepalived'
    required_files = ['keepalived.conf']
    default_image = DEFAULT_KEEPALIVED_IMAGE

    def __init__(self,
                 ctx: CephadmContext,
                 fsid: str, daemon_id: Union[int, str],
                 config_json: Dict, image: str) -> None:
        self.ctx = ctx
        self.fsid = fsid
        self.daemon_id = daemon_id
        self.image = image

        # config-json options
        self.files = dict_get(config_json, 'files', {})

        self.validate()

    @classmethod
    def init(cls, ctx: CephadmContext, fsid: str,
             daemon_id: Union[int, str]) -> 'Keepalived':
        return cls(ctx, fsid, daemon_id,
                   get_parm(ctx.config_json), ctx.image)

    def create_daemon_dirs(self, data_dir: str, uid: int, gid: int) -> None:
        """Create files under the container data dir"""
        if not os.path.isdir(data_dir):
            raise OSError('data_dir is not a directory: %s' % (data_dir))

        # create additional directories in data dir for keepalived to use
        if not os.path.isdir(os.path.join(data_dir, 'keepalived')):
            makedirs(os.path.join(data_dir, 'keepalived'), uid, gid, DATA_DIR_MODE)

        # populate files from the config-json
        populate_files(data_dir, self.files, uid, gid)

    def validate(self):
        # type: () -> None
        if not is_fsid(self.fsid):
            raise Error('not an fsid: %s' % self.fsid)
        if not self.daemon_id:
            raise Error('invalid daemon_id: %s' % self.daemon_id)
        if not self.image:
            raise Error('invalid image: %s' % self.image)

        # check for the required files
        if self.required_files:
            for fname in self.required_files:
                if fname not in self.files:
                    raise Error('required file missing from config-json: %s' % fname)

    def get_daemon_name(self):
        # type: () -> str
        return '%s.%s' % (self.daemon_type, self.daemon_id)

    def get_container_name(self, desc=None):
        # type: (Optional[str]) -> str
        cname = 'ceph-%s-%s' % (self.fsid, self.get_daemon_name())
        if desc:
            cname = '%s-%s' % (cname, desc)
        return cname

    @staticmethod
    def get_container_envs():
        # type: () -> List[str]
        envs = [
            'KEEPALIVED_AUTOCONF=false',
            'KEEPALIVED_CONF=/etc/keepalived/keepalived.conf',
            'KEEPALIVED_CMD=/usr/sbin/keepalived -n -l -f /etc/keepalived/keepalived.conf',
            'KEEPALIVED_DEBUG=false'
        ]
        return envs

    @staticmethod
    def get_sysctl_settings() -> List[str]:
        return [
            '# IP forwarding and non-local bind',
            'net.ipv4.ip_forward = 1',
            'net.ipv4.ip_nonlocal_bind = 1',
        ]

    def extract_uid_gid_keepalived(self) -> Tuple[int, int]:
        # better directory for this?
        return extract_uid_gid(self.ctx, file_path='/var/lib')

    @staticmethod
    def get_container_mounts(data_dir: str) -> Dict[str, str]:
        mounts = dict()
        mounts[os.path.join(data_dir, 'keepalived.conf')] = '/etc/keepalived/keepalived.conf'
        return mounts

##################################


class CustomContainer(object):
    """Defines a custom container"""
    daemon_type = 'container'

    def __init__(self,
                 fsid: str, daemon_id: Union[int, str],
                 config_json: Dict, image: str) -> None:
        self.fsid = fsid
        self.daemon_id = daemon_id
        self.image = image

        # config-json options
        self.entrypoint = dict_get(config_json, 'entrypoint')
        self.uid = dict_get(config_json, 'uid', 65534)  # nobody
        self.gid = dict_get(config_json, 'gid', 65534)  # nobody
        self.volume_mounts = dict_get(config_json, 'volume_mounts', {})
        self.args = dict_get(config_json, 'args', [])
        self.envs = dict_get(config_json, 'envs', [])
        self.privileged = dict_get(config_json, 'privileged', False)
        self.bind_mounts = dict_get(config_json, 'bind_mounts', [])
        self.ports = dict_get(config_json, 'ports', [])
        self.dirs = dict_get(config_json, 'dirs', [])
        self.files = dict_get(config_json, 'files', {})

    @classmethod
    def init(cls, ctx: CephadmContext,
             fsid: str, daemon_id: Union[int, str]) -> 'CustomContainer':
        return cls(fsid, daemon_id,
                   get_parm(ctx.config_json), ctx.image)

    def create_daemon_dirs(self, data_dir: str, uid: int, gid: int) -> None:
        """
        Create dirs/files below the container data directory.
        """
        logger.info('Creating custom container configuration '
                    'dirs/files in {} ...'.format(data_dir))

        if not os.path.isdir(data_dir):
            raise OSError('data_dir is not a directory: %s' % data_dir)

        for dir_path in self.dirs:
            logger.info('Creating directory: {}'.format(dir_path))
            dir_path = os.path.join(data_dir, dir_path.strip('/'))
            makedirs(dir_path, uid, gid, 0o755)

        for file_path in self.files:
            logger.info('Creating file: {}'.format(file_path))
            content = dict_get_join(self.files, file_path)
            file_path = os.path.join(data_dir, file_path.strip('/'))
            with open(file_path, 'w', encoding='utf-8') as f:
                os.fchown(f.fileno(), uid, gid)
                os.fchmod(f.fileno(), 0o600)
                f.write(content)

    def get_daemon_args(self) -> List[str]:
        return []

    def get_container_args(self) -> List[str]:
        return self.args

    def get_container_envs(self) -> List[str]:
        return self.envs

    def get_container_mounts(self, data_dir: str) -> Dict[str, str]:
        """
        Get the volume mounts. Relative source paths will be located below
        `/var/lib/ceph/<cluster-fsid>/<daemon-name>`.

        Example:
        {
            /foo/conf: /conf
            foo/conf: /conf
        }
        becomes
        {
            /foo/conf: /conf
            /var/lib/ceph/<cluster-fsid>/<daemon-name>/foo/conf: /conf
        }
        """
        mounts = {}
        for source, destination in self.volume_mounts.items():
            source = os.path.join(data_dir, source)
            mounts[source] = destination
        return mounts

    def get_container_binds(self, data_dir: str) -> List[List[str]]:
        """
        Get the bind mounts. Relative `source=...` paths will be located below
        `/var/lib/ceph/<cluster-fsid>/<daemon-name>`.

        Example:
        [
            'type=bind',
            'source=lib/modules',
            'destination=/lib/modules',
            'ro=true'
        ]
        becomes
        [
            ...
            'source=/var/lib/ceph/<cluster-fsid>/<daemon-name>/lib/modules',
            ...
        ]
        """
        binds = self.bind_mounts.copy()
        for bind in binds:
            for index, value in enumerate(bind):
                match = re.match(r'^source=(.+)$', value)
                if match:
                    bind[index] = 'source={}'.format(os.path.join(
                        data_dir, match.group(1)))
        return binds

##################################


def touch(file_path: str, uid: Optional[int] = None, gid: Optional[int] = None) -> None:
    Path(file_path).touch()
    if uid and gid:
        os.chown(file_path, uid, gid)


##################################


def dict_get(d: Dict, key: str, default: Any = None, require: bool = False) -> Any:
    """
    Helper function to get a key from a dictionary.
    :param d: The dictionary to process.
    :param key: The name of the key to get.
    :param default: The default value in case the key does not
        exist. Default is `None`.
    :param require: Set to `True` if the key is required. An
        exception will be raised if the key does not exist in
        the given dictionary.
    :return: Returns the value of the given key.
    :raises: :exc:`self.Error` if the given key does not exist
        and `require` is set to `True`.
    """
    if require and key not in d.keys():
        raise Error('{} missing from dict'.format(key))
    return d.get(key, default)  # type: ignore
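# Example (illustrative): a missing key returns the default unless it is
# required, in which case Error is raised.
#
#     dict_get({'a': 1}, 'b', default=2)       # -> 2
#     dict_get({'a': 1}, 'b', require=True)    # raises Error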

##################################


def dict_get_join(d: Dict, key: str) -> Any:
    """
    Helper function to get the value of a given key from a dictionary.
    `List` values will be converted to a string by joining them with a
    line break.
    :param d: The dictionary to process.
    :param key: The name of the key to get.
    :return: Returns the value of the given key. If it was a `list`, it
        will be joined with a line break.
    """
    value = d.get(key)
    if isinstance(value, list):
        value = '\n'.join(map(str, value))
    return value
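# Example (illustrative):
#
#     dict_get_join({'args': ['-a', '-b']}, 'args')   # -> '-a\n-b'
#     dict_get_join({'args': 'plain'}, 'args')        # -> 'plain'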

##################################


def get_supported_daemons():
    # type: () -> List[str]
    supported_daemons = list(Ceph.daemons)
    supported_daemons.extend(Monitoring.components)
    supported_daemons.append(NFSGanesha.daemon_type)
    supported_daemons.append(CephIscsi.daemon_type)
    supported_daemons.append(CustomContainer.daemon_type)
    supported_daemons.append(HAproxy.daemon_type)
    supported_daemons.append(Keepalived.daemon_type)
    supported_daemons.append(CephadmAgent.daemon_type)
    supported_daemons.append(SNMPGateway.daemon_type)
    assert len(supported_daemons) == len(set(supported_daemons))
    return supported_daemons

##################################


class PortOccupiedError(Error):
    pass


def attempt_bind(ctx, s, address, port):
    # type: (CephadmContext, socket.socket, str, int) -> None
    try:
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        s.bind((address, port))
    except OSError as e:
        if e.errno == errno.EADDRINUSE:
            msg = 'Cannot bind to IP %s port %d: %s' % (address, port, e)
            logger.warning(msg)
            raise PortOccupiedError(msg)
        else:
            raise Error(e)
    except Exception as e:
        raise Error(e)
    finally:
        s.close()


def port_in_use(ctx, port_num):
    # type: (CephadmContext, int) -> bool
    """Detect whether a port is in use on the local machine - IPv4 and IPv6"""
    logger.info('Verifying port %d ...' % port_num)

    def _port_in_use(af: socket.AddressFamily, address: str) -> bool:
        try:
            s = socket.socket(af, socket.SOCK_STREAM)
            attempt_bind(ctx, s, address, port_num)
        except PortOccupiedError:
            return True
        except OSError as e:
            if e.errno in (errno.EAFNOSUPPORT, errno.EADDRNOTAVAIL):
                # Ignore EAFNOSUPPORT and EADDRNOTAVAIL: two interfaces are
                # being tested here and one might intentionally be disabled.
                # In that case no error should be raised.
                return False
            else:
                raise e
        return False
    return any(_port_in_use(af, address) for af, address in (
        (socket.AF_INET, '0.0.0.0'),
        (socket.AF_INET6, '::')
    ))
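# Example (illustrative): refuse to deploy onto a monitoring port that is
# already taken.
#
#     if port_in_use(ctx, Monitoring.port_map['node-exporter'][0]):
#         raise Error('port 9100 is already in use')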


def check_ip_port(ctx, ep):
    # type: (CephadmContext, EndPoint) -> None
    if not ctx.skip_ping_check:
        logger.info(f'Verifying IP {ep.ip} port {ep.port} ...')
        if is_ipv6(ep.ip):
            s = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
            ip = unwrap_ipv6(ep.ip)
        else:
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            ip = ep.ip
        attempt_bind(ctx, s, ip, ep.port)

##################################


# this is an abbreviated version of
# https://github.com/benediktschmitt/py-filelock/blob/master/filelock.py
# that drops all of the compatibility (this is Unix/Linux only).

class Timeout(TimeoutError):
    """
    Raised when the lock could not be acquired in *timeout*
    seconds.
    """

    def __init__(self, lock_file: str) -> None:
        """
        """
        #: The path of the file lock.
        self.lock_file = lock_file
        return None

    def __str__(self) -> str:
        temp = "The file lock '{}' could not be acquired."\
               .format(self.lock_file)
        return temp


class _Acquire_ReturnProxy(object):
    def __init__(self, lock: 'FileLock') -> None:
        self.lock = lock
        return None

    def __enter__(self) -> 'FileLock':
        return self.lock

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        self.lock.release()
        return None


class FileLock(object):
    def __init__(self, ctx: CephadmContext, name: str, timeout: int = -1) -> None:
        if not os.path.exists(LOCK_DIR):
            os.mkdir(LOCK_DIR, 0o700)
        self._lock_file = os.path.join(LOCK_DIR, name + '.lock')
        self.ctx = ctx

        # The file descriptor for the *_lock_file* as it is returned by the
        # os.open() function.
        # This file lock is only NOT None, if the object currently holds the
        # lock.
        self._lock_file_fd: Optional[int] = None
        self.timeout = timeout
        # The lock counter is used for implementing the nested locking
        # mechanism. Whenever the lock is acquired, the counter is increased and
        # the lock is only released, when this value is 0 again.
        self._lock_counter = 0
        return None

    @property
    def is_locked(self) -> bool:
        return self._lock_file_fd is not None

    def acquire(self, timeout: Optional[int] = None, poll_intervall: float = 0.05) -> _Acquire_ReturnProxy:
        """
        Acquires the file lock or fails with a :exc:`Timeout` error.
        .. code-block:: python
            # You can use this method in the context manager (recommended)
            with lock.acquire():
                pass
            # Or use an equivalent try-finally construct:
            lock.acquire()
            try:
                pass
            finally:
                lock.release()
        :arg float timeout:
            The maximum time waited for the file lock.
            If ``timeout < 0``, there is no timeout and this method will
            block until the lock could be acquired.
            If ``timeout`` is None, the default :attr:`~timeout` is used.
        :arg float poll_intervall:
            We check once in *poll_intervall* seconds if we can acquire the
            file lock.
        :raises Timeout:
            if the lock could not be acquired in *timeout* seconds.
        .. versionchanged:: 2.0.0
            This method now returns a *proxy* object instead of *self*,
            so that it can be used in a with statement without side effects.
        """

        # Use the default timeout, if no timeout is provided.
        if timeout is None:
            timeout = self.timeout

        # Increment the number right at the beginning.
        # We can still undo it, if something fails.
        self._lock_counter += 1

        lock_id = id(self)
        lock_filename = self._lock_file
        start_time = time.time()
        try:
            while True:
                if not self.is_locked:
                    logger.log(QUIET_LOG_LEVEL, 'Acquiring lock %s on %s', lock_id,
                               lock_filename)
                    self._acquire()

                if self.is_locked:
                    logger.log(QUIET_LOG_LEVEL, 'Lock %s acquired on %s', lock_id,
                               lock_filename)
                    break
                elif timeout >= 0 and time.time() - start_time > timeout:
                    logger.warning('Timeout acquiring lock %s on %s', lock_id,
                                   lock_filename)
                    raise Timeout(self._lock_file)
                else:
                    logger.log(
                        QUIET_LOG_LEVEL,
                        'Lock %s not acquired on %s, waiting %s seconds ...',
                        lock_id, lock_filename, poll_intervall
                    )
                    time.sleep(poll_intervall)
        except Exception:
            # Something did go wrong, so decrement the counter.
            self._lock_counter = max(0, self._lock_counter - 1)

            raise
        return _Acquire_ReturnProxy(lock=self)

    def release(self, force: bool = False) -> None:
        """
        Releases the file lock.
        Please note that the lock is only completely released if the lock
        counter is 0.
        Also note that the lock file itself is not automatically deleted.
        :arg bool force:
            If true, the lock counter is ignored and the lock is released in
            every case.
        """
        if self.is_locked:
            self._lock_counter -= 1

            if self._lock_counter == 0 or force:
                # lock_id = id(self)
                # lock_filename = self._lock_file

                # Can't log in shutdown:
                #   File "/usr/lib64/python3.9/logging/__init__.py", line 1175, in _open
                #   NameError: name 'open' is not defined
                # logger.debug('Releasing lock %s on %s', lock_id, lock_filename)
                self._release()
                self._lock_counter = 0
                # logger.debug('Lock %s released on %s', lock_id, lock_filename)

        return None

    def __enter__(self) -> 'FileLock':
        self.acquire()
        return self

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        self.release()
        return None

    def __del__(self) -> None:
        self.release(force=True)
        return None

    def _acquire(self) -> None:
        open_mode = os.O_RDWR | os.O_CREAT | os.O_TRUNC
        fd = os.open(self._lock_file, open_mode)

        try:
            fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except (IOError, OSError):
            os.close(fd)
        else:
            self._lock_file_fd = fd
        return None

    def _release(self) -> None:
        # Do not remove the lockfile:
        #
        #   https://github.com/benediktschmitt/py-filelock/issues/31
        #   https://stackoverflow.com/questions/17708885/flock-removing-locked-file-without-race-condition
        fd = self._lock_file_fd
        self._lock_file_fd = None
        fcntl.flock(fd, fcntl.LOCK_UN)  # type: ignore
        os.close(fd)  # type: ignore
        return None
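# Typical usage (illustrative; the lock name is usually the cluster fsid):
#
#     lock = FileLock(ctx, fsid)
#     with lock.acquire(timeout=10):
#         pass  # critical section; released on exit, Timeout raised otherwise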


##################################
# Popen wrappers, lifted from ceph-volume

class CallVerbosity(Enum):
    #####
    # Format:
    # Normal Operation: <log-level-when-no-errors>, Errors: <log-level-when-error>
    #
    # NOTE: QUIET log level is custom level only used when --verbose is passed
    #####

    # Normal Operation: None, Errors: None
    SILENT = 0
    # Normal Operation: QUIET, Error: QUIET
    QUIET = 1
    # Normal Operation: DEBUG, Error: DEBUG
    DEBUG = 2
    # Normal Operation: QUIET, Error: INFO
    QUIET_UNLESS_ERROR = 3
    # Normal Operation: DEBUG, Error: INFO
    VERBOSE_ON_FAILURE = 4
    # Normal Operation: INFO, Error: INFO
    VERBOSE = 5

    def success_log_level(self) -> int:
        _verbosity_level_to_log_level = {
            self.SILENT: 0,
            self.QUIET: QUIET_LOG_LEVEL,
            self.DEBUG: logging.DEBUG,
            self.QUIET_UNLESS_ERROR: QUIET_LOG_LEVEL,
            self.VERBOSE_ON_FAILURE: logging.DEBUG,
            self.VERBOSE: logging.INFO
        }
        return _verbosity_level_to_log_level[self]  # type: ignore

    def error_log_level(self) -> int:
        _verbosity_level_to_log_level = {
            self.SILENT: 0,
            self.QUIET: QUIET_LOG_LEVEL,
            self.DEBUG: logging.DEBUG,
            self.QUIET_UNLESS_ERROR: logging.INFO,
            self.VERBOSE_ON_FAILURE: logging.INFO,
            self.VERBOSE: logging.INFO
        }
        return _verbosity_level_to_log_level[self]  # type: ignore
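# Example (illustrative): VERBOSE_ON_FAILURE logs command output at DEBUG
# when the command succeeds and at INFO when it fails.
#
#     v = CallVerbosity.VERBOSE_ON_FAILURE
#     assert v.success_log_level() == logging.DEBUG
#     assert v.error_log_level() == logging.INFO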


if sys.version_info < (3, 8):
    import itertools
    import threading
    import warnings
    from asyncio import events

    class ThreadedChildWatcher(asyncio.AbstractChildWatcher):
        """Threaded child watcher implementation.
        The watcher uses a thread per process
        to wait for the process to finish.
        It doesn't require a subscription to the POSIX signal,
        but thread creation is not free.
        The watcher has O(1) complexity; its performance doesn't depend
        on the number of spawned processes.
        """

        def __init__(self) -> None:
            self._pid_counter = itertools.count(0)
            self._threads: Dict[Any, Any] = {}

        def is_active(self) -> bool:
            return True

        def close(self) -> None:
            self._join_threads()

        def _join_threads(self) -> None:
            """Internal: Join all non-daemon threads"""
            threads = [thread for thread in list(self._threads.values())
                       if thread.is_alive() and not thread.daemon]
            for thread in threads:
                thread.join()

        def __enter__(self) -> Any:
            return self

        def __exit__(self, exc_type: Any, exc_val: Any, exc_tb: Any) -> None:
            pass

        def __del__(self, _warn: Any = warnings.warn) -> None:
            threads = [thread for thread in list(self._threads.values())
                       if thread.is_alive()]
            if threads:
                _warn(f'{self.__class__} has registered but not finished child processes',
                      ResourceWarning,
                      source=self)

        def add_child_handler(self, pid: Any, callback: Any, *args: Any) -> None:
            loop = events.get_event_loop()
            thread = threading.Thread(target=self._do_waitpid,
                                      name=f'waitpid-{next(self._pid_counter)}',
                                      args=(loop, pid, callback, args),
                                      daemon=True)
            self._threads[pid] = thread
            thread.start()

        def remove_child_handler(self, pid: Any) -> bool:
            # asyncio never calls remove_child_handler() !!!
            # The method is a no-op but is implemented because the
            # abstract base class requires it
            return True

        def attach_loop(self, loop: Any) -> None:
            pass

        def _do_waitpid(self, loop: Any, expected_pid: Any, callback: Any, args: Any) -> None:
            assert expected_pid > 0

            try:
                pid, status = os.waitpid(expected_pid, 0)
            except ChildProcessError:
                # The child process is already reaped
                # (may happen if waitpid() is called elsewhere).
                pid = expected_pid
                returncode = 255
                logger.warning(
                    'Unknown child process pid %d, will report returncode 255',
                    pid)
            else:
                if os.WIFEXITED(status):
                    returncode = os.WEXITSTATUS(status)
                elif os.WIFSIGNALED(status):
                    returncode = -os.WTERMSIG(status)
                else:
                    raise ValueError(f'unknown wait status {status}')
                if loop.get_debug():
                    logger.debug('process %s exited with returncode %s',
                                 expected_pid, returncode)

            if loop.is_closed():
                logger.warning('Loop %r that handles pid %r is closed', loop, pid)
            else:
                loop.call_soon_threadsafe(callback, pid, returncode, *args)

            self._threads.pop(expected_pid)

    # unlike SafeChildWatcher which handles SIGCHLD in the main thread,
    # ThreadedChildWatcher runs in a separated thread, hence allows us to
    # run create_subprocess_exec() in non-main thread, see
    # https://bugs.python.org/issue35621
    asyncio.set_child_watcher(ThreadedChildWatcher())


try:
    from asyncio import run as async_run  # type: ignore[attr-defined]
except ImportError:
    def async_run(coro):  # type: ignore
        loop = asyncio.new_event_loop()
        try:
            asyncio.set_event_loop(loop)
            return loop.run_until_complete(coro)
        finally:
            try:
                loop.run_until_complete(loop.shutdown_asyncgens())
            finally:
                asyncio.set_event_loop(None)
                loop.close()


def call(ctx: CephadmContext,
         command: List[str],
         desc: Optional[str] = None,
         verbosity: CallVerbosity = CallVerbosity.VERBOSE_ON_FAILURE,
         timeout: Optional[int] = DEFAULT_TIMEOUT,
         **kwargs: Any) -> Tuple[str, str, int]:
    """
    Wrap subprocess.Popen to

    - log stdout/stderr to a logger,
    - decode utf-8
    - cleanly return out, err, returncode

    :param timeout: timeout in seconds
    """

    prefix = command[0] if desc is None else desc
    if prefix:
        prefix += ': '
    timeout = timeout or ctx.timeout

    async def tee(reader: asyncio.StreamReader) -> str:
        collected = StringIO()
        async for line in reader:
            message = line.decode('utf-8')
            collected.write(message)
        return collected.getvalue()

    async def run_with_timeout() -> Tuple[str, str, int]:
        process = await asyncio.create_subprocess_exec(
            *command,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            env=os.environ.copy())
        assert process.stdout
        assert process.stderr
        try:
            stdout, stderr = await asyncio.gather(tee(process.stdout),
                                                  tee(process.stderr))
            returncode = await asyncio.wait_for(process.wait(), timeout)
        except asyncio.TimeoutError:
            logger.info(prefix + f'timeout after {timeout} seconds')
            return '', '', 124
        else:
            return stdout, stderr, returncode

    stdout, stderr, returncode = async_run(run_with_timeout())
    log_level = verbosity.success_log_level()
    if returncode != 0:
        log_level = verbosity.error_log_level()
        logger.log(log_level, f'Non-zero exit code {returncode} from {" ".join(command)}')
    for line in stdout.splitlines():
        logger.log(log_level, prefix + 'stdout ' + line)
    for line in stderr.splitlines():
        logger.log(log_level, prefix + 'stderr ' + line)
    return stdout, stderr, returncode
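# Typical usage (illustrative; `ctx` is a prepared CephadmContext):
#
#     out, err, code = call(ctx, ['systemctl', 'is-active', 'chronyd'],
#                           verbosity=CallVerbosity.QUIET)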


def call_throws(
        ctx: CephadmContext,
        command: List[str],
        desc: Optional[str] = None,
        verbosity: CallVerbosity = CallVerbosity.VERBOSE_ON_FAILURE,
        timeout: Optional[int] = DEFAULT_TIMEOUT,
        **kwargs: Any) -> Tuple[str, str, int]:
    out, err, ret = call(ctx, command, desc, verbosity, timeout, **kwargs)
    if ret:
        for s in (out, err):
            if s.strip() and len(s.splitlines()) <= 2:  # readable message?
                raise RuntimeError(f'Failed command: {" ".join(command)}: {s}')
        raise RuntimeError('Failed command: %s' % ' '.join(command))
    return out, err, ret


def call_timeout(ctx, command, timeout):
    # type: (CephadmContext, List[str], int) -> int
    logger.debug('Running command (timeout=%s): %s'
                 % (timeout, ' '.join(command)))

    def raise_timeout(command, timeout):
        # type: (List[str], int) -> NoReturn
        msg = 'Command `%s` timed out after %s seconds' % (command, timeout)
        logger.debug(msg)
        raise TimeoutExpired(msg)

    try:
        return subprocess.call(command, timeout=timeout, env=os.environ.copy())
    except subprocess.TimeoutExpired:
        raise_timeout(command, timeout)

##################################


def json_loads_retry(cli_func: Callable[[], str]) -> Any:
    for sleep_secs in [1, 4, 4]:
        try:
            return json.loads(cli_func())
        except json.JSONDecodeError:
            logger.debug('Invalid JSON. Retrying in %s seconds...' % sleep_secs)
            time.sleep(sleep_secs)
    return json.loads(cli_func())
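# A minimal usage sketch (illustrative; `get_status_json` is a hypothetical
# zero-argument callable that returns a JSON string):
#
#     data = json_loads_retry(get_status_json)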


def is_available(ctx, what, func):
    # type: (CephadmContext, str, Callable[[], bool]) -> None
    """
    Wait for a service to become available

    :param what: the name of the service
    :param func: the callable object that determines availability
    """
    retry = ctx.retry
    logger.info('Waiting for %s...' % what)
    num = 1
    while True:
        if func():
            logger.info('%s is available'
                        % what)
            break
        elif num > retry:
            raise Error('%s not available after %s tries'
                        % (what, retry))

        logger.info('%s not available, waiting (%s/%s)...'
                    % (what, num, retry))

        num += 1
        time.sleep(2)


def read_config(fn):
    # type: (Optional[str]) -> ConfigParser
    cp = ConfigParser()
    if fn:
        cp.read(fn)
    return cp


def pathify(p):
    # type: (str) -> str
    p = os.path.expanduser(p)
    return os.path.abspath(p)


def get_file_timestamp(fn):
    # type: (str) -> Optional[str]
    try:
        mt = os.path.getmtime(fn)
        return datetime.datetime.fromtimestamp(
            mt, tz=datetime.timezone.utc
        ).strftime(DATEFMT)
    except Exception:
        return None


def try_convert_datetime(s):
    # type: (str) -> Optional[str]
    # This is super irritating because
    #  1) podman and docker use different formats
    #  2) python's strptime can't parse either one
    #
    # I've seen:
    #  docker 18.09.7:  2020-03-03T09:21:43.636153304Z
    #  podman 1.7.0:    2020-03-03T15:52:30.136257504-06:00
    #                   2020-03-03 15:52:30.136257504 -0600 CST
    # (In the podman case, there is a different string format for
    # 'inspect' and 'inspect --format {{.Created}}'!!)

    # In *all* cases, the 9 digit second precision is too much for
    # python's strptime.  Shorten it to 6 digits.
    p = re.compile(r'(\.[\d]{6})[\d]*')
    s = p.sub(r'\1', s)

    # replace trailing Z with -0000, since (on python 3.6.8) it won't parse
    if s and s[-1] == 'Z':
        s = s[:-1] + '-0000'

    # cut off the redundant 'CST' part that strptime can't parse, if
    # present.
    v = s.split(' ')
    s = ' '.join(v[0:3])

    # try parsing with several format strings
    fmts = [
        '%Y-%m-%dT%H:%M:%S.%f%z',
        '%Y-%m-%d %H:%M:%S.%f %z',
    ]
    for f in fmts:
        try:
            # return timestamp normalized to UTC, rendered as DATEFMT.
            return datetime.datetime.strptime(s, f).astimezone(tz=datetime.timezone.utc).strftime(DATEFMT)
        except ValueError:
            pass
    return None
1912
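# Examples (illustrative), normalized to UTC and rendered via DATEFMT:
#     try_convert_datetime('2020-03-03T09:21:43.636153304Z')
#         -> '2020-03-03T09:21:43.636153Z'
#     try_convert_datetime('an unparseable string') -> None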
1913
1914 def _parse_podman_version(version_str):
1915 # type: (str) -> Tuple[int, ...]
1916 def to_int(val: str, org_e: Optional[Exception] = None) -> int:
1917 if not val and org_e:
1918 raise org_e
1919 try:
1920 return int(val)
1921 except ValueError as e:
1922 return to_int(val[0:-1], org_e or e)
1923
1924 return tuple(map(to_int, version_str.split('.')))
1925
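# Examples (illustrative):
#     _parse_podman_version('2.0.2')     -> (2, 0, 2)
#     _parse_podman_version('3.0.1-dev') -> (3, 0, 1)  # trailing non-digits
#                                                      # are stripped by to_int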
1926
1927 def get_hostname():
1928 # type: () -> str
1929 return socket.gethostname()
1930
1931
1932 def get_fqdn():
1933 # type: () -> str
1934 return socket.getfqdn() or socket.gethostname()
1935
1936
1937 def get_arch():
1938 # type: () -> str
1939 return platform.uname().machine
1940
1941
1942 def generate_service_id():
1943 # type: () -> str
1944 return get_hostname() + '.' + ''.join(random.choice(string.ascii_lowercase)
1945 for _ in range(6))
1946
1947
1948 def generate_password():
1949 # type: () -> str
1950 return ''.join(random.choice(string.ascii_lowercase + string.digits)
1951 for i in range(10))
1952
1953
1954 def normalize_container_id(i):
1955 # type: (str) -> str
1956 # docker adds the sha256: prefix, but AFAICS both
1957 # docker (18.09.7 in bionic at least) and podman
1958 # always use sha256, so leave off the prefix
1959 # for consistency.
1960 prefix = 'sha256:'
1961 if i.startswith(prefix):
1962 i = i[len(prefix):]
1963 return i
1964
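# Example (illustrative):
#     normalize_container_id('sha256:8e47...') -> '8e47...'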
1965
1966 def make_fsid():
1967 # type: () -> str
1968 return str(uuid.uuid1())
1969
1970
1971 def is_fsid(s):
1972 # type: (str) -> bool
1973 try:
1974 uuid.UUID(s)
1975 except ValueError:
1976 return False
1977 return True
1978
1979
1980 def validate_fsid(func: FuncT) -> FuncT:
1981 @wraps(func)
1982 def _validate_fsid(ctx: CephadmContext) -> Any:
1983 if 'fsid' in ctx and ctx.fsid:
1984 if not is_fsid(ctx.fsid):
1985 raise Error('not an fsid: %s' % ctx.fsid)
1986 return func(ctx)
1987 return cast(FuncT, _validate_fsid)
1988
1989
1990 def infer_fsid(func: FuncT) -> FuncT:
1991 """
1992 If we only find a single fsid in /var/lib/ceph/*, use that
1993 """
1994 @infer_config
1995 @wraps(func)
1996 def _infer_fsid(ctx: CephadmContext) -> Any:
1997 if 'fsid' in ctx and ctx.fsid:
1998 logger.debug('Using specified fsid: %s' % ctx.fsid)
1999 return func(ctx)
2000
2001 fsids = set()
2002
2003 cp = read_config(ctx.config)
2004 if cp.has_option('global', 'fsid'):
2005 fsids.add(cp.get('global', 'fsid'))
2006
2007 daemon_list = list_daemons(ctx, detail=False)
2008 for daemon in daemon_list:
2009 if not is_fsid(daemon['fsid']):
2010 # 'unknown' fsid
2011 continue
2012 elif 'name' not in ctx or not ctx.name:
2013 # ctx.name not specified
2014 fsids.add(daemon['fsid'])
2015 elif daemon['name'] == ctx.name:
2016 # ctx.name is a match
2017 fsids.add(daemon['fsid'])
2018 fsids = sorted(fsids)
2019
2020 if not fsids:
2021 # some commands do not always require an fsid
2022 pass
2023 elif len(fsids) == 1:
2024 logger.info('Inferring fsid %s' % fsids[0])
2025 ctx.fsid = fsids[0]
2026 else:
2027 raise Error('Cannot infer an fsid, one must be specified (using --fsid): %s' % fsids)
2028 return func(ctx)
2029
2030 return cast(FuncT, _infer_fsid)
2031
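# Illustrative usage (a sketch; command_foo is hypothetical): decorating a
# command entry point lets a bare invocation find the lone cluster fsid
# under /var/lib/ceph:
#     @infer_fsid
#     def command_foo(ctx: CephadmContext) -> int:
#         ...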
2032
2033 def infer_config(func: FuncT) -> FuncT:
2034 """
2035 Infer the cluster configuration using the following priority order:
2036 1- if the user has provided a custom conf file (-c option), use it
2037 2- otherwise, if a daemon --name has been provided, use that daemon's conf
2038 3- otherwise, find the mon daemon conf file and use it (if v1)
2039 4- otherwise, if the {ctx.data_dir}/{fsid}/{CEPH_CONF_DIR} dir exists, use it
2040 5- finally: fall back to the default file /etc/ceph/ceph.conf
2041 """
2042 @wraps(func)
2043 def _infer_config(ctx: CephadmContext) -> Any:
2044
2045 def config_path(daemon_type: str, daemon_name: str) -> str:
2046 data_dir = get_data_dir(ctx.fsid, ctx.data_dir, daemon_type, daemon_name)
2047 return os.path.join(data_dir, 'config')
2048
2049 def get_mon_daemon_name(fsid: str) -> Optional[str]:
2050 daemon_list = list_daemons(ctx, detail=False)
2051 for daemon in daemon_list:
2052 if (
2053 daemon.get('name', '').startswith('mon.')
2054 and daemon.get('fsid', '') == fsid
2055 and daemon.get('style', '') == 'cephadm:v1'
2056 and os.path.exists(config_path('mon', daemon['name'].split('.', 1)[1]))
2057 ):
2058 return daemon['name']
2059 return None
2060
2061 ctx.config = ctx.config if 'config' in ctx else None
2062 # check if user has provided conf by using -c option
2063 if ctx.config and (ctx.config != CEPH_DEFAULT_CONF):
2064 logger.debug(f'Using specified config: {ctx.config}')
2065 return func(ctx)
2066
2067 if 'fsid' in ctx and ctx.fsid:
2068 name = ctx.name if ('name' in ctx and ctx.name) else get_mon_daemon_name(ctx.fsid)
2069 if name is not None:
2070 # daemon name has been specified (or inferred from mon), let's use its conf
2071 ctx.config = config_path(name.split('.', 1)[0], name.split('.', 1)[1])
2072 else:
2073 # no daemon, in case the cluster has a config dir then use it
2074 ceph_conf = f'{ctx.data_dir}/{ctx.fsid}/{CEPH_CONF_DIR}/{CEPH_CONF}'
2075 if os.path.exists(ceph_conf):
2076 ctx.config = ceph_conf
2077
2078 if ctx.config:
2079 logger.info(f'Inferring config {ctx.config}')
2080 elif os.path.exists(CEPH_DEFAULT_CONF):
2081 logger.debug(f'Using default config {CEPH_DEFAULT_CONF}')
2082 ctx.config = CEPH_DEFAULT_CONF
2083 return func(ctx)
2084
2085 return cast(FuncT, _infer_config)
2086
2087
2088 def _get_default_image(ctx: CephadmContext) -> str:
2089 if DEFAULT_IMAGE_IS_MASTER:
2090 warn = """This is a development version of cephadm.
2091 For information regarding the latest stable release:
2092 https://docs.ceph.com/docs/{}/cephadm/install
2093 """.format(LATEST_STABLE_RELEASE)
2094 for line in warn.splitlines():
2095 logger.warning('{}{}{}'.format(termcolor.yellow, line, termcolor.end))
2096 return DEFAULT_IMAGE
2097
2098
2099 def infer_image(func: FuncT) -> FuncT:
2100 """
2101 Use the most recent ceph image
2102 """
2103 @wraps(func)
2104 def _infer_image(ctx: CephadmContext) -> Any:
2105 if not ctx.image:
2106 ctx.image = os.environ.get('CEPHADM_IMAGE')
2107 if not ctx.image:
2108 ctx.image = infer_local_ceph_image(ctx, ctx.container_engine.path)
2109 if not ctx.image:
2110 ctx.image = _get_default_image(ctx)
2111 return func(ctx)
2112
2113 return cast(FuncT, _infer_image)
2114
2115
2116 def default_image(func: FuncT) -> FuncT:
2117 @wraps(func)
2118 def _default_image(ctx: CephadmContext) -> Any:
2119 if not ctx.image:
2120 if 'name' in ctx and ctx.name:
2121 type_ = ctx.name.split('.', 1)[0]
2122 if type_ in Monitoring.components:
2123 ctx.image = Monitoring.components[type_]['image']
2124 if type_ == 'haproxy':
2125 ctx.image = HAproxy.default_image
2126 if type_ == 'keepalived':
2127 ctx.image = Keepalived.default_image
2128 if type_ == SNMPGateway.daemon_type:
2129 ctx.image = SNMPGateway.default_image
2130 if not ctx.image:
2131 ctx.image = os.environ.get('CEPHADM_IMAGE')
2132 if not ctx.image:
2133 ctx.image = _get_default_image(ctx)
2134
2135 return func(ctx)
2136
2137 return cast(FuncT, _default_image)
2138
2139
2140 def get_container_info(ctx: CephadmContext, daemon_filter: str, by_name: bool) -> Optional[ContainerInfo]:
2141 """
2142 :param ctx: Cephadm context
2143 :param daemon_filter: daemon name or type
2144 :param by_name: must be set to True if daemon name is provided
2145 :return: Container information or None
2146 """
2147 def daemon_name_or_type(daemon: Dict[str, str]) -> str:
2148 return daemon['name'] if by_name else daemon['name'].split('.', 1)[0]
2149
2150 if by_name and '.' not in daemon_filter:
2151 logger.warning(f'Trying to get container info using invalid daemon name {daemon_filter}')
2152 return None
2153 daemons = list_daemons(ctx, detail=False)
2154 matching_daemons = [d for d in daemons if daemon_name_or_type(d) == daemon_filter and d['fsid'] == ctx.fsid]
2155 if matching_daemons:
2156 d_type, d_id = matching_daemons[0]['name'].split('.', 1)
2157 out, _, code = get_container_stats(ctx, ctx.container_engine.path, ctx.fsid, d_type, d_id)
2158 if not code:
2159 (container_id, image_name, image_id, start, version) = out.strip().split(',')
2160 return ContainerInfo(container_id, image_name, image_id, start, version)
2161 return None
2162
2163
2164 def infer_local_ceph_image(ctx: CephadmContext, container_path: str) -> Optional[str]:
2165 """
2166 Infer the local ceph image based on the following priority criteria:
2167 1- the image specified by --image arg (if provided).
2168 2- the same image as the daemon container specified by --name arg (if provided).
2169 3- image used by any ceph container running on the host. In this case we use daemon types.
2170 4- if no container is found, then we use the most recent ceph image on the host.
2171
2172 Note: any selected container must have the same fsid inferred previously.
2173
2174 :return: The most recent local ceph image (already pulled)
2175 """
2176 # '|' special character is used to separate the output fields into:
2177 # - Repository@digest
2178 # - Image Id
2179 # - Image Tag
2180 # - Image creation date
2181 out, _, _ = call_throws(ctx,
2182 [container_path, 'images',
2183 '--filter', 'label=ceph=True',
2184 '--filter', 'dangling=false',
2185 '--format', '{{.Repository}}@{{.Digest}}|{{.ID}}|{{.Tag}}|{{.CreatedAt}}'])
2186
2187 container_info = None
2188 daemon_name = ctx.name if ('name' in ctx and ctx.name and '.' in ctx.name) else None
2189 daemons_ls = [daemon_name] if daemon_name is not None else Ceph.daemons # daemon types: 'mon', 'mgr', etc
2190 for daemon in daemons_ls:
2191 container_info = get_container_info(ctx, daemon, daemon_name is not None)
2192 if container_info is not None:
2193 logger.debug(f"Using container info for daemon '{daemon}'")
2194 break
2195
2196 for image in out.splitlines():
2197 if image and not image.isspace():
2198 (digest, image_id, tag, created_date) = image.lstrip().split('|')
2199 if container_info is not None and image_id not in container_info.image_id:
2200 continue
2201 if digest and not digest.endswith('@'):
2202 logger.info(f"Using ceph image with id '{image_id}' and tag '{tag}' created on {created_date}\n{digest}")
2203 return digest
2204 return None
2205
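# Illustrative `images` output line consumed above (a sketch; fields are
# separated by '|', placeholders in angle brackets):
#     quay.io/ceph/ceph@sha256:<digest>|<image-id>|v17|<created-date>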
2206
2207 def write_tmp(s, uid, gid):
2208 # type: (str, int, int) -> IO[str]
2209 tmp_f = tempfile.NamedTemporaryFile(mode='w',
2210 prefix='ceph-tmp')
2211 os.fchown(tmp_f.fileno(), uid, gid)
2212 tmp_f.write(s)
2213 tmp_f.flush()
2214
2215 return tmp_f
2216
2217
2218 def makedirs(dir, uid, gid, mode):
2219 # type: (str, int, int, int) -> None
2220 if not os.path.exists(dir):
2221 os.makedirs(dir, mode=mode)
2222 else:
2223 os.chmod(dir, mode)
2224 os.chown(dir, uid, gid)
2225 os.chmod(dir, mode) # the above is masked by umask...
2226
2227
2228 def get_data_dir(fsid, data_dir, t, n):
2229 # type: (str, str, str, Union[int, str]) -> str
2230 return os.path.join(data_dir, fsid, '%s.%s' % (t, n))
2231
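# Example (illustrative):
#     get_data_dir('<fsid>', '/var/lib/ceph', 'mon', 'host1')
#         -> '/var/lib/ceph/<fsid>/mon.host1'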
2232
2233 def get_log_dir(fsid, log_dir):
2234 # type: (str, str) -> str
2235 return os.path.join(log_dir, fsid)
2236
2237
2238 def make_data_dir_base(fsid, data_dir, uid, gid):
2239 # type: (str, str, int, int) -> str
2240 data_dir_base = os.path.join(data_dir, fsid)
2241 makedirs(data_dir_base, uid, gid, DATA_DIR_MODE)
2242 makedirs(os.path.join(data_dir_base, 'crash'), uid, gid, DATA_DIR_MODE)
2243 makedirs(os.path.join(data_dir_base, 'crash', 'posted'), uid, gid,
2244 DATA_DIR_MODE)
2245 return data_dir_base
2246
2247
2248 def make_data_dir(ctx, fsid, daemon_type, daemon_id, uid=None, gid=None):
2249 # type: (CephadmContext, str, str, Union[int, str], Optional[int], Optional[int]) -> str
2250 if uid is None or gid is None:
2251 uid, gid = extract_uid_gid(ctx)
2252 make_data_dir_base(fsid, ctx.data_dir, uid, gid)
2253 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2254 makedirs(data_dir, uid, gid, DATA_DIR_MODE)
2255 return data_dir
2256
2257
2258 def make_log_dir(ctx, fsid, uid=None, gid=None):
2259 # type: (CephadmContext, str, Optional[int], Optional[int]) -> str
2260 if uid is None or gid is None:
2261 uid, gid = extract_uid_gid(ctx)
2262 log_dir = get_log_dir(fsid, ctx.log_dir)
2263 makedirs(log_dir, uid, gid, LOG_DIR_MODE)
2264 return log_dir
2265
2266
2267 def make_var_run(ctx, fsid, uid, gid):
2268 # type: (CephadmContext, str, int, int) -> None
2269 call_throws(ctx, ['install', '-d', '-m0770', '-o', str(uid), '-g', str(gid),
2270 '/var/run/ceph/%s' % fsid])
2271
2272
2273 def copy_tree(ctx, src, dst, uid=None, gid=None):
2274 # type: (CephadmContext, List[str], str, Optional[int], Optional[int]) -> None
2275 """
2276 Copy a directory tree from src to dst
2277 """
2278 if uid is None or gid is None:
2279 (uid, gid) = extract_uid_gid(ctx)
2280
2281 for src_dir in src:
2282 dst_dir = dst
2283 if os.path.isdir(dst):
2284 dst_dir = os.path.join(dst, os.path.basename(src_dir))
2285
2286 logger.debug('copy directory `%s` -> `%s`' % (src_dir, dst_dir))
2287 shutil.rmtree(dst_dir, ignore_errors=True)
2288 shutil.copytree(src_dir, dst_dir) # dirs_exist_ok needs python 3.8
2289
2290 for dirpath, dirnames, filenames in os.walk(dst_dir):
2291 logger.debug('chown %s:%s `%s`' % (uid, gid, dirpath))
2292 os.chown(dirpath, uid, gid)
2293 for filename in filenames:
2294 logger.debug('chown %s:%s `%s`' % (uid, gid, filename))
2295 os.chown(os.path.join(dirpath, filename), uid, gid)
2296
2297
2298 def copy_files(ctx, src, dst, uid=None, gid=None):
2299 # type: (CephadmContext, List[str], str, Optional[int], Optional[int]) -> None
2300 """
2301 Copy files from src to dst
2302 """
2303 if uid is None or gid is None:
2304 (uid, gid) = extract_uid_gid(ctx)
2305
2306 for src_file in src:
2307 dst_file = dst
2308 if os.path.isdir(dst):
2309 dst_file = os.path.join(dst, os.path.basename(src_file))
2310
2311 logger.debug('copy file `%s` -> `%s`' % (src_file, dst_file))
2312 shutil.copyfile(src_file, dst_file)
2313
2314 logger.debug('chown %s:%s `%s`' % (uid, gid, dst_file))
2315 os.chown(dst_file, uid, gid)
2316
2317
2318 def move_files(ctx, src, dst, uid=None, gid=None):
2319 # type: (CephadmContext, List[str], str, Optional[int], Optional[int]) -> None
2320 """
2321 Move files from src to dst
2322 """
2323 if uid is None or gid is None:
2324 (uid, gid) = extract_uid_gid(ctx)
2325
2326 for src_file in src:
2327 dst_file = dst
2328 if os.path.isdir(dst):
2329 dst_file = os.path.join(dst, os.path.basename(src_file))
2330
2331 if os.path.islink(src_file):
2332 # shutil.move() in py2 does not handle symlinks correctly
2333 src_rl = os.readlink(src_file)
2334 logger.debug("symlink '%s' -> '%s'" % (dst_file, src_rl))
2335 os.symlink(src_rl, dst_file)
2336 os.unlink(src_file)
2337 else:
2338 logger.debug("move file '%s' -> '%s'" % (src_file, dst_file))
2339 shutil.move(src_file, dst_file)
2340 logger.debug('chown %s:%s `%s`' % (uid, gid, dst_file))
2341 os.chown(dst_file, uid, gid)
2342
2343
2344 def recursive_chown(path: str, uid: int, gid: int) -> None:
2345 for dirpath, dirnames, filenames in os.walk(path):
2346 os.chown(dirpath, uid, gid)
2347 for filename in filenames:
2348 os.chown(os.path.join(dirpath, filename), uid, gid)
2349
2350
2351 # copied from distutils
2352 def find_executable(executable: str, path: Optional[str] = None) -> Optional[str]:
2353 """Tries to find 'executable' in the directories listed in 'path'.
2354 'path' is a string listing directories separated by 'os.pathsep'; it
2355 defaults to os.environ['PATH']. Returns the complete filename or None if not found.
2356 """
2357 _, ext = os.path.splitext(executable)
2358 if (sys.platform == 'win32') and (ext != '.exe'):
2359 executable = executable + '.exe'
2360
2361 if os.path.isfile(executable):
2362 return executable
2363
2364 if path is None:
2365 path = os.environ.get('PATH', None)
2366 if path is None:
2367 try:
2368 path = os.confstr('CS_PATH')
2369 except (AttributeError, ValueError):
2370 # os.confstr() or CS_PATH is not available
2371 path = os.defpath
2372 # bpo-35755: Don't use os.defpath if the PATH environment variable is
2373 # set to an empty string
2374
2375 # PATH='' doesn't match, whereas PATH=':' looks in the current directory
2376 if not path:
2377 return None
2378
2379 paths = path.split(os.pathsep)
2380 for p in paths:
2381 f = os.path.join(p, executable)
2382 if os.path.isfile(f):
2383 # the file exists, we have a shot at spawn working
2384 return f
2385 return None
2386
2387
2388 def find_program(filename):
2389 # type: (str) -> str
2390 name = find_executable(filename)
2391 if name is None:
2392 raise ValueError('%s not found' % filename)
2393 return name
2394
2395
2396 def find_container_engine(ctx: CephadmContext) -> Optional[ContainerEngine]:
2397 if ctx.docker:
2398 return Docker()
2399 else:
2400 for i in CONTAINER_PREFERENCE:
2401 try:
2402 return i()
2403 except Exception:
2404 pass
2405 return None
2406
2407
2408 def check_container_engine(ctx: CephadmContext) -> ContainerEngine:
2409 engine = ctx.container_engine
2410 if not isinstance(engine, CONTAINER_PREFERENCE):
2411 # See https://github.com/python/mypy/issues/8993
2412 exes: List[str] = [i.EXE for i in CONTAINER_PREFERENCE] # type: ignore
2413 raise Error('No container engine binary found ({}). Try running `apt/dnf/yum/zypper install <container engine>`'.format(' or '.join(exes)))
2414 elif isinstance(engine, Podman):
2415 engine.get_version(ctx)
2416 if engine.version < MIN_PODMAN_VERSION:
2417 raise Error('podman version %d.%d.%d or later is required' % MIN_PODMAN_VERSION)
2418 return engine
2419
2420
2421 def get_unit_name(fsid, daemon_type, daemon_id=None):
2422 # type: (str, str, Optional[Union[int, str]]) -> str
2423 # accept either name or type + id
2424 if daemon_id is not None:
2425 return 'ceph-%s@%s.%s' % (fsid, daemon_type, daemon_id)
2426 else:
2427 return 'ceph-%s@%s' % (fsid, daemon_type)
2428
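# Examples (illustrative):
#     get_unit_name('<fsid>', 'osd', 7)  -> 'ceph-<fsid>@osd.7'
#     get_unit_name('<fsid>', 'mon')     -> 'ceph-<fsid>@mon'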
2429
2430 def get_unit_name_by_daemon_name(ctx: CephadmContext, fsid: str, name: str) -> str:
2431 daemon = get_daemon_description(ctx, fsid, name)
2432 try:
2433 return daemon['systemd_unit']
2434 except KeyError:
2435 raise Error('Failed to get unit name for {}'.format(daemon))
2436
2437
2438 def check_unit(ctx, unit_name):
2439 # type: (CephadmContext, str) -> Tuple[bool, str, bool]
2440 # NOTE: we ignore the exit code here because systemctl outputs
2441 # various exit codes based on the state of the service, but the
2442 # string result is more explicit (and sufficient).
2443 enabled = False
2444 installed = False
2445 try:
2446 out, err, code = call(ctx, ['systemctl', 'is-enabled', unit_name],
2447 verbosity=CallVerbosity.QUIET)
2448 if code == 0:
2449 enabled = True
2450 installed = True
2451 elif 'disabled' in out:
2452 installed = True
2453 except Exception as e:
2454 logger.warning('unable to run systemctl: %s' % e)
2455 enabled = False
2456 installed = False
2457
2458 state = 'unknown'
2459 try:
2460 out, err, code = call(ctx, ['systemctl', 'is-active', unit_name],
2461 verbosity=CallVerbosity.QUIET)
2462 out = out.strip()
2463 if out in ['active']:
2464 state = 'running'
2465 elif out in ['inactive']:
2466 state = 'stopped'
2467 elif out in ['failed', 'auto-restart']:
2468 state = 'error'
2469 else:
2470 state = 'unknown'
2471 except Exception as e:
2472 logger.warning('unable to run systemctl: %s' % e)
2473 state = 'unknown'
2474 return (enabled, state, installed)
2475
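# Example (illustrative): a unit that is enabled and active yields
#     (True, 'running', True)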
2476
2477 def check_units(ctx, units, enabler=None):
2478 # type: (CephadmContext, List[str], Optional[Packager]) -> bool
2479 for u in units:
2480 (enabled, state, installed) = check_unit(ctx, u)
2481 if enabled and state == 'running':
2482 logger.info('Unit %s is enabled and running' % u)
2483 return True
2484 if enabler is not None:
2485 if installed:
2486 logger.info('Enabling unit %s' % u)
2487 enabler.enable_service(u)
2488 return False
2489
2490
2491 def is_container_running(ctx: CephadmContext, c: 'CephContainer') -> bool:
2492 if ctx.name.split('.', 1)[0] in ['agent', 'cephadm-exporter']:
2493 # these are non-containerized daemon types
2494 return False
2495 return bool(get_running_container_name(ctx, c))
2496
2497
2498 def get_running_container_name(ctx: CephadmContext, c: 'CephContainer') -> Optional[str]:
2499 for name in [c.cname, c.old_cname]:
2500 out, err, ret = call(ctx, [
2501 ctx.container_engine.path, 'container', 'inspect',
2502 '--format', '{{.State.Status}}', name
2503 ])
2504 if out.strip() == 'running':
2505 return name
2506 return None
2507
2508
2509 def get_legacy_config_fsid(cluster, legacy_dir=None):
2510 # type: (str, Optional[str]) -> Optional[str]
2511 config_file = '/etc/ceph/%s.conf' % cluster
2512 if legacy_dir is not None:
2513 config_file = os.path.abspath(legacy_dir + config_file)
2514
2515 if os.path.exists(config_file):
2516 config = read_config(config_file)
2517 if config.has_section('global') and config.has_option('global', 'fsid'):
2518 return config.get('global', 'fsid')
2519 return None
2520
2521
2522 def get_legacy_daemon_fsid(ctx, cluster,
2523 daemon_type, daemon_id, legacy_dir=None):
2524 # type: (CephadmContext, str, str, Union[int, str], Optional[str]) -> Optional[str]
2525 fsid = None
2526 if daemon_type == 'osd':
2527 try:
2528 fsid_file = os.path.join(ctx.data_dir,
2529 daemon_type,
2530 'ceph-%s' % daemon_id,
2531 'ceph_fsid')
2532 if legacy_dir is not None:
2533 fsid_file = os.path.abspath(legacy_dir + fsid_file)
2534 with open(fsid_file, 'r') as f:
2535 fsid = f.read().strip()
2536 except IOError:
2537 pass
2538 if not fsid:
2539 fsid = get_legacy_config_fsid(cluster, legacy_dir=legacy_dir)
2540 return fsid
2541
2542
2543 def should_log_to_journald(ctx: CephadmContext) -> bool:
2544 if ctx.log_to_journald is not None:
2545 return ctx.log_to_journald
2546 return isinstance(ctx.container_engine, Podman) and \
2547 ctx.container_engine.version >= CGROUPS_SPLIT_PODMAN_VERSION
2548
2549
2550 def get_daemon_args(ctx, fsid, daemon_type, daemon_id):
2551 # type: (CephadmContext, str, str, Union[int, str]) -> List[str]
2552 r = list() # type: List[str]
2553
2554 if daemon_type in Ceph.daemons and daemon_type != 'crash':
2555 r += [
2556 '--setuser', 'ceph',
2557 '--setgroup', 'ceph',
2558 '--default-log-to-file=false',
2559 ]
2560 log_to_journald = should_log_to_journald(ctx)
2561 if log_to_journald:
2562 r += [
2563 '--default-log-to-journald=true',
2564 '--default-log-to-stderr=false',
2565 ]
2566 else:
2567 r += [
2568 '--default-log-to-stderr=true',
2569 '--default-log-stderr-prefix=debug ',
2570 ]
2571 if daemon_type == 'mon':
2572 r += [
2573 '--default-mon-cluster-log-to-file=false',
2574 ]
2575 if log_to_journald:
2576 r += [
2577 '--default-mon-cluster-log-to-journald=true',
2578 '--default-mon-cluster-log-to-stderr=false',
2579 ]
2580 else:
2581 r += ['--default-mon-cluster-log-to-stderr=true']
2582 elif daemon_type in Monitoring.components:
2583 metadata = Monitoring.components[daemon_type]
2584 r += metadata.get('args', list())
2585 # set ip and port to bind to for nodeexporter,alertmanager,prometheus
2586 if daemon_type not in ['grafana', 'loki', 'promtail']:
2587 ip = ''
2588 port = Monitoring.port_map[daemon_type][0]
2589 if 'meta_json' in ctx and ctx.meta_json:
2590 meta = json.loads(ctx.meta_json) or {}
2591 if 'ip' in meta and meta['ip']:
2592 ip = meta['ip']
2593 if 'ports' in meta and meta['ports']:
2594 port = meta['ports'][0]
2595 r += [f'--web.listen-address={ip}:{port}']
2596 if daemon_type == 'prometheus':
2597 scheme = 'http'
2598 host = get_fqdn()
2599 r += [f'--web.external-url={scheme}://{host}:{port}']
2600 if daemon_type == 'alertmanager':
2601 config = get_parm(ctx.config_json)
2602 peers = config.get('peers', list()) # type: ignore
2603 for peer in peers:
2604 r += ['--cluster.peer={}'.format(peer)]
2605 # alertmanager, by default, looks elsewhere for its config
2606 r += ['--config.file=/etc/alertmanager/alertmanager.yml']
2607 if daemon_type == 'promtail':
2608 r += ['--config.expand-env']
2609 if daemon_type == 'node-exporter':
2610 r += ['--path.procfs=/host/proc',
2611 '--path.sysfs=/host/sys',
2612 '--path.rootfs=/rootfs']
2613 elif daemon_type == NFSGanesha.daemon_type:
2614 nfs_ganesha = NFSGanesha.init(ctx, fsid, daemon_id)
2615 r += nfs_ganesha.get_daemon_args()
2616 elif daemon_type == HAproxy.daemon_type:
2617 haproxy = HAproxy.init(ctx, fsid, daemon_id)
2618 r += haproxy.get_daemon_args()
2619 elif daemon_type == CustomContainer.daemon_type:
2620 cc = CustomContainer.init(ctx, fsid, daemon_id)
2621 r.extend(cc.get_daemon_args())
2622 elif daemon_type == SNMPGateway.daemon_type:
2623 sc = SNMPGateway.init(ctx, fsid, daemon_id)
2624 r.extend(sc.get_daemon_args())
2625
2626 return r
2627
2628
2629 def create_daemon_dirs(ctx, fsid, daemon_type, daemon_id, uid, gid,
2630 config=None, keyring=None):
2631 # type: (CephadmContext, str, str, Union[int, str], int, int, Optional[str], Optional[str]) -> None
2632 data_dir = make_data_dir(ctx, fsid, daemon_type, daemon_id, uid=uid, gid=gid)
2633
2634 if daemon_type in Ceph.daemons:
2635 make_log_dir(ctx, fsid, uid=uid, gid=gid)
2636
2637 if config:
2638 config_path = os.path.join(data_dir, 'config')
2639 with open(config_path, 'w') as f:
2640 os.fchown(f.fileno(), uid, gid)
2641 os.fchmod(f.fileno(), 0o600)
2642 f.write(config)
2643
2644 if keyring:
2645 keyring_path = os.path.join(data_dir, 'keyring')
2646 with open(keyring_path, 'w') as f:
2647 os.fchmod(f.fileno(), 0o600)
2648 os.fchown(f.fileno(), uid, gid)
2649 f.write(keyring)
2650
2651 if daemon_type in Monitoring.components.keys():
2652 config_json: Dict[str, Any] = dict()
2653 if 'config_json' in ctx:
2654 config_json = get_parm(ctx.config_json)
2655
2656 # Set up directories specific to the monitoring component
2657 config_dir = ''
2658 data_dir_root = ''
2659 if daemon_type == 'prometheus':
2660 data_dir_root = get_data_dir(fsid, ctx.data_dir,
2661 daemon_type, daemon_id)
2662 config_dir = 'etc/prometheus'
2663 makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755)
2664 makedirs(os.path.join(data_dir_root, config_dir, 'alerting'), uid, gid, 0o755)
2665 makedirs(os.path.join(data_dir_root, 'data'), uid, gid, 0o755)
2666 recursive_chown(os.path.join(data_dir_root, 'etc'), uid, gid)
2667 recursive_chown(os.path.join(data_dir_root, 'data'), uid, gid)
2668 elif daemon_type == 'grafana':
2669 data_dir_root = get_data_dir(fsid, ctx.data_dir,
2670 daemon_type, daemon_id)
2671 config_dir = 'etc/grafana'
2672 makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755)
2673 makedirs(os.path.join(data_dir_root, config_dir, 'certs'), uid, gid, 0o755)
2674 makedirs(os.path.join(data_dir_root, config_dir, 'provisioning/datasources'), uid, gid, 0o755)
2675 makedirs(os.path.join(data_dir_root, 'data'), uid, gid, 0o755)
2676 touch(os.path.join(data_dir_root, 'data', 'grafana.db'), uid, gid)
2677 elif daemon_type == 'alertmanager':
2678 data_dir_root = get_data_dir(fsid, ctx.data_dir,
2679 daemon_type, daemon_id)
2680 config_dir = 'etc/alertmanager'
2681 makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755)
2682 makedirs(os.path.join(data_dir_root, config_dir, 'data'), uid, gid, 0o755)
2683 elif daemon_type == 'promtail':
2684 data_dir_root = get_data_dir(fsid, ctx.data_dir,
2685 daemon_type, daemon_id)
2686 config_dir = 'etc/promtail'
2687 makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755)
2688 makedirs(os.path.join(data_dir_root, 'data'), uid, gid, 0o755)
2689 elif daemon_type == 'loki':
2690 data_dir_root = get_data_dir(fsid, ctx.data_dir,
2691 daemon_type, daemon_id)
2692 config_dir = 'etc/loki'
2693 makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755)
2694 makedirs(os.path.join(data_dir_root, 'data'), uid, gid, 0o755)
2695
2696 # populate the config directory for the component from the config-json
2697 if 'files' in config_json:
2698 for fname in config_json['files']:
2699 content = dict_get_join(config_json['files'], fname)
2700 if os.path.isabs(fname):
2701 fpath = os.path.join(data_dir_root, fname.lstrip(os.path.sep))
2702 else:
2703 fpath = os.path.join(data_dir_root, config_dir, fname)
2704 with open(fpath, 'w', encoding='utf-8') as f:
2705 os.fchown(f.fileno(), uid, gid)
2706 os.fchmod(f.fileno(), 0o600)
2707 f.write(content)
2708
2709 elif daemon_type == NFSGanesha.daemon_type:
2710 nfs_ganesha = NFSGanesha.init(ctx, fsid, daemon_id)
2711 nfs_ganesha.create_daemon_dirs(data_dir, uid, gid)
2712
2713 elif daemon_type == CephIscsi.daemon_type:
2714 ceph_iscsi = CephIscsi.init(ctx, fsid, daemon_id)
2715 ceph_iscsi.create_daemon_dirs(data_dir, uid, gid)
2716
2717 elif daemon_type == HAproxy.daemon_type:
2718 haproxy = HAproxy.init(ctx, fsid, daemon_id)
2719 haproxy.create_daemon_dirs(data_dir, uid, gid)
2720
2721 elif daemon_type == Keepalived.daemon_type:
2722 keepalived = Keepalived.init(ctx, fsid, daemon_id)
2723 keepalived.create_daemon_dirs(data_dir, uid, gid)
2724
2725 elif daemon_type == CustomContainer.daemon_type:
2726 cc = CustomContainer.init(ctx, fsid, daemon_id)
2727 cc.create_daemon_dirs(data_dir, uid, gid)
2728
2729 elif daemon_type == SNMPGateway.daemon_type:
2730 sg = SNMPGateway.init(ctx, fsid, daemon_id)
2731 sg.create_daemon_conf()
2732
2733 _write_custom_conf_files(ctx, daemon_type, str(daemon_id), fsid, uid, gid)
2734
2735
2736 def _write_custom_conf_files(ctx: CephadmContext, daemon_type: str, daemon_id: str, fsid: str, uid: int, gid: int) -> None:
2737 # mostly making this its own function to make unit testing easier
2738 if 'config_json' not in ctx or not ctx.config_json:
2739 return
2740 config_json = get_custom_config_files(ctx.config_json)
2741 custom_config_dir = os.path.join(ctx.data_dir, fsid, 'custom_config_files', f'{daemon_type}.{daemon_id}')
2742 if not os.path.exists(custom_config_dir):
2743 makedirs(custom_config_dir, uid, gid, 0o755)
2744 mandatory_keys = ['mount_path', 'content']
2745 for ccf in config_json['custom_config_files']:
2746 if all(k in ccf for k in mandatory_keys):
2747 file_path = os.path.join(custom_config_dir, os.path.basename(ccf['mount_path']))
2748 with open(file_path, 'w+', encoding='utf-8') as f:
2749 os.fchown(f.fileno(), uid, gid)
2750 os.fchmod(f.fileno(), 0o600)
2751 f.write(ccf['content'])
2752
2753
2754 def get_parm(option: str) -> Dict[str, str]:
2755 js = _get_config_json(option)
2756 # custom_config_files is a special field that may be in the config
2757 # dict. It is used for mounting custom config files into daemons' containers
2758 # and should be accessed through the "get_custom_config_files" function.
2759 # For get_parm we need to discard it.
2760 js.pop('custom_config_files', None)
2761 return js
2762
2763
2764 def get_custom_config_files(option: str) -> Dict[str, List[Dict[str, str]]]:
2765 js = _get_config_json(option)
2766 res: Dict[str, List[Dict[str, str]]] = {'custom_config_files': []}
2767 if 'custom_config_files' in js:
2768 res['custom_config_files'] = js['custom_config_files']
2769 return res
2770
2771
2772 def _get_config_json(option: str) -> Dict[str, Any]:
2773 if not option:
2774 return dict()
2775
2776 global cached_stdin
2777 if option == '-':
2778 if cached_stdin is not None:
2779 j = cached_stdin
2780 else:
2781 j = sys.stdin.read()
2782 cached_stdin = j
2783 else:
2784 # inline json string
2785 if option[0] == '{' and option[-1] == '}':
2786 j = option
2787 # json file
2788 elif os.path.exists(option):
2789 with open(option, 'r') as f:
2790 j = f.read()
2791 else:
2792 raise Error('Config file {} not found'.format(option))
2793
2794 try:
2795 js = json.loads(j)
2796 except ValueError as e:
2797 raise Error('Invalid JSON in {}: {}'.format(option, e))
2798 else:
2799 return js
2800
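# Illustrative forms accepted for `option` (a sketch):
#     _get_config_json('{"config": "...", "keyring": "..."}')  # inline JSON
#     _get_config_json('/tmp/config.json')   # path to a JSON file
#     _get_config_json('-')                  # read stdin (cached for reuse)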
2801
2802 def get_config_and_keyring(ctx):
2803 # type: (CephadmContext) -> Tuple[Optional[str], Optional[str]]
2804 config = None
2805 keyring = None
2806
2807 if 'config_json' in ctx and ctx.config_json:
2808 d = get_parm(ctx.config_json)
2809 config = d.get('config')
2810 keyring = d.get('keyring')
2811 if config and keyring:
2812 return config, keyring
2813
2814 if 'config' in ctx and ctx.config:
2815 try:
2816 with open(ctx.config, 'r') as f:
2817 config = f.read()
2818 except FileNotFoundError as e:
2819 raise Error(e)
2820
2821 if 'key' in ctx and ctx.key:
2822 keyring = '[%s]\n\tkey = %s\n' % (ctx.name, ctx.key)
2823 elif 'keyring' in ctx and ctx.keyring:
2824 try:
2825 with open(ctx.keyring, 'r') as f:
2826 keyring = f.read()
2827 except FileNotFoundError as e:
2828 raise Error(e)
2829
2830 return config, keyring
2831
2832
2833 def get_container_binds(ctx, fsid, daemon_type, daemon_id):
2834 # type: (CephadmContext, str, str, Union[int, str, None]) -> List[List[str]]
2835 binds = list()
2836
2837 if daemon_type == CephIscsi.daemon_type:
2838 binds.extend(CephIscsi.get_container_binds())
2839 elif daemon_type == CustomContainer.daemon_type:
2840 assert daemon_id
2841 cc = CustomContainer.init(ctx, fsid, daemon_id)
2842 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2843 binds.extend(cc.get_container_binds(data_dir))
2844
2845 return binds
2846
2847
2848 def get_container_mounts(ctx, fsid, daemon_type, daemon_id,
2849 no_config=False):
2850 # type: (CephadmContext, str, str, Union[int, str, None], Optional[bool]) -> Dict[str, str]
2851 mounts = dict()
2852
2853 if daemon_type in Ceph.daemons:
2854 if fsid:
2855 run_path = os.path.join('/var/run/ceph', fsid)
2856 if os.path.exists(run_path):
2857 mounts[run_path] = '/var/run/ceph:z'
2858 log_dir = get_log_dir(fsid, ctx.log_dir)
2859 mounts[log_dir] = '/var/log/ceph:z'
2860 crash_dir = '/var/lib/ceph/%s/crash' % fsid
2861 if os.path.exists(crash_dir):
2862 mounts[crash_dir] = '/var/lib/ceph/crash:z'
2863 if daemon_type != 'crash' and should_log_to_journald(ctx):
2864 journald_sock_dir = '/run/systemd/journal'
2865 mounts[journald_sock_dir] = journald_sock_dir
2866
2867 if daemon_type in Ceph.daemons and daemon_id:
2868 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2869 if daemon_type == 'rgw':
2870 cdata_dir = '/var/lib/ceph/radosgw/ceph-rgw.%s' % (daemon_id)
2871 else:
2872 cdata_dir = '/var/lib/ceph/%s/ceph-%s' % (daemon_type, daemon_id)
2873 if daemon_type != 'crash':
2874 mounts[data_dir] = cdata_dir + ':z'
2875 if not no_config:
2876 mounts[data_dir + '/config'] = '/etc/ceph/ceph.conf:z'
2877 if daemon_type in ['rbd-mirror', 'cephfs-mirror', 'crash']:
2878 # these do not search for their keyrings in a data directory
2879 mounts[data_dir + '/keyring'] = '/etc/ceph/ceph.client.%s.%s.keyring' % (daemon_type, daemon_id)
2880
2881 if daemon_type in ['mon', 'osd', 'clusterless-ceph-volume']:
2882 mounts['/dev'] = '/dev' # FIXME: narrow this down?
2883 mounts['/run/udev'] = '/run/udev'
2884 if daemon_type in ['osd', 'clusterless-ceph-volume']:
2885 mounts['/sys'] = '/sys' # for numa.cc, pick_address, cgroups, ...
2886 mounts['/run/lvm'] = '/run/lvm'
2887 mounts['/run/lock/lvm'] = '/run/lock/lvm'
2888 if daemon_type == 'osd':
2889 # selinux-policy in the container may not match the host.
2890 if HostFacts(ctx).selinux_enabled:
2891 selinux_folder = '/var/lib/ceph/%s/selinux' % fsid
2892 if not os.path.exists(selinux_folder):
2893 os.makedirs(selinux_folder, mode=0o755)
2894 mounts[selinux_folder] = '/sys/fs/selinux:ro'
2895 mounts['/'] = '/rootfs'
2896
2897 try:
2898 if ctx.shared_ceph_folder:  # makes mgr module/ceph-volume development easier
2899 ceph_folder = pathify(ctx.shared_ceph_folder)
2900 if os.path.exists(ceph_folder):
2901 mounts[ceph_folder + '/src/ceph-volume/ceph_volume'] = '/usr/lib/python3.6/site-packages/ceph_volume'
2902 mounts[ceph_folder + '/src/cephadm/cephadm'] = '/usr/sbin/cephadm'
2903 mounts[ceph_folder + '/src/pybind/mgr'] = '/usr/share/ceph/mgr'
2904 mounts[ceph_folder + '/src/python-common/ceph'] = '/usr/lib/python3.6/site-packages/ceph'
2905 mounts[ceph_folder + '/monitoring/ceph-mixin/dashboards_out'] = '/etc/grafana/dashboards/ceph-dashboard'
2906 mounts[ceph_folder + '/monitoring/ceph-mixin/prometheus_alerts.yml'] = '/etc/prometheus/ceph/ceph_default_alerts.yml'
2907 else:
2908 logger.error('{}{}{}'.format(termcolor.red,
2909 'Ceph shared source folder does not exist.',
2910 termcolor.end))
2911 except AttributeError:
2912 pass
2913
2914 if daemon_type in Monitoring.components and daemon_id:
2915 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2916 log_dir = get_log_dir(fsid, ctx.log_dir)
2917 if daemon_type == 'prometheus':
2918 mounts[os.path.join(data_dir, 'etc/prometheus')] = '/etc/prometheus:Z'
2919 mounts[os.path.join(data_dir, 'data')] = '/prometheus:Z'
2920 elif daemon_type == 'loki':
2921 mounts[os.path.join(data_dir, 'etc/loki')] = '/etc/loki:Z'
2922 mounts[os.path.join(data_dir, 'data')] = '/loki:Z'
2923 elif daemon_type == 'promtail':
2924 mounts[os.path.join(data_dir, 'etc/promtail')] = '/etc/promtail:Z'
2925 mounts[log_dir] = '/var/log/ceph:z'
2926 mounts[os.path.join(data_dir, 'data')] = '/promtail:Z'
2927 elif daemon_type == 'node-exporter':
2928 mounts['/proc'] = '/host/proc:ro'
2929 mounts['/sys'] = '/host/sys:ro'
2930 mounts['/'] = '/rootfs:ro'
2931 elif daemon_type == 'grafana':
2932 mounts[os.path.join(data_dir, 'etc/grafana/grafana.ini')] = '/etc/grafana/grafana.ini:Z'
2933 mounts[os.path.join(data_dir, 'etc/grafana/provisioning/datasources')] = '/etc/grafana/provisioning/datasources:Z'
2934 mounts[os.path.join(data_dir, 'etc/grafana/certs')] = '/etc/grafana/certs:Z'
2935 mounts[os.path.join(data_dir, 'data/grafana.db')] = '/var/lib/grafana/grafana.db:Z'
2936 elif daemon_type == 'alertmanager':
2937 mounts[os.path.join(data_dir, 'etc/alertmanager')] = '/etc/alertmanager:Z'
2938
2939 if daemon_type == NFSGanesha.daemon_type:
2940 assert daemon_id
2941 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2942 nfs_ganesha = NFSGanesha.init(ctx, fsid, daemon_id)
2943 mounts.update(nfs_ganesha.get_container_mounts(data_dir))
2944
2945 if daemon_type == HAproxy.daemon_type:
2946 assert daemon_id
2947 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2948 mounts.update(HAproxy.get_container_mounts(data_dir))
2949
2950 if daemon_type == CephIscsi.daemon_type:
2951 assert daemon_id
2952 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2953 log_dir = get_log_dir(fsid, ctx.log_dir)
2954 mounts.update(CephIscsi.get_container_mounts(data_dir, log_dir))
2955
2956 if daemon_type == Keepalived.daemon_type:
2957 assert daemon_id
2958 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2959 mounts.update(Keepalived.get_container_mounts(data_dir))
2960
2961 if daemon_type == CustomContainer.daemon_type:
2962 assert daemon_id
2963 cc = CustomContainer.init(ctx, fsid, daemon_id)
2964 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2965 mounts.update(cc.get_container_mounts(data_dir))
2966
2967 return mounts
2968
2969
2970 def get_ceph_volume_container(ctx: CephadmContext,
2971 privileged: bool = True,
2972 cname: str = '',
2973 volume_mounts: Dict[str, str] = {},
2974 bind_mounts: Optional[List[List[str]]] = None,
2975 args: List[str] = [],
2976 envs: Optional[List[str]] = None) -> 'CephContainer':
2977 if envs is None:
2978 envs = []
2979 envs.append('CEPH_VOLUME_SKIP_RESTORECON=yes')
2980 envs.append('CEPH_VOLUME_DEBUG=1')
2981
2982 return CephContainer(
2983 ctx,
2984 image=ctx.image,
2985 entrypoint='/usr/sbin/ceph-volume',
2986 args=args,
2987 volume_mounts=volume_mounts,
2988 bind_mounts=bind_mounts,
2989 envs=envs,
2990 privileged=privileged,
2991 cname=cname,
2992 memory_request=ctx.memory_request,
2993 memory_limit=ctx.memory_limit,
2994 )
2995
2996
2997 def get_container(ctx: CephadmContext,
2998 fsid: str, daemon_type: str, daemon_id: Union[int, str],
2999 privileged: bool = False,
3000 ptrace: bool = False,
3001 container_args: Optional[List[str]] = None) -> 'CephContainer':
3002 entrypoint: str = ''
3003 name: str = ''
3004 ceph_args: List[str] = []
3005 envs: List[str] = []
3006 host_network: bool = True
3007
3008 if daemon_type in Ceph.daemons:
3009 envs.append('TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES=134217728')
3010 if container_args is None:
3011 container_args = []
3012 if daemon_type in ['mon', 'osd']:
3013 # mon and osd need privileged in order for libudev to query devices
3014 privileged = True
3015 if daemon_type == 'rgw':
3016 entrypoint = '/usr/bin/radosgw'
3017 name = 'client.rgw.%s' % daemon_id
3018 elif daemon_type == 'rbd-mirror':
3019 entrypoint = '/usr/bin/rbd-mirror'
3020 name = 'client.rbd-mirror.%s' % daemon_id
3021 elif daemon_type == 'cephfs-mirror':
3022 entrypoint = '/usr/bin/cephfs-mirror'
3023 name = 'client.cephfs-mirror.%s' % daemon_id
3024 elif daemon_type == 'crash':
3025 entrypoint = '/usr/bin/ceph-crash'
3026 name = 'client.crash.%s' % daemon_id
3027 elif daemon_type in ['mon', 'mgr', 'mds', 'osd']:
3028 entrypoint = '/usr/bin/ceph-' + daemon_type
3029 name = '%s.%s' % (daemon_type, daemon_id)
3030 elif daemon_type in Monitoring.components:
3031 entrypoint = ''
3032 elif daemon_type == NFSGanesha.daemon_type:
3033 entrypoint = NFSGanesha.entrypoint
3034 name = '%s.%s' % (daemon_type, daemon_id)
3035 envs.extend(NFSGanesha.get_container_envs())
3036 elif daemon_type == HAproxy.daemon_type:
3037 name = '%s.%s' % (daemon_type, daemon_id)
3038 container_args.extend(['--user=root']) # haproxy 2.4 defaults to a different user
3039 elif daemon_type == Keepalived.daemon_type:
3040 name = '%s.%s' % (daemon_type, daemon_id)
3041 envs.extend(Keepalived.get_container_envs())
3042 container_args.extend(['--cap-add=NET_ADMIN', '--cap-add=NET_RAW'])
3043 elif daemon_type == CephIscsi.daemon_type:
3044 entrypoint = CephIscsi.entrypoint
3045 name = '%s.%s' % (daemon_type, daemon_id)
3046 # So the container can modprobe iscsi_target_mod and have write perms
3047 # to configfs, we need to make this a privileged container.
3048 privileged = True
3049 elif daemon_type == CustomContainer.daemon_type:
3050 cc = CustomContainer.init(ctx, fsid, daemon_id)
3051 entrypoint = cc.entrypoint
3052 host_network = False
3053 envs.extend(cc.get_container_envs())
3054 container_args.extend(cc.get_container_args())
3055
3056 if daemon_type in Monitoring.components:
3057 uid, gid = extract_uid_gid_monitoring(ctx, daemon_type)
3058 monitoring_args = [
3059 '--user',
3060 str(uid),
3061 # FIXME: disable cpu/memory limits for the time being (not supported
3062 # by ubuntu 18.04 kernel!)
3063 ]
3064 container_args.extend(monitoring_args)
3065 if daemon_type == 'node-exporter':
3066 # in order to support setting '--path.procfs=/host/proc','--path.sysfs=/host/sys',
3067 # '--path.rootfs=/rootfs' for node-exporter we need to disable selinux separation
3068 # between the node-exporter container and the host to avoid selinux denials
3069 container_args.extend(['--security-opt', 'label=disable'])
3070 elif daemon_type == 'crash':
3071 ceph_args = ['-n', name]
3072 elif daemon_type in Ceph.daemons:
3073 ceph_args = ['-n', name, '-f']
3074 elif daemon_type == SNMPGateway.daemon_type:
3075 sg = SNMPGateway.init(ctx, fsid, daemon_id)
3076 container_args.append(
3077 f'--env-file={sg.conf_file_path}'
3078 )
3079
3080 # if using podman, set -d, --conmon-pidfile & --cidfile flags
3081 # so service can have Type=Forking
3082 if isinstance(ctx.container_engine, Podman):
3083 runtime_dir = '/run'
3084 container_args.extend([
3085 '-d', '--log-driver', 'journald',
3086 '--conmon-pidfile',
3087 runtime_dir + '/ceph-%s@%s.%s.service-pid' % (fsid, daemon_type, daemon_id),
3088 '--cidfile',
3089 runtime_dir + '/ceph-%s@%s.%s.service-cid' % (fsid, daemon_type, daemon_id),
3090 ])
3091 if ctx.container_engine.version >= CGROUPS_SPLIT_PODMAN_VERSION:
3092 container_args.append('--cgroups=split')
3093
3094 return CephContainer.for_daemon(
3095 ctx,
3096 fsid=fsid,
3097 daemon_type=daemon_type,
3098 daemon_id=str(daemon_id),
3099 entrypoint=entrypoint,
3100 args=ceph_args + get_daemon_args(ctx, fsid, daemon_type, daemon_id),
3101 container_args=container_args,
3102 volume_mounts=get_container_mounts(ctx, fsid, daemon_type, daemon_id),
3103 bind_mounts=get_container_binds(ctx, fsid, daemon_type, daemon_id),
3104 envs=envs,
3105 privileged=privileged,
3106 ptrace=ptrace,
3107 host_network=host_network,
3108 )
3109
3110
3111 def extract_uid_gid(ctx, img='', file_path='/var/lib/ceph'):
3112 # type: (CephadmContext, str, Union[str, List[str]]) -> Tuple[int, int]
3113
3114 if not img:
3115 img = ctx.image
3116
3117 if isinstance(file_path, str):
3118 paths = [file_path]
3119 else:
3120 paths = file_path
3121
3122 ex: Optional[Tuple[str, RuntimeError]] = None
3123
3124 for fp in paths:
3125 try:
3126 out = CephContainer(
3127 ctx,
3128 image=img,
3129 entrypoint='stat',
3130 args=['-c', '%u %g', fp]
3131 ).run(verbosity=CallVerbosity.QUIET_UNLESS_ERROR)
3132 uid, gid = out.split(' ')
3133 return int(uid), int(gid)
3134 except RuntimeError as e:
3135 ex = (fp, e)
3136 if ex:
3137 raise Error(f'Failed to extract uid/gid for path {ex[0]}: {ex[1]}')
3138
3139 raise RuntimeError('uid/gid not found')
3140
3141
3142 def deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid,
3143 config=None, keyring=None,
3144 osd_fsid=None,
3145 reconfig=False,
3146 ports=None):
3147 # type: (CephadmContext, str, str, Union[int, str], Optional[CephContainer], int, int, Optional[str], Optional[str], Optional[str], Optional[bool], Optional[List[int]]) -> None
3148
3149 ports = ports or []
3150 if any([port_in_use(ctx, port) for port in ports]):
3151 if daemon_type == 'mgr':
3152 # non-fatal for mgr when we are in mgr_standby_modules=false, but we can't
3153 # tell whether that is the case here.
3154 logger.warning(
3155 f"ceph-mgr TCP port(s) {','.join(map(str, ports))} already in use"
3156 )
3157 else:
3158 raise Error("TCP Port(s) '{}' required for {} already in use".format(','.join(map(str, ports)), daemon_type))
3159
3160 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
3161 if reconfig and not os.path.exists(data_dir):
3162 raise Error('cannot reconfig, data path %s does not exist' % data_dir)
3163 if daemon_type == 'mon' and not os.path.exists(data_dir):
3164 assert config
3165 assert keyring
3166 # tmp keyring file
3167 tmp_keyring = write_tmp(keyring, uid, gid)
3168
3169 # tmp config file
3170 tmp_config = write_tmp(config, uid, gid)
3171
3172 # --mkfs
3173 create_daemon_dirs(ctx, fsid, daemon_type, daemon_id, uid, gid)
3174 mon_dir = get_data_dir(fsid, ctx.data_dir, 'mon', daemon_id)
3175 log_dir = get_log_dir(fsid, ctx.log_dir)
3176 CephContainer(
3177 ctx,
3178 image=ctx.image,
3179 entrypoint='/usr/bin/ceph-mon',
3180 args=[
3181 '--mkfs',
3182 '-i', str(daemon_id),
3183 '--fsid', fsid,
3184 '-c', '/tmp/config',
3185 '--keyring', '/tmp/keyring',
3186 ] + get_daemon_args(ctx, fsid, 'mon', daemon_id),
3187 volume_mounts={
3188 log_dir: '/var/log/ceph:z',
3189 mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % (daemon_id),
3190 tmp_keyring.name: '/tmp/keyring:z',
3191 tmp_config.name: '/tmp/config:z',
3192 },
3193 ).run()
3194
3195 # write conf
3196 with open(mon_dir + '/config', 'w') as f:
3197 os.fchown(f.fileno(), uid, gid)
3198 os.fchmod(f.fileno(), 0o600)
3199 f.write(config)
3200 else:
3201 # dirs, conf, keyring
3202 create_daemon_dirs(
3203 ctx,
3204 fsid, daemon_type, daemon_id,
3205 uid, gid,
3206 config, keyring)
3207
3208 if not reconfig:
3209 if daemon_type == CephadmAgent.daemon_type:
3210 if ctx.config_json == '-':
3211 config_js = get_parm('-')
3212 else:
3213 config_js = get_parm(ctx.config_json)
3214 assert isinstance(config_js, dict)
3215
3216 cephadm_agent = CephadmAgent(ctx, fsid, daemon_id)
3217 cephadm_agent.deploy_daemon_unit(config_js)
3218 else:
3219 if c:
3220 deploy_daemon_units(ctx, fsid, uid, gid, daemon_type, daemon_id,
3221 c, osd_fsid=osd_fsid, ports=ports)
3222 else:
3223 raise RuntimeError('attempting to deploy a daemon without a container image')
3224
3225 if not os.path.exists(data_dir + '/unit.created'):
3226 with open(data_dir + '/unit.created', 'w') as f:
3227 os.fchmod(f.fileno(), 0o600)
3228 os.fchown(f.fileno(), uid, gid)
3229 f.write('mtime is time the daemon deployment was created\n')
3230
3231 with open(data_dir + '/unit.configured', 'w') as f:
3232 f.write('mtime is time we were last configured\n')
3233 os.fchmod(f.fileno(), 0o600)
3234 os.fchown(f.fileno(), uid, gid)
3235
3236 update_firewalld(ctx, daemon_type)
3237
3238 # Open ports explicitly required for the daemon
3239 if ports:
3240 fw = Firewalld(ctx)
3241 fw.open_ports(ports)
3242 fw.apply_rules()
3243
3244 if reconfig and daemon_type not in Ceph.daemons:
3245 # ceph daemons do not need a restart; others (presumably) do to pick
3246 # up the new config
3247 call_throws(ctx, ['systemctl', 'reset-failed',
3248 get_unit_name(fsid, daemon_type, daemon_id)])
3249 call_throws(ctx, ['systemctl', 'restart',
3250 get_unit_name(fsid, daemon_type, daemon_id)])
3251
3252
3253 def _write_container_cmd_to_bash(ctx, file_obj, container, comment=None, background=False):
3254 # type: (CephadmContext, IO[str], CephContainer, Optional[str], Optional[bool]) -> None
3255 if comment:
3256 # Sometimes adding a comment, especially if there are multiple containers in one
3257 # unit file, makes it easier to read and grok.
3258 file_obj.write('# ' + comment + '\n')
3259 # Sometimes, adding `--rm` to a run_cmd doesn't work. Let's remove the container manually
3260 file_obj.write('! ' + ' '.join(container.rm_cmd(old_cname=True)) + ' 2> /dev/null\n')
3261 file_obj.write('! ' + ' '.join(container.rm_cmd()) + ' 2> /dev/null\n')
3262 # Sometimes, `podman rm` doesn't find the container. Then you'll have to add `--storage`
3263 if isinstance(ctx.container_engine, Podman):
3264 file_obj.write(
3265 '! '
3266 + ' '.join([shlex.quote(a) for a in container.rm_cmd(storage=True)])
3267 + ' 2> /dev/null\n')
3268 file_obj.write(
3269 '! '
3270 + ' '.join([shlex.quote(a) for a in container.rm_cmd(old_cname=True, storage=True)])
3271 + ' 2> /dev/null\n')
3272
3273 # container run command
3274 file_obj.write(
3275 ' '.join([shlex.quote(a) for a in container.run_cmd()])
3276 + (' &' if background else '') + '\n')
3277
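# Illustrative unit.run fragment produced above (a sketch; the actual
# arguments come from container.rm_cmd()/run_cmd()):
#     # mon.host1
#     ! <engine> rm ... 2> /dev/null
#     ! <engine> rm ... 2> /dev/null
#     <engine> run ...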
3278
3279 def clean_cgroup(ctx: CephadmContext, fsid: str, unit_name: str) -> None:
3280 # systemd may fail to clean up cgroups from a previously stopped unit, which can cause the next "systemctl start" to fail.
3281 # see https://tracker.ceph.com/issues/50998
3282
3283 CGROUPV2_PATH = Path('/sys/fs/cgroup')
3284 if not (CGROUPV2_PATH / 'system.slice').exists():
3285 # Only unified cgroup is affected, skip if not the case
3286 return
3287
3288 slice_name = 'system-ceph\\x2d{}.slice'.format(fsid.replace('-', '\\x2d'))
3289 cg_path = CGROUPV2_PATH / 'system.slice' / slice_name / f'{unit_name}.service'
3290 if not cg_path.exists():
3291 return
3292
3293 def cg_trim(path: Path) -> None:
3294 for p in path.iterdir():
3295 if p.is_dir():
3296 cg_trim(p)
3297 path.rmdir()
3298 try:
3299 cg_trim(cg_path)
3300 except OSError:
3301 logger.warning(f'Failed to trim old cgroups {cg_path}')
3302
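# Example (illustrative): for fsid '1234-abcd' and unit
# 'ceph-1234-abcd@mon.host1', the trimmed path is
#     /sys/fs/cgroup/system.slice/system-ceph\x2d1234\x2dabcd.slice/ceph-1234-abcd@mon.host1.service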
3303
3304 def deploy_daemon_units(
3305 ctx: CephadmContext,
3306 fsid: str,
3307 uid: int,
3308 gid: int,
3309 daemon_type: str,
3310 daemon_id: Union[int, str],
3311 c: 'CephContainer',
3312 enable: bool = True,
3313 start: bool = True,
3314 osd_fsid: Optional[str] = None,
3315 ports: Optional[List[int]] = None,
3316 ) -> None:
3317 # cmd
3318 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
3319 with open(data_dir + '/unit.run.new', 'w') as f, \
3320 open(data_dir + '/unit.meta.new', 'w') as metaf:
3321 f.write('set -e\n')
3322
3323 if daemon_type in Ceph.daemons:
3324 install_path = find_program('install')
3325 f.write('{install_path} -d -m0770 -o {uid} -g {gid} /var/run/ceph/{fsid}\n'.format(install_path=install_path, fsid=fsid, uid=uid, gid=gid))
3326
3327 # pre-start cmd(s)
3328 if daemon_type == 'osd':
3329 # osds have a pre-start step
3330 assert osd_fsid
3331 simple_fn = os.path.join('/etc/ceph/osd',
3332 '%s-%s.json.adopted-by-cephadm' % (daemon_id, osd_fsid))
3333 if os.path.exists(simple_fn):
3334 f.write('# Simple OSDs need chown on startup:\n')
3335 for n in ['block', 'block.db', 'block.wal']:
3336 p = os.path.join(data_dir, n)
3337 f.write('[ ! -L {p} ] || chown {uid}:{gid} {p}\n'.format(p=p, uid=uid, gid=gid))
3338 else:
3339 # if ceph-volume does not support 'ceph-volume activate', we must
3340 # do 'ceph-volume lvm activate'.
3341 test_cv = get_ceph_volume_container(
3342 ctx,
3343 args=['activate', '--bad-option'],
3344 volume_mounts=get_container_mounts(ctx, fsid, daemon_type, daemon_id),
3345 bind_mounts=get_container_binds(ctx, fsid, daemon_type, daemon_id),
3346 cname='ceph-%s-%s.%s-activate-test' % (fsid, daemon_type, daemon_id),
3347 )
3348 out, err, ret = call(ctx, test_cv.run_cmd(), verbosity=CallVerbosity.SILENT)
3349 # bad: ceph-volume: error: unrecognized arguments: activate --bad-option
3350 # good: ceph-volume: error: unrecognized arguments: --bad-option
3351 if 'unrecognized arguments: activate' in err:
3352 # older ceph-volume without top-level activate or --no-tmpfs
3353 cmd = [
3354 'lvm', 'activate',
3355 str(daemon_id), osd_fsid,
3356 '--no-systemd',
3357 ]
3358 else:
3359 cmd = [
3360 'activate',
3361 '--osd-id', str(daemon_id),
3362 '--osd-uuid', osd_fsid,
3363 '--no-systemd',
3364 '--no-tmpfs',
3365 ]
3366
3367 prestart = get_ceph_volume_container(
3368 ctx,
3369 args=cmd,
3370 volume_mounts=get_container_mounts(ctx, fsid, daemon_type, daemon_id),
3371 bind_mounts=get_container_binds(ctx, fsid, daemon_type, daemon_id),
3372 cname='ceph-%s-%s.%s-activate' % (fsid, daemon_type, daemon_id),
3373 )
3374 _write_container_cmd_to_bash(ctx, f, prestart, 'LVM OSDs use ceph-volume lvm activate')
3375 elif daemon_type == CephIscsi.daemon_type:
3376 f.write(' '.join(CephIscsi.configfs_mount_umount(data_dir, mount=True)) + '\n')
3377 ceph_iscsi = CephIscsi.init(ctx, fsid, daemon_id)
3378 tcmu_container = ceph_iscsi.get_tcmu_runner_container()
3379 _write_container_cmd_to_bash(ctx, f, tcmu_container, 'iscsi tcmu-runner container', background=True)
3380
3381 _write_container_cmd_to_bash(ctx, f, c, '%s.%s' % (daemon_type, str(daemon_id)))
3382
3383 # some metadata about the deploy
3384 meta: Dict[str, Any] = {}
3385 if 'meta_json' in ctx and ctx.meta_json:
3386 meta = json.loads(ctx.meta_json) or {}
3387 meta.update({
3388 'memory_request': int(ctx.memory_request) if ctx.memory_request else None,
3389 'memory_limit': int(ctx.memory_limit) if ctx.memory_limit else None,
3390 })
3391 if not meta.get('ports'):
3392 meta['ports'] = ports
3393 metaf.write(json.dumps(meta, indent=4) + '\n')
3394
3395 os.fchmod(f.fileno(), 0o600)
3396 os.fchmod(metaf.fileno(), 0o600)
3397 os.rename(data_dir + '/unit.run.new',
3398 data_dir + '/unit.run')
3399 os.rename(data_dir + '/unit.meta.new',
3400 data_dir + '/unit.meta')
3401
3402 # post-stop command(s)
3403 with open(data_dir + '/unit.poststop.new', 'w') as f:
3404 if daemon_type == 'osd':
3405 assert osd_fsid
3406 poststop = get_ceph_volume_container(
3407 ctx,
3408 args=[
3409 'lvm', 'deactivate',
3410 str(daemon_id), osd_fsid,
3411 ],
3412 volume_mounts=get_container_mounts(ctx, fsid, daemon_type, daemon_id),
3413 bind_mounts=get_container_binds(ctx, fsid, daemon_type, daemon_id),
3414 cname='ceph-%s-%s.%s-deactivate' % (fsid, daemon_type,
3415 daemon_id),
3416 )
3417 _write_container_cmd_to_bash(ctx, f, poststop, 'deactivate osd')
3418 elif daemon_type == CephIscsi.daemon_type:
3419 # make sure we also stop the tcmu container
3420 ceph_iscsi = CephIscsi.init(ctx, fsid, daemon_id)
3421 tcmu_container = ceph_iscsi.get_tcmu_runner_container()
3422 f.write('! ' + ' '.join(tcmu_container.stop_cmd()) + '\n')
3423 f.write(' '.join(CephIscsi.configfs_mount_umount(data_dir, mount=False)) + '\n')
3424 os.fchmod(f.fileno(), 0o600)
3425 os.rename(data_dir + '/unit.poststop.new',
3426 data_dir + '/unit.poststop')
3427
3428 # stop command(s)
3429 with open(data_dir + '/unit.stop.new', 'w') as f:
3430 # the generated script checks whether the container exists before
3431 # stopping it; the exit code is success either if the container does
3432 # not exist or if it exists and is stopped successfully.
3433 container_exists = f'{ctx.container_engine.path} inspect %s &>/dev/null'
3434 f.write(f'! {container_exists % c.old_cname} || {" ".join(c.stop_cmd(old_cname=True))} \n')
3435 f.write(f'! {container_exists % c.cname} || {" ".join(c.stop_cmd())} \n')
3436
3437 os.fchmod(f.fileno(), 0o600)
3438 os.rename(data_dir + '/unit.stop.new',
3439 data_dir + '/unit.stop')
3440
3441 if c:
3442 with open(data_dir + '/unit.image.new', 'w') as f:
3443 f.write(c.image + '\n')
3444 os.fchmod(f.fileno(), 0o600)
3445 os.rename(data_dir + '/unit.image.new',
3446 data_dir + '/unit.image')
3447
3448 # sysctl
3449 install_sysctl(ctx, fsid, daemon_type)
3450
3451 # systemd
3452 install_base_units(ctx, fsid)
3453 unit = get_unit_file(ctx, fsid)
3454 unit_file = 'ceph-%s@.service' % (fsid)
3455 with open(ctx.unit_dir + '/' + unit_file + '.new', 'w') as f:
3456 f.write(unit)
3457 os.rename(ctx.unit_dir + '/' + unit_file + '.new',
3458 ctx.unit_dir + '/' + unit_file)
3459 call_throws(ctx, ['systemctl', 'daemon-reload'])
3460
3461 unit_name = get_unit_name(fsid, daemon_type, daemon_id)
3462 call(ctx, ['systemctl', 'stop', unit_name],
3463 verbosity=CallVerbosity.DEBUG)
3464 call(ctx, ['systemctl', 'reset-failed', unit_name],
3465 verbosity=CallVerbosity.DEBUG)
3466 if enable:
3467 call_throws(ctx, ['systemctl', 'enable', unit_name])
3468 if start:
3469 clean_cgroup(ctx, fsid, unit_name)
3470 call_throws(ctx, ['systemctl', 'start', unit_name])
3471
3472
3473 class Firewalld(object):
3474 def __init__(self, ctx):
3475 # type: (CephadmContext) -> None
3476 self.ctx = ctx
3477 self.available = self.check()
3478
3479 def check(self):
3480 # type: () -> bool
3481 self.cmd = find_executable('firewall-cmd')
3482 if not self.cmd:
3483 logger.debug('firewalld does not appear to be present')
3484 return False
3485 (enabled, state, _) = check_unit(self.ctx, 'firewalld.service')
3486 if not enabled:
3487 logger.debug('firewalld.service is not enabled')
3488 return False
3489 if state != 'running':
3490 logger.debug('firewalld.service is not running')
3491 return False
3492
3493 logger.info('firewalld ready')
3494 return True
3495
3496 def enable_service_for(self, daemon_type):
3497 # type: (str) -> None
3498 if not self.available:
3499 logger.debug('Not possible to enable service <%s>. firewalld.service is not available' % daemon_type)
3500 return
3501
3502 if daemon_type == 'mon':
3503 svc = 'ceph-mon'
3504 elif daemon_type in ['mgr', 'mds', 'osd']:
3505 svc = 'ceph'
3506 elif daemon_type == NFSGanesha.daemon_type:
3507 svc = 'nfs'
3508 else:
3509 return
3510
3511 if not self.cmd:
3512 raise RuntimeError('command not defined')
3513
3514 out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--query-service', svc], verbosity=CallVerbosity.DEBUG)
3515 if ret:
3516 logger.info('Enabling firewalld service %s in current zone...' % svc)
3517 out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--add-service', svc])
3518 if ret:
3519 raise RuntimeError(
3520 'unable to add service %s to current zone: %s' % (svc, err))
3521 else:
3522 logger.debug('firewalld service %s is enabled in current zone' % svc)
3523
3524 def open_ports(self, fw_ports):
3525 # type: (List[int]) -> None
3526 if not self.available:
3527 logger.debug('Not possible to open ports <%s>. firewalld.service is not available' % fw_ports)
3528 return
3529
3530 if not self.cmd:
3531 raise RuntimeError('command not defined')
3532
3533 for port in fw_ports:
3534 tcp_port = str(port) + '/tcp'
3535 out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--query-port', tcp_port], verbosity=CallVerbosity.DEBUG)
3536 if ret:
3537 logger.info('Enabling firewalld port %s in current zone...' % tcp_port)
3538 out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--add-port', tcp_port])
3539 if ret:
3540 raise RuntimeError('unable to add port %s to current zone: %s' %
3541 (tcp_port, err))
3542 else:
3543 logger.debug('firewalld port %s is enabled in current zone' % tcp_port)
3544
3545 def close_ports(self, fw_ports):
3546 # type: (List[int]) -> None
3547 if not self.available:
3548 logger.debug('Not possible to close ports <%s>. firewalld.service is not available' % fw_ports)
3549 return
3550
3551 if not self.cmd:
3552 raise RuntimeError('command not defined')
3553
3554 for port in fw_ports:
3555 tcp_port = str(port) + '/tcp'
3556 out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--query-port', tcp_port], verbosity=CallVerbosity.DEBUG)
3557 if not ret:
3558 logger.info('Disabling port %s in current zone...' % tcp_port)
3559 out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--remove-port', tcp_port])
3560 if ret:
3561 raise RuntimeError('unable to remove port %s from current zone: %s' %
3562 (tcp_port, err))
3563 else:
3564 logger.info(f'Port {tcp_port} disabled')
3565 else:
3566 logger.info(f'firewalld port {tcp_port} already closed')
3567
3568 def apply_rules(self):
3569 # type: () -> None
3570 if not self.available:
3571 return
3572
3573 if not self.cmd:
3574 raise RuntimeError('command not defined')
3575
3576 call_throws(self.ctx, [self.cmd, '--reload'])
3577
3578
3579 def update_firewalld(ctx, daemon_type):
3580 # type: (CephadmContext, str) -> None
3581 if not ('skip_firewalld' in ctx and ctx.skip_firewalld):
3582 firewall = Firewalld(ctx)
3583 firewall.enable_service_for(daemon_type)
3584 firewall.apply_rules()
3585
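# Editor's note: an illustrative usage sketch (not upstream cephadm code)
# showing how the Firewalld helper above is typically driven when a daemon
# exposes TCP ports. Assumes `ctx` is a populated CephadmContext; the port
# number is only an example (9283 is the mgr prometheus module default).
def _example_firewalld_usage(ctx):
    # type: (CephadmContext) -> None
    firewall = Firewalld(ctx)
    firewall.enable_service_for('mon')  # maps daemon type -> firewalld service
    firewall.open_ports([9283])         # adds 9283/tcp to the permanent config
    firewall.apply_rules()              # firewall-cmd --reload makes it live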
3586
3587 def install_sysctl(ctx: CephadmContext, fsid: str, daemon_type: str) -> None:
3588 """
3589 Set up sysctl settings
3590 """
3591 def _write(conf: Path, lines: List[str]) -> None:
3592 lines = [
3593 '# created by cephadm',
3594 '',
3595 *lines,
3596 '',
3597 ]
3598 with open(conf, 'w') as f:
3599 f.write('\n'.join(lines))
3600
3601 conf = Path(ctx.sysctl_dir).joinpath(f'90-ceph-{fsid}-{daemon_type}.conf')
3602 lines: Optional[List] = None
3603
3604 if daemon_type == 'osd':
3605 lines = OSD.get_sysctl_settings()
3606 elif daemon_type == 'haproxy':
3607 lines = HAproxy.get_sysctl_settings()
3608 elif daemon_type == 'keepalived':
3609 lines = Keepalived.get_sysctl_settings()
3610
3611 # apply the sysctl settings
3612 if lines:
3613 Path(ctx.sysctl_dir).mkdir(mode=0o755, exist_ok=True)
3614 _write(conf, lines)
3615 call_throws(ctx, ['sysctl', '--system'])
3616
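# Editor's note: a hedged sketch (not upstream code) of the file body that
# install_sysctl()'s nested _write() helper produces. For an OSD on a cluster
# with fsid 'abc', the result lands in /etc/sysctl.d/90-ceph-abc-osd.conf and
# 'sysctl --system' is then run so the settings take effect immediately.
def _example_sysctl_conf_body(lines):
    # type: (List[str]) -> str
    return '\n'.join(['# created by cephadm', '', *lines, ''])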
3617
3618 def migrate_sysctl_dir(ctx: CephadmContext, fsid: str) -> None:
3619 """
3620 Cephadm once used '/usr/lib/sysctl.d' for storing sysctl configuration.
3621 This moves it to '/etc/sysctl.d'.
3622 """
3623 deprecated_location: str = '/usr/lib/sysctl.d'
3624 deprecated_confs: List[str] = glob(f'{deprecated_location}/90-ceph-{fsid}-*.conf')
3625 if not deprecated_confs:
3626 return
3627
3628 file_count: int = len(deprecated_confs)
3629 logger.info(f'Found {file_count} sysctl files in deprecated location {deprecated_location}. Starting migration.')
3630 for conf in deprecated_confs:
3631 try:
3632 shutil.move(conf, ctx.sysctl_dir)
3633 file_count -= 1
3634 except shutil.Error as err:
3635 if str(err).endswith('already exists'):
3636 logger.warning(f'Destination file already exists. Deleting {conf}.')
3637 try:
3638 os.unlink(conf)
3639 file_count -= 1
3640 except OSError as del_err:
3641 logger.warning(f'Could not remove {conf}: {del_err}.')
3642 else:
3643 logger.warning(f'Could not move {conf} from {deprecated_location} to {ctx.sysctl_dir}: {err}')
3644
3645 # Log successful migration
3646 if file_count == 0:
3647 logger.info(f'Successfully migrated sysctl config to {ctx.sysctl_dir}.')
3648 return
3649
3650 # Log partially successful / unsuccessful migration
3651 files_processed: int = len(deprecated_confs)
3652 if file_count < files_processed:
3653 status: str = f'partially successful (failed {file_count}/{files_processed})'
3654 elif file_count == files_processed:
3655 status = 'unsuccessful'
3656 logger.warning(f'Migration of sysctl configuration {status}. You may want to migrate the remaining files manually.')
3657
3658
3659 def install_base_units(ctx, fsid):
3660 # type: (CephadmContext, str) -> None
3661 """
3662 Set up ceph.target and ceph-$fsid.target units.
3663 """
3664 # global unit
3665 existed = os.path.exists(ctx.unit_dir + '/ceph.target')
3666 with open(ctx.unit_dir + '/ceph.target.new', 'w') as f:
3667 f.write('[Unit]\n'
3668 'Description=All Ceph clusters and services\n'
3669 '\n'
3670 '[Install]\n'
3671 'WantedBy=multi-user.target\n')
3672 os.rename(ctx.unit_dir + '/ceph.target.new',
3673 ctx.unit_dir + '/ceph.target')
3674 if not existed:
3675 # we disable before enable in case a different ceph.target
3676 # (from the traditional package) is present; while newer
3677 # systemd is smart enough to disable the old
3678 # (/lib/systemd/...) and enable the new (/etc/systemd/...),
3679 # some older versions of systemd error out with EEXIST.
3680 call_throws(ctx, ['systemctl', 'disable', 'ceph.target'])
3681 call_throws(ctx, ['systemctl', 'enable', 'ceph.target'])
3682 call_throws(ctx, ['systemctl', 'start', 'ceph.target'])
3683
3684 # cluster unit
3685 existed = os.path.exists(ctx.unit_dir + '/ceph-%s.target' % fsid)
3686 with open(ctx.unit_dir + '/ceph-%s.target.new' % fsid, 'w') as f:
3687 f.write(
3688 '[Unit]\n'
3689 'Description=Ceph cluster {fsid}\n'
3690 'PartOf=ceph.target\n'
3691 'Before=ceph.target\n'
3692 '\n'
3693 '[Install]\n'
3694 'WantedBy=multi-user.target ceph.target\n'.format(
3695 fsid=fsid)
3696 )
3697 os.rename(ctx.unit_dir + '/ceph-%s.target.new' % fsid,
3698 ctx.unit_dir + '/ceph-%s.target' % fsid)
3699 if not existed:
3700 call_throws(ctx, ['systemctl', 'enable', 'ceph-%s.target' % fsid])
3701 call_throws(ctx, ['systemctl', 'start', 'ceph-%s.target' % fsid])
3702
3703 # logrotate for the cluster
3704 with open(ctx.logrotate_dir + '/ceph-%s' % fsid, 'w') as f:
3705 """
3706 This is a bit sloppy in that the killall/pkill will touch all ceph daemons
3707 in all containers, but I don't see an elegant way to send SIGHUP *just* to
3708 the daemons for this cluster. (1) systemd kill -s will get the signal to
3709 podman, but podman will exit. (2) podman kill will get the signal to the
3710 first child (bash), but that isn't the ceph daemon. This is simpler and
3711 should be harmless.
3712 """
3713 f.write("""# created by cephadm
3714 /var/log/ceph/%s/*.log {
3715 rotate 7
3716 daily
3717 compress
3718 sharedscripts
3719 postrotate
3720 killall -q -1 ceph-mon ceph-mgr ceph-mds ceph-osd ceph-fuse radosgw rbd-mirror cephfs-mirror || pkill -1 -x 'ceph-mon|ceph-mgr|ceph-mds|ceph-osd|ceph-fuse|radosgw|rbd-mirror|cephfs-mirror' || true
3721 endscript
3722 missingok
3723 notifempty
3724 su root root
3725 }
3726 """ % fsid)
3727
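# Editor's note: an illustrative summary (not upstream code) of the systemd
# unit hierarchy that install_base_units() and the deploy path above create.
# The fsid value is an assumption supplied by the caller of this example.
def _example_unit_hierarchy(fsid):
    # type: (str) -> List[str]
    return [
        'ceph.target',              # host-wide target for all Ceph clusters
        'ceph-%s.target' % fsid,    # per-cluster target, PartOf=ceph.target
        'ceph-%s@.service' % fsid,  # templated per-daemon unit (%i = name.id)
    ]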
3728
3729 def get_unit_file(ctx, fsid):
3730 # type: (CephadmContext, str) -> str
3731 extra_args = ''
3732 if isinstance(ctx.container_engine, Podman):
3733 extra_args = ('ExecStartPre=-/bin/rm -f %t/%n-pid %t/%n-cid\n'
3734 'ExecStopPost=-/bin/rm -f %t/%n-pid %t/%n-cid\n'
3735 'Type=forking\n'
3736 'PIDFile=%t/%n-pid\n')
3737 if ctx.container_engine.version >= CGROUPS_SPLIT_PODMAN_VERSION:
3738 extra_args += 'Delegate=yes\n'
3739
3740 docker = isinstance(ctx.container_engine, Docker)
3741 u = """# generated by cephadm
3742 [Unit]
3743 Description=Ceph %i for {fsid}
3744
3745 # According to:
3746 # http://www.freedesktop.org/wiki/Software/systemd/NetworkTarget
3747 # these can be removed once ceph-mon will dynamically change network
3748 # configuration.
3749 After=network-online.target local-fs.target time-sync.target{docker_after}
3750 Wants=network-online.target local-fs.target time-sync.target
3751 {docker_requires}
3752
3753 PartOf=ceph-{fsid}.target
3754 Before=ceph-{fsid}.target
3755
3756 [Service]
3757 LimitNOFILE=1048576
3758 LimitNPROC=1048576
3759 EnvironmentFile=-/etc/environment
3760 ExecStart=/bin/bash {data_dir}/{fsid}/%i/unit.run
3761 ExecStop=-/bin/bash -c 'bash {data_dir}/{fsid}/%i/unit.stop'
3762 ExecStopPost=-/bin/bash {data_dir}/{fsid}/%i/unit.poststop
3763 KillMode=none
3764 Restart=on-failure
3765 RestartSec=10s
3766 TimeoutStartSec=120
3767 TimeoutStopSec=120
3768 StartLimitInterval=30min
3769 StartLimitBurst=5
3770 {extra_args}
3771 [Install]
3772 WantedBy=ceph-{fsid}.target
3773 """.format(fsid=fsid,
3774 data_dir=ctx.data_dir,
3775 extra_args=extra_args,
3776 # if docker, we depend on docker.service
3777 docker_after=' docker.service' if docker else '',
3778 docker_requires='Requires=docker.service\n' if docker else '')
3779
3780 return u
3781
3782 ##################################
3783
3784
3785 class CephContainer:
3786 def __init__(self,
3787 ctx: CephadmContext,
3788 image: str,
3789 entrypoint: str,
3790 args: List[str] = [],
3791 volume_mounts: Dict[str, str] = {},
3792 cname: str = '',
3793 container_args: List[str] = [],
3794 envs: Optional[List[str]] = None,
3795 privileged: bool = False,
3796 ptrace: bool = False,
3797 bind_mounts: Optional[List[List[str]]] = None,
3798 init: Optional[bool] = None,
3799 host_network: bool = True,
3800 memory_request: Optional[str] = None,
3801 memory_limit: Optional[str] = None,
3802 ) -> None:
3803 self.ctx = ctx
3804 self.image = image
3805 self.entrypoint = entrypoint
3806 self.args = args
3807 self.volume_mounts = volume_mounts
3808 self._cname = cname
3809 self.container_args = container_args
3810 self.envs = envs
3811 self.privileged = privileged
3812 self.ptrace = ptrace
3813 self.bind_mounts = bind_mounts if bind_mounts else []
3814 self.init = init if init else ctx.container_init
3815 self.host_network = host_network
3816 self.memory_request = memory_request
3817 self.memory_limit = memory_limit
3818
3819 @classmethod
3820 def for_daemon(cls,
3821 ctx: CephadmContext,
3822 fsid: str,
3823 daemon_type: str,
3824 daemon_id: str,
3825 entrypoint: str,
3826 args: List[str] = [],
3827 volume_mounts: Dict[str, str] = {},
3828 container_args: List[str] = [],
3829 envs: Optional[List[str]] = None,
3830 privileged: bool = False,
3831 ptrace: bool = False,
3832 bind_mounts: Optional[List[List[str]]] = None,
3833 init: Optional[bool] = None,
3834 host_network: bool = True,
3835 memory_request: Optional[str] = None,
3836 memory_limit: Optional[str] = None,
3837 ) -> 'CephContainer':
3838 return cls(
3839 ctx,
3840 image=ctx.image,
3841 entrypoint=entrypoint,
3842 args=args,
3843 volume_mounts=volume_mounts,
3844 cname='ceph-%s-%s.%s' % (fsid, daemon_type, daemon_id),
3845 container_args=container_args,
3846 envs=envs,
3847 privileged=privileged,
3848 ptrace=ptrace,
3849 bind_mounts=bind_mounts,
3850 init=init,
3851 host_network=host_network,
3852 memory_request=memory_request,
3853 memory_limit=memory_limit,
3854 )
3855
3856 @property
3857 def cname(self) -> str:
3858 """
3859 podman adds the current container name to the /etc/hosts
3860 file. Turns out, python's `socket.getfqdn()` differs from
3861 `hostname -f` when the container name
3862 contains dots:
3863
3864 # podman run --name foo.bar.baz.com ceph/ceph /bin/bash
3865 [root@sebastians-laptop /]# cat /etc/hosts
3866 127.0.0.1 localhost
3867 ::1 localhost
3868 127.0.1.1 sebastians-laptop foo.bar.baz.com
3869 [root@sebastians-laptop /]# hostname -f
3870 sebastians-laptop
3871 [root@sebastians-laptop /]# python3 -c 'import socket; print(socket.getfqdn())'
3872 foo.bar.baz.com
3873
3874 Fascinatingly, this doesn't happen when using dashes.
3875 """
3876 return self._cname.replace('.', '-')
3877
3878 @cname.setter
3879 def cname(self, val: str) -> None:
3880 self._cname = val
3881
3882 @property
3883 def old_cname(self) -> str:
3884 return self._cname
3885
3886 def run_cmd(self) -> List[str]:
3887 cmd_args: List[str] = [
3888 str(self.ctx.container_engine.path),
3889 'run',
3890 '--rm',
3891 '--ipc=host',
3892 # some containers (ahem, haproxy) override this, but we want a fast
3893 # shutdown always (and, more importantly, a successful exit even if we
3894 # fall back to SIGKILL).
3895 '--stop-signal=SIGTERM',
3896 ]
3897
3898 if isinstance(self.ctx.container_engine, Podman):
3899 if os.path.exists('/etc/ceph/podman-auth.json'):
3900 cmd_args.append('--authfile=/etc/ceph/podman-auth.json')
3901
3902 envs: List[str] = [
3903 '-e', 'CONTAINER_IMAGE=%s' % self.image,
3904 '-e', 'NODE_NAME=%s' % get_hostname(),
3905 ]
3906 vols: List[str] = []
3907 binds: List[str] = []
3908
3909 if self.memory_request:
3910 cmd_args.extend(['-e', 'POD_MEMORY_REQUEST=%s' % str(self.memory_request)])
3911 if self.memory_limit:
3912 cmd_args.extend(['-e', 'POD_MEMORY_LIMIT=%s' % str(self.memory_limit)])
3913 cmd_args.extend(['--memory', str(self.memory_limit)])
3914
3915 if self.host_network:
3916 cmd_args.append('--net=host')
3917 if self.entrypoint:
3918 cmd_args.extend(['--entrypoint', self.entrypoint])
3919 if self.privileged:
3920 cmd_args.extend([
3921 '--privileged',
3922 # let OSD etc read block devs that haven't been chowned
3923 '--group-add=disk'])
3924 if self.ptrace and not self.privileged:
3925 # if privileged, the SYS_PTRACE cap is already added
3926 # in addition, --cap-add and --privileged are mutually
3927 # exclusive since podman >= 2.0
3928 cmd_args.append('--cap-add=SYS_PTRACE')
3929 if self.init:
3930 cmd_args.append('--init')
3931 envs += ['-e', 'CEPH_USE_RANDOM_NONCE=1']
3932 if self.cname:
3933 cmd_args.extend(['--name', self.cname])
3934 if self.envs:
3935 for env in self.envs:
3936 envs.extend(['-e', env])
3937
3938 vols = sum(
3939 [['-v', '%s:%s' % (host_dir, container_dir)]
3940 for host_dir, container_dir in self.volume_mounts.items()], [])
3941 binds = sum([['--mount', '{}'.format(','.join(bind))]
3942 for bind in self.bind_mounts], [])
3943
3944 return \
3945 cmd_args + self.container_args + \
3946 envs + vols + binds + \
3947 [self.image] + self.args # type: ignore
3948
3949 def shell_cmd(self, cmd: List[str]) -> List[str]:
3950 cmd_args: List[str] = [
3951 str(self.ctx.container_engine.path),
3952 'run',
3953 '--rm',
3954 '--ipc=host',
3955 ]
3956 envs: List[str] = [
3957 '-e', 'CONTAINER_IMAGE=%s' % self.image,
3958 '-e', 'NODE_NAME=%s' % get_hostname(),
3959 ]
3960 vols: List[str] = []
3961 binds: List[str] = []
3962
3963 if self.host_network:
3964 cmd_args.append('--net=host')
3965 if self.ctx.no_hosts:
3966 cmd_args.append('--no-hosts')
3967 if self.privileged:
3968 cmd_args.extend([
3969 '--privileged',
3970 # let OSD etc read block devs that haven't been chowned
3971 '--group-add=disk',
3972 ])
3973 if self.init:
3974 cmd_args.append('--init')
3975 envs += ['-e', 'CEPH_USE_RANDOM_NONCE=1']
3976 if self.envs:
3977 for env in self.envs:
3978 envs.extend(['-e', env])
3979
3980 vols = sum(
3981 [['-v', '%s:%s' % (host_dir, container_dir)]
3982 for host_dir, container_dir in self.volume_mounts.items()], [])
3983 binds = sum([['--mount', '{}'.format(','.join(bind))]
3984 for bind in self.bind_mounts], [])
3985
3986 return cmd_args + self.container_args + envs + vols + binds + [
3987 '--entrypoint', cmd[0],
3988 self.image,
3989 ] + cmd[1:]
3990
3991 def exec_cmd(self, cmd):
3992 # type: (List[str]) -> List[str]
3993 cname = get_running_container_name(self.ctx, self)
3994 if not cname:
3995 raise Error('unable to find container "{}"'.format(self.cname))
3996 return [
3997 str(self.ctx.container_engine.path),
3998 'exec',
3999 ] + self.container_args + [
4000 self.cname,
4001 ] + cmd
4002
4003 def rm_cmd(self, old_cname: bool = False, storage: bool = False) -> List[str]:
4004 ret = [
4005 str(self.ctx.container_engine.path),
4006 'rm', '-f',
4007 ]
4008 if storage:
4009 ret.append('--storage')
4010 if old_cname:
4011 ret.append(self.old_cname)
4012 else:
4013 ret.append(self.cname)
4014 return ret
4015
4016 def stop_cmd(self, old_cname: bool = False) -> List[str]:
4017 ret = [
4018 str(self.ctx.container_engine.path),
4019 'stop', self.old_cname if old_cname else self.cname,
4020 ]
4021 return ret
4022
4023 def run(self, timeout=DEFAULT_TIMEOUT, verbosity=CallVerbosity.VERBOSE_ON_FAILURE):
4024 # type: (Optional[int], CallVerbosity) -> str
4025 out, _, _ = call_throws(self.ctx, self.run_cmd(),
4026 desc=self.entrypoint, timeout=timeout, verbosity=verbosity)
4027 return out
4028
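# Editor's note: an illustrative sketch (not upstream code) of the common
# CephContainer pattern in this file: build the object, then hand run_cmd()
# to call()/call_throws() or write it into a unit.run file. The entrypoint
# and args here are assumptions chosen for the example.
def _example_ceph_container_usage(ctx, fsid):
    # type: (CephadmContext, str) -> List[str]
    c = CephContainer.for_daemon(
        ctx, fsid, 'mon', 'a',
        entrypoint='/usr/bin/ceph-mon',
        args=['-f'],  # foreground, so the engine tracks the daemon process
    )
    return c.run_cmd()  # full 'podman|docker run ...' argv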
4029
4030 #####################################
4031
4032 class MgrListener(Thread):
4033 def __init__(self, agent: 'CephadmAgent') -> None:
4034 self.agent = agent
4035 self.stop = False
4036 super(MgrListener, self).__init__(target=self.run)
4037
4038 def run(self) -> None:
4039 listenSocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
4040 listenSocket.bind(('0.0.0.0', int(self.agent.listener_port)))
4041 listenSocket.settimeout(60)
4042 listenSocket.listen(1)
4043 ssl_ctx = ssl.create_default_context(ssl.Purpose.CLIENT_AUTH)
4044 ssl_ctx.verify_mode = ssl.CERT_REQUIRED
4045 ssl_ctx.load_cert_chain(self.agent.listener_cert_path, self.agent.listener_key_path)
4046 ssl_ctx.load_verify_locations(self.agent.ca_path)
4047 secureListenSocket = ssl_ctx.wrap_socket(listenSocket, server_side=True)
4048 while not self.stop:
4049 try:
4050 try:
4051 conn, _ = secureListenSocket.accept()
4052 except socket.timeout:
4053 continue
4054 try:
4055 length: int = int(conn.recv(10).decode())
4056 except Exception as e:
4057 err_str = f'Failed to extract length of payload from message: {e}'
4058 conn.send(err_str.encode())
4059 logger.error(err_str)
continue  # 'length' is unbound on failure; skip this connection
4060 while True:
4061 payload = conn.recv(length).decode()
4062 if not payload:
4063 break
4064 try:
4065 data: Dict[Any, Any] = json.loads(payload)
4066 self.handle_json_payload(data)
4067 except Exception as e:
4068 err_str = f'Failed to extract json payload from message: {e}'
4069 conn.send(err_str.encode())
4070 logger.error(err_str)
4071 else:
4072 conn.send(b'ACK')
4073 if 'config' in data:
4074 self.agent.wakeup()
4075 self.agent.ls_gatherer.wakeup()
4076 self.agent.volume_gatherer.wakeup()
4077 logger.debug(f'Got mgr message {data}')
4078 except Exception as e:
4079 logger.error(f'Mgr Listener encountered exception: {e}')
4080
4081 def shutdown(self) -> None:
4082 self.stop = True
4083
4084 def handle_json_payload(self, data: Dict[Any, Any]) -> None:
4085 self.agent.ack = int(data['counter'])
4086 if 'config' in data:
4087 logger.info('Received new config from mgr')
4088 config = data['config']
4089 for filename in config:
4090 if filename in self.agent.required_files:
4091 file_path = os.path.join(self.agent.daemon_dir, filename)
4092 with open(os.open(file_path + '.new', os.O_CREAT | os.O_WRONLY, 0o600), 'w') as f:
4093 f.write(config[filename])
4094 os.rename(file_path + '.new', file_path)
4095 self.agent.pull_conf_settings()
4096 self.agent.wakeup()
4097
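# Editor's note: a hedged sketch (not the mgr's actual sending code) of the
# wire format MgrListener.run() above parses: a decimal payload length that
# must arrive in the first 10 bytes read, followed by a JSON payload, answered
# with b'ACK' or an error string. Right-justifying the header to 10 bytes is
# an assumption made so recv(10) consumes exactly the length field.
def _example_mgr_message(data):
    # type: (Dict[str, Any]) -> bytes
    payload = json.dumps(data).encode()
    header = str(len(payload)).encode().rjust(10)  # int() tolerates the padding
    return header + payload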
4098
4099 class CephadmAgent():
4100
4101 daemon_type = 'agent'
4102 default_port = 8498
4103 loop_interval = 30
4104 stop = False
4105
4106 required_files = [
4107 'agent.json',
4108 'keyring',
4109 'root_cert.pem',
4110 'listener.crt',
4111 'listener.key',
4112 ]
4113
4114 def __init__(self, ctx: CephadmContext, fsid: str, daemon_id: Union[int, str] = ''):
4115 self.ctx = ctx
4116 self.fsid = fsid
4117 self.daemon_id = daemon_id
4118 self.starting_port = 14873
4119 self.target_ip = ''
4120 self.target_port = ''
4121 self.host = ''
4122 self.daemon_dir = os.path.join(ctx.data_dir, self.fsid, f'{self.daemon_type}.{self.daemon_id}')
4123 self.config_path = os.path.join(self.daemon_dir, 'agent.json')
4124 self.keyring_path = os.path.join(self.daemon_dir, 'keyring')
4125 self.ca_path = os.path.join(self.daemon_dir, 'root_cert.pem')
4126 self.listener_cert_path = os.path.join(self.daemon_dir, 'listener.crt')
4127 self.listener_key_path = os.path.join(self.daemon_dir, 'listener.key')
4128 self.listener_port = ''
4129 self.ack = 1
4130 self.event = Event()
4131 self.mgr_listener = MgrListener(self)
4132 self.ls_gatherer = AgentGatherer(self, lambda: self._get_ls(), 'Ls')
4133 self.volume_gatherer = AgentGatherer(self, lambda: self._ceph_volume(enhanced=False), 'Volume')
4134 self.device_enhanced_scan = False
4135 self.recent_iteration_run_times: List[float] = [0.0, 0.0, 0.0]
4136 self.recent_iteration_index: int = 0
4137 self.cached_ls_values: Dict[str, Dict[str, str]] = {}
4138
4139 def validate(self, config: Dict[str, str] = {}) -> None:
4140 # check for the required files
4141 for fname in self.required_files:
4142 if fname not in config:
4143 raise Error('required file missing from config: %s' % fname)
4144
4145 def deploy_daemon_unit(self, config: Dict[str, str] = {}) -> None:
4146 if not config:
4147 raise Error('Agent needs a config')
4148 assert isinstance(config, dict)
4149 self.validate(config)
4150
4151 # Create the required config files in the daemons dir, with restricted permissions
4152 for filename in config:
4153 if filename in self.required_files:
4154 file_path = os.path.join(self.daemon_dir, filename)
4155 with open(os.open(file_path + '.new', os.O_CREAT | os.O_WRONLY, 0o600), 'w') as f:
4156 f.write(config[filename])
4157 os.rename(file_path + '.new', file_path)
4158
4159 unit_run_path = os.path.join(self.daemon_dir, 'unit.run')
4160 with open(os.open(unit_run_path + '.new', os.O_CREAT | os.O_WRONLY, 0o600), 'w') as f:
4161 f.write(self.unit_run())
4162 os.rename(unit_run_path + '.new', unit_run_path)
4163
4164 meta: Dict[str, Any] = {}
4165 meta_file_path = os.path.join(self.daemon_dir, 'unit.meta')
4166 if 'meta_json' in self.ctx and self.ctx.meta_json:
4167 meta = json.loads(self.ctx.meta_json) or {}
4168 with open(os.open(meta_file_path + '.new', os.O_CREAT | os.O_WRONLY, 0o600), 'w') as f:
4169 f.write(json.dumps(meta, indent=4) + '\n')
4170 os.rename(meta_file_path + '.new', meta_file_path)
4171
4172 unit_file_path = os.path.join(self.ctx.unit_dir, self.unit_name())
4173 with open(os.open(unit_file_path + '.new', os.O_CREAT | os.O_WRONLY, 0o600), 'w') as f:
4174 f.write(self.unit_file())
4175 os.rename(unit_file_path + '.new', unit_file_path)
4176
4177 call_throws(self.ctx, ['systemctl', 'daemon-reload'])
4178 call(self.ctx, ['systemctl', 'stop', self.unit_name()],
4179 verbosity=CallVerbosity.DEBUG)
4180 call(self.ctx, ['systemctl', 'reset-failed', self.unit_name()],
4181 verbosity=CallVerbosity.DEBUG)
4182 call_throws(self.ctx, ['systemctl', 'enable', '--now', self.unit_name()])
4183
4184 def unit_name(self) -> str:
4185 return '{}.service'.format(get_unit_name(self.fsid, self.daemon_type, self.daemon_id))
4186
4187 def unit_run(self) -> str:
4188 py3 = shutil.which('python3')
4189 binary_path = os.path.realpath(sys.argv[0])
4190 return ('set -e\n' + f'{py3} {binary_path} agent --fsid {self.fsid} --daemon-id {self.daemon_id} &\n')
4191
4192 def unit_file(self) -> str:
4193 return """#generated by cephadm
4194 [Unit]
4195 Description=cephadm agent for cluster {fsid}
4196
4197 PartOf=ceph-{fsid}.target
4198 Before=ceph-{fsid}.target
4199
4200 [Service]
4201 Type=forking
4202 ExecStart=/bin/bash {data_dir}/unit.run
4203 Restart=on-failure
4204 RestartSec=10s
4205
4206 [Install]
4207 WantedBy=ceph-{fsid}.target
4208 """.format(
4209 fsid=self.fsid,
4210 data_dir=self.daemon_dir
4211 )
4212
4213 def shutdown(self) -> None:
4214 self.stop = True
4215 if self.mgr_listener.is_alive():
4216 self.mgr_listener.shutdown()
4217
4218 def wakeup(self) -> None:
4219 self.event.set()
4220
4221 def pull_conf_settings(self) -> None:
4222 try:
4223 with open(self.config_path, 'r') as f:
4224 config = json.load(f)
4225 self.target_ip = config['target_ip']
4226 self.target_port = config['target_port']
4227 self.loop_interval = int(config['refresh_period'])
4228 self.starting_port = int(config['listener_port'])
4229 self.host = config['host']
4230 use_lsm = config['device_enhanced_scan']
4231 except Exception as e:
4232 self.shutdown()
4233 raise Error(f'Failed to get agent target ip and port from config: {e}')
4234
4235 try:
4236 with open(self.keyring_path, 'r') as f:
4237 self.keyring = f.read()
4238 except Exception as e:
4239 self.shutdown()
4240 raise Error(f'Failed to get agent keyring: {e}')
4241
4242 assert self.target_ip and self.target_port
4243
4244 self.device_enhanced_scan = False
4245 if use_lsm.lower() == 'true':
4246 self.device_enhanced_scan = True
4247 self.volume_gatherer.update_func(lambda: self._ceph_volume(enhanced=self.device_enhanced_scan))
4248
4249 def run(self) -> None:
4250 self.pull_conf_settings()
4251
4252 try:
4253 for _ in range(1001):
4254 if not port_in_use(self.ctx, self.starting_port):
4255 self.listener_port = str(self.starting_port)
4256 break
4257 self.starting_port += 1
4258 if not self.listener_port:
4259 raise Error(f'All 1001 ports starting at {str(self.starting_port - 1001)} taken.')
4260 except Exception as e:
4261 raise Error(f'Failed to pick port for agent to listen on: {e}')
4262
4263 if not self.mgr_listener.is_alive():
4264 self.mgr_listener.start()
4265
4266 if not self.ls_gatherer.is_alive():
4267 self.ls_gatherer.start()
4268
4269 if not self.volume_gatherer.is_alive():
4270 self.volume_gatherer.start()
4271
4272 ssl_ctx = ssl.create_default_context()
4273 ssl_ctx.check_hostname = True
4274 ssl_ctx.verify_mode = ssl.CERT_REQUIRED
4275 ssl_ctx.load_verify_locations(self.ca_path)
4276
4277 while not self.stop:
4278 start_time = time.monotonic()
4279 ack = self.ack
4280
4281 # part of the networks info is returned as a set which is not JSON
4282 # serializable. The set must be converted to a list
4283 networks = list_networks(self.ctx)
4284 networks_list = {}
4285 for key in networks.keys():
4286 for k, v in networks[key].items():
4287 networks_list[key] = {k: list(v)}
4288
4289 data = json.dumps({'host': self.host,
4290 'ls': (self.ls_gatherer.data if self.ack == self.ls_gatherer.ack
4291 and self.ls_gatherer.data is not None else []),
4292 'networks': networks_list,
4293 'facts': HostFacts(self.ctx).dump(),
4294 'volume': (self.volume_gatherer.data if self.ack == self.volume_gatherer.ack
4295 and self.volume_gatherer.data is not None else ''),
4296 'ack': str(ack),
4297 'keyring': self.keyring,
4298 'port': self.listener_port})
4299 data = data.encode('ascii')
4300
4301 url = f'https://{self.target_ip}:{self.target_port}/data'
4302 try:
4303 req = Request(url, data, {'Content-Type': 'application/json'})
4304 send_time = time.monotonic()
4305 with urlopen(req, context=ssl_ctx) as response:
4306 response_str = response.read()
4307 response_json = json.loads(response_str)
4308 total_request_time = datetime.timedelta(seconds=(time.monotonic() - send_time)).total_seconds()
4309 logger.info(f'Received mgr response: "{response_json["result"]}" {total_request_time} seconds after sending request.')
4310 except Exception as e:
4311 logger.error(f'Failed to send metadata to mgr: {e}')
4312
4313 end_time = time.monotonic()
4314 run_time = datetime.timedelta(seconds=(end_time - start_time))
4315 self.recent_iteration_run_times[self.recent_iteration_index] = run_time.total_seconds()
4316 self.recent_iteration_index = (self.recent_iteration_index + 1) % 3
4317 run_time_average = sum(self.recent_iteration_run_times, 0.0) / len([t for t in self.recent_iteration_run_times if t])
4318
4319 self.event.wait(max(self.loop_interval - int(run_time_average), 0))
4320 self.event.clear()
4321
4322 def _ceph_volume(self, enhanced: bool = False) -> Tuple[str, bool]:
4323 self.ctx.command = 'inventory --format=json'.split()
4324 if enhanced:
4325 self.ctx.command.append('--with-lsm')
4326 self.ctx.fsid = self.fsid
4327
4328 stream = io.StringIO()
4329 with redirect_stdout(stream):
4330 command_ceph_volume(self.ctx)
4331
4332 stdout = stream.getvalue()
4333
4334 if stdout:
4335 return (stdout, False)
4336 else:
4337 raise Exception('ceph-volume returned empty value')
4338
4339 def _daemon_ls_subset(self) -> Dict[str, Dict[str, Any]]:
4340 # gets a subset of ls info quickly. The results of this will tell us if our
4341 # cached info is still good or if we need to run the full ls again.
4342 # for legacy containers, we just grab the full info. For cephadmv1 containers,
4343 # we only grab enabled, state, mem_usage and container id. If container id has
4344 # not changed for any daemon, we assume our cached info is good.
4345 daemons: Dict[str, Dict[str, Any]] = {}
4346 data_dir = self.ctx.data_dir
4347 seen_memusage = {} # type: Dict[str, int]
4348 out, err, code = call(
4349 self.ctx,
4350 [self.ctx.container_engine.path, 'stats', '--format', '{{.ID}},{{.MemUsage}}', '--no-stream'],
4351 verbosity=CallVerbosity.DEBUG
4352 )
4353 seen_memusage_cid_len, seen_memusage = _parse_mem_usage(code, out)
4354 # we need a mapping from container names to ids. Later we will convert daemon
4355 # names to container names to get daemons container id to see if it has changed
4356 out, err, code = call(
4357 self.ctx,
4358 [self.ctx.container_engine.path, 'ps', '--format', '{{.ID}},{{.Names}}', '--no-trunc'],
4359 verbosity=CallVerbosity.DEBUG
4360 )
4361 name_id_mapping: Dict[str, str] = self._parse_container_id_name(code, out)
4362 for i in os.listdir(data_dir):
4363 if i in ['mon', 'osd', 'mds', 'mgr']:
4364 daemon_type = i
4365 for j in os.listdir(os.path.join(data_dir, i)):
4366 if '-' not in j:
4367 continue
4368 (cluster, daemon_id) = j.split('-', 1)
4369 legacy_unit_name = 'ceph-%s@%s' % (daemon_type, daemon_id)
4370 (enabled, state, _) = check_unit(self.ctx, legacy_unit_name)
4371 daemons[f'{daemon_type}.{daemon_id}'] = {
4372 'style': 'legacy',
4373 'name': '%s.%s' % (daemon_type, daemon_id),
4374 'fsid': self.ctx.fsid if self.ctx.fsid is not None else 'unknown',
4375 'systemd_unit': legacy_unit_name,
4376 'enabled': 'true' if enabled else 'false',
4377 'state': state,
4378 }
4379 elif is_fsid(i):
4380 fsid = str(i) # convince mypy that fsid is a str here
4381 for j in os.listdir(os.path.join(data_dir, i)):
4382 if '.' in j and os.path.isdir(os.path.join(data_dir, fsid, j)):
4383 (daemon_type, daemon_id) = j.split('.', 1)
4384 unit_name = get_unit_name(fsid, daemon_type, daemon_id)
4385 (enabled, state, _) = check_unit(self.ctx, unit_name)
4386 daemons[j] = {
4387 'style': 'cephadm:v1',
4388 'systemd_unit': unit_name,
4389 'enabled': 'true' if enabled else 'false',
4390 'state': state,
4391 }
4392 c = CephContainer.for_daemon(self.ctx, self.ctx.fsid, daemon_type, daemon_id, 'bash')
4393 container_id: Optional[str] = None
4394 for name in (c.cname, c.old_cname):
4395 if name in name_id_mapping:
4396 container_id = name_id_mapping[name]
4397 break
4398 daemons[j]['container_id'] = container_id
4399 if container_id:
4400 daemons[j]['memory_usage'] = seen_memusage.get(container_id[0:seen_memusage_cid_len])
4401 return daemons
4402
4403 def _parse_container_id_name(self, code: int, out: str) -> Dict[str, str]:
4404 # map container names to ids from ps output
4405 name_id_mapping = {} # type: Dict[str, str]
4406 if not code:
4407 for line in out.splitlines():
4408 id, name = line.split(',')
4409 name_id_mapping[name] = id
4410 return name_id_mapping
4411
4412 def _get_ls(self) -> Tuple[List[Dict[str, str]], bool]:
4413 if not self.cached_ls_values:
4414 logger.info('No cached ls output. Running full daemon ls')
4415 ls = list_daemons(self.ctx)
4416 for d in ls:
4417 self.cached_ls_values[d['name']] = d
4418 return (ls, True)
4419 else:
4420 ls_subset = self._daemon_ls_subset()
4421 need_full_ls = False
4422 state_change = False
4423 if set(self.cached_ls_values.keys()) != set(ls_subset.keys()):
4424 # case for a new daemon in ls or an old daemon no longer appearing.
4425 # If that happens we need a full ls
4426 logger.info('Change detected in state of daemons. Running full daemon ls')
4427 ls = list_daemons(self.ctx)
4428 for d in ls:
4429 self.cached_ls_values[d['name']] = d
4430 return (ls, True)
4431 for daemon, info in self.cached_ls_values.items():
4432 if info['style'] == 'legacy':
4433 # for legacy containers, ls_subset just grabs all the info
4434 self.cached_ls_values[daemon] = ls_subset[daemon]
4435 else:
4436 if info['container_id'] != ls_subset[daemon]['container_id']:
4437 # case for container id having changed. We need a full ls since
4438 # info we didn't grab (like version and start time) could have changed
4439 need_full_ls = True
4440 break
4441
4442 # want to know if a daemon's state changed because in those cases
4443 # we want to report back quicker
4444 if (
4445 self.cached_ls_values[daemon]['enabled'] != ls_subset[daemon]['enabled']
4446 or self.cached_ls_values[daemon]['state'] != ls_subset[daemon]['state']
4447 ):
4448 state_change = True
4449 # if we reach here, container id matched. Update the few values we do track
4450 # from ls subset: state, enabled, memory_usage.
4451 self.cached_ls_values[daemon]['enabled'] = ls_subset[daemon]['enabled']
4452 self.cached_ls_values[daemon]['state'] = ls_subset[daemon]['state']
4453 if 'memory_usage' in ls_subset[daemon]:
4454 self.cached_ls_values[daemon]['memory_usage'] = ls_subset[daemon]['memory_usage']
4455 if need_full_ls:
4456 logger.info('Change detected in state of daemons. Running full daemon ls')
4457 ls = list_daemons(self.ctx)
4458 for d in ls:
4459 self.cached_ls_values[d['name']] = d
4460 return (ls, True)
4461 else:
4462 ls = [info for daemon, info in self.cached_ls_values.items()]
4463 return (ls, state_change)
4464
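# Editor's note: an illustrative sketch (not upstream code) of the loop pacing
# used by both CephadmAgent.run() and AgentGatherer.run(): a 3-slot ring of
# recent iteration times whose average is subtracted from the loop interval,
# keeping the effective period near the configured value. This version adds a
# zero-division guard that the inline code omits.
def _example_loop_pacing(loop_interval, recent_times):
    # type: (int, List[float]) -> float
    nonzero = [t for t in recent_times if t]
    avg = sum(recent_times, 0.0) / len(nonzero) if nonzero else 0.0
    return max(loop_interval - int(avg), 0)  # seconds to wait on the Event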
4465
4466 class AgentGatherer(Thread):
4467 def __init__(self, agent: 'CephadmAgent', func: Callable, gatherer_type: str = 'Unnamed', initial_ack: int = 0) -> None:
4468 self.agent = agent
4469 self.func = func
4470 self.gatherer_type = gatherer_type
4471 self.ack = initial_ack
4472 self.event = Event()
4473 self.data: Any = None
4474 self.stop = False
4475 self.recent_iteration_run_times: List[float] = [0.0, 0.0, 0.0]
4476 self.recent_iteration_index: int = 0
4477 super(AgentGatherer, self).__init__(target=self.run)
4478
4479 def run(self) -> None:
4480 while not self.stop:
4481 try:
4482 start_time = time.monotonic()
4483
4484 ack = self.agent.ack
4485 change = False
4486 try:
4487 self.data, change = self.func()
4488 except Exception as e:
4489 logger.error(f'{self.gatherer_type} Gatherer encountered exception gathering data: {e}')
4490 self.data = None
4491 if ack != self.ack or change:
4492 self.ack = ack
4493 self.agent.wakeup()
4494
4495 end_time = time.monotonic()
4496 run_time = datetime.timedelta(seconds=(end_time - start_time))
4497 self.recent_iteration_run_times[self.recent_iteration_index] = run_time.total_seconds()
4498 self.recent_iteration_index = (self.recent_iteration_index + 1) % 3
4499 run_time_average = sum(self.recent_iteration_run_times, 0.0) / len([t for t in self.recent_iteration_run_times if t])
4500
4501 self.event.wait(max(self.agent.loop_interval - int(run_time_average), 0))
4502 self.event.clear()
4503 except Exception as e:
4504 logger.error(f'{self.gatherer_type} Gatherer encountered exception: {e}')
4505
4506 def shutdown(self) -> None:
4507 self.stop = True
4508
4509 def wakeup(self) -> None:
4510 self.event.set()
4511
4512 def update_func(self, func: Callable) -> None:
4513 self.func = func
4514
4515
4516 def command_agent(ctx: CephadmContext) -> None:
4517 agent = CephadmAgent(ctx, ctx.fsid, ctx.daemon_id)
4518
4519 if not os.path.isdir(agent.daemon_dir):
4520 raise Error(f'Agent daemon directory {agent.daemon_dir} does not exist. Perhaps agent was never deployed?')
4521
4522 agent.run()
4523
4524
4525 ##################################
4526
4527
4528 @infer_image
4529 def command_version(ctx):
4530 # type: (CephadmContext) -> int
4531 c = CephContainer(ctx, ctx.image, 'ceph', ['--version'])
4532 out, err, ret = call(ctx, c.run_cmd(), desc=c.entrypoint)
4533 if not ret:
4534 print(out.strip())
4535 return ret
4536
4537 ##################################
4538
4539
4540 @default_image
4541 def command_pull(ctx):
4542 # type: (CephadmContext) -> int
4543
4544 try:
4545 _pull_image(ctx, ctx.image, ctx.insecure)
4546 except UnauthorizedRegistryError:
4547 err_str = 'Failed to pull container image. Check that host(s) are logged into the registry'
4548 logger.debug(f'Pulling image for `command_pull` failed: {err_str}')
4549 raise Error(err_str)
4550 return command_inspect_image(ctx)
4551
4552
4553 def _pull_image(ctx, image, insecure=False):
4554 # type: (CephadmContext, str, bool) -> None
4555 logger.info('Pulling container image %s...' % image)
4556
4557 ignorelist = [
4558 'error creating read-write layer with ID',
4559 'net/http: TLS handshake timeout',
4560 'Digest did not match, expected',
4561 ]
4562
4563 cmd = [ctx.container_engine.path, 'pull', image]
4564 if isinstance(ctx.container_engine, Podman):
4565 if insecure:
4566 cmd.append('--tls-verify=false')
4567
4568 if os.path.exists('/etc/ceph/podman-auth.json'):
4569 cmd.append('--authfile=/etc/ceph/podman-auth.json')
4570 cmd_str = ' '.join(cmd)
4571
4572 for sleep_secs in [1, 4, 25]:
4573 out, err, ret = call(ctx, cmd, verbosity=CallVerbosity.QUIET_UNLESS_ERROR)
4574 if not ret:
4575 return
4576
4577 if 'unauthorized' in err:
4578 raise UnauthorizedRegistryError()
4579
4580 if not any(pattern in err for pattern in ignorelist):
4581 raise Error('Failed command: %s' % cmd_str)
4582
4583 logger.info('`%s` failed transiently; retrying in %s seconds...' % (cmd_str, sleep_secs))
4584 time.sleep(sleep_secs)
4585
4586 raise Error('Failed command: %s: maximum retries reached' % cmd_str)
4587
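# Editor's note: an illustrative note, as code, on the retry policy used by
# _pull_image() above: up to three attempts with growing sleeps, retrying only
# errors matching the transient ignorelist and failing fast on anything else
# (or immediately on an 'unauthorized' response).
def _example_pull_backoff():
    # type: () -> List[int]
    return [1, 4, 25]  # seconds slept after each failed transient attempt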
4588 ##################################
4589
4590
4591 @infer_image
4592 def command_inspect_image(ctx):
4593 # type: (CephadmContext) -> int
4594 out, err, ret = call_throws(ctx, [
4595 ctx.container_engine.path, 'inspect',
4596 '--format', '{{.ID}},{{.RepoDigests}}',
4597 ctx.image])
4598 if ret:
4599 return errno.ENOENT
4600 info_from = get_image_info_from_inspect(out.strip(), ctx.image)
4601
4602 ver = CephContainer(ctx, ctx.image, 'ceph', ['--version']).run().strip()
4603 info_from['ceph_version'] = ver
4604
4605 print(json.dumps(info_from, indent=4, sort_keys=True))
4606 return 0
4607
4608
4609 def normalize_image_digest(digest: str) -> str:
4610 """
4611 Normal case:
4612 >>> normalize_image_digest('ceph/ceph')
4613 'docker.io/ceph/ceph'
4614
4615 No change:
4616 >>> normalize_image_digest('quay.ceph.io/ceph/ceph')
4617 'quay.ceph.io/ceph/ceph'
4618
4619 >>> normalize_image_digest('docker.io/ubuntu')
4620 'docker.io/ubuntu'
4621
4622 >>> normalize_image_digest('localhost/ceph')
4623 'localhost/ceph'
4624 """
4625 known_shortnames = [
4626 'ceph/ceph',
4627 'ceph/daemon',
4628 'ceph/daemon-base',
4629 ]
4630 for image in known_shortnames:
4631 if digest.startswith(image):
4632 return f'{DEFAULT_REGISTRY}/{digest}'
4633 return digest
4634
4635
4636 def get_image_info_from_inspect(out, image):
4637 # type: (str, str) -> Dict[str, Union[str,List[str]]]
4638 if not out:
4639 raise Error('inspect {}: empty result'.format(image))
4640 image_id, digests = out.split(',', 1)
4641 r = {
4642 'image_id': normalize_container_id(image_id)
4643 } # type: Dict[str, Union[str,List[str]]]
4644 if digests:
4645 r['repo_digests'] = list(map(normalize_image_digest, digests[1: -1].split(' ')))
4646 return r
4647
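# Editor's note: a hedged sketch (sample values are invented) of parsing the
# '{{.ID}},{{.RepoDigests}}' inspect format handled above. The digest list
# arrives bracketed and space-separated; each entry is normalized against
# DEFAULT_REGISTRY.
def _example_inspect_parse():
    # type: () -> Dict[str, Union[str, List[str]]]
    sample = ('sha256:1111111111111111111111111111111111111111111111111111111111111111,'
              '[ceph/ceph@sha256:2222 quay.io/ceph/ceph@sha256:3333]')
    return get_image_info_from_inspect(sample, 'quay.io/ceph/ceph:v17')
    # -> image_id plus repo_digests ['docker.io/ceph/ceph@sha256:2222',
    #    'quay.io/ceph/ceph@sha256:3333']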
4648 ##################################
4649
4650
4651 def check_subnet(subnets: str) -> Tuple[int, List[int], str]:
4652 """Determine whether the given string is a valid subnet
4653
4654 :param subnets: subnet string, a single definition or comma separated list of CIDR subnets
4655 :returns: return code, IP version list of the subnets, and a message describing any validation errors
4656 """
4657
4658 rc = 0
4659 versions = set()
4660 errors = []
4661 subnet_list = subnets.split(',')
4662 for subnet in subnet_list:
4663 # ensure the format of the string is as expected address/netmask
4664 subnet = subnet.strip()
4665 if not re.search(r'\/\d+$', subnet):
4666 rc = 1
4667 errors.append(f'{subnet} is not in CIDR format (address/netmask)')
4668 continue
4669 try:
4670 v = ipaddress.ip_network(subnet).version
4671 versions.add(v)
4672 except ValueError as e:
4673 rc = 1
4674 errors.append(f'{subnet} invalid: {str(e)}')
4675
4676 return rc, list(versions), ', '.join(errors)
4677
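# Editor's note: an illustrative usage of check_subnet() above; the CIDRs are
# example values. A non-zero rc means at least one entry failed validation.
def _example_check_subnet():
    # type: () -> None
    rc, versions, errs = check_subnet('10.0.0.0/24,fe80::/64')
    assert rc == 0 and sorted(versions) == [4, 6] and errs == ''
    rc, _, errs = check_subnet('10.0.0.1')  # no /netmask -> rejected
    assert rc == 1 and 'not in CIDR format' in errs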
4678
4679 def unwrap_ipv6(address):
4680 # type: (str) -> str
4681 if address.startswith('[') and address.endswith(']'):
4682 return address[1: -1]
4683 return address
4684
4685
4686 def wrap_ipv6(address):
4687 # type: (str) -> str
4688
4689 # We cannot assume the input is an unwrapped IPv6 address: if it is
4690 # already wrapped (or is a hostname), ip_address() will not parse it
4691 # and will raise the ValueError, so we return it unchanged.
4692 try:
4693 if ipaddress.ip_address(address).version == 6:
4694 return f'[{address}]'
4695 except ValueError:
4696 pass
4697
4698 return address
4699
4700
4701 def is_ipv6(address):
4702 # type: (str) -> bool
4703 address = unwrap_ipv6(address)
4704 try:
4705 return ipaddress.ip_address(address).version == 6
4706 except ValueError:
4707 logger.warning('Address: {} is not a valid IP address'.format(address))
4708 return False
4709
4710
4711 def ip_in_subnets(ip_addr: str, subnets: str) -> bool:
4712 """Determine if the ip_addr belongs to any of the subnets list."""
4713 subnet_list = [x.strip() for x in subnets.split(',')]
4714 for subnet in subnet_list:
4715 ip_address = unwrap_ipv6(ip_addr) if is_ipv6(ip_addr) else ip_addr
4716 if ipaddress.ip_address(ip_address) in ipaddress.ip_network(subnet):
4717 return True
4718 return False
4719
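# Editor's note: an illustrative round-trip (example addresses) of the IPv6
# helpers above, including membership testing across a mixed subnet list.
def _example_ipv6_helpers():
    # type: () -> None
    assert wrap_ipv6('2001:db8::1') == '[2001:db8::1]'
    assert unwrap_ipv6('[2001:db8::1]') == '2001:db8::1'
    assert is_ipv6('[2001:db8::1]')
    assert ip_in_subnets('2001:db8::1', '2001:db8::/64, 10.0.0.0/8')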
4720
4721 def parse_mon_addrv(addrv_arg: str) -> List[EndPoint]:
4722 """Parse mon-addrv param into a list of mon end points."""
4723 r = re.compile(r':(\d+)$')
4724 addrv_args = []
4725 addr_arg = addrv_arg
4726 if addr_arg[0] != '[' or addr_arg[-1] != ']':
4727 raise Error(f'--mon-addrv value {addr_arg} must use square brackets')
4728
4729 for addr in addr_arg[1: -1].split(','):
4730 hasport = r.findall(addr)
4731 if not hasport:
4732 raise Error(f'--mon-addrv value {addr_arg} must include port number')
4733 port_str = hasport[0]
4734 addr = re.sub(r'^v\d+:', '', addr) # strip off v1: or v2: prefix
4735 base_ip = addr[0:-(len(port_str)) - 1]
4736 addrv_args.append(EndPoint(base_ip, int(port_str)))
4737
4738 return addrv_args
4739
4740
4741 def parse_mon_ip(mon_ip: str) -> List[EndPoint]:
4742 """Parse mon-ip param into a list of mon end points."""
4743 r = re.compile(r':(\d+)$')
4744 addrv_args = []
4745 hasport = r.findall(mon_ip)
4746 if hasport:
4747 port_str = hasport[0]
4748 base_ip = mon_ip[0:-(len(port_str)) - 1]
4749 addrv_args.append(EndPoint(base_ip, int(port_str)))
4750 else:
4751 # No port provided: use fixed ports for ceph monitor
4752 addrv_args.append(EndPoint(mon_ip, 3300))
4753 addrv_args.append(EndPoint(mon_ip, 6789))
4754
4755 return addrv_args
4756
4757
4758 def build_addrv_params(addrv: List[EndPoint]) -> str:
4759 """Convert mon end-points (ip:port) into the format: [v[1|2]:ip:port1]"""
4760 if len(addrv) > 2:
4761 raise Error('Detected a local mon-addrv list with more than 2 entries.')
4762 port_to_ver: Dict[int, str] = {6789: 'v1', 3300: 'v2'}
4763 addr_arg_list: List[str] = []
4764 for ep in addrv:
4765 if ep.port in port_to_ver:
4766 ver = port_to_ver[ep.port]
4767 else:
4768 ver = 'v2' # default mon protocol version if port is not provided
4769 logger.warning(f'Using msgr2 protocol for unrecognized port {ep}')
4770 addr_arg_list.append(f'{ver}:{ep.ip}:{ep.port}')
4771
4772 addr_arg = '[{0}]'.format(','.join(addr_arg_list))
4773 return addr_arg
4774
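# Editor's note: a hedged sketch (example IP) tying the mon address helpers
# together: with no port given, parse_mon_ip() falls back to both fixed mon
# ports, and build_addrv_params() renders each with its protocol version.
def _example_mon_addr_roundtrip():
    # type: () -> str
    eps = parse_mon_ip('192.168.100.1')  # EndPoints for 3300 (v2) and 6789 (v1)
    return build_addrv_params(eps)
    # -> '[v2:192.168.100.1:3300,v1:192.168.100.1:6789]'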
4775
4776 def get_public_net_from_cfg(ctx: CephadmContext) -> Optional[str]:
4777 """Get mon public network from configuration file."""
4778 cp = read_config(ctx.config)
4779 if not cp.has_option('global', 'public_network'):
4780 return None
4781
4782 # Ensure all public CIDR networks are valid
4783 public_network = cp.get('global', 'public_network')
4784 rc, _, err_msg = check_subnet(public_network)
4785 if rc:
4786 raise Error(f'Invalid public_network {public_network} parameter: {err_msg}')
4787
4788 # Ensure all public CIDR networks are configured locally
4789 configured_subnets = set([x.strip() for x in public_network.split(',')])
4790 local_subnets = set([x[0] for x in list_networks(ctx).items()])
4791 valid_public_net = False
4792 for net in configured_subnets:
4793 if net in local_subnets:
4794 valid_public_net = True
4795 else:
4796 logger.warning(f'The public CIDR network {net} (from -c conf file) is not configured locally.')
4797 if not valid_public_net:
4798 raise Error(f'None of the public CIDR network(s) {configured_subnets} (from -c conf file) is configured locally.')
4799
4800 # Ensure public_network is compatible with the provided mon-ip (or mon-addrv)
4801 if ctx.mon_ip:
4802 if not ip_in_subnets(ctx.mon_ip, public_network):
4803 raise Error(f'The provided --mon-ip {ctx.mon_ip} does not belong to any public_network(s) {public_network}')
4804 elif ctx.mon_addrv:
4805 addrv_args = parse_mon_addrv(ctx.mon_addrv)
4806 for addrv in addrv_args:
4807 if not ip_in_subnets(addrv.ip, public_network):
4808 raise Error(f'The provided --mon-addrv {addrv.ip} ip does not belong to any public_network(s) {public_network}')
4809
4810 logger.debug(f'Using mon public network from configuration file {public_network}')
4811 return public_network
4812
4813
4814 def infer_mon_network(ctx: CephadmContext, mon_eps: List[EndPoint]) -> Optional[str]:
4815 """Infer mon public network from local network."""
4816 # Make sure IP is configured locally, and then figure out the CIDR network
4817 mon_networks = []
4818 for net, ifaces in list_networks(ctx).items():
4819 # build local_ips list for the specified network
4820 local_ips: List[Union[ipaddress.IPv4Address, ipaddress.IPv6Address]] = []
4821 for _, ls in ifaces.items():
4822 local_ips.extend([ipaddress.ip_address(ip) for ip in ls])
4823
4824 # check if any of mon ips belong to this net
4825 for mon_ep in mon_eps:
4826 try:
4827 if ipaddress.ip_address(unwrap_ipv6(mon_ep.ip)) in local_ips:
4828 mon_networks.append(net)
4829 logger.info(f'Mon IP `{mon_ep.ip}` is in CIDR network `{net}`')
4830 except ValueError as e:
4831 logger.warning(f'Cannot infer CIDR network for mon IP `{mon_ep.ip}` : {e}')
4832
4833 if not mon_networks:
4834 raise Error('Cannot infer CIDR network. Pass --skip-mon-network to configure it later')
4835 else:
4836 logger.debug(f'Inferred mon public CIDR from local network configuration {mon_networks}')
4837
4838 mon_networks = list(set(mon_networks)) # remove duplicates
4839 return ','.join(mon_networks)
4840
4841
4842 def prepare_mon_addresses(ctx: CephadmContext) -> Tuple[str, bool, Optional[str]]:
4843 """Get mon public network configuration."""
4844 ipv6 = False
4845 addrv_args: List[EndPoint] = []
4846 mon_addrv: str = '' # i.e: [v2:192.168.100.1:3300,v1:192.168.100.1:6789]
4847
4848 if ctx.mon_ip:
4849 ipv6 = is_ipv6(ctx.mon_ip)
4850 if ipv6:
4851 ctx.mon_ip = wrap_ipv6(ctx.mon_ip)
4852 addrv_args = parse_mon_ip(ctx.mon_ip)
4853 mon_addrv = build_addrv_params(addrv_args)
4854 elif ctx.mon_addrv:
4855 ipv6 = ctx.mon_addrv.count('[') > 1
4856 addrv_args = parse_mon_addrv(ctx.mon_addrv)
4857 mon_addrv = ctx.mon_addrv
4858 else:
4859 raise Error('must specify --mon-ip or --mon-addrv')
4860
4861 if addrv_args:
4862 for end_point in addrv_args:
4863 check_ip_port(ctx, end_point)
4864
4865 logger.debug(f'Base mon IP(s) is {addrv_args}, mon addrv is {mon_addrv}')
4866 mon_network = None
4867 if not ctx.skip_mon_network:
4868 mon_network = get_public_net_from_cfg(ctx) or infer_mon_network(ctx, addrv_args)
4869
4870 return (mon_addrv, ipv6, mon_network)
4871
4872
4873 def prepare_cluster_network(ctx: CephadmContext) -> Tuple[str, bool]:
4874 # the cluster network may not exist on this node, so all we can do is
4875 # validate that the address given is valid ipv4 or ipv6 subnet
4876 ipv6_cluster_network = False
4877 cp = read_config(ctx.config)
4878 cluster_network = ctx.cluster_network
4879 if cluster_network is None and cp.has_option('global', 'cluster_network'):
4880 cluster_network = cp.get('global', 'cluster_network')
4881
4882 if cluster_network:
4883 cluster_nets = set([x.strip() for x in cluster_network.split(',')])
4884 local_subnets = set([x[0] for x in list_networks(ctx).items()])
4885 for net in cluster_nets:
4886 if net not in local_subnets:
4887 logger.warning(f'The cluster CIDR network {net} is not configured locally.')
4888
4889 rc, versions, err_msg = check_subnet(cluster_network)
4890 if rc:
4891 raise Error(f'Invalid --cluster-network parameter: {err_msg}')
4892 ipv6_cluster_network = True if 6 in versions else False
4893 else:
4894 logger.info('Internal network (--cluster-network) has not '
4895 'been provided, OSD replication will default to '
4896 'the public_network')
4897
4898 return cluster_network, ipv6_cluster_network
4899
4900
4901 def create_initial_keys(
4902 ctx: CephadmContext,
4903 uid: int, gid: int,
4904 mgr_id: str
4905 ) -> Tuple[str, str, str, Any, Any]: # type: ignore
4906
4907 _image = ctx.image
4908
4909 # create some initial keys
4910 logger.info('Creating initial keys...')
4911 mon_key = CephContainer(
4912 ctx,
4913 image=_image,
4914 entrypoint='/usr/bin/ceph-authtool',
4915 args=['--gen-print-key'],
4916 ).run().strip()
4917 admin_key = CephContainer(
4918 ctx,
4919 image=_image,
4920 entrypoint='/usr/bin/ceph-authtool',
4921 args=['--gen-print-key'],
4922 ).run().strip()
4923 mgr_key = CephContainer(
4924 ctx,
4925 image=_image,
4926 entrypoint='/usr/bin/ceph-authtool',
4927 args=['--gen-print-key'],
4928 ).run().strip()
4929
4930 keyring = ('[mon.]\n'
4931 '\tkey = %s\n'
4932 '\tcaps mon = allow *\n'
4933 '[client.admin]\n'
4934 '\tkey = %s\n'
4935 '\tcaps mon = allow *\n'
4936 '\tcaps mds = allow *\n'
4937 '\tcaps mgr = allow *\n'
4938 '\tcaps osd = allow *\n'
4939 '[mgr.%s]\n'
4940 '\tkey = %s\n'
4941 '\tcaps mon = profile mgr\n'
4942 '\tcaps mds = allow *\n'
4943 '\tcaps osd = allow *\n'
4944 % (mon_key, admin_key, mgr_id, mgr_key))
4945
4946 admin_keyring = write_tmp('[client.admin]\n'
4947 '\tkey = ' + admin_key + '\n',
4948 uid, gid)
4949
4950 # tmp keyring file
4951 bootstrap_keyring = write_tmp(keyring, uid, gid)
4952 return (mon_key, mgr_key, admin_key,
4953 bootstrap_keyring, admin_keyring)
4954
4955
4956 def create_initial_monmap(
4957 ctx: CephadmContext,
4958 uid: int, gid: int,
4959 fsid: str,
4960 mon_id: str, mon_addr: str
4961 ) -> Any:
4962 logger.info('Creating initial monmap...')
4963 monmap = write_tmp('', 0, 0)
4964 out = CephContainer(
4965 ctx,
4966 image=ctx.image,
4967 entrypoint='/usr/bin/monmaptool',
4968 args=[
4969 '--create',
4970 '--clobber',
4971 '--fsid', fsid,
4972 '--addv', mon_id, mon_addr,
4973 '/tmp/monmap'
4974 ],
4975 volume_mounts={
4976 monmap.name: '/tmp/monmap:z',
4977 },
4978 ).run()
4979 logger.debug(f'monmaptool for {mon_id} {mon_addr} on {out}')
4980
4981 # pass monmap file to ceph user for use by ceph-mon --mkfs below
4982 os.fchown(monmap.fileno(), uid, gid)
4983 return monmap
4984
4985
4986 def prepare_create_mon(
4987 ctx: CephadmContext,
4988 uid: int, gid: int,
4989 fsid: str, mon_id: str,
4990 bootstrap_keyring_path: str,
4991 monmap_path: str
4992 ) -> Tuple[str, str]:
4993 logger.info('Creating mon...')
4994 create_daemon_dirs(ctx, fsid, 'mon', mon_id, uid, gid)
4995 mon_dir = get_data_dir(fsid, ctx.data_dir, 'mon', mon_id)
4996 log_dir = get_log_dir(fsid, ctx.log_dir)
4997 out = CephContainer(
4998 ctx,
4999 image=ctx.image,
5000 entrypoint='/usr/bin/ceph-mon',
5001 args=[
5002 '--mkfs',
5003 '-i', mon_id,
5004 '--fsid', fsid,
5005 '-c', '/dev/null',
5006 '--monmap', '/tmp/monmap',
5007 '--keyring', '/tmp/keyring',
5008 ] + get_daemon_args(ctx, fsid, 'mon', mon_id),
5009 volume_mounts={
5010 log_dir: '/var/log/ceph:z',
5011 mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % (mon_id),
5012 bootstrap_keyring_path: '/tmp/keyring:z',
5013 monmap_path: '/tmp/monmap:z',
5014 },
5015 ).run()
5016 logger.debug(f'create mon.{mon_id} on {out}')
5017 return (mon_dir, log_dir)
5018
5019
5020 def create_mon(
5021 ctx: CephadmContext,
5022 uid: int, gid: int,
5023 fsid: str, mon_id: str
5024 ) -> None:
5025 mon_c = get_container(ctx, fsid, 'mon', mon_id)
5026 ctx.meta_json = json.dumps({'service_name': 'mon'})
5027 deploy_daemon(ctx, fsid, 'mon', mon_id, mon_c, uid, gid,
5028 config=None, keyring=None)
5029
5030
5031 def wait_for_mon(
5032 ctx: CephadmContext,
5033 mon_id: str, mon_dir: str,
5034 admin_keyring_path: str, config_path: str
5035 ) -> None:
5036 logger.info('Waiting for mon to start...')
5037 c = CephContainer(
5038 ctx,
5039 image=ctx.image,
5040 entrypoint='/usr/bin/ceph',
5041 args=[
5042 'status'],
5043 volume_mounts={
5044 mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % (mon_id),
5045 admin_keyring_path: '/etc/ceph/ceph.client.admin.keyring:z',
5046 config_path: '/etc/ceph/ceph.conf:z',
5047 },
5048 )
5049
5050 # wait for the service to become available
5051 def is_mon_available():
5052 # type: () -> bool
5053 timeout = ctx.timeout if ctx.timeout else 60 # seconds
5054 out, err, ret = call(ctx, c.run_cmd(),
5055 desc=c.entrypoint,
5056 timeout=timeout,
5057 verbosity=CallVerbosity.QUIET_UNLESS_ERROR)
5058 return ret == 0
5059
5060 is_available(ctx, 'mon', is_mon_available)
5061
5062
5063 def create_mgr(
5064 ctx: CephadmContext,
5065 uid: int, gid: int,
5066 fsid: str, mgr_id: str, mgr_key: str,
5067 config: str, clifunc: Callable
5068 ) -> None:
5069 logger.info('Creating mgr...')
5070 mgr_keyring = '[mgr.%s]\n\tkey = %s\n' % (mgr_id, mgr_key)
5071 mgr_c = get_container(ctx, fsid, 'mgr', mgr_id)
5072 # Note: the default port (9283) of the mgr's Prometheus exporter module is opened in the firewall
5073 ctx.meta_json = json.dumps({'service_name': 'mgr'})
5074 deploy_daemon(ctx, fsid, 'mgr', mgr_id, mgr_c, uid, gid,
5075 config=config, keyring=mgr_keyring, ports=[9283])
5076
5077 # wait for the service to become available
5078 logger.info('Waiting for mgr to start...')
5079
5080 def is_mgr_available():
5081 # type: () -> bool
5082 timeout = ctx.timeout if ctx.timeout else 60 # seconds
5083 try:
5084 out = clifunc(['status', '-f', 'json-pretty'],
5085 timeout=timeout,
5086 verbosity=CallVerbosity.QUIET_UNLESS_ERROR)
5087 j = json.loads(out)
5088 return j.get('mgrmap', {}).get('available', False)
5089 except Exception as e:
5090 logger.debug('status failed: %s' % e)
5091 return False
5092 is_available(ctx, 'mgr', is_mgr_available)
5093
5094
5095 def prepare_ssh(
5096 ctx: CephadmContext,
5097 cli: Callable, wait_for_mgr_restart: Callable
5098 ) -> None:
5099
5100 cli(['cephadm', 'set-user', ctx.ssh_user])
5101
5102 if ctx.ssh_config:
5103 logger.info('Using provided ssh config...')
5104 mounts = {
5105 pathify(ctx.ssh_config.name): '/tmp/cephadm-ssh-config:z',
5106 }
5107 cli(['cephadm', 'set-ssh-config', '-i', '/tmp/cephadm-ssh-config'], extra_mounts=mounts)
5108
5109 if ctx.ssh_private_key and ctx.ssh_public_key:
5110 logger.info('Using provided ssh keys...')
5111 mounts = {
5112 pathify(ctx.ssh_private_key.name): '/tmp/cephadm-ssh-key:z',
5113 pathify(ctx.ssh_public_key.name): '/tmp/cephadm-ssh-key.pub:z'
5114 }
5115 cli(['cephadm', 'set-priv-key', '-i', '/tmp/cephadm-ssh-key'], extra_mounts=mounts)
5116 cli(['cephadm', 'set-pub-key', '-i', '/tmp/cephadm-ssh-key.pub'], extra_mounts=mounts)
5117 ssh_pub = cli(['cephadm', 'get-pub-key'])
5118 else:
5119 logger.info('Generating ssh key...')
5120 cli(['cephadm', 'generate-key'])
5121 ssh_pub = cli(['cephadm', 'get-pub-key'])
5122 with open(ctx.output_pub_ssh_key, 'w') as f:
5123 f.write(ssh_pub)
5124 logger.info('Wrote public SSH key to %s' % ctx.output_pub_ssh_key)
5125
5126 authorize_ssh_key(ssh_pub, ctx.ssh_user)
5127
5128 host = get_hostname()
5129 logger.info('Adding host %s...' % host)
5130 try:
5131 args = ['orch', 'host', 'add', host]
5132 if ctx.mon_ip:
5133 args.append(unwrap_ipv6(ctx.mon_ip))
5134 elif ctx.mon_addrv:
5135 addrv_args = parse_mon_addrv(ctx.mon_addrv)
5136 args.append(unwrap_ipv6(addrv_args[0].ip))
5137 cli(args)
5138 except RuntimeError as e:
5139 raise Error('Failed to add host <%s>: %s' % (host, e))
5140
5141 for t in ['mon', 'mgr']:
5142 if not ctx.orphan_initial_daemons:
5143 logger.info('Deploying %s service with default placement...' % t)
5144 cli(['orch', 'apply', t])
5145 else:
5146 logger.info('Deploying unmanaged %s service...' % t)
5147 cli(['orch', 'apply', t, '--unmanaged'])
5148
5149 if not ctx.orphan_initial_daemons:
5150 logger.info('Deploying crash service with default placement...')
5151 cli(['orch', 'apply', 'crash'])
5152
5153 if not ctx.skip_monitoring_stack:
5154 for t in ['prometheus', 'grafana', 'node-exporter', 'alertmanager']:
5155 logger.info('Deploying %s service with default placement...' % t)
5156 cli(['orch', 'apply', t])
5157
5158 if ctx.with_centralized_logging:
5159 for t in ['loki', 'promtail']:
5160 logger.info('Deploying %s service with default placement...' % t)
5161 cli(['orch', 'apply', t])
5162
5163
5164 def enable_cephadm_mgr_module(
5165 cli: Callable, wait_for_mgr_restart: Callable
5166 ) -> None:
5167
5168 logger.info('Enabling cephadm module...')
5169 cli(['mgr', 'module', 'enable', 'cephadm'])
5170 wait_for_mgr_restart()
5171 logger.info('Setting orchestrator backend to cephadm...')
5172 cli(['orch', 'set', 'backend', 'cephadm'])
5173
5174
5175 def prepare_dashboard(
5176 ctx: CephadmContext,
5177 uid: int, gid: int,
5178 cli: Callable, wait_for_mgr_restart: Callable
5179 ) -> None:
5180
5181 # Configure the SSL port (cephadm only allows configuring the dashboard SSL port);
5182 # if the user does not want to use SSL they can change this setting once the cluster is up
5183 cli(['config', 'set', 'mgr', 'mgr/dashboard/ssl_server_port', str(ctx.ssl_dashboard_port)])
5184
5185 # configuring dashboard parameters
5186 logger.info('Enabling the dashboard module...')
5187 cli(['mgr', 'module', 'enable', 'dashboard'])
5188 wait_for_mgr_restart()
5189
5190 # dashboard crt and key
5191 if ctx.dashboard_key and ctx.dashboard_crt:
5192 logger.info('Using provided dashboard certificate...')
5193 mounts = {
5194 pathify(ctx.dashboard_crt.name): '/tmp/dashboard.crt:z',
5195 pathify(ctx.dashboard_key.name): '/tmp/dashboard.key:z'
5196 }
5197 cli(['dashboard', 'set-ssl-certificate', '-i', '/tmp/dashboard.crt'], extra_mounts=mounts)
5198 cli(['dashboard', 'set-ssl-certificate-key', '-i', '/tmp/dashboard.key'], extra_mounts=mounts)
5199 else:
5200 logger.info('Generating a dashboard self-signed certificate...')
5201 cli(['dashboard', 'create-self-signed-cert'])
5202
5203 logger.info('Creating initial admin user...')
5204 password = ctx.initial_dashboard_password or generate_password()
5205 tmp_password_file = write_tmp(password, uid, gid)
5206 cmd = ['dashboard', 'ac-user-create', ctx.initial_dashboard_user, '-i', '/tmp/dashboard.pw', 'administrator', '--force-password']
5207 if not ctx.dashboard_password_noupdate:
5208 cmd.append('--pwd-update-required')
5209 cli(cmd, extra_mounts={pathify(tmp_password_file.name): '/tmp/dashboard.pw:z'})
5210 logger.info('Fetching dashboard port number...')
5211 out = cli(['config', 'get', 'mgr', 'mgr/dashboard/ssl_server_port'])
5212 port = int(out)
5213
5214 # Open dashboard port
5215 if not ('skip_firewalld' in ctx and ctx.skip_firewalld):
5216 fw = Firewalld(ctx)
5217 fw.open_ports([port])
5218 fw.apply_rules()
5219
5220 logger.info('Ceph Dashboard is now available at:\n\n'
5221 '\t URL: https://%s:%s/\n'
5222 '\t User: %s\n'
5223 '\tPassword: %s\n' % (
5224 get_fqdn(), port,
5225 ctx.initial_dashboard_user,
5226 password))
5227
5228
5229 def prepare_bootstrap_config(
5230 ctx: CephadmContext,
5231 fsid: str, mon_addr: str, image: str
5232
5233 ) -> str:
5234
5235 cp = read_config(ctx.config)
5236 if not cp.has_section('global'):
5237 cp.add_section('global')
5238 cp.set('global', 'fsid', fsid)
5239 cp.set('global', 'mon_host', mon_addr)
5240 cp.set('global', 'container_image', image)
5241
5242 if not cp.has_section('mon'):
5243 cp.add_section('mon')
5244 if (
5245 not cp.has_option('mon', 'auth_allow_insecure_global_id_reclaim')
5246 and not cp.has_option('mon', 'auth allow insecure global id reclaim')
5247 ):
5248 cp.set('mon', 'auth_allow_insecure_global_id_reclaim', 'false')
5249
5250 if ctx.single_host_defaults:
5251 logger.info('Adjusting default settings to suit single-host cluster...')
5252 # replicate across osds, not hosts
5253 if (
5254 not cp.has_option('global', 'osd_crush_chooseleaf_type')
5255 and not cp.has_option('global', 'osd crush chooseleaf type')
5256 ):
5257 cp.set('global', 'osd_crush_chooseleaf_type', '0')
5258 # replica 2x
5259 if (
5260 not cp.has_option('global', 'osd_pool_default_size')
5261 and not cp.has_option('global', 'osd pool default size')
5262 ):
5263 cp.set('global', 'osd_pool_default_size', '2')
5264 # disable mgr standby modules (so we can colocate multiple mgrs on one host)
5265 if not cp.has_section('mgr'):
5266 cp.add_section('mgr')
5267 if (
5268 not cp.has_option('mgr', 'mgr_standby_modules')
5269 and not cp.has_option('mgr', 'mgr standby modules')
5270 ):
5271 cp.set('mgr', 'mgr_standby_modules', 'false')
5272 if ctx.log_to_file:
5273 cp.set('global', 'log_to_file', 'true')
5274 cp.set('global', 'log_to_stderr', 'false')
5275 cp.set('global', 'log_to_journald', 'false')
5276 cp.set('global', 'mon_cluster_log_to_file', 'true')
5277 cp.set('global', 'mon_cluster_log_to_stderr', 'false')
5278 cp.set('global', 'mon_cluster_log_to_journald', 'false')
5279
5280 cpf = StringIO()
5281 cp.write(cpf)
5282 config = cpf.getvalue()
5283
5284 if ctx.registry_json or ctx.registry_url:
5285 command_registry_login(ctx)
5286
5287 return config
5288
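# the returned config is a plain INI blob; for a hypothetical fsid and mon
# address it looks roughly like:
#
#   [global]
#   fsid = 00000000-1111-2222-3333-444444444444
#   mon_host = [v2:10.0.0.1:3300,v1:10.0.0.1:6789]
#   container_image = <image>
#
#   [mon]
#   auth_allow_insecure_global_id_reclaim = false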
5289
5290 def finish_bootstrap_config(
5291 ctx: CephadmContext,
5292 fsid: str,
5293 config: str,
5294 mon_id: str, mon_dir: str,
5295 mon_network: Optional[str], ipv6: bool,
5296 cli: Callable,
5297 cluster_network: Optional[str], ipv6_cluster_network: bool
5298
5299 ) -> None:
5300 if not ctx.no_minimize_config:
5301 logger.info('Assimilating anything we can from ceph.conf...')
5302 cli([
5303 'config', 'assimilate-conf',
5304 '-i', '/var/lib/ceph/mon/ceph-%s/config' % mon_id
5305 ], {
5306 mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % mon_id
5307 })
5308 logger.info('Generating new minimal ceph.conf...')
5309 cli([
5310 'config', 'generate-minimal-conf',
5311 '-o', '/var/lib/ceph/mon/ceph-%s/config' % mon_id
5312 ], {
5313 mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % mon_id
5314 })
5315 # re-read our minimized config
5316 with open(mon_dir + '/config', 'r') as f:
5317 config = f.read()
5318 logger.info('Restarting the monitor...')
5319 call_throws(ctx, [
5320 'systemctl',
5321 'restart',
5322 get_unit_name(fsid, 'mon', mon_id)
5323 ])
5324 elif 'image' in ctx and ctx.image:
5325 # we still want to assimilate the given container image if provided
5326 cli(['config', 'set', 'global', 'container_image', f'{ctx.image}'])
5327
5328 if mon_network:
5329 logger.info(f'Setting mon public_network to {mon_network}')
5330 cli(['config', 'set', 'mon', 'public_network', mon_network])
5331
5332 if cluster_network:
5333 logger.info(f'Setting cluster_network to {cluster_network}')
5334 cli(['config', 'set', 'global', 'cluster_network', cluster_network])
5335
5336 if ipv6 or ipv6_cluster_network:
5337 logger.info('Enabling IPv6 (ms_bind_ipv6) binding')
5338 cli(['config', 'set', 'global', 'ms_bind_ipv6', 'true'])
5339
5340 with open(ctx.output_config, 'w') as f:
5341 f.write(config)
5342 logger.info('Wrote config to %s' % ctx.output_config)
5344
5345
5346 # funcs to process spec file for apply spec
5347 def _parse_yaml_docs(f: Iterable[str]) -> List[List[str]]:
5348 docs = []
5349 current_doc = [] # type: List[str]
5350 for line in f:
5351 if re.search(r'^---\s+', line):
5352 if current_doc:
5353 docs.append(current_doc)
5354 current_doc = []
5355 else:
5356 current_doc.append(line.rstrip())
5357 if current_doc:
5358 docs.append(current_doc)
5359 return docs
5360
5361
5362 def _parse_yaml_obj(doc: List[str]) -> Dict[str, str]:
5363 # note: this only parses the first layer of yaml
5364 obj = {} # type: Dict[str, str]
5365 current_key = ''
5366 for line in doc:
5367 if line.startswith(' '):
5368 obj[current_key] += line.strip()
5369 elif line.endswith(':'):
5370 current_key = line.strip(':')
5371 obj[current_key] = ''
5372 else:
5373 current_key, val = line.split(':', 1)  # split once so values containing ':' (e.g. IPv6 addrs) parse
5374 obj[current_key] = val.strip()
5375 return obj
5376
5377
5378 def parse_yaml_objs(f: Iterable[str]) -> List[Dict[str, str]]:
5379 objs = []
5380 for d in _parse_yaml_docs(f):
5381 objs.append(_parse_yaml_obj(d))
5382 return objs
5383
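# example: a minimal host spec document such as (names hypothetical)
#
#   ---
#   service_type: host
#   hostname: node1
#   addr: 10.0.0.1
#
# parses to {'service_type': 'host', 'hostname': 'node1', 'addr': '10.0.0.1'};
# only the first layer of keys is handled, nested structures are not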
5384
5385 def _distribute_ssh_keys(ctx: CephadmContext, host_spec: Dict[str, str], bootstrap_hostname: str) -> int:
5386 # copy ssh key to hosts in host spec (used for apply spec)
5387 ssh_key = CEPH_DEFAULT_PUBKEY
5388 if ctx.ssh_public_key:
5389 ssh_key = ctx.ssh_public_key.name
5390
5391 if bootstrap_hostname != host_spec['hostname']:
5392 if 'addr' in host_spec:
5393 addr = host_spec['addr']
5394 else:
5395 addr = host_spec['hostname']
5396 out, err, code = call(ctx, ['sudo', '-u', ctx.ssh_user, 'ssh-copy-id', '-f', '-i', ssh_key, '-o StrictHostKeyChecking=no', '%s@%s' % (ctx.ssh_user, addr)])
5397 if code:
5398 logger.error('\nCopying ssh key to host %s at address %s failed!\n' % (host_spec['hostname'], addr))
5399 return 1
5400 else:
5401 logger.info('Added ssh key to host %s at address %s\n' % (host_spec['hostname'], addr))
5402 return 0
5403
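# e.g. for a hypothetical spec {'hostname': 'node2', 'addr': '10.0.0.2'} and the
# default root ssh user, this runs:
#   sudo -u root ssh-copy-id -f -i /etc/ceph/ceph.pub -o StrictHostKeyChecking=no root@10.0.0.2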
5404
5405 def save_cluster_config(ctx: CephadmContext, uid: int, gid: int, fsid: str) -> None:
5406 """Save cluster configuration to the per fsid directory """
5407 def copy_file(src: str, dst: str) -> None:
5408 if src:
5409 shutil.copyfile(src, dst)
5410
5411 conf_dir = f'{ctx.data_dir}/{fsid}/{CEPH_CONF_DIR}'
5412 makedirs(conf_dir, uid, gid, DATA_DIR_MODE)
5413 if os.path.exists(conf_dir):
5414 logger.info(f'Saving cluster configuration to {conf_dir} directory')
5415 copy_file(ctx.output_config, os.path.join(conf_dir, CEPH_CONF))
5416 copy_file(ctx.output_keyring, os.path.join(conf_dir, CEPH_KEYRING))
5417 # ctx.output_pub_ssh_key may not exist if user has provided custom ssh keys
5418 if os.path.exists(ctx.output_pub_ssh_key):
5419 copy_file(ctx.output_pub_ssh_key, os.path.join(conf_dir, CEPH_PUBKEY))
5420 else:
5421 logger.warning(f'Cannot create cluster configuration directory {conf_dir}')
5422
5423
5424 @default_image
5425 def command_bootstrap(ctx):
5426 # type: (CephadmContext) -> int
5427
5428 ctx.error_code = 0
5429
5430 if not ctx.output_config:
5431 ctx.output_config = os.path.join(ctx.output_dir, CEPH_CONF)
5432 if not ctx.output_keyring:
5433 ctx.output_keyring = os.path.join(ctx.output_dir, CEPH_KEYRING)
5434 if not ctx.output_pub_ssh_key:
5435 ctx.output_pub_ssh_key = os.path.join(ctx.output_dir, CEPH_PUBKEY)
5436
5437 if bool(ctx.ssh_private_key) is not bool(ctx.ssh_public_key):
5438 raise Error('--ssh-private-key and --ssh-public-key must be provided together or not at all.')
5439
5440 if ctx.fsid:
5441 data_dir_base = os.path.join(ctx.data_dir, ctx.fsid)
5442 if os.path.exists(data_dir_base):
5443 raise Error(f"A cluster with the same fsid '{ctx.fsid}' already exists.")
5444 else:
5445 logger.warning('Specifying an fsid for your cluster offers no advantages and may increase the likelihood of fsid conflicts.')
5446
5447 # verify output files
5448 for f in [ctx.output_config, ctx.output_keyring,
5449 ctx.output_pub_ssh_key]:
5450 if not ctx.allow_overwrite:
5451 if os.path.exists(f):
5452 raise Error('%s already exists; delete or pass '
5453 '--allow-overwrite to overwrite' % f)
5454 dirname = os.path.dirname(f)
5455 if dirname and not os.path.exists(dirname):
5456 fname = os.path.basename(f)
5457 logger.info(f'Creating directory {dirname} for {fname}')
5458 try:
5459 # use makedirs to create intermediate missing dirs
5460 os.makedirs(dirname, 0o755)
5461 except PermissionError:
5462 raise Error(f'Unable to create {dirname} due to a permissions failure. Retry as root or with sudo, or preallocate the directory.')
5463
5464 (user_conf, _) = get_config_and_keyring(ctx)
5465
5466 if ctx.ssh_user != 'root':
5467 check_ssh_connectivity(ctx)
5468
5469 if not ctx.skip_prepare_host:
5470 command_prepare_host(ctx)
5471 else:
5472 logger.info('Skip prepare_host')
5473
5474 # initial vars
5475 fsid = ctx.fsid or make_fsid()
5476 if not is_fsid(fsid):
5477 raise Error('not an fsid: %s' % fsid)
5478 logger.info('Cluster fsid: %s' % fsid)
5479
5480 hostname = get_hostname()
5481 if '.' in hostname and not ctx.allow_fqdn_hostname:
5482 raise Error('hostname is a fully qualified domain name (%s); either fix (e.g., "sudo hostname %s" or similar) or pass --allow-fqdn-hostname' % (hostname, hostname.split('.')[0]))
5483 mon_id = ctx.mon_id or hostname
5484 mgr_id = ctx.mgr_id or generate_service_id()
5485
5486 lock = FileLock(ctx, fsid)
5487 lock.acquire()
5488
5489 (addr_arg, ipv6, mon_network) = prepare_mon_addresses(ctx)
5490 cluster_network, ipv6_cluster_network = prepare_cluster_network(ctx)
5491
5492 config = prepare_bootstrap_config(ctx, fsid, addr_arg, ctx.image)
5493
5494 if not ctx.skip_pull:
5495 try:
5496 _pull_image(ctx, ctx.image)
5497 except UnauthorizedRegistryError:
5498 err_str = 'Failed to pull container image. Check that correct registry credentials are provided in bootstrap by --registry-url, --registry-username, --registry-password, or supply --registry-json with credentials'
5499 logger.debug(f'Pulling image for bootstrap on {hostname} failed: {err_str}')
5500 raise Error(err_str)
5501
5502 image_ver = CephContainer(ctx, ctx.image, 'ceph', ['--version']).run().strip()
5503 logger.info(f'Ceph version: {image_ver}')
5504
5505 if not ctx.allow_mismatched_release:
5506 image_release = image_ver.split()[4]
5507 if image_release not in \
5508 [DEFAULT_IMAGE_RELEASE, LATEST_STABLE_RELEASE]:
5509 raise Error(
5510 f'Container release {image_release} != cephadm release {DEFAULT_IMAGE_RELEASE};'
5511 ' please use matching version of cephadm (pass --allow-mismatched-release to continue anyway)'
5512 )
5513
5514 logger.info('Extracting ceph user uid/gid from container image...')
5515 (uid, gid) = extract_uid_gid(ctx)
5516
5517 # create some initial keys
5518 (mon_key, mgr_key, admin_key, bootstrap_keyring, admin_keyring) = create_initial_keys(ctx, uid, gid, mgr_id)
5519
5520 monmap = create_initial_monmap(ctx, uid, gid, fsid, mon_id, addr_arg)
5521 (mon_dir, log_dir) = prepare_create_mon(ctx, uid, gid, fsid, mon_id,
5522 bootstrap_keyring.name, monmap.name)
5523
5524 with open(mon_dir + '/config', 'w') as f:
5525 os.fchown(f.fileno(), uid, gid)
5526 os.fchmod(f.fileno(), 0o600)
5527 f.write(config)
5528
5529 make_var_run(ctx, fsid, uid, gid)
5530 create_mon(ctx, uid, gid, fsid, mon_id)
5531
5532 # config to issue various CLI commands
5533 tmp_config = write_tmp(config, uid, gid)
5534
5535 # a CLI helper to reduce our typing
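# (each call spins up a short-lived 'ceph' CLI container with the log dir,
# admin keyring, and temporary bootstrap conf mounted in; extra_mounts lets
# individual calls bind in more files, e.g. a spec or cert)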
5536 def cli(cmd, extra_mounts={}, timeout=DEFAULT_TIMEOUT, verbosity=CallVerbosity.VERBOSE_ON_FAILURE):
5537 # type: (List[str], Dict[str, str], Optional[int], CallVerbosity) -> str
5538 mounts = {
5539 log_dir: '/var/log/ceph:z',
5540 admin_keyring.name: '/etc/ceph/ceph.client.admin.keyring:z',
5541 tmp_config.name: '/etc/ceph/ceph.conf:z',
5542 }
5543 for k, v in extra_mounts.items():
5544 mounts[k] = v
5545 timeout = timeout or ctx.timeout
5546 return CephContainer(
5547 ctx,
5548 image=ctx.image,
5549 entrypoint='/usr/bin/ceph',
5550 args=cmd,
5551 volume_mounts=mounts,
5552 ).run(timeout=timeout, verbosity=verbosity)
5553
5554 wait_for_mon(ctx, mon_id, mon_dir, admin_keyring.name, tmp_config.name)
5555
5556 finish_bootstrap_config(ctx, fsid, config, mon_id, mon_dir,
5557 mon_network, ipv6, cli,
5558 cluster_network, ipv6_cluster_network)
5559
5560 # output files
5561 with open(ctx.output_keyring, 'w') as f:
5562 os.fchmod(f.fileno(), 0o600)
5563 f.write('[client.admin]\n'
5564 '\tkey = ' + admin_key + '\n')
5565 logger.info('Wrote keyring to %s' % ctx.output_keyring)
5566
5567 # create mgr
5568 create_mgr(ctx, uid, gid, fsid, mgr_id, mgr_key, config, cli)
5569
5570 if user_conf:
5571 # user given config settings were already assimilated earlier
5572 # but if the given settings contained any attributes in
5573 # the mgr (e.g. mgr/cephadm/container_image_prometheus)
5574 # they don't seem to be stored if there isn't a mgr yet.
5575 # Since re-assimilating the same conf settings should be
5576 # idempotent we can just do it again here.
5577 with tempfile.NamedTemporaryFile(buffering=0) as tmp:
5578 tmp.write(user_conf.encode('utf-8'))
5579 cli(['config', 'assimilate-conf',
5580 '-i', '/var/lib/ceph/user.conf'],
5581 {tmp.name: '/var/lib/ceph/user.conf:z'})
5582
5583 # wait for mgr to restart (after enabling a module)
5584 def wait_for_mgr_restart() -> None:
5585 # first get latest mgrmap epoch from the mon. try newer 'mgr
5586 # stat' command first, then fall back to 'mgr dump' if
5587 # necessary
5588 try:
5589 j = json_loads_retry(lambda: cli(['mgr', 'stat'], verbosity=CallVerbosity.QUIET_UNLESS_ERROR))
5590 except Exception:
5591 j = json_loads_retry(lambda: cli(['mgr', 'dump'], verbosity=CallVerbosity.QUIET_UNLESS_ERROR))
5592 epoch = j['epoch']
5593
5594 # wait for mgr to have it
5595 logger.info('Waiting for the mgr to restart...')
5596
5597 def mgr_has_latest_epoch():
5598 # type: () -> bool
5599 try:
5600 out = cli(['tell', 'mgr', 'mgr_status'])
5601 j = json.loads(out)
5602 return j['mgrmap_epoch'] >= epoch
5603 except Exception as e:
5604 logger.debug('tell mgr mgr_status failed: %s' % e)
5605 return False
5606 is_available(ctx, 'mgr epoch %d' % epoch, mgr_has_latest_epoch)
5607
5608 enable_cephadm_mgr_module(cli, wait_for_mgr_restart)
5609
5610 # ssh
5611 if not ctx.skip_ssh:
5612 prepare_ssh(ctx, cli, wait_for_mgr_restart)
5613
5614 if ctx.registry_url and ctx.registry_username and ctx.registry_password:
5615 registry_credentials = {'url': ctx.registry_url, 'username': ctx.registry_username, 'password': ctx.registry_password}
5616 cli(['config-key', 'set', 'mgr/cephadm/registry_credentials', json.dumps(registry_credentials)])
5617
5618 cli(['config', 'set', 'mgr', 'mgr/cephadm/container_init', str(ctx.container_init), '--force'])
5619
5620 if not ctx.skip_dashboard:
5621 prepare_dashboard(ctx, uid, gid, cli, wait_for_mgr_restart)
5622
5623 if ctx.output_config == CEPH_DEFAULT_CONF and not ctx.skip_admin_label and not ctx.no_minimize_config:
5624 logger.info('Enabling client.admin keyring and conf on hosts with "admin" label')
5625 try:
5626 cli(['orch', 'client-keyring', 'set', 'client.admin', 'label:_admin'])
5627 cli(['orch', 'host', 'label', 'add', get_hostname(), '_admin'])
5628 except Exception:
5629 logger.info('Unable to set up "admin" label; assuming older version of Ceph')
5630
5631 if ctx.apply_spec:
5632 logger.info('Applying %s to cluster' % ctx.apply_spec)
5633 # copy ssh key to hosts in spec file
5634 with open(ctx.apply_spec) as f:
5635 try:
5636 for spec in parse_yaml_objs(f):
5637 if spec.get('service_type') == 'host':
5638 _distribute_ssh_keys(ctx, spec, hostname)
5639 except ValueError:
5640 logger.info('Unable to parse %s successfully' % ctx.apply_spec)
5641
5642 mounts = {}
5643 mounts[pathify(ctx.apply_spec)] = '/tmp/spec.yml:ro'
5644 try:
5645 out = cli(['orch', 'apply', '-i', '/tmp/spec.yml'], extra_mounts=mounts)
5646 logger.info(out)
5647 except Exception:
5648 ctx.error_code = -errno.EINVAL
5649 logger.error('\nApplying %s to cluster failed!\n' % ctx.apply_spec)
5650
5651 save_cluster_config(ctx, uid, gid, fsid)
5652
5653 # enable autotune for osd_memory_target
5654 logger.info('Enabling autotune for osd_memory_target')
5655 cli(['config', 'set', 'osd', 'osd_memory_target_autotune', 'true'])
5656
5657 # Notify the Dashboard to show the 'Expand cluster' page on first log in.
5658 cli(['config-key', 'set', 'mgr/dashboard/cluster/status', 'INSTALLED'])
5659
5660 logger.info('You can access the Ceph CLI as follows in case of a multi-cluster or non-default config:\n\n'
5661 '\tsudo %s shell --fsid %s -c %s -k %s\n' % (
5662 sys.argv[0],
5663 fsid,
5664 ctx.output_config,
5665 ctx.output_keyring))
5666
5667 logger.info('Or, if you are only running a single cluster on this host:\n\n\tsudo %s shell \n' % (sys.argv[0]))
5668
5669 logger.info('Please consider enabling telemetry to help improve Ceph:\n\n'
5670 '\tceph telemetry on\n\n'
5671 'For more information see:\n\n'
5672 '\thttps://docs.ceph.com/docs/master/mgr/telemetry/\n')
5673 logger.info('Bootstrap complete.')
5674 return ctx.error_code
5675
5676 ##################################
5677
5678
5679 def command_registry_login(ctx: CephadmContext) -> int:
5680 if ctx.registry_json:
5681 logger.info('Pulling custom registry login info from %s.' % ctx.registry_json)
5682 d = get_parm(ctx.registry_json)
5683 if d.get('url') and d.get('username') and d.get('password'):
5684 ctx.registry_url = d.get('url')
5685 ctx.registry_username = d.get('username')
5686 ctx.registry_password = d.get('password')
5687 registry_login(ctx, ctx.registry_url, ctx.registry_username, ctx.registry_password)
5688 else:
5689 raise Error('json provided for custom registry login did not include all necessary fields. '
5690 'Please set up the json file as\n'
5691 '{\n'
5692 ' "url": "REGISTRY_URL",\n'
5693 ' "username": "REGISTRY_USERNAME",\n'
5694 ' "password": "REGISTRY_PASSWORD"\n'
5695 '}\n')
5696 elif ctx.registry_url and ctx.registry_username and ctx.registry_password:
5697 registry_login(ctx, ctx.registry_url, ctx.registry_username, ctx.registry_password)
5698 else:
5699 raise Error('Invalid custom registry arguments received. To login to a custom registry include '
5700 '--registry-url, --registry-username and --registry-password '
5701 'options or --registry-json option')
5702 return 0
5703
5704
5705 def registry_login(ctx: CephadmContext, url: Optional[str], username: Optional[str], password: Optional[str]) -> None:
5706 logger.info('Logging into custom registry.')
5707 try:
5708 engine = ctx.container_engine
5709 cmd = [engine.path, 'login',
5710 '-u', username, '-p', password,
5711 url]
5712 if isinstance(engine, Podman):
5713 cmd.append('--authfile=/etc/ceph/podman-auth.json')
5714 out, _, _ = call_throws(ctx, cmd)
5715 if isinstance(engine, Podman):
5716 os.chmod('/etc/ceph/podman-auth.json', 0o600)
5717 except Exception:
5718 raise Error('Failed to login to custom registry @ %s as %s with given password' % (ctx.registry_url, ctx.registry_username))
5719
5720 ##################################
5721
5722
5723 def extract_uid_gid_monitoring(ctx, daemon_type):
5724 # type: (CephadmContext, str) -> Tuple[int, int]
5725
5726 if daemon_type == 'prometheus':
5727 uid, gid = extract_uid_gid(ctx, file_path='/etc/prometheus')
5728 elif daemon_type == 'node-exporter':
5729 uid, gid = 65534, 65534
5730 elif daemon_type == 'grafana':
5731 uid, gid = extract_uid_gid(ctx, file_path='/var/lib/grafana')
5732 elif daemon_type == 'loki':
5733 uid, gid = extract_uid_gid(ctx, file_path='/etc/loki')
5734 elif daemon_type == 'promtail':
5735 uid, gid = extract_uid_gid(ctx, file_path='/etc/promtail')
5736 elif daemon_type == 'alertmanager':
5737 uid, gid = extract_uid_gid(ctx, file_path=['/etc/alertmanager', '/etc/prometheus'])
5738 else:
5739 raise Error('{} not implemented yet'.format(daemon_type))
5740 return uid, gid
5741
5742
5743 def get_deployment_container(ctx: CephadmContext,
5744 fsid: str, daemon_type: str, daemon_id: Union[int, str],
5745 privileged: bool = False,
5746 ptrace: bool = False,
5747 container_args: Optional[List[str]] = None) -> 'CephContainer':
5748 # wrapper for get_container specifically for containers made during the `cephadm deploy`
5749 # command. Adds some extra things such as extra container args and custom config files
5750 c = get_container(ctx, fsid, daemon_type, daemon_id, privileged, ptrace, container_args)
5751 if 'extra_container_args' in ctx and ctx.extra_container_args:
5752 c.container_args.extend(ctx.extra_container_args)
5753 if 'config_json' in ctx and ctx.config_json:
5754 conf_files = get_custom_config_files(ctx.config_json)
5755 mandatory_keys = ['mount_path', 'content']
5756 for conf in conf_files['custom_config_files']:
5757 if all(k in conf for k in mandatory_keys):
5758 mount_path = conf['mount_path']
5759 file_path = os.path.join(
5760 ctx.data_dir,
5761 fsid,
5762 'custom_config_files',
5763 f'{daemon_type}.{daemon_id}',
5764 os.path.basename(mount_path)
5765 )
5766 c.volume_mounts[file_path] = mount_path
5767 return c
5768
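# the custom config handling above expects a config-json shaped roughly like
# the following (paths hypothetical); each file's content is written under the
# per-daemon custom_config_files dir and bind-mounted into the container at
# its mount_path:
#
#   {"custom_config_files": [
#       {"mount_path": "/etc/grafana/grafana.ini", "content": "..."}
#   ]}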
5769
5770 @default_image
5771 def command_deploy(ctx):
5772 # type: (CephadmContext) -> None
5773 daemon_type, daemon_id = ctx.name.split('.', 1)
5774
5775 lock = FileLock(ctx, ctx.fsid)
5776 lock.acquire()
5777
5778 if daemon_type not in get_supported_daemons():
5779 raise Error('daemon type %s not recognized' % daemon_type)
5780
5781 redeploy = False
5782 unit_name = get_unit_name(ctx.fsid, daemon_type, daemon_id)
5783 (_, state, _) = check_unit(ctx, unit_name)
5784 if state == 'running' or is_container_running(ctx, CephContainer.for_daemon(ctx, ctx.fsid, daemon_type, daemon_id, 'bash')):
5785 redeploy = True
5786
5787 if ctx.reconfig:
5788 logger.info('%s daemon %s ...' % ('Reconfig', ctx.name))
5789 elif redeploy:
5790 logger.info('%s daemon %s ...' % ('Redeploy', ctx.name))
5791 else:
5792 logger.info('%s daemon %s ...' % ('Deploy', ctx.name))
5793
5794 # Migrate sysctl conf files from /usr/lib to /etc
5795 migrate_sysctl_dir(ctx, ctx.fsid)
5796
5797 # Get and check ports explicitly required to be opened
5798 daemon_ports = [] # type: List[int]
5799
5800 # only check port in use if not reconfig or redeploy since the service
5801 # we are redeploying/reconfiguring will already be using the port
5802 if not ctx.reconfig and not redeploy:
5803 if ctx.tcp_ports:
5804 daemon_ports = list(map(int, ctx.tcp_ports.split()))
5805
5806 if daemon_type in Ceph.daemons:
5807 config, keyring = get_config_and_keyring(ctx)
5808 uid, gid = extract_uid_gid(ctx)
5809 make_var_run(ctx, ctx.fsid, uid, gid)
5810
5811 c = get_deployment_container(ctx, ctx.fsid, daemon_type, daemon_id,
5812 ptrace=ctx.allow_ptrace)
5813 deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
5814 config=config, keyring=keyring,
5815 osd_fsid=ctx.osd_fsid,
5816 reconfig=ctx.reconfig,
5817 ports=daemon_ports)
5818
5819 elif daemon_type in Monitoring.components:
5820 # monitoring daemon - prometheus, grafana, alertmanager, node-exporter
5821 # Default Checks
5822 # make sure provided config-json is sufficient
5823 config = get_parm(ctx.config_json) # type: ignore
5824 required_files = Monitoring.components[daemon_type].get('config-json-files', list())
5825 required_args = Monitoring.components[daemon_type].get('config-json-args', list())
5826 if required_files:
5827 if not config or not all(c in config.get('files', {}).keys() for c in required_files): # type: ignore
5828 raise Error('{} deployment requires config-json which must '
5829 'contain file content for {}'.format(daemon_type.capitalize(), ', '.join(required_files)))
5830 if required_args:
5831 if not config or not all(c in config.keys() for c in required_args): # type: ignore
5832 raise Error('{} deployment requires config-json which must '
5833 'contain arg for {}'.format(daemon_type.capitalize(), ', '.join(required_args)))
5834
5835 uid, gid = extract_uid_gid_monitoring(ctx, daemon_type)
5836 c = get_deployment_container(ctx, ctx.fsid, daemon_type, daemon_id)
5837 deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
5838 reconfig=ctx.reconfig,
5839 ports=daemon_ports)
5840
5841 elif daemon_type == NFSGanesha.daemon_type:
5842 if not ctx.reconfig and not redeploy and not daemon_ports:
5843 daemon_ports = list(NFSGanesha.port_map.values())
5844
5845 config, keyring = get_config_and_keyring(ctx)
5846 # TODO: extract ganesha uid/gid (997, 994) ?
5847 uid, gid = extract_uid_gid(ctx)
5848 c = get_deployment_container(ctx, ctx.fsid, daemon_type, daemon_id)
5849 deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
5850 config=config, keyring=keyring,
5851 reconfig=ctx.reconfig,
5852 ports=daemon_ports)
5853
5854 elif daemon_type == CephIscsi.daemon_type:
5855 config, keyring = get_config_and_keyring(ctx)
5856 uid, gid = extract_uid_gid(ctx)
5857 c = get_deployment_container(ctx, ctx.fsid, daemon_type, daemon_id)
5858 deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
5859 config=config, keyring=keyring,
5860 reconfig=ctx.reconfig,
5861 ports=daemon_ports)
5862
5863 elif daemon_type == HAproxy.daemon_type:
5864 haproxy = HAproxy.init(ctx, ctx.fsid, daemon_id)
5865 uid, gid = haproxy.extract_uid_gid_haproxy()
5866 c = get_deployment_container(ctx, ctx.fsid, daemon_type, daemon_id)
5867 deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
5868 reconfig=ctx.reconfig,
5869 ports=daemon_ports)
5870
5871 elif daemon_type == Keepalived.daemon_type:
5872 keepalived = Keepalived.init(ctx, ctx.fsid, daemon_id)
5873 uid, gid = keepalived.extract_uid_gid_keepalived()
5874 c = get_deployment_container(ctx, ctx.fsid, daemon_type, daemon_id)
5875 deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
5876 reconfig=ctx.reconfig,
5877 ports=daemon_ports)
5878
5879 elif daemon_type == CustomContainer.daemon_type:
5880 cc = CustomContainer.init(ctx, ctx.fsid, daemon_id)
5881 if not ctx.reconfig and not redeploy:
5882 daemon_ports.extend(cc.ports)
5883 c = get_deployment_container(ctx, ctx.fsid, daemon_type, daemon_id,
5884 privileged=cc.privileged,
5885 ptrace=ctx.allow_ptrace)
5886 deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c,
5887 uid=cc.uid, gid=cc.gid, config=None,
5888 keyring=None, reconfig=ctx.reconfig,
5889 ports=daemon_ports)
5890
5891 elif daemon_type == CephadmAgent.daemon_type:
5892 # get current user gid and uid
5893 uid = os.getuid()
5894 gid = os.getgid()
5895 deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, None,
5896 uid, gid, ports=daemon_ports)
5897
5898 elif daemon_type == SNMPGateway.daemon_type:
5899 sc = SNMPGateway.init(ctx, ctx.fsid, daemon_id)
5900 c = get_deployment_container(ctx, ctx.fsid, daemon_type, daemon_id)
5901 deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c,
5902 sc.uid, sc.gid,
5903 ports=daemon_ports)
5904
5905 else:
5906 raise Error('daemon type {} not implemented in command_deploy function'
5907 .format(daemon_type))
5908
5909 ##################################
5910
5911
5912 @infer_image
5913 def command_run(ctx):
5914 # type: (CephadmContext) -> int
5915 (daemon_type, daemon_id) = ctx.name.split('.', 1)
5916 c = get_container(ctx, ctx.fsid, daemon_type, daemon_id)
5917 command = c.run_cmd()
5918 return call_timeout(ctx, command, ctx.timeout)
5919
5920 ##################################
5921
5922
5923 @infer_fsid
5924 @infer_config
5925 @infer_image
5926 @validate_fsid
5927 def command_shell(ctx):
5928 # type: (CephadmContext) -> int
5929 cp = read_config(ctx.config)
5930 if cp.has_option('global', 'fsid') and \
5931 cp.get('global', 'fsid') != ctx.fsid:
5932 raise Error('fsid does not match ceph.conf')
5933
5934 if ctx.name:
5935 if '.' in ctx.name:
5936 (daemon_type, daemon_id) = ctx.name.split('.', 1)
5937 else:
5938 daemon_type = ctx.name
5939 daemon_id = None
5940 else:
5941 daemon_type = 'osd' # get the most mounts
5942 daemon_id = None
5943
5944 if ctx.fsid and daemon_type in Ceph.daemons:
5945 make_log_dir(ctx, ctx.fsid)
5946
5947 if daemon_id and not ctx.fsid:
5948 raise Error('must pass --fsid to specify cluster')
5949
5950 # In case a dedicated keyring for the specified fsid is found, we use it.
5951 # Otherwise, use /etc/ceph files by default, if present. We do this instead of
5952 # making these defaults in the arg parser because we don't want an error
5953 # if they don't exist.
5954 if not ctx.keyring:
5955 keyring_file = f'{ctx.data_dir}/{ctx.fsid}/{CEPH_CONF_DIR}/{CEPH_KEYRING}'
5956 if os.path.exists(keyring_file):
5957 ctx.keyring = keyring_file
5958 elif os.path.exists(CEPH_DEFAULT_KEYRING):
5959 ctx.keyring = CEPH_DEFAULT_KEYRING
5960
5961 container_args: List[str] = ['-i']
5962 mounts = get_container_mounts(ctx, ctx.fsid, daemon_type, daemon_id,
5963 no_config=bool(ctx.config))
5964 binds = get_container_binds(ctx, ctx.fsid, daemon_type, daemon_id)
5965 if ctx.config:
5966 mounts[pathify(ctx.config)] = '/etc/ceph/ceph.conf:z'
5967 if ctx.keyring:
5968 mounts[pathify(ctx.keyring)] = '/etc/ceph/ceph.keyring:z'
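# each --mount argument is 'src[:dst[:options]]'; e.g. a hypothetical
# '--mount /src/ceph:/mnt/ceph:ro' maps /src/ceph read-only, while a bare
# '--mount /src/ceph' defaults to /mnt/ceph inside the container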
5969 if ctx.mount:
5970 for _mount in ctx.mount:
5971 split_src_dst = _mount.split(':')
5972 mount = pathify(split_src_dst[0])
5973 filename = os.path.basename(split_src_dst[0])
5974 if len(split_src_dst) > 1:
5975 dst = split_src_dst[1]
5976 if len(split_src_dst) == 3:
5977 dst = '{}:{}'.format(dst, split_src_dst[2])
5978 mounts[mount] = dst
5979 else:
5980 mounts[mount] = '/mnt/{}'.format(filename)
5981 if ctx.command:
5982 command = ctx.command
5983 else:
5984 command = ['bash']
5985 container_args += [
5986 '-t',
5987 '-e', 'LANG=C',
5988 '-e', 'PS1=%s' % CUSTOM_PS1,
5989 ]
5990 if ctx.fsid:
5991 home = os.path.join(ctx.data_dir, ctx.fsid, 'home')
5992 if not os.path.exists(home):
5993 logger.debug('Creating root home at %s' % home)
5994 makedirs(home, 0, 0, 0o660)
5995 if os.path.exists('/etc/skel'):
5996 for f in os.listdir('/etc/skel'):
5997 if f.startswith('.bash'):
5998 shutil.copyfile(os.path.join('/etc/skel', f),
5999 os.path.join(home, f))
6000 mounts[home] = '/root'
6001
6002 for i in ctx.volume:
6003 a, b = i.split(':', 1)
6004 mounts[a] = b
6005
6006 c = CephContainer(
6007 ctx,
6008 image=ctx.image,
6009 entrypoint='doesnotmatter',
6010 args=[],
6011 container_args=container_args,
6012 volume_mounts=mounts,
6013 bind_mounts=binds,
6014 envs=ctx.env,
6015 privileged=True)
6016 command = c.shell_cmd(command)
6017
6018 return call_timeout(ctx, command, ctx.timeout)
6019
6020 ##################################
6021
6022
6023 @infer_fsid
6024 def command_enter(ctx):
6025 # type: (CephadmContext) -> int
6026 if not ctx.fsid:
6027 raise Error('must pass --fsid to specify cluster')
6028 (daemon_type, daemon_id) = ctx.name.split('.', 1)
6029 container_args = ['-i'] # type: List[str]
6030 if ctx.command:
6031 command = ctx.command
6032 else:
6033 command = ['sh']
6034 container_args += [
6035 '-t',
6036 '-e', 'LANG=C',
6037 '-e', 'PS1=%s' % CUSTOM_PS1,
6038 ]
6039 c = CephContainer(
6040 ctx,
6041 image=ctx.image,
6042 entrypoint='doesnotmatter',
6043 container_args=container_args,
6044 cname='ceph-%s-%s.%s' % (ctx.fsid, daemon_type, daemon_id),
6045 )
6046 command = c.exec_cmd(command)
6047 return call_timeout(ctx, command, ctx.timeout)
6048
6049 ##################################
6050
6051
6052 @infer_fsid
6053 @infer_image
6054 @validate_fsid
6055 def command_ceph_volume(ctx):
6056 # type: (CephadmContext) -> None
6057 cp = read_config(ctx.config)
6058 if cp.has_option('global', 'fsid') and \
6059 cp.get('global', 'fsid') != ctx.fsid:
6060 raise Error('fsid does not match ceph.conf')
6061
6062 if ctx.fsid:
6063 make_log_dir(ctx, ctx.fsid)
6064
6065 lock = FileLock(ctx, ctx.fsid)
6066 lock.acquire()
6067
6068 (uid, gid) = (0, 0) # ceph-volume runs as root
6069 mounts = get_container_mounts(ctx, ctx.fsid, 'osd', None)
6070
6071 tmp_config = None
6072 tmp_keyring = None
6073
6074 (config, keyring) = get_config_and_keyring(ctx)
6075
6076 if config:
6077 # tmp config file
6078 tmp_config = write_tmp(config, uid, gid)
6079 mounts[tmp_config.name] = '/etc/ceph/ceph.conf:z'
6080
6081 if keyring:
6082 # tmp keyring file
6083 tmp_keyring = write_tmp(keyring, uid, gid)
6084 mounts[tmp_keyring.name] = '/var/lib/ceph/bootstrap-osd/ceph.keyring:z'
6085
6086 c = get_ceph_volume_container(
6087 ctx,
6088 envs=ctx.env,
6089 args=ctx.command,
6090 volume_mounts=mounts,
6091 )
6092
6093 out, err, code = call_throws(ctx, c.run_cmd(), verbosity=CallVerbosity.QUIET_UNLESS_ERROR)
6094 if not code:
6095 print(out)
6096
6097 ##################################
6098
6099
6100 @infer_fsid
6101 def command_unit(ctx):
6102 # type: (CephadmContext) -> int
6103 if not ctx.fsid:
6104 raise Error('must pass --fsid to specify cluster')
6105
6106 unit_name = get_unit_name_by_daemon_name(ctx, ctx.fsid, ctx.name)
6107
6108 _, _, code = call(
6109 ctx,
6110 ['systemctl', ctx.command, unit_name],
6111 verbosity=CallVerbosity.VERBOSE,
6112 desc=''
6113 )
6114 return code
6115
6116 ##################################
6117
6118
6119 @infer_fsid
6120 def command_logs(ctx):
6121 # type: (CephadmContext) -> None
6122 if not ctx.fsid:
6123 raise Error('must pass --fsid to specify cluster')
6124
6125 unit_name = get_unit_name_by_daemon_name(ctx, ctx.fsid, ctx.name)
6126
6127 cmd = [find_program('journalctl')]
6128 cmd.extend(['-u', unit_name])
6129 if ctx.command:
6130 cmd.extend(ctx.command)
6131
6132 # call this directly, without our wrapper, so that we get an unmolested
6133 # stdout without our logger prefixing each line.
6134 logger.debug('Running command: %s' % ' '.join(cmd))
6135 subprocess.call(cmd, env=os.environ.copy()) # type: ignore
6136
6137 ##################################
6138
6139
6140 def list_networks(ctx):
6141 # type: (CephadmContext) -> Dict[str,Dict[str, Set[str]]]
6142
6143 # sadly, 18.04's iproute2 4.15.0-2ubun doesn't support the -j flag,
6144 # so we'll need to use a regex to parse 'ip' command output.
6145 #
6146 # out, _, _ = call_throws(['ip', '-j', 'route', 'ls'])
6147 # j = json.loads(out)
6148 # for x in j:
6149 res = _list_ipv4_networks(ctx)
6150 res.update(_list_ipv6_networks(ctx))
6151 return res
6152
6153
6154 def _list_ipv4_networks(ctx: CephadmContext) -> Dict[str, Dict[str, Set[str]]]:
6155 execstr: Optional[str] = find_executable('ip')
6156 if not execstr:
6157 raise FileNotFoundError("unable to find 'ip' command")
6158 out, _, _ = call_throws(ctx, [execstr, 'route', 'ls'], verbosity=CallVerbosity.QUIET_UNLESS_ERROR)
6159 return _parse_ipv4_route(out)
6160
6161
6162 def _parse_ipv4_route(out: str) -> Dict[str, Dict[str, Set[str]]]:
6163 r = {} # type: Dict[str, Dict[str, Set[str]]]
6164 p = re.compile(r'^(\S+) (?:via \S+)? ?dev (\S+) (.*)scope link (.*)src (\S+)')
6165 for line in out.splitlines():
6166 m = p.findall(line)
6167 if not m:
6168 continue
6169 net = m[0][0]
6170 if '/' not in net: # aggregate /32 mask for single host sub-networks
6171 net += '/32'
6172 iface = m[0][1]
6173 ip = m[0][4]
6174 if net not in r:
6175 r[net] = {}
6176 if iface not in r[net]:
6177 r[net][iface] = set()
6178 r[net][iface].add(ip)
6179 return r
6180
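# e.g. a routing entry like (addresses hypothetical)
#   '10.1.2.0/24 dev eth0 proto kernel scope link src 10.1.2.3'
# yields {'10.1.2.0/24': {'eth0': {'10.1.2.3'}}}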
6181
6182 def _list_ipv6_networks(ctx: CephadmContext) -> Dict[str, Dict[str, Set[str]]]:
6183 execstr: Optional[str] = find_executable('ip')
6184 if not execstr:
6185 raise FileNotFoundError("unable to find 'ip' command")
6186 routes, _, _ = call_throws(ctx, [execstr, '-6', 'route', 'ls'], verbosity=CallVerbosity.QUIET_UNLESS_ERROR)
6187 ips, _, _ = call_throws(ctx, [execstr, '-6', 'addr', 'ls'], verbosity=CallVerbosity.QUIET_UNLESS_ERROR)
6188 return _parse_ipv6_route(routes, ips)
6189
6190
6191 def _parse_ipv6_route(routes: str, ips: str) -> Dict[str, Dict[str, Set[str]]]:
6192 r = {} # type: Dict[str, Dict[str, Set[str]]]
6193 route_p = re.compile(r'^(\S+) dev (\S+) proto (\S+) metric (\S+) .*pref (\S+)$')
6194 ip_p = re.compile(r'^\s+inet6 (\S+)/(.*)scope (.*)$')
6195 iface_p = re.compile(r'^(\d+): (\S+): (.*)$')
6196 for line in routes.splitlines():
6197 m = route_p.findall(line)
6198 if not m or m[0][0].lower() == 'default':
6199 continue
6200 net = m[0][0]
6201 if '/' not in net: # aggregate /128 mask for single host sub-networks
6202 net += '/128'
6203 iface = m[0][1]
6204 if iface == 'lo': # skip loopback devices
6205 continue
6206 if net not in r:
6207 r[net] = {}
6208 if iface not in r[net]:
6209 r[net][iface] = set()
6210
6211 iface = None
6212 for line in ips.splitlines():
6213 m = ip_p.findall(line)
6214 if not m:
6215 m = iface_p.findall(line)
6216 if m:
6217 # drop @... suffix, if present
6218 iface = m[0][1].split('@')[0]
6219 continue
6220 ip = m[0][0]
6221 # find the network it belongs to
6222 net = [n for n in r.keys()
6223 if ipaddress.ip_address(ip) in ipaddress.ip_network(n)]
6224 if net and iface in r[net[0]]:
6225 assert iface
6226 r[net[0]][iface].add(ip)
6227
6228 return r
6229
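# e.g. a route 'fe80::/64 dev eth0 proto kernel metric 256 pref medium'
# registers net fe80::/64 on eth0, and a matching 'inet6 fe80::1/64 ...' line
# from 'ip -6 addr ls' then adds fe80::1 to that interface's set
# (addresses hypothetical)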
6230
6231 def command_list_networks(ctx):
6232 # type: (CephadmContext) -> None
6233 r = list_networks(ctx)
6234
6235 def serialize_sets(obj: Any) -> Any:
6236 return list(obj) if isinstance(obj, set) else obj
6237
6238 print(json.dumps(r, indent=4, default=serialize_sets))
6239
6240 ##################################
6241
6242
6243 def command_ls(ctx):
6244 # type: (CephadmContext) -> None
6245 ls = list_daemons(ctx, detail=not ctx.no_detail,
6246 legacy_dir=ctx.legacy_dir)
6247 print(json.dumps(ls, indent=4))
6248
6249
6250 def with_units_to_int(v: str) -> int:
6251 if v.endswith('iB'):
6252 v = v[:-2]
6253 elif v.endswith('B'):
6254 v = v[:-1]
6255 mult = 1
6256 if v[-1].upper() == 'K':
6257 mult = 1024
6258 v = v[:-1]
6259 elif v[-1].upper() == 'M':
6260 mult = 1024 * 1024
6261 v = v[:-1]
6262 elif v[-1].upper() == 'G':
6263 mult = 1024 * 1024 * 1024
6264 v = v[:-1]
6265 elif v[-1].upper() == 'T':
6266 mult = 1024 * 1024 * 1024 * 1024
6267 v = v[:-1]
6268 return int(float(v) * mult)
6269
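# e.g. '100KiB' -> 102400, '1.5GiB' -> 1610612736, '512B' -> 512; the suffix
# handling matches the human-readable sizes 'podman/docker stats' prints in
# its MemUsage column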
6270
6271 def list_daemons(ctx, detail=True, legacy_dir=None):
6272 # type: (CephadmContext, bool, Optional[str]) -> List[Dict[str, str]]
6273 host_version: Optional[str] = None
6274 ls = []
6275 container_path = ctx.container_engine.path
6276
6277 data_dir = ctx.data_dir
6278 if legacy_dir is not None:
6279 data_dir = os.path.abspath(legacy_dir + data_dir)
6280
6281 # keep track of ceph versions we see
6282 seen_versions = {} # type: Dict[str, Optional[str]]
6283
6284 # keep track of image digests
6285 seen_digests = {} # type: Dict[str, List[str]]
6286
6287 # keep track of memory and cpu usage we've seen
6288 seen_memusage = {} # type: Dict[str, int]
6289 seen_cpuperc = {} # type: Dict[str, str]
6290 out, err, code = call(
6291 ctx,
6292 [container_path, 'stats', '--format', '{{.ID}},{{.MemUsage}}', '--no-stream'],
6293 verbosity=CallVerbosity.QUIET
6294 )
6295 seen_memusage_cid_len, seen_memusage = _parse_mem_usage(code, out)
6296
6297 out, err, code = call(
6298 ctx,
6299 [container_path, 'stats', '--format', '{{.ID}},{{.CPUPerc}}', '--no-stream'],
6300 verbosity=CallVerbosity.QUIET
6301 )
6302 seen_cpuperc_cid_len, seen_cpuperc = _parse_cpu_perc(code, out)
6303
6304 # /var/lib/ceph
6305 if os.path.exists(data_dir):
6306 for i in os.listdir(data_dir):
6307 if i in ['mon', 'osd', 'mds', 'mgr']:
6308 daemon_type = i
6309 for j in os.listdir(os.path.join(data_dir, i)):
6310 if '-' not in j:
6311 continue
6312 (cluster, daemon_id) = j.split('-', 1)
6313 fsid = get_legacy_daemon_fsid(ctx,
6314 cluster, daemon_type, daemon_id,
6315 legacy_dir=legacy_dir)
6316 legacy_unit_name = 'ceph-%s@%s' % (daemon_type, daemon_id)
6317 val: Dict[str, Any] = {
6318 'style': 'legacy',
6319 'name': '%s.%s' % (daemon_type, daemon_id),
6320 'fsid': fsid if fsid is not None else 'unknown',
6321 'systemd_unit': legacy_unit_name,
6322 }
6323 if detail:
6324 (val['enabled'], val['state'], _) = check_unit(ctx, legacy_unit_name)
6325 if not host_version:
6326 try:
6327 out, err, code = call(ctx,
6328 ['ceph', '-v'],
6329 verbosity=CallVerbosity.QUIET)
6330 if not code and out.startswith('ceph version '):
6331 host_version = out.split(' ')[2]
6332 except Exception:
6333 pass
6334 val['host_version'] = host_version
6335 ls.append(val)
6336 elif is_fsid(i):
6337 fsid = str(i) # convince mypy that fsid is a str here
6338 for j in os.listdir(os.path.join(data_dir, i)):
6339 if '.' in j and os.path.isdir(os.path.join(data_dir, fsid, j)):
6340 name = j
6341 (daemon_type, daemon_id) = j.split('.', 1)
6342 unit_name = get_unit_name(fsid,
6343 daemon_type,
6344 daemon_id)
6345 else:
6346 continue
6347 val = {
6348 'style': 'cephadm:v1',
6349 'name': name,
6350 'fsid': fsid,
6351 'systemd_unit': unit_name,
6352 }
6353 if detail:
6354 # get container id
6355 (val['enabled'], val['state'], _) = check_unit(ctx, unit_name)
6356 container_id = None
6357 image_name = None
6358 image_id = None
6359 image_digests = None
6360 version = None
6361 start_stamp = None
6362
6363 out, err, code = get_container_stats(ctx, container_path, fsid, daemon_type, daemon_id)
6364 if not code:
6365 (container_id, image_name, image_id, start,
6366 version) = out.strip().split(',')
6367 image_id = normalize_container_id(image_id)
6368 daemon_type = name.split('.', 1)[0]
6369 start_stamp = try_convert_datetime(start)
6370
6371 # collect digests for this image id
6372 image_digests = seen_digests.get(image_id)
6373 if not image_digests:
6374 out, err, code = call(
6375 ctx,
6376 [
6377 container_path, 'image', 'inspect', image_id,
6378 '--format', '{{.RepoDigests}}',
6379 ],
6380 verbosity=CallVerbosity.QUIET)
6381 if not code:
6382 image_digests = list(set(map(
6383 normalize_image_digest,
6384 out.strip()[1:-1].split(' '))))
6385 seen_digests[image_id] = image_digests
6386
6387 # identify software version inside the container (if we can)
6388 if not version or '.' not in version:
6389 version = seen_versions.get(image_id, None)
6390 if daemon_type == NFSGanesha.daemon_type:
6391 version = NFSGanesha.get_version(ctx, container_id)
6392 if daemon_type == CephIscsi.daemon_type:
6393 version = CephIscsi.get_version(ctx, container_id)
6394 elif not version:
6395 if daemon_type in Ceph.daemons:
6396 out, err, code = call(ctx,
6397 [container_path, 'exec', container_id,
6398 'ceph', '-v'],
6399 verbosity=CallVerbosity.QUIET)
6400 if not code and \
6401 out.startswith('ceph version '):
6402 version = out.split(' ')[2]
6403 seen_versions[image_id] = version
6404 elif daemon_type == 'grafana':
6405 out, err, code = call(ctx,
6406 [container_path, 'exec', container_id,
6407 'grafana-server', '-v'],
6408 verbosity=CallVerbosity.QUIET)
6409 if not code and \
6410 out.startswith('Version '):
6411 version = out.split(' ')[1]
6412 seen_versions[image_id] = version
6413 elif daemon_type in ['prometheus',
6414 'alertmanager',
6415 'node-exporter',
6416 'loki',
6417 'promtail']:
6418 version = Monitoring.get_version(ctx, container_id, daemon_type)
6419 seen_versions[image_id] = version
6420 elif daemon_type == 'haproxy':
6421 out, err, code = call(ctx,
6422 [container_path, 'exec', container_id,
6423 'haproxy', '-v'],
6424 verbosity=CallVerbosity.QUIET)
6425 if not code and \
6426 out.startswith('HA-Proxy version '):
6427 version = out.split(' ')[2]
6428 seen_versions[image_id] = version
6429 elif daemon_type == 'keepalived':
6430 out, err, code = call(ctx,
6431 [container_path, 'exec', container_id,
6432 'keepalived', '--version'],
6433 verbosity=CallVerbosity.QUIET)
6434 if not code and \
6435 err.startswith('Keepalived '):
6436 version = err.split(' ')[1]
6437 if version[0] == 'v':
6438 version = version[1:]
6439 seen_versions[image_id] = version
6440 elif daemon_type == CustomContainer.daemon_type:
6441 # Because a custom container can contain
6442 # everything, we do not know which command
6443 # to execute to get the version.
6444 pass
6445 elif daemon_type == SNMPGateway.daemon_type:
6446 version = SNMPGateway.get_version(ctx, fsid, daemon_id)
6447 seen_versions[image_id] = version
6448 else:
6449 logger.warning('version for unknown daemon type %s' % daemon_type)
6450 else:
6451 vfile = os.path.join(data_dir, fsid, j, 'unit.image') # type: ignore
6452 try:
6453 with open(vfile, 'r') as f:
6454 image_name = f.read().strip() or None
6455 except IOError:
6456 pass
6457
6458 # unit.meta?
6459 mfile = os.path.join(data_dir, fsid, j, 'unit.meta') # type: ignore
6460 try:
6461 with open(mfile, 'r') as f:
6462 meta = json.loads(f.read())
6463 val.update(meta)
6464 except IOError:
6465 pass
6466
6467 val['container_id'] = container_id
6468 val['container_image_name'] = image_name
6469 val['container_image_id'] = image_id
6470 val['container_image_digests'] = image_digests
6471 if container_id:
6472 val['memory_usage'] = seen_memusage.get(container_id[0:seen_memusage_cid_len])
6473 val['cpu_percentage'] = seen_cpuperc.get(container_id[0:seen_cpuperc_cid_len])
6474 val['version'] = version
6475 val['started'] = start_stamp
6476 val['created'] = get_file_timestamp(
6477 os.path.join(data_dir, fsid, j, 'unit.created')
6478 )
6479 val['deployed'] = get_file_timestamp(
6480 os.path.join(data_dir, fsid, j, 'unit.image'))
6481 val['configured'] = get_file_timestamp(
6482 os.path.join(data_dir, fsid, j, 'unit.configured'))
6483 ls.append(val)
6484
6485 return ls
6486
6487
6488 def _parse_mem_usage(code: int, out: str) -> Tuple[int, Dict[str, int]]:
6489 # keep track of memory usage we've seen
6490 seen_memusage = {} # type: Dict[str, int]
6491 seen_memusage_cid_len = 0
6492 if not code:
6493 for line in out.splitlines():
6494 (cid, usage) = line.split(',')
6495 (used, limit) = usage.split(' / ')
6496 try:
6497 seen_memusage[cid] = with_units_to_int(used)
6498 if not seen_memusage_cid_len:
6499 seen_memusage_cid_len = len(cid)
6500 except ValueError:
6501 logger.info('unable to parse memory usage line\n>{}'.format(line))
6503 return seen_memusage_cid_len, seen_memusage
6504
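# e.g. a stats line such as 'f2d8d32cd2a1,7.09MiB / 7.67GiB' (id and sizes
# hypothetical) records seen_memusage['f2d8d32cd2a1'] = 7434403 and fixes the
# container-id prefix length used for later lookups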
6505
6506 def _parse_cpu_perc(code: int, out: str) -> Tuple[int, Dict[str, str]]:
6507 seen_cpuperc = {}
6508 seen_cpuperc_cid_len = 0
6509 if not code:
6510 for line in out.splitlines():
6511 (cid, cpuperc) = line.split(',')
6512 try:
6513 seen_cpuperc[cid] = cpuperc
6514 if not seen_cpuperc_cid_len:
6515 seen_cpuperc_cid_len = len(cid)
6516 except ValueError:
6517 logger.info('unable to parse cpu percentage line\n>{}'.format(line))
6519 return seen_cpuperc_cid_len, seen_cpuperc
6520
6521
6522 def get_daemon_description(ctx, fsid, name, detail=False, legacy_dir=None):
6523 # type: (CephadmContext, str, str, bool, Optional[str]) -> Dict[str, str]
6524
6525 for d in list_daemons(ctx, detail=detail, legacy_dir=legacy_dir):
6526 if d['fsid'] != fsid:
6527 continue
6528 if d['name'] != name:
6529 continue
6530 return d
6531 raise Error('Daemon not found: {}. See `cephadm ls`'.format(name))
6532
6533
6534 def get_container_stats(ctx: CephadmContext, container_path: str, fsid: str, daemon_type: str, daemon_id: str) -> Tuple[str, str, int]:
6535 c = CephContainer.for_daemon(ctx, fsid, daemon_type, daemon_id, 'bash')
6536 out, err, code = '', '', -1
6537 for name in (c.cname, c.old_cname):
6538 cmd = [
6539 container_path, 'inspect',
6540 '--format', '{{.Id}},{{.Config.Image}},{{.Image}},{{.Created}},{{index .Config.Labels "io.ceph.version"}}',
6541 name
6542 ]
6543 out, err, code = call(ctx, cmd, verbosity=CallVerbosity.QUIET)
6544 if not code:
6545 break
6546 return out, err, code
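
# the comma-separated inspect output above is consumed by list_daemons(),
# which splits it into (container_id, image_name, image_id, start_time,
# version label) in that order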
6547
6548 ##################################
6549
6550
6551 @default_image
6552 def command_adopt(ctx):
6553 # type: (CephadmContext) -> None
6554
6555 if not ctx.skip_pull:
6556 try:
6557 _pull_image(ctx, ctx.image)
6558 except UnauthorizedRegistryError:
6559 err_str = 'Failed to pull container image. Host may not be logged into container registry. Try `cephadm registry-login --registry-url <url> --registry-username <username> --registry-password <password>` or supply login info via a json file with `cephadm registry-login --registry-json <file>`'
6560 logger.debug(f'Pulling image for `command_adopt` failed: {err_str}')
6561 raise Error(err_str)
6562
6563 (daemon_type, daemon_id) = ctx.name.split('.', 1)
6564
6565 # legacy check
6566 if ctx.style != 'legacy':
6567 raise Error('adoption of style %s not implemented' % ctx.style)
6568
6569 # lock
6570 fsid = get_legacy_daemon_fsid(ctx,
6571 ctx.cluster,
6572 daemon_type,
6573 daemon_id,
6574 legacy_dir=ctx.legacy_dir)
6575 if not fsid:
6576 raise Error('could not detect legacy fsid; set fsid in ceph.conf')
6577 lock = FileLock(ctx, fsid)
6578 lock.acquire()
6579
6580 # call correct adoption
6581 if daemon_type in Ceph.daemons:
6582 command_adopt_ceph(ctx, daemon_type, daemon_id, fsid)
6583 elif daemon_type == 'prometheus':
6584 command_adopt_prometheus(ctx, daemon_id, fsid)
6585 elif daemon_type == 'grafana':
6586 command_adopt_grafana(ctx, daemon_id, fsid)
6587 elif daemon_type == 'node-exporter':
6588 raise Error('adoption of node-exporter not implemented')
6589 elif daemon_type == 'alertmanager':
6590 command_adopt_alertmanager(ctx, daemon_id, fsid)
6591 else:
6592 raise Error('daemon type %s not recognized' % daemon_type)
6593
6594
6595 class AdoptOsd(object):
6596 def __init__(self, ctx, osd_data_dir, osd_id):
6597 # type: (CephadmContext, str, str) -> None
6598 self.ctx = ctx
6599 self.osd_data_dir = osd_data_dir
6600 self.osd_id = osd_id
6601
6602 def check_online_osd(self):
6603 # type: () -> Tuple[Optional[str], Optional[str]]
6604
6605 osd_fsid, osd_type = None, None
6606
6607 path = os.path.join(self.osd_data_dir, 'fsid')
6608 try:
6609 with open(path, 'r') as f:
6610 osd_fsid = f.read().strip()
6611 logger.info('Found online OSD at %s' % path)
6612 except IOError:
6613 logger.info('Unable to read OSD fsid from %s' % path)
6614 if os.path.exists(os.path.join(self.osd_data_dir, 'type')):
6615 with open(os.path.join(self.osd_data_dir, 'type')) as f:
6616 osd_type = f.read().strip()
6617 else:
6618 logger.info('"type" file missing for OSD data dir')
6619
6620 return osd_fsid, osd_type
6621
6622 def check_offline_lvm_osd(self):
6623 # type: () -> Tuple[Optional[str], Optional[str]]
6624 osd_fsid, osd_type = None, None
6625
6626 c = get_ceph_volume_container(
6627 self.ctx,
6628 args=['lvm', 'list', '--format=json'],
6629 )
6630 out, err, code = call_throws(self.ctx, c.run_cmd())
6631 if not code:
6632 try:
6633 js = json.loads(out)
6634 if self.osd_id in js:
6635 logger.info('Found offline LVM OSD {}'.format(self.osd_id))
6636 osd_fsid = js[self.osd_id][0]['tags']['ceph.osd_fsid']
6637 for device in js[self.osd_id]:
6638 if device['tags']['ceph.type'] == 'block':
6639 osd_type = 'bluestore'
6640 break
6641 if device['tags']['ceph.type'] == 'data':
6642 osd_type = 'filestore'
6643 break
6644 except ValueError as e:
6645 logger.info('Invalid JSON in ceph-volume lvm list: {}'.format(e))
6646
6647 return osd_fsid, osd_type
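# Illustrative sketch of the `ceph-volume lvm list --format=json` output the
# parser above assumes (abbreviated; values are made up):
#
#     {
#         "1": [
#             {"tags": {"ceph.osd_fsid": "abcd...", "ceph.type": "block"}, ...}
#         ]
#     }
#
# i.e. a map of osd_id -> list of devices, each carrying ceph.* tags.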
6648
6649 def check_offline_simple_osd(self):
6650 # type: () -> Tuple[Optional[str], Optional[str]]
6651 osd_fsid, osd_type = None, None
6652
6653 osd_file = glob('/etc/ceph/osd/{}-[a-f0-9-]*.json'.format(self.osd_id))
6654 if len(osd_file) == 1:
6655 with open(osd_file[0], 'r') as f:
6656 try:
6657 js = json.loads(f.read())
6658 logger.info('Found offline simple OSD {}'.format(self.osd_id))
6659 osd_fsid = js['fsid']
6660 osd_type = js['type']
6661 if osd_type != 'filestore':
6662 # need this to be mounted for the adopt to work, as it
6663 # needs to move files from this directory
6664 call_throws(self.ctx, ['mount', js['data']['path'], self.osd_data_dir])
6665 except ValueError as e:
6666 logger.info('Invalid JSON in {}: {}'.format(osd_file[0], e))
6667
6668 return osd_fsid, osd_type
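# Illustrative shape of a ceph-volume 'simple' scan file, e.g.
# /etc/ceph/osd/1-<uuid>.json, matching the fields read above (abbreviated):
#
#     {"fsid": "...", "type": "bluestore", "data": {"path": "/dev/sdb1"}, ...}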
6669
6670 def change_cluster_name(self) -> None:
6671 logger.info('Attempting to convert osd cluster name to ceph...')
6672 c = get_ceph_volume_container(
6673 self.ctx,
6674 args=['lvm', 'list', '{}'.format(self.osd_id), '--format=json'],
6675 )
6676 out, err, code = call_throws(self.ctx, c.run_cmd())
6677 if code:
6678 raise Exception(f'Failed to get list of LVs: {err}\nceph-volume failed with rc {code}')
6679 try:
6680 js = json.loads(out)
6681 if not js:
6682 raise RuntimeError(f'Failed to find osd.{self.osd_id}')
6683 device: Optional[Dict[Any, Any]] = None
6684 for d in js[self.osd_id]:
6685 if d['type'] == 'block':
6686 device = d
6687 break
6688 if not device:
6689 raise RuntimeError(f'Failed to find block device for osd.{self.osd_id}')
6690 vg = device['vg_name']
6691 out, err, code = call_throws(self.ctx, ['lvchange', '--deltag', f'ceph.cluster_name={self.ctx.cluster}', vg])
6692 if code:
6693 raise RuntimeError(f"Can't delete tag ceph.cluster_name={self.ctx.cluster} on osd.{self.osd_id}.\nlvchange failed with rc {code}")
6694 out, err, code = call_throws(self.ctx, ['lvchange', '--addtag', 'ceph.cluster_name=ceph', vg])
6695 if code:
6696 raise RuntimeError(f"Can't add tag ceph.cluster_name=ceph on osd.{self.osd_id}.\nlvchange failed with rc {code}")
6697 logger.info('Successfully converted osd cluster name')
6698 except Exception as e:
6699 logger.info(f'Failed to convert osd cluster name: {e}')
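# The retagging above is equivalent to running, for the OSD's VG
# (hypothetical name shown):
#
#     lvchange --deltag ceph.cluster_name=<old-cluster> <vg_name>
#     lvchange --addtag ceph.cluster_name=ceph <vg_name>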
6700
6701
6702 def command_adopt_ceph(ctx, daemon_type, daemon_id, fsid):
6703 # type: (CephadmContext, str, str, str) -> None
6704
6705 (uid, gid) = extract_uid_gid(ctx)
6706
6707 data_dir_src = ('/var/lib/ceph/%s/%s-%s' %
6708 (daemon_type, ctx.cluster, daemon_id))
6709 data_dir_src = os.path.abspath(ctx.legacy_dir + data_dir_src)
6710
6711 if not os.path.exists(data_dir_src):
6712 raise Error("{}.{} data directory '{}' does not exist. "
6713 'Incorrect ID specified, or daemon already adopted?'.format(
6714 daemon_type, daemon_id, data_dir_src))
6715
6716 osd_fsid = None
6717 if daemon_type == 'osd':
6718 adopt_osd = AdoptOsd(ctx, data_dir_src, daemon_id)
6719 osd_fsid, osd_type = adopt_osd.check_online_osd()
6720 if not osd_fsid:
6721 osd_fsid, osd_type = adopt_osd.check_offline_lvm_osd()
6722 if not osd_fsid:
6723 osd_fsid, osd_type = adopt_osd.check_offline_simple_osd()
6724 if not osd_fsid:
6725 raise Error('Unable to find OSD {}'.format(daemon_id))
6726 elif ctx.cluster != 'ceph':
6727 adopt_osd.change_cluster_name()
6728 logger.info('objectstore_type is %s' % osd_type)
6729 assert osd_type
6730 if osd_type == 'filestore':
6731 raise Error('FileStore is not supported by cephadm')
6732
6733 # NOTE: implicit assumption here that the units correspond to the
6734 # cluster we are adopting based on the /etc/{defaults,sysconfig}/ceph
6735 # CLUSTER field.
6736 unit_name = 'ceph-%s@%s' % (daemon_type, daemon_id)
6737 (enabled, state, _) = check_unit(ctx, unit_name)
6738 if state == 'running':
6739 logger.info('Stopping old systemd unit %s...' % unit_name)
6740 call_throws(ctx, ['systemctl', 'stop', unit_name])
6741 if enabled:
6742 logger.info('Disabling old systemd unit %s...' % unit_name)
6743 call_throws(ctx, ['systemctl', 'disable', unit_name])
6744
6745 # data
6746 logger.info('Moving data...')
6747 data_dir_dst = make_data_dir(ctx, fsid, daemon_type, daemon_id,
6748 uid=uid, gid=gid)
6749 move_files(ctx, glob(os.path.join(data_dir_src, '*')),
6750 data_dir_dst,
6751 uid=uid, gid=gid)
6752 logger.debug('Remove dir `%s`' % (data_dir_src))
6753 if os.path.ismount(data_dir_src):
6754 call_throws(ctx, ['umount', data_dir_src])
6755 os.rmdir(data_dir_src)
6756
6757 logger.info('Chowning content...')
6758 call_throws(ctx, ['chown', '-c', '-R', '%d.%d' % (uid, gid), data_dir_dst])
6759
6760 if daemon_type == 'mon':
6761 # rename *.ldb -> *.sst, in case they are coming from ubuntu
6762 store = os.path.join(data_dir_dst, 'store.db')
6763 num_renamed = 0
6764 if os.path.exists(store):
6765 for oldf in os.listdir(store):
6766 if oldf.endswith('.ldb'):
6767 newf = oldf.replace('.ldb', '.sst')
6768 oldp = os.path.join(store, oldf)
6769 newp = os.path.join(store, newf)
6770 logger.debug('Renaming %s -> %s' % (oldp, newp))
6771 os.rename(oldp, newp)
     num_renamed += 1
6772 if num_renamed:
6773 logger.info('Renamed %d leveldb *.ldb files to *.sst',
6774 num_renamed)
6775 if daemon_type == 'osd':
6776 for n in ['block', 'block.db', 'block.wal']:
6777 p = os.path.join(data_dir_dst, n)
6778 if os.path.exists(p):
6779 logger.info('Chowning %s...' % p)
6780 os.chown(p, uid, gid)
6781 # disable the ceph-volume 'simple' mode files on the host
6782 simple_fn = os.path.join('/etc/ceph/osd',
6783 '%s-%s.json' % (daemon_id, osd_fsid))
6784 if os.path.exists(simple_fn):
6785 new_fn = simple_fn + '.adopted-by-cephadm'
6786 logger.info('Renaming %s -> %s', simple_fn, new_fn)
6787 os.rename(simple_fn, new_fn)
6788 logger.info('Disabling host unit ceph-volume@ simple unit...')
6789 call(ctx, ['systemctl', 'disable',
6790 'ceph-volume@simple-%s-%s.service' % (daemon_id, osd_fsid)])
6791 else:
6792 # assume this is an 'lvm' c-v for now, but don't error
6793 # out if it's not.
6794 logger.info('Disabling host unit ceph-volume@ lvm unit...')
6795 call(ctx, ['systemctl', 'disable',
6796 'ceph-volume@lvm-%s-%s.service' % (daemon_id, osd_fsid)])
6797
6798 # config
6799 config_src = '/etc/ceph/%s.conf' % (ctx.cluster)
6800 config_src = os.path.abspath(ctx.legacy_dir + config_src)
6801 config_dst = os.path.join(data_dir_dst, 'config')
6802 copy_files(ctx, [config_src], config_dst, uid=uid, gid=gid)
6803
6804 # logs
6805 logger.info('Moving logs...')
6806 log_dir_src = ('/var/log/ceph/%s-%s.%s.log*' %
6807 (ctx.cluster, daemon_type, daemon_id))
6808 log_dir_src = os.path.abspath(ctx.legacy_dir + log_dir_src)
6809 log_dir_dst = make_log_dir(ctx, fsid, uid=uid, gid=gid)
6810 move_files(ctx, glob(log_dir_src),
6811 log_dir_dst,
6812 uid=uid, gid=gid)
6813
6814 logger.info('Creating new units...')
6815 make_var_run(ctx, fsid, uid, gid)
6816 c = get_container(ctx, fsid, daemon_type, daemon_id)
6817 deploy_daemon_units(ctx, fsid, uid, gid, daemon_type, daemon_id, c,
6818 enable=True, # unconditionally enable the new unit
6819 start=(state == 'running' or ctx.force_start),
6820 osd_fsid=osd_fsid)
6821 update_firewalld(ctx, daemon_type)
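# Example invocation (illustrative): adopting the legacy unit ceph-osd@12
# into a cephadm-managed cluster:
#
#     cephadm adopt --style legacy --name osd.12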
6822
6823
6824 def command_adopt_prometheus(ctx, daemon_id, fsid):
6825 # type: (CephadmContext, str, str) -> None
6826 daemon_type = 'prometheus'
6827 (uid, gid) = extract_uid_gid_monitoring(ctx, daemon_type)
6828
6829 _stop_and_disable(ctx, 'prometheus')
6830
6831 data_dir_dst = make_data_dir(ctx, fsid, daemon_type, daemon_id,
6832 uid=uid, gid=gid)
6833
6834 # config
6835 config_src = '/etc/prometheus/prometheus.yml'
6836 config_src = os.path.abspath(ctx.legacy_dir + config_src)
6837 config_dst = os.path.join(data_dir_dst, 'etc/prometheus')
6838 makedirs(config_dst, uid, gid, 0o755)
6839 copy_files(ctx, [config_src], config_dst, uid=uid, gid=gid)
6840
6841 # data
6842 data_src = '/var/lib/prometheus/metrics/'
6843 data_src = os.path.abspath(ctx.legacy_dir + data_src)
6844 data_dst = os.path.join(data_dir_dst, 'data')
6845 copy_tree(ctx, [data_src], data_dst, uid=uid, gid=gid)
6846
6847 make_var_run(ctx, fsid, uid, gid)
6848 c = get_container(ctx, fsid, daemon_type, daemon_id)
6849 deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid)
6850 update_firewalld(ctx, daemon_type)
6851
6852
6853 def command_adopt_grafana(ctx, daemon_id, fsid):
6854 # type: (CephadmContext, str, str) -> None
6855
6856 daemon_type = 'grafana'
6857 (uid, gid) = extract_uid_gid_monitoring(ctx, daemon_type)
6858
6859 _stop_and_disable(ctx, 'grafana-server')
6860
6861 data_dir_dst = make_data_dir(ctx, fsid, daemon_type, daemon_id,
6862 uid=uid, gid=gid)
6863
6864 # config
6865 config_src = '/etc/grafana/grafana.ini'
6866 config_src = os.path.abspath(ctx.legacy_dir + config_src)
6867 config_dst = os.path.join(data_dir_dst, 'etc/grafana')
6868 makedirs(config_dst, uid, gid, 0o755)
6869 copy_files(ctx, [config_src], config_dst, uid=uid, gid=gid)
6870
6871 prov_src = '/etc/grafana/provisioning/'
6872 prov_src = os.path.abspath(ctx.legacy_dir + prov_src)
6873 prov_dst = os.path.join(data_dir_dst, 'etc/grafana')
6874 copy_tree(ctx, [prov_src], prov_dst, uid=uid, gid=gid)
6875
6876 # cert
6877 cert = '/etc/grafana/grafana.crt'
6878 key = '/etc/grafana/grafana.key'
6879 if os.path.exists(cert) and os.path.exists(key):
6880 cert_src = '/etc/grafana/grafana.crt'
6881 cert_src = os.path.abspath(ctx.legacy_dir + cert_src)
6882 makedirs(os.path.join(data_dir_dst, 'etc/grafana/certs'), uid, gid, 0o755)
6883 cert_dst = os.path.join(data_dir_dst, 'etc/grafana/certs/cert_file')
6884 copy_files(ctx, [cert_src], cert_dst, uid=uid, gid=gid)
6885
6886 key_src = '/etc/grafana/grafana.key'
6887 key_src = os.path.abspath(ctx.legacy_dir + key_src)
6888 key_dst = os.path.join(data_dir_dst, 'etc/grafana/certs/cert_key')
6889 copy_files(ctx, [key_src], key_dst, uid=uid, gid=gid)
6890
6891 _adjust_grafana_ini(os.path.join(config_dst, 'grafana.ini'))
6892 else:
6893 logger.debug('Skipping ssl, missing cert {} or key {}'.format(cert, key))
6894
6895 # data - possible custom dashboards/plugins
6896 data_src = '/var/lib/grafana/'
6897 data_src = os.path.abspath(ctx.legacy_dir + data_src)
6898 data_dst = os.path.join(data_dir_dst, 'data')
6899 copy_tree(ctx, [data_src], data_dst, uid=uid, gid=gid)
6900
6901 make_var_run(ctx, fsid, uid, gid)
6902 c = get_container(ctx, fsid, daemon_type, daemon_id)
6903 deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid)
6904 update_firewalld(ctx, daemon_type)
6905
6906
6907 def command_adopt_alertmanager(ctx, daemon_id, fsid):
6908 # type: (CephadmContext, str, str) -> None
6909
6910 daemon_type = 'alertmanager'
6911 (uid, gid) = extract_uid_gid_monitoring(ctx, daemon_type)
6912
6913 _stop_and_disable(ctx, 'prometheus-alertmanager')
6914
6915 data_dir_dst = make_data_dir(ctx, fsid, daemon_type, daemon_id,
6916 uid=uid, gid=gid)
6917
6918 # config
6919 config_src = '/etc/prometheus/alertmanager.yml'
6920 config_src = os.path.abspath(ctx.legacy_dir + config_src)
6921 config_dst = os.path.join(data_dir_dst, 'etc/alertmanager')
6922 makedirs(config_dst, uid, gid, 0o755)
6923 copy_files(ctx, [config_src], config_dst, uid=uid, gid=gid)
6924
6925 # data
6926 data_src = '/var/lib/prometheus/alertmanager/'
6927 data_src = os.path.abspath(ctx.legacy_dir + data_src)
6928 data_dst = os.path.join(data_dir_dst, 'etc/alertmanager/data')
6929 copy_tree(ctx, [data_src], data_dst, uid=uid, gid=gid)
6930
6931 make_var_run(ctx, fsid, uid, gid)
6932 c = get_container(ctx, fsid, daemon_type, daemon_id)
6933 deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid)
6934 update_firewalld(ctx, daemon_type)
6935
6936
6937 def _adjust_grafana_ini(filename):
6938 # type: (str) -> None
6939
6940 # Update cert_file, cert_key pathnames in server section
6941 # ConfigParser does not preserve comments
6942 try:
6943 with open(filename, 'r') as grafana_ini:
6944 lines = grafana_ini.readlines()
6945 with open('{}.new'.format(filename), 'w') as grafana_ini:
6946 server_section = False
6947 for line in lines:
6948 if line.startswith('['):
6949 server_section = False
6950 if line.startswith('[server]'):
6951 server_section = True
6952 if server_section:
6953 line = re.sub(r'^cert_file.*',
6954 'cert_file = /etc/grafana/certs/cert_file', line)
6955 line = re.sub(r'^cert_key.*',
6956 'cert_key = /etc/grafana/certs/cert_key', line)
6957 grafana_ini.write(line)
6958 os.rename('{}.new'.format(filename), filename)
6959 except OSError as err:
6960 raise Error('Cannot update {}: {}'.format(filename, err))
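# Illustrative before/after for the [server] section rewrite above:
#
#     cert_file = /etc/grafana/grafana.crt  ->  cert_file = /etc/grafana/certs/cert_file
#     cert_key = /etc/grafana/grafana.key   ->  cert_key = /etc/grafana/certs/cert_key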
6961
6962
6963 def _stop_and_disable(ctx, unit_name):
6964 # type: (CephadmContext, str) -> None
6965
6966 (enabled, state, _) = check_unit(ctx, unit_name)
6967 if state == 'running':
6968 logger.info('Stopping old systemd unit %s...' % unit_name)
6969 call_throws(ctx, ['systemctl', 'stop', unit_name])
6970 if enabled:
6971 logger.info('Disabling old systemd unit %s...' % unit_name)
6972 call_throws(ctx, ['systemctl', 'disable', unit_name])
6973
6974 ##################################
6975
6976
6977 def command_rm_daemon(ctx):
6978 # type: (CephadmContext) -> None
6979 lock = FileLock(ctx, ctx.fsid)
6980 lock.acquire()
6981
6982 (daemon_type, daemon_id) = ctx.name.split('.', 1)
6983 unit_name = get_unit_name_by_daemon_name(ctx, ctx.fsid, ctx.name)
6984
6985 if daemon_type in ['mon', 'osd'] and not ctx.force:
6986 raise Error('must pass --force to proceed: '
6987 'this command may destroy precious data!')
6988
6989 call(ctx, ['systemctl', 'stop', unit_name],
6990 verbosity=CallVerbosity.DEBUG)
6991 call(ctx, ['systemctl', 'reset-failed', unit_name],
6992 verbosity=CallVerbosity.DEBUG)
6993 call(ctx, ['systemctl', 'disable', unit_name],
6994 verbosity=CallVerbosity.DEBUG)
6995 data_dir = get_data_dir(ctx.fsid, ctx.data_dir, daemon_type, daemon_id)
6996 if daemon_type in ['mon', 'osd', 'prometheus'] and \
6997 not ctx.force_delete_data:
6998 # rename it out of the way -- do not delete
6999 backup_dir = os.path.join(ctx.data_dir, ctx.fsid, 'removed')
7000 if not os.path.exists(backup_dir):
7001 makedirs(backup_dir, 0, 0, DATA_DIR_MODE)
7002 dirname = '%s.%s_%s' % (daemon_type, daemon_id,
7003 datetime.datetime.utcnow().strftime(DATEFMT))
7004 os.rename(data_dir,
7005 os.path.join(backup_dir, dirname))
7006 else:
7007 call_throws(ctx, ['rm', '-rf', data_dir])
7008
7009 if 'tcp_ports' in ctx and ctx.tcp_ports is not None:
7010 ports: List[int] = [int(p) for p in ctx.tcp_ports.split()]
7011 try:
7012 fw = Firewalld(ctx)
7013 fw.close_ports(ports)
7014 fw.apply_rules()
7015 except RuntimeError as e:
7016 # in case we cannot close the ports we will remove
7017 # the daemon but keep them open.
7018 logger.warning(f'Error when trying to close ports: {e}')
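# Illustrative: with DATEFMT as defined at the top of this file, a removed
# mon daemon's data dir would be renamed to something like (example values):
#
#     /var/lib/ceph/<fsid>/removed/mon.host1_2022-05-05T12:34:56.789012Z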
7019
7020
7021 ##################################
7022
7023
7024 def _zap(ctx: CephadmContext, what: str) -> None:
7025 mounts = get_container_mounts(ctx, ctx.fsid, 'clusterless-ceph-volume', None)
7026 c = get_ceph_volume_container(ctx,
7027 args=['lvm', 'zap', '--destroy', what],
7028 volume_mounts=mounts,
7029 envs=ctx.env)
7030 logger.info(f'Zapping {what}...')
7031 out, err, code = call_throws(ctx, c.run_cmd())
7032
7033
7034 @infer_image
7035 def _zap_osds(ctx: CephadmContext) -> None:
7036 # assume fsid lock already held
7037
7038 # list
7039 mounts = get_container_mounts(ctx, ctx.fsid, 'clusterless-ceph-volume', None)
7040 c = get_ceph_volume_container(ctx,
7041 args=['inventory', '--format', 'json'],
7042 volume_mounts=mounts,
7043 envs=ctx.env)
7044 out, err, code = call_throws(ctx, c.run_cmd())
7045 if code:
7046 raise Error('failed to list osd inventory')
7047 try:
7048 ls = json.loads(out)
7049 except ValueError as e:
7050 raise Error(f'Invalid JSON in ceph-volume inventory: {e}')
7051
7052 for i in ls:
7053 matches = [lv.get('cluster_fsid') == ctx.fsid and i.get('ceph_device') for lv in i.get('lvs', [])]
7054 if any(matches) and all(matches):
7055 _zap(ctx, i.get('path'))
7056 elif any(matches):
7057 lv_names = [lv['name'] for lv in i.get('lvs', [])]
7058 # TODO: we need to map the lv_names back to device paths (the vg
7059 # id isn't part of the output here!)
7060 logger.warning(f'Not zapping LVs (not implemented): {lv_names}')
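# Semantics of the matches check above, illustrated: a device path is zapped
# only when every LV on it belongs to this cluster's fsid and is a ceph
# device; a partial match is logged and skipped:
#
#     matches == [True, True]   ->  zap the device
#     matches == [True, False]  ->  warn, leave the device alone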
7061
7062
7063 def command_zap_osds(ctx: CephadmContext) -> None:
7064 if not ctx.force:
7065 raise Error('must pass --force to proceed: '
7066 'this command may destroy precious data!')
7067
7068 lock = FileLock(ctx, ctx.fsid)
7069 lock.acquire()
7070
7071 _zap_osds(ctx)
7072
7073 ##################################
7074
7075
7076 def get_ceph_cluster_count(ctx: CephadmContext) -> int:
7077 return len([c for c in os.listdir(ctx.data_dir) if is_fsid(c)])
7078
7079
7080 def command_rm_cluster(ctx):
7081 # type: (CephadmContext) -> None
7082 if not ctx.force:
7083 raise Error('must pass --force to proceed: '
7084 'this command may destroy precious data!')
7085
7086 lock = FileLock(ctx, ctx.fsid)
7087 lock.acquire()
7088
7089 def disable_systemd_service(unit_name: str) -> None:
7090 call(ctx, ['systemctl', 'stop', unit_name],
7091 verbosity=CallVerbosity.DEBUG)
7092 call(ctx, ['systemctl', 'reset-failed', unit_name],
7093 verbosity=CallVerbosity.DEBUG)
7094 call(ctx, ['systemctl', 'disable', unit_name],
7095 verbosity=CallVerbosity.DEBUG)
7096
7097 # stop + disable individual daemon units
7098 for d in list_daemons(ctx, detail=False):
7099 if d['fsid'] != ctx.fsid:
7100 continue
7101 if d['style'] != 'cephadm:v1':
7102 continue
7103 disable_systemd_service(get_unit_name(ctx.fsid, d['name']))
7104
7105 # cluster units
7106 for unit_name in ['ceph-%s.target' % ctx.fsid]:
7107 disable_systemd_service(unit_name)
7108
7109 slice_name = 'system-ceph\\x2d{}.slice'.format(ctx.fsid.replace('-', '\\x2d'))
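# systemd escapes '-' as '\x2d' in unit names, so an fsid like
# 992b5e5a-... yields a slice named system-ceph\x2d992b5e5a\x2d....slice
# (illustrative fsid).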
7110 call(ctx, ['systemctl', 'stop', slice_name],
7111 verbosity=CallVerbosity.DEBUG)
7112
7113 # osds?
7114 if ctx.zap_osds:
7115 _zap_osds(ctx)
7116
7117 # rm units
7118 call_throws(ctx, ['rm', '-f', ctx.unit_dir
7119 + '/ceph-%s@.service' % ctx.fsid])
7120 call_throws(ctx, ['rm', '-f', ctx.unit_dir
7121 + '/ceph-%s.target' % ctx.fsid])
7122 call_throws(ctx, ['rm', '-rf',
7123 ctx.unit_dir + '/ceph-%s.target.wants' % ctx.fsid])
7124 # rm data
7125 call_throws(ctx, ['rm', '-rf', ctx.data_dir + '/' + ctx.fsid])
7126
7127 if not ctx.keep_logs:
7128 # rm logs
7129 call_throws(ctx, ['rm', '-rf', ctx.log_dir + '/' + ctx.fsid])
7130 call_throws(ctx, ['rm', '-rf', ctx.log_dir
7131 + '/*.wants/ceph-%s@*' % ctx.fsid])
7132
7133 # rm logrotate config
7134 call_throws(ctx, ['rm', '-f', ctx.logrotate_dir + '/ceph-%s' % ctx.fsid])
7135
7136 # if last cluster on host remove shared files
7137 if get_ceph_cluster_count(ctx) == 0:
7138 disable_systemd_service('ceph.target')
7139
7140 # rm shared ceph target files
7141 call_throws(ctx, ['rm', '-f', ctx.unit_dir + '/multi-user.target.wants/ceph.target'])
7142 call_throws(ctx, ['rm', '-f', ctx.unit_dir + '/ceph.target'])
7143
7144 # rm cephadm logrotate config
7145 call_throws(ctx, ['rm', '-f', ctx.logrotate_dir + '/cephadm'])
7146
7147 if not ctx.keep_logs:
7148 # remove all cephadm logs
7149 for fname in glob(f'{ctx.log_dir}/cephadm.log*'):
7150 os.remove(fname)
7151
7152 # rm sysctl settings
7153 sysctl_dirs: List[Path] = [Path(ctx.sysctl_dir), Path('/usr/lib/sysctl.d')]
7154
7155 for sysctl_dir in sysctl_dirs:
7156 for p in sysctl_dir.glob(f'90-ceph-{ctx.fsid}-*.conf'):
7157 p.unlink()
7158
7159 # cleanup remaining ceph directories
7160 ceph_dirs = [f'/run/ceph/{ctx.fsid}', f'/tmp/var/lib/ceph/{ctx.fsid}', f'/var/run/ceph/{ctx.fsid}']
7161 for dd in ceph_dirs:
7162 shutil.rmtree(dd, ignore_errors=True)
7163
7164 # clean up config, keyring, and pub key files
7165 files = [CEPH_DEFAULT_CONF, CEPH_DEFAULT_PUBKEY, CEPH_DEFAULT_KEYRING]
7166 if os.path.exists(files[0]):
7167 valid_fsid = False
7168 with open(files[0]) as f:
7169 if ctx.fsid in f.read():
7170 valid_fsid = True
7171 if valid_fsid:
7172 # rm configuration files on /etc/ceph
7173 for file_path in files:
7174 if os.path.exists(file_path):
7175 os.remove(file_path)
7176
7177 ##################################
7178
7179
7180 def check_time_sync(ctx, enabler=None):
7181 # type: (CephadmContext, Optional[Packager]) -> bool
7182 units = [
7183 'chrony.service', # 18.04 (at least)
7184 'chronyd.service', # el / opensuse
7185 'systemd-timesyncd.service',
7186 'ntpd.service', # el7 (at least)
7187 'ntp.service', # 18.04 (at least)
7188 'ntpsec.service', # 20.04 (at least) / buster
7189 'openntpd.service', # ubuntu / debian
7190 ]
7191 if not check_units(ctx, units, enabler):
7192 logger.warning('No time sync service is running; checked for %s' % units)
7193 return False
7194 return True
7195
7196
7197 def command_check_host(ctx: CephadmContext) -> None:
7198 errors = []
7199 commands = ['systemctl', 'lvcreate']
7200
7201 try:
7202 engine = check_container_engine(ctx)
7203 logger.info(f'{engine} is present')
7204 except Error as e:
7205 errors.append(str(e))
7206
7207 for command in commands:
7208 try:
7209 find_program(command)
7210 logger.info('%s is present' % command)
7211 except ValueError:
7212 errors.append('%s binary does not appear to be installed' % command)
7213
7214 # check for configured+running chronyd or ntp
7215 if not check_time_sync(ctx):
7216 errors.append('No time synchronization is active')
7217
7218 if 'expect_hostname' in ctx and ctx.expect_hostname:
7219 if get_hostname().lower() != ctx.expect_hostname.lower():
7220 errors.append('hostname "%s" does not match expected hostname "%s"' % (
7221 get_hostname(), ctx.expect_hostname))
7222 else:
7223 logger.info('Hostname "%s" matches what is expected.',
7224 ctx.expect_hostname)
7225
7226 if errors:
7227 raise Error('\nERROR: '.join(errors))
7228
7229 logger.info('Host looks OK')
7230
7231 ##################################
7232
7233
7234 def get_ssh_vars(ssh_user: str) -> Tuple[int, int, str]:
7235 try:
7236 s_pwd = pwd.getpwnam(ssh_user)
7237 except KeyError:
7238 raise Error('Cannot find uid/gid for ssh-user: %s' % (ssh_user))
7239
7240 ssh_uid = s_pwd.pw_uid
7241 ssh_gid = s_pwd.pw_gid
7242 ssh_dir = os.path.join(s_pwd.pw_dir, '.ssh')
7243 return ssh_uid, ssh_gid, ssh_dir
7244
7245
7246 def authorize_ssh_key(ssh_pub_key: str, ssh_user: str) -> bool:
7247 """Authorize the public key for the provided ssh user"""
7248
7249 def key_in_file(path: str, key: str) -> bool:
7250 if not os.path.exists(path):
7251 return False
7252 with open(path) as f:
7253 lines = f.readlines()
7254 for line in lines:
7255 if line.strip() == key.strip():
7256 return True
7257 return False
7258
7259 logger.info(f'Adding key to {ssh_user}@localhost authorized_keys...')
7260 if ssh_pub_key is None or ssh_pub_key.isspace():
7261 raise Error('Trying to authorize an empty ssh key')
7262
7263 ssh_pub_key = ssh_pub_key.strip()
7264 ssh_uid, ssh_gid, ssh_dir = get_ssh_vars(ssh_user)
7265 if not os.path.exists(ssh_dir):
7266 makedirs(ssh_dir, ssh_uid, ssh_gid, 0o700)
7267
7268 auth_keys_file = '%s/authorized_keys' % ssh_dir
7269 if key_in_file(auth_keys_file, ssh_pub_key):
7270 logger.info(f'key already in {ssh_user}@localhost authorized_keys...')
7271 return False
7272
7273 add_newline = False
7274 if os.path.exists(auth_keys_file):
7275 with open(auth_keys_file, 'r') as f:
7276 f.seek(0, os.SEEK_END)
7277 if f.tell() > 0:
7278 f.seek(f.tell() - 1, os.SEEK_SET) # go to last char
7279 if f.read() != '\n':
7280 add_newline = True
7281
7282 with open(auth_keys_file, 'a') as f:
7283 os.fchown(f.fileno(), ssh_uid, ssh_gid) # just in case we created it
7284 os.fchmod(f.fileno(), 0o600) # just in case we created it
7285 if add_newline:
7286 f.write('\n')
7287 f.write(ssh_pub_key + '\n')
7288
7289 return True
7290
7291
7292 def revoke_ssh_key(key: str, ssh_user: str) -> None:
7293 """Revoke the public key authorization for the ssh user"""
7294 ssh_uid, ssh_gid, ssh_dir = get_ssh_vars(ssh_user)
7295 auth_keys_file = '%s/authorized_keys' % ssh_dir
7296 deleted = False
7297 if os.path.exists(auth_keys_file):
7298 with open(auth_keys_file, 'r') as f:
7299 lines = f.readlines()
7300 _, filename = tempfile.mkstemp()
7301 with open(filename, 'w') as f:
7302 os.fchown(f.fileno(), ssh_uid, ssh_gid)
7303 os.fchmod(f.fileno(), 0o600) # secure access to the keys file
7304 for line in lines:
7305 if line.strip() == key.strip():
7306 deleted = True
7307 else:
7308 f.write(line)
7309
7310 if deleted:
7311 shutil.move(filename, auth_keys_file)
7312 else:
7313 logger.warning('Cannot find the ssh key to be deleted')
7314
7315
7316 def check_ssh_connectivity(ctx: CephadmContext) -> None:
7317
7318 def cmd_is_available(cmd: str) -> bool:
7319 if shutil.which(cmd) is None:
7320 logger.warning(f'Command not found: {cmd}')
7321 return False
7322 return True
7323
7324 if not cmd_is_available('ssh') or not cmd_is_available('ssh-keygen'):
7325 logger.warning('Cannot check ssh connectivity. Skipping...')
7326 return
7327
7328 logger.info('Verifying ssh connectivity ...')
7329 if ctx.ssh_private_key and ctx.ssh_public_key:
7330 # let's use the keys provided by the user
7331 ssh_priv_key_path = pathify(ctx.ssh_private_key.name)
7332 ssh_pub_key_path = pathify(ctx.ssh_public_key.name)
7333 else:
7334 # no custom keys, let's generate some random keys just for this check
7335 ssh_priv_key_path = f'/tmp/ssh_key_{uuid.uuid1()}'
7336 ssh_pub_key_path = f'{ssh_priv_key_path}.pub'
7337 ssh_key_gen_cmd = ['ssh-keygen', '-q', '-t', 'rsa', '-N', '', '-C', '', '-f', ssh_priv_key_path]
7338 _, _, code = call(ctx, ssh_key_gen_cmd)
7339 if code != 0:
7340 logger.warning('Cannot generate keys to check ssh connectivity.')
7341 return
7342
7343 with open(ssh_pub_key_path, 'r') as f:
7344 key = f.read().strip()
7345 new_key = authorize_ssh_key(key, ctx.ssh_user)
7346 ssh_cfg_file_arg = ['-F', pathify(ctx.ssh_config.name)] if ctx.ssh_config else []
7347 _, _, code = call(ctx, ['ssh', '-o StrictHostKeyChecking=no',
7348 *ssh_cfg_file_arg, '-i', ssh_priv_key_path,
7349 '-o PasswordAuthentication=no',
7350 f'{ctx.ssh_user}@{get_hostname()}',
7351 'sudo echo'])
7352
7353 # we only remove the key if it's a new one. In case the user has provided
7354 # some already existing key then we don't alter authorized_keys file
7355 if new_key:
7356 revoke_ssh_key(key, ctx.ssh_user)
7357
7358 pub_key_msg = '- The public key file configured by --ssh-public-key is valid\n' if ctx.ssh_public_key else ''
7359 prv_key_msg = '- The private key file configured by --ssh-private-key is valid\n' if ctx.ssh_private_key else ''
7360 ssh_cfg_msg = '- The ssh configuration file configured by --ssh-config is valid\n' if ctx.ssh_config else ''
7361 err_msg = f"""
7362 ** Please verify your user's ssh configuration and make sure:
7363 - User {ctx.ssh_user} must have passwordless sudo access
7364 {pub_key_msg}{prv_key_msg}{ssh_cfg_msg}
7365 """
7366 if code != 0:
7367 raise Error(err_msg)
7368
7369
7370 def command_prepare_host(ctx: CephadmContext) -> None:
7371 logger.info('Verifying podman|docker is present...')
7372 pkg = None
7373 try:
7374 check_container_engine(ctx)
7375 except Error as e:
7376 logger.warning(str(e))
7377 if not pkg:
7378 pkg = create_packager(ctx)
7379 pkg.install_podman()
7380
7381 logger.info('Verifying lvm2 is present...')
7382 if not find_executable('lvcreate'):
7383 if not pkg:
7384 pkg = create_packager(ctx)
7385 pkg.install(['lvm2'])
7386
7387 logger.info('Verifying time synchronization is in place...')
7388 if not check_time_sync(ctx):
7389 if not pkg:
7390 pkg = create_packager(ctx)
7391 pkg.install(['chrony'])
7392 # check again, and this time try to enable
7393 # the service
7394 check_time_sync(ctx, enabler=pkg)
7395
7396 if 'expect_hostname' in ctx and ctx.expect_hostname and ctx.expect_hostname != get_hostname():
7397 logger.warning('Adjusting hostname from %s -> %s...' % (get_hostname(), ctx.expect_hostname))
7398 call_throws(ctx, ['hostname', ctx.expect_hostname])
7399 with open('/etc/hostname', 'w') as f:
7400 f.write(ctx.expect_hostname + '\n')
7401
7402 logger.info('Repeating the final host check...')
7403 command_check_host(ctx)
7404
7405 ##################################
7406
7407
7408 class CustomValidation(argparse.Action):
7409
7410 def _check_name(self, values: str) -> None:
7411 try:
7412 (daemon_type, daemon_id) = values.split('.', 1)
7413 except ValueError:
7414 raise argparse.ArgumentError(self,
7415 'must be of the format <type>.<id>. For example, osd.1 or prometheus.myhost.com')
7416
7417 daemons = get_supported_daemons()
7418 if daemon_type not in daemons:
7419 raise argparse.ArgumentError(self,
7420 'name must declare the type of daemon e.g. '
7421 '{}'.format(', '.join(daemons)))
7422
7423 def __call__(self, parser: argparse.ArgumentParser, namespace: argparse.Namespace, values: Union[str, Sequence[Any], None],
7424 option_string: Optional[str] = None) -> None:
7425 assert isinstance(values, str)
7426 if self.dest == 'name':
7427 self._check_name(values)
7428 setattr(namespace, self.dest, values)
7429
7430 ##################################
7431
7432
7433 def get_distro():
7434 # type: () -> Tuple[Optional[str], Optional[str], Optional[str]]
7435 distro = None
7436 distro_version = None
7437 distro_codename = None
7438 with open('/etc/os-release', 'r') as f:
7439 for line in f.readlines():
7440 line = line.strip()
7441 if '=' not in line or line.startswith('#'):
7442 continue
7443 (var, val) = line.split('=', 1)
7444 if len(val) >= 2 and val[0] == '"' and val[-1] == '"':
7445 val = val[1:-1]
7446 if var == 'ID':
7447 distro = val.lower()
7448 elif var == 'VERSION_ID':
7449 distro_version = val.lower()
7450 elif var == 'VERSION_CODENAME':
7451 distro_codename = val.lower()
7452 return distro, distro_version, distro_codename
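# Illustrative return value: on an Ubuntu 20.04 host, /etc/os-release yields
# ('ubuntu', '20.04', 'focal'); any field missing from os-release stays None.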
7453
7454
7455 class Packager(object):
7456 def __init__(self, ctx: CephadmContext,
7457 stable: Optional[str] = None, version: Optional[str] = None,
7458 branch: Optional[str] = None, commit: Optional[str] = None):
7459 assert \
7460 (stable and not version and not branch and not commit) or \
7461 (not stable and version and not branch and not commit) or \
7462 (not stable and not version and branch) or \
7463 (not stable and not version and not branch and not commit)
7464 self.ctx = ctx
7465 self.stable = stable
7466 self.version = version
7467 self.branch = branch
7468 self.commit = commit
7469
7470 def validate(self) -> None:
7471 """Validate parameters before writing any state to disk."""
7472 pass
7473
7474 def add_repo(self) -> None:
7475 raise NotImplementedError
7476
7477 def rm_repo(self) -> None:
7478 raise NotImplementedError
7479
7480 def install(self, ls: List[str]) -> None:
7481 raise NotImplementedError
7482
7483 def install_podman(self) -> None:
7484 raise NotImplementedError
7485
7486 def query_shaman(self, distro: str, distro_version: Any, branch: Optional[str], commit: Optional[str]) -> str:
7487 # query shaman
7488 logger.info('Fetching repo metadata from shaman and chacra...')
7489 shaman_url = 'https://shaman.ceph.com/api/repos/ceph/{branch}/{sha1}/{distro}/{distro_version}/repo/?arch={arch}'.format(
7490 distro=distro,
7491 distro_version=distro_version,
7492 branch=branch,
7493 sha1=commit or 'latest',
7494 arch=get_arch()
7495 )
7496 try:
7497 shaman_response = urlopen(shaman_url)
7498 except HTTPError as err:
7499 logger.error('repository not found in shaman (might not be available yet)')
7500 raise Error('%s, failed to fetch %s' % (err, shaman_url))
7501 chacra_url = ''
7502 try:
7503 chacra_url = shaman_response.geturl()
7504 chacra_response = urlopen(chacra_url)
7505 except HTTPError as err:
7506 logger.error('repository not found in chacra (might not be available yet)')
7507 raise Error('%s, failed to fetch %s' % (err, chacra_url))
7508 return chacra_response.read().decode('utf-8')
7509
7510 def repo_gpgkey(self) -> Tuple[str, str]:
7511 if self.ctx.gpg_url:
7512 return self.ctx.gpg_url, 'manual'
7513 if self.stable or self.version:
7514 return 'https://download.ceph.com/keys/release.gpg', 'release'
7515 else:
7516 return 'https://download.ceph.com/keys/autobuild.gpg', 'autobuild'
7517
7518 def enable_service(self, service: str) -> None:
7519 """
7520 Start and enable the service (typically using systemd).
7521 """
7522 call_throws(self.ctx, ['systemctl', 'enable', '--now', service])
7523
7524
7525 class Apt(Packager):
7526 DISTRO_NAMES = {
7527 'ubuntu': 'ubuntu',
7528 'debian': 'debian',
7529 }
7530
7531 def __init__(self, ctx: CephadmContext,
7532 stable: Optional[str], version: Optional[str], branch: Optional[str], commit: Optional[str],
7533 distro: Optional[str], distro_version: Optional[str], distro_codename: Optional[str]) -> None:
7534 super(Apt, self).__init__(ctx, stable=stable, version=version,
7535 branch=branch, commit=commit)
7536 assert distro
7537 self.ctx = ctx
7538 self.distro = self.DISTRO_NAMES[distro]
7539 self.distro_codename = distro_codename
7540 self.distro_version = distro_version
7541
7542 def repo_path(self) -> str:
7543 return '/etc/apt/sources.list.d/ceph.list'
7544
7545 def add_repo(self) -> None:
7546
7547 url, name = self.repo_gpgkey()
7548 logger.info('Installing repo GPG key from %s...' % url)
7549 try:
7550 response = urlopen(url)
7551 except HTTPError as err:
7552 logger.error('failed to fetch GPG repo key from %s: %s' % (
7553 url, err))
7554 raise Error('failed to fetch GPG key')
7555 key = response.read()
7556 with open('/etc/apt/trusted.gpg.d/ceph.%s.gpg' % name, 'wb') as f:
7557 f.write(key)
7558
7559 if self.version:
7560 content = 'deb %s/debian-%s/ %s main\n' % (
7561 self.ctx.repo_url, self.version, self.distro_codename)
7562 elif self.stable:
7563 content = 'deb %s/debian-%s/ %s main\n' % (
7564 self.ctx.repo_url, self.stable, self.distro_codename)
7565 else:
7566 content = self.query_shaman(self.distro, self.distro_codename, self.branch,
7567 self.commit)
7568
7569 logger.info('Installing repo file at %s...' % self.repo_path())
7570 with open(self.repo_path(), 'w') as f:
7571 f.write(content)
7572
7573 self.update()
7574
7575 def rm_repo(self) -> None:
7576 for name in ['autobuild', 'release', 'manual']:
7577 p = '/etc/apt/trusted.gpg.d/ceph.%s.gpg' % name
7578 if os.path.exists(p):
7579 logger.info('Removing repo GPG key %s...' % p)
7580 os.unlink(p)
7581 if os.path.exists(self.repo_path()):
7582 logger.info('Removing repo at %s...' % self.repo_path())
7583 os.unlink(self.repo_path())
7584
7585 if self.distro == 'ubuntu':
7586 self.rm_kubic_repo()
7587
7588 def install(self, ls: List[str]) -> None:
7589 logger.info('Installing packages %s...' % ls)
7590 call_throws(self.ctx, ['apt-get', 'install', '-y'] + ls)
7591
7592 def update(self) -> None:
7593 logger.info('Updating package list...')
7594 call_throws(self.ctx, ['apt-get', 'update'])
7595
7596 def install_podman(self) -> None:
7597 if self.distro == 'ubuntu':
7598 logger.info('Setting up repo for podman...')
7599 self.add_kubic_repo()
7600 self.update()
7601
7602 logger.info('Attempting podman install...')
7603 try:
7604 self.install(['podman'])
7605 except Error:
7606 logger.info('Podman did not work. Falling back to docker...')
7607 self.install(['docker.io'])
7608
7609 def kubic_repo_url(self) -> str:
7610 return 'https://download.opensuse.org/repositories/devel:/kubic:/' \
7611 'libcontainers:/stable/xUbuntu_%s/' % self.distro_version
7612
7613 def kubic_repo_path(self) -> str:
7614 return '/etc/apt/sources.list.d/devel:kubic:libcontainers:stable.list'
7615
7616 def kubic_repo_gpgkey_url(self) -> str:
7617 return '%s/Release.key' % self.kubic_repo_url()
7618
7619 def kubic_repo_gpgkey_path(self) -> str:
7620 return '/etc/apt/trusted.gpg.d/kubic.release.gpg'
7621
7622 def add_kubic_repo(self) -> None:
7623 url = self.kubic_repo_gpgkey_url()
7624 logger.info('Installing repo GPG key from %s...' % url)
7625 try:
7626 response = urlopen(url)
7627 except HTTPError as err:
7628 logger.error('failed to fetch GPG repo key from %s: %s' % (
7629 url, err))
7630 raise Error('failed to fetch GPG key')
7631 key = response.read().decode('utf-8')
7632 tmp_key = write_tmp(key, 0, 0)
7633 keyring = self.kubic_repo_gpgkey_path()
7634 call_throws(self.ctx, ['apt-key', '--keyring', keyring, 'add', tmp_key.name])
7635
7636 logger.info('Installing repo file at %s...' % self.kubic_repo_path())
7637 content = 'deb %s /\n' % self.kubic_repo_url()
7638 with open(self.kubic_repo_path(), 'w') as f:
7639 f.write(content)
7640
7641 def rm_kubic_repo(self) -> None:
7642 keyring = self.kubic_repo_gpgkey_path()
7643 if os.path.exists(keyring):
7644 logger.info('Removing repo GPG key %s...' % keyring)
7645 os.unlink(keyring)
7646
7647 p = self.kubic_repo_path()
7648 if os.path.exists(p):
7649 logger.info('Removing repo at %s...' % p)
7650 os.unlink(p)
7651
7652
7653 class YumDnf(Packager):
7654 DISTRO_NAMES = {
7655 'centos': ('centos', 'el'),
7656 'rhel': ('centos', 'el'),
7657 'scientific': ('centos', 'el'),
7658 'rocky': ('centos', 'el'),
7659 'almalinux': ('centos', 'el'),
7660 'ol': ('centos', 'el'),
7661 'fedora': ('fedora', 'fc'),
7662 'mariner': ('mariner', 'cm'),
7663 }
7664
7665 def __init__(self, ctx: CephadmContext,
7666 stable: Optional[str], version: Optional[str], branch: Optional[str], commit: Optional[str],
7667 distro: Optional[str], distro_version: Optional[str]) -> None:
7668 super(YumDnf, self).__init__(ctx, stable=stable, version=version,
7669 branch=branch, commit=commit)
7670 assert distro
7671 assert distro_version
7672 self.ctx = ctx
7673 self.major = int(distro_version.split('.')[0])
7674 self.distro_normalized = self.DISTRO_NAMES[distro][0]
7675 self.distro_code = self.DISTRO_NAMES[distro][1] + str(self.major)
7676 if (self.distro_code == 'fc' and self.major >= 30) or \
7677 (self.distro_code == 'el' and self.major >= 8):
7678 self.tool = 'dnf'
7679 elif (self.distro_code == 'cm'):
7680 self.tool = 'tdnf'
7681 else:
7682 self.tool = 'yum'
7683
7684 def custom_repo(self, **kw: Any) -> str:
7685 """
7686 Repo files need special care in that a whole line should not be present
7687 if there is no value for it. Because we were using `format()` we could
7688 not conditionally add a line for a repo file. So the end result would
7689 contain a key with a missing value (say if we were passing `None`).
7690
7691 For example, it could look like::
7692
7693 [ceph repo]
7694 name= ceph repo
7695 proxy=
7696 gpgcheck=
7697
7698 Which breaks. This function allows us to conditionally add lines,
7699 preserving an order and be more careful.
7700
7701 Previously, and for historical purposes, this is how the template used
7702 to look::
7703
7704 custom_repo =
7705 [{repo_name}]
7706 name={name}
7707 baseurl={baseurl}
7708 enabled={enabled}
7709 gpgcheck={gpgcheck}
7710 type={_type}
7711 gpgkey={gpgkey}
7712 proxy={proxy}
7713
7714 """
7715 lines = []
7716
7717 # by using tuples (vs a dict) we preserve the order of what we want to
7718 # return, like starting with a [repo name]
7719 tmpl = (
7720 ('reponame', '[%s]'),
7721 ('name', 'name=%s'),
7722 ('baseurl', 'baseurl=%s'),
7723 ('enabled', 'enabled=%s'),
7724 ('gpgcheck', 'gpgcheck=%s'),
7725 ('_type', 'type=%s'),
7726 ('gpgkey', 'gpgkey=%s'),
7727 ('proxy', 'proxy=%s'),
7728 ('priority', 'priority=%s'),
7729 )
7730
7731 for line in tmpl:
7732 tmpl_key, tmpl_value = line # key values from tmpl
7733
7734 # ensure that there is an actual value (not None nor empty string)
7735 if tmpl_key in kw and kw.get(tmpl_key) not in (None, ''):
7736 lines.append(tmpl_value % kw.get(tmpl_key))
7737
7738 return '\n'.join(lines)
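# Illustrative: custom_repo(reponame='ceph', name='Ceph packages',
#                           baseurl='https://example.com/rpm-quincy/el8',
#                           enabled=1, gpgcheck=1)
# returns (keys emitted in template order, empty/None values skipped):
#
#     [ceph]
#     name=Ceph packages
#     baseurl=https://example.com/rpm-quincy/el8
#     enabled=1
#     gpgcheck=1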
7739
7740 def repo_path(self) -> str:
7741 return '/etc/yum.repos.d/ceph.repo'
7742
7743 def repo_baseurl(self) -> str:
7744 assert self.stable or self.version
7745 if self.version:
7746 return '%s/rpm-%s/%s' % (self.ctx.repo_url, self.version,
7747 self.distro_code)
7748 else:
7749 return '%s/rpm-%s/%s' % (self.ctx.repo_url, self.stable,
7750 self.distro_code)
7751
7752 def validate(self) -> None:
7753 if self.distro_code.startswith('fc'):
7754 raise Error('Ceph team does not build Fedora specific packages and therefore cannot add repos for this distro')
7755 if self.distro_code == 'el7':
7756 if self.stable and self.stable >= 'pacific':
7757 raise Error('Ceph does not support pacific or later for this version of this linux distro and therefore cannot add a repo for it')
7758 if self.version and int(self.version.split('.')[0]) >= 16:
7759 raise Error('Ceph does not support 16.y.z or later for this version of this linux distro and therefore cannot add a repo for it')
7760
7761 if self.stable or self.version:
7762 # we know that yum & dnf require there to be a
7763 # $base_url/$arch/repodata/repomd.xml so we can test if this URL
7764 # is gettable in order to validate the inputs
7765 test_url = self.repo_baseurl() + '/noarch/repodata/repomd.xml'
7766 try:
7767 urlopen(test_url)
7768 except HTTPError as err:
7769 logger.error('unable to fetch repo metadata: %r', err)
7770 raise Error('failed to fetch repository metadata. please check'
7771 ' the provided parameters are correct and try again')
7772
7773 def add_repo(self) -> None:
7774 if self.stable or self.version:
7775 content = ''
7776 for n, t in {
7777 'Ceph': '$basearch',
7778 'Ceph-noarch': 'noarch',
7779 'Ceph-source': 'SRPMS'}.items():
7780 content += '[%s]\n' % (n)
7781 content += self.custom_repo(
7782 name='Ceph %s' % t,
7783 baseurl=self.repo_baseurl() + '/' + t,
7784 enabled=1,
7785 gpgcheck=1,
7786 gpgkey=self.repo_gpgkey()[0],
7787 )
7788 content += '\n\n'
7789 else:
7790 content = self.query_shaman(self.distro_normalized, self.major,
7791 self.branch,
7792 self.commit)
7793
7794 logger.info('Writing repo to %s...' % self.repo_path())
7795 with open(self.repo_path(), 'w') as f:
7796 f.write(content)
7797
7798 if self.distro_code.startswith('el'):
7799 logger.info('Enabling EPEL...')
7800 call_throws(self.ctx, [self.tool, 'install', '-y', 'epel-release'])
7801
7802 def rm_repo(self) -> None:
7803 if os.path.exists(self.repo_path()):
7804 os.unlink(self.repo_path())
7805
7806 def install(self, ls: List[str]) -> None:
7807 logger.info('Installing packages %s...' % ls)
7808 call_throws(self.ctx, [self.tool, 'install', '-y'] + ls)
7809
7810 def install_podman(self) -> None:
7811 self.install(['podman'])
7812
7813
7814 class Zypper(Packager):
7815 DISTRO_NAMES = [
7816 'sles',
7817 'opensuse-tumbleweed',
7818 'opensuse-leap'
7819 ]
7820
7821 def __init__(self, ctx: CephadmContext,
7822 stable: Optional[str], version: Optional[str], branch: Optional[str], commit: Optional[str],
7823 distro: Optional[str], distro_version: Optional[str]) -> None:
7824 super(Zypper, self).__init__(ctx, stable=stable, version=version,
7825 branch=branch, commit=commit)
7826 assert distro is not None
7827 self.ctx = ctx
7828 self.tool = 'zypper'
7829 self.distro = 'opensuse'
7830 self.distro_version = '15.1'
7831 if 'tumbleweed' not in distro and distro_version is not None:
7832 self.distro_version = distro_version
7833
7834 def custom_repo(self, **kw: Any) -> str:
7835 """
7836 See YumDnf for format explanation.
7837 """
7838 lines = []
7839
7840 # by using tuples (vs a dict) we preserve the order of what we want to
7841 # return, like starting with a [repo name]
7842 tmpl = (
7843 ('reponame', '[%s]'),
7844 ('name', 'name=%s'),
7845 ('baseurl', 'baseurl=%s'),
7846 ('enabled', 'enabled=%s'),
7847 ('gpgcheck', 'gpgcheck=%s'),
7848 ('_type', 'type=%s'),
7849 ('gpgkey', 'gpgkey=%s'),
7850 ('proxy', 'proxy=%s'),
7851 ('priority', 'priority=%s'),
7852 )
7853
7854 for line in tmpl:
7855 tmpl_key, tmpl_value = line # key values from tmpl
7856
7857 # ensure that there is an actual value (not None nor empty string)
7858 if tmpl_key in kw and kw.get(tmpl_key) not in (None, ''):
7859 lines.append(tmpl_value % kw.get(tmpl_key))
7860
7861 return '\n'.join(lines)
7862
7863 def repo_path(self) -> str:
7864 return '/etc/zypp/repos.d/ceph.repo'
7865
7866 def repo_baseurl(self) -> str:
7867 assert self.stable or self.version
7868 if self.version:
7869 return '%s/rpm-%s/%s' % (self.ctx.repo_url,
7870 self.version, self.distro)
7871 else:
7872 return '%s/rpm-%s/%s' % (self.ctx.repo_url,
7873 self.stable, self.distro)
7874
7875 def add_repo(self) -> None:
7876 if self.stable or self.version:
7877 content = ''
7878 for n, t in {
7879 'Ceph': '$basearch',
7880 'Ceph-noarch': 'noarch',
7881 'Ceph-source': 'SRPMS'}.items():
7882 content += '[%s]\n' % (n)
7883 content += self.custom_repo(
7884 name='Ceph %s' % t,
7885 baseurl=self.repo_baseurl() + '/' + t,
7886 enabled=1,
7887 gpgcheck=1,
7888 gpgkey=self.repo_gpgkey()[0],
7889 )
7890 content += '\n\n'
7891 else:
7892 content = self.query_shaman(self.distro, self.distro_version,
7893 self.branch,
7894 self.commit)
7895
7896 logger.info('Writing repo to %s...' % self.repo_path())
7897 with open(self.repo_path(), 'w') as f:
7898 f.write(content)
7899
7900 def rm_repo(self) -> None:
7901 if os.path.exists(self.repo_path()):
7902 os.unlink(self.repo_path())
7903
7904 def install(self, ls: List[str]) -> None:
7905 logger.info('Installing packages %s...' % ls)
7906 call_throws(self.ctx, [self.tool, 'in', '-y'] + ls)
7907
7908 def install_podman(self) -> None:
7909 self.install(['podman'])
7910
7911
7912 def create_packager(ctx: CephadmContext,
7913 stable: Optional[str] = None, version: Optional[str] = None,
7914 branch: Optional[str] = None, commit: Optional[str] = None) -> Packager:
7915 distro, distro_version, distro_codename = get_distro()
7916 if distro in YumDnf.DISTRO_NAMES:
7917 return YumDnf(ctx, stable=stable, version=version,
7918 branch=branch, commit=commit,
7919 distro=distro, distro_version=distro_version)
7920 elif distro in Apt.DISTRO_NAMES:
7921 return Apt(ctx, stable=stable, version=version,
7922 branch=branch, commit=commit,
7923 distro=distro, distro_version=distro_version,
7924 distro_codename=distro_codename)
7925 elif distro in Zypper.DISTRO_NAMES:
7926 return Zypper(ctx, stable=stable, version=version,
7927 branch=branch, commit=commit,
7928 distro=distro, distro_version=distro_version)
7929 raise Error('Distro %s version %s not supported' % (distro, distro_version))
7930
7931
7932 def command_add_repo(ctx: CephadmContext) -> None:
7933 if ctx.version and ctx.release:
7934 raise Error('you can specify either --release or --version but not both')
7935 if not ctx.version and not ctx.release and not ctx.dev and not ctx.dev_commit:
7936 raise Error('please supply a --release, --version, --dev or --dev-commit argument')
7937 if ctx.version:
7938 try:
7939 (x, y, z) = ctx.version.split('.')
7940 except Exception:
7941 raise Error('version must be in the form x.y.z (e.g., 15.2.0)')
7942 if ctx.release:
7943 # Pacific =/= pacific in this case; normalize to lowercase to avoid confusion
7944 ctx.release = ctx.release.lower()
7945
7946 pkg = create_packager(ctx, stable=ctx.release,
7947 version=ctx.version,
7948 branch=ctx.dev,
7949 commit=ctx.dev_commit)
7950 pkg.validate()
7951 pkg.add_repo()
7952 logger.info('Completed adding repo.')
7953
7954
7955 def command_rm_repo(ctx: CephadmContext) -> None:
7956 pkg = create_packager(ctx)
7957 pkg.rm_repo()
7958
7959
7960 def command_install(ctx: CephadmContext) -> None:
7961 pkg = create_packager(ctx)
7962 pkg.install(ctx.packages)
7963
7964
7965 def command_rescan_disks(ctx: CephadmContext) -> str:
7966
7967 def probe_hba(scan_path: str) -> None:
7968 """Tell the adapter to rescan"""
7969 with open(scan_path, 'w') as f:
7970 f.write('- - -')
7971
7972 cmd = ctx.func.__name__.replace('command_', '')
7973 logger.info(f'{cmd}: starting')
7974 start = time.time()
7975
7976 all_scan_files = glob('/sys/class/scsi_host/*/scan')
7977 scan_files = []
7978 skipped = []
7979 for scan_path in all_scan_files:
7980 adapter_name = os.path.basename(os.path.dirname(scan_path))
7981 proc_name = read_file([os.path.join(os.path.dirname(scan_path), 'proc_name')])
7982 if proc_name in ['unknown', 'usb-storage']:
7983 skipped.append(adapter_name)  # basename(scan_path) would always be 'scan'
7984 logger.info(f'{cmd}: rescan skipping incompatible host adapter {adapter_name} : {proc_name}')
7985 continue
7986
7987 scan_files.append(scan_path)
7988
7989 if not scan_files:
7990 logger.info(f'{cmd}: no compatible HBAs found')
7991 return 'Ok. No compatible HBAs found'
7992
7993 responses = async_run(concurrent_tasks(probe_hba, scan_files))
7994 failures = [r for r in responses if r]
7995
7996 logger.info(f'{cmd}: Complete. {len(scan_files)} adapters rescanned, {len(failures)} failures, {len(skipped)} skipped')
7997
7998 elapsed = time.time() - start
7999 if failures:
8000 plural = 's' if len(failures) > 1 else ''
8001 if len(failures) == len(scan_files):
8002 return f'Failed. All {len(scan_files)} rescan requests failed'
8003 else:
8004 return f'Partial. {len(scan_files) - len(failures)} successful, {len(failures)} failure{plural} against: {", ".join(failures)}'
8005
8006 return f'Ok. {len(all_scan_files)} adapters detected: {len(scan_files)} rescanned, {len(skipped)} skipped, {len(failures)} failed ({elapsed:.2f}s)'
8007
8008 ##################################
8009
8010
8011 def get_ipv4_address(ifname):
8012 # type: (str) -> str
8013 def _extract(sock: socket.socket, offset: int) -> str:
8014 return socket.inet_ntop(
8015 socket.AF_INET,
8016 fcntl.ioctl(
8017 sock.fileno(),
8018 offset,
8019 struct.pack('256s', bytes(ifname[:15], 'utf-8'))
8020 )[20:24])
8021
8022 s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
8023 try:
8024 addr = _extract(s, 35093) # 0x8915 = SIOCGIFADDR
8025 dq_mask = _extract(s, 35099) # 0x891b = SIOCGIFNETMASK
8026 except OSError:
8027 # interface does not have an ipv4 address
8028 return ''
8029
8030 dec_mask = sum([bin(int(i)).count('1')
8031 for i in dq_mask.split('.')])
8032 return '{}/{}'.format(addr, dec_mask)
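# Illustrative: for an interface holding 192.168.1.10 with netmask
# 255.255.255.0 this returns '192.168.1.10/24' (the prefix length is the
# count of set bits across the dotted-quad mask); no IPv4 address -> ''.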
8033
8034
8035 def get_ipv6_address(ifname):
8036 # type: (str) -> str
8037 if not os.path.exists('/proc/net/if_inet6'):
8038 return ''
8039
8040 raw = read_file(['/proc/net/if_inet6'])
8041 data = raw.splitlines()
8042 # based on docs @ https://www.tldp.org/HOWTO/Linux+IPv6-HOWTO/ch11s04.html
8043 # field 0 is ipv6, field 2 is scope
8044 for iface_setting in data:
8045 field = iface_setting.split()
8046 if field[-1] == ifname:
8047 ipv6_raw = field[0]
8048 ipv6_fmtd = ':'.join([ipv6_raw[_p:_p + 4] for _p in range(0, len(field[0]), 4)])
8049 # apply naming rules using ipaddress module
8050 ipv6 = ipaddress.ip_address(ipv6_fmtd)
8051 return '{}/{}'.format(str(ipv6), int('0x{}'.format(field[2]), 16))
8052 return ''
8053
8054
8055 def bytes_to_human(num, mode='decimal'):
8056 # type: (float, str) -> str
8057 """Convert a bytes value into it's human-readable form.
8058
8059 :param num: number, in bytes, to convert
8060 :param mode: Either decimal (default) or binary to determine divisor
8061 :returns: string representing the bytes value in a more readable format
8062 """
8063 unit_list = ['', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB']
8064 divisor = 1000.0
8065 yotta = 'YB'
8066
8067 if mode == 'binary':
8068 unit_list = ['', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB']
8069 divisor = 1024.0
8070 yotta = 'YiB'
8071
8072 for unit in unit_list:
8073 if abs(num) < divisor:
8074 return '%3.1f%s' % (num, unit)
8075 num /= divisor
8076 return '%.1f%s' % (num, yotta)
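# Examples (checkable against the logic above):
#
#     bytes_to_human(1234567)                 ->  '1.2MB'
#     bytes_to_human(1234567, mode='binary')  ->  '1.2MiB'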
8077
8078
8079 def read_file(path_list, file_name=''):
8080 # type: (List[str], str) -> str
8081 """Returns the content of the first file found within the `path_list`
8082
8083 :param path_list: list of file paths to search
8084 :param file_name: optional file_name to be applied to a file path
8085 :returns: content of the file or 'Unknown'
8086 """
8087 for path in path_list:
8088 if file_name:
8089 file_path = os.path.join(path, file_name)
8090 else:
8091 file_path = path
8092 if os.path.exists(file_path):
8093 with open(file_path, 'r') as f:
8094 try:
8095 content = f.read().strip()
8096 except OSError:
8097 # sysfs may populate the file, but for devices like
8098 # virtio reads can fail
8099 return 'Unknown'
8100 else:
8101 return content
8102 return 'Unknown'
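# Illustrative: read_file(['/proc/loadavg']) returns the stripped file
# contents, while read_file(HostFacts._dmi_path_list, 'sys_vendor') joins
# path and file name first; 'Unknown' is returned when nothing is readable.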
8103
8104 ##################################
8105
8106
8107 class HostFacts():
8108 _dmi_path_list = ['/sys/class/dmi/id']
8109 _nic_path_list = ['/sys/class/net']
8110 _apparmor_path_list = ['/etc/apparmor']
8111 _disk_vendor_workarounds = {
8112 '0x1af4': 'Virtio Block Device'
8113 }
8114 _excluded_block_devices = ('sr', 'zram', 'dm-')
8115
8116 def __init__(self, ctx: CephadmContext):
8117 self.ctx: CephadmContext = ctx
8118 self.cpu_model: str = 'Unknown'
8119 self.cpu_count: int = 0
8120 self.cpu_cores: int = 0
8121 self.cpu_threads: int = 0
8122 self.interfaces: Dict[str, Any] = {}
8123
8124 self._meminfo: List[str] = read_file(['/proc/meminfo']).splitlines()
8125 self._get_cpuinfo()
8126 self._process_nics()
8127 self.arch: str = platform.processor()
8128 self.kernel: str = platform.release()
8129
8130 def _get_cpuinfo(self):
8131 # type: () -> None
8132 """Determine cpu information via /proc/cpuinfo"""
8133 raw = read_file(['/proc/cpuinfo'])
8134 output = raw.splitlines()
8135 cpu_set = set()
8136
8137 for line in output:
8138 field = [f.strip() for f in line.split(':')]
8139 if 'model name' in line:
8140 self.cpu_model = field[1]
8141 if 'physical id' in line:
8142 cpu_set.add(field[1])
8143 if 'siblings' in line:
8144 self.cpu_threads = int(field[1].strip())
8145 if 'cpu cores' in line:
8146 self.cpu_cores = int(field[1].strip())
8148 self.cpu_count = len(cpu_set)
8149
8150 def _get_block_devs(self):
8151 # type: () -> List[str]
8152 """Determine the list of block devices by looking at /sys/block"""
8153 return [dev for dev in os.listdir('/sys/block')
8154 if not dev.startswith(HostFacts._excluded_block_devices)]
8155
8156 def _get_devs_by_type(self, rota='0'):
8157 # type: (str) -> List[str]
8158 """Filter block devices by a given rotational attribute (0=flash, 1=spinner)"""
8159 devs = list()
8160 for blk_dev in self._get_block_devs():
8161 rot_path = '/sys/block/{}/queue/rotational'.format(blk_dev)
8162 rot_value = read_file([rot_path])
8163 if rot_value == rota:
8164 devs.append(blk_dev)
8165 return devs
8166
8167 @property
8168 def operating_system(self):
8169 # type: () -> str
8170 """Determine OS version"""
8171 raw_info = read_file(['/etc/os-release'])
8172 os_release = raw_info.splitlines()
8173 rel_str = 'Unknown'
8174 rel_dict = dict()
8175
8176 for line in os_release:
8177 if '=' in line:
8178 var_name, var_value = line.split('=')
8179 rel_dict[var_name] = var_value.strip('"')
8180
8181 # Would normally use PRETTY_NAME, but NAME and VERSION are more
8182 # consistent
8183 if all(_v in rel_dict for _v in ['NAME', 'VERSION']):
8184 rel_str = '{} {}'.format(rel_dict['NAME'], rel_dict['VERSION'])
8185 return rel_str
8186
8187 @property
8188 def hostname(self):
8189 # type: () -> str
8190 """Return the hostname"""
8191 return platform.node()
8192
8193 @property
8194 def subscribed(self):
8195 # type: () -> str
8196 """Highlevel check to see if the host is subscribed to receive updates/support"""
8197 def _red_hat():
8198 # type: () -> str
8199 # RHEL 7 and RHEL 8
8200 entitlements_dir = '/etc/pki/entitlement'
8201 if os.path.exists(entitlements_dir):
8202 pems = glob('{}/*.pem'.format(entitlements_dir))
8203 if len(pems) >= 2:
8204 return 'Yes'
8205
8206 return 'No'
8207
8208 os_name = self.operating_system
8209 if os_name.upper().startswith('RED HAT'):
8210 return _red_hat()
8211
8212 return 'Unknown'
8213
8214 @property
8215 def hdd_count(self):
8216 # type: () -> int
8217 """Return a count of HDDs (spinners)"""
8218 return len(self._get_devs_by_type(rota='1'))
8219
8220 def _get_capacity(self, dev):
8221 # type: (str) -> int
8222 """Determine the size of a given device"""
8223 size_path = os.path.join('/sys/block', dev, 'size')
8224 size_blocks = int(read_file([size_path]))
8225 blk_path = os.path.join('/sys/block', dev, 'queue', 'logical_block_size')
8226 blk_count = int(read_file([blk_path]))
8227 return size_blocks * blk_count
8228
8229 def _get_capacity_by_type(self, rota='0'):
8230 # type: (str) -> int
8231 """Return the total capacity of a category of device (flash or hdd)"""
8232 devs = self._get_devs_by_type(rota=rota)
8233 capacity = 0
8234 for dev in devs:
8235 capacity += self._get_capacity(dev)
8236 return capacity
8237
8238 def _dev_list(self, dev_list):
8239 # type: (List[str]) -> List[Dict[str, object]]
8240 """Return a 'pretty' name list for each device in the `dev_list`"""
8241 disk_list = list()
8242
8243 for dev in dev_list:
8244 disk_model = read_file(['/sys/block/{}/device/model'.format(dev)]).strip()
8245 disk_rev = read_file(['/sys/block/{}/device/rev'.format(dev)]).strip()
8246 disk_wwid = read_file(['/sys/block/{}/device/wwid'.format(dev)]).strip()
8247 vendor = read_file(['/sys/block/{}/device/vendor'.format(dev)]).strip()
8248 disk_vendor = HostFacts._disk_vendor_workarounds.get(vendor, vendor)
8249 disk_size_bytes = self._get_capacity(dev)
8250 disk_list.append({
8251 'description': '{} {} ({})'.format(disk_vendor, disk_model, bytes_to_human(disk_size_bytes)),
8252 'vendor': disk_vendor,
8253 'model': disk_model,
8254 'rev': disk_rev,
8255 'wwid': disk_wwid,
8256 'dev_name': dev,
8257 'disk_size_bytes': disk_size_bytes,
8258 })
8259 return disk_list
8260
8261 @property
8262 def hdd_list(self):
8263 # type: () -> List[Dict[str, object]]
8264 """Return a list of devices that are HDDs (spinners)"""
8265 devs = self._get_devs_by_type(rota='1')
8266 return self._dev_list(devs)
8267
8268 @property
8269 def flash_list(self):
8270 # type: () -> List[Dict[str, object]]
8271 """Return a list of devices that are flash based (SSD, NVMe)"""
8272 devs = self._get_devs_by_type(rota='0')
8273 return self._dev_list(devs)
8274
8275 @property
8276 def hdd_capacity_bytes(self):
8277 # type: () -> int
8278 """Return the total capacity for all HDD devices (bytes)"""
8279 return self._get_capacity_by_type(rota='1')
8280
8281 @property
8282 def hdd_capacity(self):
8283 # type: () -> str
8284 """Return the total capacity for all HDD devices (human readable format)"""
8285 return bytes_to_human(self.hdd_capacity_bytes)
8286
8287 @property
8288 def cpu_load(self):
8289 # type: () -> Dict[str, float]
8290 """Return the cpu load average data for the host"""
8291 raw = read_file(['/proc/loadavg']).strip()
8292 data = raw.split()
8293 return {
8294 '1min': float(data[0]),
8295 '5min': float(data[1]),
8296 '15min': float(data[2]),
8297 }
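
# Editor's note (illustrative): a /proc/loadavg of
#   '0.81 0.52 0.47 2/1151 12345'
# produces {'1min': 0.81, '5min': 0.52, '15min': 0.47}.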
8298
8299 @property
8300 def flash_count(self):
8301 # type: () -> int
8302 """Return the number of flash devices in the system (SSD, NVMe)"""
8303 return len(self._get_devs_by_type(rota='0'))
8304
8305 @property
8306 def flash_capacity_bytes(self):
8307 # type: () -> int
8308 """Return the total capacity for all flash devices (bytes)"""
8309 return self._get_capacity_by_type(rota='0')
8310
8311 @property
8312 def flash_capacity(self):
8313 # type: () -> str
8314 """Return the total capacity for all Flash devices (human readable format)"""
8315 return bytes_to_human(self.flash_capacity_bytes)
8316
8317 def _process_nics(self):
8318 # type: () -> None
8319 """Look at the NIC devices and extract network related metadata"""
8320 # from https://github.com/torvalds/linux/blob/master/include/uapi/linux/if_arp.h
8321 hw_lookup = {
8322 '1': 'ethernet',
8323 '32': 'infiniband',
8324 '772': 'loopback',
8325 }
8326
8327 for nic_path in HostFacts._nic_path_list:
8328 if not os.path.exists(nic_path):
8329 continue
8330 for iface in os.listdir(nic_path):
8331
8332 if os.path.exists(os.path.join(nic_path, iface, 'bridge')):
8333 nic_type = 'bridge'
8334 elif os.path.exists(os.path.join(nic_path, iface, 'bonding')):
8335 nic_type = 'bonding'
8336 else:
8337 nic_type = hw_lookup.get(read_file([os.path.join(nic_path, iface, 'type')]), 'Unknown')
8338
8339 if nic_type == 'loopback': # skip loopback devices
8340 continue
8341
8342 lower_devs_list = [os.path.basename(link.replace('lower_', '')) for link in glob(os.path.join(nic_path, iface, 'lower_*'))]
8343 upper_devs_list = [os.path.basename(link.replace('upper_', '')) for link in glob(os.path.join(nic_path, iface, 'upper_*'))]
8344
8345 try:
8346 mtu = int(read_file([os.path.join(nic_path, iface, 'mtu')]))
8347 except ValueError:
8348 mtu = 0
8349
8350 operstate = read_file([os.path.join(nic_path, iface, 'operstate')])
8351 try:
8352 speed = int(read_file([os.path.join(nic_path, iface, 'speed')]))
8353 except (OSError, ValueError):
8354 # OSError : device doesn't support the ethtool get_link_ksettings
8355 # ValueError : the read failed, so read_file returned 'Unknown', which int() rejects
8356 #
8357 # Either way, we show a -1 when speed isn't available
8358 speed = -1
8359
8360 dev_link = os.path.join(nic_path, iface, 'device')
8361 if os.path.exists(dev_link):
8362 iftype = 'physical'
8363 driver_path = os.path.join(dev_link, 'driver')
8364 if os.path.exists(driver_path):
8365 driver = os.path.basename(os.path.realpath(driver_path))
8366 else:
8367 driver = 'Unknown'
8368
8369 else:
8370 iftype = 'logical'
8371 driver = ''
8372
8373 self.interfaces[iface] = {
8374 'mtu': mtu,
8375 'upper_devs_list': upper_devs_list,
8376 'lower_devs_list': lower_devs_list,
8377 'operstate': operstate,
8378 'iftype': iftype,
8379 'nic_type': nic_type,
8380 'driver': driver,
8381 'speed': speed,
8382 'ipv4_address': get_ipv4_address(iface),
8383 'ipv6_address': get_ipv6_address(iface),
8384 }
8385
8386 @property
8387 def nic_count(self):
8388 # type: () -> int
8389 """Return a total count of all physical NICs detected in the host"""
8390 phys_devs = []
8391 for iface in self.interfaces:
8392 if self.interfaces[iface]['iftype'] == 'physical':
8393 phys_devs.append(iface)
8394 return len(phys_devs)
8395
8396 def _get_mem_data(self, field_name):
8397 # type: (str) -> int
8398 for line in self._meminfo:
8399 if line.startswith(field_name):
8400 _d = line.split()
8401 return int(_d[1])
8402 return 0
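
# Editor's note (illustrative): a /proc/meminfo line such as
#   'MemTotal:       16384256 kB'
# is split on whitespace, so _get_mem_data('MemTotal') returns 16384256 (kB).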
8403
8404 @property
8405 def memory_total_kb(self):
8406 # type: () -> int
8407 """Determine the memory installed (kb)"""
8408 return self._get_mem_data('MemTotal')
8409
8410 @property
8411 def memory_free_kb(self):
8412 # type: () -> int
8413 """Determine the memory free (not cache, immediately usable)"""
8414 return self._get_mem_data('MemFree')
8415
8416 @property
8417 def memory_available_kb(self):
8418 # type: () -> int
8419 """Determine the memory available to new applications without swapping"""
8420 return self._get_mem_data('MemAvailable')
8421
8422 @property
8423 def vendor(self):
8424 # type: () -> str
8425 """Determine server vendor from DMI data in sysfs"""
8426 return read_file(HostFacts._dmi_path_list, 'sys_vendor')
8427
8428 @property
8429 def model(self):
8430 # type: () -> str
8431 """Determine server model information from DMI data in sysfs"""
8432 family = read_file(HostFacts._dmi_path_list, 'product_family')
8433 product = read_file(HostFacts._dmi_path_list, 'product_name')
8434 if family == 'Unknown' and product:
8435 return '{}'.format(product)
8436
8437 return '{} ({})'.format(family, product)
8438
8439 @property
8440 def bios_version(self):
8441 # type: () -> str
8442 """Determine server BIOS version from DMI data in sysfs"""
8443 return read_file(HostFacts._dmi_path_list, 'bios_version')
8444
8445 @property
8446 def bios_date(self):
8447 # type: () -> str
8448 """Determine server BIOS date from DMI data in sysfs"""
8449 return read_file(HostFacts._dmi_path_list, 'bios_date')
8450
8451 @property
8452 def timestamp(self):
8453 # type: () -> float
8454 """Return the current time as Epoch seconds"""
8455 return time.time()
8456
8457 @property
8458 def system_uptime(self):
8459 # type: () -> float
8460 """Return the system uptime (in secs)"""
8461 raw_time = read_file(['/proc/uptime'])
8462 up_secs, _ = raw_time.split()
8463 return float(up_secs)
8464
8465 @property
8466 def kernel_security(self):
8467 # type: () -> Dict[str, str]
8468 """Determine the security features enabled in the kernel - SELinux, AppArmor"""
8469 def _fetch_selinux() -> Dict[str, str]:
8470 """Get the selinux status"""
8471 security = {}
8472 try:
8473 out, err, code = call(self.ctx, ['sestatus'],
8474 verbosity=CallVerbosity.QUIET)
8475 security['type'] = 'SELinux'
8476 status, mode, policy = '', '', ''
8477 for line in out.split('\n'):
8478 if line.startswith('SELinux status:'):
8479 k, v = line.split(':')
8480 status = v.strip()
8481 elif line.startswith('Current mode:'):
8482 k, v = line.split(':')
8483 mode = v.strip()
8484 elif line.startswith('Loaded policy name:'):
8485 k, v = line.split(':')
8486 policy = v.strip()
8487 if status == 'disabled':
8488 security['description'] = 'SELinux: Disabled'
8489 else:
8490 security['description'] = 'SELinux: Enabled({}, {})'.format(mode, policy)
8491 except Exception as e:
8492 logger.info('unable to get selinux status: %s' % e)
8493 return security
8494
8495 def _fetch_apparmor() -> Dict[str, str]:
8496 """Read the apparmor profiles directly, returning an overview of AppArmor status"""
8497 security = {}
8498 for apparmor_path in HostFacts._apparmor_path_list:
8499 if os.path.exists(apparmor_path):
8500 security['type'] = 'AppArmor'
8501 security['description'] = 'AppArmor: Enabled'
8502 try:
8503 profiles = read_file(['/sys/kernel/security/apparmor/profiles'])
8504 if len(profiles) == 0:
8505 return {}
8506 except OSError:
8507 pass
8508 else:
8509 summary = {} # type: Dict[str, int]
8510 for line in profiles.split('\n'):
8511 if not line:
8512 continue  # tolerate a trailing empty line
8513 item, mode = line.rsplit(' ', 1)  # 'profile-name (mode)'
8514 mode = mode.strip('()')
8515 summary[mode] = summary.get(mode, 0) + 1  # first occurrence counts as 1, not 0
8517 summary_str = ','.join(['{} {}'.format(v, k) for k, v in summary.items()])
8518 security = {**security, **summary} # type: ignore
8519 security['description'] += '({})'.format(summary_str)
8520
8521 return security
8522 return {}
8523
8524 ret = {}
8525 if os.path.exists('/sys/kernel/security/lsm'):
8526 lsm = read_file(['/sys/kernel/security/lsm']).strip()
8527 if 'selinux' in lsm:
8528 ret = _fetch_selinux()
8529 elif 'apparmor' in lsm:
8530 ret = _fetch_apparmor()
8531 else:
8532 return {
8533 'type': 'Unknown',
8534 'description': 'Linux Security Module framework is active, but is not using SELinux or AppArmor'
8535 }
8536
8537 if ret:
8538 return ret
8539
8540 return {
8541 'type': 'None',
8542 'description': 'Linux Security Module framework is not available'
8543 }
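
# Editor's note (illustrative): /sys/kernel/security/lsm holds a comma-separated
# list such as 'lockdown,capability,yama,selinux'; a 'selinux' entry routes to
# _fetch_selinux(), an 'apparmor' entry to _fetch_apparmor().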
8544
8545 @property
8546 def selinux_enabled(self) -> bool:
8547 return (self.kernel_security['type'] == 'SELinux') and \
8548 (self.kernel_security['description'] != 'SELinux: Disabled')
8549
8550 @property
8551 def kernel_parameters(self):
8552 # type: () -> Dict[str, str]
8553 """Get kernel parameters required/used in Ceph clusters"""
8554
8555 k_param = {}
8556 out, _, _ = call_throws(self.ctx, ['sysctl', '-a'], verbosity=CallVerbosity.SILENT)
8557 if out:
8558 param_list = out.split('\n')
8559 param_dict = {param.split(' = ')[0]: param.split(' = ')[-1] for param in param_list}
8560
8561 # return only desired parameters
8562 if 'net.ipv4.ip_nonlocal_bind' in param_dict:
8563 k_param['net.ipv4.ip_nonlocal_bind'] = param_dict['net.ipv4.ip_nonlocal_bind']
8564
8565 return k_param
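
# Editor's note (illustrative): a `sysctl -a` line such as
#   'net.ipv4.ip_nonlocal_bind = 0'
# splits on ' = ' into key 'net.ipv4.ip_nonlocal_bind' and value '0'.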
8566
8567 @staticmethod
8568 def _process_net_data(tcp_file: str, protocol: str = 'tcp') -> List[int]:
8569 listening_ports = []
8570 # Connection state documentation:
8571 # tcp - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/net/tcp_states.h
8572 # udp - uses 07 (TCP_CLOSE/UNCONN, since udp is stateless; test with netcat -ul <port>)
8573 listening_state = {
8574 'tcp': '0A',
8575 'udp': '07'
8576 }
8577
8578 if protocol not in listening_state:
8579 return []
8580
8581 if os.path.exists(tcp_file):
8582 with open(tcp_file) as f:
8583 tcp_data = f.readlines()[1:]
8584
8585 for con in tcp_data:
8586 con_info = con.strip().split()
8587 if con_info[3] == listening_state[protocol]:
8588 local_port = int(con_info[1].split(':')[1], 16)
8589 listening_ports.append(local_port)
8590
8591 return listening_ports
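
# Editor's note (illustrative, columns abridged): a /proc/net/tcp row such as
#   '0: 00000000:0016 00000000:0000 0A 00000000:00000000 ...'
# has state '0A' (TCP_LISTEN) in field 3, and the local port is the hex
# suffix of field 1: int('0016', 16) == 22.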
8592
8593 @property
8594 def tcp_ports_used(self) -> List[int]:
8595 return HostFacts._process_net_data('/proc/net/tcp')
8596
8597 @property
8598 def tcp6_ports_used(self) -> List[int]:
8599 return HostFacts._process_net_data('/proc/net/tcp6')
8600
8601 @property
8602 def udp_ports_used(self) -> List[int]:
8603 return HostFacts._process_net_data('/proc/net/udp', 'udp')
8604
8605 @property
8606 def udp6_ports_used(self) -> List[int]:
8607 return HostFacts._process_net_data('/proc/net/udp6', 'udp')
8608
8609 def dump(self):
8610 # type: () -> str
8611 """Return the attributes of this HostFacts object as json"""
8612 data = {
8613 k: getattr(self, k) for k in dir(self)
8614 if not k.startswith('_')
8615 and isinstance(getattr(self, k), (float, int, str, list, dict, tuple))
8616 }
8617 return json.dumps(data, indent=2, sort_keys=True)
8618
8619 ##################################
8620
8621
8622 def command_gather_facts(ctx: CephadmContext) -> None:
8623 """gather_facts is intended to provide host releated metadata to the caller"""
8624 host = HostFacts(ctx)
8625 print(host.dump())
8626
8627
8628 ##################################
8629
8630
8631 def systemd_target_state(ctx: CephadmContext, target_name: str, subsystem: str = 'ceph') -> bool:
8632 # TODO: UNITTEST
8633 return os.path.exists(
8634 os.path.join(
8635 ctx.unit_dir,
8636 f'{subsystem}.target.wants',
8637 target_name
8638 )
8639 )
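
# Editor's note (illustrative): with the default unit dir, an enabled
# per-cluster target is visible as
#   /etc/systemd/system/ceph.target.wants/ceph-<fsid>.target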
8640
8641
8642 def target_exists(ctx: CephadmContext) -> bool:
8643 return os.path.exists(ctx.unit_dir + '/ceph.target')
8644
8645
8646 @infer_fsid
8647 def command_maintenance(ctx: CephadmContext) -> str:
8648 if not ctx.fsid:
8649 raise Error('failed - must pass --fsid to specify cluster')
8650
8651 target = f'ceph-{ctx.fsid}.target'
8652
8653 if ctx.maintenance_action.lower() == 'enter':
8654 logger.info('Requested to place host into maintenance')
8655 if systemd_target_state(ctx, target):
8656 _out, _err, code = call(ctx,
8657 ['systemctl', 'disable', target],
8658 verbosity=CallVerbosity.DEBUG)
8659 if code:
8660 logger.error(f'Failed to disable the {target} target')
8661 return 'failed - unable to disable the target'
8662 else:
8663 # stopping a target waits by default
8664 _out, _err, code = call(ctx,
8665 ['systemctl', 'stop', target],
8666 verbosity=CallVerbosity.DEBUG)
8667 if code:
8668 logger.error(f'Failed to stop the {target} target')
8669 return 'failed - unable to stop the target'
8670 else:
8671 return f'success - systemd target {target} disabled'
8672
8673 else:
8674 return 'skipped - target already disabled'
8675
8676 else:
8677 logger.info('Requested to exit maintenance state')
8678 # if we've never deployed a daemon on this host there will be no systemd
8679 # target to disable so attempting a disable will fail. We still need to
8680 # return success here or host will be permanently stuck in maintenance mode
8681 # as no daemons can be deployed so no systemd target will ever exist to disable.
8682 if not target_exists(ctx):
8683 return 'skipped - systemd target not present on this host. Host removed from maintenance mode.'
8684 # exit maintenance request
8685 if not systemd_target_state(ctx, target):
8686 _out, _err, code = call(ctx,
8687 ['systemctl', 'enable', target],
8688 verbosity=CallVerbosity.DEBUG)
8689 if code:
8690 logger.error(f'Failed to enable the {target} target')
8691 return 'failed - unable to enable the target'
8692 else:
8693 # starting a target waits by default
8694 _out, _err, code = call(ctx,
8695 ['systemctl', 'start', target],
8696 verbosity=CallVerbosity.DEBUG)
8697 if code:
8698 logger.error(f'Failed to start the {target} target')
8699 return 'failed - unable to start the target'
8700 else:
8701 return f'success - systemd target {target} enabled and started'
8702 return 'skipped - target already enabled'
8703
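# Editor's note: example invocations (arguments as defined in _get_parser below):
#   cephadm host-maintenance enter --fsid <fsid>
#   cephadm host-maintenance exit --fsid <fsid>
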
8704 ##################################
8705
8706
8707 def _get_parser():
8708 # type: () -> argparse.ArgumentParser
8709 parser = argparse.ArgumentParser(
8710 description='Bootstrap Ceph daemons with systemd and containers.',
8711 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
8712 parser.add_argument(
8713 '--image',
8714 help='container image. Can also be set via the "CEPHADM_IMAGE" '
8715 'env var')
8716 parser.add_argument(
8717 '--docker',
8718 action='store_true',
8719 help='use docker instead of podman')
8720 parser.add_argument(
8721 '--data-dir',
8722 default=DATA_DIR,
8723 help='base directory for daemon data')
8724 parser.add_argument(
8725 '--log-dir',
8726 default=LOG_DIR,
8727 help='base directory for daemon logs')
8728 parser.add_argument(
8729 '--logrotate-dir',
8730 default=LOGROTATE_DIR,
8731 help='location of logrotate configuration files')
8732 parser.add_argument(
8733 '--sysctl-dir',
8734 default=SYSCTL_DIR,
8735 help='location of sysctl configuration files')
8736 parser.add_argument(
8737 '--unit-dir',
8738 default=UNIT_DIR,
8739 help='base directory for systemd units')
8740 parser.add_argument(
8741 '--verbose', '-v',
8742 action='store_true',
8743 help='Show debug-level log messages')
8744 parser.add_argument(
8745 '--timeout',
8746 type=int,
8747 default=DEFAULT_TIMEOUT,
8748 help='timeout in seconds')
8749 parser.add_argument(
8750 '--retry',
8751 type=int,
8752 default=DEFAULT_RETRY,
8753 help='max number of retries')
8754 parser.add_argument(
8755 '--env', '-e',
8756 action='append',
8757 default=[],
8758 help='set environment variable')
8759 parser.add_argument(
8760 '--no-container-init',
8761 action='store_true',
8762 default=not CONTAINER_INIT,
8763 help='Do not run podman/docker with `--init`')
8764
8765 subparsers = parser.add_subparsers(help='sub-command')
8766
8767 parser_version = subparsers.add_parser(
8768 'version', help='get ceph version from container')
8769 parser_version.set_defaults(func=command_version)
8770
8771 parser_pull = subparsers.add_parser(
8772 'pull', help='pull the default container image')
8773 parser_pull.set_defaults(func=command_pull)
8774 parser_pull.add_argument(
8775 '--insecure',
8776 action='store_true',
8777 help=argparse.SUPPRESS,
8778 )
8779
8780 parser_inspect_image = subparsers.add_parser(
8781 'inspect-image', help='inspect local container image')
8782 parser_inspect_image.set_defaults(func=command_inspect_image)
8783
8784 parser_ls = subparsers.add_parser(
8785 'ls', help='list daemon instances on this host')
8786 parser_ls.set_defaults(func=command_ls)
8787 parser_ls.add_argument(
8788 '--no-detail',
8789 action='store_true',
8790 help='Do not include daemon status')
8791 parser_ls.add_argument(
8792 '--legacy-dir',
8793 default='/',
8794 help='base directory for legacy daemon data')
8795
8796 parser_list_networks = subparsers.add_parser(
8797 'list-networks', help='list IP networks')
8798 parser_list_networks.set_defaults(func=command_list_networks)
8799
8800 parser_adopt = subparsers.add_parser(
8801 'adopt', help='adopt daemon deployed with a different tool')
8802 parser_adopt.set_defaults(func=command_adopt)
8803 parser_adopt.add_argument(
8804 '--name', '-n',
8805 required=True,
8806 help='daemon name (type.id)')
8807 parser_adopt.add_argument(
8808 '--style',
8809 required=True,
8810 help='deployment style (legacy, ...)')
8811 parser_adopt.add_argument(
8812 '--cluster',
8813 default='ceph',
8814 help='cluster name')
8815 parser_adopt.add_argument(
8816 '--legacy-dir',
8817 default='/',
8818 help='base directory for legacy daemon data')
8819 parser_adopt.add_argument(
8820 '--config-json',
8821 help='Additional configuration information in JSON format')
8822 parser_adopt.add_argument(
8823 '--skip-firewalld',
8824 action='store_true',
8825 help='Do not configure firewalld')
8826 parser_adopt.add_argument(
8827 '--skip-pull',
8828 action='store_true',
8829 help='do not pull the default image before adopting')
8830 parser_adopt.add_argument(
8831 '--force-start',
8832 action='store_true',
8833 help='start newly adopted daemon, even if it was not running previously')
8834 parser_adopt.add_argument(
8835 '--container-init',
8836 action='store_true',
8837 default=CONTAINER_INIT,
8838 help=argparse.SUPPRESS)
8839
8840 parser_rm_daemon = subparsers.add_parser(
8841 'rm-daemon', help='remove daemon instance')
8842 parser_rm_daemon.set_defaults(func=command_rm_daemon)
8843 parser_rm_daemon.add_argument(
8844 '--name', '-n',
8845 required=True,
8846 action=CustomValidation,
8847 help='daemon name (type.id)')
8848 parser_rm_daemon.add_argument(
8849 '--tcp-ports',
8850 help='List of tcp ports to close in the host firewall')
8851 parser_rm_daemon.add_argument(
8852 '--fsid',
8853 required=True,
8854 help='cluster FSID')
8855 parser_rm_daemon.add_argument(
8856 '--force',
8857 action='store_true',
8858 help='proceed, even though this may destroy valuable data')
8859 parser_rm_daemon.add_argument(
8860 '--force-delete-data',
8861 action='store_true',
8862 help='delete valuable daemon data instead of making a backup')
8863
8864 parser_rm_cluster = subparsers.add_parser(
8865 'rm-cluster', help='remove all daemons for a cluster')
8866 parser_rm_cluster.set_defaults(func=command_rm_cluster)
8867 parser_rm_cluster.add_argument(
8868 '--fsid',
8869 required=True,
8870 help='cluster FSID')
8871 parser_rm_cluster.add_argument(
8872 '--force',
8873 action='store_true',
8874 help='proceed, even though this may destroy valuable data')
8875 parser_rm_cluster.add_argument(
8876 '--keep-logs',
8877 action='store_true',
8878 help='do not remove log files')
8879 parser_rm_cluster.add_argument(
8880 '--zap-osds',
8881 action='store_true',
8882 help='zap OSD devices for this cluster')
8883
8884 parser_run = subparsers.add_parser(
8885 'run', help='run a ceph daemon, in a container, in the foreground')
8886 parser_run.set_defaults(func=command_run)
8887 parser_run.add_argument(
8888 '--name', '-n',
8889 required=True,
8890 help='daemon name (type.id)')
8891 parser_run.add_argument(
8892 '--fsid',
8893 required=True,
8894 help='cluster FSID')
8895
8896 parser_shell = subparsers.add_parser(
8897 'shell', help='run an interactive shell inside a daemon container')
8898 parser_shell.set_defaults(func=command_shell)
8899 parser_shell.add_argument(
8900 '--shared_ceph_folder',
8901 metavar='CEPH_SOURCE_FOLDER',
8902 help='Development mode: map several container folders onto sub-folders of the given ceph source folder')
8903 parser_shell.add_argument(
8904 '--fsid',
8905 help='cluster FSID')
8906 parser_shell.add_argument(
8907 '--name', '-n',
8908 help='daemon name (type.id)')
8909 parser_shell.add_argument(
8910 '--config', '-c',
8911 help='ceph.conf to pass through to the container')
8912 parser_shell.add_argument(
8913 '--keyring', '-k',
8914 help='ceph.keyring to pass through to the container')
8915 parser_shell.add_argument(
8916 '--mount', '-m',
8917 help=('mount a file or directory in the container. '
8918 'Supports multiple mounts. '
8919 'e.g.: `--mount /foo /bar:/bar`. '
8920 'When no destination is passed, the default is /mnt'),
8921 nargs='+')
8922 parser_shell.add_argument(
8923 '--env', '-e',
8924 action='append',
8925 default=[],
8926 help='set environment variable')
8927 parser_shell.add_argument(
8928 '--volume', '-v',
8929 action='append',
8930 default=[],
8931 help='mount a volume in the container (host-dir:container-dir)')
8932 parser_shell.add_argument(
8933 'command', nargs=argparse.REMAINDER,
8934 help='command (optional)')
8935 parser_shell.add_argument(
8936 '--no-hosts',
8937 action='store_true',
8938 help='do not pass /etc/hosts through to the container')
8939
8940 parser_enter = subparsers.add_parser(
8941 'enter', help='run an interactive shell inside a running daemon container')
8942 parser_enter.set_defaults(func=command_enter)
8943 parser_enter.add_argument(
8944 '--fsid',
8945 help='cluster FSID')
8946 parser_enter.add_argument(
8947 '--name', '-n',
8948 required=True,
8949 help='daemon name (type.id)')
8950 parser_enter.add_argument(
8951 'command', nargs=argparse.REMAINDER,
8952 help='command')
8953
8954 parser_ceph_volume = subparsers.add_parser(
8955 'ceph-volume', help='run ceph-volume inside a container')
8956 parser_ceph_volume.set_defaults(func=command_ceph_volume)
8957 parser_ceph_volume.add_argument(
8958 '--shared_ceph_folder',
8959 metavar='CEPH_SOURCE_FOLDER',
8960 help='Development mode: map several container folders onto sub-folders of the given ceph source folder')
8961 parser_ceph_volume.add_argument(
8962 '--fsid',
8963 help='cluster FSID')
8964 parser_ceph_volume.add_argument(
8965 '--config-json',
8966 help='JSON file with config and (client.bootstrap-osd) key')
8967 parser_ceph_volume.add_argument(
8968 '--config', '-c',
8969 help='ceph conf file')
8970 parser_ceph_volume.add_argument(
8971 '--keyring', '-k',
8972 help='ceph.keyring to pass through to the container')
8973 parser_ceph_volume.add_argument(
8974 'command', nargs=argparse.REMAINDER,
8975 help='command')
8976
8977 parser_zap_osds = subparsers.add_parser(
8978 'zap-osds', help='zap all OSDs associated with a particular fsid')
8979 parser_zap_osds.set_defaults(func=command_zap_osds)
8980 parser_zap_osds.add_argument(
8981 '--fsid',
8982 required=True,
8983 help='cluster FSID')
8984 parser_zap_osds.add_argument(
8985 '--force',
8986 action='store_true',
8987 help='proceed, even though this may destroy valuable data')
8988
8989 parser_unit = subparsers.add_parser(
8990 'unit', help="operate on the daemon's systemd unit")
8991 parser_unit.set_defaults(func=command_unit)
8992 parser_unit.add_argument(
8993 'command',
8994 help='systemd command (start, stop, restart, enable, disable, ...)')
8995 parser_unit.add_argument(
8996 '--fsid',
8997 help='cluster FSID')
8998 parser_unit.add_argument(
8999 '--name', '-n',
9000 required=True,
9001 help='daemon name (type.id)')
9002
9003 parser_logs = subparsers.add_parser(
9004 'logs', help='print journald logs for a daemon container')
9005 parser_logs.set_defaults(func=command_logs)
9006 parser_logs.add_argument(
9007 '--fsid',
9008 help='cluster FSID')
9009 parser_logs.add_argument(
9010 '--name', '-n',
9011 required=True,
9012 help='daemon name (type.id)')
9013 parser_logs.add_argument(
9014 'command', nargs='*',
9015 help='additional journalctl args')
9016
9017 parser_bootstrap = subparsers.add_parser(
9018 'bootstrap', help='bootstrap a cluster (mon + mgr daemons)')
9019 parser_bootstrap.set_defaults(func=command_bootstrap)
9020 parser_bootstrap.add_argument(
9021 '--config', '-c',
9022 help='ceph conf file to incorporate')
9023 parser_bootstrap.add_argument(
9024 '--mon-id',
9025 required=False,
9026 help='mon id (default: local hostname)')
9027 group = parser_bootstrap.add_mutually_exclusive_group()
9028 group.add_argument(
9029 '--mon-addrv',
9030 help='mon IPs (e.g., [v2:localipaddr:3300,v1:localipaddr:6789])')
9031 group.add_argument(
9032 '--mon-ip',
9033 help='mon IP')
9034 parser_bootstrap.add_argument(
9035 '--mgr-id',
9036 required=False,
9037 help='mgr id (default: randomly generated)')
9038 parser_bootstrap.add_argument(
9039 '--fsid',
9040 help='cluster FSID')
9041 parser_bootstrap.add_argument(
9042 '--output-dir',
9043 default='/etc/ceph',
9044 help='directory to write config, keyring, and pub key files')
9045 parser_bootstrap.add_argument(
9046 '--output-keyring',
9047 help='location to write keyring file with new cluster admin and mon keys')
9048 parser_bootstrap.add_argument(
9049 '--output-config',
9050 help='location to write conf file to connect to new cluster')
9051 parser_bootstrap.add_argument(
9052 '--output-pub-ssh-key',
9053 help="location to write the cluster's public SSH key")
9054 parser_bootstrap.add_argument(
9055 '--skip-admin-label',
9056 action='store_true',
9057 help='do not create admin label for ceph.conf and client.admin keyring distribution')
9058 parser_bootstrap.add_argument(
9059 '--skip-ssh',
9060 action='store_true',
9061 help='skip setup of ssh key on local host')
9062 parser_bootstrap.add_argument(
9063 '--initial-dashboard-user',
9064 default='admin',
9065 help='Initial user for the dashboard')
9066 parser_bootstrap.add_argument(
9067 '--initial-dashboard-password',
9068 help='Initial password for the initial dashboard user')
9069 parser_bootstrap.add_argument(
9070 '--ssl-dashboard-port',
9071 type=int,
9072 default=8443,
9073 help='Port number used to connect with dashboard using SSL')
9074 parser_bootstrap.add_argument(
9075 '--dashboard-key',
9076 type=argparse.FileType('r'),
9077 help='Dashboard key')
9078 parser_bootstrap.add_argument(
9079 '--dashboard-crt',
9080 type=argparse.FileType('r'),
9081 help='Dashboard certificate')
9082
9083 parser_bootstrap.add_argument(
9084 '--ssh-config',
9085 type=argparse.FileType('r'),
9086 help='SSH config')
9087 parser_bootstrap.add_argument(
9088 '--ssh-private-key',
9089 type=argparse.FileType('r'),
9090 help='SSH private key')
9091 parser_bootstrap.add_argument(
9092 '--ssh-public-key',
9093 type=argparse.FileType('r'),
9094 help='SSH public key')
9095 parser_bootstrap.add_argument(
9096 '--ssh-user',
9097 default='root',
9098 help='set user for SSHing to cluster hosts, passwordless sudo will be needed for non-root users')
9099 parser_bootstrap.add_argument(
9100 '--skip-mon-network',
9101 action='store_true',
9102 help='do not set mon public_network based on bootstrap mon ip')
9103 parser_bootstrap.add_argument(
9104 '--skip-dashboard',
9105 action='store_true',
9106 help='do not enable the Ceph Dashboard')
9107 parser_bootstrap.add_argument(
9108 '--dashboard-password-noupdate',
9109 action='store_true',
9110 help='stop forced dashboard password change')
9111 parser_bootstrap.add_argument(
9112 '--no-minimize-config',
9113 action='store_true',
9114 help='do not assimilate and minimize the config file')
9115 parser_bootstrap.add_argument(
9116 '--skip-ping-check',
9117 action='store_true',
9118 help='do not verify that mon IP is pingable')
9119 parser_bootstrap.add_argument(
9120 '--skip-pull',
9121 action='store_true',
9122 help='do not pull the default image before bootstrapping')
9123 parser_bootstrap.add_argument(
9124 '--skip-firewalld',
9125 action='store_true',
9126 help='Do not configure firewalld')
9127 parser_bootstrap.add_argument(
9128 '--allow-overwrite',
9129 action='store_true',
9130 help='allow overwrite of existing --output-* config/keyring/ssh files')
9131 parser_bootstrap.add_argument(
9132 '--allow-fqdn-hostname',
9133 action='store_true',
9134 help='allow hostname that is fully-qualified (contains ".")')
9135 parser_bootstrap.add_argument(
9136 '--allow-mismatched-release',
9137 action='store_true',
9138 help="allow bootstrap of ceph that doesn't match this version of cephadm")
9139 parser_bootstrap.add_argument(
9140 '--skip-prepare-host',
9141 action='store_true',
9142 help='Do not prepare host')
9143 parser_bootstrap.add_argument(
9144 '--orphan-initial-daemons',
9145 action='store_true',
9146 help='Set mon and mgr service to `unmanaged`, and do not create the crash service')
9147 parser_bootstrap.add_argument(
9148 '--skip-monitoring-stack',
9149 action='store_true',
9150 help='Do not automatically provision monitoring stack (prometheus, grafana, alertmanager, node-exporter)')
9151 parser_bootstrap.add_argument(
9152 '--with-centralized-logging',
9153 action='store_true',
9154 help='Automatically provision centralized logging (promtail, loki)')
9155 parser_bootstrap.add_argument(
9156 '--apply-spec',
9157 help='Apply cluster spec after bootstrap (copy ssh key, add hosts and apply services)')
9158 parser_bootstrap.add_argument(
9159 '--shared_ceph_folder',
9160 metavar='CEPH_SOURCE_FOLDER',
9161 help='Development mode: map several container folders onto sub-folders of the given ceph source folder')
9162
9163 parser_bootstrap.add_argument(
9164 '--registry-url',
9165 help='url for custom registry')
9166 parser_bootstrap.add_argument(
9167 '--registry-username',
9168 help='username for custom registry')
9169 parser_bootstrap.add_argument(
9170 '--registry-password',
9171 help='password for custom registry')
9172 parser_bootstrap.add_argument(
9173 '--registry-json',
9174 help='json file with custom registry login info (URL, Username, Password)')
9175 parser_bootstrap.add_argument(
9176 '--container-init',
9177 action='store_true',
9178 default=CONTAINER_INIT,
9179 help=argparse.SUPPRESS)
9180 parser_bootstrap.add_argument(
9181 '--cluster-network',
9182 help='subnet to use for cluster replication, recovery and heartbeats (in CIDR notation network/mask)')
9183 parser_bootstrap.add_argument(
9184 '--single-host-defaults',
9185 action='store_true',
9186 help='adjust configuration defaults to suit a single-host cluster')
9187 parser_bootstrap.add_argument(
9188 '--log-to-file',
9189 action='store_true',
9190 help='configure cluster to log to traditional log files in /var/log/ceph/$fsid')
9191
9192 parser_deploy = subparsers.add_parser(
9193 'deploy', help='deploy a daemon')
9194 parser_deploy.set_defaults(func=command_deploy)
9195 parser_deploy.add_argument(
9196 '--name',
9197 required=True,
9198 action=CustomValidation,
9199 help='daemon name (type.id)')
9200 parser_deploy.add_argument(
9201 '--fsid',
9202 required=True,
9203 help='cluster FSID')
9204 parser_deploy.add_argument(
9205 '--config', '-c',
9206 help='config file for new daemon')
9207 parser_deploy.add_argument(
9208 '--config-json',
9209 help='Additional configuration information in JSON format')
9210 parser_deploy.add_argument(
9211 '--keyring',
9212 help='keyring for new daemon')
9213 parser_deploy.add_argument(
9214 '--key',
9215 help='key for new daemon')
9216 parser_deploy.add_argument(
9217 '--osd-fsid',
9218 help='OSD uuid, if creating an OSD container')
9219 parser_deploy.add_argument(
9220 '--skip-firewalld',
9221 action='store_true',
9222 help='Do not configure firewalld')
9223 parser_deploy.add_argument(
9224 '--tcp-ports',
9225 help='List of tcp ports to open in the host firewall')
9226 parser_deploy.add_argument(
9227 '--reconfig',
9228 action='store_true',
9229 help='Reconfigure a previously deployed daemon')
9230 parser_deploy.add_argument(
9231 '--allow-ptrace',
9232 action='store_true',
9233 help='Allow SYS_PTRACE on daemon container')
9234 parser_deploy.add_argument(
9235 '--container-init',
9236 action='store_true',
9237 default=CONTAINER_INIT,
9238 help=argparse.SUPPRESS)
9239 parser_deploy.add_argument(
9240 '--memory-request',
9241 help='Container memory request/target'
9242 )
9243 parser_deploy.add_argument(
9244 '--memory-limit',
9245 help='Container memory hard limit'
9246 )
9247 parser_deploy.add_argument(
9248 '--meta-json',
9249 help='JSON dict of additional metadata'
9250 )
9251 parser_deploy.add_argument(
9252 '--extra-container-args',
9253 action='append',
9254 default=[],
9255 help='Additional container arguments to apply to the daemon'
9256 )
9257
9258 parser_check_host = subparsers.add_parser(
9259 'check-host', help='check host configuration')
9260 parser_check_host.set_defaults(func=command_check_host)
9261 parser_check_host.add_argument(
9262 '--expect-hostname',
9263 help='Check that hostname matches an expected value')
9264
9265 parser_prepare_host = subparsers.add_parser(
9266 'prepare-host', help='prepare a host for cephadm use')
9267 parser_prepare_host.set_defaults(func=command_prepare_host)
9268 parser_prepare_host.add_argument(
9269 '--expect-hostname',
9270 help='Set hostname')
9271
9272 parser_add_repo = subparsers.add_parser(
9273 'add-repo', help='configure package repository')
9274 parser_add_repo.set_defaults(func=command_add_repo)
9275 parser_add_repo.add_argument(
9276 '--release',
9277 help='use latest version of a named release (e.g., {})'.format(LATEST_STABLE_RELEASE))
9278 parser_add_repo.add_argument(
9279 '--version',
9280 help='use specific upstream version (x.y.z)')
9281 parser_add_repo.add_argument(
9282 '--dev',
9283 help='use specified bleeding edge build from git branch or tag')
9284 parser_add_repo.add_argument(
9285 '--dev-commit',
9286 help='use specified bleeding edge build from git commit')
9287 parser_add_repo.add_argument(
9288 '--gpg-url',
9289 help='specify alternative GPG key location')
9290 parser_add_repo.add_argument(
9291 '--repo-url',
9292 default='https://download.ceph.com',
9293 help='specify alternative repo location')
9294 # TODO: proxy?
9295
9296 parser_rm_repo = subparsers.add_parser(
9297 'rm-repo', help='remove package repository configuration')
9298 parser_rm_repo.set_defaults(func=command_rm_repo)
9299
9300 parser_install = subparsers.add_parser(
9301 'install', help='install ceph package(s)')
9302 parser_install.set_defaults(func=command_install)
9303 parser_install.add_argument(
9304 'packages', nargs='*',
9305 default=['cephadm'],
9306 help='packages')
9307
9308 parser_registry_login = subparsers.add_parser(
9309 'registry-login', help='log host into authenticated registry')
9310 parser_registry_login.set_defaults(func=command_registry_login)
9311 parser_registry_login.add_argument(
9312 '--registry-url',
9313 help='url for custom registry')
9314 parser_registry_login.add_argument(
9315 '--registry-username',
9316 help='username for custom registry')
9317 parser_registry_login.add_argument(
9318 '--registry-password',
9319 help='password for custom registry')
9320 parser_registry_login.add_argument(
9321 '--registry-json',
9322 help='json file with custom registry login info (URL, Username, Password)')
9323 parser_registry_login.add_argument(
9324 '--fsid',
9325 help='cluster FSID')
9326
9327 parser_gather_facts = subparsers.add_parser(
9328 'gather-facts', help='gather and return host related information (JSON format)')
9329 parser_gather_facts.set_defaults(func=command_gather_facts)
9330
9331 parser_maintenance = subparsers.add_parser(
9332 'host-maintenance', help='Manage the maintenance state of a host')
9333 parser_maintenance.add_argument(
9334 '--fsid',
9335 help='cluster FSID')
9336 parser_maintenance.add_argument(
9337 'maintenance_action',
9338 type=str,
9339 choices=['enter', 'exit'],
9340 help='Maintenance action - enter maintenance, or exit maintenance')
9341 parser_maintenance.set_defaults(func=command_maintenance)
9342
9343 parser_agent = subparsers.add_parser(
9344 'agent', help='start cephadm agent')
9345 parser_agent.set_defaults(func=command_agent)
9346 parser_agent.add_argument(
9347 '--fsid',
9348 required=True,
9349 help='cluster FSID')
9350 parser_agent.add_argument(
9351 '--daemon-id',
9352 help='daemon id for agent')
9353
9354 parser_disk_rescan = subparsers.add_parser(
9355 'disk-rescan', help='rescan all HBAs to detect new/removed devices')
9356 parser_disk_rescan.set_defaults(func=command_rescan_disks)
9357
9358 return parser
9359
9360
9361 def _parse_args(av: List[str]) -> argparse.Namespace:
9362 parser = _get_parser()
9363
9364 args = parser.parse_args(av)
9365 if 'command' in args and args.command and args.command[0] == '--':
9366 args.command.pop(0)
9367
9368 # workaround argparse to deprecate the subparser `--container-init` flag
9369 # container_init and no_container_init must always be mutually exclusive
9370 container_init_args = ('--container-init', '--no-container-init')
9371 if set(container_init_args).issubset(av):
9372 parser.error('argument %s: not allowed with argument %s' % container_init_args)
9373 elif '--container-init' in av:
9374 args.no_container_init = not args.container_init
9375 else:
9376 args.container_init = not args.no_container_init
9377 assert args.container_init is not args.no_container_init
9378
9379 return args
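
# Editor's note (illustrative): _parse_args(['version']) yields a Namespace with
# func=command_version and, since CONTAINER_INIT is True and --no-container-init
# was not passed, container_init=True / no_container_init=False.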
9380
9381
9382 def cephadm_init_ctx(args: List[str]) -> CephadmContext:
9383 ctx = CephadmContext()
9384 ctx.set_args(_parse_args(args))
9385 return ctx
9386
9387
9388 def cephadm_init_logging(ctx: CephadmContext, args: List[str]) -> None:
9389 """Configure the logging for cephadm as well as updating the system
9390 to have the expected log dir and logrotate configuration.
9391 """
9392 logging.addLevelName(QUIET_LOG_LEVEL, 'QUIET')
9393 global logger
9394 if not os.path.exists(LOG_DIR):
9395 os.makedirs(LOG_DIR)
9396 operations = ['bootstrap', 'rm-cluster']
9397 if any(op in args for op in operations):
9398 dictConfig(interactive_logging_config)
9399 else:
9400 dictConfig(logging_config)
9401
9402 logger = logging.getLogger()
9403 logger.setLevel(QUIET_LOG_LEVEL)
9404
9405 if not os.path.exists(ctx.logrotate_dir + '/cephadm'):
9406 with open(ctx.logrotate_dir + '/cephadm', 'w') as f:
9407 f.write("""# created by cephadm
9408 /var/log/ceph/cephadm.log {
9409 rotate 7
9410 daily
9411 compress
9412 missingok
9413 notifempty
9414 su root root
9415 }
9416 """)
9417
9418 if ctx.verbose:
9419 for handler in logger.handlers:
9420 if handler.name in ['console', 'log_file', 'console_stdout']:
9421 handler.setLevel(QUIET_LOG_LEVEL)
9422 logger.debug('%s\ncephadm %s' % ('-' * 80, args))
9423
9424
9425 def cephadm_require_root() -> None:
9426 """Exit if the process is not running as root."""
9427 if os.geteuid() != 0:
9428 sys.stderr.write('ERROR: cephadm should be run as root\n')
9429 sys.exit(1)
9430
9431
9432 def main() -> None:
9433 av: List[str] = sys.argv[1:]
9435
9436 ctx = cephadm_init_ctx(av)
9437 if not ctx.has_function():
9438 sys.stderr.write('No command specified; pass -h or --help for usage\n')
9439 sys.exit(1)
9440
9441 cephadm_require_root()
9442 cephadm_init_logging(ctx, av)
9443 try:
9444 # podman or docker?
9445 ctx.container_engine = find_container_engine(ctx)
9446 if ctx.func not in \
9447 [
9448 command_check_host,
9449 command_prepare_host,
9450 command_add_repo,
9451 command_rm_repo,
9452 command_install
9453 ]:
9454 check_container_engine(ctx)
9455 # command handler
9456 r = ctx.func(ctx)
9457 except Error as e:
9458 if ctx.verbose:
9459 raise
9460 logger.error('ERROR: %s' % e)
9461 sys.exit(1)
9462 if not r:
9463 r = 0
9464 sys.exit(r)
9465
9466
9467 if __name__ == '__main__':
9468 main()