#!/usr/bin/python3

import asyncio
import asyncio.subprocess
import argparse
import datetime
import fcntl
import ipaddress
import json
import logging
from logging.config import dictConfig
import os
import platform
import pwd
import random
import shlex
import shutil
import socket
import string
import subprocess
import sys
import tempfile
import time
import errno
import struct
from socketserver import ThreadingMixIn
from http.server import BaseHTTPRequestHandler, HTTPServer
import signal
import io
from contextlib import redirect_stdout
import ssl
from enum import Enum

from typing import Dict, List, Tuple, Optional, Union, Any, NoReturn, Callable, IO, Sequence, TypeVar, cast, Set

import re
import uuid

from configparser import ConfigParser
from functools import wraps
from glob import glob
from io import StringIO
from threading import Thread, RLock
from urllib.error import HTTPError
from urllib.request import urlopen
from pathlib import Path

FuncT = TypeVar('FuncT', bound=Callable)

# Default container images -----------------------------------------------------
DEFAULT_IMAGE = 'quay.io/ceph/ceph:v16'
DEFAULT_IMAGE_IS_MASTER = False
DEFAULT_IMAGE_RELEASE = 'pacific'
DEFAULT_PROMETHEUS_IMAGE = 'quay.io/prometheus/prometheus:v2.18.1'
DEFAULT_NODE_EXPORTER_IMAGE = 'quay.io/prometheus/node-exporter:v0.18.1'
DEFAULT_ALERT_MANAGER_IMAGE = 'quay.io/prometheus/alertmanager:v0.20.0'
DEFAULT_GRAFANA_IMAGE = 'quay.io/ceph/ceph-grafana:6.7.4'
DEFAULT_HAPROXY_IMAGE = 'docker.io/library/haproxy:2.3'
DEFAULT_KEEPALIVED_IMAGE = 'docker.io/arcts/keepalived'
DEFAULT_REGISTRY = 'docker.io'  # normalize unqualified digests to this
# ------------------------------------------------------------------------------

LATEST_STABLE_RELEASE = 'pacific'
DATA_DIR = '/var/lib/ceph'
LOG_DIR = '/var/log/ceph'
LOCK_DIR = '/run/cephadm'
LOGROTATE_DIR = '/etc/logrotate.d'
SYSCTL_DIR = '/usr/lib/sysctl.d'
UNIT_DIR = '/etc/systemd/system'
LOG_DIR_MODE = 0o770
DATA_DIR_MODE = 0o700
CONTAINER_INIT = True
MIN_PODMAN_VERSION = (2, 0, 2)
CGROUPS_SPLIT_PODMAN_VERSION = (2, 1, 0)
CUSTOM_PS1 = r'[ceph: \u@\h \W]\$ '
DEFAULT_TIMEOUT = None  # in seconds
DEFAULT_RETRY = 15
SHELL_DEFAULT_CONF = '/etc/ceph/ceph.conf'
SHELL_DEFAULT_KEYRING = '/etc/ceph/ceph.client.admin.keyring'
DATEFMT = '%Y-%m-%dT%H:%M:%S.%fZ'

logger: logging.Logger = None  # type: ignore

"""
You can invoke cephadm in two ways:

1. The normal way, at the command line.

2. By piping the script to the python3 binary. In this latter case, you should
   prepend one or more lines to the beginning of the script.

   For arguments,

       injected_argv = [...]

   e.g.,

       injected_argv = ['ls']

   For reading stdin from the '--config-json -' argument,

       injected_stdin = '...'
"""
cached_stdin = None
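
# A minimal sketch of the piped invocation described above (the on-disk file
# name 'cephadm' is an assumption about where this script lives):
#
#   (echo "injected_argv = ['ls']"; cat cephadm) | python3
#
# which is roughly equivalent to running `cephadm ls` directly.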

##################################


class BaseConfig:

    def __init__(self) -> None:
        self.image: str = ''
        self.docker: bool = False
        self.data_dir: str = DATA_DIR
        self.log_dir: str = LOG_DIR
        self.logrotate_dir: str = LOGROTATE_DIR
        self.sysctl_dir: str = SYSCTL_DIR
        self.unit_dir: str = UNIT_DIR
        self.verbose: bool = False
        self.timeout: Optional[int] = DEFAULT_TIMEOUT
        self.retry: int = DEFAULT_RETRY
        self.env: List[str] = []
        self.memory_request: Optional[int] = None
        self.memory_limit: Optional[int] = None

        self.container_init: bool = CONTAINER_INIT
        self.container_engine: Optional[ContainerEngine] = None

    def set_from_args(self, args: argparse.Namespace) -> None:
        argdict: Dict[str, Any] = vars(args)
        for k, v in argdict.items():
            if hasattr(self, k):
                setattr(self, k, v)


class CephadmContext:

    def __init__(self) -> None:
        self.__dict__['_args'] = None
        self.__dict__['_conf'] = BaseConfig()

    def set_args(self, args: argparse.Namespace) -> None:
        self._conf.set_from_args(args)
        self._args = args

    def has_function(self) -> bool:
        return 'func' in self._args

    def __contains__(self, name: str) -> bool:
        return hasattr(self, name)

    def __getattr__(self, name: str) -> Any:
        if '_conf' in self.__dict__ and hasattr(self._conf, name):
            return getattr(self._conf, name)
        elif '_args' in self.__dict__ and hasattr(self._args, name):
            return getattr(self._args, name)
        else:
            return super().__getattribute__(name)

    def __setattr__(self, name: str, value: Any) -> None:
        if hasattr(self._conf, name):
            setattr(self._conf, name, value)
        elif hasattr(self._args, name):
            setattr(self._args, name, value)
        else:
            super().__setattr__(name, value)

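# Attribute reads on CephadmContext fall through to BaseConfig first, then to
# the parsed argparse namespace; writes follow the same order. Illustrative
# sketch (hypothetical values, not part of the normal control flow):
#
#   ctx = CephadmContext()
#   ctx.set_args(argparse.Namespace(fsid='...'))
#   ctx.retry         # -> 15, served by BaseConfig (DEFAULT_RETRY)
#   ctx.fsid          # served by the argparse namespace
#   'verbose' in ctx  # -> True, via __contains__/hasattr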

class ContainerEngine:
    def __init__(self) -> None:
        self.path = find_program(self.EXE)

    @classmethod
    @property
    def EXE(cls) -> str:
        # subclasses shadow this with the name of their binary
        raise NotImplementedError()


class Podman(ContainerEngine):
    EXE = 'podman'

    def __init__(self) -> None:
        super().__init__()
        self._version: Optional[Tuple[int, ...]] = None

    @property
    def version(self) -> Tuple[int, ...]:
        if self._version is None:
            raise RuntimeError('Please call `get_version` first')
        return self._version

    def get_version(self, ctx: CephadmContext) -> None:
        out, _, _ = call_throws(ctx, [self.path, 'version', '--format', '{{.Client.Version}}'])
        self._version = _parse_podman_version(out)


class Docker(ContainerEngine):
    EXE = 'docker'


CONTAINER_PREFERENCE = (Podman, Docker)  # prefer podman to docker

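# CONTAINER_PREFERENCE is consumed elsewhere in this script roughly like the
# sketch below: the first engine whose binary is found on PATH wins (a hedged
# reconstruction, not a verbatim excerpt):
#
#   for engine_cls in CONTAINER_PREFERENCE:
#       try:
#           ctx.container_engine = engine_cls()
#           break
#       except Exception:
#           pass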

# Log and console output config
logging_config = {
    'version': 1,
    'disable_existing_loggers': True,
    'formatters': {
        'cephadm': {
            'format': '%(asctime)s %(levelname)s %(message)s'
        },
    },
    'handlers': {
        'console': {
            'level': 'INFO',
            'class': 'logging.StreamHandler',
        },
        'log_file': {
            'level': 'DEBUG',
            'class': 'logging.handlers.WatchedFileHandler',
            'formatter': 'cephadm',
            'filename': '%s/cephadm.log' % LOG_DIR,
        }
    },
    'loggers': {
        '': {
            'level': 'DEBUG',
            'handlers': ['console', 'log_file'],
        }
    }
}


class termcolor:
    yellow = '\033[93m'
    red = '\033[31m'
    end = '\033[0m'


class Error(Exception):
    pass


class TimeoutExpired(Error):
    pass

##################################


class Ceph(object):
    daemons = ('mon', 'mgr', 'mds', 'osd', 'rgw', 'rbd-mirror',
               'crash', 'cephfs-mirror')

##################################


class OSD(object):
    @staticmethod
    def get_sysctl_settings() -> List[str]:
        return [
            '# allow a large number of OSDs',
            'fs.aio-max-nr = 1048576',
            'kernel.pid_max = 4194304',
        ]

##################################


class Monitoring(object):
    """Define the configs for the monitoring containers"""

    port_map = {
        'prometheus': [9095],  # Avoid default 9090, due to conflict with cockpit UI
        'node-exporter': [9100],
        'grafana': [3000],
        'alertmanager': [9093, 9094],
    }

    components = {
        'prometheus': {
            'image': DEFAULT_PROMETHEUS_IMAGE,
            'cpus': '2',
            'memory': '4GB',
            'args': [
                '--config.file=/etc/prometheus/prometheus.yml',
                '--storage.tsdb.path=/prometheus',
            ],
            'config-json-files': [
                'prometheus.yml',
            ],
        },
        'node-exporter': {
            'image': DEFAULT_NODE_EXPORTER_IMAGE,
            'cpus': '1',
            'memory': '1GB',
            'args': [
                '--no-collector.timex',
            ],
        },
        'grafana': {
            'image': DEFAULT_GRAFANA_IMAGE,
            'cpus': '2',
            'memory': '4GB',
            'args': [],
            'config-json-files': [
                'grafana.ini',
                'provisioning/datasources/ceph-dashboard.yml',
                'certs/cert_file',
                'certs/cert_key',
            ],
        },
        'alertmanager': {
            'image': DEFAULT_ALERT_MANAGER_IMAGE,
            'cpus': '2',
            'memory': '2GB',
            'args': [
                '--cluster.listen-address=:{}'.format(port_map['alertmanager'][1]),
            ],
            'config-json-files': [
                'alertmanager.yml',
            ],
            'config-json-args': [
                'peers',
            ],
        },
    }  # type: ignore

    @staticmethod
    def get_version(ctx, container_id, daemon_type):
        # type: (CephadmContext, str, str) -> str
        """
        :param daemon_type: Either "prometheus", "alertmanager" or "node-exporter"
        """
        assert daemon_type in ('prometheus', 'alertmanager', 'node-exporter')
        cmd = daemon_type.replace('-', '_')
        code = -1
        err = ''
        version = ''
        if daemon_type == 'alertmanager':
            for cmd in ['alertmanager', 'prometheus-alertmanager']:
                _, err, code = call(ctx, [
                    ctx.container_engine.path, 'exec', container_id, cmd,
                    '--version'
                ], verbosity=CallVerbosity.DEBUG)
                if code == 0:
                    break
            cmd = 'alertmanager'  # reset cmd for version extraction
        else:
            _, err, code = call(ctx, [
                ctx.container_engine.path, 'exec', container_id, cmd, '--version'
            ], verbosity=CallVerbosity.DEBUG)
        if code == 0 and \
                err.startswith('%s, version ' % cmd):
            version = err.split(' ')[2]
        return version
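
# For reference, these tools print a version banner on stderr in the form
# '<binary>, version <X.Y.Z> (branch: ...)'; the shape below is inferred from
# the parsing above, not captured from a live container:
#
#   'node_exporter, version 0.18.1 (branch: HEAD, ...)'.split(' ')[2]
#   # -> '0.18.1'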

##################################


def populate_files(config_dir, config_files, uid, gid):
    # type: (str, Dict, int, int) -> None
    """create config files for different services"""
    for fname in config_files:
        config_file = os.path.join(config_dir, fname)
        config_content = dict_get_join(config_files, fname)
        logger.info('Write file: %s' % (config_file))
        with open(config_file, 'w', encoding='utf-8') as f:
            os.fchown(f.fileno(), uid, gid)
            os.fchmod(f.fileno(), 0o600)
            f.write(config_content)


class NFSGanesha(object):
    """Defines an NFS-Ganesha container"""

    daemon_type = 'nfs'
    entrypoint = '/usr/bin/ganesha.nfsd'
    daemon_args = ['-F', '-L', 'STDERR']

    required_files = ['ganesha.conf']

    port_map = {
        'nfs': 2049,
    }

    def __init__(self,
                 ctx,
                 fsid,
                 daemon_id,
                 config_json,
                 image=DEFAULT_IMAGE):
        # type: (CephadmContext, str, Union[int, str], Dict, str) -> None
        self.ctx = ctx
        self.fsid = fsid
        self.daemon_id = daemon_id
        self.image = image

        # config-json options
        self.pool = dict_get(config_json, 'pool', require=True)
        self.namespace = dict_get(config_json, 'namespace')
        self.userid = dict_get(config_json, 'userid')
        self.extra_args = dict_get(config_json, 'extra_args', [])
        self.files = dict_get(config_json, 'files', {})
        self.rgw = dict_get(config_json, 'rgw', {})

        # validate the supplied args
        self.validate()

    @classmethod
    def init(cls, ctx, fsid, daemon_id):
        # type: (CephadmContext, str, Union[int, str]) -> NFSGanesha
        return cls(ctx, fsid, daemon_id, get_parm(ctx.config_json), ctx.image)

    def get_container_mounts(self, data_dir):
        # type: (str) -> Dict[str, str]
        mounts = dict()
        mounts[os.path.join(data_dir, 'config')] = '/etc/ceph/ceph.conf:z'
        mounts[os.path.join(data_dir, 'keyring')] = '/etc/ceph/keyring:z'
        mounts[os.path.join(data_dir, 'etc/ganesha')] = '/etc/ganesha:z'
        if self.rgw:
            cluster = self.rgw.get('cluster', 'ceph')
            rgw_user = self.rgw.get('user', 'admin')
            mounts[os.path.join(data_dir, 'keyring.rgw')] = \
                '/var/lib/ceph/radosgw/%s-%s/keyring:z' % (cluster, rgw_user)
        return mounts

    @staticmethod
    def get_container_envs():
        # type: () -> List[str]
        envs = [
            'CEPH_CONF=%s' % ('/etc/ceph/ceph.conf')
        ]
        return envs

    @staticmethod
    def get_version(ctx, container_id):
        # type: (CephadmContext, str) -> Optional[str]
        version = None
        out, err, code = call(ctx,
                              [ctx.container_engine.path, 'exec', container_id,
                               NFSGanesha.entrypoint, '-v'],
                              verbosity=CallVerbosity.DEBUG)
        if code == 0:
            match = re.search(r'NFS-Ganesha Release\s*=\s*[V]*([\d.]+)', out)
            if match:
                version = match.group(1)
        return version

    def validate(self):
        # type: () -> None
        if not is_fsid(self.fsid):
            raise Error('not an fsid: %s' % self.fsid)
        if not self.daemon_id:
            raise Error('invalid daemon_id: %s' % self.daemon_id)
        if not self.image:
            raise Error('invalid image: %s' % self.image)

        # check for the required files
        if self.required_files:
            for fname in self.required_files:
                if fname not in self.files:
                    raise Error('required file missing from config-json: %s' % fname)

        # check for an RGW config
        if self.rgw:
            if not self.rgw.get('keyring'):
                raise Error('RGW keyring is missing')
            if not self.rgw.get('user'):
                raise Error('RGW user is missing')

    def get_daemon_name(self):
        # type: () -> str
        return '%s.%s' % (self.daemon_type, self.daemon_id)

    def get_container_name(self, desc=None):
        # type: (Optional[str]) -> str
        cname = 'ceph-%s-%s' % (self.fsid, self.get_daemon_name())
        if desc:
            cname = '%s-%s' % (cname, desc)
        return cname

    def get_daemon_args(self):
        # type: () -> List[str]
        return self.daemon_args + self.extra_args

    def create_daemon_dirs(self, data_dir, uid, gid):
        # type: (str, int, int) -> None
        """Create files under the container data dir"""
        if not os.path.isdir(data_dir):
            raise OSError('data_dir is not a directory: %s' % (data_dir))

        logger.info('Creating ganesha config...')

        # create the ganesha conf dir
        config_dir = os.path.join(data_dir, 'etc/ganesha')
        makedirs(config_dir, uid, gid, 0o755)

        # populate files from the config-json
        populate_files(config_dir, self.files, uid, gid)

        # write the RGW keyring
        if self.rgw:
            keyring_path = os.path.join(data_dir, 'keyring.rgw')
            with open(keyring_path, 'w') as f:
                os.fchmod(f.fileno(), 0o600)
                os.fchown(f.fileno(), uid, gid)
                f.write(self.rgw.get('keyring', ''))
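
# A config-json for this daemon, as implied by __init__() and validate()
# above, looks roughly like this (field values are illustrative only):
#
#   {
#       "pool": "nfs-ganesha",             # required
#       "namespace": "nfs-ns",             # optional
#       "userid": "admin",                 # optional
#       "extra_args": [],                  # optional
#       "files": {"ganesha.conf": "..."},  # ganesha.conf is required
#       "rgw": {"keyring": "...", "user": "admin"}  # optional
#   }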

##################################


class CephIscsi(object):
    """Defines a Ceph-Iscsi container"""

    daemon_type = 'iscsi'
    entrypoint = '/usr/bin/rbd-target-api'

    required_files = ['iscsi-gateway.cfg']

    def __init__(self,
                 ctx,
                 fsid,
                 daemon_id,
                 config_json,
                 image=DEFAULT_IMAGE):
        # type: (CephadmContext, str, Union[int, str], Dict, str) -> None
        self.ctx = ctx
        self.fsid = fsid
        self.daemon_id = daemon_id
        self.image = image

        # config-json options
        self.files = dict_get(config_json, 'files', {})

        # validate the supplied args
        self.validate()

    @classmethod
    def init(cls, ctx, fsid, daemon_id):
        # type: (CephadmContext, str, Union[int, str]) -> CephIscsi
        return cls(ctx, fsid, daemon_id,
                   get_parm(ctx.config_json), ctx.image)

    @staticmethod
    def get_container_mounts(data_dir, log_dir):
        # type: (str, str) -> Dict[str, str]
        mounts = dict()
        mounts[os.path.join(data_dir, 'config')] = '/etc/ceph/ceph.conf:z'
        mounts[os.path.join(data_dir, 'keyring')] = '/etc/ceph/keyring:z'
        mounts[os.path.join(data_dir, 'iscsi-gateway.cfg')] = '/etc/ceph/iscsi-gateway.cfg:z'
        mounts[os.path.join(data_dir, 'configfs')] = '/sys/kernel/config'
        mounts[log_dir] = '/var/log/rbd-target-api:z'
        mounts['/dev'] = '/dev'
        return mounts

    @staticmethod
    def get_container_binds():
        # type: () -> List[List[str]]
        binds = []
        lib_modules = ['type=bind',
                       'source=/lib/modules',
                       'destination=/lib/modules',
                       'ro=true']
        binds.append(lib_modules)
        return binds

    @staticmethod
    def get_version(ctx, container_id):
        # type: (CephadmContext, str) -> Optional[str]
        version = None
        out, err, code = call(ctx,
                              [ctx.container_engine.path, 'exec', container_id,
                               '/usr/bin/python3', '-c', "import pkg_resources; print(pkg_resources.require('ceph_iscsi')[0].version)"],
                              verbosity=CallVerbosity.DEBUG)
        if code == 0:
            version = out.strip()
        return version

    def validate(self):
        # type: () -> None
        if not is_fsid(self.fsid):
            raise Error('not an fsid: %s' % self.fsid)
        if not self.daemon_id:
            raise Error('invalid daemon_id: %s' % self.daemon_id)
        if not self.image:
            raise Error('invalid image: %s' % self.image)

        # check for the required files
        if self.required_files:
            for fname in self.required_files:
                if fname not in self.files:
                    raise Error('required file missing from config-json: %s' % fname)

    def get_daemon_name(self):
        # type: () -> str
        return '%s.%s' % (self.daemon_type, self.daemon_id)

    def get_container_name(self, desc=None):
        # type: (Optional[str]) -> str
        cname = 'ceph-%s-%s' % (self.fsid, self.get_daemon_name())
        if desc:
            cname = '%s-%s' % (cname, desc)
        return cname

    def create_daemon_dirs(self, data_dir, uid, gid):
        # type: (str, int, int) -> None
        """Create files under the container data dir"""
        if not os.path.isdir(data_dir):
            raise OSError('data_dir is not a directory: %s' % (data_dir))

        logger.info('Creating ceph-iscsi config...')
        configfs_dir = os.path.join(data_dir, 'configfs')
        makedirs(configfs_dir, uid, gid, 0o755)

        # populate files from the config-json
        populate_files(data_dir, self.files, uid, gid)

    @staticmethod
    def configfs_mount_umount(data_dir, mount=True):
        # type: (str, bool) -> List[str]
        mount_path = os.path.join(data_dir, 'configfs')
        if mount:
            cmd = 'if ! grep -qs {0} /proc/mounts; then ' \
                  'mount -t configfs none {0}; fi'.format(mount_path)
        else:
            cmd = 'if grep -qs {0} /proc/mounts; then ' \
                  'umount {0}; fi'.format(mount_path)
        return cmd.split()

    def get_tcmu_runner_container(self):
        # type: () -> CephContainer
        tcmu_container = get_container(self.ctx, self.fsid, self.daemon_type, self.daemon_id)
        tcmu_container.entrypoint = '/usr/bin/tcmu-runner'
        tcmu_container.cname = self.get_container_name(desc='tcmu')
        # remove extra container args for tcmu container.
        # extra args could cause issue with forking service type
        tcmu_container.container_args = []
        return tcmu_container

##################################


class HAproxy(object):
    """Defines an HAproxy container"""
    daemon_type = 'haproxy'
    required_files = ['haproxy.cfg']
    default_image = DEFAULT_HAPROXY_IMAGE

    def __init__(self,
                 ctx: CephadmContext,
                 fsid: str, daemon_id: Union[int, str],
                 config_json: Dict, image: str) -> None:
        self.ctx = ctx
        self.fsid = fsid
        self.daemon_id = daemon_id
        self.image = image

        # config-json options
        self.files = dict_get(config_json, 'files', {})

        self.validate()

    @classmethod
    def init(cls, ctx: CephadmContext,
             fsid: str, daemon_id: Union[int, str]) -> 'HAproxy':
        return cls(ctx, fsid, daemon_id, get_parm(ctx.config_json),
                   ctx.image)

    def create_daemon_dirs(self, data_dir: str, uid: int, gid: int) -> None:
        """Create files under the container data dir"""
        if not os.path.isdir(data_dir):
            raise OSError('data_dir is not a directory: %s' % (data_dir))

        # create additional directories in data dir for HAproxy to use
        if not os.path.isdir(os.path.join(data_dir, 'haproxy')):
            makedirs(os.path.join(data_dir, 'haproxy'), uid, gid, DATA_DIR_MODE)

        data_dir = os.path.join(data_dir, 'haproxy')
        populate_files(data_dir, self.files, uid, gid)

    def get_daemon_args(self) -> List[str]:
        return ['haproxy', '-f', '/var/lib/haproxy/haproxy.cfg']

    def validate(self):
        # type: () -> None
        if not is_fsid(self.fsid):
            raise Error('not an fsid: %s' % self.fsid)
        if not self.daemon_id:
            raise Error('invalid daemon_id: %s' % self.daemon_id)
        if not self.image:
            raise Error('invalid image: %s' % self.image)

        # check for the required files
        if self.required_files:
            for fname in self.required_files:
                if fname not in self.files:
                    raise Error('required file missing from config-json: %s' % fname)

    def get_daemon_name(self):
        # type: () -> str
        return '%s.%s' % (self.daemon_type, self.daemon_id)

    def get_container_name(self, desc=None):
        # type: (Optional[str]) -> str
        cname = 'ceph-%s-%s' % (self.fsid, self.get_daemon_name())
        if desc:
            cname = '%s-%s' % (cname, desc)
        return cname

    def extract_uid_gid_haproxy(self) -> Tuple[int, int]:
        # better directory for this?
        return extract_uid_gid(self.ctx, file_path='/var/lib')

    @staticmethod
    def get_container_mounts(data_dir: str) -> Dict[str, str]:
        mounts = dict()
        mounts[os.path.join(data_dir, 'haproxy')] = '/var/lib/haproxy'
        return mounts

    @staticmethod
    def get_sysctl_settings() -> List[str]:
        return [
            '# IP forwarding',
            'net.ipv4.ip_forward = 1',
        ]

##################################


class Keepalived(object):
    """Defines a Keepalived container"""
    daemon_type = 'keepalived'
    required_files = ['keepalived.conf']
    default_image = DEFAULT_KEEPALIVED_IMAGE

    def __init__(self,
                 ctx: CephadmContext,
                 fsid: str, daemon_id: Union[int, str],
                 config_json: Dict, image: str) -> None:
        self.ctx = ctx
        self.fsid = fsid
        self.daemon_id = daemon_id
        self.image = image

        # config-json options
        self.files = dict_get(config_json, 'files', {})

        self.validate()

    @classmethod
    def init(cls, ctx: CephadmContext, fsid: str,
             daemon_id: Union[int, str]) -> 'Keepalived':
        return cls(ctx, fsid, daemon_id,
                   get_parm(ctx.config_json), ctx.image)

    def create_daemon_dirs(self, data_dir: str, uid: int, gid: int) -> None:
        """Create files under the container data dir"""
        if not os.path.isdir(data_dir):
            raise OSError('data_dir is not a directory: %s' % (data_dir))

        # create additional directories in data dir for keepalived to use
        if not os.path.isdir(os.path.join(data_dir, 'keepalived')):
            makedirs(os.path.join(data_dir, 'keepalived'), uid, gid, DATA_DIR_MODE)

        # populate files from the config-json
        populate_files(data_dir, self.files, uid, gid)

    def validate(self):
        # type: () -> None
        if not is_fsid(self.fsid):
            raise Error('not an fsid: %s' % self.fsid)
        if not self.daemon_id:
            raise Error('invalid daemon_id: %s' % self.daemon_id)
        if not self.image:
            raise Error('invalid image: %s' % self.image)

        # check for the required files
        if self.required_files:
            for fname in self.required_files:
                if fname not in self.files:
                    raise Error('required file missing from config-json: %s' % fname)

    def get_daemon_name(self):
        # type: () -> str
        return '%s.%s' % (self.daemon_type, self.daemon_id)

    def get_container_name(self, desc=None):
        # type: (Optional[str]) -> str
        cname = 'ceph-%s-%s' % (self.fsid, self.get_daemon_name())
        if desc:
            cname = '%s-%s' % (cname, desc)
        return cname

    @staticmethod
    def get_container_envs():
        # type: () -> List[str]
        envs = [
            'KEEPALIVED_AUTOCONF=false',
            'KEEPALIVED_CONF=/etc/keepalived/keepalived.conf',
            'KEEPALIVED_CMD=/usr/sbin/keepalived -n -l -f /etc/keepalived/keepalived.conf',
            'KEEPALIVED_DEBUG=false'
        ]
        return envs

    @staticmethod
    def get_sysctl_settings() -> List[str]:
        return [
            '# IP forwarding and non-local bind',
            'net.ipv4.ip_forward = 1',
            'net.ipv4.ip_nonlocal_bind = 1',
        ]

    def extract_uid_gid_keepalived(self) -> Tuple[int, int]:
        # better directory for this?
        return extract_uid_gid(self.ctx, file_path='/var/lib')

    @staticmethod
    def get_container_mounts(data_dir: str) -> Dict[str, str]:
        mounts = dict()
        mounts[os.path.join(data_dir, 'keepalived.conf')] = '/etc/keepalived/keepalived.conf'
        return mounts

##################################


class CustomContainer(object):
    """Defines a custom container"""
    daemon_type = 'container'

    def __init__(self,
                 fsid: str, daemon_id: Union[int, str],
                 config_json: Dict, image: str) -> None:
        self.fsid = fsid
        self.daemon_id = daemon_id
        self.image = image

        # config-json options
        self.entrypoint = dict_get(config_json, 'entrypoint')
        self.uid = dict_get(config_json, 'uid', 65534)  # nobody
        self.gid = dict_get(config_json, 'gid', 65534)  # nobody
        self.volume_mounts = dict_get(config_json, 'volume_mounts', {})
        self.args = dict_get(config_json, 'args', [])
        self.envs = dict_get(config_json, 'envs', [])
        self.privileged = dict_get(config_json, 'privileged', False)
        self.bind_mounts = dict_get(config_json, 'bind_mounts', [])
        self.ports = dict_get(config_json, 'ports', [])
        self.dirs = dict_get(config_json, 'dirs', [])
        self.files = dict_get(config_json, 'files', {})

    @classmethod
    def init(cls, ctx: CephadmContext,
             fsid: str, daemon_id: Union[int, str]) -> 'CustomContainer':
        return cls(fsid, daemon_id,
                   get_parm(ctx.config_json), ctx.image)

    def create_daemon_dirs(self, data_dir: str, uid: int, gid: int) -> None:
        """
        Create dirs/files below the container data directory.
        """
        logger.info('Creating custom container configuration '
                    'dirs/files in {} ...'.format(data_dir))

        if not os.path.isdir(data_dir):
            raise OSError('data_dir is not a directory: %s' % data_dir)

        for dir_path in self.dirs:
            logger.info('Creating directory: {}'.format(dir_path))
            dir_path = os.path.join(data_dir, dir_path.strip('/'))
            makedirs(dir_path, uid, gid, 0o755)

        for file_path in self.files:
            logger.info('Creating file: {}'.format(file_path))
            content = dict_get_join(self.files, file_path)
            file_path = os.path.join(data_dir, file_path.strip('/'))
            with open(file_path, 'w', encoding='utf-8') as f:
                os.fchown(f.fileno(), uid, gid)
                os.fchmod(f.fileno(), 0o600)
                f.write(content)

    def get_daemon_args(self) -> List[str]:
        return []

    def get_container_args(self) -> List[str]:
        return self.args

    def get_container_envs(self) -> List[str]:
        return self.envs

    def get_container_mounts(self, data_dir: str) -> Dict[str, str]:
        """
        Get the volume mounts. Relative source paths will be located below
        `/var/lib/ceph/<cluster-fsid>/<daemon-name>`.

        Example:
        {
            /foo/conf: /conf
            foo/conf: /conf
        }
        becomes
        {
            /foo/conf: /conf
            /var/lib/ceph/<cluster-fsid>/<daemon-name>/foo/conf: /conf
        }
        """
        mounts = {}
        for source, destination in self.volume_mounts.items():
            source = os.path.join(data_dir, source)
            mounts[source] = destination
        return mounts

    def get_container_binds(self, data_dir: str) -> List[List[str]]:
        """
        Get the bind mounts. Relative `source=...` paths will be located below
        `/var/lib/ceph/<cluster-fsid>/<daemon-name>`.

        Example:
        [
            'type=bind',
            'source=lib/modules',
            'destination=/lib/modules',
            'ro=true'
        ]
        becomes
        [
            ...
            'source=/var/lib/ceph/<cluster-fsid>/<daemon-name>/lib/modules',
            ...
        ]
        """
        binds = self.bind_mounts.copy()
        for bind in binds:
            for index, value in enumerate(bind):
                match = re.match(r'^source=(.+)$', value)
                if match:
                    bind[index] = 'source={}'.format(os.path.join(
                        data_dir, match.group(1)))
        return binds
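
# As read back by __init__() above, a custom-container config-json can carry
# the following keys (all optional; the values here are illustrative):
#
#   {
#       "entrypoint": "/usr/bin/my-daemon",
#       "uid": 65534, "gid": 65534,
#       "volume_mounts": {"data": "/var/lib/my-daemon"},
#       "args": ["--net=host"],
#       "envs": ["FOO=bar"],
#       "privileged": false,
#       "bind_mounts": [["type=bind", "source=lib/modules",
#                        "destination=/lib/modules", "ro=true"]],
#       "ports": [8080],
#       "dirs": ["data"],
#       "files": {"my-daemon.conf": "..."}
#   }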

##################################


def touch(file_path: str, uid: Optional[int] = None, gid: Optional[int] = None) -> None:
    Path(file_path).touch()
    # explicit None checks so uid/gid 0 (root) is not silently skipped
    if uid is not None and gid is not None:
        os.chown(file_path, uid, gid)


##################################


def dict_get(d: Dict, key: str, default: Any = None, require: bool = False) -> Any:
    """
    Helper function to get a key from a dictionary.
    :param d: The dictionary to process.
    :param key: The name of the key to get.
    :param default: The default value in case the key does not
        exist. Default is `None`.
    :param require: Set to `True` if the key is required. An
        exception will be raised if the key does not exist in
        the given dictionary.
    :return: Returns the value of the given key.
    :raises: :exc:`Error` if the given key does not exist
        and `require` is set to `True`.
    """
    if require and key not in d.keys():
        raise Error('{} missing from dict'.format(key))
    return d.get(key, default)  # type: ignore
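
# Usage sketch:
#
#   dict_get({'a': 1}, 'a')                # -> 1
#   dict_get({'a': 1}, 'b', default=2)     # -> 2
#   dict_get({'a': 1}, 'b', require=True)  # raises Error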

##################################


def dict_get_join(d: Dict, key: str) -> Any:
    """
    Helper function to get the value of a given key from a dictionary.
    `List` values will be converted to a string by joining them with a
    line break.
    :param d: The dictionary to process.
    :param key: The name of the key to get.
    :return: Returns the value of the given key. If it was a `list`, it
        will be joined with a line break.
    """
    value = d.get(key)
    if isinstance(value, list):
        value = '\n'.join(map(str, value))
    return value
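
# Usage sketch:
#
#   dict_get_join({'files': ['line1', 'line2']}, 'files')  # -> 'line1\nline2'
#   dict_get_join({'files': 'as-is'}, 'files')             # -> 'as-is'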

##################################


def get_supported_daemons():
    # type: () -> List[str]
    supported_daemons = list(Ceph.daemons)
    supported_daemons.extend(Monitoring.components)
    supported_daemons.append(NFSGanesha.daemon_type)
    supported_daemons.append(CephIscsi.daemon_type)
    supported_daemons.append(CustomContainer.daemon_type)
    supported_daemons.append(CephadmDaemon.daemon_type)
    supported_daemons.append(HAproxy.daemon_type)
    supported_daemons.append(Keepalived.daemon_type)
    assert len(supported_daemons) == len(set(supported_daemons))
    return supported_daemons

##################################


class PortOccupiedError(Error):
    pass


def attempt_bind(ctx, s, address, port):
    # type: (CephadmContext, socket.socket, str, int) -> None
    try:
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        s.bind((address, port))
    except OSError as e:
        if e.errno == errno.EADDRINUSE:
            msg = 'Cannot bind to IP %s port %d: %s' % (address, port, e)
            logger.warning(msg)
            raise PortOccupiedError(msg)
        else:
            raise Error(e)
    except Exception as e:
        raise Error(e)
    finally:
        s.close()


def port_in_use(ctx, port_num):
    # type: (CephadmContext, int) -> bool
    """Detect whether a port is in use on the local machine - IPv4 and IPv6"""
    logger.info('Verifying port %d ...' % port_num)

    def _port_in_use(af: socket.AddressFamily, address: str) -> bool:
        try:
            s = socket.socket(af, socket.SOCK_STREAM)
            attempt_bind(ctx, s, address, port_num)
        except PortOccupiedError:
            return True
        except OSError as e:
            if e.errno in (errno.EAFNOSUPPORT, errno.EADDRNOTAVAIL):
                # Ignore EAFNOSUPPORT and EADDRNOTAVAIL as two interfaces are
                # being tested here and one might be intentionally disabled.
                # In that case no error should be raised.
                return False
            else:
                raise e
        return False
    return any(_port_in_use(af, address) for af, address in (
        (socket.AF_INET, '0.0.0.0'),
        (socket.AF_INET6, '::')
    ))


def check_ip_port(ctx, ip, port):
    # type: (CephadmContext, str, int) -> None
    if not ctx.skip_ping_check:
        logger.info('Verifying IP %s port %d ...' % (ip, port))
        if is_ipv6(ip):
            s = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
            ip = unwrap_ipv6(ip)
        else:
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        attempt_bind(ctx, s, ip, port)
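
# The probe works by actually binding the socket: a successful bind (closed
# again immediately) means the port is free; EADDRINUSE surfaces as
# PortOccupiedError. Rough usage, with illustrative values:
#
#   if port_in_use(ctx, 9100):
#       raise Error('port 9100 is already in use')
#   check_ip_port(ctx, '10.0.0.1', 6789)  # raises PortOccupiedError if taken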

##################################


# this is an abbreviated version of
# https://github.com/benediktschmitt/py-filelock/blob/master/filelock.py
# that drops all of the compatibility (this is Unix/Linux only).

class Timeout(TimeoutError):
    """
    Raised when the lock could not be acquired in *timeout*
    seconds.
    """

    def __init__(self, lock_file: str) -> None:
        #: The path of the file lock.
        self.lock_file = lock_file
        return None

    def __str__(self) -> str:
        temp = "The file lock '{}' could not be acquired."\
               .format(self.lock_file)
        return temp


class _Acquire_ReturnProxy(object):
    def __init__(self, lock: 'FileLock') -> None:
        self.lock = lock
        return None

    def __enter__(self) -> 'FileLock':
        return self.lock

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        self.lock.release()
        return None


class FileLock(object):
    def __init__(self, ctx: CephadmContext, name: str, timeout: int = -1) -> None:
        if not os.path.exists(LOCK_DIR):
            os.mkdir(LOCK_DIR, 0o700)
        self._lock_file = os.path.join(LOCK_DIR, name + '.lock')
        self.ctx = ctx

        # The file descriptor for the *_lock_file* as it is returned by the
        # os.open() function.
        # This file lock is only NOT None, if the object currently holds the
        # lock.
        self._lock_file_fd: Optional[int] = None
        self.timeout = timeout
        # The lock counter is used for implementing the nested locking
        # mechanism. Whenever the lock is acquired, the counter is increased and
        # the lock is only released, when this value is 0 again.
        self._lock_counter = 0
        return None

    @property
    def is_locked(self) -> bool:
        return self._lock_file_fd is not None

    def acquire(self, timeout: Optional[int] = None, poll_intervall: float = 0.05) -> _Acquire_ReturnProxy:
        """
        Acquires the file lock or fails with a :exc:`Timeout` error.
        .. code-block:: python
            # You can use this method in the context manager (recommended)
            with lock.acquire():
                pass
            # Or use an equivalent try-finally construct:
            lock.acquire()
            try:
                pass
            finally:
                lock.release()
        :arg float timeout:
            The maximum time waited for the file lock.
            If ``timeout < 0``, there is no timeout and this method will
            block until the lock could be acquired.
            If ``timeout`` is None, the default :attr:`~timeout` is used.
        :arg float poll_intervall:
            We check once in *poll_intervall* seconds if we can acquire the
            file lock.
        :raises Timeout:
            if the lock could not be acquired in *timeout* seconds.
        .. versionchanged:: 2.0.0
            This method now returns a *proxy* object instead of *self*,
            so that it can be used in a with statement without side effects.
        """

        # Use the default timeout, if no timeout is provided.
        if timeout is None:
            timeout = self.timeout

        # Increment the number right at the beginning.
        # We can still undo it, if something fails.
        self._lock_counter += 1

        lock_id = id(self)
        lock_filename = self._lock_file
        start_time = time.time()
        try:
            while True:
                if not self.is_locked:
                    logger.debug('Acquiring lock %s on %s', lock_id,
                                 lock_filename)
                    self._acquire()

                if self.is_locked:
                    logger.debug('Lock %s acquired on %s', lock_id,
                                 lock_filename)
                    break
                elif timeout >= 0 and time.time() - start_time > timeout:
                    logger.warning('Timeout acquiring lock %s on %s', lock_id,
                                   lock_filename)
                    raise Timeout(self._lock_file)
                else:
                    logger.debug(
                        'Lock %s not acquired on %s, waiting %s seconds ...',
                        lock_id, lock_filename, poll_intervall
                    )
                    time.sleep(poll_intervall)
        except Exception:
            # Something did go wrong, so decrement the counter.
            self._lock_counter = max(0, self._lock_counter - 1)

            raise
        return _Acquire_ReturnProxy(lock=self)

    def release(self, force: bool = False) -> None:
        """
        Releases the file lock.
        Please note, that the lock is only completely released, if the lock
        counter is 0.
        Also note, that the lock file itself is not automatically deleted.
        :arg bool force:
            If true, the lock counter is ignored and the lock is released in
            every case.
        """
        if self.is_locked:
            self._lock_counter -= 1

            if self._lock_counter == 0 or force:
                # lock_id = id(self)
                # lock_filename = self._lock_file

                # Can't log in shutdown:
                #   File "/usr/lib64/python3.9/logging/__init__.py", line 1175, in _open
                #   NameError: name 'open' is not defined
                # logger.debug('Releasing lock %s on %s', lock_id, lock_filename)
                self._release()
                self._lock_counter = 0
                # logger.debug('Lock %s released on %s', lock_id, lock_filename)

        return None

    def __enter__(self) -> 'FileLock':
        self.acquire()
        return self

    def __exit__(self, exc_type: Any, exc_value: Any, traceback: Any) -> None:
        self.release()
        return None

    def __del__(self) -> None:
        self.release(force=True)
        return None

    def _acquire(self) -> None:
        open_mode = os.O_RDWR | os.O_CREAT | os.O_TRUNC
        fd = os.open(self._lock_file, open_mode)

        try:
            fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except (IOError, OSError):
            os.close(fd)
        else:
            self._lock_file_fd = fd
        return None

    def _release(self) -> None:
        # Do not remove the lockfile:
        #
        #   https://github.com/benediktschmitt/py-filelock/issues/31
        #   https://stackoverflow.com/questions/17708885/flock-removing-locked-file-without-race-condition
        fd = self._lock_file_fd
        self._lock_file_fd = None
        fcntl.flock(fd, fcntl.LOCK_UN)  # type: ignore
        os.close(fd)  # type: ignore
        return None
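
# Typical use is re-entrant locking via the context manager; a small sketch
# (the fsid value is illustrative):
#
#   lock = FileLock(ctx, '11111111-2222-3333-4444-555555555555')
#   with lock:        # flock(2) on /run/cephadm/<fsid>.lock
#       with lock:    # nested: only bumps the counter
#           pass      # inner exit decrements, outer exit unlocks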


##################################
# Popen wrappers, lifted from ceph-volume

class CallVerbosity(Enum):
    SILENT = 0
    # log stdout/stderr to logger.debug
    DEBUG = 1
    # On a non-zero exit status, it will forcefully set
    # logging ON for the terminal
    VERBOSE_ON_FAILURE = 2
    # log at info (instead of debug) level.
    VERBOSE = 3


if sys.version_info < (3, 8):
    import itertools
    import threading
    import warnings
    from asyncio import events

    class ThreadedChildWatcher(asyncio.AbstractChildWatcher):
        """Threaded child watcher implementation.
        The watcher uses a thread per process
        to wait for the process to finish.
        It doesn't require subscription to POSIX signals,
        but thread creation is not free.
        The watcher has O(1) complexity, its performance doesn't depend
        on the number of spawned processes.
        """

        def __init__(self) -> None:
            self._pid_counter = itertools.count(0)
            self._threads = {}

        def is_active(self):
            return True

        def close(self):
            self._join_threads()

        def _join_threads(self):
            """Internal: Join all non-daemon threads"""
            threads = [thread for thread in list(self._threads.values())
                       if thread.is_alive() and not thread.daemon]
            for thread in threads:
                thread.join()

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            pass

        def __del__(self, _warn=warnings.warn):
            threads = [thread for thread in list(self._threads.values())
                       if thread.is_alive()]
            if threads:
                _warn(f'{self.__class__} has registered but not finished child processes',
                      ResourceWarning,
                      source=self)

        def add_child_handler(self, pid, callback, *args):
            loop = events.get_event_loop()
            thread = threading.Thread(target=self._do_waitpid,
                                      name=f'waitpid-{next(self._pid_counter)}',
                                      args=(loop, pid, callback, args),
                                      daemon=True)
            self._threads[pid] = thread
            thread.start()

        def remove_child_handler(self, pid):
            # asyncio never calls remove_child_handler() !!!
            # The method is a no-op but is implemented because the
            # abstract base class requires it
            return True

        def attach_loop(self, loop):
            pass

        def _do_waitpid(self, loop, expected_pid, callback, args):
            assert expected_pid > 0

            try:
                pid, status = os.waitpid(expected_pid, 0)
            except ChildProcessError:
                # The child process is already reaped
                # (may happen if waitpid() is called elsewhere).
                pid = expected_pid
                returncode = 255
                logger.warning(
                    'Unknown child process pid %d, will report returncode 255',
                    pid)
            else:
                if os.WIFEXITED(status):
                    returncode = os.WEXITSTATUS(status)
                elif os.WIFSIGNALED(status):
                    returncode = -os.WTERMSIG(status)
                else:
                    raise ValueError(f'unknown wait status {status}')
                if loop.get_debug():
                    logger.debug('process %s exited with returncode %s',
                                 expected_pid, returncode)

            if loop.is_closed():
                logger.warning('Loop %r that handles pid %r is closed', loop, pid)
            else:
                loop.call_soon_threadsafe(callback, pid, returncode, *args)

            self._threads.pop(expected_pid)

    # unlike SafeChildWatcher which handles SIGCHLD in the main thread,
    # ThreadedChildWatcher runs in a separate thread, hence allows us to
    # run create_subprocess_exec() in non-main thread, see
    # https://bugs.python.org/issue35621
    asyncio.set_child_watcher(ThreadedChildWatcher())


try:
    from asyncio import run as async_run  # type: ignore[attr-defined]
except ImportError:
    def async_run(coro):  # type: ignore
        loop = asyncio.new_event_loop()
        try:
            asyncio.set_event_loop(loop)
            return loop.run_until_complete(coro)
        finally:
            try:
                loop.run_until_complete(loop.shutdown_asyncgens())
            finally:
                asyncio.set_event_loop(None)
                loop.close()


def call(ctx: CephadmContext,
         command: List[str],
         desc: Optional[str] = None,
         verbosity: CallVerbosity = CallVerbosity.VERBOSE_ON_FAILURE,
         timeout: Optional[int] = DEFAULT_TIMEOUT,
         **kwargs: Any) -> Tuple[str, str, int]:
    """
    Wrap subprocess.Popen to

    - log stdout/stderr to a logger,
    - decode utf-8
    - cleanly return out, err, returncode

    :param timeout: timeout in seconds
    """

    prefix = command[0] if desc is None else desc
    if prefix:
        prefix += ': '
    timeout = timeout or ctx.timeout

    logger.debug('Running command: %s' % ' '.join(command))

    async def tee(reader: asyncio.StreamReader) -> str:
        collected = StringIO()
        async for line in reader:
            message = line.decode('utf-8')
            collected.write(message)
            if verbosity == CallVerbosity.VERBOSE:
                logger.info(prefix + message.rstrip())
            elif verbosity != CallVerbosity.SILENT:
                logger.debug(prefix + message.rstrip())
        return collected.getvalue()

    async def run_with_timeout() -> Tuple[str, str, int]:
        process = await asyncio.create_subprocess_exec(
            *command,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
            env=os.environ.copy())
        assert process.stdout
        assert process.stderr
        try:
            stdout, stderr = await asyncio.gather(tee(process.stdout),
                                                  tee(process.stderr))
            returncode = await asyncio.wait_for(process.wait(), timeout)
        except asyncio.TimeoutError:
            logger.info(prefix + f'timeout after {timeout} seconds')
            return '', '', 124
        else:
            return stdout, stderr, returncode

    stdout, stderr, returncode = async_run(run_with_timeout())
    if returncode != 0 and verbosity == CallVerbosity.VERBOSE_ON_FAILURE:
        logger.info('Non-zero exit code %d from %s',
                    returncode, ' '.join(command))
        for line in stdout.splitlines():
            logger.info(prefix + 'stdout ' + line)
        for line in stderr.splitlines():
            logger.info(prefix + 'stderr ' + line)
    return stdout, stderr, returncode
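
# Usage sketch: run a command, capture decoded output, and branch on the exit
# status (124 signals that the timeout fired, mirroring timeout(1)):
#
#   out, err, code = call(ctx, ['podman', 'ps'], timeout=30)
#   if code == 124:
#       raise TimeoutExpired('podman ps timed out')
#   elif code != 0:
#       raise Error(err)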


def call_throws(
        ctx: CephadmContext,
        command: List[str],
        desc: Optional[str] = None,
        verbosity: CallVerbosity = CallVerbosity.VERBOSE_ON_FAILURE,
        timeout: Optional[int] = DEFAULT_TIMEOUT,
        **kwargs: Any) -> Tuple[str, str, int]:
    out, err, ret = call(ctx, command, desc, verbosity, timeout, **kwargs)
    if ret:
        raise RuntimeError('Failed command: %s' % ' '.join(command))
    return out, err, ret


def call_timeout(ctx, command, timeout):
    # type: (CephadmContext, List[str], int) -> int
    logger.debug('Running command (timeout=%s): %s'
                 % (timeout, ' '.join(command)))

    def raise_timeout(command, timeout):
        # type: (List[str], int) -> NoReturn
        msg = 'Command `%s` timed out after %s seconds' % (command, timeout)
        logger.debug(msg)
        raise TimeoutExpired(msg)

    try:
        return subprocess.call(command, timeout=timeout, env=os.environ.copy())
    except subprocess.TimeoutExpired:
        raise_timeout(command, timeout)

##################################


def json_loads_retry(cli_func: Callable[[], str]) -> Any:
    for sleep_secs in [1, 4, 4]:
        try:
            return json.loads(cli_func())
        except json.JSONDecodeError:
            logger.debug('Invalid JSON. Retrying in %s seconds...' % sleep_secs)
            time.sleep(sleep_secs)
    return json.loads(cli_func())
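
# The retries (after 1s, 4s, 4s) cover daemons whose CLI briefly emits partial
# or empty output while starting up; the final attempt is allowed to raise.
# Sketch, with an illustrative command:
#
#   info = json_loads_retry(lambda: call_throws(ctx, ['ceph', 'status',
#                                                     '--format', 'json'])[0])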


def is_available(ctx, what, func):
    # type: (CephadmContext, str, Callable[[], bool]) -> None
    """
    Wait for a service to become available

    :param what: the name of the service
    :param func: the callable object that determines availability
    """
    retry = ctx.retry
    logger.info('Waiting for %s...' % what)
    num = 1
    while True:
        if func():
            logger.info('%s is available'
                        % what)
            break
        elif num > retry:
            raise Error('%s not available after %s tries'
                        % (what, retry))

        logger.info('%s not available, waiting (%s/%s)...'
                    % (what, num, retry))

        num += 1
        time.sleep(2)


def read_config(fn):
    # type: (Optional[str]) -> ConfigParser
    cp = ConfigParser()
    if fn:
        cp.read(fn)
    return cp


def pathify(p):
    # type: (str) -> str
    p = os.path.expanduser(p)
    return os.path.abspath(p)


def get_file_timestamp(fn):
    # type: (str) -> Optional[str]
    try:
        mt = os.path.getmtime(fn)
        return datetime.datetime.fromtimestamp(
            mt, tz=datetime.timezone.utc
        ).strftime(DATEFMT)
    except Exception:
        return None


def try_convert_datetime(s):
    # type: (str) -> Optional[str]
    # This is super irritating because
    #  1) podman and docker use different formats
    #  2) python's strptime can't parse either one
    #
    # I've seen:
    #  docker 18.09.7:  2020-03-03T09:21:43.636153304Z
    #  podman 1.7.0:    2020-03-03T15:52:30.136257504-06:00
    #                   2020-03-03 15:52:30.136257504 -0600 CST
    # (In the podman case, there is a different string format for
    # 'inspect' and 'inspect --format {{.Created}}'!!)

    # In *all* cases, the 9 digit second precision is too much for
    # python's strptime.  Shorten it to 6 digits.
    p = re.compile(r'(\.[\d]{6})[\d]*')
    s = p.sub(r'\1', s)

    # replace trailing Z with -0000, since (on python 3.6.8) it won't parse
    if s and s[-1] == 'Z':
        s = s[:-1] + '-0000'

    # cut off the redundant 'CST' part that strptime can't parse, if
    # present.
    v = s.split(' ')
    s = ' '.join(v[0:3])

    # try parsing with several format strings
    fmts = [
        '%Y-%m-%dT%H:%M:%S.%f%z',
        '%Y-%m-%d %H:%M:%S.%f %z',
    ]
    for f in fmts:
        try:
            # return timestamp normalized to UTC, rendered as DATEFMT.
            return datetime.datetime.strptime(s, f).astimezone(tz=datetime.timezone.utc).strftime(DATEFMT)
        except ValueError:
            pass
    return None
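
# Worked example: '2020-03-03T09:21:43.636153304Z' is first truncated to
# microsecond precision ('...43.636153Z'), the trailing 'Z' becomes '-0000',
# and strptime with '%Y-%m-%dT%H:%M:%S.%f%z' then yields
# '2020-03-03T09:21:43.636153Z' in DATEFMT (UTC).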


def _parse_podman_version(version_str):
    # type: (str) -> Tuple[int, ...]
    def to_int(val: str, org_e: Optional[Exception] = None) -> int:
        if not val and org_e:
            raise org_e
        try:
            return int(val)
        except ValueError as e:
            return to_int(val[0:-1], org_e or e)

    return tuple(map(to_int, version_str.split('.')))
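
# to_int() strips trailing non-numeric characters one at a time, so
# pre-release suffixes still parse:
#
#   _parse_podman_version('2.0.2')      # -> (2, 0, 2)
#   _parse_podman_version('2.1.0-dev')  # -> (2, 1, 0)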


def get_hostname():
    # type: () -> str
    return socket.gethostname()


def get_fqdn():
    # type: () -> str
    return socket.getfqdn() or socket.gethostname()


def get_arch():
    # type: () -> str
    return platform.uname().machine


def generate_service_id():
    # type: () -> str
    return get_hostname() + '.' + ''.join(random.choice(string.ascii_lowercase)
                                          for _ in range(6))


def generate_password():
    # type: () -> str
    return ''.join(random.choice(string.ascii_lowercase + string.digits)
                   for i in range(10))


def normalize_container_id(i):
    # type: (str) -> str
    # docker adds the sha256: prefix, but AFAICS both
    # docker (18.09.7 in bionic at least) and podman
    # always use sha256, so leave off the prefix
    # for consistency.
    prefix = 'sha256:'
    if i.startswith(prefix):
        i = i[len(prefix):]
    return i


def make_fsid():
    # type: () -> str
    return str(uuid.uuid1())


def is_fsid(s):
    # type: (str) -> bool
    try:
        uuid.UUID(s)
    except ValueError:
        return False
    return True
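
# is_fsid() accepts anything uuid.UUID() can parse:
#
#   is_fsid('00000000-0000-0000-0000-000000000000')  # -> True
#   is_fsid('not-a-uuid')                            # -> False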
1652
1653
1654 def validate_fsid(func: FuncT) -> FuncT:
1655 @wraps(func)
1656 def _validate_fsid(ctx: CephadmContext) -> Any:
1657 if 'fsid' in ctx and ctx.fsid:
1658 if not is_fsid(ctx.fsid):
1659 raise Error('not an fsid: %s' % ctx.fsid)
1660 return func(ctx)
1661 return cast(FuncT, _validate_fsid)
1662
1663
1664 def infer_fsid(func: FuncT) -> FuncT:
1665 """
1666 If we only find a single fsid in /var/lib/ceph/*, use that
1667 """
1668 @infer_config
1669 @wraps(func)
1670 def _infer_fsid(ctx: CephadmContext) -> Any:
1671 if 'fsid' in ctx and ctx.fsid:
1672 logger.debug('Using specified fsid: %s' % ctx.fsid)
1673 return func(ctx)
1674
1675 fsids = set()
1676
1677 cp = read_config(ctx.config)
1678 if cp.has_option('global', 'fsid'):
1679 fsids.add(cp.get('global', 'fsid'))
1680
1681 daemon_list = list_daemons(ctx, detail=False)
1682 for daemon in daemon_list:
1683 if not is_fsid(daemon['fsid']):
1684 # 'unknown' fsid
1685 continue
1686 elif 'name' not in ctx or not ctx.name:
1687 # ctx.name not specified
1688 fsids.add(daemon['fsid'])
1689 elif daemon['name'] == ctx.name:
1690 # ctx.name is a match
1691 fsids.add(daemon['fsid'])
1692 fsids = sorted(fsids)
1693
1694 if not fsids:
1695 # some commands do not always require an fsid
1696 pass
1697 elif len(fsids) == 1:
1698 logger.info('Inferring fsid %s' % fsids[0])
1699 ctx.fsid = fsids[0]
1700 else:
1701 raise Error('Cannot infer an fsid, one must be specified: %s' % fsids)
1702 return func(ctx)
1703
1704 return cast(FuncT, _infer_fsid)
1705
1706
1707 def infer_config(func: FuncT) -> FuncT:
1708 """
1709 If we find a MON daemon, use the config from that container
1710 """
1711 @wraps(func)
1712 def _infer_config(ctx: CephadmContext) -> Any:
1713 ctx.config = ctx.config if 'config' in ctx else None
1714 if ctx.config:
1715 logger.debug('Using specified config: %s' % ctx.config)
1716 return func(ctx)
1717 if 'fsid' in ctx and ctx.fsid:
1718 name = ctx.name if 'name' in ctx else None
1719 if not name:
1720 daemon_list = list_daemons(ctx, detail=False)
1721 for daemon in daemon_list:
1722 if daemon.get('name', '').startswith('mon.'):
1723 name = daemon['name']
1724 break
1725 if name:
1726 ctx.config = f'/var/lib/ceph/{ctx.fsid}/{name}/config'
1727 if ctx.config:
1728 logger.info('Inferring config %s' % ctx.config)
1729 elif os.path.exists(SHELL_DEFAULT_CONF):
1730 logger.debug('Using default config: %s' % SHELL_DEFAULT_CONF)
1731 ctx.config = SHELL_DEFAULT_CONF
1732 return func(ctx)
1733
1734 return cast(FuncT, _infer_config)
1735
1736
1737 def _get_default_image(ctx: CephadmContext) -> str:
1738 if DEFAULT_IMAGE_IS_MASTER:
1739 warn = """This is a development version of cephadm.
1740 For information regarding the latest stable release:
1741 https://docs.ceph.com/docs/{}/cephadm/install
1742 """.format(LATEST_STABLE_RELEASE)
1743 for line in warn.splitlines():
1744 logger.warning('{}{}{}'.format(termcolor.yellow, line, termcolor.end))
1745 return DEFAULT_IMAGE
1746
1747
1748 def infer_image(func: FuncT) -> FuncT:
1749 """
1750 Use the most recent ceph image
1751 """
1752 @wraps(func)
1753 def _infer_image(ctx: CephadmContext) -> Any:
1754 if not ctx.image:
1755 ctx.image = os.environ.get('CEPHADM_IMAGE')
1756 if not ctx.image:
1757 ctx.image = get_last_local_ceph_image(ctx, ctx.container_engine.path)
1758 if not ctx.image:
1759 ctx.image = _get_default_image(ctx)
1760 return func(ctx)
1761
1762 return cast(FuncT, _infer_image)
1763
1764
1765 def default_image(func: FuncT) -> FuncT:
1766 @wraps(func)
1767 def _default_image(ctx: CephadmContext) -> Any:
1768 if not ctx.image:
1769 if 'name' in ctx and ctx.name:
1770 type_ = ctx.name.split('.', 1)[0]
1771 if type_ in Monitoring.components:
1772 ctx.image = Monitoring.components[type_]['image']
1773 if type_ == 'haproxy':
1774 ctx.image = HAproxy.default_image
1775 if type_ == 'keepalived':
1776 ctx.image = Keepalived.default_image
1777 if not ctx.image:
1778 ctx.image = os.environ.get('CEPHADM_IMAGE')
1779 if not ctx.image:
1780 ctx.image = _get_default_image(ctx)
1781
1782 return func(ctx)
1783
1784 return cast(FuncT, _default_image)
1785
1786
1787 def get_last_local_ceph_image(ctx: CephadmContext, container_path: str) -> Optional[str]:
1788 """
1789 :return: The most recent local ceph image (already pulled)
1790 """
1791 out, _, _ = call_throws(ctx,
1792 [container_path, 'images',
1793 '--filter', 'label=ceph=True',
1794 '--filter', 'dangling=false',
1795 '--format', '{{.Repository}}@{{.Digest}}'])
1796 return _filter_last_local_ceph_image(out)
1797
1798
1799 def _filter_last_local_ceph_image(out):
1800 # type: (str) -> Optional[str]
1801 for image in out.splitlines():
1802 if image and not image.endswith('@'):
1803 logger.info('Using recent ceph image %s' % image)
1804 return image
1805 return None
1806
1807
1808 def write_tmp(s, uid, gid):
1809 # type: (str, int, int) -> IO[str]
1810 tmp_f = tempfile.NamedTemporaryFile(mode='w',
1811 prefix='ceph-tmp')
1812 os.fchown(tmp_f.fileno(), uid, gid)
1813 tmp_f.write(s)
1814 tmp_f.flush()
1815
1816 return tmp_f
1817
1818
1819 def makedirs(dir, uid, gid, mode):
1820 # type: (str, int, int, int) -> None
1821 if not os.path.exists(dir):
1822 os.makedirs(dir, mode=mode)
1823 else:
1824 os.chmod(dir, mode)
1825 os.chown(dir, uid, gid)
1826 os.chmod(dir, mode) # the above is masked by umask...
1827
1828
1829 def get_data_dir(fsid, data_dir, t, n):
1830 # type: (str, str, str, Union[int, str]) -> str
1831 return os.path.join(data_dir, fsid, '%s.%s' % (t, n))
1832
1833
1834 def get_log_dir(fsid, log_dir):
1835 # type: (str, str) -> str
1836 return os.path.join(log_dir, fsid)
1837
1838
1839 def make_data_dir_base(fsid, data_dir, uid, gid):
1840 # type: (str, str, int, int) -> str
1841 data_dir_base = os.path.join(data_dir, fsid)
1842 makedirs(data_dir_base, uid, gid, DATA_DIR_MODE)
1843 makedirs(os.path.join(data_dir_base, 'crash'), uid, gid, DATA_DIR_MODE)
1844 makedirs(os.path.join(data_dir_base, 'crash', 'posted'), uid, gid,
1845 DATA_DIR_MODE)
1846 return data_dir_base
1847
1848
1849 def make_data_dir(ctx, fsid, daemon_type, daemon_id, uid=None, gid=None):
1850 # type: (CephadmContext, str, str, Union[int, str], Optional[int], Optional[int]) -> str
1851 if uid is None or gid is None:
1852 uid, gid = extract_uid_gid(ctx)
1853 make_data_dir_base(fsid, ctx.data_dir, uid, gid)
1854 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
1855 makedirs(data_dir, uid, gid, DATA_DIR_MODE)
1856 return data_dir
1857
1858
1859 def make_log_dir(ctx, fsid, uid=None, gid=None):
1860 # type: (CephadmContext, str, Optional[int], Optional[int]) -> str
1861 if uid is None or gid is None:
1862 uid, gid = extract_uid_gid(ctx)
1863 log_dir = get_log_dir(fsid, ctx.log_dir)
1864 makedirs(log_dir, uid, gid, LOG_DIR_MODE)
1865 return log_dir
1866
1867
1868 def make_var_run(ctx, fsid, uid, gid):
1869 # type: (CephadmContext, str, int, int) -> None
1870 call_throws(ctx, ['install', '-d', '-m0770', '-o', str(uid), '-g', str(gid),
1871 '/var/run/ceph/%s' % fsid])
1872
1873
1874 def copy_tree(ctx, src, dst, uid=None, gid=None):
1875 # type: (CephadmContext, List[str], str, Optional[int], Optional[int]) -> None
1876 """
1877 Copy a directory tree from src to dst
1878 """
1879 if uid is None or gid is None:
1880 (uid, gid) = extract_uid_gid(ctx)
1881
1882 for src_dir in src:
1883 dst_dir = dst
1884 if os.path.isdir(dst):
1885 dst_dir = os.path.join(dst, os.path.basename(src_dir))
1886
1887 logger.debug('copy directory `%s` -> `%s`' % (src_dir, dst_dir))
1888 shutil.rmtree(dst_dir, ignore_errors=True)
1889 shutil.copytree(src_dir, dst_dir) # dirs_exist_ok needs python 3.8
1890
1891 for dirpath, dirnames, filenames in os.walk(dst_dir):
1892 logger.debug('chown %s:%s `%s`' % (uid, gid, dirpath))
1893 os.chown(dirpath, uid, gid)
1894 for filename in filenames:
1895 logger.debug('chown %s:%s `%s`' % (uid, gid, filename))
1896 os.chown(os.path.join(dirpath, filename), uid, gid)
1897
1898
1899 def copy_files(ctx, src, dst, uid=None, gid=None):
1900 # type: (CephadmContext, List[str], str, Optional[int], Optional[int]) -> None
1901 """
1902 Copy a files from src to dst
1903 """
1904 if uid is None or gid is None:
1905 (uid, gid) = extract_uid_gid(ctx)
1906
1907 for src_file in src:
1908 dst_file = dst
1909 if os.path.isdir(dst):
1910 dst_file = os.path.join(dst, os.path.basename(src_file))
1911
1912 logger.debug('copy file `%s` -> `%s`' % (src_file, dst_file))
1913 shutil.copyfile(src_file, dst_file)
1914
1915 logger.debug('chown %s:%s `%s`' % (uid, gid, dst_file))
1916 os.chown(dst_file, uid, gid)
1917
1918
1919 def move_files(ctx, src, dst, uid=None, gid=None):
1920 # type: (CephadmContext, List[str], str, Optional[int], Optional[int]) -> None
1921 """
1922 Move files from src to dst
1923 """
1924 if uid is None or gid is None:
1925 (uid, gid) = extract_uid_gid(ctx)
1926
1927 for src_file in src:
1928 dst_file = dst
1929 if os.path.isdir(dst):
1930 dst_file = os.path.join(dst, os.path.basename(src_file))
1931
1932 if os.path.islink(src_file):
1933 # shutil.move() in py2 does not handle symlinks correctly
1934 src_rl = os.readlink(src_file)
1935 logger.debug("symlink '%s' -> '%s'" % (dst_file, src_rl))
1936 os.symlink(src_rl, dst_file)
1937 os.unlink(src_file)
1938 else:
1939 logger.debug("move file '%s' -> '%s'" % (src_file, dst_file))
1940 shutil.move(src_file, dst_file)
1941 logger.debug('chown %s:%s `%s`' % (uid, gid, dst_file))
1942 os.chown(dst_file, uid, gid)
1943
1944
1945 # copied from distutils
1946 def find_executable(executable: str, path: Optional[str] = None) -> Optional[str]:
1947 """Tries to find 'executable' in the directories listed in 'path'.
1948 A string listing directories separated by 'os.pathsep'; defaults to
1949 os.environ['PATH']. Returns the complete filename or None if not found.
1950 """
1951 _, ext = os.path.splitext(executable)
1952 if (sys.platform == 'win32') and (ext != '.exe'):
1953 executable = executable + '.exe'
1954
1955 if os.path.isfile(executable):
1956 return executable
1957
1958 if path is None:
1959 path = os.environ.get('PATH', None)
1960 if path is None:
1961 try:
1962 path = os.confstr('CS_PATH')
1963 except (AttributeError, ValueError):
1964 # os.confstr() or CS_PATH is not available
1965 path = os.defpath
1966 # bpo-35755: Don't use os.defpath if the PATH environment variable is
1967 # set to an empty string
1968
1969 # PATH='' doesn't match, whereas PATH=':' looks in the current directory
1970 if not path:
1971 return None
1972
1973 paths = path.split(os.pathsep)
1974 for p in paths:
1975 f = os.path.join(p, executable)
1976 if os.path.isfile(f):
1977 # the file exists, we have a shot at spawn working
1978 return f
1979 return None
1980
1981
1982 def find_program(filename):
1983 # type: (str) -> str
1984 name = find_executable(filename)
1985 if name is None:
1986 raise ValueError('%s not found' % filename)
1987 return name
1988
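# Usage sketch (assuming a standard PATH): unlike find_executable,
# find_program raises, letting callers fail fast on missing binaries.
#
# find_program('install')         # -> '/usr/bin/install'
# find_program('no-such-binary')  # raises ValueError('no-such-binary not found')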
1989
1990 def find_container_engine(ctx: CephadmContext) -> Optional[ContainerEngine]:
1991 if ctx.docker:
1992 return Docker()
1993 else:
1994 for i in CONTAINER_PREFERENCE:
1995 try:
1996 return i()
1997 except Exception as e:
1998 logger.debug('Could not locate %s: %s' % (i.EXE, e))
1999 return None
2000
2001
2002 def check_container_engine(ctx):
2003 # type: (CephadmContext) -> None
2004 engine = ctx.container_engine
2005 if not isinstance(engine, CONTAINER_PREFERENCE):
2006 # See https://github.com/python/mypy/issues/8993
2007 exes: List[str] = [i.EXE for i in CONTAINER_PREFERENCE] # type: ignore
2008 raise Error('No container engine binary found ({}). Try to run `apt/dnf/yum/zypper install <container engine>`'.format(' or '.join(exes)))
2009 elif isinstance(engine, Podman):
2010 engine.get_version(ctx)
2011 if engine.version < MIN_PODMAN_VERSION:
2012 raise Error('podman version %d.%d.%d or later is required' % MIN_PODMAN_VERSION)
2013
2014
2015 def get_unit_name(fsid, daemon_type, daemon_id=None):
2016 # type: (str, str, Optional[Union[int, str]]) -> str
2017 # accept either name or type + id
2018 if daemon_type == CephadmDaemon.daemon_type and daemon_id is not None:
2019 return 'ceph-%s-%s.%s' % (fsid, daemon_type, daemon_id)
2020 elif daemon_id is not None:
2021 return 'ceph-%s@%s.%s' % (fsid, daemon_type, daemon_id)
2022 else:
2023 return 'ceph-%s@%s' % (fsid, daemon_type)
2024
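# Examples of the generated unit names (fsid shortened for clarity; the
# non-templated first branch applies only to CephadmDaemon.daemon_type):
#
# get_unit_name('9b9d...', 'osd', 3)  # -> 'ceph-9b9d...@osd.3'
# get_unit_name('9b9d...', 'mon')     # -> 'ceph-9b9d...@mon'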
2025
2026 def get_unit_name_by_daemon_name(ctx: CephadmContext, fsid: str, name: str) -> str:
2027 daemon = get_daemon_description(ctx, fsid, name)
2028 try:
2029 return daemon['systemd_unit']
2030 except KeyError:
2031 raise Error('Failed to get unit name for {}'.format(daemon))
2032
2033
2034 def check_unit(ctx, unit_name):
2035 # type: (CephadmContext, str) -> Tuple[bool, str, bool]
2036 # NOTE: we ignore the exit code here because systemctl outputs
2037 # various exit codes based on the state of the service, but the
2038 # string result is more explicit (and sufficient).
2039 enabled = False
2040 installed = False
2041 try:
2042 out, err, code = call(ctx, ['systemctl', 'is-enabled', unit_name],
2043 verbosity=CallVerbosity.DEBUG)
2044 if code == 0:
2045 enabled = True
2046 installed = True
2047 elif 'disabled' in out:
2048 installed = True
2049 except Exception as e:
2050 logger.warning('unable to run systemctl: %s' % e)
2051 enabled = False
2052 installed = False
2053
2054 state = 'unknown'
2055 try:
2056 out, err, code = call(ctx, ['systemctl', 'is-active', unit_name],
2057 verbosity=CallVerbosity.DEBUG)
2058 out = out.strip()
2059 if out in ['active']:
2060 state = 'running'
2061 elif out in ['inactive']:
2062 state = 'stopped'
2063 elif out in ['failed', 'auto-restart']:
2064 state = 'error'
2065 else:
2066 state = 'unknown'
2067 except Exception as e:
2068 logger.warning('unable to run systemctl: %s' % e)
2069 state = 'unknown'
2070 return (enabled, state, installed)
2071
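# Typical usage (sketch): the returned tuple unpacks to
# (enabled, state, installed), e.g. for a healthy mon unit:
#
# enabled, state, installed = check_unit(ctx, 'ceph-9b9d...@mon.a')
# # -> (True, 'running', True)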
2072
2073 def check_units(ctx, units, enabler=None):
2074 # type: (CephadmContext, List[str], Optional[Packager]) -> bool
2075 for u in units:
2076 (enabled, state, installed) = check_unit(ctx, u)
2077 if enabled and state == 'running':
2078 logger.info('Unit %s is enabled and running' % u)
2079 return True
2080 if enabler is not None:
2081 if installed:
2082 logger.info('Enabling unit %s' % u)
2083 enabler.enable_service(u)
2084 return False
2085
2086
2087 def is_container_running(ctx: CephadmContext, c: 'CephContainer') -> bool:
2088 return bool(get_running_container_name(ctx, c))
2089
2090
2091 def get_running_container_name(ctx: CephadmContext, c: 'CephContainer') -> Optional[str]:
2092 for name in [c.cname, c.old_cname]:
2093 out, err, ret = call(ctx, [
2094 ctx.container_engine.path, 'container', 'inspect',
2095 '--format', '{{.State.Status}}', name
2096 ])
2097 if out.strip() == 'running':
2098 return name
2099 return None
2100
2101
2102 def get_legacy_config_fsid(cluster, legacy_dir=None):
2103 # type: (str, Optional[str]) -> Optional[str]
2104 config_file = '/etc/ceph/%s.conf' % cluster
2105 if legacy_dir is not None:
2106 config_file = os.path.abspath(legacy_dir + config_file)
2107
2108 if os.path.exists(config_file):
2109 config = read_config(config_file)
2110 if config.has_section('global') and config.has_option('global', 'fsid'):
2111 return config.get('global', 'fsid')
2112 return None
2113
2114
2115 def get_legacy_daemon_fsid(ctx, cluster,
2116 daemon_type, daemon_id, legacy_dir=None):
2117 # type: (CephadmContext, str, str, Union[int, str], Optional[str]) -> Optional[str]
2118 fsid = None
2119 if daemon_type == 'osd':
2120 try:
2121 fsid_file = os.path.join(ctx.data_dir,
2122 daemon_type,
2123 'ceph-%s' % daemon_id,
2124 'ceph_fsid')
2125 if legacy_dir is not None:
2126 fsid_file = os.path.abspath(legacy_dir + fsid_file)
2127 with open(fsid_file, 'r') as f:
2128 fsid = f.read().strip()
2129 except IOError:
2130 pass
2131 if not fsid:
2132 fsid = get_legacy_config_fsid(cluster, legacy_dir=legacy_dir)
2133 return fsid
2134
2135
2136 def get_daemon_args(ctx, fsid, daemon_type, daemon_id):
2137 # type: (CephadmContext, str, str, Union[int, str]) -> List[str]
2138 r = list() # type: List[str]
2139
2140 if daemon_type in Ceph.daemons and daemon_type != 'crash':
2141 r += [
2142 '--setuser', 'ceph',
2143 '--setgroup', 'ceph',
2144 '--default-log-to-file=false',
2145 '--default-log-to-stderr=true',
2146 '--default-log-stderr-prefix=debug ',
2147 ]
2148 if daemon_type == 'mon':
2149 r += [
2150 '--default-mon-cluster-log-to-file=false',
2151 '--default-mon-cluster-log-to-stderr=true',
2152 ]
2153 elif daemon_type in Monitoring.components:
2154 metadata = Monitoring.components[daemon_type]
2155 r += metadata.get('args', list())
2156 # set the ip and port to bind to for node-exporter, alertmanager and prometheus
2157 if daemon_type != 'grafana':
2158 ip = ''
2159 port = Monitoring.port_map[daemon_type][0]
2160 if 'meta_json' in ctx and ctx.meta_json:
2161 meta = json.loads(ctx.meta_json) or {}
2162 if 'ip' in meta and meta['ip']:
2163 ip = meta['ip']
2164 if 'ports' in meta and meta['ports']:
2165 port = meta['ports'][0]
2166 r += [f'--web.listen-address={ip}:{port}']
2167 if daemon_type == 'alertmanager':
2168 config = get_parm(ctx.config_json)
2169 peers = config.get('peers', list()) # type: ignore
2170 for peer in peers:
2171 r += ['--cluster.peer={}'.format(peer)]
2172 # some alertmanager builds look elsewhere for their config by default
2173 r += ['--config.file=/etc/alertmanager/alertmanager.yml']
2174 elif daemon_type == NFSGanesha.daemon_type:
2175 nfs_ganesha = NFSGanesha.init(ctx, fsid, daemon_id)
2176 r += nfs_ganesha.get_daemon_args()
2177 elif daemon_type == HAproxy.daemon_type:
2178 haproxy = HAproxy.init(ctx, fsid, daemon_id)
2179 r += haproxy.get_daemon_args()
2180 elif daemon_type == CustomContainer.daemon_type:
2181 cc = CustomContainer.init(ctx, fsid, daemon_id)
2182 r.extend(cc.get_daemon_args())
2183
2184 return r
2185
2186
2187 def create_daemon_dirs(ctx, fsid, daemon_type, daemon_id, uid, gid,
2188 config=None, keyring=None):
2189 # type: (CephadmContext, str, str, Union[int, str], int, int, Optional[str], Optional[str]) -> None
2190 data_dir = make_data_dir(ctx, fsid, daemon_type, daemon_id, uid=uid, gid=gid)
2191 make_log_dir(ctx, fsid, uid=uid, gid=gid)
2192
2193 if config:
2194 config_path = os.path.join(data_dir, 'config')
2195 with open(config_path, 'w') as f:
2196 os.fchown(f.fileno(), uid, gid)
2197 os.fchmod(f.fileno(), 0o600)
2198 f.write(config)
2199
2200 if keyring:
2201 keyring_path = os.path.join(data_dir, 'keyring')
2202 with open(keyring_path, 'w') as f:
2203 os.fchmod(f.fileno(), 0o600)
2204 os.fchown(f.fileno(), uid, gid)
2205 f.write(keyring)
2206
2207 if daemon_type in Monitoring.components.keys():
2208 config_json: Dict[str, Any] = dict()
2209 if 'config_json' in ctx:
2210 config_json = get_parm(ctx.config_json)
2211
2212 # Set up directories specific to the monitoring component
2213 config_dir = ''
2214 data_dir_root = ''
2215 if daemon_type == 'prometheus':
2216 data_dir_root = get_data_dir(fsid, ctx.data_dir,
2217 daemon_type, daemon_id)
2218 config_dir = 'etc/prometheus'
2219 makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755)
2220 makedirs(os.path.join(data_dir_root, config_dir, 'alerting'), uid, gid, 0o755)
2221 makedirs(os.path.join(data_dir_root, 'data'), uid, gid, 0o755)
2222 elif daemon_type == 'grafana':
2223 data_dir_root = get_data_dir(fsid, ctx.data_dir,
2224 daemon_type, daemon_id)
2225 config_dir = 'etc/grafana'
2226 makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755)
2227 makedirs(os.path.join(data_dir_root, config_dir, 'certs'), uid, gid, 0o755)
2228 makedirs(os.path.join(data_dir_root, config_dir, 'provisioning/datasources'), uid, gid, 0o755)
2229 makedirs(os.path.join(data_dir_root, 'data'), uid, gid, 0o755)
2230 touch(os.path.join(data_dir_root, 'data', 'grafana.db'), uid, gid)
2231 elif daemon_type == 'alertmanager':
2232 data_dir_root = get_data_dir(fsid, ctx.data_dir,
2233 daemon_type, daemon_id)
2234 config_dir = 'etc/alertmanager'
2235 makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755)
2236 makedirs(os.path.join(data_dir_root, config_dir, 'data'), uid, gid, 0o755)
2237
2238 # populate the config directory for the component from the config-json
2239 if 'files' in config_json:
2240 for fname in config_json['files']:
2241 content = dict_get_join(config_json['files'], fname)
2242 if os.path.isabs(fname):
2243 fpath = os.path.join(data_dir_root, fname.lstrip(os.path.sep))
2244 else:
2245 fpath = os.path.join(data_dir_root, config_dir, fname)
2246 with open(fpath, 'w', encoding='utf-8') as f:
2247 os.fchown(f.fileno(), uid, gid)
2248 os.fchmod(f.fileno(), 0o600)
2249 f.write(content)
2250
2251 elif daemon_type == NFSGanesha.daemon_type:
2252 nfs_ganesha = NFSGanesha.init(ctx, fsid, daemon_id)
2253 nfs_ganesha.create_daemon_dirs(data_dir, uid, gid)
2254
2255 elif daemon_type == CephIscsi.daemon_type:
2256 ceph_iscsi = CephIscsi.init(ctx, fsid, daemon_id)
2257 ceph_iscsi.create_daemon_dirs(data_dir, uid, gid)
2258
2259 elif daemon_type == HAproxy.daemon_type:
2260 haproxy = HAproxy.init(ctx, fsid, daemon_id)
2261 haproxy.create_daemon_dirs(data_dir, uid, gid)
2262
2263 elif daemon_type == Keepalived.daemon_type:
2264 keepalived = Keepalived.init(ctx, fsid, daemon_id)
2265 keepalived.create_daemon_dirs(data_dir, uid, gid)
2266
2267 elif daemon_type == CustomContainer.daemon_type:
2268 cc = CustomContainer.init(ctx, fsid, daemon_id)
2269 cc.create_daemon_dirs(data_dir, uid, gid)
2270
2271
2272 def get_parm(option):
2273 # type: (str) -> Dict[str, str]
2274
2275 if not option:
2276 return dict()
2277
2278 global cached_stdin
2279 if option == '-':
2280 if cached_stdin is not None:
2281 j = cached_stdin
2282 else:
2283 j = sys.stdin.read()
2284 cached_stdin = j
2285 else:
2286 # inline json string
2287 if option[0] == '{' and option[-1] == '}':
2288 j = option
2289 # json file
2290 elif os.path.exists(option):
2291 with open(option, 'r') as f:
2292 j = f.read()
2293 else:
2294 raise Error('Config file {} not found'.format(option))
2295
2296 try:
2297 js = json.loads(j)
2298 except ValueError as e:
2299 raise Error('Invalid JSON in {}: {}'.format(option, e))
2300 else:
2301 return js
2302
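# get_parm accepts three input shapes; a sketch of each (values are
# illustrative):
#
# get_parm('{"config": "...", "keyring": "..."}')  # inline JSON string
# get_parm('/tmp/config.json')                     # path to a JSON file
# get_parm('-')                                    # read stdin once, cache in cached_stdin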
2303
2304 def get_config_and_keyring(ctx):
2305 # type: (CephadmContext) -> Tuple[Optional[str], Optional[str]]
2306 config = None
2307 keyring = None
2308
2309 if 'config_json' in ctx and ctx.config_json:
2310 d = get_parm(ctx.config_json)
2311 config = d.get('config')
2312 keyring = d.get('keyring')
2313
2314 if 'config' in ctx and ctx.config:
2315 try:
2316 with open(ctx.config, 'r') as f:
2317 config = f.read()
2318 except FileNotFoundError as e:
2319 raise Error(e)
2320
2321 if 'key' in ctx and ctx.key:
2322 keyring = '[%s]\n\tkey = %s\n' % (ctx.name, ctx.key)
2323 elif 'keyring' in ctx and ctx.keyring:
2324 try:
2325 with open(ctx.keyring, 'r') as f:
2326 keyring = f.read()
2327 except FileNotFoundError as e:
2328 raise Error(e)
2329
2330 return config, keyring
2331
2332
2333 def get_container_binds(ctx, fsid, daemon_type, daemon_id):
2334 # type: (CephadmContext, str, str, Union[int, str, None]) -> List[List[str]]
2335 binds = list()
2336
2337 if daemon_type == CephIscsi.daemon_type:
2338 binds.extend(CephIscsi.get_container_binds())
2339 elif daemon_type == CustomContainer.daemon_type:
2340 assert daemon_id
2341 cc = CustomContainer.init(ctx, fsid, daemon_id)
2342 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2343 binds.extend(cc.get_container_binds(data_dir))
2344
2345 return binds
2346
2347
2348 def get_container_mounts(ctx, fsid, daemon_type, daemon_id,
2349 no_config=False):
2350 # type: (CephadmContext, str, str, Union[int, str, None], Optional[bool]) -> Dict[str, str]
2351 mounts = dict()
2352
2353 if daemon_type in Ceph.daemons:
2354 if fsid:
2355 run_path = os.path.join('/var/run/ceph', fsid)
2356 if os.path.exists(run_path):
2357 mounts[run_path] = '/var/run/ceph:z'
2358 log_dir = get_log_dir(fsid, ctx.log_dir)
2359 mounts[log_dir] = '/var/log/ceph:z'
2360 crash_dir = '/var/lib/ceph/%s/crash' % fsid
2361 if os.path.exists(crash_dir):
2362 mounts[crash_dir] = '/var/lib/ceph/crash:z'
2363
2364 if daemon_type in Ceph.daemons and daemon_id:
2365 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2366 if daemon_type == 'rgw':
2367 cdata_dir = '/var/lib/ceph/radosgw/ceph-rgw.%s' % (daemon_id)
2368 else:
2369 cdata_dir = '/var/lib/ceph/%s/ceph-%s' % (daemon_type, daemon_id)
2370 if daemon_type != 'crash':
2371 mounts[data_dir] = cdata_dir + ':z'
2372 if not no_config:
2373 mounts[data_dir + '/config'] = '/etc/ceph/ceph.conf:z'
2374 if daemon_type in ['rbd-mirror', 'cephfs-mirror', 'crash']:
2375 # these do not search for their keyrings in a data directory
2376 mounts[data_dir + '/keyring'] = '/etc/ceph/ceph.client.%s.%s.keyring' % (daemon_type, daemon_id)
2377
2378 if daemon_type in ['mon', 'osd', 'clusterless-ceph-volume']:
2379 mounts['/dev'] = '/dev' # FIXME: narrow this down?
2380 mounts['/run/udev'] = '/run/udev'
2381 if daemon_type in ['osd', 'clusterless-ceph-volume']:
2382 mounts['/sys'] = '/sys' # for numa.cc, pick_address, cgroups, ...
2383 mounts['/run/lvm'] = '/run/lvm'
2384 mounts['/run/lock/lvm'] = '/run/lock/lvm'
2385 if daemon_type == 'osd':
2386 # selinux-policy in the container may not match the host.
2387 if HostFacts(ctx).selinux_enabled:
2388 selinux_folder = '/var/lib/ceph/%s/selinux' % fsid
2389 if not os.path.exists(selinux_folder):
2390 os.makedirs(selinux_folder, mode=0o755)
2391 mounts[selinux_folder] = '/sys/fs/selinux:ro'
2392
2393 try:
2394 if ctx.shared_ceph_folder: # eases mgr module and ceph-volume development
2395 ceph_folder = pathify(ctx.shared_ceph_folder)
2396 if os.path.exists(ceph_folder):
2397 mounts[ceph_folder + '/src/ceph-volume/ceph_volume'] = '/usr/lib/python3.6/site-packages/ceph_volume'
2398 mounts[ceph_folder + '/src/cephadm/cephadm'] = '/usr/sbin/cephadm'
2399 mounts[ceph_folder + '/src/pybind/mgr'] = '/usr/share/ceph/mgr'
2400 mounts[ceph_folder + '/src/python-common/ceph'] = '/usr/lib/python3.6/site-packages/ceph'
2401 mounts[ceph_folder + '/monitoring/grafana/dashboards'] = '/etc/grafana/dashboards/ceph-dashboard'
2402 mounts[ceph_folder + '/monitoring/prometheus/alerts'] = '/etc/prometheus/ceph'
2403 else:
2404 logger.error('{}{}{}'.format(termcolor.red,
2405 'Ceph shared source folder does not exist.',
2406 termcolor.end))
2407 except AttributeError:
2408 pass
2409
2410 if daemon_type in Monitoring.components and daemon_id:
2411 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2412 if daemon_type == 'prometheus':
2413 mounts[os.path.join(data_dir, 'etc/prometheus')] = '/etc/prometheus:Z'
2414 mounts[os.path.join(data_dir, 'data')] = '/prometheus:Z'
2415 elif daemon_type == 'node-exporter':
2416 mounts['/proc'] = '/host/proc:ro'
2417 mounts['/sys'] = '/host/sys:ro'
2418 mounts['/'] = '/rootfs:ro'
2419 elif daemon_type == 'grafana':
2420 mounts[os.path.join(data_dir, 'etc/grafana/grafana.ini')] = '/etc/grafana/grafana.ini:Z'
2421 mounts[os.path.join(data_dir, 'etc/grafana/provisioning/datasources')] = '/etc/grafana/provisioning/datasources:Z'
2422 mounts[os.path.join(data_dir, 'etc/grafana/certs')] = '/etc/grafana/certs:Z'
2423 mounts[os.path.join(data_dir, 'data/grafana.db')] = '/var/lib/grafana/grafana.db:Z'
2424 elif daemon_type == 'alertmanager':
2425 mounts[os.path.join(data_dir, 'etc/alertmanager')] = '/etc/alertmanager:Z'
2426
2427 if daemon_type == NFSGanesha.daemon_type:
2428 assert daemon_id
2429 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2430 nfs_ganesha = NFSGanesha.init(ctx, fsid, daemon_id)
2431 mounts.update(nfs_ganesha.get_container_mounts(data_dir))
2432
2433 if daemon_type == HAproxy.daemon_type:
2434 assert daemon_id
2435 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2436 mounts.update(HAproxy.get_container_mounts(data_dir))
2437
2438 if daemon_type == CephIscsi.daemon_type:
2439 assert daemon_id
2440 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2441 log_dir = get_log_dir(fsid, ctx.log_dir)
2442 mounts.update(CephIscsi.get_container_mounts(data_dir, log_dir))
2443
2444 if daemon_type == Keepalived.daemon_type:
2445 assert daemon_id
2446 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2447 mounts.update(Keepalived.get_container_mounts(data_dir))
2448
2449 if daemon_type == CustomContainer.daemon_type:
2450 assert daemon_id
2451 cc = CustomContainer.init(ctx, fsid, daemon_id)
2452 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2453 mounts.update(cc.get_container_mounts(data_dir))
2454
2455 return mounts
2456
2457
2458 def get_container(ctx: CephadmContext,
2459 fsid: str, daemon_type: str, daemon_id: Union[int, str],
2460 privileged: bool = False,
2461 ptrace: bool = False,
2462 container_args: Optional[List[str]] = None) -> 'CephContainer':
2463 entrypoint: str = ''
2464 name: str = ''
2465 ceph_args: List[str] = []
2466 envs: List[str] = []
2467 host_network: bool = True
2468
2469 if daemon_type in Ceph.daemons:
2470 envs.append('TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES=134217728')
2471 if container_args is None:
2472 container_args = []
2473 if daemon_type in ['mon', 'osd']:
2474 # mon and osd need privileged in order for libudev to query devices
2475 privileged = True
2476 if daemon_type == 'rgw':
2477 entrypoint = '/usr/bin/radosgw'
2478 name = 'client.rgw.%s' % daemon_id
2479 elif daemon_type == 'rbd-mirror':
2480 entrypoint = '/usr/bin/rbd-mirror'
2481 name = 'client.rbd-mirror.%s' % daemon_id
2482 elif daemon_type == 'cephfs-mirror':
2483 entrypoint = '/usr/bin/cephfs-mirror'
2484 name = 'client.cephfs-mirror.%s' % daemon_id
2485 elif daemon_type == 'crash':
2486 entrypoint = '/usr/bin/ceph-crash'
2487 name = 'client.crash.%s' % daemon_id
2488 elif daemon_type in ['mon', 'mgr', 'mds', 'osd']:
2489 entrypoint = '/usr/bin/ceph-' + daemon_type
2490 name = '%s.%s' % (daemon_type, daemon_id)
2491 elif daemon_type in Monitoring.components:
2492 entrypoint = ''
2493 elif daemon_type == NFSGanesha.daemon_type:
2494 entrypoint = NFSGanesha.entrypoint
2495 name = '%s.%s' % (daemon_type, daemon_id)
2496 envs.extend(NFSGanesha.get_container_envs())
2497 elif daemon_type == HAproxy.daemon_type:
2498 name = '%s.%s' % (daemon_type, daemon_id)
2499 container_args.extend(['--user=root']) # haproxy 2.4 defaults to a different user
2500 elif daemon_type == Keepalived.daemon_type:
2501 name = '%s.%s' % (daemon_type, daemon_id)
2502 envs.extend(Keepalived.get_container_envs())
2503 container_args.extend(['--cap-add=NET_ADMIN', '--cap-add=NET_RAW'])
2504 elif daemon_type == CephIscsi.daemon_type:
2505 entrypoint = CephIscsi.entrypoint
2506 name = '%s.%s' % (daemon_type, daemon_id)
2507 # So that the container can modprobe iscsi_target_mod and has write
2508 # perms to configfs, we need to make this a privileged container.
2509 privileged = True
2510 elif daemon_type == CustomContainer.daemon_type:
2511 cc = CustomContainer.init(ctx, fsid, daemon_id)
2512 entrypoint = cc.entrypoint
2513 host_network = False
2514 envs.extend(cc.get_container_envs())
2515 container_args.extend(cc.get_container_args())
2516
2517 if daemon_type in Monitoring.components:
2518 uid, gid = extract_uid_gid_monitoring(ctx, daemon_type)
2519 monitoring_args = [
2520 '--user',
2521 str(uid),
2522 # FIXME: disable cpu/memory limits for the time being (not supported
2523 # by ubuntu 18.04 kernel!)
2524 ]
2525 container_args.extend(monitoring_args)
2526 elif daemon_type == 'crash':
2527 ceph_args = ['-n', name]
2528 elif daemon_type in Ceph.daemons:
2529 ceph_args = ['-n', name, '-f']
2530
2531 # if using podman, set -d, --conmon-pidfile & --cidfile flags
2532 # so the service can use Type=forking
2533 if isinstance(ctx.container_engine, Podman):
2534 runtime_dir = '/run'
2535 container_args.extend([
2536 '-d', '--log-driver', 'journald',
2537 '--conmon-pidfile',
2538 runtime_dir + '/ceph-%s@%s.%s.service-pid' % (fsid, daemon_type, daemon_id),
2539 '--cidfile',
2540 runtime_dir + '/ceph-%s@%s.%s.service-cid' % (fsid, daemon_type, daemon_id),
2541 ])
2542 if ctx.container_engine.version >= CGROUPS_SPLIT_PODMAN_VERSION:
2543 container_args.append('--cgroups=split')
2544
2545 return CephContainer.for_daemon(
2546 ctx,
2547 fsid=fsid,
2548 daemon_type=daemon_type,
2549 daemon_id=str(daemon_id),
2550 entrypoint=entrypoint,
2551 args=ceph_args + get_daemon_args(ctx, fsid, daemon_type, daemon_id),
2552 container_args=container_args,
2553 volume_mounts=get_container_mounts(ctx, fsid, daemon_type, daemon_id),
2554 bind_mounts=get_container_binds(ctx, fsid, daemon_type, daemon_id),
2555 envs=envs,
2556 privileged=privileged,
2557 ptrace=ptrace,
2558 host_network=host_network,
2559 )
2560
2561
2562 def extract_uid_gid(ctx, img='', file_path='/var/lib/ceph'):
2563 # type: (CephadmContext, str, Union[str, List[str]]) -> Tuple[int, int]
2564
2565 if not img:
2566 img = ctx.image
2567
2568 if isinstance(file_path, str):
2569 paths = [file_path]
2570 else:
2571 paths = file_path
2572
2573 for fp in paths:
2574 try:
2575 out = CephContainer(
2576 ctx,
2577 image=img,
2578 entrypoint='stat',
2579 args=['-c', '%u %g', fp]
2580 ).run()
2581 uid, gid = out.split(' ')
2582 return int(uid), int(gid)
2583 except RuntimeError:
2584 pass
2585 raise RuntimeError('uid/gid not found')
2586
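# Sketch: stat'ing /var/lib/ceph inside the ceph image yields the
# ceph user's uid/gid pair (the exact ids depend on the image):
#
# uid, gid = extract_uid_gid(ctx)  # e.g. (167, 167) on RH-family images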
2587
2588 def deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid,
2589 config=None, keyring=None,
2590 osd_fsid=None,
2591 reconfig=False,
2592 ports=None):
2593 # type: (CephadmContext, str, str, Union[int, str], Optional[CephContainer], int, int, Optional[str], Optional[str], Optional[str], Optional[bool], Optional[List[int]]) -> None
2594
2595 ports = ports or []
2596 if any([port_in_use(ctx, port) for port in ports]):
2597 if daemon_type == 'mgr':
2598 # non-fatal for mgr when we are in mgr_standby_modules=false, but we can't
2599 # tell whether that is the case here.
2600 logger.warning(
2601 f"ceph-mgr TCP port(s) {','.join(map(str, ports))} already in use"
2602 )
2603 else:
2604 raise Error("TCP Port(s) '{}' required for {} already in use".format(','.join(map(str, ports)), daemon_type))
2605
2606 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2607 if reconfig and not os.path.exists(data_dir):
2608 raise Error('cannot reconfig, data path %s does not exist' % data_dir)
2609 if daemon_type == 'mon' and not os.path.exists(data_dir):
2610 assert config
2611 assert keyring
2612 # tmp keyring file
2613 tmp_keyring = write_tmp(keyring, uid, gid)
2614
2615 # tmp config file
2616 tmp_config = write_tmp(config, uid, gid)
2617
2618 # --mkfs
2619 create_daemon_dirs(ctx, fsid, daemon_type, daemon_id, uid, gid)
2620 mon_dir = get_data_dir(fsid, ctx.data_dir, 'mon', daemon_id)
2621 log_dir = get_log_dir(fsid, ctx.log_dir)
2622 CephContainer(
2623 ctx,
2624 image=ctx.image,
2625 entrypoint='/usr/bin/ceph-mon',
2626 args=[
2627 '--mkfs',
2628 '-i', str(daemon_id),
2629 '--fsid', fsid,
2630 '-c', '/tmp/config',
2631 '--keyring', '/tmp/keyring',
2632 ] + get_daemon_args(ctx, fsid, 'mon', daemon_id),
2633 volume_mounts={
2634 log_dir: '/var/log/ceph:z',
2635 mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % (daemon_id),
2636 tmp_keyring.name: '/tmp/keyring:z',
2637 tmp_config.name: '/tmp/config:z',
2638 },
2639 ).run()
2640
2641 # write conf
2642 with open(mon_dir + '/config', 'w') as f:
2643 os.fchown(f.fileno(), uid, gid)
2644 os.fchmod(f.fileno(), 0o600)
2645 f.write(config)
2646 else:
2647 # dirs, conf, keyring
2648 create_daemon_dirs(
2649 ctx,
2650 fsid, daemon_type, daemon_id,
2651 uid, gid,
2652 config, keyring)
2653
2654 if not reconfig:
2655 if daemon_type == CephadmDaemon.daemon_type:
2656 port = next(iter(ports), None) # get first tcp port provided or None
2657
2658 if ctx.config_json == '-':
2659 config_js = get_parm('-')
2660 else:
2661 config_js = get_parm(ctx.config_json)
2662 assert isinstance(config_js, dict)
2663 assert isinstance(daemon_id, str)
2664
2665 cephadm_exporter = CephadmDaemon(ctx, fsid, daemon_id, port)
2666 cephadm_exporter.deploy_daemon_unit(config_js)
2667 else:
2668 if c:
2669 deploy_daemon_units(ctx, fsid, uid, gid, daemon_type, daemon_id,
2670 c, osd_fsid=osd_fsid, ports=ports)
2671 else:
2672 raise RuntimeError('attempting to deploy a daemon without a container image')
2673
2674 if not os.path.exists(data_dir + '/unit.created'):
2675 with open(data_dir + '/unit.created', 'w') as f:
2676 os.fchmod(f.fileno(), 0o600)
2677 os.fchown(f.fileno(), uid, gid)
2678 f.write('mtime is time the daemon deployment was created\n')
2679
2680 with open(data_dir + '/unit.configured', 'w') as f:
2681 f.write('mtime is time we were last configured\n')
2682 os.fchmod(f.fileno(), 0o600)
2683 os.fchown(f.fileno(), uid, gid)
2684
2685 update_firewalld(ctx, daemon_type)
2686
2687 # Open ports explicitly required for the daemon
2688 if ports:
2689 fw = Firewalld(ctx)
2690 fw.open_ports(ports)
2691 fw.apply_rules()
2692
2693 if reconfig and daemon_type not in Ceph.daemons:
2694 # ceph daemons do not need a restart; others (presumably) do to pick
2695 # up the new config
2696 call_throws(ctx, ['systemctl', 'reset-failed',
2697 get_unit_name(fsid, daemon_type, daemon_id)])
2698 call_throws(ctx, ['systemctl', 'restart',
2699 get_unit_name(fsid, daemon_type, daemon_id)])
2700
2701
2702 def _write_container_cmd_to_bash(ctx, file_obj, container, comment=None, background=False):
2703 # type: (CephadmContext, IO[str], CephContainer, Optional[str], Optional[bool]) -> None
2704 if comment:
2705 # Sometimes adding a comment, especially if there are multiple containers in one
2706 # unit file, makes it easier to read and grok.
2707 file_obj.write('# ' + comment + '\n')
2708 # Sometimes, adding `--rm` to a run_cmd doesn't work. Let's remove the container manually
2709 file_obj.write('! ' + ' '.join(container.rm_cmd(old_cname=True)) + ' 2> /dev/null\n')
2710 file_obj.write('! ' + ' '.join(container.rm_cmd()) + ' 2> /dev/null\n')
2711 # Sometimes, `podman rm` doesn't find the container. Then you'll have to add `--storage`
2712 if isinstance(ctx.container_engine, Podman):
2713 file_obj.write(
2714 '! '
2715 + ' '.join([shlex.quote(a) for a in container.rm_cmd(storage=True)])
2716 + ' 2> /dev/null\n')
2717 file_obj.write(
2718 '! '
2719 + ' '.join([shlex.quote(a) for a in container.rm_cmd(old_cname=True, storage=True)])
2720 + ' 2> /dev/null\n')
2721
2722 # container run command
2723 file_obj.write(
2724 ' '.join([shlex.quote(a) for a in container.run_cmd()])
2725 + (' &' if background else '') + '\n')
2726
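# The resulting unit.run fragment looks roughly like this (podman case;
# fsid, names and flags abridged):
#
# # osd.3
# ! /usr/bin/podman rm -f ceph-<fsid>-osd.3 2> /dev/null
# ! /usr/bin/podman rm -f --storage ceph-<fsid>-osd.3 2> /dev/null
# /usr/bin/podman run --rm --ipc=host ... --name ceph-<fsid>-osd.3 <image> ...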
2727
2728 def clean_cgroup(ctx: CephadmContext, fsid: str, unit_name: str) -> None:
2729 # systemd may fail to clean up cgroups from a previously stopped unit, which will cause the next "systemctl start" to fail.
2730 # see https://tracker.ceph.com/issues/50998
2731
2732 CGROUPV2_PATH = Path('/sys/fs/cgroup')
2733 if not (CGROUPV2_PATH / 'system.slice').exists():
2734 # Only unified cgroup is affected, skip if not the case
2735 return
2736
2737 slice_name = 'system-ceph\\x2d{}.slice'.format(fsid.replace('-', '\\x2d'))
2738 cg_path = CGROUPV2_PATH / 'system.slice' / slice_name / f'{unit_name}.service'
2739 if not cg_path.exists():
2740 return
2741
2742 def cg_trim(path: Path) -> None:
2743 for p in path.iterdir():
2744 if p.is_dir():
2745 cg_trim(p)
2746 path.rmdir()
2747 try:
2748 cg_trim(cg_path)
2749 except OSError:
2750 logger.warning(f'Failed to trim old cgroups {cg_path}')
2751
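# For example, for a mon daemon the path trimmed above would be
# (dashes in the fsid are escaped as \x2d per systemd slice naming):
#
# /sys/fs/cgroup/system.slice/system-ceph\x2d<fsid-escaped>.slice/
#     ceph-<fsid>@mon.a.service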
2752
2753 def deploy_daemon_units(
2754 ctx: CephadmContext,
2755 fsid: str,
2756 uid: int,
2757 gid: int,
2758 daemon_type: str,
2759 daemon_id: Union[int, str],
2760 c: 'CephContainer',
2761 enable: bool = True,
2762 start: bool = True,
2763 osd_fsid: Optional[str] = None,
2764 ports: Optional[List[int]] = None,
2765 ) -> None:
2766 # cmd
2767 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2768 with open(data_dir + '/unit.run.new', 'w') as f, \
2769 open(data_dir + '/unit.meta.new', 'w') as metaf:
2770 f.write('set -e\n')
2771
2772 if daemon_type in Ceph.daemons:
2773 install_path = find_program('install')
2774 f.write('{install_path} -d -m0770 -o {uid} -g {gid} /var/run/ceph/{fsid}\n'.format(install_path=install_path, fsid=fsid, uid=uid, gid=gid))
2775
2776 # pre-start cmd(s)
2777 if daemon_type == 'osd':
2778 # osds have a pre-start step
2779 assert osd_fsid
2780 simple_fn = os.path.join('/etc/ceph/osd',
2781 '%s-%s.json.adopted-by-cephadm' % (daemon_id, osd_fsid))
2782 if os.path.exists(simple_fn):
2783 f.write('# Simple OSDs need chown on startup:\n')
2784 for n in ['block', 'block.db', 'block.wal']:
2785 p = os.path.join(data_dir, n)
2786 f.write('[ ! -L {p} ] || chown {uid}:{gid} {p}\n'.format(p=p, uid=uid, gid=gid))
2787 else:
2788 prestart = CephContainer(
2789 ctx,
2790 image=ctx.image,
2791 entrypoint='/usr/sbin/ceph-volume',
2792 args=[
2793 'lvm', 'activate',
2794 str(daemon_id), osd_fsid,
2795 '--no-systemd'
2796 ],
2797 privileged=True,
2798 volume_mounts=get_container_mounts(ctx, fsid, daemon_type, daemon_id),
2799 bind_mounts=get_container_binds(ctx, fsid, daemon_type, daemon_id),
2800 cname='ceph-%s-%s.%s-activate' % (fsid, daemon_type, daemon_id),
2801 memory_request=ctx.memory_request,
2802 memory_limit=ctx.memory_limit,
2803 )
2804 _write_container_cmd_to_bash(ctx, f, prestart, 'LVM OSDs use ceph-volume lvm activate')
2805 elif daemon_type == CephIscsi.daemon_type:
2806 f.write(' '.join(CephIscsi.configfs_mount_umount(data_dir, mount=True)) + '\n')
2807 ceph_iscsi = CephIscsi.init(ctx, fsid, daemon_id)
2808 tcmu_container = ceph_iscsi.get_tcmu_runner_container()
2809 _write_container_cmd_to_bash(ctx, f, tcmu_container, 'iscsi tcmu-runner container', background=True)
2810
2811 _write_container_cmd_to_bash(ctx, f, c, '%s.%s' % (daemon_type, str(daemon_id)))
2812
2813 # some metadata about the deploy
2814 meta: Dict[str, Any] = {}
2815 if 'meta_json' in ctx and ctx.meta_json:
2816 meta = json.loads(ctx.meta_json) or {}
2817 meta.update({
2818 'memory_request': int(ctx.memory_request) if ctx.memory_request else None,
2819 'memory_limit': int(ctx.memory_limit) if ctx.memory_limit else None,
2820 })
2821 if not meta.get('ports'):
2822 meta['ports'] = ports
2823 metaf.write(json.dumps(meta, indent=4) + '\n')
2824
2825 os.fchmod(f.fileno(), 0o600)
2826 os.fchmod(metaf.fileno(), 0o600)
2827 os.rename(data_dir + '/unit.run.new',
2828 data_dir + '/unit.run')
2829 os.rename(data_dir + '/unit.meta.new',
2830 data_dir + '/unit.meta')
2831
2832 # post-stop command(s)
2833 with open(data_dir + '/unit.poststop.new', 'w') as f:
2834 if daemon_type == 'osd':
2835 assert osd_fsid
2836 poststop = CephContainer(
2837 ctx,
2838 image=ctx.image,
2839 entrypoint='/usr/sbin/ceph-volume',
2840 args=[
2841 'lvm', 'deactivate',
2842 str(daemon_id), osd_fsid,
2843 ],
2844 privileged=True,
2845 volume_mounts=get_container_mounts(ctx, fsid, daemon_type, daemon_id),
2846 bind_mounts=get_container_binds(ctx, fsid, daemon_type, daemon_id),
2847 cname='ceph-%s-%s.%s-deactivate' % (fsid, daemon_type,
2848 daemon_id),
2849 )
2850 _write_container_cmd_to_bash(ctx, f, poststop, 'deactivate osd')
2851 elif daemon_type == CephIscsi.daemon_type:
2852 # make sure we also stop the tcmu container
2853 ceph_iscsi = CephIscsi.init(ctx, fsid, daemon_id)
2854 tcmu_container = ceph_iscsi.get_tcmu_runner_container()
2855 f.write('! ' + ' '.join(tcmu_container.stop_cmd()) + '\n')
2856 f.write(' '.join(CephIscsi.configfs_mount_umount(data_dir, mount=False)) + '\n')
2857 os.fchmod(f.fileno(), 0o600)
2858 os.rename(data_dir + '/unit.poststop.new',
2859 data_dir + '/unit.poststop')
2860
2861 # stop command(s)
2862 with open(data_dir + '/unit.stop.new', 'w') as f:
2863 f.write('! ' + ' '.join(c.stop_cmd()) + '\n')
2864 f.write('! ' + ' '.join(c.stop_cmd(old_cname=True)) + '\n')
2865
2866 os.fchmod(f.fileno(), 0o600)
2867 os.rename(data_dir + '/unit.stop.new',
2868 data_dir + '/unit.stop')
2869
2870 if c:
2871 with open(data_dir + '/unit.image.new', 'w') as f:
2872 f.write(c.image + '\n')
2873 os.fchmod(f.fileno(), 0o600)
2874 os.rename(data_dir + '/unit.image.new',
2875 data_dir + '/unit.image')
2876
2877 # sysctl
2878 install_sysctl(ctx, fsid, daemon_type)
2879
2880 # systemd
2881 install_base_units(ctx, fsid)
2882 unit = get_unit_file(ctx, fsid)
2883 unit_file = 'ceph-%s@.service' % (fsid)
2884 with open(ctx.unit_dir + '/' + unit_file + '.new', 'w') as f:
2885 f.write(unit)
2886 os.rename(ctx.unit_dir + '/' + unit_file + '.new',
2887 ctx.unit_dir + '/' + unit_file)
2888 call_throws(ctx, ['systemctl', 'daemon-reload'])
2889
2890 unit_name = get_unit_name(fsid, daemon_type, daemon_id)
2891 call(ctx, ['systemctl', 'stop', unit_name],
2892 verbosity=CallVerbosity.DEBUG)
2893 call(ctx, ['systemctl', 'reset-failed', unit_name],
2894 verbosity=CallVerbosity.DEBUG)
2895 if enable:
2896 call_throws(ctx, ['systemctl', 'enable', unit_name])
2897 if start:
2898 clean_cgroup(ctx, fsid, unit_name)
2899 call_throws(ctx, ['systemctl', 'start', unit_name])
2900
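# After deploy_daemon_units, the daemon's data dir contains, in sketch:
#
# /var/lib/ceph/<fsid>/<type>.<id>/
#     unit.run       # container start script, executed by ExecStart
#     unit.meta      # deployment metadata (ports, memory request/limit)
#     unit.stop      # stop helper, invoked from ExecStop
#     unit.poststop  # cleanup helper, invoked from ExecStopPost
#     unit.image     # image the daemon was deployed with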
2901
2902 class Firewalld(object):
2903 def __init__(self, ctx):
2904 # type: (CephadmContext) -> None
2905 self.ctx = ctx
2906 self.available = self.check()
2907
2908 def check(self):
2909 # type: () -> bool
2910 self.cmd = find_executable('firewall-cmd')
2911 if not self.cmd:
2912 logger.debug('firewalld does not appear to be present')
2913 return False
2914 (enabled, state, _) = check_unit(self.ctx, 'firewalld.service')
2915 if not enabled:
2916 logger.debug('firewalld.service is not enabled')
2917 return False
2918 if state != 'running':
2919 logger.debug('firewalld.service is not running')
2920 return False
2921
2922 logger.info('firewalld ready')
2923 return True
2924
2925 def enable_service_for(self, daemon_type):
2926 # type: (str) -> None
2927 if not self.available:
2928 logger.debug('Cannot enable service <%s>: firewalld.service is not available' % daemon_type)
2929 return
2930
2931 if daemon_type == 'mon':
2932 svc = 'ceph-mon'
2933 elif daemon_type in ['mgr', 'mds', 'osd']:
2934 svc = 'ceph'
2935 elif daemon_type == NFSGanesha.daemon_type:
2936 svc = 'nfs'
2937 else:
2938 return
2939
2940 if not self.cmd:
2941 raise RuntimeError('command not defined')
2942
2943 out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--query-service', svc], verbosity=CallVerbosity.DEBUG)
2944 if ret:
2945 logger.info('Enabling firewalld service %s in current zone...' % svc)
2946 out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--add-service', svc])
2947 if ret:
2948 raise RuntimeError(
2949 'unable to add service %s to current zone: %s' % (svc, err))
2950 else:
2951 logger.debug('firewalld service %s is enabled in current zone' % svc)
2952
2953 def open_ports(self, fw_ports):
2954 # type: (List[int]) -> None
2955 if not self.available:
2956 logger.debug('Cannot open ports <%s>: firewalld.service is not available' % fw_ports)
2957 return
2958
2959 if not self.cmd:
2960 raise RuntimeError('command not defined')
2961
2962 for port in fw_ports:
2963 tcp_port = str(port) + '/tcp'
2964 out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--query-port', tcp_port], verbosity=CallVerbosity.DEBUG)
2965 if ret:
2966 logger.info('Enabling firewalld port %s in current zone...' % tcp_port)
2967 out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--add-port', tcp_port])
2968 if ret:
2969 raise RuntimeError('unable to add port %s to current zone: %s' %
2970 (tcp_port, err))
2971 else:
2972 logger.debug('firewalld port %s is enabled in current zone' % tcp_port)
2973
2974 def close_ports(self, fw_ports):
2975 # type: (List[int]) -> None
2976 if not self.available:
2977 logger.debug('Cannot close ports <%s>: firewalld.service is not available' % fw_ports)
2978 return
2979
2980 if not self.cmd:
2981 raise RuntimeError('command not defined')
2982
2983 for port in fw_ports:
2984 tcp_port = str(port) + '/tcp'
2985 out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--query-port', tcp_port], verbosity=CallVerbosity.DEBUG)
2986 if not ret:
2987 logger.info('Disabling port %s in current zone...' % tcp_port)
2988 out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--remove-port', tcp_port])
2989 if ret:
2990 raise RuntimeError('unable to remove port %s from current zone: %s' %
2991 (tcp_port, err))
2992 else:
2993 logger.info(f'Port {tcp_port} disabled')
2994 else:
2995 logger.info(f'firewalld port {tcp_port} already closed')
2996
2997 def apply_rules(self):
2998 # type: () -> None
2999 if not self.available:
3000 return
3001
3002 if not self.cmd:
3003 raise RuntimeError('command not defined')
3004
3005 call_throws(self.ctx, [self.cmd, '--reload'])
3006
3007
3008 def update_firewalld(ctx, daemon_type):
3009 # type: (CephadmContext, str) -> None
3010 firewall = Firewalld(ctx)
3011 firewall.enable_service_for(daemon_type)
3012 firewall.apply_rules()
3013
3014
3015 def install_sysctl(ctx: CephadmContext, fsid: str, daemon_type: str) -> None:
3016 """
3017 Set up sysctl settings
3018 """
3019 def _write(conf: Path, lines: List[str]) -> None:
3020 lines = [
3021 '# created by cephadm',
3022 '',
3023 *lines,
3024 '',
3025 ]
3026 with open(conf, 'w') as f:
3027 f.write('\n'.join(lines))
3028
3029 conf = Path(ctx.sysctl_dir).joinpath(f'90-ceph-{fsid}-{daemon_type}.conf')
3030 lines: Optional[List] = None
3031
3032 if daemon_type == 'osd':
3033 lines = OSD.get_sysctl_settings()
3034 elif daemon_type == 'haproxy':
3035 lines = HAproxy.get_sysctl_settings()
3036 elif daemon_type == 'keepalived':
3037 lines = Keepalived.get_sysctl_settings()
3038
3039 # apply the sysctl settings
3040 if lines:
3041 Path(ctx.sysctl_dir).mkdir(mode=0o755, exist_ok=True)
3042 _write(conf, lines)
3043 call_throws(ctx, ['sysctl', '--system'])
3044
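# A generated file looks like the sketch below (the actual settings
# come from the daemon class's get_sysctl_settings()):
#
# /usr/lib/sysctl.d/90-ceph-<fsid>-osd.conf:
#     # created by cephadm
#
#     <one sysctl line per entry returned by OSD.get_sysctl_settings()>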
3045
3046 def install_base_units(ctx, fsid):
3047 # type: (CephadmContext, str) -> None
3048 """
3049 Set up ceph.target and ceph-$fsid.target units.
3050 """
3051 # global unit
3052 existed = os.path.exists(ctx.unit_dir + '/ceph.target')
3053 with open(ctx.unit_dir + '/ceph.target.new', 'w') as f:
3054 f.write('[Unit]\n'
3055 'Description=All Ceph clusters and services\n'
3056 '\n'
3057 '[Install]\n'
3058 'WantedBy=multi-user.target\n')
3059 os.rename(ctx.unit_dir + '/ceph.target.new',
3060 ctx.unit_dir + '/ceph.target')
3061 if not existed:
3062 # we disable before enable in case a different ceph.target
3063 # (from the traditional package) is present; while newer
3064 # systemd is smart enough to disable the old
3065 # (/lib/systemd/...) and enable the new (/etc/systemd/...),
3066 # some older versions of systemd error out with EEXIST.
3067 call_throws(ctx, ['systemctl', 'disable', 'ceph.target'])
3068 call_throws(ctx, ['systemctl', 'enable', 'ceph.target'])
3069 call_throws(ctx, ['systemctl', 'start', 'ceph.target'])
3070
3071 # cluster unit
3072 existed = os.path.exists(ctx.unit_dir + '/ceph-%s.target' % fsid)
3073 with open(ctx.unit_dir + '/ceph-%s.target.new' % fsid, 'w') as f:
3074 f.write(
3075 '[Unit]\n'
3076 'Description=Ceph cluster {fsid}\n'
3077 'PartOf=ceph.target\n'
3078 'Before=ceph.target\n'
3079 '\n'
3080 '[Install]\n'
3081 'WantedBy=multi-user.target ceph.target\n'.format(
3082 fsid=fsid)
3083 )
3084 os.rename(ctx.unit_dir + '/ceph-%s.target.new' % fsid,
3085 ctx.unit_dir + '/ceph-%s.target' % fsid)
3086 if not existed:
3087 call_throws(ctx, ['systemctl', 'enable', 'ceph-%s.target' % fsid])
3088 call_throws(ctx, ['systemctl', 'start', 'ceph-%s.target' % fsid])
3089
3090 # logrotate for the cluster
3091 with open(ctx.logrotate_dir + '/ceph-%s' % fsid, 'w') as f:
3092 """
3093 This is a bit sloppy in that the killall/pkill will touch all ceph daemons
3094 in all containers, but I don't see an elegant way to send SIGHUP *just* to
3095 the daemons for this cluster. (1) systemd kill -s will get the signal to
3096 podman, but podman will exit. (2) podman kill will get the signal to the
3097 first child (bash), but that isn't the ceph daemon. This is simpler and
3098 should be harmless.
3099 """
3100 f.write("""# created by cephadm
3101 /var/log/ceph/%s/*.log {
3102 rotate 7
3103 daily
3104 compress
3105 sharedscripts
3106 postrotate
3107 killall -q -1 ceph-mon ceph-mgr ceph-mds ceph-osd ceph-fuse radosgw rbd-mirror cephfs-mirror || pkill -1 -x 'ceph-mon|ceph-mgr|ceph-mds|ceph-osd|ceph-fuse|radosgw|rbd-mirror|cephfs-mirror' || true
3108 endscript
3109 missingok
3110 notifempty
3111 su root root
3112 }
3113 """ % fsid)
3114
3115
3116 def get_unit_file(ctx, fsid):
3117 # type: (CephadmContext, str) -> str
3118 extra_args = ''
3119 if isinstance(ctx.container_engine, Podman):
3120 extra_args = ('ExecStartPre=-/bin/rm -f %t/%n-pid %t/%n-cid\n'
3121 'ExecStopPost=-/bin/rm -f %t/%n-pid %t/%n-cid\n'
3122 'Type=forking\n'
3123 'PIDFile=%t/%n-pid\n')
3124 if ctx.container_engine.version >= CGROUPS_SPLIT_PODMAN_VERSION:
3125 extra_args += 'Delegate=yes\n'
3126
3127 docker = isinstance(ctx.container_engine, Docker)
3128 u = """# generated by cephadm
3129 [Unit]
3130 Description=Ceph %i for {fsid}
3131
3132 # According to:
3133 # http://www.freedesktop.org/wiki/Software/systemd/NetworkTarget
3134 # these can be removed once ceph-mon will dynamically change network
3135 # configuration.
3136 After=network-online.target local-fs.target time-sync.target{docker_after}
3137 Wants=network-online.target local-fs.target time-sync.target
3138 {docker_requires}
3139
3140 PartOf=ceph-{fsid}.target
3141 Before=ceph-{fsid}.target
3142
3143 [Service]
3144 LimitNOFILE=1048576
3145 LimitNPROC=1048576
3146 EnvironmentFile=-/etc/environment
3147 ExecStart=/bin/bash {data_dir}/{fsid}/%i/unit.run
3148 ExecStop=-/bin/bash -c '{container_path} stop ceph-{fsid}-%i ; bash {data_dir}/{fsid}/%i/unit.stop'
3149 ExecStopPost=-/bin/bash {data_dir}/{fsid}/%i/unit.poststop
3150 KillMode=none
3151 Restart=on-failure
3152 RestartSec=10s
3153 TimeoutStartSec=120
3154 TimeoutStopSec=120
3155 StartLimitInterval=30min
3156 StartLimitBurst=5
3157 {extra_args}
3158 [Install]
3159 WantedBy=ceph-{fsid}.target
3160 """.format(container_path=ctx.container_engine.path,
3161 fsid=fsid,
3162 data_dir=ctx.data_dir,
3163 extra_args=extra_args,
3164 # if docker, we depend on docker.service
3165 docker_after=' docker.service' if docker else '',
3166 docker_requires='Requires=docker.service\n' if docker else '')
3167
3168 return u
3169
3170 ##################################
3171
3172
3173 class CephContainer:
3174 def __init__(self,
3175 ctx: CephadmContext,
3176 image: str,
3177 entrypoint: str,
3178 args: List[str] = [],
3179 volume_mounts: Dict[str, str] = {},
3180 cname: str = '',
3181 container_args: List[str] = [],
3182 envs: Optional[List[str]] = None,
3183 privileged: bool = False,
3184 ptrace: bool = False,
3185 bind_mounts: Optional[List[List[str]]] = None,
3186 init: Optional[bool] = None,
3187 host_network: bool = True,
3188 memory_request: Optional[str] = None,
3189 memory_limit: Optional[str] = None,
3190 ) -> None:
3191 self.ctx = ctx
3192 self.image = image
3193 self.entrypoint = entrypoint
3194 self.args = args
3195 self.volume_mounts = volume_mounts
3196 self._cname = cname
3197 self.container_args = container_args
3198 self.envs = envs
3199 self.privileged = privileged
3200 self.ptrace = ptrace
3201 self.bind_mounts = bind_mounts if bind_mounts else []
3202 self.init = init if init else ctx.container_init
3203 self.host_network = host_network
3204 self.memory_request = memory_request
3205 self.memory_limit = memory_limit
3206
3207 @classmethod
3208 def for_daemon(cls,
3209 ctx: CephadmContext,
3210 fsid: str,
3211 daemon_type: str,
3212 daemon_id: str,
3213 entrypoint: str,
3214 args: List[str] = [],
3215 volume_mounts: Dict[str, str] = {},
3216 container_args: List[str] = [],
3217 envs: Optional[List[str]] = None,
3218 privileged: bool = False,
3219 ptrace: bool = False,
3220 bind_mounts: Optional[List[List[str]]] = None,
3221 init: Optional[bool] = None,
3222 host_network: bool = True,
3223 memory_request: Optional[str] = None,
3224 memory_limit: Optional[str] = None,
3225 ) -> 'CephContainer':
3226 return cls(
3227 ctx,
3228 image=ctx.image,
3229 entrypoint=entrypoint,
3230 args=args,
3231 volume_mounts=volume_mounts,
3232 cname='ceph-%s-%s.%s' % (fsid, daemon_type, daemon_id),
3233 container_args=container_args,
3234 envs=envs,
3235 privileged=privileged,
3236 ptrace=ptrace,
3237 bind_mounts=bind_mounts,
3238 init=init,
3239 host_network=host_network,
3240 memory_request=memory_request,
3241 memory_limit=memory_limit,
3242 )
3243
3244 @property
3245 def cname(self) -> str:
3246 """
3247 podman adds the current container name to the /etc/hosts
3248 file. It turns out that python's `socket.getfqdn()` differs
3249 from `hostname -f` when the container
3250 name contains dots:
3251
3252 # podman run --name foo.bar.baz.com ceph/ceph /bin/bash
3253 [root@sebastians-laptop /]# cat /etc/hosts
3254 127.0.0.1 localhost
3255 ::1 localhost
3256 127.0.1.1 sebastians-laptop foo.bar.baz.com
3257 [root@sebastians-laptop /]# hostname -f
3258 sebastians-laptop
3259 [root@sebastians-laptop /]# python3 -c 'import socket; print(socket.getfqdn())'
3260 foo.bar.baz.com
3261
3262 Fascinatingly, this doesn't happen when using dashes.
3263 """
3264 return self._cname.replace('.', '-')
3265
3266 @cname.setter
3267 def cname(self, val: str) -> None:
3268 self._cname = val
3269
3270 @property
3271 def old_cname(self) -> str:
3272 return self._cname
3273
3274 def run_cmd(self) -> List[str]:
3275 cmd_args: List[str] = [
3276 str(self.ctx.container_engine.path),
3277 'run',
3278 '--rm',
3279 '--ipc=host',
3280 # some containers (ahem, haproxy) override this, but we want a fast
3281 # shutdown always (and, more importantly, a successful exit even if we
3282 # fall back to SIGKILL).
3283 '--stop-signal=SIGTERM',
3284 ]
3285
3286 if isinstance(self.ctx.container_engine, Podman):
3287 if os.path.exists('/etc/ceph/podman-auth.json'):
3288 cmd_args.append('--authfile=/etc/ceph/podman-auth.json')
3289
3290 envs: List[str] = [
3291 '-e', 'CONTAINER_IMAGE=%s' % self.image,
3292 '-e', 'NODE_NAME=%s' % get_hostname(),
3293 ]
3294 vols: List[str] = []
3295 binds: List[str] = []
3296
3297 if self.memory_request:
3298 cmd_args.extend(['-e', 'POD_MEMORY_REQUEST=%s' % self.memory_request]) # pass as VAR=value, not a separate argv element
3299 if self.memory_limit:
3300 cmd_args.extend(['-e', 'POD_MEMORY_LIMIT=%s' % self.memory_limit])
3301 cmd_args.extend(['--memory', str(self.memory_limit)])
3302
3303 if self.host_network:
3304 cmd_args.append('--net=host')
3305 if self.entrypoint:
3306 cmd_args.extend(['--entrypoint', self.entrypoint])
3307 if self.privileged:
3308 cmd_args.extend([
3309 '--privileged',
3310 # let OSD etc read block devs that haven't been chowned
3311 '--group-add=disk'])
3312 if self.ptrace and not self.privileged:
3313 # if privileged, the SYS_PTRACE cap is already added
3314 # in addition, --cap-add and --privileged are mutually
3315 # exclusive since podman >= 2.0
3316 cmd_args.append('--cap-add=SYS_PTRACE')
3317 if self.init:
3318 cmd_args.append('--init')
3319 envs += ['-e', 'CEPH_USE_RANDOM_NONCE=1']
3320 if self.cname:
3321 cmd_args.extend(['--name', self.cname])
3322 if self.envs:
3323 for env in self.envs:
3324 envs.extend(['-e', env])
3325
3326 vols = sum(
3327 [['-v', '%s:%s' % (host_dir, container_dir)]
3328 for host_dir, container_dir in self.volume_mounts.items()], [])
3329 binds = sum([['--mount', '{}'.format(','.join(bind))]
3330 for bind in self.bind_mounts], [])
3331
3332 return \
3333 cmd_args + self.container_args + \
3334 envs + vols + binds + \
3335 [self.image] + self.args # type: ignore
3336
3337 def shell_cmd(self, cmd: List[str]) -> List[str]:
3338 cmd_args: List[str] = [
3339 str(self.ctx.container_engine.path),
3340 'run',
3341 '--rm',
3342 '--ipc=host',
3343 ]
3344 envs: List[str] = [
3345 '-e', 'CONTAINER_IMAGE=%s' % self.image,
3346 '-e', 'NODE_NAME=%s' % get_hostname(),
3347 ]
3348 vols: List[str] = []
3349 binds: List[str] = []
3350
3351 if self.host_network:
3352 cmd_args.append('--net=host')
3353 if self.ctx.no_hosts:
3354 cmd_args.append('--no-hosts')
3355 if self.privileged:
3356 cmd_args.extend([
3357 '--privileged',
3358 # let OSD etc read block devs that haven't been chowned
3359 '--group-add=disk',
3360 ])
3361 if self.init:
3362 cmd_args.append('--init')
3363 envs += ['-e', 'CEPH_USE_RANDOM_NONCE=1']
3364 if self.envs:
3365 for env in self.envs:
3366 envs.extend(['-e', env])
3367
3368 vols = sum(
3369 [['-v', '%s:%s' % (host_dir, container_dir)]
3370 for host_dir, container_dir in self.volume_mounts.items()], [])
3371 binds = sum([['--mount', '{}'.format(','.join(bind))]
3372 for bind in self.bind_mounts], [])
3373
3374 return cmd_args + self.container_args + envs + vols + binds + [
3375 '--entrypoint', cmd[0],
3376 self.image,
3377 ] + cmd[1:]
3378
3379 def exec_cmd(self, cmd):
3380 # type: (List[str]) -> List[str]
3381 cname = get_running_container_name(self.ctx, self)
3382 if not cname:
3383 raise Error('unable to find container "{}"'.format(self.cname))
3384 return [
3385 str(self.ctx.container_engine.path),
3386 'exec',
3387 ] + self.container_args + [
3388 self.cname,
3389 ] + cmd
3390
3391 def rm_cmd(self, old_cname: bool = False, storage: bool = False) -> List[str]:
3392 ret = [
3393 str(self.ctx.container_engine.path),
3394 'rm', '-f',
3395 ]
3396 if storage:
3397 ret.append('--storage')
3398 if old_cname:
3399 ret.append(self.old_cname)
3400 else:
3401 ret.append(self.cname)
3402 return ret
3403
3404 def stop_cmd(self, old_cname: bool = False) -> List[str]:
3405 ret = [
3406 str(self.ctx.container_engine.path),
3407 'stop', self.old_cname if old_cname else self.cname,
3408 ]
3409 return ret
3410
3411 def run(self, timeout=DEFAULT_TIMEOUT):
3412 # type: (Optional[int]) -> str
3413 out, _, _ = call_throws(self.ctx, self.run_cmd(),
3414 desc=self.entrypoint, timeout=timeout)
3415 return out
3416
3417 ##################################
3418
3419
3420 @infer_image
3421 def command_version(ctx):
3422 # type: (CephadmContext) -> int
3423 c = CephContainer(ctx, ctx.image, 'ceph', ['--version'])
3424 out, err, ret = call(ctx, c.run_cmd(), desc=c.entrypoint)
3425 if not ret:
3426 print(out.strip())
3427 return ret
3428
3429 ##################################
3430
3431
3432 @infer_image
3433 def command_pull(ctx):
3434 # type: (CephadmContext) -> int
3435
3436 _pull_image(ctx, ctx.image)
3437 return command_inspect_image(ctx)
3438
3439
3440 def _pull_image(ctx, image):
3441 # type: (CephadmContext, str) -> None
3442 logger.info('Pulling container image %s...' % image)
3443
3444 ignorelist = [
3445 'error creating read-write layer with ID',
3446 'net/http: TLS handshake timeout',
3447 'Digest did not match, expected',
3448 ]
3449
3450 cmd = [ctx.container_engine.path, 'pull', image]
3451 if isinstance(ctx.container_engine, Podman) and os.path.exists('/etc/ceph/podman-auth.json'):
3452 cmd.append('--authfile=/etc/ceph/podman-auth.json')
3453 cmd_str = ' '.join(cmd)
3454
3455 for sleep_secs in [1, 4, 25]:
3456 out, err, ret = call(ctx, cmd)
3457 if not ret:
3458 return
3459
3460 if not any(pattern in err for pattern in ignorelist):
3461 raise RuntimeError('Failed command: %s' % cmd_str)
3462
3463 logger.info('`%s` failed transiently; retrying in %s seconds...' % (cmd_str, sleep_secs))
3464 time.sleep(sleep_secs)
3465
3466 raise RuntimeError('Failed command: %s: maximum retries reached' % cmd_str)
3467
3468 ##################################
3469
3470
3471 @infer_image
3472 def command_inspect_image(ctx):
3473 # type: (CephadmContext) -> int
3474 out, err, ret = call_throws(ctx, [
3475 ctx.container_engine.path, 'inspect',
3476 '--format', '{{.ID}},{{.RepoDigests}}',
3477 ctx.image])
3478 if ret:
3479 return errno.ENOENT
3480 info_from = get_image_info_from_inspect(out.strip(), ctx.image)
3481
3482 ver = CephContainer(ctx, ctx.image, 'ceph', ['--version']).run().strip()
3483 info_from['ceph_version'] = ver
3484
3485 print(json.dumps(info_from, indent=4, sort_keys=True))
3486 return 0
3487
3488
3489 def normalize_image_digest(digest: str) -> str:
3490 # normal case:
3491 # ceph/ceph -> docker.io/ceph/ceph
3492 # edge cases that shouldn't ever come up:
3493 # ubuntu -> docker.io/ubuntu (ubuntu alias for library/ubuntu)
3494 # no change:
3495 # quay.ceph.io/ceph/ceph -> no change
3496 # docker.io/ubuntu -> no change
3497 bits = digest.split('/')
3498 if '.' not in bits[0] and len(bits) < 3:
3499 digest = DEFAULT_REGISTRY + '/' + digest
3500 return digest
3501
3502
3503 def get_image_info_from_inspect(out, image):
3504 # type: (str, str) -> Dict[str, Union[str,List[str]]]
3505 if not out:
3506 raise Error('inspect {}: empty result'.format(image))
3507 image_id, digests = out.split(',', 1)
3508 r = {
3509 'image_id': normalize_container_id(image_id)
3510 } # type: Dict[str, Union[str,List[str]]]
3511 if digests:
3512 r['repo_digests'] = list(map(normalize_image_digest, digests[1:-1].split(' ')))
3513 return r
3514
3515 ##################################
3516
3517
3518 def check_subnet(subnets: str) -> Tuple[int, List[int], str]:
3519 """Determine whether the given string is a valid subnet
3520
3521 :param subnets: subnet string, a single definition or comma separated list of CIDR subnets
3522 :returns: return code, list of the IP versions present in the subnets, and a msg describing any validation errors
3523 """
3524
3525 rc = 0
3526 versions = set()
3527 errors = []
3528 subnet_list = subnets.split(',')
3529 for subnet in subnet_list:
3530 # ensure the format of the string is as expected address/netmask
3531 if not re.search(r'\/\d+$', subnet):
3532 rc = 1
3533 errors.append(f'{subnet} is not in CIDR format (address/netmask)')
3534 continue
3535 try:
3536 v = ipaddress.ip_network(subnet).version
3537 versions.add(v)
3538 except ValueError as e:
3539 rc = 1
3540 errors.append(f'{subnet} invalid: {str(e)}')
3541
3542 return rc, list(versions), ', '.join(errors)
3543
3544
3545 def unwrap_ipv6(address):
3546 # type: (str) -> str
3547 if address.startswith('[') and address.endswith(']'):
3548 return address[1:-1]
3549 return address
3550
3551
3552 def wrap_ipv6(address):
3553 # type: (str) -> str
3554
3555 # We cannot assume the address is an unwrapped IPv6 address: if it is
3556 # already wrapped, or is a hostname, ip_address() raises ValueError and
3557 # we return the value unchanged.
3558 try:
3559 if ipaddress.ip_address(address).version == 6:
3560 return f'[{address}]'
3561 except ValueError:
3562 pass
3563
3564 return address
3565
3566
3567 def is_ipv6(address):
3568 # type: (str) -> bool
3569 address = unwrap_ipv6(address)
3570 try:
3571 return ipaddress.ip_address(address).version == 6
3572 except ValueError:
3573 logger.warning('Address: {} is not a valid IP address'.format(address))
3574 return False
3575
3576
3577 def prepare_mon_addresses(
3578 ctx: CephadmContext
3579 ) -> Tuple[str, bool, Optional[str]]:
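# Returns (mon addrv argument, ipv6 flag, inferred mon CIDR network or None),
# e.g. (illustrative) ('[v2:10.1.2.3:3300,v1:10.1.2.3:6789]', False, '10.1.2.0/24').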
3580 r = re.compile(r':(\d+)$')
3581 base_ip = ''
3582 ipv6 = False
3583
3584 if ctx.mon_ip:
3585 ipv6 = is_ipv6(ctx.mon_ip)
3586 if ipv6:
3587 ctx.mon_ip = wrap_ipv6(ctx.mon_ip)
3588 hasport = r.findall(ctx.mon_ip)
3589 if hasport:
3590 port_str = hasport[0]
3591 port = int(port_str)
3592 if port == 6789:
3593 addr_arg = '[v1:%s]' % ctx.mon_ip
3594 elif port == 3300:
3595 addr_arg = '[v2:%s]' % ctx.mon_ip
3596 else:
3597 logger.warning('Using msgr2 protocol for unrecognized port %d' %
3598 port)
3599 addr_arg = '[v2:%s]' % ctx.mon_ip
3600 base_ip = ctx.mon_ip[0:-(len(port_str)) - 1]
3601 check_ip_port(ctx, base_ip, port)
3602 else:
3603 base_ip = ctx.mon_ip
3604 addr_arg = '[v2:%s:3300,v1:%s:6789]' % (ctx.mon_ip, ctx.mon_ip)
3605 check_ip_port(ctx, ctx.mon_ip, 3300)
3606 check_ip_port(ctx, ctx.mon_ip, 6789)
3607 elif ctx.mon_addrv:
3608 addr_arg = ctx.mon_addrv
3609 if addr_arg[0] != '[' or addr_arg[-1] != ']':
3610 raise Error('--mon-addrv value %s must use square brackets' %
3611 addr_arg)
3612 ipv6 = addr_arg.count('[') > 1
3613 for addr in addr_arg[1:-1].split(','):
3614 hasport = r.findall(addr)
3615 if not hasport:
3616 raise Error('--mon-addrv value %s must include port number' %
3617 addr_arg)
3618 port_str = hasport[0]
3619 port = int(port_str)
3620 # strip off v1: or v2: prefix
3621 addr = re.sub(r'^v\d+:', '', addr)
3622 base_ip = addr[0:-(len(port_str)) - 1]
3623 check_ip_port(ctx, base_ip, port)
3624 else:
3625 raise Error('must specify --mon-ip or --mon-addrv')
3626 logger.debug('Base mon IP is %s, final addrv is %s' % (base_ip, addr_arg))
3627
3628 mon_network = None
3629 if not ctx.skip_mon_network:
3630 # make sure IP is configured locally, and then figure out the
3631 # CIDR network
3632 errmsg = f'Cannot infer CIDR network for mon IP `{base_ip}`'
3633 for net, ifaces in list_networks(ctx).items():
3634 ips: List[str] = []
3635 for iface, ls in ifaces.items():
3636 ips.extend(ls)
3637 try:
3638 if ipaddress.ip_address(unwrap_ipv6(base_ip)) in \
3639 [ipaddress.ip_address(ip) for ip in ips]:
3640 mon_network = net
3641 logger.info(f'Mon IP `{base_ip}` is in CIDR network `{mon_network}`')
3642 break
3643 except ValueError as e:
3644 logger.warning(f'{errmsg}: {e}')
3645 if not mon_network:
3646 raise Error(f'{errmsg}: pass --skip-mon-network to configure it later')
3647
3648 return (addr_arg, ipv6, mon_network)
3649
3650
3651 def prepare_cluster_network(ctx: CephadmContext) -> Tuple[str, bool]:
3652 cluster_network = ''
3653 ipv6_cluster_network = False
3654 # the cluster network may not exist on this node, so all we can do is
3655 # validate that the address given is valid ipv4 or ipv6 subnet
3656 if ctx.cluster_network:
3657 rc, versions, err_msg = check_subnet(ctx.cluster_network)
3658 if rc:
3659 raise Error(f'Invalid --cluster-network parameter: {err_msg}')
3660 cluster_network = ctx.cluster_network
3661 ipv6_cluster_network = 6 in versions
3662 else:
3663 logger.info('- internal network (--cluster-network) has not '
3664 'been provided, OSD replication will default to '
3665 'the public_network')
3666
3667 return cluster_network, ipv6_cluster_network
3668
3669
3670 def create_initial_keys(
3671 ctx: CephadmContext,
3672 uid: int, gid: int,
3673 mgr_id: str
3674 ) -> Tuple[str, str, str, Any, Any]: # type: ignore
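# Returns (mon_key, mgr_key, admin_key, bootstrap keyring tmpfile,
# admin keyring tmpfile); all three keys are generated with ceph-authtool --gen-print-key.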
3675
3676 _image = ctx.image
3677
3678 # create some initial keys
3679 logger.info('Creating initial keys...')
3680 mon_key = CephContainer(
3681 ctx,
3682 image=_image,
3683 entrypoint='/usr/bin/ceph-authtool',
3684 args=['--gen-print-key'],
3685 ).run().strip()
3686 admin_key = CephContainer(
3687 ctx,
3688 image=_image,
3689 entrypoint='/usr/bin/ceph-authtool',
3690 args=['--gen-print-key'],
3691 ).run().strip()
3692 mgr_key = CephContainer(
3693 ctx,
3694 image=_image,
3695 entrypoint='/usr/bin/ceph-authtool',
3696 args=['--gen-print-key'],
3697 ).run().strip()
3698
3699 keyring = ('[mon.]\n'
3700 '\tkey = %s\n'
3701 '\tcaps mon = allow *\n'
3702 '[client.admin]\n'
3703 '\tkey = %s\n'
3704 '\tcaps mon = allow *\n'
3705 '\tcaps mds = allow *\n'
3706 '\tcaps mgr = allow *\n'
3707 '\tcaps osd = allow *\n'
3708 '[mgr.%s]\n'
3709 '\tkey = %s\n'
3710 '\tcaps mon = profile mgr\n'
3711 '\tcaps mds = allow *\n'
3712 '\tcaps osd = allow *\n'
3713 % (mon_key, admin_key, mgr_id, mgr_key))
3714
3715 admin_keyring = write_tmp('[client.admin]\n'
3716 '\tkey = ' + admin_key + '\n',
3717 uid, gid)
3718
3719 # tmp keyring file
3720 bootstrap_keyring = write_tmp(keyring, uid, gid)
3721 return (mon_key, mgr_key, admin_key,
3722 bootstrap_keyring, admin_keyring)
3723
3724
3725 def create_initial_monmap(
3726 ctx: CephadmContext,
3727 uid: int, gid: int,
3728 fsid: str,
3729 mon_id: str, mon_addr: str
3730 ) -> Any:
3731 logger.info('Creating initial monmap...')
3732 monmap = write_tmp('', 0, 0)
3733 out = CephContainer(
3734 ctx,
3735 image=ctx.image,
3736 entrypoint='/usr/bin/monmaptool',
3737 args=[
3738 '--create',
3739 '--clobber',
3740 '--fsid', fsid,
3741 '--addv', mon_id, mon_addr,
3742 '/tmp/monmap'
3743 ],
3744 volume_mounts={
3745 monmap.name: '/tmp/monmap:z',
3746 },
3747 ).run()
3748 logger.debug(f'monmaptool for {mon_id} {mon_addr} returned: {out}')
3749
3750 # pass monmap file to ceph user for use by ceph-mon --mkfs below
3751 os.fchown(monmap.fileno(), uid, gid)
3752 return monmap
3753
3754
3755 def prepare_create_mon(
3756 ctx: CephadmContext,
3757 uid: int, gid: int,
3758 fsid: str, mon_id: str,
3759 bootstrap_keyring_path: str,
3760 monmap_path: str
3761 ) -> Tuple[str, str]:
3762 logger.info('Creating mon...')
3763 create_daemon_dirs(ctx, fsid, 'mon', mon_id, uid, gid)
3764 mon_dir = get_data_dir(fsid, ctx.data_dir, 'mon', mon_id)
3765 log_dir = get_log_dir(fsid, ctx.log_dir)
3766 out = CephContainer(
3767 ctx,
3768 image=ctx.image,
3769 entrypoint='/usr/bin/ceph-mon',
3770 args=[
3771 '--mkfs',
3772 '-i', mon_id,
3773 '--fsid', fsid,
3774 '-c', '/dev/null',
3775 '--monmap', '/tmp/monmap',
3776 '--keyring', '/tmp/keyring',
3777 ] + get_daemon_args(ctx, fsid, 'mon', mon_id),
3778 volume_mounts={
3779 log_dir: '/var/log/ceph:z',
3780 mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % (mon_id),
3781 bootstrap_keyring_path: '/tmp/keyring:z',
3782 monmap_path: '/tmp/monmap:z',
3783 },
3784 ).run()
3785 logger.debug(f'create mon.{mon_id} returned: {out}')
3786 return (mon_dir, log_dir)
3787
3788
3789 def create_mon(
3790 ctx: CephadmContext,
3791 uid: int, gid: int,
3792 fsid: str, mon_id: str
3793 ) -> None:
3794 mon_c = get_container(ctx, fsid, 'mon', mon_id)
3795 ctx.meta_json = json.dumps({'service_name': 'mon'})
3796 deploy_daemon(ctx, fsid, 'mon', mon_id, mon_c, uid, gid,
3797 config=None, keyring=None)
3798
3799
3800 def wait_for_mon(
3801 ctx: CephadmContext,
3802 mon_id: str, mon_dir: str,
3803 admin_keyring_path: str, config_path: str
3804 ) -> None:
3805 logger.info('Waiting for mon to start...')
3806 c = CephContainer(
3807 ctx,
3808 image=ctx.image,
3809 entrypoint='/usr/bin/ceph',
3810 args=[
3811 'status'],
3812 volume_mounts={
3813 mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % (mon_id),
3814 admin_keyring_path: '/etc/ceph/ceph.client.admin.keyring:z',
3815 config_path: '/etc/ceph/ceph.conf:z',
3816 },
3817 )
3818
3819 # wait for the service to become available
3820 def is_mon_available():
3821 # type: () -> bool
3822 timeout = ctx.timeout if ctx.timeout else 60 # seconds
3823 out, err, ret = call(ctx, c.run_cmd(),
3824 desc=c.entrypoint,
3825 timeout=timeout)
3826 return ret == 0
3827
3828 is_available(ctx, 'mon', is_mon_available)
3829
3830
3831 def create_mgr(
3832 ctx: CephadmContext,
3833 uid: int, gid: int,
3834 fsid: str, mgr_id: str, mgr_key: str,
3835 config: str, clifunc: Callable
3836 ) -> None:
3837 logger.info('Creating mgr...')
3838 mgr_keyring = '[mgr.%s]\n\tkey = %s\n' % (mgr_id, mgr_key)
3839 mgr_c = get_container(ctx, fsid, 'mgr', mgr_id)
3840 # Note: 9283, the default port served by the mgr prometheus module, is opened in the firewall
3841 ctx.meta_json = json.dumps({'service_name': 'mgr'})
3842 deploy_daemon(ctx, fsid, 'mgr', mgr_id, mgr_c, uid, gid,
3843 config=config, keyring=mgr_keyring, ports=[9283])
3844
3845 # wait for the service to become available
3846 logger.info('Waiting for mgr to start...')
3847
3848 def is_mgr_available():
3849 # type: () -> bool
3850 timeout = ctx.timeout if ctx.timeout else 60 # seconds
3851 try:
3852 out = clifunc(['status', '-f', 'json-pretty'], timeout=timeout)
3853 j = json.loads(out)
3854 return j.get('mgrmap', {}).get('available', False)
3855 except Exception as e:
3856 logger.debug('status failed: %s' % e)
3857 return False
3858 is_available(ctx, 'mgr', is_mgr_available)
3859
3860
3861 def prepare_ssh(
3862 ctx: CephadmContext,
3863 cli: Callable, wait_for_mgr_restart: Callable
3864 ) -> None:
3865
3866 cli(['cephadm', 'set-user', ctx.ssh_user])
3867
3868 if ctx.ssh_config:
3869 logger.info('Using provided ssh config...')
3870 mounts = {
3871 pathify(ctx.ssh_config.name): '/tmp/cephadm-ssh-config:z',
3872 }
3873 cli(['cephadm', 'set-ssh-config', '-i', '/tmp/cephadm-ssh-config'], extra_mounts=mounts)
3874
3875 if ctx.ssh_private_key and ctx.ssh_public_key:
3876 logger.info('Using provided ssh keys...')
3877 mounts = {
3878 pathify(ctx.ssh_private_key.name): '/tmp/cephadm-ssh-key:z',
3879 pathify(ctx.ssh_public_key.name): '/tmp/cephadm-ssh-key.pub:z'
3880 }
3881 cli(['cephadm', 'set-priv-key', '-i', '/tmp/cephadm-ssh-key'], extra_mounts=mounts)
3882 cli(['cephadm', 'set-pub-key', '-i', '/tmp/cephadm-ssh-key.pub'], extra_mounts=mounts)
3883 else:
3884 logger.info('Generating ssh key...')
3885 cli(['cephadm', 'generate-key'])
3886 ssh_pub = cli(['cephadm', 'get-pub-key'])
3887
3888 with open(ctx.output_pub_ssh_key, 'w') as f:
3889 f.write(ssh_pub)
3890 logger.info('Wrote public SSH key to %s' % ctx.output_pub_ssh_key)
3891
3892 logger.info('Adding key to %s@localhost authorized_keys...' % ctx.ssh_user)
3893 try:
3894 s_pwd = pwd.getpwnam(ctx.ssh_user)
3895 except KeyError:
3896 raise Error('Cannot find uid/gid for ssh-user: %s' % (ctx.ssh_user))
3897 ssh_uid = s_pwd.pw_uid
3898 ssh_gid = s_pwd.pw_gid
3899 ssh_dir = os.path.join(s_pwd.pw_dir, '.ssh')
3900
3901 if not os.path.exists(ssh_dir):
3902 makedirs(ssh_dir, ssh_uid, ssh_gid, 0o700)
3903
3904 auth_keys_file = '%s/authorized_keys' % ssh_dir
3905 add_newline = False
3906
3907 if os.path.exists(auth_keys_file):
3908 with open(auth_keys_file, 'r') as f:
3909 f.seek(0, os.SEEK_END)
3910 if f.tell() > 0:
3911 f.seek(f.tell() - 1, os.SEEK_SET) # go to last char
3912 if f.read() != '\n':
3913 add_newline = True
3914
3915 with open(auth_keys_file, 'a') as f:
3916 os.fchown(f.fileno(), ssh_uid, ssh_gid) # just in case we created it
3917 os.fchmod(f.fileno(), 0o600) # just in case we created it
3918 if add_newline:
3919 f.write('\n')
3920 f.write(ssh_pub.strip() + '\n')
3921
3922 host = get_hostname()
3923 logger.info('Adding host %s...' % host)
3924 try:
3925 args = ['orch', 'host', 'add', host]
3926 if ctx.mon_ip:
3927 args.append(unwrap_ipv6(ctx.mon_ip))
3928 cli(args)
3929 except RuntimeError as e:
3930 raise Error('Failed to add host <%s>: %s' % (host, e))
3931
3932 for t in ['mon', 'mgr']:
3933 if not ctx.orphan_initial_daemons:
3934 logger.info('Deploying %s service with default placement...' % t)
3935 cli(['orch', 'apply', t])
3936 else:
3937 logger.info('Deploying unmanaged %s service...' % t)
3938 cli(['orch', 'apply', t, '--unmanaged'])
3939
3940 if not ctx.orphan_initial_daemons:
3941 logger.info('Deploying crash service with default placement...')
3942 cli(['orch', 'apply', 'crash'])
3943
3944 if not ctx.skip_monitoring_stack:
3945 for t in ['prometheus', 'grafana', 'node-exporter', 'alertmanager']:
3946 logger.info('Deploying %s service with default placement...' % t)
3947 cli(['orch', 'apply', t])
3948
3949
3950 def enable_cephadm_mgr_module(
3951 cli: Callable, wait_for_mgr_restart: Callable
3952 ) -> None:
3953
3954 logger.info('Enabling cephadm module...')
3955 cli(['mgr', 'module', 'enable', 'cephadm'])
3956 wait_for_mgr_restart()
3957 logger.info('Setting orchestrator backend to cephadm...')
3958 cli(['orch', 'set', 'backend', 'cephadm'])
3959
3960
3961 def prepare_dashboard(
3962 ctx: CephadmContext,
3963 uid: int, gid: int,
3964 cli: Callable, wait_for_mgr_restart: Callable
3965 ) -> None:
3966
3967 # Configure SSL port (cephadm only allows configuring the dashboard SSL port).
3968 # If the user does not want to use SSL they can change this setting once the cluster is up.
3969 cli(['config', 'set', 'mgr', 'mgr/dashboard/ssl_server_port', str(ctx.ssl_dashboard_port)])
3970
3971 # configuring dashboard parameters
3972 logger.info('Enabling the dashboard module...')
3973 cli(['mgr', 'module', 'enable', 'dashboard'])
3974 wait_for_mgr_restart()
3975
3976 # dashboard crt and key
3977 if ctx.dashboard_key and ctx.dashboard_crt:
3978 logger.info('Using provided dashboard certificate...')
3979 mounts = {
3980 pathify(ctx.dashboard_crt.name): '/tmp/dashboard.crt:z',
3981 pathify(ctx.dashboard_key.name): '/tmp/dashboard.key:z'
3982 }
3983 cli(['dashboard', 'set-ssl-certificate', '-i', '/tmp/dashboard.crt'], extra_mounts=mounts)
3984 cli(['dashboard', 'set-ssl-certificate-key', '-i', '/tmp/dashboard.key'], extra_mounts=mounts)
3985 else:
3986 logger.info('Generating a dashboard self-signed certificate...')
3987 cli(['dashboard', 'create-self-signed-cert'])
3988
3989 logger.info('Creating initial admin user...')
3990 password = ctx.initial_dashboard_password or generate_password()
3991 tmp_password_file = write_tmp(password, uid, gid)
3992 cmd = ['dashboard', 'ac-user-create', ctx.initial_dashboard_user, '-i', '/tmp/dashboard.pw', 'administrator', '--force-password']
3993 if not ctx.dashboard_password_noupdate:
3994 cmd.append('--pwd-update-required')
3995 cli(cmd, extra_mounts={pathify(tmp_password_file.name): '/tmp/dashboard.pw:z'})
3996 logger.info('Fetching dashboard port number...')
3997 out = cli(['config', 'get', 'mgr', 'mgr/dashboard/ssl_server_port'])
3998 port = int(out)
3999
4000 # Open dashboard port
4001 fw = Firewalld(ctx)
4002 fw.open_ports([port])
4003 fw.apply_rules()
4004
4005 logger.info('Ceph Dashboard is now available at:\n\n'
4006 '\t URL: https://%s:%s/\n'
4007 '\t User: %s\n'
4008 '\tPassword: %s\n' % (
4009 get_fqdn(), port,
4010 ctx.initial_dashboard_user,
4011 password))
4012
4013
4014 def prepare_bootstrap_config(
4015 ctx: CephadmContext,
4016 fsid: str, mon_addr: str, image: str
4017
4018 ) -> str:
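# Assemble and return the bootstrap ceph.conf contents: fsid, mon_host and
# container_image in [global], plus any single-host and file-logging overrides.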
4019
4020 cp = read_config(ctx.config)
4021 if not cp.has_section('global'):
4022 cp.add_section('global')
4023 cp.set('global', 'fsid', fsid)
4024 cp.set('global', 'mon_host', mon_addr)
4025 cp.set('global', 'container_image', image)
4026
4027 if not cp.has_section('mon'):
4028 cp.add_section('mon')
4029 if (
4030 not cp.has_option('mon', 'auth_allow_insecure_global_id_reclaim')
4031 and not cp.has_option('mon', 'auth allow insecure global id reclaim')
4032 ):
4033 cp.set('mon', 'auth_allow_insecure_global_id_reclaim', 'false')
4034
4035 if ctx.single_host_defaults:
4036 logger.info('Adjusting default settings to suit single-host cluster...')
4037 # replicate across osds, not hosts
4038 if (
4039 not cp.has_option('global', 'osd_crush_choose_leaf_type')
4040 and not cp.has_option('global', 'osd crush choose leaf type')
4041 ):
4042 cp.set('global', 'osd_crush_choose_leaf_type', '0')
4043 # replica 2x
4044 if (
4045 not cp.has_option('global', 'osd_pool_default_size')
4046 and not cp.has_option('global', 'osd pool default size')
4047 ):
4048 cp.set('global', 'osd_pool_default_size', '2')
4049 # disable mgr standby modules (so we can colocate multiple mgrs on one host)
4050 if not cp.has_section('mgr'):
4051 cp.add_section('mgr')
4052 if (
4053 not cp.has_option('mgr', 'mgr_standby_modules')
4054 and not cp.has_option('mgr', 'mgr standby modules')
4055 ):
4056 cp.set('mgr', 'mgr_standby_modules', 'false')
4057 if ctx.log_to_file:
4058 cp.set('global', 'log_to_file', 'true')
4059 cp.set('global', 'log_to_stderr', 'false')
4060 cp.set('global', 'log_to_journald', 'false')
4061 cp.set('global', 'mon_cluster_log_to_file', 'true')
4062 cp.set('global', 'mon_cluster_log_to_stderr', 'false')
4063 cp.set('global', 'mon_cluster_log_to_journald', 'false')
4064
4065 cpf = StringIO()
4066 cp.write(cpf)
4067 config = cpf.getvalue()
4068
4069 if ctx.registry_json or ctx.registry_url:
4070 command_registry_login(ctx)
4071
4072 return config
4073
4074
4075 def finish_bootstrap_config(
4076 ctx: CephadmContext,
4077 fsid: str,
4078 config: str,
4079 mon_id: str, mon_dir: str,
4080 mon_network: Optional[str], ipv6: bool,
4081 cli: Callable,
4082 cluster_network: Optional[str], ipv6_cluster_network: bool
4083
4084 ) -> None:
4085 if not ctx.no_minimize_config:
4086 logger.info('Assimilating anything we can from ceph.conf...')
4087 cli([
4088 'config', 'assimilate-conf',
4089 '-i', '/var/lib/ceph/mon/ceph-%s/config' % mon_id
4090 ], {
4091 mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % mon_id
4092 })
4093 logger.info('Generating new minimal ceph.conf...')
4094 cli([
4095 'config', 'generate-minimal-conf',
4096 '-o', '/var/lib/ceph/mon/ceph-%s/config' % mon_id
4097 ], {
4098 mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % mon_id
4099 })
4100 # re-read our minimized config
4101 with open(mon_dir + '/config', 'r') as f:
4102 config = f.read()
4103 logger.info('Restarting the monitor...')
4104 call_throws(ctx, [
4105 'systemctl',
4106 'restart',
4107 get_unit_name(fsid, 'mon', mon_id)
4108 ])
4109
4110 if mon_network:
4111 logger.info(f'Setting mon public_network to {mon_network}')
4112 cli(['config', 'set', 'mon', 'public_network', mon_network])
4113
4114 if cluster_network:
4115 logger.info(f'Setting cluster_network to {cluster_network}')
4116 cli(['config', 'set', 'global', 'cluster_network', cluster_network])
4117
4118 if ipv6 or ipv6_cluster_network:
4119 logger.info('Enabling IPv6 (ms_bind_ipv6) binding')
4120 cli(['config', 'set', 'global', 'ms_bind_ipv6', 'true'])
4121
4122 with open(ctx.output_config, 'w') as f:
4123 f.write(config)
4124 logger.info('Wrote config to %s' % ctx.output_config)
4126
4127
4128 @default_image
4129 def command_bootstrap(ctx):
4130 # type: (CephadmContext) -> int
4131
4132 if not ctx.output_config:
4133 ctx.output_config = os.path.join(ctx.output_dir, 'ceph.conf')
4134 if not ctx.output_keyring:
4135 ctx.output_keyring = os.path.join(ctx.output_dir,
4136 'ceph.client.admin.keyring')
4137 if not ctx.output_pub_ssh_key:
4138 ctx.output_pub_ssh_key = os.path.join(ctx.output_dir, 'ceph.pub')
4139
4140 # verify output files
4141 for f in [ctx.output_config, ctx.output_keyring,
4142 ctx.output_pub_ssh_key]:
4143 if not ctx.allow_overwrite:
4144 if os.path.exists(f):
4145 raise Error('%s already exists; delete or pass '
4146 '--allow-overwrite to overwrite' % f)
4147 dirname = os.path.dirname(f)
4148 if dirname and not os.path.exists(dirname):
4149 fname = os.path.basename(f)
4150 logger.info(f'Creating directory {dirname} for {fname}')
4151 try:
4152 # use makedirs to create intermediate missing dirs
4153 os.makedirs(dirname, 0o755)
4154 except PermissionError:
4155 raise Error(f'Unable to create {dirname} due to permissions failure. Retry as root or with sudo, or pre-create the directory.')
4156
4157 (user_conf, _) = get_config_and_keyring(ctx)
4158
4159 if not ctx.skip_prepare_host:
4160 command_prepare_host(ctx)
4161 else:
4162 logger.info('Skip prepare_host')
4163
4164 # initial vars
4165 fsid = ctx.fsid or make_fsid()
4166 if not is_fsid(fsid):
4167 raise Error('not an fsid: %s' % fsid)
4168 logger.info('Cluster fsid: %s' % fsid)
4169
4170 hostname = get_hostname()
4171 if '.' in hostname and not ctx.allow_fqdn_hostname:
4172 raise Error('hostname is a fully qualified domain name (%s); either fix (e.g., "sudo hostname %s" or similar) or pass --allow-fqdn-hostname' % (hostname, hostname.split('.')[0]))
4173 mon_id = ctx.mon_id or hostname
4174 mgr_id = ctx.mgr_id or generate_service_id()
4175
4176 lock = FileLock(ctx, fsid)
4177 lock.acquire()
4178
4179 (addr_arg, ipv6, mon_network) = prepare_mon_addresses(ctx)
4180 cluster_network, ipv6_cluster_network = prepare_cluster_network(ctx)
4181
4182 config = prepare_bootstrap_config(ctx, fsid, addr_arg, ctx.image)
4183
4184 if not ctx.skip_pull:
4185 _pull_image(ctx, ctx.image)
4186
4187 image_ver = CephContainer(ctx, ctx.image, 'ceph', ['--version']).run().strip()
4188 logger.info(f'Ceph version: {image_ver}')
4189
4190 if not ctx.allow_mismatched_release:
4191 image_release = image_ver.split()[4]
4192 if image_release not in \
4193 [DEFAULT_IMAGE_RELEASE, LATEST_STABLE_RELEASE]:
4194 raise Error(
4195 f'Container release {image_release} != cephadm release {DEFAULT_IMAGE_RELEASE};'
4196 ' please use matching version of cephadm (pass --allow-mismatched-release to continue anyway)'
4197 )
4198
4199 logger.info('Extracting ceph user uid/gid from container image...')
4200 (uid, gid) = extract_uid_gid(ctx)
4201
4202 # create some initial keys
4203 (mon_key, mgr_key, admin_key, bootstrap_keyring, admin_keyring) = \
4204 create_initial_keys(ctx, uid, gid, mgr_id)
4205
4206 monmap = create_initial_monmap(ctx, uid, gid, fsid, mon_id, addr_arg)
4207 (mon_dir, log_dir) = \
4208 prepare_create_mon(ctx, uid, gid, fsid, mon_id,
4209 bootstrap_keyring.name, monmap.name)
4210
4211 with open(mon_dir + '/config', 'w') as f:
4212 os.fchown(f.fileno(), uid, gid)
4213 os.fchmod(f.fileno(), 0o600)
4214 f.write(config)
4215
4216 make_var_run(ctx, fsid, uid, gid)
4217 create_mon(ctx, uid, gid, fsid, mon_id)
4218
4219 # config to issue various CLI commands
4220 tmp_config = write_tmp(config, uid, gid)
4221
4222 # a CLI helper to reduce our typing
4223 def cli(cmd, extra_mounts={}, timeout=DEFAULT_TIMEOUT):
4224 # type: (List[str], Dict[str, str], Optional[int]) -> str
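# Run `ceph <cmd>` in a one-shot container with the bootstrap log dir,
# admin keyring and minimal config (plus any extra_mounts) mounted in.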
4225 mounts = {
4226 log_dir: '/var/log/ceph:z',
4227 admin_keyring.name: '/etc/ceph/ceph.client.admin.keyring:z',
4228 tmp_config.name: '/etc/ceph/ceph.conf:z',
4229 }
4230 for k, v in extra_mounts.items():
4231 mounts[k] = v
4232 timeout = timeout or ctx.timeout
4233 return CephContainer(
4234 ctx,
4235 image=ctx.image,
4236 entrypoint='/usr/bin/ceph',
4237 args=cmd,
4238 volume_mounts=mounts,
4239 ).run(timeout=timeout)
4240
4241 wait_for_mon(ctx, mon_id, mon_dir, admin_keyring.name, tmp_config.name)
4242
4243 finish_bootstrap_config(ctx, fsid, config, mon_id, mon_dir,
4244 mon_network, ipv6, cli,
4245 cluster_network, ipv6_cluster_network)
4246
4247 # output files
4248 with open(ctx.output_keyring, 'w') as f:
4249 os.fchmod(f.fileno(), 0o600)
4250 f.write('[client.admin]\n'
4251 '\tkey = ' + admin_key + '\n')
4252 logger.info('Wrote keyring to %s' % ctx.output_keyring)
4253
4254 # create mgr
4255 create_mgr(ctx, uid, gid, fsid, mgr_id, mgr_key, config, cli)
4256
4257 if user_conf:
4258 # user given config settings were already assimilated earlier
4259 # but if the given settings contained any attributes in
4260 # the mgr (e.g. mgr/cephadm/container_image_prometheus)
4261 # they don't seem to be stored if there isn't a mgr yet.
4262 # Since re-assimilating the same conf settings should be
4263 # idempotent we can just do it again here.
4264 with tempfile.NamedTemporaryFile(buffering=0) as tmp:
4265 tmp.write(user_conf.encode('utf-8'))
4266 cli(['config', 'assimilate-conf',
4267 '-i', '/var/lib/ceph/user.conf'],
4268 {tmp.name: '/var/lib/ceph/user.conf:z'})
4269
4270 # wait for mgr to restart (after enabling a module)
4271 def wait_for_mgr_restart() -> None:
4272 # first get latest mgrmap epoch from the mon. try newer 'mgr
4273 # stat' command first, then fall back to 'mgr dump' if
4274 # necessary
4275 try:
4276 j = json_loads_retry(lambda: cli(['mgr', 'stat']))
4277 except Exception:
4278 j = json_loads_retry(lambda: cli(['mgr', 'dump']))
4279 epoch = j['epoch']
4280
4281 # wait for mgr to have it
4282 logger.info('Waiting for the mgr to restart...')
4283
4284 def mgr_has_latest_epoch():
4285 # type: () -> bool
4286 try:
4287 out = cli(['tell', 'mgr', 'mgr_status'])
4288 j = json.loads(out)
4289 return j['mgrmap_epoch'] >= epoch
4290 except Exception as e:
4291 logger.debug('tell mgr mgr_status failed: %s' % e)
4292 return False
4293 is_available(ctx, 'mgr epoch %d' % epoch, mgr_has_latest_epoch)
4294
4295 enable_cephadm_mgr_module(cli, wait_for_mgr_restart)
4296
4297 # ssh
4298 if not ctx.skip_ssh:
4299 prepare_ssh(ctx, cli, wait_for_mgr_restart)
4300
4301 if ctx.registry_url and ctx.registry_username and ctx.registry_password:
4302 cli(['config', 'set', 'mgr', 'mgr/cephadm/registry_url', ctx.registry_url, '--force'])
4303 cli(['config', 'set', 'mgr', 'mgr/cephadm/registry_username', ctx.registry_username, '--force'])
4304 cli(['config', 'set', 'mgr', 'mgr/cephadm/registry_password', ctx.registry_password, '--force'])
4305
4306 cli(['config', 'set', 'mgr', 'mgr/cephadm/container_init', str(ctx.container_init), '--force'])
4307
4308 if ctx.with_exporter:
4309 cli(['config-key', 'set', 'mgr/cephadm/exporter_enabled', 'true'])
4310 if ctx.exporter_config:
4311 logger.info('Applying custom cephadm exporter settings')
4312 # validated within the parser, so we can just apply to the store
4313 with tempfile.NamedTemporaryFile(buffering=0) as tmp:
4314 tmp.write(json.dumps(ctx.exporter_config).encode('utf-8'))
4315 mounts = {
4316 tmp.name: '/tmp/exporter-config.json:z'
4317 }
4318 cli(['cephadm', 'set-exporter-config', '-i', '/tmp/exporter-config.json'], extra_mounts=mounts)
4319 logger.info('-> Use ceph orch apply cephadm-exporter to deploy')
4320 else:
4321 # generate a default SSL configuration for the exporter(s)
4322 logger.info('Generating a default cephadm exporter configuration (self-signed)')
4323 cli(['cephadm', 'generate-exporter-config'])
4324 #
4325 # deploy the exporter service
4326 logger.info('Deploying cephadm exporter service with default placement...')
4327 cli(['orch', 'apply', 'cephadm-exporter'])
4328
4329 if not ctx.skip_dashboard:
4330 prepare_dashboard(ctx, uid, gid, cli, wait_for_mgr_restart)
4331
4332 if ctx.output_config == '/etc/ceph/ceph.conf' and not ctx.skip_admin_label:
4333 logger.info('Enabling client.admin keyring and conf on hosts with "admin" label')
4334 try:
4335 cli(['orch', 'client-keyring', 'set', 'client.admin', 'label:_admin'])
4336 cli(['orch', 'host', 'label', 'add', get_hostname(), '_admin'])
4337 except Exception:
4338 logger.info('Unable to set up "admin" label; assuming older version of Ceph')
4339
4340 if ctx.apply_spec:
4341 logger.info('Applying %s to cluster' % ctx.apply_spec)
4342
4343 with open(ctx.apply_spec) as f:
4344 for line in f:
4345 if 'hostname:' in line:
4346 line = line.replace('\n', '')
4347 split = line.split(': ')
4348 if split[1] != hostname:
4349 logger.info('Adding ssh key to %s' % split[1])
4350
4351 ssh_key = '/etc/ceph/ceph.pub'
4352 if ctx.ssh_public_key:
4353 ssh_key = ctx.ssh_public_key.name
4354 out, err, code = call_throws(ctx, ['sudo', '-u', ctx.ssh_user, 'ssh-copy-id', '-f', '-i', ssh_key, '-o StrictHostKeyChecking=no', '%s@%s' % (ctx.ssh_user, split[1])])
4355
4356 mounts = {}
4357 mounts[pathify(ctx.apply_spec)] = '/tmp/spec.yml:z'
4358
4359 out = cli(['orch', 'apply', '-i', '/tmp/spec.yml'], extra_mounts=mounts)
4360 logger.info(out)
4361
4362 logger.info('You can access the Ceph CLI with:\n\n'
4363 '\tsudo %s shell --fsid %s -c %s -k %s\n' % (
4364 sys.argv[0],
4365 fsid,
4366 ctx.output_config,
4367 ctx.output_keyring))
4368 logger.info('Please consider enabling telemetry to help improve Ceph:\n\n'
4369 '\tceph telemetry on\n\n'
4370 'For more information see:\n\n'
4371 '\thttps://docs.ceph.com/docs/pacific/mgr/telemetry/\n')
4372 logger.info('Bootstrap complete.')
4373 return 0
4374
4375 ##################################
4376
4377
4378 def command_registry_login(ctx: CephadmContext) -> int:
4379 if ctx.registry_json:
4380 logger.info('Pulling custom registry login info from %s.' % ctx.registry_json)
4381 d = get_parm(ctx.registry_json)
4382 if d.get('url') and d.get('username') and d.get('password'):
4383 ctx.registry_url = d.get('url')
4384 ctx.registry_username = d.get('username')
4385 ctx.registry_password = d.get('password')
4386 registry_login(ctx, ctx.registry_url, ctx.registry_username, ctx.registry_password)
4387 else:
4388 raise Error('json provided for custom registry login did not include all necessary fields. '
4389 'Please set up the json file as\n'
4390 '{\n'
4391 ' "url": "REGISTRY_URL",\n'
4392 ' "username": "REGISTRY_USERNAME",\n'
4393 ' "password": "REGISTRY_PASSWORD"\n'
4394 '}\n')
4395 elif ctx.registry_url and ctx.registry_username and ctx.registry_password:
4396 registry_login(ctx, ctx.registry_url, ctx.registry_username, ctx.registry_password)
4397 else:
4398 raise Error('Invalid custom registry arguments received. To log in to a custom registry, include '
4399 '--registry-url, --registry-username and --registry-password '
4400 'options or --registry-json option')
4401 return 0
4402
4403
4404 def registry_login(ctx: CephadmContext, url: Optional[str], username: Optional[str], password: Optional[str]) -> None:
4405 logger.info('Logging into custom registry.')
4406 try:
4407 engine = ctx.container_engine
4408 cmd = [engine.path, 'login',
4409 '-u', username, '-p', password,
4410 url]
4411 if isinstance(engine, Podman):
4412 cmd.append('--authfile=/etc/ceph/podman-auth.json')
4413 out, _, _ = call_throws(ctx, cmd)
4414 if isinstance(engine, Podman):
4415 os.chmod('/etc/ceph/podman-auth.json', 0o600)
4416 except Exception:
4417 raise Error('Failed to login to custom registry @ %s as %s with given password' % (ctx.registry_url, ctx.registry_username))
4418
4419 ##################################
4420
4421
4422 def extract_uid_gid_monitoring(ctx, daemon_type):
4423 # type: (CephadmContext, str) -> Tuple[int, int]
4424
4425 if daemon_type == 'prometheus':
4426 uid, gid = extract_uid_gid(ctx, file_path='/etc/prometheus')
4427 elif daemon_type == 'node-exporter':
4428 uid, gid = 65534, 65534
4429 elif daemon_type == 'grafana':
4430 uid, gid = extract_uid_gid(ctx, file_path='/var/lib/grafana')
4431 elif daemon_type == 'alertmanager':
4432 uid, gid = extract_uid_gid(ctx, file_path=['/etc/alertmanager', '/etc/prometheus'])
4433 else:
4434 raise Error('{} not implemented yet'.format(daemon_type))
4435 return uid, gid
4436
4437
4438 @default_image
4439 def command_deploy(ctx):
4440 # type: (CephadmContext) -> None
4441 daemon_type, daemon_id = ctx.name.split('.', 1)
4442
4443 lock = FileLock(ctx, ctx.fsid)
4444 lock.acquire()
4445
4446 if daemon_type not in get_supported_daemons():
4447 raise Error('daemon type %s not recognized' % daemon_type)
4448
4449 redeploy = False
4450 unit_name = get_unit_name(ctx.fsid, daemon_type, daemon_id)
4451 (_, state, _) = check_unit(ctx, unit_name)
4452 if state == 'running' or is_container_running(ctx, CephContainer.for_daemon(ctx, ctx.fsid, daemon_type, daemon_id, 'bash')):
4453 redeploy = True
4454
4455 if ctx.reconfig:
4456 logger.info('%s daemon %s ...' % ('Reconfig', ctx.name))
4457 elif redeploy:
4458 logger.info('%s daemon %s ...' % ('Redeploy', ctx.name))
4459 else:
4460 logger.info('%s daemon %s ...' % ('Deploy', ctx.name))
4461
4462 # Get and check ports explicitly required to be opened
4463 daemon_ports = [] # type: List[int]
4464
4465 # only check port in use if not reconfig or redeploy since service
4466 # we are redeploying/reconfiguring will already be using the port
4467 if not ctx.reconfig and not redeploy:
4468 if ctx.tcp_ports:
4469 daemon_ports = list(map(int, ctx.tcp_ports.split()))
4470
4471 if daemon_type in Ceph.daemons:
4472 config, keyring = get_config_and_keyring(ctx)
4473 uid, gid = extract_uid_gid(ctx)
4474 make_var_run(ctx, ctx.fsid, uid, gid)
4475
4476 c = get_container(ctx, ctx.fsid, daemon_type, daemon_id,
4477 ptrace=ctx.allow_ptrace)
4478 deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
4479 config=config, keyring=keyring,
4480 osd_fsid=ctx.osd_fsid,
4481 reconfig=ctx.reconfig,
4482 ports=daemon_ports)
4483
4484 elif daemon_type in Monitoring.components:
4485 # monitoring daemon - prometheus, grafana, alertmanager, node-exporter
4486 # Default Checks
4487 # make sure provided config-json is sufficient
4488 config = get_parm(ctx.config_json) # type: ignore
4489 required_files = Monitoring.components[daemon_type].get('config-json-files', list())
4490 required_args = Monitoring.components[daemon_type].get('config-json-args', list())
4491 if required_files:
4492 if not config or not all(c in config.get('files', {}).keys() for c in required_files): # type: ignore
4493 raise Error('{} deployment requires config-json which must '
4494 'contain file content for {}'.format(daemon_type.capitalize(), ', '.join(required_files)))
4495 if required_args:
4496 if not config or not all(c in config.keys() for c in required_args): # type: ignore
4497 raise Error('{} deployment requires config-json which must '
4498 'contain arg for {}'.format(daemon_type.capitalize(), ', '.join(required_args)))
4499
4500 uid, gid = extract_uid_gid_monitoring(ctx, daemon_type)
4501 c = get_container(ctx, ctx.fsid, daemon_type, daemon_id)
4502 deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
4503 reconfig=ctx.reconfig,
4504 ports=daemon_ports)
4505
4506 elif daemon_type == NFSGanesha.daemon_type:
4507 if not ctx.reconfig and not redeploy and not daemon_ports:
4508 daemon_ports = list(NFSGanesha.port_map.values())
4509
4510 config, keyring = get_config_and_keyring(ctx)
4511 # TODO: extract ganesha uid/gid (997, 994) ?
4512 uid, gid = extract_uid_gid(ctx)
4513 c = get_container(ctx, ctx.fsid, daemon_type, daemon_id)
4514 deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
4515 config=config, keyring=keyring,
4516 reconfig=ctx.reconfig,
4517 ports=daemon_ports)
4518
4519 elif daemon_type == CephIscsi.daemon_type:
4520 config, keyring = get_config_and_keyring(ctx)
4521 uid, gid = extract_uid_gid(ctx)
4522 c = get_container(ctx, ctx.fsid, daemon_type, daemon_id)
4523 deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
4524 config=config, keyring=keyring,
4525 reconfig=ctx.reconfig,
4526 ports=daemon_ports)
4527
4528 elif daemon_type == HAproxy.daemon_type:
4529 haproxy = HAproxy.init(ctx, ctx.fsid, daemon_id)
4530 uid, gid = haproxy.extract_uid_gid_haproxy()
4531 c = get_container(ctx, ctx.fsid, daemon_type, daemon_id)
4532 deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
4533 reconfig=ctx.reconfig,
4534 ports=daemon_ports)
4535
4536 elif daemon_type == Keepalived.daemon_type:
4537 keepalived = Keepalived.init(ctx, ctx.fsid, daemon_id)
4538 uid, gid = keepalived.extract_uid_gid_keepalived()
4539 c = get_container(ctx, ctx.fsid, daemon_type, daemon_id)
4540 deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
4541 reconfig=ctx.reconfig,
4542 ports=daemon_ports)
4543
4544 elif daemon_type == CustomContainer.daemon_type:
4545 cc = CustomContainer.init(ctx, ctx.fsid, daemon_id)
4546 if not ctx.reconfig and not redeploy:
4547 daemon_ports.extend(cc.ports)
4548 c = get_container(ctx, ctx.fsid, daemon_type, daemon_id,
4549 privileged=cc.privileged,
4550 ptrace=ctx.allow_ptrace)
4551 deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c,
4552 uid=cc.uid, gid=cc.gid, config=None,
4553 keyring=None, reconfig=ctx.reconfig,
4554 ports=daemon_ports)
4555
4556 elif daemon_type == CephadmDaemon.daemon_type:
4557 # get current user gid and uid
4558 uid = os.getuid()
4559 gid = os.getgid()
4560 config_js = get_parm(ctx.config_json) # type: Dict[str, str]
4561 if not daemon_ports:
4562 logger.info('cephadm-exporter will use default port ({})'.format(CephadmDaemon.default_port))
4563 daemon_ports = [CephadmDaemon.default_port]
4564
4565 CephadmDaemon.validate_config(config_js)
4566
4567 deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, None,
4568 uid, gid, ports=daemon_ports)
4569
4570 else:
4571 raise Error('daemon type {} not implemented in command_deploy function'
4572 .format(daemon_type))
4573
4574 ##################################
4575
4576
4577 @infer_image
4578 def command_run(ctx):
4579 # type: (CephadmContext) -> int
4580 (daemon_type, daemon_id) = ctx.name.split('.', 1)
4581 c = get_container(ctx, ctx.fsid, daemon_type, daemon_id)
4582 command = c.run_cmd()
4583 return call_timeout(ctx, command, ctx.timeout)
4584
4585 ##################################
4586
4587
4588 @infer_fsid
4589 @infer_config
4590 @infer_image
4591 @validate_fsid
4592 def command_shell(ctx):
4593 # type: (CephadmContext) -> int
4594 cp = read_config(ctx.config)
4595 if cp.has_option('global', 'fsid') and \
4596 cp.get('global', 'fsid') != ctx.fsid:
4597 raise Error('fsid does not match ceph.conf')
4598
4599 if ctx.fsid:
4600 make_log_dir(ctx, ctx.fsid)
4601 if ctx.name:
4602 if '.' in ctx.name:
4603 (daemon_type, daemon_id) = ctx.name.split('.', 1)
4604 else:
4605 daemon_type = ctx.name
4606 daemon_id = None
4607 else:
4608 daemon_type = 'osd' # get the most mounts
4609 daemon_id = None
4610
4611 if daemon_id and not ctx.fsid:
4612 raise Error('must pass --fsid to specify cluster')
4613
4614 # use /etc/ceph files by default, if present. we do this instead of
4615 # making these defaults in the arg parser because we don't want an error
4616 # if they don't exist.
4617 if not ctx.keyring and os.path.exists(SHELL_DEFAULT_KEYRING):
4618 ctx.keyring = SHELL_DEFAULT_KEYRING
4619
4620 container_args: List[str] = ['-i']
4621 mounts = get_container_mounts(ctx, ctx.fsid, daemon_type, daemon_id,
4622 no_config=True if ctx.config else False)
4623 binds = get_container_binds(ctx, ctx.fsid, daemon_type, daemon_id)
4624 if ctx.config:
4625 mounts[pathify(ctx.config)] = '/etc/ceph/ceph.conf:z'
4626 if ctx.keyring:
4627 mounts[pathify(ctx.keyring)] = '/etc/ceph/ceph.keyring:z'
4628 if ctx.mount:
4629 for _mount in ctx.mount:
4630 split_src_dst = _mount.split(':')
4631 mount = pathify(split_src_dst[0])
4632 filename = os.path.basename(split_src_dst[0])
4633 if len(split_src_dst) > 1:
4634 dst = split_src_dst[1] + ':z' if len(split_src_dst) == 3 else split_src_dst[1]
4635 mounts[mount] = dst
4636 else:
4637 mounts[mount] = '/mnt/{}:z'.format(filename)
4638 if ctx.command:
4639 command = ctx.command
4640 else:
4641 command = ['bash']
4642 container_args += [
4643 '-t',
4644 '-e', 'LANG=C',
4645 '-e', 'PS1=%s' % CUSTOM_PS1,
4646 ]
4647 if ctx.fsid:
4648 home = os.path.join(ctx.data_dir, ctx.fsid, 'home')
4649 if not os.path.exists(home):
4650 logger.debug('Creating root home at %s' % home)
4651 makedirs(home, 0, 0, 0o660)
4652 if os.path.exists('/etc/skel'):
4653 for f in os.listdir('/etc/skel'):
4654 if f.startswith('.bash'):
4655 shutil.copyfile(os.path.join('/etc/skel', f),
4656 os.path.join(home, f))
4657 mounts[home] = '/root'
4658
4659 for i in ctx.volume:
4660 a, b = i.split(':', 1)
4661 mounts[a] = b
4662
4663 c = CephContainer(
4664 ctx,
4665 image=ctx.image,
4666 entrypoint='doesnotmatter',
4667 args=[],
4668 container_args=container_args,
4669 volume_mounts=mounts,
4670 bind_mounts=binds,
4671 envs=ctx.env,
4672 privileged=True)
4673 command = c.shell_cmd(command)
4674
4675 return call_timeout(ctx, command, ctx.timeout)
4676
4677 ##################################
4678
4679
4680 @infer_fsid
4681 def command_enter(ctx):
4682 # type: (CephadmContext) -> int
4683 if not ctx.fsid:
4684 raise Error('must pass --fsid to specify cluster')
4685 (daemon_type, daemon_id) = ctx.name.split('.', 1)
4686 container_args = ['-i'] # type: List[str]
4687 if ctx.command:
4688 command = ctx.command
4689 else:
4690 command = ['sh']
4691 container_args += [
4692 '-t',
4693 '-e', 'LANG=C',
4694 '-e', 'PS1=%s' % CUSTOM_PS1,
4695 ]
4696 c = CephContainer(
4697 ctx,
4698 image=ctx.image,
4699 entrypoint='doesnotmatter',
4700 container_args=container_args,
4701 cname='ceph-%s-%s.%s' % (ctx.fsid, daemon_type, daemon_id),
4702 )
4703 command = c.exec_cmd(command)
4704 return call_timeout(ctx, command, ctx.timeout)
4705
4706 ##################################
4707
4708
4709 @infer_fsid
4710 @infer_image
4711 @validate_fsid
4712 def command_ceph_volume(ctx):
4713 # type: (CephadmContext) -> None
4714 cp = read_config(ctx.config)
4715 if cp.has_option('global', 'fsid') and \
4716 cp.get('global', 'fsid') != ctx.fsid:
4717 raise Error('fsid does not match ceph.conf')
4718
4719 if ctx.fsid:
4720 make_log_dir(ctx, ctx.fsid)
4721
4722 lock = FileLock(ctx, ctx.fsid)
4723 lock.acquire()
4724
4725 (uid, gid) = (0, 0) # ceph-volume runs as root
4726 mounts = get_container_mounts(ctx, ctx.fsid, 'osd', None)
4727
4728 tmp_config = None
4729 tmp_keyring = None
4730
4731 (config, keyring) = get_config_and_keyring(ctx)
4732
4733 if config:
4734 # tmp config file
4735 tmp_config = write_tmp(config, uid, gid)
4736 mounts[tmp_config.name] = '/etc/ceph/ceph.conf:z'
4737
4738 if keyring:
4739 # tmp keyring file
4740 tmp_keyring = write_tmp(keyring, uid, gid)
4741 mounts[tmp_keyring.name] = '/var/lib/ceph/bootstrap-osd/ceph.keyring:z'
4742
4743 c = CephContainer(
4744 ctx,
4745 image=ctx.image,
4746 entrypoint='/usr/sbin/ceph-volume',
4747 envs=ctx.env,
4748 args=ctx.command,
4749 privileged=True,
4750 volume_mounts=mounts,
4751 )
4752
4753 out, err, code = call_throws(ctx, c.run_cmd())
4754 if not code:
4755 print(out)
4756
4757 ##################################
4758
4759
4760 @infer_fsid
4761 def command_unit(ctx):
4762 # type: (CephadmContext) -> None
4763 if not ctx.fsid:
4764 raise Error('must pass --fsid to specify cluster')
4765
4766 unit_name = get_unit_name_by_daemon_name(ctx, ctx.fsid, ctx.name)
4767
4768 call_throws(ctx, [
4769 'systemctl',
4770 ctx.command,
4771 unit_name],
4772 verbosity=CallVerbosity.VERBOSE,
4773 desc=''
4774 )
4775
4776 ##################################
4777
4778
4779 @infer_fsid
4780 def command_logs(ctx):
4781 # type: (CephadmContext) -> None
4782 if not ctx.fsid:
4783 raise Error('must pass --fsid to specify cluster')
4784
4785 unit_name = get_unit_name_by_daemon_name(ctx, ctx.fsid, ctx.name)
4786
4787 cmd = [find_program('journalctl')]
4788 cmd.extend(['-u', unit_name])
4789 if ctx.command:
4790 cmd.extend(ctx.command)
4791
4792 # call this directly, without our wrapper, so that we get an unmolested
4793 # stdout without any logger prefixing.
4794 logger.debug('Running command: %s' % ' '.join(cmd))
4795 subprocess.call(cmd, env=os.environ.copy()) # type: ignore
4796
4797 ##################################
4798
4799
4800 def list_networks(ctx):
4801 # type: (CephadmContext) -> Dict[str,Dict[str, Set[str]]]
4802
4803 # sadly, 18.04's iproute2 4.15.0-2ubun doesn't support the -j flag,
4804 # so we'll need to use a regex to parse 'ip' command output.
4805 #
4806 # out, _, _ = call_throws(['ip', '-j', 'route', 'ls'])
4807 # j = json.loads(out)
4808 # for x in j:
4809 res = _list_ipv4_networks(ctx)
4810 res.update(_list_ipv6_networks(ctx))
4811 return res
4812
4813
4814 def _list_ipv4_networks(ctx: CephadmContext) -> Dict[str, Dict[str, Set[str]]]:
4815 execstr: Optional[str] = find_executable('ip')
4816 if not execstr:
4817 raise FileNotFoundError("unable to find 'ip' command")
4818 out, _, _ = call_throws(ctx, [execstr, 'route', 'ls'])
4819 return _parse_ipv4_route(out)
4820
4821
4822 def _parse_ipv4_route(out: str) -> Dict[str, Dict[str, Set[str]]]:
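# Parse `ip route ls` output into {network: {interface: {ips}}}; e.g. the line
# '10.1.2.0/24 dev eth0 proto kernel scope link src 10.1.2.3' (illustrative)
# yields {'10.1.2.0/24': {'eth0': {'10.1.2.3'}}}.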
4823 r = {} # type: Dict[str, Dict[str, Set[str]]]
4824 p = re.compile(r'^(\S+) dev (\S+) (.*)scope link (.*)src (\S+)')
4825 for line in out.splitlines():
4826 m = p.findall(line)
4827 if not m:
4828 continue
4829 net = m[0][0]
4830 iface = m[0][1]
4831 ip = m[0][4]
4832 if net not in r:
4833 r[net] = {}
4834 if iface not in r[net]:
4835 r[net][iface] = set()
4836 r[net][iface].add(ip)
4837 return r
4838
4839
4840 def _list_ipv6_networks(ctx: CephadmContext) -> Dict[str, Dict[str, Set[str]]]:
4841 execstr: Optional[str] = find_executable('ip')
4842 if not execstr:
4843 raise FileNotFoundError("unable to find 'ip' command")
4844 routes, _, _ = call_throws(ctx, [execstr, '-6', 'route', 'ls'])
4845 ips, _, _ = call_throws(ctx, [execstr, '-6', 'addr', 'ls'])
4846 return _parse_ipv6_route(routes, ips)
4847
4848
4849 def _parse_ipv6_route(routes: str, ips: str) -> Dict[str, Dict[str, Set[str]]]:
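# Build {network: {interface: set()}} from `ip -6 route ls` output, then fill
# the sets with addresses from `ip -6 addr ls` that fall inside each network.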
4850 r = {} # type: Dict[str, Dict[str, Set[str]]]
4851 route_p = re.compile(r'^(\S+) dev (\S+) proto (\S+) metric (\S+) .*pref (\S+)$')
4852 ip_p = re.compile(r'^\s+inet6 (\S+)/(.*)scope (.*)$')
4853 iface_p = re.compile(r'^(\d+): (\S+): (.*)$')
4854 for line in routes.splitlines():
4855 m = route_p.findall(line)
4856 if not m or m[0][0].lower() == 'default':
4857 continue
4858 net = m[0][0]
4859 if '/' not in net: # only consider networks with a mask
4860 continue
4861 iface = m[0][1]
4862 if net not in r:
4863 r[net] = {}
4864 if iface not in r[net]:
4865 r[net][iface] = set()
4866
4867 iface = None
4868 for line in ips.splitlines():
4869 m = ip_p.findall(line)
4870 if not m:
4871 m = iface_p.findall(line)
4872 if m:
4873 # drop @... suffix, if present
4874 iface = m[0][1].split('@')[0]
4875 continue
4876 ip = m[0][0]
4877 # find the network it belongs to
4878 net = [n for n in r.keys()
4879 if ipaddress.ip_address(ip) in ipaddress.ip_network(n)]
4880 if net:
4881 assert iface
4882 r[net[0]][iface].add(ip)
4883
4884 return r
4885
4886
4887 def command_list_networks(ctx):
4888 # type: (CephadmContext) -> None
4889 r = list_networks(ctx)
4890
4891 def serialize_sets(obj: Any) -> Any:
4892 return list(obj) if isinstance(obj, set) else obj
4893
4894 print(json.dumps(r, indent=4, default=serialize_sets))
4895
4896 ##################################
4897
4898
4899 def command_ls(ctx):
4900 # type: (CephadmContext) -> None
4901 ls = list_daemons(ctx, detail=not ctx.no_detail,
4902 legacy_dir=ctx.legacy_dir)
4903 print(json.dumps(ls, indent=4))
4904
4905
4906 def with_units_to_int(v: str) -> int:
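# Convert a podman/docker size string to bytes using binary units,
# e.g. (illustrative) '456KiB' -> 466944 and '7.3MiB' -> 7654604.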
4907 if v.endswith('iB'):
4908 v = v[:-2]
4909 elif v.endswith('B'):
4910 v = v[:-1]
4911 mult = 1
4912 if v[-1].upper() == 'K':
4913 mult = 1024
4914 v = v[:-1]
4915 elif v[-1].upper() == 'M':
4916 mult = 1024 * 1024
4917 v = v[:-1]
4918 elif v[-1].upper() == 'G':
4919 mult = 1024 * 1024 * 1024
4920 v = v[:-1]
4921 elif v[-1].upper() == 'T':
4922 mult = 1024 * 1024 * 1024 * 1024
4923 v = v[:-1]
4924 return int(float(v) * mult)
4925
4926
4927 def list_daemons(ctx, detail=True, legacy_dir=None):
4928 # type: (CephadmContext, bool, Optional[str]) -> List[Dict[str, str]]
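# Enumerate daemons under data_dir: legacy '<cluster>-<id>' dirs and
# cephadm-style '<fsid>/<type>.<id>' dirs, returning one descriptive dict per
# daemon (with container id/image/version/memory detail when detail=True).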
4929 host_version: Optional[str] = None
4930 ls = []
4931 container_path = ctx.container_engine.path
4932
4933 data_dir = ctx.data_dir
4934 if legacy_dir is not None:
4935 data_dir = os.path.abspath(legacy_dir + data_dir)
4936
4937 # keep track of ceph versions we see
4938 seen_versions = {} # type: Dict[str, Optional[str]]
4939
4940 # keep track of image digests
4941 seen_digests = {} # type: Dict[str, List[str]]
4942
4943 # keep track of memory usage we've seen
4944 seen_memusage = {} # type: Dict[str, int]
4945 out, err, code = call(
4946 ctx,
4947 [container_path, 'stats', '--format', '{{.ID}},{{.MemUsage}}', '--no-stream'],
4948 verbosity=CallVerbosity.DEBUG
4949 )
4950 seen_memusage_cid_len, seen_memusage = _parse_mem_usage(code, out)
4951
4952 # /var/lib/ceph
4953 if os.path.exists(data_dir):
4954 for i in os.listdir(data_dir):
4955 if i in ['mon', 'osd', 'mds', 'mgr']:
4956 daemon_type = i
4957 for j in os.listdir(os.path.join(data_dir, i)):
4958 if '-' not in j:
4959 continue
4960 (cluster, daemon_id) = j.split('-', 1)
4961 fsid = get_legacy_daemon_fsid(ctx,
4962 cluster, daemon_type, daemon_id,
4963 legacy_dir=legacy_dir)
4964 legacy_unit_name = 'ceph-%s@%s' % (daemon_type, daemon_id)
4965 val: Dict[str, Any] = {
4966 'style': 'legacy',
4967 'name': '%s.%s' % (daemon_type, daemon_id),
4968 'fsid': fsid if fsid is not None else 'unknown',
4969 'systemd_unit': legacy_unit_name,
4970 }
4971 if detail:
4972 (val['enabled'], val['state'], _) = \
4973 check_unit(ctx, legacy_unit_name)
4974 if not host_version:
4975 try:
4976 out, err, code = call(ctx,
4977 ['ceph', '-v'],
4978 verbosity=CallVerbosity.DEBUG)
4979 if not code and out.startswith('ceph version '):
4980 host_version = out.split(' ')[2]
4981 except Exception:
4982 pass
4983 val['host_version'] = host_version
4984 ls.append(val)
4985 elif is_fsid(i):
4986 fsid = str(i) # convince mypy that fsid is a str here
4987 for j in os.listdir(os.path.join(data_dir, i)):
4988 if '.' in j and os.path.isdir(os.path.join(data_dir, fsid, j)):
4989 name = j
4990 (daemon_type, daemon_id) = j.split('.', 1)
4991 unit_name = get_unit_name(fsid,
4992 daemon_type,
4993 daemon_id)
4994 else:
4995 continue
4996 val = {
4997 'style': 'cephadm:v1',
4998 'name': name,
4999 'fsid': fsid,
5000 'systemd_unit': unit_name,
5001 }
5002 if detail:
5003 # get container id
5004 (val['enabled'], val['state'], _) = \
5005 check_unit(ctx, unit_name)
5006 container_id = None
5007 image_name = None
5008 image_id = None
5009 image_digests = None
5010 version = None
5011 start_stamp = None
5012
5013 out, err, code = get_container_stats(ctx, container_path, fsid, daemon_type, daemon_id)
5014 if not code:
5015 (container_id, image_name, image_id, start,
5016 version) = out.strip().split(',')
5017 image_id = normalize_container_id(image_id)
5018 daemon_type = name.split('.', 1)[0]
5019 start_stamp = try_convert_datetime(start)
5020
5021 # collect digests for this image id
5022 image_digests = seen_digests.get(image_id)
5023 if not image_digests:
5024 out, err, code = call(
5025 ctx,
5026 [
5027 container_path, 'image', 'inspect', image_id,
5028 '--format', '{{.RepoDigests}}',
5029 ],
5030 verbosity=CallVerbosity.DEBUG)
5031 if not code:
5032 image_digests = list(set(map(
5033 normalize_image_digest,
5034 out.strip()[1:-1].split(' '))))
5035 seen_digests[image_id] = image_digests
5036
5037 # identify software version inside the container (if we can)
5038 if not version or '.' not in version:
5039 version = seen_versions.get(image_id, None)
5040 if daemon_type == NFSGanesha.daemon_type:
5041 version = NFSGanesha.get_version(ctx, container_id)
5042 if daemon_type == CephIscsi.daemon_type:
5043 version = CephIscsi.get_version(ctx, container_id)
5044 elif not version:
5045 if daemon_type in Ceph.daemons:
5046 out, err, code = call(ctx,
5047 [container_path, 'exec', container_id,
5048 'ceph', '-v'],
5049 verbosity=CallVerbosity.DEBUG)
5050 if not code and \
5051 out.startswith('ceph version '):
5052 version = out.split(' ')[2]
5053 seen_versions[image_id] = version
5054 elif daemon_type == 'grafana':
5055 out, err, code = call(ctx,
5056 [container_path, 'exec', container_id,
5057 'grafana-server', '-v'],
5058 verbosity=CallVerbosity.DEBUG)
5059 if not code and \
5060 out.startswith('Version '):
5061 version = out.split(' ')[1]
5062 seen_versions[image_id] = version
5063 elif daemon_type in ['prometheus',
5064 'alertmanager',
5065 'node-exporter']:
5066 version = Monitoring.get_version(ctx, container_id, daemon_type)
5067 seen_versions[image_id] = version
5068 elif daemon_type == 'haproxy':
5069 out, err, code = call(ctx,
5070 [container_path, 'exec', container_id,
5071 'haproxy', '-v'],
5072 verbosity=CallVerbosity.DEBUG)
5073 if not code and \
5074 out.startswith('HA-Proxy version '):
5075 version = out.split(' ')[2]
5076 seen_versions[image_id] = version
5077 elif daemon_type == 'keepalived':
5078 out, err, code = call(ctx,
5079 [container_path, 'exec', container_id,
5080 'keepalived', '--version'],
5081 verbosity=CallVerbosity.DEBUG)
5082 if not code and \
5083 err.startswith('Keepalived '):
5084 version = err.split(' ')[1]
5085 if version[0] == 'v':
5086 version = version[1:]
5087 seen_versions[image_id] = version
5088 elif daemon_type == CustomContainer.daemon_type:
5089 # Because a custom container can contain
5090 # everything, we do not know which command
5091 # to execute to get the version.
5092 pass
5093 else:
5094 logger.warning('could not determine version for unknown daemon type %s' % daemon_type)
5095 else:
5096 vfile = os.path.join(data_dir, fsid, j, 'unit.image') # type: ignore
5097 try:
5098 with open(vfile, 'r') as f:
5099 image_name = f.read().strip() or None
5100 except IOError:
5101 pass
5102
5103 # unit.meta?
5104 mfile = os.path.join(data_dir, fsid, j, 'unit.meta') # type: ignore
5105 try:
5106 with open(mfile, 'r') as f:
5107 meta = json.loads(f.read())
5108 val.update(meta)
5109 except IOError:
5110 pass
5111
5112 val['container_id'] = container_id
5113 val['container_image_name'] = image_name
5114 val['container_image_id'] = image_id
5115 val['container_image_digests'] = image_digests
5116 if container_id:
5117 val['memory_usage'] = seen_memusage.get(container_id[0:seen_memusage_cid_len])
5118 val['version'] = version
5119 val['started'] = start_stamp
5120 val['created'] = get_file_timestamp(
5121 os.path.join(data_dir, fsid, j, 'unit.created')
5122 )
5123 val['deployed'] = get_file_timestamp(
5124 os.path.join(data_dir, fsid, j, 'unit.image'))
5125 val['configured'] = get_file_timestamp(
5126 os.path.join(data_dir, fsid, j, 'unit.configured'))
5127
5128 ls.append(val)
5129
5130 return ls
5131
5132
5133 def _parse_mem_usage(code: int, out: str) -> Tuple[int, Dict[str, int]]:
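# Parse `<engine> stats --format {{.ID}},{{.MemUsage}}` lines of the form
# '<cid>,<used> / <limit>' into {cid: used_bytes}, also returning the cid
# length so callers can match truncated container ids.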
5134 # keep track of memory usage we've seen
5135 seen_memusage = {} # type: Dict[str, int]
5136 seen_memusage_cid_len = 0
5137 if not code:
5138 for line in out.splitlines():
5139 (cid, usage) = line.split(',')
5140 (used, limit) = usage.split(' / ')
5141 try:
5142 seen_memusage[cid] = with_units_to_int(used)
5143 if not seen_memusage_cid_len:
5144 seen_memusage_cid_len = len(cid)
5145 except ValueError:
5146 logger.info('unable to parse memory usage line\n>{}'.format(line))
5148 return seen_memusage_cid_len, seen_memusage
5149
5150
5151 def get_daemon_description(ctx, fsid, name, detail=False, legacy_dir=None):
5152 # type: (CephadmContext, str, str, bool, Optional[str]) -> Dict[str, str]
5153
5154 for d in list_daemons(ctx, detail=detail, legacy_dir=legacy_dir):
5155 if d['fsid'] != fsid:
5156 continue
5157 if d['name'] != name:
5158 continue
5159 return d
5160 raise Error('Daemon not found: {}. See `cephadm ls`'.format(name))
5161
5162
5163 def get_container_stats(ctx: CephadmContext, container_path: str, fsid: str, daemon_type: str, daemon_id: str) -> Tuple[str, str, int]:
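# Inspect the daemon's container (trying the current name, then the legacy
# name) and return (out, err, code), where out on success is
# 'id,image,imageid,created,version'.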
5164 c = CephContainer.for_daemon(ctx, fsid, daemon_type, daemon_id, 'bash')
5165 out, err, code = '', '', -1
5166 for name in (c.cname, c.old_cname):
5167 cmd = [
5168 container_path, 'inspect',
5169 '--format', '{{.Id}},{{.Config.Image}},{{.Image}},{{.Created}},{{index .Config.Labels "io.ceph.version"}}',
5170 name
5171 ]
5172 out, err, code = call(ctx, cmd, verbosity=CallVerbosity.DEBUG)
5173 if not code:
5174 break
5175 return out, err, code
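
# Editor's note: an illustrative sketch (hypothetical, not used elsewhere) of
# how callers typically unpack the comma-separated `inspect` format above.
def _example_unpack_container_stats(out: str) -> Tuple[str, str, str, str, str]:
    # out: '<container-id>,<config-image>,<image-id>,<created>,<ceph-version-label>'
    container_id, image_name, image_id, created, version = out.strip().split(',', 4)
    return container_id, image_name, image_id, created, version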
5176
5177 ##################################
5178
5179
5180 @default_image
5181 def command_adopt(ctx):
5182 # type: (CephadmContext) -> None
5183
5184 if not ctx.skip_pull:
5185 _pull_image(ctx, ctx.image)
5186
5187 (daemon_type, daemon_id) = ctx.name.split('.', 1)
5188
5189 # legacy check
5190 if ctx.style != 'legacy':
5191 raise Error('adoption of style %s not implemented' % ctx.style)
5192
5193 # lock
5194 fsid = get_legacy_daemon_fsid(ctx,
5195 ctx.cluster,
5196 daemon_type,
5197 daemon_id,
5198 legacy_dir=ctx.legacy_dir)
5199 if not fsid:
5200 raise Error('could not detect legacy fsid; set fsid in ceph.conf')
5201 lock = FileLock(ctx, fsid)
5202 lock.acquire()
5203
5204 # call correct adoption
5205 if daemon_type in Ceph.daemons:
5206 command_adopt_ceph(ctx, daemon_type, daemon_id, fsid)
5207 elif daemon_type == 'prometheus':
5208 command_adopt_prometheus(ctx, daemon_id, fsid)
5209 elif daemon_type == 'grafana':
5210 command_adopt_grafana(ctx, daemon_id, fsid)
5211 elif daemon_type == 'node-exporter':
5212 raise Error('adoption of node-exporter not implemented')
5213 elif daemon_type == 'alertmanager':
5214 command_adopt_alertmanager(ctx, daemon_id, fsid)
5215 else:
5216 raise Error('daemon type %s not recognized' % daemon_type)
5217
5218
5219 class AdoptOsd(object):
5220 def __init__(self, ctx, osd_data_dir, osd_id):
5221 # type: (CephadmContext, str, str) -> None
5222 self.ctx = ctx
5223 self.osd_data_dir = osd_data_dir
5224 self.osd_id = osd_id
5225
5226 def check_online_osd(self):
5227 # type: () -> Tuple[Optional[str], Optional[str]]
5228
5229 osd_fsid, osd_type = None, None
5230
5231 path = os.path.join(self.osd_data_dir, 'fsid')
5232 try:
5233 with open(path, 'r') as f:
5234 osd_fsid = f.read().strip()
5235 logger.info('Found online OSD at %s' % path)
5236 except IOError:
5237 logger.info('Unable to read OSD fsid from %s' % path)
5238 if os.path.exists(os.path.join(self.osd_data_dir, 'type')):
5239 with open(os.path.join(self.osd_data_dir, 'type')) as f:
5240 osd_type = f.read().strip()
5241 else:
5242 logger.info('"type" file missing for OSD data dir')
5243
5244 return osd_fsid, osd_type
5245
5246 def check_offline_lvm_osd(self):
5247 # type: () -> Tuple[Optional[str], Optional[str]]
5248 osd_fsid, osd_type = None, None
5249
5250 c = CephContainer(
5251 self.ctx,
5252 image=self.ctx.image,
5253 entrypoint='/usr/sbin/ceph-volume',
5254 args=['lvm', 'list', '--format=json'],
5255 privileged=True
5256 )
5257 out, err, code = call_throws(self.ctx, c.run_cmd())
5258 if not code:
5259 try:
5260 js = json.loads(out)
5261 if self.osd_id in js:
5262 logger.info('Found offline LVM OSD {}'.format(self.osd_id))
5263 osd_fsid = js[self.osd_id][0]['tags']['ceph.osd_fsid']
5264 for device in js[self.osd_id]:
5265 if device['tags']['ceph.type'] == 'block':
5266 osd_type = 'bluestore'
5267 break
5268 if device['tags']['ceph.type'] == 'data':
5269 osd_type = 'filestore'
5270 break
5271 except ValueError as e:
5272 logger.info('Invalid JSON in ceph-volume lvm list: {}'.format(e))
5273
5274 return osd_fsid, osd_type
5275
5276 def check_offline_simple_osd(self):
5277 # type: () -> Tuple[Optional[str], Optional[str]]
5278 osd_fsid, osd_type = None, None
5279
5280 osd_file = glob('/etc/ceph/osd/{}-[a-f0-9-]*.json'.format(self.osd_id))
5281 if len(osd_file) == 1:
5282 with open(osd_file[0], 'r') as f:
5283 try:
5284 js = json.loads(f.read())
5285 logger.info('Found offline simple OSD {}'.format(self.osd_id))
5286 osd_fsid = js['fsid']
5287 osd_type = js['type']
5288 if osd_type != 'filestore':
5289 # need this to be mounted for the adopt to work, as it
5290 # needs to move files from this directory
5291 call_throws(self.ctx, ['mount', js['data']['path'], self.osd_data_dir])
5292 except ValueError as e:
5293 logger.info('Invalid JSON in {}: {}'.format(osd_file, e))
5294
5295 return osd_fsid, osd_type
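
# Editor's note: a hypothetical example (fsid invented) of the ceph-volume
# 'simple' scan file that check_offline_simple_osd() reads from
# /etc/ceph/osd/<id>-<fsid>.json; only the keys consumed above are shown.
_EXAMPLE_SIMPLE_OSD_SCAN = '''
{
    "fsid": "2f5b4ee2-8c8e-4a5e-9f2b-1f4c9a2b3c4d",
    "type": "bluestore",
    "data": {"path": "/dev/sdb1"}
}
'''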
5296
5297
5298 def command_adopt_ceph(ctx, daemon_type, daemon_id, fsid):
5299 # type: (CephadmContext, str, str, str) -> None
5300
5301 (uid, gid) = extract_uid_gid(ctx)
5302
5303 data_dir_src = ('/var/lib/ceph/%s/%s-%s' %
5304 (daemon_type, ctx.cluster, daemon_id))
5305 data_dir_src = os.path.abspath(ctx.legacy_dir + data_dir_src)
5306
5307 if not os.path.exists(data_dir_src):
5308 raise Error("{}.{} data directory '{}' does not exist. "
5309 'Incorrect ID specified, or daemon already adopted?'.format(
5310 daemon_type, daemon_id, data_dir_src))
5311
5312 osd_fsid = None
5313 if daemon_type == 'osd':
5314 adopt_osd = AdoptOsd(ctx, data_dir_src, daemon_id)
5315 osd_fsid, osd_type = adopt_osd.check_online_osd()
5316 if not osd_fsid:
5317 osd_fsid, osd_type = adopt_osd.check_offline_lvm_osd()
5318 if not osd_fsid:
5319 osd_fsid, osd_type = adopt_osd.check_offline_simple_osd()
5320 if not osd_fsid:
5321 raise Error('Unable to find OSD {}'.format(daemon_id))
5322 logger.info('objectstore_type is %s' % osd_type)
5323 assert osd_type
5324 if osd_type == 'filestore':
5325 raise Error('FileStore is not supported by cephadm')
5326
5327 # NOTE: implicit assumption here that the units correspond to the
5328 # cluster we are adopting based on the /etc/{defaults,sysconfig}/ceph
5329 # CLUSTER field.
5330 unit_name = 'ceph-%s@%s' % (daemon_type, daemon_id)
5331 (enabled, state, _) = check_unit(ctx, unit_name)
5332 if state == 'running':
5333 logger.info('Stopping old systemd unit %s...' % unit_name)
5334 call_throws(ctx, ['systemctl', 'stop', unit_name])
5335 if enabled:
5336 logger.info('Disabling old systemd unit %s...' % unit_name)
5337 call_throws(ctx, ['systemctl', 'disable', unit_name])
5338
5339 # data
5340 logger.info('Moving data...')
5341 data_dir_dst = make_data_dir(ctx, fsid, daemon_type, daemon_id,
5342 uid=uid, gid=gid)
5343 move_files(ctx, glob(os.path.join(data_dir_src, '*')),
5344 data_dir_dst,
5345 uid=uid, gid=gid)
5346 logger.debug('Remove dir `%s`' % (data_dir_src))
5347 if os.path.ismount(data_dir_src):
5348 call_throws(ctx, ['umount', data_dir_src])
5349 os.rmdir(data_dir_src)
5350
5351 logger.info('Chowning content...')
5352 call_throws(ctx, ['chown', '-c', '-R', '%d.%d' % (uid, gid), data_dir_dst])
5353
5354 if daemon_type == 'mon':
5355 # rename *.ldb -> *.sst, in case they are coming from ubuntu
5356 store = os.path.join(data_dir_dst, 'store.db')
5357 num_renamed = 0
5358 if os.path.exists(store):
5359 for oldf in os.listdir(store):
5360 if oldf.endswith('.ldb'):
5361 newf = oldf.replace('.ldb', '.sst')
5362 oldp = os.path.join(store, oldf)
5363 newp = os.path.join(store, newf)
5364 logger.debug('Renaming %s -> %s' % (oldp, newp))
5365 os.rename(oldp, newp)
     num_renamed += 1
5366 if num_renamed:
5367 logger.info('Renamed %d leveldb *.ldb files to *.sst',
5368 num_renamed)
5369 if daemon_type == 'osd':
5370 for n in ['block', 'block.db', 'block.wal']:
5371 p = os.path.join(data_dir_dst, n)
5372 if os.path.exists(p):
5373 logger.info('Chowning %s...' % p)
5374 os.chown(p, uid, gid)
5375 # disable the ceph-volume 'simple' mode files on the host
5376 simple_fn = os.path.join('/etc/ceph/osd',
5377 '%s-%s.json' % (daemon_id, osd_fsid))
5378 if os.path.exists(simple_fn):
5379 new_fn = simple_fn + '.adopted-by-cephadm'
5380 logger.info('Renaming %s -> %s', simple_fn, new_fn)
5381 os.rename(simple_fn, new_fn)
5382 logger.info('Disabling host ceph-volume@simple unit...')
5383 call(ctx, ['systemctl', 'disable',
5384 'ceph-volume@simple-%s-%s.service' % (daemon_id, osd_fsid)])
5385 else:
5386 # assume this is an 'lvm' c-v for now, but don't error
5387 # out if it's not.
5388 logger.info('Disabling host ceph-volume@lvm unit...')
5389 call(ctx, ['systemctl', 'disable',
5390 'ceph-volume@lvm-%s-%s.service' % (daemon_id, osd_fsid)])
5391
5392 # config
5393 config_src = '/etc/ceph/%s.conf' % (ctx.cluster)
5394 config_src = os.path.abspath(ctx.legacy_dir + config_src)
5395 config_dst = os.path.join(data_dir_dst, 'config')
5396 copy_files(ctx, [config_src], config_dst, uid=uid, gid=gid)
5397
5398 # logs
5399 logger.info('Moving logs...')
5400 log_dir_src = ('/var/log/ceph/%s-%s.%s.log*' %
5401 (ctx.cluster, daemon_type, daemon_id))
5402 log_dir_src = os.path.abspath(ctx.legacy_dir + log_dir_src)
5403 log_dir_dst = make_log_dir(ctx, fsid, uid=uid, gid=gid)
5404 move_files(ctx, glob(log_dir_src),
5405 log_dir_dst,
5406 uid=uid, gid=gid)
5407
5408 logger.info('Creating new units...')
5409 make_var_run(ctx, fsid, uid, gid)
5410 c = get_container(ctx, fsid, daemon_type, daemon_id)
5411 deploy_daemon_units(ctx, fsid, uid, gid, daemon_type, daemon_id, c,
5412 enable=True, # unconditionally enable the new unit
5413 start=(state == 'running' or ctx.force_start),
5414 osd_fsid=osd_fsid)
5415 update_firewalld(ctx, daemon_type)
5416
5417
5418 def command_adopt_prometheus(ctx, daemon_id, fsid):
5419 # type: (CephadmContext, str, str) -> None
5420 daemon_type = 'prometheus'
5421 (uid, gid) = extract_uid_gid_monitoring(ctx, daemon_type)
5422
5423 _stop_and_disable(ctx, 'prometheus')
5424
5425 data_dir_dst = make_data_dir(ctx, fsid, daemon_type, daemon_id,
5426 uid=uid, gid=gid)
5427
5428 # config
5429 config_src = '/etc/prometheus/prometheus.yml'
5430 config_src = os.path.abspath(ctx.legacy_dir + config_src)
5431 config_dst = os.path.join(data_dir_dst, 'etc/prometheus')
5432 makedirs(config_dst, uid, gid, 0o755)
5433 copy_files(ctx, [config_src], config_dst, uid=uid, gid=gid)
5434
5435 # data
5436 data_src = '/var/lib/prometheus/metrics/'
5437 data_src = os.path.abspath(ctx.legacy_dir + data_src)
5438 data_dst = os.path.join(data_dir_dst, 'data')
5439 copy_tree(ctx, [data_src], data_dst, uid=uid, gid=gid)
5440
5441 make_var_run(ctx, fsid, uid, gid)
5442 c = get_container(ctx, fsid, daemon_type, daemon_id)
5443 deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid)
5444 update_firewalld(ctx, daemon_type)
5445
5446
5447 def command_adopt_grafana(ctx, daemon_id, fsid):
5448 # type: (CephadmContext, str, str) -> None
5449
5450 daemon_type = 'grafana'
5451 (uid, gid) = extract_uid_gid_monitoring(ctx, daemon_type)
5452
5453 _stop_and_disable(ctx, 'grafana-server')
5454
5455 data_dir_dst = make_data_dir(ctx, fsid, daemon_type, daemon_id,
5456 uid=uid, gid=gid)
5457
5458 # config
5459 config_src = '/etc/grafana/grafana.ini'
5460 config_src = os.path.abspath(ctx.legacy_dir + config_src)
5461 config_dst = os.path.join(data_dir_dst, 'etc/grafana')
5462 makedirs(config_dst, uid, gid, 0o755)
5463 copy_files(ctx, [config_src], config_dst, uid=uid, gid=gid)
5464
5465 prov_src = '/etc/grafana/provisioning/'
5466 prov_src = os.path.abspath(ctx.legacy_dir + prov_src)
5467 prov_dst = os.path.join(data_dir_dst, 'etc/grafana')
5468 copy_tree(ctx, [prov_src], prov_dst, uid=uid, gid=gid)
5469
5470 # cert
5471 cert = '/etc/grafana/grafana.crt'
5472 key = '/etc/grafana/grafana.key'
5473 if os.path.exists(cert) and os.path.exists(key):
5474 cert_src = '/etc/grafana/grafana.crt'
5475 cert_src = os.path.abspath(ctx.legacy_dir + cert_src)
5476 makedirs(os.path.join(data_dir_dst, 'etc/grafana/certs'), uid, gid, 0o755)
5477 cert_dst = os.path.join(data_dir_dst, 'etc/grafana/certs/cert_file')
5478 copy_files(ctx, [cert_src], cert_dst, uid=uid, gid=gid)
5479
5480 key_src = '/etc/grafana/grafana.key'
5481 key_src = os.path.abspath(ctx.legacy_dir + key_src)
5482 key_dst = os.path.join(data_dir_dst, 'etc/grafana/certs/cert_key')
5483 copy_files(ctx, [key_src], key_dst, uid=uid, gid=gid)
5484
5485 _adjust_grafana_ini(os.path.join(config_dst, 'grafana.ini'))
5486 else:
5487 logger.debug('Skipping SSL setup, missing cert {} or key {}'.format(cert, key))
5488
5489 # data - possible custom dashboards/plugins
5490 data_src = '/var/lib/grafana/'
5491 data_src = os.path.abspath(ctx.legacy_dir + data_src)
5492 data_dst = os.path.join(data_dir_dst, 'data')
5493 copy_tree(ctx, [data_src], data_dst, uid=uid, gid=gid)
5494
5495 make_var_run(ctx, fsid, uid, gid)
5496 c = get_container(ctx, fsid, daemon_type, daemon_id)
5497 deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid)
5498 update_firewalld(ctx, daemon_type)
5499
5500
5501 def command_adopt_alertmanager(ctx, daemon_id, fsid):
5502 # type: (CephadmContext, str, str) -> None
5503
5504 daemon_type = 'alertmanager'
5505 (uid, gid) = extract_uid_gid_monitoring(ctx, daemon_type)
5506
5507 _stop_and_disable(ctx, 'prometheus-alertmanager')
5508
5509 data_dir_dst = make_data_dir(ctx, fsid, daemon_type, daemon_id,
5510 uid=uid, gid=gid)
5511
5512 # config
5513 config_src = '/etc/prometheus/alertmanager.yml'
5514 config_src = os.path.abspath(ctx.legacy_dir + config_src)
5515 config_dst = os.path.join(data_dir_dst, 'etc/alertmanager')
5516 makedirs(config_dst, uid, gid, 0o755)
5517 copy_files(ctx, [config_src], config_dst, uid=uid, gid=gid)
5518
5519 # data
5520 data_src = '/var/lib/prometheus/alertmanager/'
5521 data_src = os.path.abspath(ctx.legacy_dir + data_src)
5522 data_dst = os.path.join(data_dir_dst, 'etc/alertmanager/data')
5523 copy_tree(ctx, [data_src], data_dst, uid=uid, gid=gid)
5524
5525 make_var_run(ctx, fsid, uid, gid)
5526 c = get_container(ctx, fsid, daemon_type, daemon_id)
5527 deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid)
5528 update_firewalld(ctx, daemon_type)
5529
5530
5531 def _adjust_grafana_ini(filename):
5532 # type: (str) -> None
5533
5534 # Update cert_file, cert_key pathnames in server section
5535 # ConfigParser does not preserve comments
5536 try:
5537 with open(filename, 'r') as grafana_ini:
5538 lines = grafana_ini.readlines()
5539 with open('{}.new'.format(filename), 'w') as grafana_ini:
5540 server_section = False
5541 for line in lines:
5542 if line.startswith('['):
5543 server_section = False
5544 if line.startswith('[server]'):
5545 server_section = True
5546 if server_section:
5547 line = re.sub(r'^cert_file.*',
5548 'cert_file = /etc/grafana/certs/cert_file', line)
5549 line = re.sub(r'^cert_key.*',
5550 'cert_key = /etc/grafana/certs/cert_key', line)
5551 grafana_ini.write(line)
5552 os.rename('{}.new'.format(filename), filename)
5553 except OSError as err:
5554 raise Error('Cannot update {}: {}'.format(filename, err))
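
# Editor's note: illustrative only. Given a [server] section such as
#
#   [server]
#   cert_file = /etc/grafana/grafana.crt
#   cert_key = /etc/grafana/grafana.key
#
# _adjust_grafana_ini() rewrites just the two paths in place:
#
#   cert_file = /etc/grafana/certs/cert_file
#   cert_key = /etc/grafana/certs/cert_key
#
# Comments and all other sections pass through untouched, which is why the
# file is edited line by line instead of via ConfigParser.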
5555
5556
5557 def _stop_and_disable(ctx, unit_name):
5558 # type: (CephadmContext, str) -> None
5559
5560 (enabled, state, _) = check_unit(ctx, unit_name)
5561 if state == 'running':
5562 logger.info('Stopping old systemd unit %s...' % unit_name)
5563 call_throws(ctx, ['systemctl', 'stop', unit_name])
5564 if enabled:
5565 logger.info('Disabling old systemd unit %s...' % unit_name)
5566 call_throws(ctx, ['systemctl', 'disable', unit_name])
5567
5568 ##################################
5569
5570
5571 def command_rm_daemon(ctx):
5572 # type: (CephadmContext) -> None
5573 lock = FileLock(ctx, ctx.fsid)
5574 lock.acquire()
5575
5576 (daemon_type, daemon_id) = ctx.name.split('.', 1)
5577 unit_name = get_unit_name_by_daemon_name(ctx, ctx.fsid, ctx.name)
5578
5579 if daemon_type in ['mon', 'osd'] and not ctx.force:
5580 raise Error('must pass --force to proceed: '
5581 'this command may destroy precious data!')
5582
5583 call(ctx, ['systemctl', 'stop', unit_name],
5584 verbosity=CallVerbosity.DEBUG)
5585 call(ctx, ['systemctl', 'reset-failed', unit_name],
5586 verbosity=CallVerbosity.DEBUG)
5587 call(ctx, ['systemctl', 'disable', unit_name],
5588 verbosity=CallVerbosity.DEBUG)
5589 data_dir = get_data_dir(ctx.fsid, ctx.data_dir, daemon_type, daemon_id)
5590 if daemon_type in ['mon', 'osd', 'prometheus'] and \
5591 not ctx.force_delete_data:
5592 # rename it out of the way -- do not delete
5593 backup_dir = os.path.join(ctx.data_dir, ctx.fsid, 'removed')
5594 if not os.path.exists(backup_dir):
5595 makedirs(backup_dir, 0, 0, DATA_DIR_MODE)
5596 dirname = '%s.%s_%s' % (daemon_type, daemon_id,
5597 datetime.datetime.utcnow().strftime(DATEFMT))
5598 os.rename(data_dir,
5599 os.path.join(backup_dir, dirname))
5600 else:
5601 if daemon_type == CephadmDaemon.daemon_type:
5602 CephadmDaemon.uninstall(ctx, ctx.fsid, daemon_type, daemon_id)
5603 call_throws(ctx, ['rm', '-rf', data_dir])
5604
5605 ##################################
5606
5607
5608 def _zap(ctx: CephadmContext, what: str) -> None:
5609 mounts = get_container_mounts(ctx, ctx.fsid, 'clusterless-ceph-volume', None)
5610 c = CephContainer(
5611 ctx,
5612 image=ctx.image,
5613 entrypoint='/usr/sbin/ceph-volume',
5614 envs=ctx.env,
5615 args=['lvm', 'zap', '--destroy', what],
5616 privileged=True,
5617 volume_mounts=mounts,
5618 )
5619 logger.info(f'Zapping {what}...')
5620 out, err, code = call_throws(ctx, c.run_cmd())
5621
5622
5623 @infer_image
5624 def _zap_osds(ctx: CephadmContext) -> None:
5625 # assume fsid lock already held
5626
5627 # list
5628 mounts = get_container_mounts(ctx, ctx.fsid, 'clusterless-ceph-volume', None)
5629 c = CephContainer(
5630 ctx,
5631 image=ctx.image,
5632 entrypoint='/usr/sbin/ceph-volume',
5633 envs=ctx.env,
5634 args=['inventory', '--format', 'json'],
5635 privileged=True,
5636 volume_mounts=mounts,
5637 )
5638 out, err, code = call_throws(ctx, c.run_cmd())
5639 if code:
5640 raise Error('failed to list osd inventory')
5641 try:
5642 ls = json.loads(out)
5643 except ValueError as e:
5644 raise Error(f'Invalid JSON in ceph-volume inventory: {e}')
5645
5646 for i in ls:
5647 matches = [lv.get('cluster_fsid') == ctx.fsid for lv in i.get('lvs', [])]
5648 if any(matches) and all(matches):
5649 _zap(ctx, i.get('path'))
5650 elif any(matches):
5651 lv_names = [lv['name'] for lv in i.get('lvs', [])]
5652 # TODO: we need to map the lv_names back to device paths (the vg
5653 # id isn't part of the output here!)
5654 logger.warning(f'Not zapping LVs (not implemented): {lv_names}')
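
# Editor's note: a minimal sketch (hypothetical helper) of the match rule
# used above -- a device is zapped only when it has LVs and every one of
# them belongs to this cluster's fsid.
def _example_should_zap(lvs: List[Dict[str, Any]], fsid: str) -> bool:
    matches = [lv.get('cluster_fsid') == fsid for lv in lvs]
    return any(matches) and all(matches)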
5655
5656
5657 def command_zap_osds(ctx: CephadmContext) -> None:
5658 if not ctx.force:
5659 raise Error('must pass --force to proceed: '
5660 'this command may destroy precious data!')
5661
5662 lock = FileLock(ctx, ctx.fsid)
5663 lock.acquire()
5664
5665 _zap_osds(ctx)
5666
5667 ##################################
5668
5669
5670 def command_rm_cluster(ctx):
5671 # type: (CephadmContext) -> None
5672 if not ctx.force:
5673 raise Error('must pass --force to proceed: '
5674 'this command may destroy precious data!')
5675
5676 lock = FileLock(ctx, ctx.fsid)
5677 lock.acquire()
5678
5679 # stop + disable individual daemon units
5680 for d in list_daemons(ctx, detail=False):
5681 if d['fsid'] != ctx.fsid:
5682 continue
5683 if d['style'] != 'cephadm:v1':
5684 continue
5685 unit_name = get_unit_name(ctx.fsid, d['name'])
5686 call(ctx, ['systemctl', 'stop', unit_name],
5687 verbosity=CallVerbosity.DEBUG)
5688 call(ctx, ['systemctl', 'reset-failed', unit_name],
5689 verbosity=CallVerbosity.DEBUG)
5690 call(ctx, ['systemctl', 'disable', unit_name],
5691 verbosity=CallVerbosity.DEBUG)
5692
5693 # cluster units
5694 for unit_name in ['ceph-%s.target' % ctx.fsid]:
5695 call(ctx, ['systemctl', 'stop', unit_name],
5696 verbosity=CallVerbosity.DEBUG)
5697 call(ctx, ['systemctl', 'reset-failed', unit_name],
5698 verbosity=CallVerbosity.DEBUG)
5699 call(ctx, ['systemctl', 'disable', unit_name],
5700 verbosity=CallVerbosity.DEBUG)
5701
5702 slice_name = 'system-ceph\\x2d{}.slice'.format(ctx.fsid.replace('-', '\\x2d'))
5703 call(ctx, ['systemctl', 'stop', slice_name],
5704 verbosity=CallVerbosity.DEBUG)
5705
5706 # osds?
5707 if ctx.zap_osds:
5708 _zap_osds(ctx)
5709
5710 # rm units
5711 call_throws(ctx, ['rm', '-f', ctx.unit_dir
5712 + '/ceph-%s@.service' % ctx.fsid])
5713 call_throws(ctx, ['rm', '-f', ctx.unit_dir
5714 + '/ceph-%s.target' % ctx.fsid])
5715 call_throws(ctx, ['rm', '-rf',
5716 ctx.unit_dir + '/ceph-%s.target.wants' % ctx.fsid])
5717 # rm data
5718 call_throws(ctx, ['rm', '-rf', ctx.data_dir + '/' + ctx.fsid])
5719
5720 if not ctx.keep_logs:
5721 # rm logs
5722 call_throws(ctx, ['rm', '-rf', ctx.log_dir + '/' + ctx.fsid])
5723 call_throws(ctx, ['rm', '-rf', ctx.log_dir
5724 + '/*.wants/ceph-%s@*' % ctx.fsid])
5725
5726 # rm logrotate config
5727 call_throws(ctx, ['rm', '-f', ctx.logrotate_dir + '/ceph-%s' % ctx.fsid])
5728
5729 # rm cephadm logrotate config if last cluster on host
5730 if not os.listdir(ctx.data_dir):
5731 call_throws(ctx, ['rm', '-f', ctx.logrotate_dir + '/cephadm'])
5732
5733 # rm sysctl settings
5734 sysctl_dir = Path(ctx.sysctl_dir)
5735 for p in sysctl_dir.glob(f'90-ceph-{ctx.fsid}-*.conf'):
5736 p.unlink()
5737
5738 # clean up config, keyring, and pub key files
5739 files = ['/etc/ceph/ceph.conf', '/etc/ceph/ceph.pub', '/etc/ceph/ceph.client.admin.keyring']
5740
5741 if os.path.exists(files[0]):
5742 valid_fsid = False
5743 with open(files[0]) as f:
5744 if ctx.fsid in f.read():
5745 valid_fsid = True
5746 if valid_fsid:
5747 for n in range(0, len(files)):
5748 if os.path.exists(files[n]):
5749 os.remove(files[n])
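
# Editor's note: a hypothetical illustration of the '\\x2d' escaping used in
# command_rm_cluster() above; systemd encodes '-' inside a unit/slice name
# component as '\x2d', so the fsid's dashes must be escaped the same way.
def _example_ceph_slice_name(fsid: str) -> str:
    # returns the per-cluster slice name with every dash in the fsid escaped
    return 'system-ceph\\x2d{}.slice'.format(fsid.replace('-', '\\x2d'))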
5750
5751
5752 ##################################
5753
5754
5755 def check_time_sync(ctx, enabler=None):
5756 # type: (CephadmContext, Optional[Packager]) -> bool
5757 units = [
5758 'chrony.service', # 18.04 (at least)
5759 'chronyd.service', # el / opensuse
5760 'systemd-timesyncd.service',
5761 'ntpd.service', # el7 (at least)
5762 'ntp.service', # 18.04 (at least)
5763 'ntpsec.service', # 20.04 (at least) / buster
5764 'openntpd.service', # ubuntu / debian
5765 ]
5766 if not check_units(ctx, units, enabler):
5767 logger.warning('No time sync service is running; checked for %s' % units)
5768 return False
5769 return True
5770
5771
5772 def command_check_host(ctx: CephadmContext) -> None:
5773 container_path = ctx.container_engine.path
5774
5775 errors = []
5776 commands = ['systemctl', 'lvcreate']
5777
5778 try:
5779 check_container_engine(ctx)
5780 logger.info('podman|docker (%s) is present' % container_path)
5781 except Error as e:
5782 errors.append(str(e))
5783
5784 for command in commands:
5785 try:
5786 find_program(command)
5787 logger.info('%s is present' % command)
5788 except ValueError:
5789 errors.append('%s binary does not appear to be installed' % command)
5790
5791 # check for configured+running chronyd or ntp
5792 if not check_time_sync(ctx):
5793 errors.append('No time synchronization is active')
5794
5795 if 'expect_hostname' in ctx and ctx.expect_hostname:
5796 if get_hostname().lower() != ctx.expect_hostname.lower():
5797 errors.append('hostname "%s" does not match expected hostname "%s"' % (
5798 get_hostname(), ctx.expect_hostname))
     else:
5799 logger.info('Hostname "%s" matches what is expected.',
5800 ctx.expect_hostname)
5801
5802 if errors:
5803 raise Error('\nERROR: '.join(errors))
5804
5805 logger.info('Host looks OK')
5806
5807 ##################################
5808
5809
5810 def command_prepare_host(ctx: CephadmContext) -> None:
5811 logger.info('Verifying podman|docker is present...')
5812 pkg = None
5813 try:
5814 check_container_engine(ctx)
5815 except Error as e:
5816 logger.warning(str(e))
5817 if not pkg:
5818 pkg = create_packager(ctx)
5819 pkg.install_podman()
5820
5821 logger.info('Verifying lvm2 is present...')
5822 if not find_executable('lvcreate'):
5823 if not pkg:
5824 pkg = create_packager(ctx)
5825 pkg.install(['lvm2'])
5826
5827 logger.info('Verifying time synchronization is in place...')
5828 if not check_time_sync(ctx):
5829 if not pkg:
5830 pkg = create_packager(ctx)
5831 pkg.install(['chrony'])
5832 # check again, and this time try to enable
5833 # the service
5834 check_time_sync(ctx, enabler=pkg)
5835
5836 if 'expect_hostname' in ctx and ctx.expect_hostname and ctx.expect_hostname != get_hostname():
5837 logger.warning('Adjusting hostname from %s -> %s...' % (get_hostname(), ctx.expect_hostname))
5838 call_throws(ctx, ['hostname', ctx.expect_hostname])
5839 with open('/etc/hostname', 'w') as f:
5840 f.write(ctx.expect_hostname + '\n')
5841
5842 logger.info('Repeating the final host check...')
5843 command_check_host(ctx)
5844
5845 ##################################
5846
5847
5848 class CustomValidation(argparse.Action):
5849
5850 def _check_name(self, values: str) -> None:
5851 try:
5852 (daemon_type, daemon_id) = values.split('.', 1)
5853 except ValueError:
5854 raise argparse.ArgumentError(self,
5855 'must be of the format <type>.<id>. For example, osd.1 or prometheus.myhost.com')
5856
5857 daemons = get_supported_daemons()
5858 if daemon_type not in daemons:
5859 raise argparse.ArgumentError(self,
5860 'name must declare the type of daemon e.g. '
5861 '{}'.format(', '.join(daemons)))
5862
5863 def __call__(self, parser: argparse.ArgumentParser, namespace: argparse.Namespace, values: Union[str, Sequence[Any], None],
5864 option_string: Optional[str] = None) -> None:
5865 assert isinstance(values, str)
5866 if self.dest == 'name':
5867 self._check_name(values)
5868 setattr(namespace, self.dest, values)
5869 elif self.dest == 'exporter_config':
5870 cfg = get_parm(values)
5871 # run the class' validate method, and convert to an argparse error
5872 # if problems are found
5873 try:
5874 CephadmDaemon.validate_config(cfg)
5875 except Error as e:
5876 raise argparse.ArgumentError(self,
5877 str(e))
5878 setattr(namespace, self.dest, cfg)
5879
5880 ##################################
5881
5882
5883 def get_distro():
5884 # type: () -> Tuple[Optional[str], Optional[str], Optional[str]]
5885 distro = None
5886 distro_version = None
5887 distro_codename = None
5888 with open('/etc/os-release', 'r') as f:
5889 for line in f.readlines():
5890 line = line.strip()
5891 if '=' not in line or line.startswith('#'):
5892 continue
5893 (var, val) = line.split('=', 1)
5894 if val[0] == '"' and val[-1] == '"':
5895 val = val[1:-1]
5896 if var == 'ID':
5897 distro = val.lower()
5898 elif var == 'VERSION_ID':
5899 distro_version = val.lower()
5900 elif var == 'VERSION_CODENAME':
5901 distro_codename = val.lower()
5902 return distro, distro_version, distro_codename
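
# Editor's note: a hypothetical /etc/os-release fragment and the tuple
# get_distro() derives from it (quotes stripped, values lowercased):
#
#   ID="Ubuntu"
#   VERSION_ID="20.04"
#   VERSION_CODENAME=focal
#
# -> ('ubuntu', '20.04', 'focal')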
5903
5904
5905 class Packager(object):
5906 def __init__(self, ctx: CephadmContext,
5907 stable: Optional[str] = None, version: Optional[str] = None,
5908 branch: Optional[str] = None, commit: Optional[str] = None):
5909 assert \
5910 (stable and not version and not branch and not commit) or \
5911 (not stable and version and not branch and not commit) or \
5912 (not stable and not version and branch) or \
5913 (not stable and not version and not branch and not commit)
5914 self.ctx = ctx
5915 self.stable = stable
5916 self.version = version
5917 self.branch = branch
5918 self.commit = commit
5919
5920 def add_repo(self) -> None:
5921 raise NotImplementedError
5922
5923 def rm_repo(self) -> None:
5924 raise NotImplementedError
5925
5926 def install(self, ls: List[str]) -> None:
5927 raise NotImplementedError
5928
5929 def install_podman(self) -> None:
5930 raise NotImplementedError
5931
5932 def query_shaman(self, distro: str, distro_version: Any, branch: Optional[str], commit: Optional[str]) -> str:
5933 # query shaman
5934 logger.info('Fetching repo metadata from shaman and chacra...')
5935 shaman_url = 'https://shaman.ceph.com/api/repos/ceph/{branch}/{sha1}/{distro}/{distro_version}/repo/?arch={arch}'.format(
5936 distro=distro,
5937 distro_version=distro_version,
5938 branch=branch,
5939 sha1=commit or 'latest',
5940 arch=get_arch()
5941 )
5942 try:
5943 shaman_response = urlopen(shaman_url)
5944 except HTTPError as err:
5945 logger.error('repository not found in shaman (might not be available yet)')
5946 raise Error('%s, failed to fetch %s' % (err, shaman_url))
5947 chacra_url = ''
5948 try:
5949 chacra_url = shaman_response.geturl()
5950 chacra_response = urlopen(chacra_url)
5951 except HTTPError as err:
5952 logger.error('repository not found in chacra (might not be available yet)')
5953 raise Error('%s, failed to fetch %s' % (err, chacra_url))
5954 return chacra_response.read().decode('utf-8')
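
    # Editor's note: an illustrative URL (hypothetical branch/arch values)
    # built by query_shaman() for branch 'main', latest sha1, Ubuntu focal:
    #
    #   https://shaman.ceph.com/api/repos/ceph/main/latest/ubuntu/focal/repo/?arch=x86_64
    #
    # shaman redirects to the matching chacra repo; the body of that chacra
    # response is returned verbatim for use as the package repo definition.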
5955
5956 def repo_gpgkey(self) -> Tuple[str, str]:
5957 if self.ctx.gpg_url:
5958 return self.ctx.gpg_url, 'manual'  # name used for the trusted.gpg.d key file
5959 if self.stable or self.version:
5960 return 'https://download.ceph.com/keys/release.gpg', 'release'
5961 else:
5962 return 'https://download.ceph.com/keys/autobuild.gpg', 'autobuild'
5963
5964 def enable_service(self, service: str) -> None:
5965 """
5966 Start and enable the service (typically using systemd).
5967 """
5968 call_throws(self.ctx, ['systemctl', 'enable', '--now', service])
5969
5970
5971 class Apt(Packager):
5972 DISTRO_NAMES = {
5973 'ubuntu': 'ubuntu',
5974 'debian': 'debian',
5975 }
5976
5977 def __init__(self, ctx: CephadmContext,
5978 stable: Optional[str], version: Optional[str], branch: Optional[str], commit: Optional[str],
5979 distro: Optional[str], distro_version: Optional[str], distro_codename: Optional[str]) -> None:
5980 super(Apt, self).__init__(ctx, stable=stable, version=version,
5981 branch=branch, commit=commit)
5982 assert distro
5983 self.ctx = ctx
5984 self.distro = self.DISTRO_NAMES[distro]
5985 self.distro_codename = distro_codename
5986 self.distro_version = distro_version
5987
5988 def repo_path(self) -> str:
5989 return '/etc/apt/sources.list.d/ceph.list'
5990
5991 def add_repo(self) -> None:
5992
5993 url, name = self.repo_gpgkey()
5994 logger.info('Installing repo GPG key from %s...' % url)
5995 try:
5996 response = urlopen(url)
5997 except HTTPError as err:
5998 logger.error('failed to fetch GPG repo key from %s: %s' % (
5999 url, err))
6000 raise Error('failed to fetch GPG key')
6001 key = response.read()
6002 with open('/etc/apt/trusted.gpg.d/ceph.%s.gpg' % name, 'wb') as f:
6003 f.write(key)
6004
6005 if self.version:
6006 content = 'deb %s/debian-%s/ %s main\n' % (
6007 self.ctx.repo_url, self.version, self.distro_codename)
6008 elif self.stable:
6009 content = 'deb %s/debian-%s/ %s main\n' % (
6010 self.ctx.repo_url, self.stable, self.distro_codename)
6011 else:
6012 content = self.query_shaman(self.distro, self.distro_codename, self.branch,
6013 self.commit)
6014
6015 logger.info('Installing repo file at %s...' % self.repo_path())
6016 with open(self.repo_path(), 'w') as f:
6017 f.write(content)
6018
6019 self.update()
6020
6021 def rm_repo(self) -> None:
6022 for name in ['autobuild', 'release']:
6023 p = '/etc/apt/trusted.gpg.d/ceph.%s.gpg' % name
6024 if os.path.exists(p):
6025 logger.info('Removing repo GPG key %s...' % p)
6026 os.unlink(p)
6027 if os.path.exists(self.repo_path()):
6028 logger.info('Removing repo at %s...' % self.repo_path())
6029 os.unlink(self.repo_path())
6030
6031 if self.distro == 'ubuntu':
6032 self.rm_kubic_repo()
6033
6034 def install(self, ls: List[str]) -> None:
6035 logger.info('Installing packages %s...' % ls)
6036 call_throws(self.ctx, ['apt-get', 'install', '-y'] + ls)
6037
6038 def update(self) -> None:
6039 logger.info('Updating package list...')
6040 call_throws(self.ctx, ['apt-get', 'update'])
6041
6042 def install_podman(self) -> None:
6043 if self.distro == 'ubuntu':
6044 logger.info('Setting up repo for podman...')
6045 self.add_kubic_repo()
6046 self.update()
6047
6048 logger.info('Attempting podman install...')
6049 try:
6050 self.install(['podman'])
6051 except Error:
6052 logger.info('Podman did not work. Falling back to docker...')
6053 self.install(['docker.io'])
6054
6055 def kubic_repo_url(self) -> str:
6056 return 'https://download.opensuse.org/repositories/devel:/kubic:/' \
6057 'libcontainers:/stable/xUbuntu_%s/' % self.distro_version
6058
6059 def kubic_repo_path(self) -> str:
6060 return '/etc/apt/sources.list.d/devel:kubic:libcontainers:stable.list'
6061
6062 def kubic_repo_gpgkey_url(self) -> str:
6063 return '%s/Release.key' % self.kubic_repo_url()
6064
6065 def kubic_repo_gpgkey_path(self) -> str:
6066 return '/etc/apt/trusted.gpg.d/kubic.release.gpg'
6067
6068 def add_kubic_repo(self) -> None:
6069 url = self.kubic_repo_gpgkey_url()
6070 logger.info('Installing repo GPG key from %s...' % url)
6071 try:
6072 response = urlopen(url)
6073 except HTTPError as err:
6074 logger.error('failed to fetch GPG repo key from %s: %s' % (
6075 url, err))
6076 raise Error('failed to fetch GPG key')
6077 key = response.read().decode('utf-8')
6078 tmp_key = write_tmp(key, 0, 0)
6079 keyring = self.kubic_repo_gpgkey_path()
6080 call_throws(self.ctx, ['apt-key', '--keyring', keyring, 'add', tmp_key.name])
6081
6082 logger.info('Installing repo file at %s...' % self.kubic_repo_path())
6083 content = 'deb %s /\n' % self.kubic_repo_url()
6084 with open(self.kubic_repo_path(), 'w') as f:
6085 f.write(content)
6086
6087 def rm_kubic_repo(self) -> None:
6088 keyring = self.kubic_repo_gpgkey_path()
6089 if os.path.exists(keyring):
6090 logger.info('Removing repo GPG key %s...' % keyring)
6091 os.unlink(keyring)
6092
6093 p = self.kubic_repo_path()
6094 if os.path.exists(p):
6095 logger.info('Removing repo at %s...' % p)
6096 os.unlink(p)
6097
6098
6099 class YumDnf(Packager):
6100 DISTRO_NAMES = {
6101 'centos': ('centos', 'el'),
6102 'rhel': ('centos', 'el'),
6103 'scientific': ('centos', 'el'),
6104 'rocky': ('centos', 'el'),
6105 'almalinux': ('centos', 'el'),
6106 'fedora': ('fedora', 'fc'),
6107 }
6108
6109 def __init__(self, ctx: CephadmContext,
6110 stable: Optional[str], version: Optional[str], branch: Optional[str], commit: Optional[str],
6111 distro: Optional[str], distro_version: Optional[str]) -> None:
6112 super(YumDnf, self).__init__(ctx, stable=stable, version=version,
6113 branch=branch, commit=commit)
6114 assert distro
6115 assert distro_version
6116 self.ctx = ctx
6117 self.major = int(distro_version.split('.')[0])
6118 self.distro_normalized = self.DISTRO_NAMES[distro][0]
6119 self.distro_code = self.DISTRO_NAMES[distro][1] + str(self.major)
6120 if (self.distro_code == 'fc' and self.major >= 30) or \
6121 (self.distro_code == 'el' and self.major >= 8):
6122 self.tool = 'dnf'
6123 else:
6124 self.tool = 'yum'
6125
6126 def custom_repo(self, **kw: Any) -> str:
6127 """
6128 Repo files need special care in that a whole line should not be present
6129 if there is no value for it. Because we were using `format()` we could
6130 not conditionally add a line for a repo file. So the end result would
6131 contain a key with a missing value (say if we were passing `None`).
6132
6133 For example, it could look like::
6134
6135 [ceph repo]
6136 name= ceph repo
6137 proxy=
6138 gpgcheck=
6139
6140 Which breaks. This function allows us to conditionally add lines,
6141 preserving order and being more careful.
6142
6143 Previously, and for historical purposes, this is how the template used
6144 to look::
6145
6146 custom_repo =
6147 [{repo_name}]
6148 name={name}
6149 baseurl={baseurl}
6150 enabled={enabled}
6151 gpgcheck={gpgcheck}
6152 type={_type}
6153 gpgkey={gpgkey}
6154 proxy={proxy}
6155
6156 """
6157 lines = []
6158
6159 # by using tuples (vs a dict) we preserve the order of what we want to
6160 # return, like starting with a [repo name]
6161 tmpl = (
6162 ('reponame', '[%s]'),
6163 ('name', 'name=%s'),
6164 ('baseurl', 'baseurl=%s'),
6165 ('enabled', 'enabled=%s'),
6166 ('gpgcheck', 'gpgcheck=%s'),
6167 ('_type', 'type=%s'),
6168 ('gpgkey', 'gpgkey=%s'),
6169 ('proxy', 'proxy=%s'),
6170 ('priority', 'priority=%s'),
6171 )
6172
6173 for line in tmpl:
6174 tmpl_key, tmpl_value = line # key values from tmpl
6175
6176 # ensure that there is an actual value (not None nor empty string)
6177 if tmpl_key in kw and kw.get(tmpl_key) not in (None, ''):
6178 lines.append(tmpl_value % kw.get(tmpl_key))
6179
6180 return '\n'.join(lines)
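
    # Editor's note: a worked example (hypothetical values). With
    #   custom_repo(reponame='ceph', name='Ceph packages',
    #               baseurl='https://example.invalid/rpm/el8',
    #               enabled=1, gpgcheck=1, proxy=None)
    # the proxy line is simply omitted, yielding:
    #   [ceph]
    #   name=Ceph packages
    #   baseurl=https://example.invalid/rpm/el8
    #   enabled=1
    #   gpgcheck=1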
6181
6182 def repo_path(self) -> str:
6183 return '/etc/yum.repos.d/ceph.repo'
6184
6185 def repo_baseurl(self) -> str:
6186 assert self.stable or self.version
6187 if self.version:
6188 return '%s/rpm-%s/%s' % (self.ctx.repo_url, self.version,
6189 self.distro_code)
6190 else:
6191 return '%s/rpm-%s/%s' % (self.ctx.repo_url, self.stable,
6192 self.distro_code)
6193
6194 def add_repo(self) -> None:
6195 if self.distro_code.startswith('fc'):
6196 raise Error('Ceph team does not build Fedora-specific packages and therefore cannot add repos for this distro')
6197 if self.distro_code == 'el7':
6198 if self.stable and self.stable >= 'pacific':
6199 raise Error('Ceph does not support pacific or later for this version of this linux distro and therefore cannot add a repo for it')
6200 if self.version and int(self.version.split('.')[0]) >= 16:
6201 raise Error('Ceph does not support 16.y.z or later for this version of this linux distro and therefore cannot add a repo for it')
6202 if self.stable or self.version:
6203 content = ''
6204 for n, t in {
6205 'Ceph': '$basearch',
6206 'Ceph-noarch': 'noarch',
6207 'Ceph-source': 'SRPMS'}.items():
6208 content += '[%s]\n' % (n)
6209 content += self.custom_repo(
6210 name='Ceph %s' % t,
6211 baseurl=self.repo_baseurl() + '/' + t,
6212 enabled=1,
6213 gpgcheck=1,
6214 gpgkey=self.repo_gpgkey()[0],
6215 )
6216 content += '\n\n'
6217 else:
6218 content = self.query_shaman(self.distro_normalized, self.major,
6219 self.branch,
6220 self.commit)
6221
6222 logger.info('Writing repo to %s...' % self.repo_path())
6223 with open(self.repo_path(), 'w') as f:
6224 f.write(content)
6225
6226 if self.distro_code.startswith('el'):
6227 logger.info('Enabling EPEL...')
6228 call_throws(self.ctx, [self.tool, 'install', '-y', 'epel-release'])
6229
6230 def rm_repo(self) -> None:
6231 if os.path.exists(self.repo_path()):
6232 os.unlink(self.repo_path())
6233
6234 def install(self, ls: List[str]) -> None:
6235 logger.info('Installing packages %s...' % ls)
6236 call_throws(self.ctx, [self.tool, 'install', '-y'] + ls)
6237
6238 def install_podman(self) -> None:
6239 self.install(['podman'])
6240
6241
6242 class Zypper(Packager):
6243 DISTRO_NAMES = [
6244 'sles',
6245 'opensuse-tumbleweed',
6246 'opensuse-leap'
6247 ]
6248
6249 def __init__(self, ctx: CephadmContext,
6250 stable: Optional[str], version: Optional[str], branch: Optional[str], commit: Optional[str],
6251 distro: Optional[str], distro_version: Optional[str]) -> None:
6252 super(Zypper, self).__init__(ctx, stable=stable, version=version,
6253 branch=branch, commit=commit)
6254 assert distro is not None
6255 self.ctx = ctx
6256 self.tool = 'zypper'
6257 self.distro = 'opensuse'
6258 self.distro_version = '15.1'
6259 if 'tumbleweed' not in distro and distro_version is not None:
6260 self.distro_version = distro_version
6261
6262 def custom_repo(self, **kw: Any) -> str:
6263 """
6264 See YumDnf for format explanation.
6265 """
6266 lines = []
6267
6268 # by using tuples (vs a dict) we preserve the order of what we want to
6269 # return, like starting with a [repo name]
6270 tmpl = (
6271 ('reponame', '[%s]'),
6272 ('name', 'name=%s'),
6273 ('baseurl', 'baseurl=%s'),
6274 ('enabled', 'enabled=%s'),
6275 ('gpgcheck', 'gpgcheck=%s'),
6276 ('_type', 'type=%s'),
6277 ('gpgkey', 'gpgkey=%s'),
6278 ('proxy', 'proxy=%s'),
6279 ('priority', 'priority=%s'),
6280 )
6281
6282 for line in tmpl:
6283 tmpl_key, tmpl_value = line # key values from tmpl
6284
6285 # ensure that there is an actual value (not None nor empty string)
6286 if tmpl_key in kw and kw.get(tmpl_key) not in (None, ''):
6287 lines.append(tmpl_value % kw.get(tmpl_key))
6288
6289 return '\n'.join(lines)
6290
6291 def repo_path(self) -> str:
6292 return '/etc/zypp/repos.d/ceph.repo'
6293
6294 def repo_baseurl(self) -> str:
6295 assert self.stable or self.version
6296 if self.version:
6297 return '%s/rpm-%s/%s' % (self.ctx.repo_url,
6298 self.version, self.distro)
6299 else:
6300 return '%s/rpm-%s/%s' % (self.ctx.repo_url,
6301 self.stable, self.distro)
6302
6303 def add_repo(self) -> None:
6304 if self.stable or self.version:
6305 content = ''
6306 for n, t in {
6307 'Ceph': '$basearch',
6308 'Ceph-noarch': 'noarch',
6309 'Ceph-source': 'SRPMS'}.items():
6310 content += '[%s]\n' % (n)
6311 content += self.custom_repo(
6312 name='Ceph %s' % t,
6313 baseurl=self.repo_baseurl() + '/' + t,
6314 enabled=1,
6315 gpgcheck=1,
6316 gpgkey=self.repo_gpgkey()[0],
6317 )
6318 content += '\n\n'
6319 else:
6320 content = self.query_shaman(self.distro, self.distro_version,
6321 self.branch,
6322 self.commit)
6323
6324 logger.info('Writing repo to %s...' % self.repo_path())
6325 with open(self.repo_path(), 'w') as f:
6326 f.write(content)
6327
6328 def rm_repo(self) -> None:
6329 if os.path.exists(self.repo_path()):
6330 os.unlink(self.repo_path())
6331
6332 def install(self, ls: List[str]) -> None:
6333 logger.info('Installing packages %s...' % ls)
6334 call_throws(self.ctx, [self.tool, 'in', '-y'] + ls)
6335
6336 def install_podman(self) -> None:
6337 self.install(['podman'])
6338
6339
6340 def create_packager(ctx: CephadmContext,
6341 stable: Optional[str] = None, version: Optional[str] = None,
6342 branch: Optional[str] = None, commit: Optional[str] = None) -> Packager:
6343 distro, distro_version, distro_codename = get_distro()
6344 if distro in YumDnf.DISTRO_NAMES:
6345 return YumDnf(ctx, stable=stable, version=version,
6346 branch=branch, commit=commit,
6347 distro=distro, distro_version=distro_version)
6348 elif distro in Apt.DISTRO_NAMES:
6349 return Apt(ctx, stable=stable, version=version,
6350 branch=branch, commit=commit,
6351 distro=distro, distro_version=distro_version,
6352 distro_codename=distro_codename)
6353 elif distro in Zypper.DISTRO_NAMES:
6354 return Zypper(ctx, stable=stable, version=version,
6355 branch=branch, commit=commit,
6356 distro=distro, distro_version=distro_version)
6357 raise Error('Distro %s version %s not supported' % (distro, distro_version))
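
# Editor's note: a hypothetical usage sketch (not wired to any command) --
# callers pass exactly one of stable/version/branch(+commit) through:
def _example_add_release_repo(ctx: CephadmContext) -> None:
    pkg = create_packager(ctx, stable='pacific')  # release repo for this host
    pkg.add_repo()
    pkg.install(['cephadm'])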
6358
6359
6360 def command_add_repo(ctx: CephadmContext) -> None:
6361 if ctx.version and ctx.release:
6362 raise Error('you can specify either --release or --version but not both')
6363 if not ctx.version and not ctx.release and not ctx.dev and not ctx.dev_commit:
6364 raise Error('please supply a --release, --version, --dev or --dev-commit argument')
6365 if ctx.version:
6366 try:
6367 (x, y, z) = ctx.version.split('.')
6368 except Exception:
6369 raise Error('version must be in the form x.y.z (e.g., 15.2.0)')
6370 if ctx.release:
6371 # 'Pacific' != 'pacific' here; lowercase the release name to avoid confusion
6372 ctx.release = ctx.release.lower()
6373
6374 pkg = create_packager(ctx, stable=ctx.release,
6375 version=ctx.version,
6376 branch=ctx.dev,
6377 commit=ctx.dev_commit)
6378 pkg.add_repo()
6379 logger.info('Completed adding repo.')
6380
6381
6382 def command_rm_repo(ctx: CephadmContext) -> None:
6383 pkg = create_packager(ctx)
6384 pkg.rm_repo()
6385
6386
6387 def command_install(ctx: CephadmContext) -> None:
6388 pkg = create_packager(ctx)
6389 pkg.install(ctx.packages)
6390
6391 ##################################
6392
6393
6394 def get_ipv4_address(ifname):
6395 # type: (str) -> str
6396 def _extract(sock: socket.socket, offset: int) -> str:
6397 return socket.inet_ntop(
6398 socket.AF_INET,
6399 fcntl.ioctl(
6400 sock.fileno(),
6401 offset,
6402 struct.pack('256s', bytes(ifname[:15], 'utf-8'))
6403 )[20:24])
6404
6405 s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
6406 try:
6407 addr = _extract(s, 35093) # 0x8915 = SIOCGIFADDR
6408 dq_mask = _extract(s, 35099) # 0x891b = SIOCGIFNETMASK
6409 except OSError:
6410 # interface does not have an ipv4 address
6411 return ''
6412
6413 dec_mask = sum([bin(int(i)).count('1')
6414 for i in dq_mask.split('.')])
6415 return '{}/{}'.format(addr, dec_mask)
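
# Editor's note: a standalone sketch (hypothetical helper) of the prefix
# length computation above -- popcount each dotted-quad octet of the netmask.
def _example_mask_to_prefix(dq_mask: str) -> int:
    # '255.255.252.0' -> 8 + 8 + 6 + 0 = 22
    return sum(bin(int(octet)).count('1') for octet in dq_mask.split('.'))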
6416
6417
6418 def get_ipv6_address(ifname):
6419 # type: (str) -> str
6420 if not os.path.exists('/proc/net/if_inet6'):
6421 return ''
6422
6423 raw = read_file(['/proc/net/if_inet6'])
6424 data = raw.splitlines()
6425 # based on docs @ https://www.tldp.org/HOWTO/Linux+IPv6-HOWTO/ch11s04.html
6426 # field 0 is ipv6, field 2 is scope
6427 for iface_setting in data:
6428 field = iface_setting.split()
6429 if field[-1] == ifname:
6430 ipv6_raw = field[0]
6431 ipv6_fmtd = ':'.join([ipv6_raw[_p:_p + 4] for _p in range(0, len(field[0]), 4)])
6432 # apply naming rules using ipaddress module
6433 ipv6 = ipaddress.ip_address(ipv6_fmtd)
6434 return '{}/{}'.format(str(ipv6), int('0x{}'.format(field[2]), 16))
6435 return ''
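
# Editor's note: a hypothetical illustration of the grouping performed above;
# /proc/net/if_inet6 stores the address as 32 bare hex digits, which are
# re-joined into 8 colon-separated quartets before ipaddress normalizes them.
def _example_fmt_if_inet6(ipv6_raw: str) -> str:
    # 'fe800000000000000000000000000001'
    #   -> 'fe80:0000:0000:0000:0000:0000:0000:0001'
    return ':'.join(ipv6_raw[p:p + 4] for p in range(0, len(ipv6_raw), 4))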
6436
6437
6438 def bytes_to_human(num, mode='decimal'):
6439 # type: (float, str) -> str
6440 """Convert a bytes value into it's human-readable form.
6441
6442 :param num: number, in bytes, to convert
6443 :param mode: Either decimal (default) or binary to determine divisor
6444 :returns: string representing the bytes value in a more readable format
6445 """
6446 unit_list = ['', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB']
6447 divisor = 1000.0
6448 yotta = 'YB'
6449
6450 if mode == 'binary':
6451 unit_list = ['', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB']
6452 divisor = 1024.0
6453 yotta = 'YiB'
6454
6455 for unit in unit_list:
6456 if abs(num) < divisor:
6457 return '%3.1f%s' % (num, unit)
6458 num /= divisor
6459 return '%.1f%s' % (num, yotta)
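
# Editor's note: quick illustration of the two modes:
#
#   bytes_to_human(1234567)                -> '1.2MB'  (decimal, divisor 1000)
#   bytes_to_human(1234567, mode='binary') -> '1.2MiB' (binary, divisor 1024)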
6460
6461
6462 def read_file(path_list, file_name=''):
6463 # type: (List[str], str) -> str
6464 """Returns the content of the first file found within the `path_list`
6465
6466 :param path_list: list of file paths to search
6467 :param file_name: optional file_name to be applied to a file path
6468 :returns: content of the file or 'Unknown'
6469 """
6470 for path in path_list:
6471 if file_name:
6472 file_path = os.path.join(path, file_name)
6473 else:
6474 file_path = path
6475 if os.path.exists(file_path):
6476 with open(file_path, 'r') as f:
6477 try:
6478 content = f.read().strip()
6479 except OSError:
6480 # sysfs may populate the file, but for devices like
6481 # virtio reads can fail
6482 return 'Unknown'
6483 else:
6484 return content
6485 return 'Unknown'
6486
6487 ##################################
6488
6489
6490 class HostFacts():
6491 _dmi_path_list = ['/sys/class/dmi/id']
6492 _nic_path_list = ['/sys/class/net']
6493 _apparmor_path_list = ['/etc/apparmor']
6494 _disk_vendor_workarounds = {
6495 '0x1af4': 'Virtio Block Device'
6496 }
6497
6498 def __init__(self, ctx: CephadmContext):
6499 self.ctx: CephadmContext = ctx
6500 self.cpu_model: str = 'Unknown'
6501 self.cpu_count: int = 0
6502 self.cpu_cores: int = 0
6503 self.cpu_threads: int = 0
6504 self.interfaces: Dict[str, Any] = {}
6505
6506 self._meminfo: List[str] = read_file(['/proc/meminfo']).splitlines()
6507 self._get_cpuinfo()
6508 self._process_nics()
6509 self.arch: str = platform.processor()
6510 self.kernel: str = platform.release()
6511
6512 def _get_cpuinfo(self):
6513 # type: () -> None
6514 """Determine cpu information via /proc/cpuinfo"""
6515 raw = read_file(['/proc/cpuinfo'])
6516 output = raw.splitlines()
6517 cpu_set = set()
6518
6519 for line in output:
6520 field = [f.strip() for f in line.split(':')]
6521 if 'model name' in line:
6522 self.cpu_model = field[1]
6523 if 'physical id' in line:
6524 cpu_set.add(field[1])
6525 if 'siblings' in line:
6526 self.cpu_threads = int(field[1].strip())
6527 if 'cpu cores' in line:
6528 self.cpu_cores = int(field[1].strip())
6530 self.cpu_count = len(cpu_set)
6531
6532 def _get_block_devs(self):
6533 # type: () -> List[str]
6534 """Determine the list of block devices by looking at /sys/block"""
6535 return [dev for dev in os.listdir('/sys/block')
6536 if not dev.startswith('dm')]
6537
6538 def _get_devs_by_type(self, rota='0'):
6539 # type: (str) -> List[str]
6540 """Filter block devices by a given rotational attribute (0=flash, 1=spinner)"""
6541 devs = list()
6542 for blk_dev in self._get_block_devs():
6543 rot_path = '/sys/block/{}/queue/rotational'.format(blk_dev)
6544 rot_value = read_file([rot_path])
6545 if rot_value == rota:
6546 devs.append(blk_dev)
6547 return devs
6548
6549 @property
6550 def operating_system(self):
6551 # type: () -> str
6552 """Determine OS version"""
6553 raw_info = read_file(['/etc/os-release'])
6554 os_release = raw_info.splitlines()
6555 rel_str = 'Unknown'
6556 rel_dict = dict()
6557
6558 for line in os_release:
6559 if '=' in line:
6560 var_name, var_value = line.split('=', 1)
6561 rel_dict[var_name] = var_value.strip('"')
6562
6563 # Would normally use PRETTY_NAME, but NAME and VERSION are more
6564 # consistent
6565 if all(_v in rel_dict for _v in ['NAME', 'VERSION']):
6566 rel_str = '{} {}'.format(rel_dict['NAME'], rel_dict['VERSION'])
6567 return rel_str
6568
6569 @property
6570 def hostname(self):
6571 # type: () -> str
6572 """Return the hostname"""
6573 return platform.node()
6574
6575 @property
6576 def subscribed(self):
6577 # type: () -> str
6578 """Highlevel check to see if the host is subscribed to receive updates/support"""
6579 def _red_hat():
6580 # type: () -> str
6581 # RHEL 7 and RHEL 8
6582 entitlements_dir = '/etc/pki/entitlement'
6583 if os.path.exists(entitlements_dir):
6584 pems = glob('{}/*.pem'.format(entitlements_dir))
6585 if len(pems) >= 2:
6586 return 'Yes'
6587
6588 return 'No'
6589
6590 os_name = self.operating_system
6591 if os_name.upper().startswith('RED HAT'):
6592 return _red_hat()
6593
6594 return 'Unknown'
6595
6596 @property
6597 def hdd_count(self):
6598 # type: () -> int
6599 """Return a count of HDDs (spinners)"""
6600 return len(self._get_devs_by_type(rota='1'))
6601
6602 def _get_capacity(self, dev):
6603 # type: (str) -> int
6604 """Determine the size of a given device"""
6605 size_path = os.path.join('/sys/block', dev, 'size')
6606 size_blocks = int(read_file([size_path]))
6607 blk_path = os.path.join('/sys/block', dev, 'queue', 'logical_block_size')
6608 blk_count = int(read_file([blk_path]))
6609 return size_blocks * blk_count
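
    # Editor's note: worked example (hypothetical device). A /sys/block/<dev>/size
    # of 7814037168 blocks with a 512-byte logical block size gives
    # 7814037168 * 512 = 4000787030016 bytes, i.e. '4.0TB' via bytes_to_human().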
6610
6611 def _get_capacity_by_type(self, rota='0'):
6612 # type: (str) -> int
6613 """Return the total capacity of a category of device (flash or hdd)"""
6614 devs = self._get_devs_by_type(rota=rota)
6615 capacity = 0
6616 for dev in devs:
6617 capacity += self._get_capacity(dev)
6618 return capacity
6619
6620 def _dev_list(self, dev_list):
6621 # type: (List[str]) -> List[Dict[str, object]]
6622 """Return a 'pretty' name list for each device in the `dev_list`"""
6623 disk_list = list()
6624
6625 for dev in dev_list:
6626 disk_model = read_file(['/sys/block/{}/device/model'.format(dev)]).strip()
6627 disk_rev = read_file(['/sys/block/{}/device/rev'.format(dev)]).strip()
6628 disk_wwid = read_file(['/sys/block/{}/device/wwid'.format(dev)]).strip()
6629 vendor = read_file(['/sys/block/{}/device/vendor'.format(dev)]).strip()
6630 disk_vendor = HostFacts._disk_vendor_workarounds.get(vendor, vendor)
6631 disk_size_bytes = self._get_capacity(dev)
6632 disk_list.append({
6633 'description': '{} {} ({})'.format(disk_vendor, disk_model, bytes_to_human(disk_size_bytes)),
6634 'vendor': disk_vendor,
6635 'model': disk_model,
6636 'rev': disk_rev,
6637 'wwid': disk_wwid,
6638 'dev_name': dev,
6639 'disk_size_bytes': disk_size_bytes,
6640 })
6641 return disk_list
6642
6643 @property
6644 def hdd_list(self):
6645 # type: () -> List[Dict[str, object]]
6646 """Return a list of devices that are HDDs (spinners)"""
6647 devs = self._get_devs_by_type(rota='1')
6648 return self._dev_list(devs)
6649
6650 @property
6651 def flash_list(self):
6652 # type: () -> List[Dict[str, object]]
6653 """Return a list of devices that are flash based (SSD, NVMe)"""
6654 devs = self._get_devs_by_type(rota='0')
6655 return self._dev_list(devs)
6656
6657 @property
6658 def hdd_capacity_bytes(self):
6659 # type: () -> int
6660 """Return the total capacity for all HDD devices (bytes)"""
6661 return self._get_capacity_by_type(rota='1')
6662
6663 @property
6664 def hdd_capacity(self):
6665 # type: () -> str
6666 """Return the total capacity for all HDD devices (human readable format)"""
6667 return bytes_to_human(self.hdd_capacity_bytes)
6668
6669 @property
6670 def cpu_load(self):
6671 # type: () -> Dict[str, float]
6672 """Return the cpu load average data for the host"""
6673 raw = read_file(['/proc/loadavg']).strip()
6674 data = raw.split()
6675 return {
6676 '1min': float(data[0]),
6677 '5min': float(data[1]),
6678 '15min': float(data[2]),
6679 }
6680
6681 @property
6682 def flash_count(self):
6683 # type: () -> int
6684 """Return the number of flash devices in the system (SSD, NVMe)"""
6685 return len(self._get_devs_by_type(rota='0'))
6686
6687 @property
6688 def flash_capacity_bytes(self):
6689 # type: () -> int
6690 """Return the total capacity for all flash devices (bytes)"""
6691 return self._get_capacity_by_type(rota='0')
6692
6693 @property
6694 def flash_capacity(self):
6695 # type: () -> str
6696 """Return the total capacity for all Flash devices (human readable format)"""
6697 return bytes_to_human(self.flash_capacity_bytes)
6698
6699 def _process_nics(self):
6700 # type: () -> None
6701 """Look at the NIC devices and extract network related metadata"""
6702 # from https://github.com/torvalds/linux/blob/master/include/uapi/linux/if_arp.h
6703 hw_lookup = {
6704 '1': 'ethernet',
6705 '32': 'infiniband',
6706 '772': 'loopback',
6707 }
6708
6709 for nic_path in HostFacts._nic_path_list:
6710 if not os.path.exists(nic_path):
6711 continue
6712 for iface in os.listdir(nic_path):
6713
6714 lower_devs_list = [os.path.basename(link.replace('lower_', '')) for link in glob(os.path.join(nic_path, iface, 'lower_*'))]
6715 upper_devs_list = [os.path.basename(link.replace('upper_', '')) for link in glob(os.path.join(nic_path, iface, 'upper_*'))]
6716
6717 try:
6718 mtu = int(read_file([os.path.join(nic_path, iface, 'mtu')]))
6719 except ValueError:
6720 mtu = 0
6721
6722 operstate = read_file([os.path.join(nic_path, iface, 'operstate')])
6723 try:
6724 speed = int(read_file([os.path.join(nic_path, iface, 'speed')]))
6725 except (OSError, ValueError):
6726 # OSError : device doesn't support the ethtool get_link_ksettings
6727 # ValueError : raised when the read fails, and returns Unknown
6728 #
6729 # Either way, we show a -1 when speed isn't available
6730 speed = -1
6731
6732 if os.path.exists(os.path.join(nic_path, iface, 'bridge')):
6733 nic_type = 'bridge'
6734 elif os.path.exists(os.path.join(nic_path, iface, 'bonding')):
6735 nic_type = 'bonding'
6736 else:
6737 nic_type = hw_lookup.get(read_file([os.path.join(nic_path, iface, 'type')]), 'Unknown')
6738
6739 dev_link = os.path.join(nic_path, iface, 'device')
6740 if os.path.exists(dev_link):
6741 iftype = 'physical'
6742 driver_path = os.path.join(dev_link, 'driver')
6743 if os.path.exists(driver_path):
6744 driver = os.path.basename(os.path.realpath(driver_path))
6745 else:
6746 driver = 'Unknown'
6747
6748 else:
6749 iftype = 'logical'
6750 driver = ''
6751
6752 self.interfaces[iface] = {
6753 'mtu': mtu,
6754 'upper_devs_list': upper_devs_list,
6755 'lower_devs_list': lower_devs_list,
6756 'operstate': operstate,
6757 'iftype': iftype,
6758 'nic_type': nic_type,
6759 'driver': driver,
6760 'speed': speed,
6761 'ipv4_address': get_ipv4_address(iface),
6762 'ipv6_address': get_ipv6_address(iface),
6763 }
6764
6765 @property
6766 def nic_count(self):
6767 # type: () -> int
6768 """Return a total count of all physical NICs detected in the host"""
6769 phys_devs = []
6770 for iface in self.interfaces:
6771 if self.interfaces[iface]['iftype'] == 'physical':
6772 phys_devs.append(iface)
6773 return len(phys_devs)
6774
6775 def _get_mem_data(self, field_name):
6776 # type: (str) -> int
6777 for line in self._meminfo:
6778 if line.startswith(field_name):
6779 _d = line.split()
6780 return int(_d[1])
6781 return 0
6782
6783 @property
6784 def memory_total_kb(self):
6785 # type: () -> int
6786 """Determine the memory installed (kb)"""
6787 return self._get_mem_data('MemTotal')
6788
6789 @property
6790 def memory_free_kb(self):
6791 # type: () -> int
6792 """Determine the memory free (not cache, immediately usable)"""
6793 return self._get_mem_data('MemFree')
6794
6795 @property
6796 def memory_available_kb(self):
6797 # type: () -> int
6798 """Determine the memory available to new applications without swapping"""
6799 return self._get_mem_data('MemAvailable')
6800
6801 @property
6802 def vendor(self):
6803 # type: () -> str
6804 """Determine server vendor from DMI data in sysfs"""
6805 return read_file(HostFacts._dmi_path_list, 'sys_vendor')
6806
6807 @property
6808 def model(self):
6809 # type: () -> str
6810 """Determine server model information from DMI data in sysfs"""
6811 family = read_file(HostFacts._dmi_path_list, 'product_family')
6812 product = read_file(HostFacts._dmi_path_list, 'product_name')
6813 if family == 'Unknown':  # read_file() never returns an empty product here
6814 return '{}'.format(product)
6815
6816 return '{} ({})'.format(family, product)
6817
6818 @property
6819 def bios_version(self):
6820 # type: () -> str
6821 """Determine server BIOS version from DMI data in sysfs"""
6822 return read_file(HostFacts._dmi_path_list, 'bios_version')
6823
6824 @property
6825 def bios_date(self):
6826 # type: () -> str
6827 """Determine server BIOS date from DMI data in sysfs"""
6828 return read_file(HostFacts._dmi_path_list, 'bios_date')
6829
6830 @property
6831 def timestamp(self):
6832 # type: () -> float
6833 """Return the current time as Epoch seconds"""
6834 return time.time()
6835
6836 @property
6837 def system_uptime(self):
6838 # type: () -> float
6839 """Return the system uptime (in secs)"""
6840 raw_time = read_file(['/proc/uptime'])
6841 up_secs, _ = raw_time.split()
6842 return float(up_secs)
6843
6844 @property
6845 def kernel_security(self):
6846 # type: () -> Dict[str, str]
6847 """Determine the security features enabled in the kernel - SELinux, AppArmor"""
6848 def _fetch_selinux() -> Dict[str, str]:
6849 """Get the selinux status"""
6850 security = {}
6851 try:
6852 out, err, code = call(self.ctx, ['sestatus'],
6853 verbosity=CallVerbosity.DEBUG)
6854 security['type'] = 'SELinux'
6855 status, mode, policy = '', '', ''
6856 for line in out.split('\n'):
6857 if line.startswith('SELinux status:'):
6858 k, v = line.split(':')
6859 status = v.strip()
6860 elif line.startswith('Current mode:'):
6861 k, v = line.split(':')
6862 mode = v.strip()
6863 elif line.startswith('Loaded policy name:'):
6864 k, v = line.split(':')
6865 policy = v.strip()
6866 if status == 'disabled':
6867 security['description'] = 'SELinux: Disabled'
6868 else:
6869 security['description'] = 'SELinux: Enabled({}, {})'.format(mode, policy)
6870 except Exception as e:
6871 logger.info('unable to get selinux status: %s' % e)
6872 return security
6873
6874 def _fetch_apparmor() -> Dict[str, str]:
6875 """Read the apparmor profiles directly, returning an overview of AppArmor status"""
6876 security = {}
6877 for apparmor_path in HostFacts._apparmor_path_list:
6878 if os.path.exists(apparmor_path):
6879 security['type'] = 'AppArmor'
6880 security['description'] = 'AppArmor: Enabled'
6881 try:
6882 profiles = read_file(['/sys/kernel/security/apparmor/profiles'])
6883 if len(profiles) == 0:
6884 return {}
6885 except OSError:
6886 pass
6887 else:
6888 summary = {} # type: Dict[str, int]
6889 for line in profiles.split('\n'):
6890 item, mode = line.rsplit(' ', 1)
6891 mode = mode.strip('()')
6892 if mode in summary:
6893 summary[mode] += 1
6894 else:
6895 summary[mode] = 1
6896 summary_str = ','.join(['{} {}'.format(v, k) for k, v in summary.items()])
6897 security = {**security, **summary} # type: ignore
6898 security['description'] += '({})'.format(summary_str)
6899
6900 return security
6901 return {}
6902
6903 ret = {}
6904 if os.path.exists('/sys/kernel/security/lsm'):
6905 lsm = read_file(['/sys/kernel/security/lsm']).strip()
6906 if 'selinux' in lsm:
6907 ret = _fetch_selinux()
6908 elif 'apparmor' in lsm:
6909 ret = _fetch_apparmor()
6910 else:
6911 return {
6912 'type': 'Unknown',
6913 'description': 'Linux Security Module framework is active, but is not using SELinux or AppArmor'
6914 }
6915
6916 if ret:
6917 return ret
6918
6919 return {
6920 'type': 'None',
6921 'description': 'Linux Security Module framework is not available'
6922 }
6923
6924 @property
6925 def selinux_enabled(self) -> bool:
6926 return (self.kernel_security['type'] == 'SELinux') and \
6927 (self.kernel_security['description'] != 'SELinux: Disabled')
6928
6929 @property
6930 def kernel_parameters(self):
6931 # type: () -> Dict[str, str]
6932 """Get kernel parameters required/used in Ceph clusters"""
6933
6934 k_param = {}
6935 out, _, _ = call_throws(self.ctx, ['sysctl', '-a'], verbosity=CallVerbosity.SILENT)
6936 if out:
6937 param_list = out.split('\n')
6938 param_dict = {param.split(' = ')[0]: param.split(' = ')[-1] for param in param_list}
6939
6940 # return only desired parameters
6941 if 'net.ipv4.ip_nonlocal_bind' in param_dict:
6942 k_param['net.ipv4.ip_nonlocal_bind'] = param_dict['net.ipv4.ip_nonlocal_bind']
6943
6944 return k_param
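
# Illustrative sysctl output line handled above (value hypothetical):
#   'net.ipv4.ip_nonlocal_bind = 0'
# param.split(' = ') yields ['net.ipv4.ip_nonlocal_bind', '0'], so the dict
# comprehension maps each parameter name to its value string.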
6945
6946 @staticmethod
6947 def _process_net_data(tcp_file: str, protocol: str = 'tcp') -> List[int]:
6948 listening_ports = []
6949 # Connections state documentation
6950 # tcp - https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/include/net/tcp_states.h
6951 # udp - uses 07 (TCP_CLOSE or UNCONN, since udp is stateless. test with netcat -ul <port>)
6952 listening_state = {
6953 'tcp': '0A',
6954 'udp': '07'
6955 }
6956
6957 if protocol not in listening_state.keys():
6958 return []
6959
6960 if os.path.exists(tcp_file):
6961 with open(tcp_file) as f:
6962 tcp_data = f.readlines()[1:]
6963
6964 for con in tcp_data:
6965 con_info = con.strip().split()
6966 if con_info[3] == listening_state[protocol]:
6967 local_port = int(con_info[1].split(':')[1], 16)
6968 listening_ports.append(local_port)
6969
6970 return listening_ports
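
# Sketch of a /proc/net/tcp entry as parsed above (fields abridged):
#   sl  local_address rem_address   st ...
#    0: 00000000:1A85 00000000:0000 0A ...
# st == '0A' is TCP_LISTEN, and the local port is the hex field after the
# colon:
#   >>> int('00000000:1A85'.split(':')[1], 16)
#   6789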
6971
6972 @property
6973 def tcp_ports_used(self) -> List[int]:
6974 return HostFacts._process_net_data('/proc/net/tcp')
6975
6976 @property
6977 def tcp6_ports_used(self) -> List[int]:
6978 return HostFacts._process_net_data('/proc/net/tcp6')
6979
6980 @property
6981 def udp_ports_used(self) -> List[int]:
6982 return HostFacts._process_net_data('/proc/net/udp', 'udp')
6983
6984 @property
6985 def udp6_ports_used(self) -> List[int]:
6986 return HostFacts._process_net_data('/proc/net/udp6', 'udp')
6987
6988 def dump(self):
6989 # type: () -> str
6990 """Return the attributes of this HostFacts object as json"""
6991 data = {
6992 k: getattr(self, k) for k in dir(self)
6993 if not k.startswith('_')
6994 and isinstance(getattr(self, k), (float, int, str, list, dict, tuple))
6995 }
6996 return json.dumps(data, indent=2, sort_keys=True)
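
# Abridged, illustrative dump() output - only attributes holding plain
# str/int/float/list/dict/tuple values are serialized:
# {
#   "interfaces": {...},
#   "memory_total_kb": 16310408,
#   "nic_count": 2,
#   "tcp_ports_used": [22, 6789],
#   ...
# }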
6997
6998 ##################################
6999
7000
7001 def command_gather_facts(ctx: CephadmContext) -> None:
7002 """gather_facts is intended to provide host releated metadata to the caller"""
7003 host = HostFacts(ctx)
7004 print(host.dump())
7005
7006
7007 ##################################
7008
7009
7010 class CephadmCache:
7011 task_types = ['disks', 'daemons', 'host', 'http_server']
7012
7013 def __init__(self) -> None:
7014 self.started_epoch_secs = time.time()
7015 self.tasks = {
7016 'daemons': 'inactive',
7017 'disks': 'inactive',
7018 'host': 'inactive',
7019 'http_server': 'inactive',
7020 }
7021 self.errors: list = []
7022 self.disks: dict = {}
7023 self.daemons: dict = {}
7024 self.host: dict = {}
7025 self.lock = RLock()
7026
7027 @property
7028 def health(self) -> dict:
7029 return {
7030 'started_epoch_secs': self.started_epoch_secs,
7031 'tasks': self.tasks,
7032 'errors': self.errors,
7033 }
7034
7035 def to_json(self) -> dict:
7036 return {
7037 'health': self.health,
7038 'host': self.host,
7039 'daemons': self.daemons,
7040 'disks': self.disks,
7041 }
7042
7043 def update_health(self, task_type: str, task_status: str, error_msg: Optional[str] = None) -> None:
7044 assert task_type in CephadmCache.task_types
7045 with self.lock:
7046 self.tasks[task_type] = task_status
7047 if error_msg:
7048 self.errors.append(error_msg)
7049
7050 def update_task(self, task_type: str, content: dict) -> None:
7051 assert task_type in CephadmCache.task_types
7052 assert isinstance(content, dict)
7053 with self.lock:
7054 current = getattr(self, task_type)
7055 for k in content:
7056 current[k] = content[k]
7057
7058 setattr(self, task_type, current)
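
# update_task() is a shallow merge - a sketch of the effect:
#   cache.update_task('host', {'scrape_errors': [], 'data': {...}})
# overwrites only the keys present in `content`; any other keys already in
# the cached dict are left untouched.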
7059
7060
7061 class CephadmHTTPServer(ThreadingMixIn, HTTPServer):
7062 allow_reuse_address = True
7063 daemon_threads = True
7064 cephadm_cache: CephadmCache
7065 token: str
7066
7067
7068 class CephadmDaemonHandler(BaseHTTPRequestHandler):
7069 server: CephadmHTTPServer
7070 api_version = 'v1'
7071 valid_routes = [
7072 f'/{api_version}/metadata',
7073 f'/{api_version}/metadata/health',
7074 f'/{api_version}/metadata/disks',
7075 f'/{api_version}/metadata/daemons',
7076 f'/{api_version}/metadata/host',
7077 ]
7078
7079 class Decorators:
7080 @classmethod
7081 def authorize(cls, f: Any) -> Any:
7082 """Implement a basic token check.
7083
7084 The token is installed at deployment time and must be provided to
7085 ensure we only respond to callers who know our token i.e. mgr
7086 """
7087
7088 def wrapper(self: Any, *args: Any, **kwargs: Any) -> Any:
7089 auth = self.headers.get('Authorization', None)
7090 if auth != 'Bearer ' + self.server.token:
7091 self.send_error(401)
7092 return
7093 f(self, *args, **kwargs)
7094
7095 return wrapper
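
# Illustrative request against an endpoint guarded by this decorator (token
# value hypothetical; -k accepts the exporter's self-signed certificate):
#   curl -k -H "Authorization: Bearer <token>" \
#       https://localhost:9443/v1/metadata/health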
7096
7097 def _help_page(self) -> str:
7098 return """<!DOCTYPE html>
7099 <html>
7100 <head><title>cephadm metadata exporter</title>
7101 <style>
7102 body {{
7103 font-family: sans-serif;
7104 font-size: 0.8em;
7105 }}
7106 table {{
7107 border-width: 0px;
7108 border-spacing: 0px;
7109 margin-left:20px;
7110 }}
7111 tr:hover {{
7112 background: PowderBlue;
7113 }}
7114 td,th {{
7115 padding: 5px;
7116 }}
7117 </style>
</head>
7118 <body>
7119 <h1>cephadm metadata exporter {api_version}</h1>
7120 <table>
7121 <thead>
7122 <tr><th>Endpoint</th><th>Methods</th><th>Response</th><th>Description</th></tr>
7123 </thead>
7124 <tr><td><a href='{api_version}/metadata'>{api_version}/metadata</a></td><td>GET</td><td>JSON</td><td>Return <b>all</b> metadata for the host</td></tr>
7125 <tr><td><a href='{api_version}/metadata/daemons'>{api_version}/metadata/daemons</a></td><td>GET</td><td>JSON</td><td>Return daemon and systemd states for ceph daemons (ls)</td></tr>
7126 <tr><td><a href='{api_version}/metadata/disks'>{api_version}/metadata/disks</a></td><td>GET</td><td>JSON</td><td>Show disk inventory (ceph-volume)</td></tr>
7127 <tr><td><a href='{api_version}/metadata/health'>{api_version}/metadata/health</a></td><td>GET</td><td>JSON</td><td>Show current health of the exporter sub-tasks</td></tr>
7128 <tr><td><a href='{api_version}/metadata/host'>{api_version}/metadata/host</a></td><td>GET</td><td>JSON</td><td>Show host metadata (gather-facts)</td></tr>
7129 </table>
7130 </body>
7131 </html>""".format(api_version=CephadmDaemonHandler.api_version)
7132
7133 def _fetch_root(self) -> None:
7134 self.send_response(200)
7135 self.send_header('Content-type', 'text/html; charset=utf-8')
7136 self.end_headers()
7137 self.wfile.write(self._help_page().encode('utf-8'))
7138
7139 @Decorators.authorize
7140 def do_GET(self) -> None:
7141 """Handle *all* GET requests"""
7142
7143 if self.path == '/':
7144 # provide a html response if someone hits the root url, to document the
7145 # available api endpoints
7146 return self._fetch_root()
7147 elif self.path in CephadmDaemonHandler.valid_routes:
7148 u = self.path.split('/')[-1]
7149 data = json.dumps({})
7150 status_code = 200
7151
7152 tasks = self.server.cephadm_cache.health.get('tasks', {})
7153 assert tasks
7154
7155 # We're using the http status code to help indicate thread health
7156 # - 200 (OK): request successful
7157 # - 204 (No Content): access to a cache relating to a dead thread
7158 # - 206 (Partial content): one or more threads are inactive
7159 # - 500 (Server Error): all threads inactive
7160 if u == 'metadata':
7161 data = json.dumps(self.server.cephadm_cache.to_json())
7162 if all([tasks[task_name] == 'inactive' for task_name in tasks if task_name != 'http_server']):
7163 # All the subtasks are dead!
7164 status_code = 500
7165 elif any([tasks[task_name] == 'inactive' for task_name in tasks if task_name != 'http_server']):
7166 status_code = 206
7167
7168 # Individual GETs against a task's endpoint will also return a 204 if the corresponding thread is inactive
7169 elif u == 'daemons':
7170 data = json.dumps(self.server.cephadm_cache.daemons)
7171 if tasks['daemons'] == 'inactive':
7172 status_code = 204
7173 elif u == 'disks':
7174 data = json.dumps(self.server.cephadm_cache.disks)
7175 if tasks['disks'] == 'inactive':
7176 status_code = 204
7177 elif u == 'host':
7178 data = json.dumps(self.server.cephadm_cache.host)
7179 if tasks['host'] == 'inactive':
7180 status_code = 204
7181
7182 # a GET against health will always return a 200, since the op is always successful
7183 elif u == 'health':
7184 data = json.dumps(self.server.cephadm_cache.health)
7185
7186 self.send_response(status_code)
7187 self.send_header('Content-type', 'application/json')
7188 self.end_headers()
7189 self.wfile.write(data.encode('utf-8'))
7190 else:
7191 # Invalid GET URL
7192 bad_request_msg = 'Valid URLs are: {}'.format(', '.join(CephadmDaemonHandler.valid_routes))
7193 self.send_response(404, message=bad_request_msg) # reason
7194 self.send_header('Content-type', 'application/json')
7195 self.end_headers()
7196 self.wfile.write(json.dumps({'message': bad_request_msg}).encode('utf-8'))
7197
7198 def log_message(self, format: str, *args: Any) -> None:
7199 rqst = ' '.join(str(a) for a in args)
7200 logger.info(f'client:{self.address_string()} [{self.log_date_time_string()}] {rqst}')
7201
7202
7203 class CephadmDaemon():
7204
7205 daemon_type = 'cephadm-exporter'
7206 default_port = 9443
7207 key_name = 'key'
7208 crt_name = 'crt'
7209 token_name = 'token'
7210 config_requirements = [
7211 key_name,
7212 crt_name,
7213 token_name,
7214 ]
7215 loop_delay = 1
7216 thread_check_interval = 5
7217
7218 def __init__(self, ctx: CephadmContext, fsid: str, daemon_id: Optional[str] = None, port: Optional[int] = None) -> None:
7219 self.ctx = ctx
7220 self.fsid = fsid
7221 self.daemon_id = daemon_id
7222 if not port:
7223 self.port = CephadmDaemon.default_port
7224 else:
7225 self.port = port
7226 self.workers: List[Thread] = []
7227 self.http_server: CephadmHTTPServer
7228 self.stop = False
7229 self.cephadm_cache = CephadmCache()
7230 self.errors: List[str] = []
7231 self.token = read_file([os.path.join(self.daemon_path, CephadmDaemon.token_name)])
7232
7233 @classmethod
7234 def validate_config(cls, config: dict) -> None:
7235 reqs = ', '.join(CephadmDaemon.config_requirements)
7236 errors = []
7237
7238 if not config or not all([k_name in config for k_name in CephadmDaemon.config_requirements]):
7239 raise Error(f'config must contain the following fields : {reqs}')
7240
7241 if not all([isinstance(config[k_name], str) for k_name in CephadmDaemon.config_requirements]):
7242 errors.append(f'the following fields must be strings: {reqs}')
7243
7244 crt = config[CephadmDaemon.crt_name]
7245 key = config[CephadmDaemon.key_name]
7246 token = config[CephadmDaemon.token_name]
7247
7248 if not crt.startswith('-----BEGIN CERTIFICATE-----') or not crt.endswith('-----END CERTIFICATE-----\n'):
7249 errors.append('crt field is not a valid SSL certificate')
7250 if not key.startswith('-----BEGIN PRIVATE KEY-----') or not key.endswith('-----END PRIVATE KEY-----\n'):
7251 errors.append('key is not a valid SSL private key')
7252 if len(token) < 8:
7253 errors.append("'token' must be at least 8 characters long")
7254
7255 if 'port' in config:
7256 try:
7257 p = int(config['port'])
7258 if p <= 1024:
7259 raise ValueError
7260 except (TypeError, ValueError):
7261 errors.append('port must be an integer > 1024')
7262
7263 if errors:
7264 raise Error('Parameter errors : {}'.format(', '.join(errors)))
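
# Shape of a config dict that passes validate_config (certificate, key and
# token values elided/hypothetical):
#   {
#       'crt': '-----BEGIN CERTIFICATE-----\n...\n-----END CERTIFICATE-----\n',
#       'key': '-----BEGIN PRIVATE KEY-----\n...\n-----END PRIVATE KEY-----\n',
#       'token': '<at least 8 characters>',
#       'port': 9443,  # optional, must be > 1024
#   }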
7265
7266 @property
7267 def port_active(self) -> bool:
7268 return port_in_use(self.ctx, self.port)
7269
7270 @property
7271 def can_run(self) -> bool:
7272 # if port is in use
7273 if self.port_active:
7274 self.errors.append(f'TCP port {self.port} already in use, unable to bind')
7275 if not os.path.exists(os.path.join(self.daemon_path, CephadmDaemon.key_name)):
7276 self.errors.append(f"Key file '{CephadmDaemon.key_name}' is missing from {self.daemon_path}")
7277 if not os.path.exists(os.path.join(self.daemon_path, CephadmDaemon.crt_name)):
7278 self.errors.append(f"Certificate file '{CephadmDaemon.crt_name}' is missing from {self.daemon_path}")
7279 if self.token == 'Unknown':
7280 self.errors.append(f"Authentication token '{CephadmDaemon.token_name}' is missing from {self.daemon_path}")
7281 return len(self.errors) == 0
7282
7283 @staticmethod
7284 def _unit_name(fsid: str, daemon_id: str) -> str:
7285 return '{}.service'.format(get_unit_name(fsid, CephadmDaemon.daemon_type, daemon_id))
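
# With the usual get_unit_name() naming convention this yields something
# like (fsid and daemon_id illustrative):
#   ceph-<fsid>@cephadm-exporter.<daemon_id>.service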
7286
7287 @property
7288 def unit_name(self) -> str:
7289 assert self.daemon_id is not None
7290 return CephadmDaemon._unit_name(self.fsid, self.daemon_id)
7291
7292 @property
7293 def daemon_path(self) -> str:
7294 return os.path.join(
7295 self.ctx.data_dir,
7296 self.fsid,
7297 f'{self.daemon_type}.{self.daemon_id}'
7298 )
7299
7300 @property
7301 def binary_path(self) -> str:
7302 path = os.path.realpath(__file__)
7303 assert os.path.isfile(path)
7304 return path
7305
7306 def _handle_thread_exception(self, exc: Exception, thread_type: str) -> None:
7307 e_msg = f'{exc.__class__.__name__} exception: {str(exc)}'
7308 thread_info = getattr(self.cephadm_cache, thread_type)
7309 errors = thread_info.get('scrape_errors', [])
7310 errors.append(e_msg)
7311 logger.error(e_msg)
7312 logger.exception(exc)
7313 self.cephadm_cache.update_task(
7314 thread_type,
7315 {
7316 'scrape_errors': errors,
7317 'data': None,
7318 }
7319 )
7320
7321 def _scrape_host_facts(self, refresh_interval: int = 10) -> None:
7322 ctr = 0
7323 exception_encountered = False
7324
7325 while True:
7326
7327 if self.stop or exception_encountered:
7328 break
7329
7330 if ctr >= refresh_interval:
7331 ctr = 0
7332 logger.debug('executing host-facts scrape')
7333 errors = []
7334 s_time = time.time()
7335
7336 try:
7337 facts = HostFacts(self.ctx)
7338 except Exception as e:
7339 self._handle_thread_exception(e, 'host')
7340 exception_encountered = True
7341 else:
7342 elapsed = time.time() - s_time
7343 try:
7344 data = json.loads(facts.dump())
7345 except json.decoder.JSONDecodeError:
7346 errors.append('host-facts provided invalid JSON')
7347 logger.warning(errors[-1])
7348 data = {}
7349 self.cephadm_cache.update_task(
7350 'host',
7351 {
7352 'scrape_timestamp': s_time,
7353 'scrape_duration_secs': elapsed,
7354 'scrape_errors': errors,
7355 'data': data,
7356 }
7357 )
7358 logger.debug(f'completed host-facts scrape - {elapsed}s')
7359
7360 time.sleep(CephadmDaemon.loop_delay)
7361 ctr += CephadmDaemon.loop_delay
7362 logger.info('host-facts thread stopped')
7363
7364 def _scrape_ceph_volume(self, refresh_interval: int = 15) -> None:
7365 # we're invoking the ceph_volume command, so we need to set the args that it
7366 # expects to use
7367 self.ctx.command = 'inventory --format=json'.split()
7368 self.ctx.fsid = self.fsid
7369
7370 ctr = 0
7371 exception_encountered = False
7372
7373 while True:
7374 if self.stop or exception_encountered:
7375 break
7376
7377 if ctr >= refresh_interval:
7378 ctr = 0
7379 logger.debug('executing ceph-volume scrape')
7380 errors = []
7381 s_time = time.time()
7382 stream = io.StringIO()
7383 try:
7384 with redirect_stdout(stream):
7385 command_ceph_volume(self.ctx)
7386 except Exception as e:
7387 self._handle_thread_exception(e, 'disks')
7388 exception_encountered = True
7389 else:
7390 elapsed = time.time() - s_time
7391
7392 # if the call to ceph-volume returns junk mixed in with
7393 # the json, it won't parse
7394 stdout = stream.getvalue()
7395
7396 data = []
7397 if stdout:
7398 try:
7399 data = json.loads(stdout)
7400 except json.decoder.JSONDecodeError:
7401 errors.append('ceph-volume thread provided bad json data')
7402 logger.warning(errors[-1])
7403 else:
7404 errors.append('ceph-volume did not return any data')
7405 logger.warning(errors[-1])
7406
7407 self.cephadm_cache.update_task(
7408 'disks',
7409 {
7410 'scrape_timestamp': s_time,
7411 'scrape_duration_secs': elapsed,
7412 'scrape_errors': errors,
7413 'data': data,
7414 }
7415 )
7416
7417 logger.debug(f'completed ceph-volume scrape - {elapsed}s')
7418 time.sleep(CephadmDaemon.loop_delay)
7419 ctr += CephadmDaemon.loop_delay
7420
7421 logger.info('ceph-volume thread stopped')
7422
7423 def _scrape_list_daemons(self, refresh_interval: int = 20) -> None:
7424 ctr = 0
7425 exception_encountered = False
7426 while True:
7427 if self.stop or exception_encountered:
7428 break
7429
7430 if ctr >= refresh_interval:
7431 ctr = 0
7432 logger.debug('executing list-daemons scrape')
7433 errors = []
7434 s_time = time.time()
7435
7436 try:
7437 # list daemons should ideally be invoked with an fsid
7438 data = list_daemons(self.ctx)
7439 except Exception as e:
7440 self._handle_thread_exception(e, 'daemons')
7441 exception_encountered = True
7442 else:
7443 if not isinstance(data, list):
7444 errors.append('list-daemons did not supply a list?')
7445 logger.warning(errors[-1])
7446 data = []
7447 elapsed = time.time() - s_time
7448 self.cephadm_cache.update_task(
7449 'daemons',
7450 {
7451 'scrape_timestamp': s_time,
7452 'scrape_duration_secs': elapsed,
7453 'scrape_errors': errors,
7454 'data': data,
7455 }
7456 )
7457 logger.debug(f'completed list-daemons scrape - {elapsed}s')
7458
7459 time.sleep(CephadmDaemon.loop_delay)
7460 ctr += CephadmDaemon.loop_delay
7461 logger.info('list-daemons thread stopped')
7462
7463 def _create_thread(self, target: Any, name: str, refresh_interval: Optional[int] = None) -> Thread:
7464 if refresh_interval:
7465 t = Thread(target=target, args=(refresh_interval,))
7466 else:
7467 t = Thread(target=target)
7468 t.daemon = True
7469 t.name = name
7470 self.cephadm_cache.update_health(name, 'active')
7471 t.start()
7472
7473 start_msg = f'Started {name} thread'
7474 if refresh_interval:
7475 logger.info(f'{start_msg}, with a refresh interval of {refresh_interval}s')
7476 else:
7477 logger.info(f'{start_msg}')
7478 return t
7479
7480 def reload(self, *args: Any) -> None:
7481 """reload -HUP received
7482
7483 This is a placeholder function only, and serves to provide the hook that could
7484 be exploited later if the exporter evolves to incorporate a config file
7485 """
7486 logger.info('Reload request received - ignoring, no action needed')
7487
7488 def shutdown(self, *args: Any) -> None:
7489 logger.info('Shutdown request received')
7490 self.stop = True
7491 self.http_server.shutdown()
7492
7493 def run(self) -> None:
7494 logger.info(f"cephadm exporter starting for FSID '{self.fsid}'")
7495 if not self.can_run:
7496 logger.error('Unable to start the exporter daemon')
7497 for e in self.errors:
7498 logger.error(e)
7499 return
7500
7501 # register signal handlers for running under systemd control
7502 signal.signal(signal.SIGTERM, self.shutdown)
7503 signal.signal(signal.SIGINT, self.shutdown)
7504 signal.signal(signal.SIGHUP, self.reload)
7505 logger.debug('Signal handlers attached')
7506
7507 host_facts = self._create_thread(self._scrape_host_facts, 'host', 5)
7508 self.workers.append(host_facts)
7509
7510 daemons = self._create_thread(self._scrape_list_daemons, 'daemons', 20)
7511 self.workers.append(daemons)
7512
7513 disks = self._create_thread(self._scrape_ceph_volume, 'disks', 20)
7514 self.workers.append(disks)
7515
7516 self.http_server = CephadmHTTPServer(('0.0.0.0', self.port), CephadmDaemonHandler) # IPv4 only
7517 self.http_server.socket = ssl.wrap_socket(self.http_server.socket,
7518 keyfile=os.path.join(self.daemon_path, CephadmDaemon.key_name),
7519 certfile=os.path.join(self.daemon_path, CephadmDaemon.crt_name),
7520 server_side=True)
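
# Note: ssl.wrap_socket() is deprecated since Python 3.7 (and removed in
# 3.12). An equivalent sketch using the SSLContext API would be:
#   context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
#   context.load_cert_chain(certfile=..., keyfile=...)
#   self.http_server.socket = context.wrap_socket(
#       self.http_server.socket, server_side=True)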
7521
7522 self.http_server.cephadm_cache = self.cephadm_cache
7523 self.http_server.token = self.token
7524 server_thread = self._create_thread(self.http_server.serve_forever, 'http_server')
7525 logger.info(f'https server listening on {self.http_server.server_address[0]}:{self.http_server.server_port}')
7526
7527 ctr = 0
7528 while server_thread.is_alive():
7529 if self.stop:
7530 break
7531
7532 if ctr >= CephadmDaemon.thread_check_interval:
7533 ctr = 0
7534 for worker in self.workers:
7535 if self.cephadm_cache.tasks[worker.name] == 'inactive':
7536 continue
7537 if not worker.is_alive():
7538 logger.warning(f'{worker.name} thread not running')
7539 stop_time = datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S')
7540 self.cephadm_cache.update_health(worker.name, 'inactive', f'{worker.name} stopped at {stop_time}')
7541
7542 time.sleep(CephadmDaemon.loop_delay)
7543 ctr += CephadmDaemon.loop_delay
7544
7545 logger.info('Main http server thread stopped')
7546
7547 @property
7548 def unit_run(self) -> str:
7549
7550 return """set -e
7551 {py3} {bin_path} exporter --fsid {fsid} --id {daemon_id} --port {port} &""".format(
7552 py3=shutil.which('python3'),
7553 bin_path=self.binary_path,
7554 fsid=self.fsid,
7555 daemon_id=self.daemon_id,
7556 port=self.port
7557 )
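
# Rendered unit.run content (interpreter path, fsid, id and binary path
# illustrative):
#   set -e
#   /usr/bin/python3 <binary_path> exporter --fsid <fsid> --id <id> --port 9443 &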
7558
7559 @property
7560 def unit_file(self) -> str:
7561 docker = isinstance(self.ctx.container_engine, Docker)
7562 return """#generated by cephadm
7563 [Unit]
7564 Description=cephadm exporter service for cluster {fsid}
7565 After=network-online.target{docker_after}
7566 Wants=network-online.target
7567 {docker_requires}
7568
7569 PartOf=ceph-{fsid}.target
7570 Before=ceph-{fsid}.target
7571
7572 [Service]
7573 Type=forking
7574 ExecStart=/bin/bash {daemon_path}/unit.run
7575 ExecReload=/bin/kill -HUP $MAINPID
7576 Restart=on-failure
7577 RestartSec=10s
7578
7579 [Install]
7580 WantedBy=ceph-{fsid}.target
7581 """.format(fsid=self.fsid,
7582 daemon_path=self.daemon_path,
7583 # if docker, we depend on docker.service
7584 docker_after=' docker.service' if docker else '',
7585 docker_requires='Requires=docker.service\n' if docker else '')
7586
7587 def deploy_daemon_unit(self, config: Optional[dict] = None) -> None:
7588 """deploy a specific unit file for cephadm
7589
7590 The normal deploy_daemon_units doesn't apply for this
7591 daemon since it's not a container, so we just create a
7592 simple service definition and add it to the fsid's target
7593 """
7594 if not config:
7595 raise Error('Attempting to deploy cephadm daemon without a config')
7596 assert isinstance(config, dict)
7597
7598 # Create the required config files in the daemons dir, with restricted permissions
7599 for filename in config:
7600 with open(os.open(os.path.join(self.daemon_path, filename), os.O_CREAT | os.O_WRONLY, mode=0o600), 'w') as f:
7601 f.write(config[filename])
7602
7603 # When __file__ is <stdin> we're being invoked over remoto via the orchestrator, so
7604 # we pick up the file from where the orchestrator placed it - otherwise we'll
7605 # copy it to the binary location for this cluster
7606 if not __file__ == '<stdin>':
7607 try:
7608 shutil.copy(__file__,
7609 self.binary_path)
7610 except shutil.SameFileError:
7611 pass
7612
7613 with open(os.path.join(self.daemon_path, 'unit.run'), 'w') as f:
7614 f.write(self.unit_run)
7615
7616 with open(
7617 os.path.join(self.ctx.unit_dir,
7618 f'{self.unit_name}.new'),
7619 'w'
7620 ) as f:
7621 f.write(self.unit_file)
7622 os.rename(
7623 os.path.join(self.ctx.unit_dir, f'{self.unit_name}.new'),
7624 os.path.join(self.ctx.unit_dir, self.unit_name))
7625
7626 call_throws(self.ctx, ['systemctl', 'daemon-reload'])
7627 call(self.ctx, ['systemctl', 'stop', self.unit_name],
7628 verbosity=CallVerbosity.DEBUG)
7629 call(self.ctx, ['systemctl', 'reset-failed', self.unit_name],
7630 verbosity=CallVerbosity.DEBUG)
7631 call_throws(self.ctx, ['systemctl', 'enable', '--now', self.unit_name])
7632
7633 @classmethod
7634 def uninstall(cls, ctx: CephadmContext, fsid: str, daemon_type: str, daemon_id: str) -> None:
7635 unit_name = CephadmDaemon._unit_name(fsid, daemon_id)
7636 unit_path = os.path.join(ctx.unit_dir, unit_name)
7637 unit_run = os.path.join(ctx.data_dir, fsid, f'{daemon_type}.{daemon_id}', 'unit.run')
7638 port = None
7639 try:
7640 with open(unit_run, 'r') as u:
7641 contents = u.read().strip(' &')
7642 except OSError:
7643 logger.warning(f'Unable to access the unit.run file @ {unit_run}')
7644 return
7645
7647 for line in contents.split('\n'):
7648 if '--port ' in line:
7649 try:
7650 port = int(line.split('--port ')[-1])
7651 except ValueError:
7652 logger.warning('Unexpected format in unit.run file: port is not numeric')
7653 logger.warning('Unable to remove the systemd file and close the port')
7654 return
7655 break
7656
7657 if port:
7658 fw = Firewalld(ctx)
7659 try:
7660 fw.close_ports([port])
7661 except RuntimeError:
7662 logger.error(f'Unable to close port {port}')
7663
7664 stdout, stderr, rc = call(ctx, ['rm', '-f', unit_path])
7665 if rc:
7666 logger.error(f'Unable to remove the systemd file @ {unit_path}')
7667 else:
7668 logger.info(f'removed systemd unit file @ {unit_path}')
7669 stdout, stderr, rc = call(ctx, ['systemctl', 'daemon-reload'])
7670
7671
7672 def command_exporter(ctx: CephadmContext) -> None:
7673 exporter = CephadmDaemon(ctx, ctx.fsid, daemon_id=ctx.id, port=ctx.port)
7674
7675 if ctx.fsid not in os.listdir(ctx.data_dir):
7676 raise Error(f"cluster fsid '{ctx.fsid}' not found in '{ctx.data_dir}'")
7677
7678 exporter.run()
7679
7680 ##################################
7681
7682
7683 def systemd_target_state(target_name: str, subsystem: str = 'ceph') -> bool:
7684 # TODO: UNITTEST
7685 return os.path.exists(
7686 os.path.join(
7687 UNIT_DIR,
7688 f'{subsystem}.target.wants',
7689 target_name
7690 )
7691 )
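
# e.g. systemd_target_state('ceph-<fsid>.target') checks for
#   /etc/systemd/system/ceph.target.wants/ceph-<fsid>.target
# which exists while the per-cluster target is enabled (wanted by ceph.target).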
7692
7693
7694 @infer_fsid
7695 def command_maintenance(ctx: CephadmContext) -> str:
7696 if not ctx.fsid:
7697 raise Error('must pass --fsid to specify cluster')
7698
7699 target = f'ceph-{ctx.fsid}.target'
7700
7701 if ctx.maintenance_action.lower() == 'enter':
7702 logger.info('Requested to place host into maintenance')
7703 if systemd_target_state(target):
7704 _out, _err, code = call(ctx,
7705 ['systemctl', 'disable', target],
7706 verbosity=CallVerbosity.DEBUG)
7707 if code:
7708 logger.error(f'Failed to disable the {target} target')
7709 return 'failed - unable to disable the target'
7710 else:
7711 # stopping a target waits by default
7712 _out, _err, code = call(ctx,
7713 ['systemctl', 'stop', target],
7714 verbosity=CallVerbosity.DEBUG)
7715 if code:
7716 logger.error(f'Failed to stop the {target} target')
7717 return 'failed - unable to stop the target'
7718 else:
7719 return f'success - systemd target {target} disabled'
7720
7721 else:
7722 return 'skipped - target already disabled'
7723
7724 else:
7725 logger.info('Requested to exit maintenance state')
7726 # exit maintenance request
7727 if not systemd_target_state(target):
7728 _out, _err, code = call(ctx,
7729 ['systemctl', 'enable', target],
7730 verbosity=CallVerbosity.DEBUG)
7731 if code:
7732 logger.error(f'Failed to enable the {target} target')
7733 return 'failed - unable to enable the target'
7734 else:
7735 # starting a target waits by default
7736 _out, _err, code = call(ctx,
7737 ['systemctl', 'start', target],
7738 verbosity=CallVerbosity.DEBUG)
7739 if code:
7740 logger.error(f'Failed to start the {target} target')
7741 return 'failed - unable to start the target'
7744 return f'success - systemd target {target} enabled and started'
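
# Typical invocations (fsid placeholder):
#   cephadm host-maintenance enter --fsid <fsid>
#   cephadm host-maintenance exit --fsid <fsid>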
7745
7746 ##################################
7747
7748
7749 def _get_parser():
7750 # type: () -> argparse.ArgumentParser
7751 parser = argparse.ArgumentParser(
7752 description='Bootstrap Ceph daemons with systemd and containers.',
7753 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
7754 parser.add_argument(
7755 '--image',
7756 help='container image. Can also be set via the "CEPHADM_IMAGE" '
7757 'env var')
7758 parser.add_argument(
7759 '--docker',
7760 action='store_true',
7761 help='use docker instead of podman')
7762 parser.add_argument(
7763 '--data-dir',
7764 default=DATA_DIR,
7765 help='base directory for daemon data')
7766 parser.add_argument(
7767 '--log-dir',
7768 default=LOG_DIR,
7769 help='base directory for daemon logs')
7770 parser.add_argument(
7771 '--logrotate-dir',
7772 default=LOGROTATE_DIR,
7773 help='location of logrotate configuration files')
7774 parser.add_argument(
7775 '--sysctl-dir',
7776 default=SYSCTL_DIR,
7777 help='location of sysctl configuration files')
7778 parser.add_argument(
7779 '--unit-dir',
7780 default=UNIT_DIR,
7781 help='base directory for systemd units')
7782 parser.add_argument(
7783 '--verbose', '-v',
7784 action='store_true',
7785 help='Show debug-level log messages')
7786 parser.add_argument(
7787 '--timeout',
7788 type=int,
7789 default=DEFAULT_TIMEOUT,
7790 help='timeout in seconds')
7791 parser.add_argument(
7792 '--retry',
7793 type=int,
7794 default=DEFAULT_RETRY,
7795 help='max number of retries')
7796 parser.add_argument(
7797 '--env', '-e',
7798 action='append',
7799 default=[],
7800 help='set environment variable')
7801 parser.add_argument(
7802 '--no-container-init',
7803 action='store_true',
7804 default=not CONTAINER_INIT,
7805 help='Do not run podman/docker with `--init`')
7806
7807 subparsers = parser.add_subparsers(help='sub-command')
7808
7809 parser_version = subparsers.add_parser(
7810 'version', help='get ceph version from container')
7811 parser_version.set_defaults(func=command_version)
7812
7813 parser_pull = subparsers.add_parser(
7814 'pull', help='pull latest image version')
7815 parser_pull.set_defaults(func=command_pull)
7816
7817 parser_inspect_image = subparsers.add_parser(
7818 'inspect-image', help='inspect local container image')
7819 parser_inspect_image.set_defaults(func=command_inspect_image)
7820
7821 parser_ls = subparsers.add_parser(
7822 'ls', help='list daemon instances on this host')
7823 parser_ls.set_defaults(func=command_ls)
7824 parser_ls.add_argument(
7825 '--no-detail',
7826 action='store_true',
7827 help='Do not include daemon status')
7828 parser_ls.add_argument(
7829 '--legacy-dir',
7830 default='/',
7831 help='base directory for legacy daemon data')
7832
7833 parser_list_networks = subparsers.add_parser(
7834 'list-networks', help='list IP networks')
7835 parser_list_networks.set_defaults(func=command_list_networks)
7836
7837 parser_adopt = subparsers.add_parser(
7838 'adopt', help='adopt daemon deployed with a different tool')
7839 parser_adopt.set_defaults(func=command_adopt)
7840 parser_adopt.add_argument(
7841 '--name', '-n',
7842 required=True,
7843 help='daemon name (type.id)')
7844 parser_adopt.add_argument(
7845 '--style',
7846 required=True,
7847 help='deployment style (legacy, ...)')
7848 parser_adopt.add_argument(
7849 '--cluster',
7850 default='ceph',
7851 help='cluster name')
7852 parser_adopt.add_argument(
7853 '--legacy-dir',
7854 default='/',
7855 help='base directory for legacy daemon data')
7856 parser_adopt.add_argument(
7857 '--config-json',
7858 help='Additional configuration information in JSON format')
7859 parser_adopt.add_argument(
7860 '--skip-firewalld',
7861 action='store_true',
7862 help='Do not configure firewalld')
7863 parser_adopt.add_argument(
7864 '--skip-pull',
7865 action='store_true',
7866 help='do not pull the latest image before adopting')
7867 parser_adopt.add_argument(
7868 '--force-start',
7869 action='store_true',
7870 help='start newly adopted daemon, even if it was not running previously')
7871 parser_adopt.add_argument(
7872 '--container-init',
7873 action='store_true',
7874 default=CONTAINER_INIT,
7875 help=argparse.SUPPRESS)
7876
7877 parser_rm_daemon = subparsers.add_parser(
7878 'rm-daemon', help='remove daemon instance')
7879 parser_rm_daemon.set_defaults(func=command_rm_daemon)
7880 parser_rm_daemon.add_argument(
7881 '--name', '-n',
7882 required=True,
7883 action=CustomValidation,
7884 help='daemon name (type.id)')
7885 parser_rm_daemon.add_argument(
7886 '--fsid',
7887 required=True,
7888 help='cluster FSID')
7889 parser_rm_daemon.add_argument(
7890 '--force',
7891 action='store_true',
7892 help='proceed, even though this may destroy valuable data')
7893 parser_rm_daemon.add_argument(
7894 '--force-delete-data',
7895 action='store_true',
7896 help='delete valuable daemon data instead of making a backup')
7897
7898 parser_rm_cluster = subparsers.add_parser(
7899 'rm-cluster', help='remove all daemons for a cluster')
7900 parser_rm_cluster.set_defaults(func=command_rm_cluster)
7901 parser_rm_cluster.add_argument(
7902 '--fsid',
7903 required=True,
7904 help='cluster FSID')
7905 parser_rm_cluster.add_argument(
7906 '--force',
7907 action='store_true',
7908 help='proceed, even though this may destroy valuable data')
7909 parser_rm_cluster.add_argument(
7910 '--keep-logs',
7911 action='store_true',
7912 help='do not remove log files')
7913 parser_rm_cluster.add_argument(
7914 '--zap-osds',
7915 action='store_true',
7916 help='zap OSD devices for this cluster')
7917
7918 parser_run = subparsers.add_parser(
7919 'run', help='run a ceph daemon, in a container, in the foreground')
7920 parser_run.set_defaults(func=command_run)
7921 parser_run.add_argument(
7922 '--name', '-n',
7923 required=True,
7924 help='daemon name (type.id)')
7925 parser_run.add_argument(
7926 '--fsid',
7927 required=True,
7928 help='cluster FSID')
7929
7930 parser_shell = subparsers.add_parser(
7931 'shell', help='run an interactive shell inside a daemon container')
7932 parser_shell.set_defaults(func=command_shell)
7933 parser_shell.add_argument(
7934 '--fsid',
7935 help='cluster FSID')
7936 parser_shell.add_argument(
7937 '--name', '-n',
7938 help='daemon name (type.id)')
7939 parser_shell.add_argument(
7940 '--config', '-c',
7941 help='ceph.conf to pass through to the container')
7942 parser_shell.add_argument(
7943 '--keyring', '-k',
7944 help='ceph.keyring to pass through to the container')
7945 parser_shell.add_argument(
7946 '--mount', '-m',
7947 help=('mount a file or directory in the container. '
7948 'Supports multiple mounts. '
7949 'e.g.: `--mount /foo /bar:/bar`. '
7950 'When no destination is passed, default is /mnt'),
7951 nargs='+')
7952 parser_shell.add_argument(
7953 '--env', '-e',
7954 action='append',
7955 default=[],
7956 help='set environment variable')
7957 parser_shell.add_argument(
7958 '--volume', '-v',
7959 action='append',
7960 default=[],
7961 help='mount a volume')
7962 parser_shell.add_argument(
7963 'command', nargs=argparse.REMAINDER,
7964 help='command (optional)')
7965 parser_shell.add_argument(
7966 '--no-hosts',
7967 action='store_true',
7968 help="don't pass /etc/hosts through to the container")
7969
7970 parser_enter = subparsers.add_parser(
7971 'enter', help='run an interactive shell inside a running daemon container')
7972 parser_enter.set_defaults(func=command_enter)
7973 parser_enter.add_argument(
7974 '--fsid',
7975 help='cluster FSID')
7976 parser_enter.add_argument(
7977 '--name', '-n',
7978 required=True,
7979 help='daemon name (type.id)')
7980 parser_enter.add_argument(
7981 'command', nargs=argparse.REMAINDER,
7982 help='command')
7983
7984 parser_ceph_volume = subparsers.add_parser(
7985 'ceph-volume', help='run ceph-volume inside a container')
7986 parser_ceph_volume.set_defaults(func=command_ceph_volume)
7987 parser_ceph_volume.add_argument(
7988 '--fsid',
7989 help='cluster FSID')
7990 parser_ceph_volume.add_argument(
7991 '--config-json',
7992 help='JSON file with config and (client.bootstrap-osd) key')
7993 parser_ceph_volume.add_argument(
7994 '--config', '-c',
7995 help='ceph conf file')
7996 parser_ceph_volume.add_argument(
7997 '--keyring', '-k',
7998 help='ceph.keyring to pass through to the container')
7999 parser_ceph_volume.add_argument(
8000 'command', nargs=argparse.REMAINDER,
8001 help='command')
8002
8003 parser_zap_osds = subparsers.add_parser(
8004 'zap-osds', help='zap all OSDs associated with a particular fsid')
8005 parser_zap_osds.set_defaults(func=command_zap_osds)
8006 parser_zap_osds.add_argument(
8007 '--fsid',
8008 required=True,
8009 help='cluster FSID')
8010 parser_zap_osds.add_argument(
8011 '--force',
8012 action='store_true',
8013 help='proceed, even though this may destroy valuable data')
8014
8015 parser_unit = subparsers.add_parser(
8016 'unit', help="operate on the daemon's systemd unit")
8017 parser_unit.set_defaults(func=command_unit)
8018 parser_unit.add_argument(
8019 'command',
8020 help='systemd command (start, stop, restart, enable, disable, ...)')
8021 parser_unit.add_argument(
8022 '--fsid',
8023 help='cluster FSID')
8024 parser_unit.add_argument(
8025 '--name', '-n',
8026 required=True,
8027 help='daemon name (type.id)')
8028
8029 parser_logs = subparsers.add_parser(
8030 'logs', help='print journald logs for a daemon container')
8031 parser_logs.set_defaults(func=command_logs)
8032 parser_logs.add_argument(
8033 '--fsid',
8034 help='cluster FSID')
8035 parser_logs.add_argument(
8036 '--name', '-n',
8037 required=True,
8038 help='daemon name (type.id)')
8039 parser_logs.add_argument(
8040 'command', nargs='*',
8041 help='additional journalctl args')
8042
8043 parser_bootstrap = subparsers.add_parser(
8044 'bootstrap', help='bootstrap a cluster (mon + mgr daemons)')
8045 parser_bootstrap.set_defaults(func=command_bootstrap)
8046 parser_bootstrap.add_argument(
8047 '--config', '-c',
8048 help='ceph conf file to incorporate')
8049 parser_bootstrap.add_argument(
8050 '--mon-id',
8051 required=False,
8052 help='mon id (default: local hostname)')
8053 parser_bootstrap.add_argument(
8054 '--mon-addrv',
8055 help='mon IPs (e.g., [v2:localipaddr:3300,v1:localipaddr:6789])')
8056 parser_bootstrap.add_argument(
8057 '--mon-ip',
8058 help='mon IP')
8059 parser_bootstrap.add_argument(
8060 '--mgr-id',
8061 required=False,
8062 help='mgr id (default: randomly generated)')
8063 parser_bootstrap.add_argument(
8064 '--fsid',
8065 help='cluster FSID')
8066 parser_bootstrap.add_argument(
8067 '--output-dir',
8068 default='/etc/ceph',
8069 help='directory to write config, keyring, and pub key files')
8070 parser_bootstrap.add_argument(
8071 '--output-keyring',
8072 help='location to write keyring file with new cluster admin and mon keys')
8073 parser_bootstrap.add_argument(
8074 '--output-config',
8075 help='location to write conf file to connect to new cluster')
8076 parser_bootstrap.add_argument(
8077 '--output-pub-ssh-key',
8078 help="location to write the cluster's public SSH key")
8079 parser_bootstrap.add_argument(
8080 '--skip-admin-label',
8081 action='store_true',
8082 help='do not create admin label for ceph.conf and client.admin keyring distribution')
8083 parser_bootstrap.add_argument(
8084 '--skip-ssh',
8085 action='store_true',
8086 help='skip setup of ssh key on local host')
8087 parser_bootstrap.add_argument(
8088 '--initial-dashboard-user',
8089 default='admin',
8090 help='Initial user for the dashboard')
8091 parser_bootstrap.add_argument(
8092 '--initial-dashboard-password',
8093 help='Initial password for the initial dashboard user')
8094 parser_bootstrap.add_argument(
8095 '--ssl-dashboard-port',
8096 type=int,
8097 default=8443,
8098 help='Port number used to connect with dashboard using SSL')
8099 parser_bootstrap.add_argument(
8100 '--dashboard-key',
8101 type=argparse.FileType('r'),
8102 help='Dashboard key')
8103 parser_bootstrap.add_argument(
8104 '--dashboard-crt',
8105 type=argparse.FileType('r'),
8106 help='Dashboard certificate')
8107
8108 parser_bootstrap.add_argument(
8109 '--ssh-config',
8110 type=argparse.FileType('r'),
8111 help='SSH config')
8112 parser_bootstrap.add_argument(
8113 '--ssh-private-key',
8114 type=argparse.FileType('r'),
8115 help='SSH private key')
8116 parser_bootstrap.add_argument(
8117 '--ssh-public-key',
8118 type=argparse.FileType('r'),
8119 help='SSH public key')
8120 parser_bootstrap.add_argument(
8121 '--ssh-user',
8122 default='root',
8123 help='set user for SSHing to cluster hosts, passwordless sudo will be needed for non-root users')
8124 parser_bootstrap.add_argument(
8125 '--skip-mon-network',
8126 action='store_true',
8127 help='skip setting mon public_network based on bootstrap mon ip')
8128 parser_bootstrap.add_argument(
8129 '--skip-dashboard',
8130 action='store_true',
8131 help='do not enable the Ceph Dashboard')
8132 parser_bootstrap.add_argument(
8133 '--dashboard-password-noupdate',
8134 action='store_true',
8135 help='stop forced dashboard password change')
8136 parser_bootstrap.add_argument(
8137 '--no-minimize-config',
8138 action='store_true',
8139 help='do not assimilate and minimize the config file')
8140 parser_bootstrap.add_argument(
8141 '--skip-ping-check',
8142 action='store_true',
8143 help='do not verify that mon IP is pingable')
8144 parser_bootstrap.add_argument(
8145 '--skip-pull',
8146 action='store_true',
8147 help='do not pull the latest image before bootstrapping')
8148 parser_bootstrap.add_argument(
8149 '--skip-firewalld',
8150 action='store_true',
8151 help='Do not configure firewalld')
8152 parser_bootstrap.add_argument(
8153 '--allow-overwrite',
8154 action='store_true',
8155 help='allow overwrite of existing --output-* config/keyring/ssh files')
8156 parser_bootstrap.add_argument(
8157 '--allow-fqdn-hostname',
8158 action='store_true',
8159 help='allow hostname that is fully-qualified (contains ".")')
8160 parser_bootstrap.add_argument(
8161 '--allow-mismatched-release',
8162 action='store_true',
8163 help="allow bootstrap of ceph that doesn't match this version of cephadm")
8164 parser_bootstrap.add_argument(
8165 '--skip-prepare-host',
8166 action='store_true',
8167 help='Do not prepare host')
8168 parser_bootstrap.add_argument(
8169 '--orphan-initial-daemons',
8170 action='store_true',
8171 help='Set mon and mgr service to `unmanaged`; do not create the crash service')
8172 parser_bootstrap.add_argument(
8173 '--skip-monitoring-stack',
8174 action='store_true',
8175 help='Do not automatically provision monitoring stack (prometheus, grafana, alertmanager, node-exporter)')
8176 parser_bootstrap.add_argument(
8177 '--apply-spec',
8178 help='Apply cluster spec after bootstrap (copy ssh key, add hosts and apply services)')
8179 parser_bootstrap.add_argument(
8180 '--shared_ceph_folder',
8181 metavar='CEPH_SOURCE_FOLDER',
8182 help='Development mode. Several folders in containers are volumes mapped to different sub-folders in the ceph source folder')
8183
8184 parser_bootstrap.add_argument(
8185 '--registry-url',
8186 help='url for custom registry')
8187 parser_bootstrap.add_argument(
8188 '--registry-username',
8189 help='username for custom registry')
8190 parser_bootstrap.add_argument(
8191 '--registry-password',
8192 help='password for custom registry')
8193 parser_bootstrap.add_argument(
8194 '--registry-json',
8195 help='json file with custom registry login info (URL, Username, Password)')
8196 parser_bootstrap.add_argument(
8197 '--container-init',
8198 action='store_true',
8199 default=CONTAINER_INIT,
8200 help=argparse.SUPPRESS)
8201 parser_bootstrap.add_argument(
8202 '--with-exporter',
8203 action='store_true',
8204 help='Automatically deploy cephadm metadata exporter to each node')
8205 parser_bootstrap.add_argument(
8206 '--exporter-config',
8207 action=CustomValidation,
8208 help=f'Exporter configuration information in JSON format (providing: {", ".join(CephadmDaemon.config_requirements)}, port information)')
8209 parser_bootstrap.add_argument(
8210 '--cluster-network',
8211 help='subnet to use for cluster replication, recovery and heartbeats (in CIDR notation network/mask)')
8212 parser_bootstrap.add_argument(
8213 '--single-host-defaults',
8214 action='store_true',
8215 help='adjust configuration defaults to suit a single-host cluster')
8216 parser_bootstrap.add_argument(
8217 '--log-to-file',
8218 action='store_true',
8219 help='configure cluster to log to traditional log files in /var/log/ceph/$fsid')
8220
8221 parser_deploy = subparsers.add_parser(
8222 'deploy', help='deploy a daemon')
8223 parser_deploy.set_defaults(func=command_deploy)
8224 parser_deploy.add_argument(
8225 '--name',
8226 required=True,
8227 action=CustomValidation,
8228 help='daemon name (type.id)')
8229 parser_deploy.add_argument(
8230 '--fsid',
8231 required=True,
8232 help='cluster FSID')
8233 parser_deploy.add_argument(
8234 '--config', '-c',
8235 help='config file for new daemon')
8236 parser_deploy.add_argument(
8237 '--config-json',
8238 help='Additional configuration information in JSON format')
8239 parser_deploy.add_argument(
8240 '--keyring',
8241 help='keyring for new daemon')
8242 parser_deploy.add_argument(
8243 '--key',
8244 help='key for new daemon')
8245 parser_deploy.add_argument(
8246 '--osd-fsid',
8247 help='OSD uuid, if creating an OSD container')
8248 parser_deploy.add_argument(
8249 '--skip-firewalld',
8250 action='store_true',
8251 help='Do not configure firewalld')
8252 parser_deploy.add_argument(
8253 '--tcp-ports',
8254 help='List of tcp ports to open in the host firewall')
8255 parser_deploy.add_argument(
8256 '--reconfig',
8257 action='store_true',
8258 help='Reconfigure a previously deployed daemon')
8259 parser_deploy.add_argument(
8260 '--allow-ptrace',
8261 action='store_true',
8262 help='Allow SYS_PTRACE on daemon container')
8263 parser_deploy.add_argument(
8264 '--container-init',
8265 action='store_true',
8266 default=CONTAINER_INIT,
8267 help=argparse.SUPPRESS)
8268 parser_deploy.add_argument(
8269 '--memory-request',
8270 help='Container memory request/target'
8271 )
8272 parser_deploy.add_argument(
8273 '--memory-limit',
8274 help='Container memory hard limit'
8275 )
8276 parser_deploy.add_argument(
8277 '--meta-json',
8278 help='JSON dict of additional metadata'
8279 )
8280
8281 parser_check_host = subparsers.add_parser(
8282 'check-host', help='check host configuration')
8283 parser_check_host.set_defaults(func=command_check_host)
8284 parser_check_host.add_argument(
8285 '--expect-hostname',
8286 help='Check that hostname matches an expected value')
8287
8288 parser_prepare_host = subparsers.add_parser(
8289 'prepare-host', help='prepare a host for cephadm use')
8290 parser_prepare_host.set_defaults(func=command_prepare_host)
8291 parser_prepare_host.add_argument(
8292 '--expect-hostname',
8293 help='Set hostname')
8294
8295 parser_add_repo = subparsers.add_parser(
8296 'add-repo', help='configure package repository')
8297 parser_add_repo.set_defaults(func=command_add_repo)
8298 parser_add_repo.add_argument(
8299 '--release',
8300 help='use latest version of a named release (e.g., {})'.format(LATEST_STABLE_RELEASE))
8301 parser_add_repo.add_argument(
8302 '--version',
8303 help='use specific upstream version (x.y.z)')
8304 parser_add_repo.add_argument(
8305 '--dev',
8306 help='use specified bleeding edge build from git branch or tag')
8307 parser_add_repo.add_argument(
8308 '--dev-commit',
8309 help='use specified bleeding edge build from git commit')
8310 parser_add_repo.add_argument(
8311 '--gpg-url',
8312 help='specify alternative GPG key location')
8313 parser_add_repo.add_argument(
8314 '--repo-url',
8315 default='https://download.ceph.com',
8316 help='specify alternative repo location')
8317 # TODO: proxy?
8318
8319 parser_rm_repo = subparsers.add_parser(
8320 'rm-repo', help='remove package repository configuration')
8321 parser_rm_repo.set_defaults(func=command_rm_repo)
8322
8323 parser_install = subparsers.add_parser(
8324 'install', help='install ceph package(s)')
8325 parser_install.set_defaults(func=command_install)
8326 parser_install.add_argument(
8327 'packages', nargs='*',
8328 default=['cephadm'],
8329 help='packages')
8330
8331 parser_registry_login = subparsers.add_parser(
8332 'registry-login', help='log host into authenticated registry')
8333 parser_registry_login.set_defaults(func=command_registry_login)
8334 parser_registry_login.add_argument(
8335 '--registry-url',
8336 help='url for custom registry')
8337 parser_registry_login.add_argument(
8338 '--registry-username',
8339 help='username for custom registry')
8340 parser_registry_login.add_argument(
8341 '--registry-password',
8342 help='password for custom registry')
8343 parser_registry_login.add_argument(
8344 '--registry-json',
8345 help='json file with custom registry login info (URL, Username, Password)')
8346 parser_registry_login.add_argument(
8347 '--fsid',
8348 help='cluster FSID')
8349
8350 parser_gather_facts = subparsers.add_parser(
8351 'gather-facts', help='gather and return host related information (JSON format)')
8352 parser_gather_facts.set_defaults(func=command_gather_facts)
8353
8354 parser_exporter = subparsers.add_parser(
8355 'exporter', help='Start cephadm in exporter mode (web service), providing host/daemon/disk metadata')
8356 parser_exporter.add_argument(
8357 '--fsid',
8358 required=True,
8359 type=str,
8360 help='fsid of the cephadm exporter to run against')
8361 parser_exporter.add_argument(
8362 '--port',
8363 type=int,
8364 default=int(CephadmDaemon.default_port),
8365 help='port number for the cephadm exporter service')
8366 parser_exporter.add_argument(
8367 '--id',
8368 type=str,
8369 default=get_hostname().split('.')[0],
8370 help='daemon identifier for the exporter')
8371 parser_exporter.set_defaults(func=command_exporter)
8372
8373 parser_maintenance = subparsers.add_parser(
8374 'host-maintenance', help='Manage the maintenance state of a host')
8375 parser_maintenance.add_argument(
8376 '--fsid',
8377 help='cluster FSID')
8378 parser_maintenance.add_argument(
8379 'maintenance_action',
8380 type=str,
8381 choices=['enter', 'exit'],
8382 help='Maintenance action - enter maintenance, or exit maintenance')
8383 parser_maintenance.set_defaults(func=command_maintenance)
8384
8385 return parser
8386
8387
8388 def _parse_args(av: List[str]) -> argparse.Namespace:
8389 parser = _get_parser()
8390
8391 args = parser.parse_args(av)
8392 if 'command' in args and args.command and args.command[0] == '--':
8393 args.command.pop(0)
8394
8395 # workaround argparse to deprecate the subparser `--container-init` flag
8396 # container_init and no_container_init must always be mutually exclusive
8397 container_init_args = ('--container-init', '--no-container-init')
8398 if set(container_init_args).issubset(av):
8399 parser.error('argument %s: not allowed with argument %s' % (container_init_args))
8400 elif '--container-init' in av:
8401 args.no_container_init = not args.container_init
8402 else:
8403 args.container_init = not args.no_container_init
8404 assert args.container_init is not args.no_container_init
8405
8406 return args
8407
8408
8409 def cephadm_init_ctx(args: List[str]) -> CephadmContext:
8410 ctx = CephadmContext()
8411 ctx.set_args(_parse_args(args))
8412 return ctx
8413
8414
8415 def cephadm_init(args: List[str]) -> CephadmContext:
8416 global logger
8417 ctx = cephadm_init_ctx(args)
8418
8419 # Logger configuration
8420 if not os.path.exists(LOG_DIR):
8421 os.makedirs(LOG_DIR)
8422 dictConfig(logging_config)
8423 logger = logging.getLogger()
8424
8425 if not os.path.exists(ctx.logrotate_dir + '/cephadm'):
8426 with open(ctx.logrotate_dir + '/cephadm', 'w') as f:
8427 f.write("""# created by cephadm
8428 /var/log/ceph/cephadm.log {
8429 rotate 7
8430 daily
8431 compress
8432 missingok
8433 notifempty
8434 }
8435 """)
8436
8437 if ctx.verbose:
8438 for handler in logger.handlers:
8439 if handler.name == 'console':
8440 handler.setLevel(logging.DEBUG)
8441
8442 return ctx
8443
8444
8445 def main() -> None:
8446
8447 # root?
8448 if os.geteuid() != 0:
8449 sys.stderr.write('ERROR: cephadm should be run as root\n')
8450 sys.exit(1)
8451
8452 av: List[str] = sys.argv[1:]
8454
8455 ctx = cephadm_init(av)
8456 if not ctx.has_function():
8457 sys.stderr.write('No command specified; pass -h or --help for usage\n')
8458 sys.exit(1)
8459
8460 try:
8461 # podman or docker?
8462 ctx.container_engine = find_container_engine(ctx)
8463 if ctx.func not in \
8464 [command_check_host, command_prepare_host, command_add_repo, command_install]:
8465 check_container_engine(ctx)
8466 # command handler
8467 r = ctx.func(ctx)
8468 except Error as e:
8469 if ctx.verbose:
8470 raise
8471 logger.error('ERROR: %s' % e)
8472 sys.exit(1)
8473 if not r:
8474 r = 0
8475 sys.exit(r)
8476
8477
8478 if __name__ == '__main__':
8479 main()