# cephadm -- Ceph deployment/bootstrap utility
# Source: ceph.git (pacific, 16.2.5), path: ceph/src/cephadm/cephadm
1 #!/usr/bin/python3
2
3 import asyncio
4 import asyncio.subprocess
5 import argparse
6 import datetime
7 import fcntl
8 import ipaddress
9 import json
10 import logging
11 from logging.config import dictConfig
12 import os
13 import platform
14 import pwd
15 import random
16 import shlex
17 import shutil
18 import socket
19 import string
20 import subprocess
21 import sys
22 import tempfile
23 import time
24 import errno
25 import struct
26 from socketserver import ThreadingMixIn
27 from http.server import BaseHTTPRequestHandler, HTTPServer
28 import signal
29 import io
30 from contextlib import redirect_stdout
31 import ssl
32 from enum import Enum
33
34 from typing import Dict, List, Tuple, Optional, Union, Any, NoReturn, Callable, IO
35
36 import re
37 import uuid
38
39 from configparser import ConfigParser
40 from functools import wraps
41 from glob import glob
42 from io import StringIO
43 from threading import Thread, RLock
44 from urllib.error import HTTPError
45 from urllib.request import urlopen
46 from pathlib import Path
47
# Default container images -----------------------------------------------------
DEFAULT_IMAGE = 'docker.io/ceph/ceph:v16'
DEFAULT_IMAGE_IS_MASTER = False
DEFAULT_IMAGE_RELEASE = 'pacific'
DEFAULT_PROMETHEUS_IMAGE = 'docker.io/prom/prometheus:v2.18.1'
DEFAULT_NODE_EXPORTER_IMAGE = 'docker.io/prom/node-exporter:v0.18.1'
DEFAULT_GRAFANA_IMAGE = 'docker.io/ceph/ceph-grafana:6.7.4'
DEFAULT_ALERT_MANAGER_IMAGE = 'docker.io/prom/alertmanager:v0.20.0'
DEFAULT_REGISTRY = 'docker.io'   # normalize unqualified digests to this
# ------------------------------------------------------------------------------

LATEST_STABLE_RELEASE = 'pacific'
DATA_DIR = '/var/lib/ceph'        # per-cluster daemon data root
LOG_DIR = '/var/log/ceph'
LOCK_DIR = '/run/cephadm'         # file locks (see FileLock below)
LOGROTATE_DIR = '/etc/logrotate.d'
SYSCTL_DIR = '/usr/lib/sysctl.d'
UNIT_DIR = '/etc/systemd/system'
LOG_DIR_MODE = 0o770
DATA_DIR_MODE = 0o700
CONTAINER_INIT = True
MIN_PODMAN_VERSION = (2, 0, 2)
CGROUPS_SPLIT_PODMAN_VERSION = (2, 1, 0)
CUSTOM_PS1 = r'[ceph: \u@\h \W]\$ '
DEFAULT_TIMEOUT = None  # in seconds
DEFAULT_RETRY = 15
SHELL_DEFAULT_CONF = '/etc/ceph/ceph.conf'
SHELL_DEFAULT_KEYRING = '/etc/ceph/ceph.client.admin.keyring'
DATEFMT = '%Y-%m-%dT%H:%M:%S.%fZ'

# Module-wide logger. Assigned at startup (presumably via dictConfig with
# `logging_config` below -- the setup code is outside this chunk); None until then.
logger: logging.Logger = None  # type: ignore

"""
You can invoke cephadm in two ways:

1. The normal way, at the command line.

2. By piping the script to the python3 binary. In this latter case, you should
   prepend one or more lines to the beginning of the script.

   For arguments,

       injected_argv = [...]

   e.g.,

       injected_argv = ['ls']

   For reading stdin from the '--config-json -' argument,

       injected_stdin = '...'
"""
# Cache for stdin content when the script is piped (see note above).
cached_stdin = None
101
102 ##################################
103
104
class BaseConfig:
    """Process-wide cephadm settings with their defaults.

    Values start at the module-level defaults and can be overridden from
    parsed command-line arguments via :meth:`set_from_args`.
    """

    def __init__(self):
        self.image: str = ''
        self.docker: bool = False                     # force docker over podman
        self.data_dir: str = DATA_DIR
        self.log_dir: str = LOG_DIR
        self.logrotate_dir: str = LOGROTATE_DIR
        self.sysctl_dir: str = SYSCTL_DIR
        self.unit_dir: str = UNIT_DIR
        self.verbose: bool = False
        self.timeout: Optional[int] = DEFAULT_TIMEOUT
        self.retry: int = DEFAULT_RETRY
        self.env: List[str] = []
        self.memory_request: Optional[int] = None
        self.memory_limit: Optional[int] = None

        self.container_init: bool = CONTAINER_INIT
        # Selected engine (Podman or Docker); chosen later at runtime.
        self.container_engine: Optional[ContainerEngine] = None

    def set_from_args(self, args: argparse.Namespace):
        """Copy matching attributes from an argparse namespace onto self.

        Only keys that already exist on this object are copied; unknown
        argparse entries are ignored.
        """
        argdict: Dict[str, Any] = vars(args)
        for k, v in argdict.items():
            if hasattr(self, k):
                setattr(self, k, v)
130
131
class CephadmContext:
    """Runtime context handed around cephadm.

    Attribute access is proxied: reads and writes fall through first to the
    wrapped :class:`BaseConfig` (``_conf``) and then to the parsed argparse
    namespace (``_args``), so callers can say ``ctx.foo`` regardless of
    where ``foo`` actually lives.
    """

    def __init__(self):
        # Write through __dict__ directly so our own __setattr__ (which
        # dereferences _conf/_args) is not triggered before they exist.
        self.__dict__['_args'] = None
        self.__dict__['_conf'] = BaseConfig()

    def set_args(self, args: argparse.Namespace) -> None:
        """Install parsed CLI args; known config keys are copied into _conf."""
        self._conf.set_from_args(args)
        self._args = args

    def has_function(self) -> bool:
        # argparse sets 'func' on the namespace for recognized subcommands
        return 'func' in self._args

    def __contains__(self, name: str) -> bool:
        return hasattr(self, name)

    def __getattr__(self, name: str) -> Any:
        # Only invoked when normal lookup fails: try config, then args,
        # then the default lookup (which raises AttributeError).
        if '_conf' in self.__dict__ and hasattr(self._conf, name):
            return getattr(self._conf, name)
        elif '_args' in self.__dict__ and hasattr(self._args, name):
            return getattr(self._args, name)
        else:
            return super().__getattribute__(name)

    def __setattr__(self, name: str, value: Any) -> None:
        # Mirror __getattr__: route the write to whichever object already
        # defines the attribute, else set it on the context itself.
        if hasattr(self._conf, name):
            setattr(self._conf, name, value)
        elif hasattr(self._args, name):
            setattr(self._args, name, value)
        else:
            super().__setattr__(name, value)
163
164
class ContainerEngine:
    """Base class for a supported container engine.

    Locates the engine executable on PATH at construction time.
    """

    def __init__(self):
        # find_program raises if the executable cannot be located
        self.path = find_program(self.EXE)

    @property
    def EXE(self) -> str:
        """Executable name; must be provided by subclasses."""
        raise NotImplementedError()
172
173
class Podman(ContainerEngine):
    EXE = 'podman'

    def __init__(self):
        super().__init__()
        # Cached version tuple; populated lazily by get_version().
        self._version = None

    @property
    def version(self):
        """Return the cached podman version; get_version() must run first."""
        if self._version is None:
            raise RuntimeError('Please call `get_version` first')
        return self._version

    def get_version(self, ctx: CephadmContext):
        """Run `podman version` and cache the parsed result on self."""
        out, _, _ = call_throws(ctx, [self.path, 'version', '--format', '{{.Client.Version}}'])
        self._version = _parse_podman_version(out)
190
191
class Docker(ContainerEngine):
    """Docker container engine (second choice after podman, per CONTAINER_PREFERENCE)."""
    EXE = 'docker'
194
195
CONTAINER_PREFERENCE = (Podman, Docker)  # prefer podman to docker


# Log and console output config
logging_config = {
    'version': 1,
    'disable_existing_loggers': True,
    'formatters': {
        'cephadm': {
            'format': '%(asctime)s %(levelname)s %(message)s'
        },
    },
    'handlers': {
        # console: INFO+ with no formatter (bare messages);
        # log_file: everything at DEBUG, timestamped via 'cephadm' formatter.
        'console': {
            'level': 'INFO',
            'class': 'logging.StreamHandler',
        },
        'log_file': {
            'level': 'DEBUG',
            # WatchedFileHandler reopens the file if logrotate moves it
            'class': 'logging.handlers.WatchedFileHandler',
            'formatter': 'cephadm',
            'filename': '%s/cephadm.log' % LOG_DIR,
        }
    },
    'loggers': {
        '': {
            'level': 'DEBUG',
            'handlers': ['console', 'log_file'],
        }
    }
}
227
228
class termcolor:
    """ANSI escape sequences for colorized terminal output."""
    yellow = '\033[93m'
    red = '\033[31m'
    end = '\033[0m'  # reset attributes
233
234
class Error(Exception):
    """Base class for errors raised by cephadm."""
    pass


class TimeoutExpired(Error):
    """Error subclass used for timeout conditions."""
    pass
241
242 ##################################
243
244
class Ceph(object):
    # Daemon types deployed from the core ceph container image.
    daemons = ('mon', 'mgr', 'mds', 'osd', 'rgw', 'rbd-mirror',
               'crash', 'cephfs-mirror')
248
249 ##################################
250
251
class OSD(object):
    """Static helpers for OSD daemons."""

    @staticmethod
    def get_sysctl_settings() -> List[str]:
        """Return sysctl tuning lines for hosts that run many OSDs."""
        settings = (
            '# allow a large number of OSDs',
            'fs.aio-max-nr = 1048576',
            'kernel.pid_max = 4194304',
        )
        return list(settings)
260
261 ##################################
262
263
class Monitoring(object):
    """Define the configs for the monitoring containers"""

    # TCP ports each monitoring daemon listens on (list per daemon type).
    port_map = {
        'prometheus': [9095],  # Avoid default 9090, due to conflict with cockpit UI
        'node-exporter': [9100],
        'grafana': [3000],
        'alertmanager': [9093, 9094],
    }

    # Per-component deployment parameters: image, resource hints, daemon
    # args, and which files/args come from the --config-json payload.
    components = {
        'prometheus': {
            'image': DEFAULT_PROMETHEUS_IMAGE,
            'cpus': '2',
            'memory': '4GB',
            'args': [
                '--config.file=/etc/prometheus/prometheus.yml',
                '--storage.tsdb.path=/prometheus',
            ],
            'config-json-files': [
                'prometheus.yml',
            ],
        },
        'node-exporter': {
            'image': DEFAULT_NODE_EXPORTER_IMAGE,
            'cpus': '1',
            'memory': '1GB',
            'args': [
                '--no-collector.timex',
            ],
        },
        'grafana': {
            'image': DEFAULT_GRAFANA_IMAGE,
            'cpus': '2',
            'memory': '4GB',
            'args': [],
            'config-json-files': [
                'grafana.ini',
                'provisioning/datasources/ceph-dashboard.yml',
                'certs/cert_file',
                'certs/cert_key',
            ],
        },
        'alertmanager': {
            'image': DEFAULT_ALERT_MANAGER_IMAGE,
            'cpus': '2',
            'memory': '2GB',
            'args': [
                # second alertmanager port is the cluster/gossip listener
                '--cluster.listen-address=:{}'.format(port_map['alertmanager'][1]),
            ],
            'config-json-files': [
                'alertmanager.yml',
            ],
            'config-json-args': [
                'peers',
            ],
        },
    }  # type: ignore

    @staticmethod
    def get_version(ctx, container_id, daemon_type):
        # type: (CephadmContext, str, str) -> str
        """
        :param: daemon_type Either "prometheus", "alertmanager" or "node-exporter"
        """
        assert daemon_type in ('prometheus', 'alertmanager', 'node-exporter')
        cmd = daemon_type.replace('-', '_')
        code = -1
        err = ''
        version = ''
        if daemon_type == 'alertmanager':
            # the binary name differs between packagings; try both
            for cmd in ['alertmanager', 'prometheus-alertmanager']:
                _, err, code = call(ctx, [
                    ctx.container_engine.path, 'exec', container_id, cmd,
                    '--version'
                ], verbosity=CallVerbosity.DEBUG)
                if code == 0:
                    break
            cmd = 'alertmanager'  # reset cmd for version extraction
        else:
            _, err, code = call(ctx, [
                ctx.container_engine.path, 'exec', container_id, cmd, '--version'
            ], verbosity=CallVerbosity.DEBUG)
        # version banner is printed on stderr as '<cmd>, version X.Y.Z ...'
        if code == 0 and \
                err.startswith('%s, version ' % cmd):
            version = err.split(' ')[2]
        return version
351
352 ##################################
353
354
def populate_files(config_dir, config_files, uid, gid):
    # type: (str, Dict, int, int) -> None
    """Create config files for different services.

    :param config_dir: directory in which to create the files
    :param config_files: mapping of file name -> content (a ``list`` value is
        joined with line breaks by ``dict_get_join``)
    :param uid: owner uid for the created files
    :param gid: owner gid for the created files
    """
    for fname in config_files:
        config_file = os.path.join(config_dir, fname)
        config_content = dict_get_join(config_files, fname)
        # lazy %-args: message is only formatted if the record is emitted
        logger.info('Write file: %s', config_file)
        with open(config_file, 'w', encoding='utf-8') as f:
            # chown/chmod before writing content so the file is never
            # readable by others while populated
            os.fchown(f.fileno(), uid, gid)
            os.fchmod(f.fileno(), 0o600)
            f.write(config_content)
366
367
class NFSGanesha(object):
    """Defines a NFS-Ganesha container"""

    daemon_type = 'nfs'
    entrypoint = '/usr/bin/ganesha.nfsd'
    daemon_args = ['-F', '-L', 'STDERR']  # foreground, log to stderr

    required_files = ['ganesha.conf']

    port_map = {
        'nfs': 2049,
    }

    def __init__(self,
                 ctx,
                 fsid,
                 daemon_id,
                 config_json,
                 image=DEFAULT_IMAGE):
        # type: (CephadmContext, str, Union[int, str], Dict, str) -> None
        self.ctx = ctx
        self.fsid = fsid
        self.daemon_id = daemon_id
        self.image = image

        # config-json options
        self.pool = dict_get(config_json, 'pool', require=True)
        self.namespace = dict_get(config_json, 'namespace')
        self.userid = dict_get(config_json, 'userid')
        self.extra_args = dict_get(config_json, 'extra_args', [])
        self.files = dict_get(config_json, 'files', {})
        self.rgw = dict_get(config_json, 'rgw', {})

        # validate the supplied args
        self.validate()

    @classmethod
    def init(cls, ctx, fsid, daemon_id):
        # type: (CephadmContext, str, Union[int, str]) -> NFSGanesha
        """Construct an instance from the context's --config-json and image."""
        return cls(ctx, fsid, daemon_id, get_parm(ctx.config_json), ctx.image)

    def get_container_mounts(self, data_dir):
        # type: (str) -> Dict[str, str]
        """Map host paths under data_dir to in-container mount points."""
        mounts = dict()
        mounts[os.path.join(data_dir, 'config')] = '/etc/ceph/ceph.conf:z'
        mounts[os.path.join(data_dir, 'keyring')] = '/etc/ceph/keyring:z'
        mounts[os.path.join(data_dir, 'etc/ganesha')] = '/etc/ganesha:z'
        if self.rgw:
            cluster = self.rgw.get('cluster', 'ceph')
            rgw_user = self.rgw.get('user', 'admin')
            mounts[os.path.join(data_dir, 'keyring.rgw')] = \
                '/var/lib/ceph/radosgw/%s-%s/keyring:z' % (cluster, rgw_user)
        return mounts

    @staticmethod
    def get_container_envs():
        # type: () -> List[str]
        """Environment variables for the ganesha container."""
        envs = [
            'CEPH_CONF=%s' % ('/etc/ceph/ceph.conf')
        ]
        return envs

    @staticmethod
    def get_version(ctx, container_id):
        # type: (CephadmContext, str) -> Optional[str]
        """Return the ganesha release from a running container, or None."""
        version = None
        out, err, code = call(ctx,
                              [ctx.container_engine.path, 'exec', container_id,
                               NFSGanesha.entrypoint, '-v'],
                              verbosity=CallVerbosity.DEBUG)
        if code == 0:
            # matches e.g. 'NFS-Ganesha Release = V3.5'
            match = re.search(r'NFS-Ganesha Release\s*=\s*[V]*([\d.]+)', out)
            if match:
                version = match.group(1)
        return version

    def validate(self):
        # type: () -> None
        """Raise Error when fsid, daemon_id, image, required files or the
        RGW section are missing/invalid."""
        if not is_fsid(self.fsid):
            raise Error('not an fsid: %s' % self.fsid)
        if not self.daemon_id:
            raise Error('invalid daemon_id: %s' % self.daemon_id)
        if not self.image:
            raise Error('invalid image: %s' % self.image)

        # check for the required files
        if self.required_files:
            for fname in self.required_files:
                if fname not in self.files:
                    raise Error('required file missing from config-json: %s' % fname)

        # check for an RGW config
        if self.rgw:
            if not self.rgw.get('keyring'):
                raise Error('RGW keyring is missing')
            if not self.rgw.get('user'):
                raise Error('RGW user is missing')

    def get_daemon_name(self):
        # type: () -> str
        """Return '<daemon_type>.<daemon_id>'."""
        return '%s.%s' % (self.daemon_type, self.daemon_id)

    def get_container_name(self, desc=None):
        # type: (Optional[str]) -> str
        """Return the container name, optionally suffixed with '-<desc>'."""
        cname = 'ceph-%s-%s' % (self.fsid, self.get_daemon_name())
        if desc:
            cname = '%s-%s' % (cname, desc)
        return cname

    def get_daemon_args(self):
        # type: () -> List[str]
        """Return static daemon args plus config-json 'extra_args'."""
        return self.daemon_args + self.extra_args

    def create_daemon_dirs(self, data_dir, uid, gid):
        # type: (str, int, int) -> None
        """Create files under the container data dir"""
        if not os.path.isdir(data_dir):
            raise OSError('data_dir is not a directory: %s' % (data_dir))

        logger.info('Creating ganesha config...')

        # create the ganesha conf dir
        config_dir = os.path.join(data_dir, 'etc/ganesha')
        makedirs(config_dir, uid, gid, 0o755)

        # populate files from the config-json
        populate_files(config_dir, self.files, uid, gid)

        # write the RGW keyring
        if self.rgw:
            keyring_path = os.path.join(data_dir, 'keyring.rgw')
            with open(keyring_path, 'w') as f:
                os.fchmod(f.fileno(), 0o600)
                os.fchown(f.fileno(), uid, gid)
                f.write(self.rgw.get('keyring', ''))
503
504 ##################################
505
506
class CephIscsi(object):
    """Defines a Ceph-Iscsi container"""

    daemon_type = 'iscsi'
    entrypoint = '/usr/bin/rbd-target-api'

    required_files = ['iscsi-gateway.cfg']

    def __init__(self,
                 ctx,
                 fsid,
                 daemon_id,
                 config_json,
                 image=DEFAULT_IMAGE):
        # type: (CephadmContext, str, Union[int, str], Dict, str) -> None
        self.ctx = ctx
        self.fsid = fsid
        self.daemon_id = daemon_id
        self.image = image

        # config-json options
        self.files = dict_get(config_json, 'files', {})

        # validate the supplied args
        self.validate()

    @classmethod
    def init(cls, ctx, fsid, daemon_id):
        # type: (CephadmContext, str, Union[int, str]) -> CephIscsi
        """Construct an instance from the context's --config-json and image."""
        return cls(ctx, fsid, daemon_id,
                   get_parm(ctx.config_json), ctx.image)

    @staticmethod
    def get_container_mounts(data_dir, log_dir):
        # type: (str, str) -> Dict[str, str]
        """Map host paths to in-container mount points for rbd-target-api."""
        mounts = dict()
        mounts[os.path.join(data_dir, 'config')] = '/etc/ceph/ceph.conf:z'
        mounts[os.path.join(data_dir, 'keyring')] = '/etc/ceph/keyring:z'
        mounts[os.path.join(data_dir, 'iscsi-gateway.cfg')] = '/etc/ceph/iscsi-gateway.cfg:z'
        mounts[os.path.join(data_dir, 'configfs')] = '/sys/kernel/config'
        mounts[log_dir] = '/var/log/rbd-target-api:z'
        mounts['/dev'] = '/dev'
        return mounts

    @staticmethod
    def get_container_binds():
        # type: () -> List[List[str]]
        """Read-only bind of /lib/modules into the container."""
        binds = []
        lib_modules = ['type=bind',
                       'source=/lib/modules',
                       'destination=/lib/modules',
                       'ro=true']
        binds.append(lib_modules)
        return binds

    @staticmethod
    def get_version(ctx, container_id):
        # type: (CephadmContext, str) -> Optional[str]
        """Return the installed ceph_iscsi package version, or None."""
        version = None
        out, err, code = call(ctx,
                              [ctx.container_engine.path, 'exec', container_id,
                               '/usr/bin/python3', '-c', "import pkg_resources; print(pkg_resources.require('ceph_iscsi')[0].version)"],
                              verbosity=CallVerbosity.DEBUG)
        if code == 0:
            version = out.strip()
        return version

    def validate(self):
        # type: () -> None
        """Raise Error when fsid, daemon_id, image or required files are
        missing/invalid."""
        if not is_fsid(self.fsid):
            raise Error('not an fsid: %s' % self.fsid)
        if not self.daemon_id:
            raise Error('invalid daemon_id: %s' % self.daemon_id)
        if not self.image:
            raise Error('invalid image: %s' % self.image)

        # check for the required files
        if self.required_files:
            for fname in self.required_files:
                if fname not in self.files:
                    raise Error('required file missing from config-json: %s' % fname)

    def get_daemon_name(self):
        # type: () -> str
        """Return '<daemon_type>.<daemon_id>'."""
        return '%s.%s' % (self.daemon_type, self.daemon_id)

    def get_container_name(self, desc=None):
        # type: (Optional[str]) -> str
        """Return the container name, optionally suffixed with '-<desc>'."""
        cname = 'ceph-%s-%s' % (self.fsid, self.get_daemon_name())
        if desc:
            cname = '%s-%s' % (cname, desc)
        return cname

    def create_daemon_dirs(self, data_dir, uid, gid):
        # type: (str, int, int) -> None
        """Create files under the container data dir"""
        if not os.path.isdir(data_dir):
            raise OSError('data_dir is not a directory: %s' % (data_dir))

        logger.info('Creating ceph-iscsi config...')
        configfs_dir = os.path.join(data_dir, 'configfs')
        makedirs(configfs_dir, uid, gid, 0o755)

        # populate files from the config-json
        populate_files(data_dir, self.files, uid, gid)

    @staticmethod
    def configfs_mount_umount(data_dir, mount=True):
        # type: (str, bool) -> List[str]
        """Return a shell command (split on whitespace) that mounts or
        unmounts configfs under data_dir, guarded by a /proc/mounts check.
        """
        mount_path = os.path.join(data_dir, 'configfs')
        if mount:
            cmd = 'if ! grep -qs {0} /proc/mounts; then ' \
                  'mount -t configfs none {0}; fi'.format(mount_path)
        else:
            cmd = 'if grep -qs {0} /proc/mounts; then ' \
                  'umount {0}; fi'.format(mount_path)
        return cmd.split()

    def get_tcmu_runner_container(self):
        # type: () -> CephContainer
        """Return a sidecar container object that runs tcmu-runner."""
        tcmu_container = get_container(self.ctx, self.fsid, self.daemon_type, self.daemon_id)
        tcmu_container.entrypoint = '/usr/bin/tcmu-runner'
        tcmu_container.cname = self.get_container_name(desc='tcmu')
        # remove extra container args for tcmu container.
        # extra args could cause issue with forking service type
        tcmu_container.container_args = []
        return tcmu_container
634
635 ##################################
636
637
class HAproxy(object):
    """Defines an HAproxy container"""
    daemon_type = 'haproxy'
    required_files = ['haproxy.cfg']
    default_image = 'haproxy'

    def __init__(self,
                 ctx: CephadmContext,
                 fsid: str, daemon_id: Union[int, str],
                 config_json: Dict, image: str) -> None:
        self.ctx = ctx
        self.fsid = fsid
        self.daemon_id = daemon_id
        self.image = image

        # config-json options
        self.files = dict_get(config_json, 'files', {})

        self.validate()

    @classmethod
    def init(cls, ctx: CephadmContext,
             fsid: str, daemon_id: Union[int, str]) -> 'HAproxy':
        """Construct an instance from the context's --config-json and image."""
        return cls(ctx, fsid, daemon_id, get_parm(ctx.config_json),
                   ctx.image)

    def create_daemon_dirs(self, data_dir: str, uid: int, gid: int) -> None:
        """Create files under the container data dir"""
        if not os.path.isdir(data_dir):
            raise OSError('data_dir is not a directory: %s' % (data_dir))

        # create additional directories in data dir for HAproxy to use
        if not os.path.isdir(os.path.join(data_dir, 'haproxy')):
            makedirs(os.path.join(data_dir, 'haproxy'), uid, gid, DATA_DIR_MODE)

        # config files land in the haproxy/ subdirectory
        data_dir = os.path.join(data_dir, 'haproxy')
        populate_files(data_dir, self.files, uid, gid)

    def get_daemon_args(self) -> List[str]:
        """Arguments passed to the haproxy entrypoint."""
        return ['haproxy', '-f', '/var/lib/haproxy/haproxy.cfg']

    def validate(self):
        # type: () -> None
        """Raise Error when fsid, daemon_id, image or required files are
        missing/invalid."""
        if not is_fsid(self.fsid):
            raise Error('not an fsid: %s' % self.fsid)
        if not self.daemon_id:
            raise Error('invalid daemon_id: %s' % self.daemon_id)
        if not self.image:
            raise Error('invalid image: %s' % self.image)

        # check for the required files
        if self.required_files:
            for fname in self.required_files:
                if fname not in self.files:
                    raise Error('required file missing from config-json: %s' % fname)

    def get_daemon_name(self):
        # type: () -> str
        """Return '<daemon_type>.<daemon_id>'."""
        return '%s.%s' % (self.daemon_type, self.daemon_id)

    def get_container_name(self, desc=None):
        # type: (Optional[str]) -> str
        """Return the container name, optionally suffixed with '-<desc>'."""
        cname = 'ceph-%s-%s' % (self.fsid, self.get_daemon_name())
        if desc:
            cname = '%s-%s' % (cname, desc)
        return cname

    def extract_uid_gid_haproxy(self):
        """Return (uid, gid) the container image uses for /var/lib."""
        # better directory for this?
        return extract_uid_gid(self.ctx, file_path='/var/lib')

    @staticmethod
    def get_container_mounts(data_dir: str) -> Dict[str, str]:
        """Mount the haproxy/ config subdir at /var/lib/haproxy."""
        mounts = dict()
        mounts[os.path.join(data_dir, 'haproxy')] = '/var/lib/haproxy'
        return mounts

    @staticmethod
    def get_sysctl_settings() -> List[str]:
        """Sysctl lines required on hosts running haproxy."""
        return [
            '# IP forwarding',
            'net.ipv4.ip_forward = 1',
        ]
721
722 ##################################
723
724
class Keepalived(object):
    """Defines an Keepalived container"""
    daemon_type = 'keepalived'
    required_files = ['keepalived.conf']
    default_image = 'arcts/keepalived'

    def __init__(self,
                 ctx: CephadmContext,
                 fsid: str, daemon_id: Union[int, str],
                 config_json: Dict, image: str) -> None:
        self.ctx = ctx
        self.fsid = fsid
        self.daemon_id = daemon_id
        self.image = image

        # config-json options
        self.files = dict_get(config_json, 'files', {})

        self.validate()

    @classmethod
    def init(cls, ctx: CephadmContext, fsid: str,
             daemon_id: Union[int, str]) -> 'Keepalived':
        """Construct an instance from the context's --config-json and image."""
        return cls(ctx, fsid, daemon_id,
                   get_parm(ctx.config_json), ctx.image)

    def create_daemon_dirs(self, data_dir: str, uid: int, gid: int) -> None:
        """Create files under the container data dir"""
        if not os.path.isdir(data_dir):
            raise OSError('data_dir is not a directory: %s' % (data_dir))

        # create additional directories in data dir for keepalived to use
        if not os.path.isdir(os.path.join(data_dir, 'keepalived')):
            makedirs(os.path.join(data_dir, 'keepalived'), uid, gid, DATA_DIR_MODE)

        # populate files from the config-json
        # NOTE: files go into data_dir itself (keepalived.conf is mounted
        # from there -- see get_container_mounts)
        populate_files(data_dir, self.files, uid, gid)

    def validate(self):
        # type: () -> None
        """Raise Error when fsid, daemon_id, image or required files are
        missing/invalid."""
        if not is_fsid(self.fsid):
            raise Error('not an fsid: %s' % self.fsid)
        if not self.daemon_id:
            raise Error('invalid daemon_id: %s' % self.daemon_id)
        if not self.image:
            raise Error('invalid image: %s' % self.image)

        # check for the required files
        if self.required_files:
            for fname in self.required_files:
                if fname not in self.files:
                    raise Error('required file missing from config-json: %s' % fname)

    def get_daemon_name(self):
        # type: () -> str
        """Return '<daemon_type>.<daemon_id>'."""
        return '%s.%s' % (self.daemon_type, self.daemon_id)

    def get_container_name(self, desc=None):
        # type: (Optional[str]) -> str
        """Return the container name, optionally suffixed with '-<desc>'."""
        cname = 'ceph-%s-%s' % (self.fsid, self.get_daemon_name())
        if desc:
            cname = '%s-%s' % (cname, desc)
        return cname

    @staticmethod
    def get_container_envs():
        # type: () -> List[str]
        """Environment variables driving the keepalived image's entrypoint."""
        envs = [
            'KEEPALIVED_AUTOCONF=false',
            'KEEPALIVED_CONF=/etc/keepalived/keepalived.conf',
            'KEEPALIVED_CMD=/usr/sbin/keepalived -n -l -f /etc/keepalived/keepalived.conf',
            'KEEPALIVED_DEBUG=false'
        ]
        return envs

    @staticmethod
    def get_sysctl_settings() -> List[str]:
        """Sysctl lines required on hosts running keepalived."""
        return [
            '# IP forwarding and non-local bind',
            'net.ipv4.ip_forward = 1',
            'net.ipv4.ip_nonlocal_bind = 1',
        ]

    def extract_uid_gid_keepalived(self):
        """Return (uid, gid) the container image uses for /var/lib."""
        # better directory for this?
        return extract_uid_gid(self.ctx, file_path='/var/lib')

    @staticmethod
    def get_container_mounts(data_dir: str) -> Dict[str, str]:
        """Mount keepalived.conf from data_dir into the container."""
        mounts = dict()
        mounts[os.path.join(data_dir, 'keepalived.conf')] = '/etc/keepalived/keepalived.conf'
        return mounts
817
818 ##################################
819
820
class CustomContainer(object):
    """Defines a custom container"""
    daemon_type = 'container'

    def __init__(self,
                 fsid: str, daemon_id: Union[int, str],
                 config_json: Dict, image: str) -> None:
        self.fsid = fsid
        self.daemon_id = daemon_id
        self.image = image

        # config-json options
        self.entrypoint = dict_get(config_json, 'entrypoint')
        self.uid = dict_get(config_json, 'uid', 65534)  # nobody
        self.gid = dict_get(config_json, 'gid', 65534)  # nobody
        self.volume_mounts = dict_get(config_json, 'volume_mounts', {})
        self.args = dict_get(config_json, 'args', [])
        self.envs = dict_get(config_json, 'envs', [])
        self.privileged = dict_get(config_json, 'privileged', False)
        self.bind_mounts = dict_get(config_json, 'bind_mounts', [])
        self.ports = dict_get(config_json, 'ports', [])
        self.dirs = dict_get(config_json, 'dirs', [])
        self.files = dict_get(config_json, 'files', {})

    @classmethod
    def init(cls, ctx: CephadmContext,
             fsid: str, daemon_id: Union[int, str]) -> 'CustomContainer':
        """Construct an instance from the context's --config-json and image."""
        return cls(fsid, daemon_id,
                   get_parm(ctx.config_json), ctx.image)

    def create_daemon_dirs(self, data_dir: str, uid: int, gid: int) -> None:
        """
        Create dirs/files below the container data directory.
        """
        logger.info('Creating custom container configuration '
                    'dirs/files in {} ...'.format(data_dir))

        if not os.path.isdir(data_dir):
            raise OSError('data_dir is not a directory: %s' % data_dir)

        for dir_path in self.dirs:
            logger.info('Creating directory: {}'.format(dir_path))
            # relative paths are rooted below data_dir
            dir_path = os.path.join(data_dir, dir_path.strip('/'))
            makedirs(dir_path, uid, gid, 0o755)

        for file_path in self.files:
            logger.info('Creating file: {}'.format(file_path))
            content = dict_get_join(self.files, file_path)
            file_path = os.path.join(data_dir, file_path.strip('/'))
            with open(file_path, 'w', encoding='utf-8') as f:
                os.fchown(f.fileno(), uid, gid)
                os.fchmod(f.fileno(), 0o600)
                f.write(content)

    def get_daemon_args(self) -> List[str]:
        """Custom containers take no daemon args of their own."""
        return []

    def get_container_args(self) -> List[str]:
        """Extra engine arguments from the config-json 'args' key."""
        return self.args

    def get_container_envs(self) -> List[str]:
        """Environment variables from the config-json 'envs' key."""
        return self.envs

    def get_container_mounts(self, data_dir: str) -> Dict[str, str]:
        """
        Get the volume mounts. Relative source paths will be located below
        `/var/lib/ceph/<cluster-fsid>/<daemon-name>`.

        Example:
        {
            /foo/conf: /conf
            foo/conf: /conf
        }
        becomes
        {
            /foo/conf: /conf
            /var/lib/ceph/<cluster-fsid>/<daemon-name>/foo/conf: /conf
        }
        """
        mounts = {}
        for source, destination in self.volume_mounts.items():
            # os.path.join ignores data_dir when source is absolute
            source = os.path.join(data_dir, source)
            mounts[source] = destination
        return mounts

    def get_container_binds(self, data_dir: str) -> List[List[str]]:
        """
        Get the bind mounts. Relative `source=...` paths will be located below
        `/var/lib/ceph/<cluster-fsid>/<daemon-name>`.

        Example:
        [
            'type=bind',
            'source=lib/modules',
            'destination=/lib/modules',
            'ro=true'
        ]
        becomes
        [
            ...
            'source=/var/lib/ceph/<cluster-fsid>/<daemon-name>/lib/modules',
            ...
        ]
        """
        binds = self.bind_mounts.copy()
        for bind in binds:
            for index, value in enumerate(bind):
                match = re.match(r'^source=(.+)$', value)
                if match:
                    bind[index] = 'source={}'.format(os.path.join(
                        data_dir, match.group(1)))
        return binds
933
934 ##################################
935
936
def touch(file_path: str, uid: Optional[int] = None, gid: Optional[int] = None) -> None:
    """Create ``file_path`` if it does not exist; optionally set ownership.

    :param file_path: path of the file to create/update
    :param uid: owner uid to apply (only applied together with ``gid``)
    :param gid: owner gid to apply (only applied together with ``uid``)
    """
    Path(file_path).touch()
    # Compare against None instead of relying on truthiness: uid/gid 0
    # (root) is falsy but is a perfectly valid ownership to set.
    if uid is not None and gid is not None:
        os.chown(file_path, uid, gid)
941
942
943 ##################################
944
945
def dict_get(d: Dict, key: str, default: Any = None, require: bool = False) -> Any:
    """
    Helper function to get a key from a dictionary.
    :param d: The dictionary to process.
    :param key: The name of the key to get.
    :param default: The default value in case the key does not
        exist. Default is `None`.
    :param require: Set to `True` if the key is required. An
        exception will be raised if the key does not exist in
        the given dictionary.
    :return: Returns the value of the given key.
    :raises: :exc:`self.Error` if the given key does not exist
        and `require` is set to `True`.
    """
    # membership test directly on the dict -- no need for d.keys()
    if require and key not in d:
        raise Error('{} missing from dict'.format(key))
    return d.get(key, default)  # type: ignore
963
964 ##################################
965
966
def dict_get_join(d: Dict, key: str) -> Any:
    """
    Helper function to get the value of a given key from a dictionary.
    `List` values will be converted to a string by joining them with a
    line break.
    :param d: The dictionary to process.
    :param key: The name of the key to get.
    :return: Returns the value of the given key. If it was a `list`, it
        will be joining with a line break.
    """
    value = d.get(key)
    if not isinstance(value, list):
        return value
    return '\n'.join(str(item) for item in value)
981
982 ##################################
983
984
def get_supported_daemons():
    # type: () -> List[str]
    """Return every daemon type this cephadm build knows how to deploy."""
    extra_types = [
        NFSGanesha.daemon_type,
        CephIscsi.daemon_type,
        CustomContainer.daemon_type,
        CephadmDaemon.daemon_type,
        HAproxy.daemon_type,
        Keepalived.daemon_type,
    ]
    supported_daemons = list(Ceph.daemons) + list(Monitoring.components) + extra_types
    # daemon type names must be globally unique
    assert len(supported_daemons) == len(set(supported_daemons))
    return supported_daemons
997
998 ##################################
999
1000
class PortOccupiedError(Error):
    """Raised by attempt_bind when the requested port is already in use."""
    pass
1003
1004
def attempt_bind(ctx, s, address, port):
    # type: (CephadmContext, socket.socket, str, int) -> None
    """Probe whether socket ``s`` can bind to address:port.

    The socket is always closed afterwards -- this is purely a check.

    :raises PortOccupiedError: the port is already bound (EADDRINUSE)
    :raises Error: any other failure while binding
    """
    try:
        s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        s.bind((address, port))
    except OSError as e:
        if e.errno == errno.EADDRINUSE:
            msg = 'Cannot bind to IP %s port %d: %s' % (address, port, e)
            logger.warning(msg)
            raise PortOccupiedError(msg)
        else:
            raise Error(e)
    except Exception as e:
        raise Error(e)
    finally:
        # close on success and failure alike; the bind was only a probe
        s.close()
1021
1022
def port_in_use(ctx, port_num):
    # type: (CephadmContext, int) -> bool
    """Detect whether a port is in use on the local machine - IPv4 and IPv6"""
    logger.info('Verifying port %d ...' % port_num)

    def _port_in_use(af: socket.AddressFamily, address: str) -> bool:
        # True when a bind attempt on this family reports the port occupied
        try:
            s = socket.socket(af, socket.SOCK_STREAM)
            attempt_bind(ctx, s, address, port_num)
        except PortOccupiedError:
            return True
        except OSError as e:
            if e.errno in (errno.EAFNOSUPPORT, errno.EADDRNOTAVAIL):
                # Ignore EAFNOSUPPORT and EADDRNOTAVAIL as two interfaces are
                # being tested here and one might be intentionally be disabled.
                # In that case no error should be raised.
                return False
            else:
                raise e
        return False
    return any(_port_in_use(af, address) for af, address in (
        (socket.AF_INET, '0.0.0.0'),
        (socket.AF_INET6, '::')
    ))
1047
1048
def check_ip_port(ctx, ip, port):
    # type: (CephadmContext, str, int) -> None
    """Verify ip:port is bindable, unless skip_ping_check is set.

    :raises PortOccupiedError: via attempt_bind when the port is taken
    """
    if not ctx.skip_ping_check:
        logger.info('Verifying IP %s port %d ...' % (ip, port))
        if is_ipv6(ip):
            s = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
            # strip any [] wrapping before binding
            ip = unwrap_ipv6(ip)
        else:
            s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        attempt_bind(ctx, s, ip, port)
1059
1060 ##################################
1061
1062
1063 # this is an abbreviated version of
1064 # https://github.com/benediktschmitt/py-filelock/blob/master/filelock.py
1065 # that drops all of the compatibility (this is Unix/Linux only).
1066
class Timeout(TimeoutError):
    """
    Raised when the lock could not be acquired in *timeout*
    seconds.
    """

    def __init__(self, lock_file):
        """Remember which lock file could not be acquired."""
        #: The path of the file lock.
        self.lock_file = lock_file

    def __str__(self):
        return "The file lock '{}' could not be acquired.".format(self.lock_file)
1084
1085
class _Acquire_ReturnProxy(object):
    """Context-manager shim handed back by FileLock.acquire().

    Entering yields the wrapped lock without re-acquiring it; exiting
    releases it, so ``with lock.acquire():`` balances correctly.
    """

    def __init__(self, lock):
        self.lock = lock

    def __enter__(self):
        return self.lock

    def __exit__(self, exc_type, exc_value, traceback):
        self.lock.release()
1097
1098
class FileLock(object):
    """Advisory inter-process lock: flock(2) on a file under LOCK_DIR.

    Abbreviated from py-filelock with the portability code dropped
    (Unix/Linux only).  Re-entrant per object via a nesting counter,
    and usable as a context manager.
    """

    def __init__(self, ctx: CephadmContext, name, timeout=-1):
        # lock files live under LOCK_DIR; create it lazily (0700)
        if not os.path.exists(LOCK_DIR):
            os.mkdir(LOCK_DIR, 0o700)
        self._lock_file = os.path.join(LOCK_DIR, name + '.lock')
        self.ctx = ctx

        # The file descriptor for the *_lock_file* as it is returned by the
        # os.open() function.
        # This file lock is only NOT None, if the object currently holds the
        # lock.
        self._lock_file_fd: Optional[int] = None
        # default acquire() timeout in seconds; < 0 means wait forever
        self.timeout = timeout
        # The lock counter is used for implementing the nested locking
        # mechanism. Whenever the lock is acquired, the counter is increased and
        # the lock is only released, when this value is 0 again.
        self._lock_counter = 0
        return None

    @property
    def is_locked(self):
        # True while this object holds the flock (the fd is open)
        return self._lock_file_fd is not None

    def acquire(self, timeout=None, poll_intervall=0.05):
        """
        Acquires the file lock or fails with a :exc:`Timeout` error.
        .. code-block:: python
            # You can use this method in the context manager (recommended)
            with lock.acquire():
                pass
            # Or use an equivalent try-finally construct:
            lock.acquire()
            try:
                pass
            finally:
                lock.release()
        :arg float timeout:
            The maximum time waited for the file lock.
            If ``timeout < 0``, there is no timeout and this method will
            block until the lock could be acquired.
            If ``timeout`` is None, the default :attr:`~timeout` is used.
        :arg float poll_intervall:
            We check once in *poll_intervall* seconds if we can acquire the
            file lock.
        :raises Timeout:
            if the lock could not be acquired in *timeout* seconds.
        .. versionchanged:: 2.0.0
            This method returns now a *proxy* object instead of *self*,
            so that it can be used in a with statement without side effects.
        """

        # Use the default timeout, if no timeout is provided.
        if timeout is None:
            timeout = self.timeout

        # Increment the number right at the beginning.
        # We can still undo it, if something fails.
        self._lock_counter += 1

        lock_id = id(self)
        lock_filename = self._lock_file
        start_time = time.time()
        try:
            while True:
                if not self.is_locked:
                    logger.debug('Acquiring lock %s on %s', lock_id,
                                 lock_filename)
                    self._acquire()

                if self.is_locked:
                    logger.debug('Lock %s acquired on %s', lock_id,
                                 lock_filename)
                    break
                elif timeout >= 0 and time.time() - start_time > timeout:
                    logger.warning('Timeout acquiring lock %s on %s', lock_id,
                                   lock_filename)
                    raise Timeout(self._lock_file)
                else:
                    # lock held by another process: poll again after a nap
                    logger.debug(
                        'Lock %s not acquired on %s, waiting %s seconds ...',
                        lock_id, lock_filename, poll_intervall
                    )
                    time.sleep(poll_intervall)
        except Exception:
            # Something did go wrong, so decrement the counter.
            self._lock_counter = max(0, self._lock_counter - 1)

            raise
        return _Acquire_ReturnProxy(lock=self)

    def release(self, force=False):
        """
        Releases the file lock.
        Please note, that the lock is only completly released, if the lock
        counter is 0.
        Also note, that the lock file itself is not automatically deleted.
        :arg bool force:
            If true, the lock counter is ignored and the lock is released in
            every case.
        """
        if self.is_locked:
            self._lock_counter -= 1

            if self._lock_counter == 0 or force:
                lock_id = id(self)
                lock_filename = self._lock_file

                logger.debug('Releasing lock %s on %s', lock_id, lock_filename)
                self._release()
                self._lock_counter = 0
                logger.debug('Lock %s released on %s', lock_id, lock_filename)

        return None

    def __enter__(self):
        self.acquire()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.release()
        return None

    def __del__(self):
        # best effort: drop the flock if the object is garbage-collected
        # while still holding it
        self.release(force=True)
        return None

    def _acquire(self):
        # Non-blocking attempt (LOCK_NB): on contention, close the fd and
        # leave _lock_file_fd as None so the acquire() loop keeps polling.
        open_mode = os.O_RDWR | os.O_CREAT | os.O_TRUNC
        fd = os.open(self._lock_file, open_mode)

        try:
            fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
        except (IOError, OSError):
            os.close(fd)
        else:
            self._lock_file_fd = fd
        return None

    def _release(self):
        # Do not remove the lockfile:
        #
        # https://github.com/benediktschmitt/py-filelock/issues/31
        # https://stackoverflow.com/questions/17708885/flock-removing-locked-file-without-race-condition
        fd = self._lock_file_fd
        self._lock_file_fd = None
        fcntl.flock(fd, fcntl.LOCK_UN)  # type: ignore
        os.close(fd)  # type: ignore
        return None
1247
1248
1249 ##################################
1250 # Popen wrappers, lifted from ceph-volume
1251
class CallVerbosity(Enum):
    """How much of a subprocess's output call() mirrors into the log."""
    # no stdout/stderr logging at all
    SILENT = 0
    # log stdout/stderr to logger.debug
    DEBUG = 1
    # On a non-zero exit status, it will forcefully set
    # logging ON for the terminal
    VERBOSE_ON_FAILURE = 2
    # log at info (instead of debug) level.
    VERBOSE = 3
1261
1262
# Python 3.8 ships ThreadedChildWatcher; for older interpreters install a
# backported copy so create_subprocess_exec() works from non-main threads.
if sys.version_info < (3, 8):
    import itertools
    import threading
    import warnings
    from asyncio import events

    class ThreadedChildWatcher(asyncio.AbstractChildWatcher):
        """Threaded child watcher implementation.
        The watcher uses a thread per process
        for waiting for the process finish.
        It doesn't require subscription on POSIX signal
        but a thread creation is not free.
        The watcher has O(1) complexity, its performance doesn't depend
        on amount of spawn processes.
        """

        def __init__(self):
            self._pid_counter = itertools.count(0)
            # pid -> waiter thread, one daemon thread per child
            self._threads = {}

        def is_active(self):
            return True

        def close(self):
            self._join_threads()

        def _join_threads(self):
            """Internal: Join all non-daemon threads"""
            threads = [thread for thread in list(self._threads.values())
                       if thread.is_alive() and not thread.daemon]
            for thread in threads:
                thread.join()

        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc_val, exc_tb):
            pass

        def __del__(self, _warn=warnings.warn):
            threads = [thread for thread in list(self._threads.values())
                       if thread.is_alive()]
            if threads:
                _warn(f'{self.__class__} has registered but not finished child processes',
                      ResourceWarning,
                      source=self)

        def add_child_handler(self, pid, callback, *args):
            loop = events.get_event_loop()
            thread = threading.Thread(target=self._do_waitpid,
                                      name=f'waitpid-{next(self._pid_counter)}',
                                      args=(loop, pid, callback, args),
                                      daemon=True)
            self._threads[pid] = thread
            thread.start()

        def remove_child_handler(self, pid):
            # asyncio never calls remove_child_handler() !!!
            # The method is no-op but is implemented because
            # abstract base classe requires it
            return True

        def attach_loop(self, loop):
            pass

        def _do_waitpid(self, loop, expected_pid, callback, args):
            # Runs in the waiter thread: block on waitpid, then hand the
            # return code back to the event loop thread-safely.
            assert expected_pid > 0

            try:
                pid, status = os.waitpid(expected_pid, 0)
            except ChildProcessError:
                # The child process is already reaped
                # (may happen if waitpid() is called elsewhere).
                pid = expected_pid
                returncode = 255
                logger.warning(
                    'Unknown child process pid %d, will report returncode 255',
                    pid)
            else:
                if os.WIFEXITED(status):
                    returncode = os.WEXITSTATUS(status)
                elif os.WIFSIGNALED(status):
                    # negative return code mirrors subprocess convention
                    returncode = -os.WTERMSIG(status)
                else:
                    raise ValueError(f'unknown wait status {status}')
                if loop.get_debug():
                    logger.debug('process %s exited with returncode %s',
                                 expected_pid, returncode)

            if loop.is_closed():
                logger.warning('Loop %r that handles pid %r is closed', loop, pid)
            else:
                loop.call_soon_threadsafe(callback, pid, returncode, *args)

            self._threads.pop(expected_pid)

    # unlike SafeChildWatcher which handles SIGCHLD in the main thread,
    # ThreadedChildWatcher runs in a separated thread, hence allows us to
    # run create_subprocess_exec() in non-main thread, see
    # https://bugs.python.org/issue35621
    asyncio.set_child_watcher(ThreadedChildWatcher())
1364
1365
# asyncio.run() appeared in Python 3.7; provide a minimal backport when
# it is missing.
try:
    from asyncio import run as async_run  # type: ignore[attr-defined]
except ImportError:
    def async_run(coro):  # type: ignore
        """Run *coro* on a fresh event loop and return its result (asyncio.run backport)."""
        loop = asyncio.new_event_loop()
        try:
            asyncio.set_event_loop(loop)
            return loop.run_until_complete(coro)
        finally:
            try:
                # drain any outstanding async generators before closing
                loop.run_until_complete(loop.shutdown_asyncgens())
            finally:
                asyncio.set_event_loop(None)
                loop.close()
1380
1381
def call(ctx: CephadmContext,
         command: List[str],
         desc: Optional[str] = None,
         verbosity: CallVerbosity = CallVerbosity.VERBOSE_ON_FAILURE,
         timeout: Optional[int] = DEFAULT_TIMEOUT,
         **kwargs) -> Tuple[str, str, int]:
    """
    Wrap subprocess.Popen to

    - log stdout/stderr to a logger,
    - decode utf-8
    - cleanly return out, err, returncode

    On timeout, ('', '', 124) is returned (124 matching the exit-code
    convention of timeout(1)).

    :param timeout: timeout in seconds
    """

    prefix = command[0] if desc is None else desc
    if prefix:
        prefix += ': '
    timeout = timeout or ctx.timeout

    logger.debug('Running command: %s' % ' '.join(command))

    async def tee(reader: asyncio.StreamReader) -> str:
        # Mirror each output line into the log (per *verbosity*) while
        # accumulating the whole stream for the caller.
        collected = StringIO()
        async for line in reader:
            message = line.decode('utf-8')
            collected.write(message)
            if verbosity == CallVerbosity.VERBOSE:
                logger.info(prefix + message.rstrip())
            elif verbosity != CallVerbosity.SILENT:
                logger.debug(prefix + message.rstrip())
        return collected.getvalue()

    async def run_with_timeout() -> Tuple[str, str, int]:
        process = await asyncio.create_subprocess_exec(
            *command,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE)
        assert process.stdout
        assert process.stderr
        try:
            # NOTE(review): the gather itself is not under wait_for, so the
            # timeout only starts counting once both pipes reach EOF; a
            # child that keeps its pipes open can stall past *timeout* —
            # confirm whether that is acceptable for all callers.
            stdout, stderr = await asyncio.gather(tee(process.stdout),
                                                  tee(process.stderr))
            returncode = await asyncio.wait_for(process.wait(), timeout)
        except asyncio.TimeoutError:
            logger.info(prefix + f'timeout after {timeout} seconds')
            return '', '', 124
        else:
            return stdout, stderr, returncode

    stdout, stderr, returncode = async_run(run_with_timeout())
    if returncode != 0 and verbosity == CallVerbosity.VERBOSE_ON_FAILURE:
        # replay the captured output at info level now that we know the
        # command failed
        logger.info('Non-zero exit code %d from %s',
                    returncode, ' '.join(command))
        for line in stdout.splitlines():
            logger.info(prefix + 'stdout ' + line)
        for line in stderr.splitlines():
            logger.info(prefix + 'stderr ' + line)
    return stdout, stderr, returncode
1442
1443
def call_throws(
        ctx: CephadmContext,
        command: List[str],
        desc: Optional[str] = None,
        verbosity: CallVerbosity = CallVerbosity.VERBOSE_ON_FAILURE,
        timeout: Optional[int] = DEFAULT_TIMEOUT,
        **kwargs) -> Tuple[str, str, int]:
    """Run *command* via call() and raise on a non-zero exit status.

    :return: (stdout, stderr, returncode) when the command succeeds
    :raises RuntimeError: on non-zero exit; the message includes the exit
        code and captured stderr so the failure is diagnosable from the
        exception alone.
    """
    out, err, ret = call(ctx, command, desc, verbosity, timeout, **kwargs)
    if ret:
        # surface the exit code and stderr: a bare 'Failed command' gives
        # the operator nothing to act on
        raise RuntimeError('Failed command: %s: exit code %d, stderr: %s'
                           % (' '.join(command), ret, err.strip()))
    return out, err, ret
1455
1456
def call_timeout(ctx, command, timeout):
    # type: (CephadmContext, List[str], int) -> int
    """Run *command* with subprocess.call; raise TimeoutExpired on timeout."""
    logger.debug('Running command (timeout=%s): %s'
                 % (timeout, ' '.join(command)))
    try:
        return subprocess.call(command, timeout=timeout)
    except subprocess.TimeoutExpired:
        msg = 'Command `%s` timed out after %s seconds' % (command, timeout)
        logger.debug(msg)
        raise TimeoutExpired(msg)
1472
1473 ##################################
1474
1475
def json_loads_retry(cli_func):
    """Call *cli_func* and parse its output as JSON, retrying on bad JSON.

    Sleeps 1s, 4s and 4s between the first three attempts; the fourth and
    final attempt lets any JSONDecodeError propagate to the caller.
    """
    for delay in (1, 4, 4):
        try:
            return json.loads(cli_func())
        except json.JSONDecodeError:
            logger.debug('Invalid JSON. Retrying in %s seconds...' % delay)
            time.sleep(delay)
    return json.loads(cli_func())
1484
1485
def is_available(ctx, what, func):
    # type: (CephadmContext, str, Callable[[], bool]) -> None
    """
    Wait for a service to become available

    :param what: the name of the service
    :param func: the callable object that determines availability
    """
    max_attempts = ctx.retry
    logger.info('Waiting for %s...' % what)
    attempt = 1
    while not func():
        if attempt > max_attempts:
            raise Error('%s not available after %s tries'
                        % (what, max_attempts))
        logger.info('%s not available, waiting (%s/%s)...'
                    % (what, attempt, max_attempts))
        attempt += 1
        time.sleep(2)
    logger.info('%s is available'
                % what)
1511
1512
def read_config(fn):
    # type: (Optional[str]) -> ConfigParser
    """Parse the ini-style file at *fn* (when given) into a ConfigParser."""
    parser = ConfigParser()
    if fn:
        parser.read(fn)
    return parser
1519
1520
def pathify(p):
    # type: (str) -> str
    """Expand a leading '~' and return the absolute form of *p*."""
    return os.path.abspath(os.path.expanduser(p))
1525
1526
def get_file_timestamp(fn):
    # type: (str) -> Optional[str]
    """Return *fn*'s mtime rendered as DATEFMT in UTC, or None on any error."""
    try:
        mtime = os.path.getmtime(fn)
        stamp = datetime.datetime.fromtimestamp(
            mtime, tz=datetime.timezone.utc
        )
        return stamp.strftime(DATEFMT)
    except Exception:
        # best effort: missing/unreadable file simply yields None
        return None
1536
1537
def try_convert_datetime(s):
    # type: (str) -> Optional[str]
    """Normalize a docker/podman timestamp to UTC DATEFMT, or None.

    This is super irritating because
    1) podman and docker use different formats
    2) python's strptime can't parse either one

    Seen in the wild:
      docker 18.09.7:  2020-03-03T09:21:43.636153304Z
      podman 1.7.0:    2020-03-03T15:52:30.136257504-06:00
                       2020-03-03 15:52:30.136257504 -0600 CST
    (In the podman case, there is a different string format for
    'inspect' and 'inspect --format {{.Created}}'!!)
    """
    # In *all* cases, the 9 digit second precision is too much for
    # python's strptime. Shorten it to 6 digits.
    s = re.sub(r'(\.[\d]{6})[\d]*', r'\1', s)

    # replace trailing Z with -0000, since (on python 3.6.8) it won't parse
    if s.endswith('Z'):
        s = s[:-1] + '-0000'

    # cut off the redundant 'CST' part that strptime can't parse, if
    # present.
    s = ' '.join(s.split(' ')[0:3])

    # try parsing with several format strings
    for fmt in ('%Y-%m-%dT%H:%M:%S.%f%z',
                '%Y-%m-%d %H:%M:%S.%f %z'):
        try:
            # return timestamp normalized to UTC, rendered as DATEFMT.
            parsed = datetime.datetime.strptime(s, fmt)
            return parsed.astimezone(tz=datetime.timezone.utc).strftime(DATEFMT)
        except ValueError:
            continue
    return None
1577
1578
def _parse_podman_version(version_str):
    # type: (str) -> Tuple[int, ...]
    """Parse 'X.Y.Z' into a tuple of ints, tolerating trailing junk
    in any component (e.g. '2.1.0-dev' -> (2, 1, 0))."""
    def _to_int(part, first_error=None):
        # strip trailing non-numeric characters one at a time until the
        # part parses; re-raise the original error if nothing is left
        if not part and first_error:
            raise first_error
        try:
            return int(part)
        except ValueError as e:
            return _to_int(part[:-1], first_error or e)

    return tuple(_to_int(piece) for piece in version_str.split('.'))
1590
1591
def get_hostname():
    # type: () -> str
    """Return the local (possibly short) hostname."""
    return socket.gethostname()
1595
1596
def get_fqdn():
    # type: () -> str
    """Return the FQDN, falling back to the bare hostname when unresolvable."""
    return socket.getfqdn() or socket.gethostname()
1600
1601
def get_arch():
    # type: () -> str
    """Return the machine architecture string (e.g. 'x86_64')."""
    return platform.uname().machine
1605
1606
def generate_service_id():
    # type: () -> str
    """Return '<hostname>.<6 random lowercase letters>'."""
    suffix = ''.join(random.choice(string.ascii_lowercase) for _ in range(6))
    return socket.gethostname() + '.' + suffix
1611
1612
def generate_password():
    # type: () -> str
    """Return a 10-character password of lowercase letters and digits."""
    # These passwords protect real service endpoints (e.g. monitoring
    # dashboards); random.choice is not cryptographically secure, so use
    # the secrets module instead.
    import secrets
    alphabet = string.ascii_lowercase + string.digits
    return ''.join(secrets.choice(alphabet) for _ in range(10))
1617
1618
def normalize_container_id(i):
    # type: (str) -> str
    """Strip a leading 'sha256:' prefix from a container id.

    docker adds the sha256: prefix, but AFAICS both docker and podman
    always use sha256, so leave off the prefix for consistency.
    """
    prefix = 'sha256:'
    return i[len(prefix):] if i.startswith(prefix) else i
1629
1630
def make_fsid():
    # type: () -> str
    """Return a fresh random fsid (UUID string).

    uuid4 (fully random) instead of uuid1: uuid1 embeds the host MAC
    address and a timestamp, leaking host details into cluster ids for
    no benefit.
    """
    return str(uuid.uuid4())
1634
1635
def is_fsid(s):
    # type: (str) -> bool
    """Return True if *s* parses as a UUID."""
    try:
        uuid.UUID(s)
        return True
    except ValueError:
        return False
1643
1644
def infer_fsid(func):
    """
    If we only find a single fsid in /var/lib/ceph/*, use that
    """
    @wraps(func)
    def _infer_fsid(ctx: CephadmContext):
        if ctx.fsid:
            logger.debug('Using specified fsid: %s' % ctx.fsid)
            return func(ctx)

        # collect the fsids of deployed daemons, restricted to ctx.name
        # when a daemon name was given
        name_filter = ctx.name if ('name' in ctx and ctx.name) else None
        found = set()
        for daemon in list_daemons(ctx, detail=False):
            if not is_fsid(daemon['fsid']):
                continue  # 'unknown' fsid
            if name_filter is None or daemon['name'] == name_filter:
                found.add(daemon['fsid'])
        fsids = sorted(found)

        if len(fsids) == 1:
            logger.info('Inferring fsid %s' % fsids[0])
            ctx.fsid = fsids[0]
        elif len(fsids) > 1:
            raise Error('Cannot infer an fsid, one must be specified: %s' % fsids)
        # zero fsids: some commands do not always require an fsid
        return func(ctx)

    return _infer_fsid
1680
1681
def infer_config(func):
    """
    If we find a MON daemon, use the config from that container
    """
    @wraps(func)
    def _infer_config(ctx: CephadmContext):
        if ctx.config:
            logger.debug('Using specified config: %s' % ctx.config)
            return func(ctx)

        config = None
        if ctx.fsid:
            # prefer the explicit daemon name; otherwise the first mon found
            name = ctx.name
            if not name:
                for daemon in list_daemons(ctx, detail=False):
                    if daemon['name'].startswith('mon.'):
                        name = daemon['name']
                        break
            if name:
                config = '/var/lib/ceph/{}/{}/config'.format(ctx.fsid, name)

        if config:
            logger.info('Inferring config %s' % config)
            ctx.config = config
        elif os.path.exists(SHELL_DEFAULT_CONF):
            logger.debug('Using default config: %s' % SHELL_DEFAULT_CONF)
            ctx.config = SHELL_DEFAULT_CONF
        return func(ctx)

    return _infer_config
1712
1713
def _get_default_image(ctx: CephadmContext):
    """Return the built-in default ceph image, warning on dev builds."""
    if DEFAULT_IMAGE_IS_MASTER:
        warn = """This is a development version of cephadm.
For information regarding the latest stable release:
https://docs.ceph.com/docs/{}/cephadm/install
""".format(LATEST_STABLE_RELEASE)
        for line in warn.splitlines():
            # highlight the warning in yellow on the terminal
            logger.warning('{}{}{}'.format(termcolor.yellow, line, termcolor.end))
    return DEFAULT_IMAGE
1723
1724
def infer_image(func):
    """
    Use the most recent ceph image
    """
    @wraps(func)
    def _infer_image(ctx: CephadmContext):
        # precedence: explicit --image, CEPHADM_IMAGE env var, the most
        # recently pulled local ceph image, then the built-in default
        ctx.image = (ctx.image
                     or os.environ.get('CEPHADM_IMAGE')
                     or get_last_local_ceph_image(ctx, ctx.container_engine.path)
                     or _get_default_image(ctx))
        return func(ctx)

    return _infer_image
1740
1741
def default_image(func):
    """Decorator: fill ctx.image from the daemon type, env var, or default."""
    @wraps(func)
    def _default_image(ctx: CephadmContext):
        if not ctx.image:
            # monitoring-stack and ingress daemons carry their own images
            if 'name' in ctx and ctx.name:
                daemon_kind = ctx.name.split('.', 1)[0]
                if daemon_kind in Monitoring.components:
                    ctx.image = Monitoring.components[daemon_kind]['image']
                if daemon_kind == 'haproxy':
                    ctx.image = HAproxy.default_image
                if daemon_kind == 'keepalived':
                    ctx.image = Keepalived.default_image
            ctx.image = (ctx.image
                         or os.environ.get('CEPHADM_IMAGE')
                         or _get_default_image(ctx))

        return func(ctx)

    return _default_image
1762
1763
def get_last_local_ceph_image(ctx: CephadmContext, container_path: str):
    """
    :return: The most recent local ceph image (already pulled)
    """
    cmd = [container_path, 'images',
           '--filter', 'label=ceph=True',
           '--filter', 'dangling=false',
           '--format', '{{.Repository}}@{{.Digest}}']
    out, _, _ = call_throws(ctx, cmd)
    return _filter_last_local_ceph_image(out)
1774
1775
def _filter_last_local_ceph_image(out):
    # type: (str) -> Optional[str]
    """Return the first listed image that carries a digest, else None."""
    for candidate in out.splitlines():
        # images without a digest render as 'repo@'; skip those
        if not candidate or candidate.endswith('@'):
            continue
        logger.info('Using recent ceph image %s' % candidate)
        return candidate
    return None
1783
1784
def write_tmp(s, uid, gid):
    # type: (str, int, int) -> IO[str]
    """Write *s* to a named temp file owned by uid:gid and return the
    open handle (the file disappears when the handle is closed)."""
    handle = tempfile.NamedTemporaryFile(mode='w',
                                         prefix='ceph-tmp')
    os.fchown(handle.fileno(), uid, gid)
    handle.write(s)
    handle.flush()
    return handle
1794
1795
def makedirs(dir, uid, gid, mode):
    # type: (str, int, int, int) -> None
    """Create *dir* (if needed) with the given mode and ownership."""
    if os.path.exists(dir):
        os.chmod(dir, mode)
    else:
        os.makedirs(dir, mode=mode)
    os.chown(dir, uid, gid)
    os.chmod(dir, mode)  # the above is masked by umask...
1804
1805
def get_data_dir(fsid, data_dir, t, n):
    # type: (str, str, str, Union[int, str]) -> str
    """Return the daemon data dir: <data_dir>/<fsid>/<type>.<id>."""
    return os.path.join(data_dir, fsid, '{}.{}'.format(t, n))
1809
1810
def get_log_dir(fsid, log_dir):
    # type: (str, str) -> str
    """Return the per-cluster log directory: <log_dir>/<fsid>."""
    return os.path.join(log_dir, fsid)
1814
1815
def make_data_dir_base(fsid, data_dir, uid, gid):
    # type: (str, str, int, int) -> str
    """Create <data_dir>/<fsid> plus its crash/ and crash/posted/ subdirs."""
    base = os.path.join(data_dir, fsid)
    makedirs(base, uid, gid, DATA_DIR_MODE)
    crash = os.path.join(base, 'crash')
    makedirs(crash, uid, gid, DATA_DIR_MODE)
    makedirs(os.path.join(crash, 'posted'), uid, gid, DATA_DIR_MODE)
    return base
1824
1825
def make_data_dir(ctx, fsid, daemon_type, daemon_id, uid=None, gid=None):
    # type: (CephadmContext, str, str, Union[int, str], Optional[int], Optional[int]) -> str
    """Ensure the daemon's data dir (and the cluster base dirs) exist."""
    if uid is None or gid is None:
        uid, gid = extract_uid_gid(ctx)
    make_data_dir_base(fsid, ctx.data_dir, uid, gid)
    path = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
    makedirs(path, uid, gid, DATA_DIR_MODE)
    return path
1834
1835
def make_log_dir(ctx, fsid, uid=None, gid=None):
    # type: (CephadmContext, str, Optional[int], Optional[int]) -> str
    """Ensure the per-cluster log directory exists and return its path."""
    if uid is None or gid is None:
        uid, gid = extract_uid_gid(ctx)
    path = get_log_dir(fsid, ctx.log_dir)
    makedirs(path, uid, gid, LOG_DIR_MODE)
    return path
1843
1844
def make_var_run(ctx, fsid, uid, gid):
    # type: (CephadmContext, str, int, int) -> None
    """Create /var/run/ceph/<fsid> (mode 0770, owner uid:gid) via install(1)."""
    call_throws(ctx, ['install', '-d', '-m0770', '-o', str(uid), '-g', str(gid),
                '/var/run/ceph/%s' % fsid])
1849
1850
def copy_tree(ctx, src, dst, uid=None, gid=None):
    # type: (CephadmContext, List[str], str, Optional[int], Optional[int]) -> None
    """
    Copy a directory tree from src to dst
    """
    if uid is None or gid is None:
        (uid, gid) = extract_uid_gid(ctx)

    for src_dir in src:
        # when dst is a directory, copy into it under the source basename
        dst_dir = (os.path.join(dst, os.path.basename(src_dir))
                   if os.path.isdir(dst) else dst)

        logger.debug('copy directory `%s` -> `%s`' % (src_dir, dst_dir))
        # copytree refuses to overwrite; clear any stale destination first
        shutil.rmtree(dst_dir, ignore_errors=True)
        shutil.copytree(src_dir, dst_dir)  # dirs_exist_ok needs python 3.8

        for dirpath, dirnames, filenames in os.walk(dst_dir):
            logger.debug('chown %s:%s `%s`' % (uid, gid, dirpath))
            os.chown(dirpath, uid, gid)
            for filename in filenames:
                logger.debug('chown %s:%s `%s`' % (uid, gid, filename))
                os.chown(os.path.join(dirpath, filename), uid, gid)
1874
1875
def copy_files(ctx, src, dst, uid=None, gid=None):
    # type: (CephadmContext, List[str], str, Optional[int], Optional[int]) -> None
    """
    Copy files from src to dst
    """
    if uid is None or gid is None:
        (uid, gid) = extract_uid_gid(ctx)

    for src_file in src:
        # when dst is a directory, copy into it under the source basename
        dst_file = (os.path.join(dst, os.path.basename(src_file))
                    if os.path.isdir(dst) else dst)

        logger.debug('copy file `%s` -> `%s`' % (src_file, dst_file))
        shutil.copyfile(src_file, dst_file)

        logger.debug('chown %s:%s `%s`' % (uid, gid, dst_file))
        os.chown(dst_file, uid, gid)
1894
1895
def move_files(ctx, src, dst, uid=None, gid=None):
    # type: (CephadmContext, List[str], str, Optional[int], Optional[int]) -> None
    """
    Move files from src to dst
    """
    if uid is None or gid is None:
        (uid, gid) = extract_uid_gid(ctx)

    for src_file in src:
        # when dst is a directory, move into it under the source basename
        dst_file = (os.path.join(dst, os.path.basename(src_file))
                    if os.path.isdir(dst) else dst)

        if os.path.islink(src_file):
            # shutil.move() in py2 does not handle symlinks correctly
            link_target = os.readlink(src_file)
            logger.debug("symlink '%s' -> '%s'" % (dst_file, link_target))
            os.symlink(link_target, dst_file)
            os.unlink(src_file)
        else:
            logger.debug("move file '%s' -> '%s'" % (src_file, dst_file))
            shutil.move(src_file, dst_file)
            logger.debug('chown %s:%s `%s`' % (uid, gid, dst_file))
            os.chown(dst_file, uid, gid)
1920
1921
# copied from distutils
def find_executable(executable, path=None):
    """Tries to find 'executable' in the directories listed in 'path'.
    A string listing directories separated by 'os.pathsep'; defaults to
    os.environ['PATH']. Returns the complete filename or None if not found.
    """
    _, ext = os.path.splitext(executable)
    if sys.platform == 'win32' and ext != '.exe':
        executable += '.exe'

    if os.path.isfile(executable):
        return executable

    if path is None:
        path = os.environ.get('PATH', None)
        if path is None:
            try:
                path = os.confstr('CS_PATH')
            except (AttributeError, ValueError):
                # os.confstr() or CS_PATH is not available
                path = os.defpath
        # bpo-35755: Don't use os.defpath if the PATH environment variable is
        # set to an empty string

    # PATH='' doesn't match, whereas PATH=':' looks in the current directory
    if not path:
        return None

    for directory in path.split(os.pathsep):
        candidate = os.path.join(directory, executable)
        if os.path.isfile(candidate):
            # the file exists, we have a shot at spawn working
            return candidate
    return None
1957
1958
def find_program(filename):
    # type: (str) -> str
    """Like find_executable(), but raise ValueError when not found."""
    located = find_executable(filename)
    if located is None:
        raise ValueError('%s not found' % filename)
    return located
1965
1966
def find_container_engine(ctx: CephadmContext):
    """Return Docker when forced via ctx.docker, else the first usable
    engine from CONTAINER_PREFERENCE, or None if none can be located."""
    if ctx.docker:
        return Docker()
    for engine_cls in CONTAINER_PREFERENCE:
        try:
            return engine_cls()
        except Exception as e:
            logger.debug('Could not locate %s: %s' % (engine_cls.EXE, e))
    return None
1977
1978
def check_container_engine(ctx):
    # type: (CephadmContext) -> None
    """Ensure a supported (and, for podman, new enough) engine was found."""
    engine = ctx.container_engine
    if not isinstance(engine, CONTAINER_PREFERENCE):
        raise Error('Unable to locate any of %s' % [i.EXE for i in CONTAINER_PREFERENCE])
    if isinstance(engine, Podman):
        engine.get_version(ctx)
        if engine.version < MIN_PODMAN_VERSION:
            raise Error('podman version %d.%d.%d or later is required' % MIN_PODMAN_VERSION)
1988
1989
def get_unit_name(fsid, daemon_type, daemon_id=None):
    # type: (str, str, Optional[Union[int, str]]) -> str
    """Return the systemd unit name for a daemon (accepts name or type + id)."""
    if daemon_id is not None:
        # the cephadm daemon uses its own (non-templated) unit naming
        if daemon_type == CephadmDaemon.daemon_type:
            return 'ceph-%s-%s.%s' % (fsid, daemon_type, daemon_id)
        return 'ceph-%s@%s.%s' % (fsid, daemon_type, daemon_id)
    return 'ceph-%s@%s' % (fsid, daemon_type)
1999
2000
def get_unit_name_by_daemon_name(ctx: CephadmContext, fsid, name):
    """Look up a deployed daemon's systemd unit name from its description."""
    desc = get_daemon_description(ctx, fsid, name)
    if 'systemd_unit' not in desc:
        raise Error('Failed to get unit name for {}'.format(desc))
    return desc['systemd_unit']
2007
2008
def check_unit(ctx, unit_name):
    # type: (CephadmContext, str) -> Tuple[bool, str, bool]
    """Query systemd for a unit, returning (enabled, state, installed).

    *state* is one of 'running', 'stopped', 'error' or 'unknown'.
    NOTE: we ignore the exit code here because systemctl outputs various
    exit codes based on the state of the service, but the string result
    is more explicit (and sufficient).
    """
    enabled = False
    installed = False
    try:
        out, err, code = call(ctx, ['systemctl', 'is-enabled', unit_name],
                              verbosity=CallVerbosity.DEBUG)
        if code == 0:
            enabled = True
            installed = True
        elif 'disabled' in out:
            installed = True
    except Exception as e:
        logger.warning('unable to run systemctl: %s' % e)

    state = 'unknown'
    try:
        out, err, code = call(ctx, ['systemctl', 'is-active', unit_name],
                              verbosity=CallVerbosity.DEBUG)
        status = out.strip()
        if status == 'active':
            state = 'running'
        elif status == 'inactive':
            state = 'stopped'
        elif status in ('failed', 'auto-restart'):
            state = 'error'
    except Exception as e:
        logger.warning('unable to run systemctl: %s' % e)
        state = 'unknown'
    return (enabled, state, installed)
2046
2047
def check_units(ctx, units, enabler=None):
    # type: (CephadmContext, List[str], Optional[Packager]) -> bool
    """Return True as soon as any unit is enabled and running; otherwise
    (optionally) enable each installed unit via *enabler* and return False."""
    for unit in units:
        enabled, state, installed = check_unit(ctx, unit)
        if enabled and state == 'running':
            logger.info('Unit %s is enabled and running' % unit)
            return True
        if enabler is not None and installed:
            logger.info('Enabling unit %s' % unit)
            enabler.enable_service(unit)
    return False
2060
2061
def is_container_running(ctx: CephadmContext, name: str) -> bool:
    """Return True if the named container's inspected state is 'running'.

    call() returns the child's stdout with its trailing newline intact
    (its tee() keeps line endings), so the raw output is 'running\\n';
    strip before comparing — the same way check_unit() strips systemctl
    output — otherwise this comparison can never match.
    """
    out, err, ret = call(ctx, [
        ctx.container_engine.path, 'container', 'inspect',
        '--format', '{{.State.Status}}', name
    ])
    return out.strip() == 'running'
2068
2069
def get_legacy_config_fsid(cluster, legacy_dir=None):
    # type: (str, Optional[str]) -> Optional[str]
    """Read the fsid out of a legacy /etc/ceph/<cluster>.conf, if present."""
    config_file = '/etc/ceph/%s.conf' % cluster
    if legacy_dir is not None:
        config_file = os.path.abspath(legacy_dir + config_file)

    if not os.path.exists(config_file):
        return None
    config = read_config(config_file)
    if config.has_section('global') and config.has_option('global', 'fsid'):
        return config.get('global', 'fsid')
    return None
2081
2082
def get_legacy_daemon_fsid(ctx, cluster,
                           daemon_type, daemon_id, legacy_dir=None):
    # type: (CephadmContext, str, str, Union[int, str], Optional[str]) -> Optional[str]
    """Determine a legacy daemon's fsid.

    OSDs keep a ceph_fsid file in their data dir; everything else falls
    back to the fsid in the cluster conf.
    """
    fsid = None
    if daemon_type == 'osd':
        fsid_file = os.path.join(ctx.data_dir, daemon_type,
                                 'ceph-%s' % daemon_id, 'ceph_fsid')
        if legacy_dir is not None:
            fsid_file = os.path.abspath(legacy_dir + fsid_file)
        try:
            with open(fsid_file, 'r') as f:
                fsid = f.read().strip()
        except IOError:
            pass
    return fsid or get_legacy_config_fsid(cluster, legacy_dir=legacy_dir)
2102
2103
def get_daemon_args(ctx, fsid, daemon_type, daemon_id):
    # type: (CephadmContext, str, str, Union[int, str]) -> List[str]
    """Build the extra command-line arguments for one daemon's process."""
    r = list()  # type: List[str]

    if daemon_type in Ceph.daemons and daemon_type != 'crash':
        # ceph daemons: drop root and log to stderr so the container
        # runtime captures the output
        r += [
            '--setuser', 'ceph',
            '--setgroup', 'ceph',
            '--default-log-to-file=false',
            '--default-log-to-stderr=true',
            '--default-log-stderr-prefix=debug ',
        ]
        if daemon_type == 'mon':
            r += [
                '--default-mon-cluster-log-to-file=false',
                '--default-mon-cluster-log-to-stderr=true',
            ]
    elif daemon_type in Monitoring.components:
        metadata = Monitoring.components[daemon_type]
        r += metadata.get('args', list())
        # set ip and port to bind to for nodeexporter,alertmanager,prometheus
        if daemon_type != 'grafana':
            ip = ''
            port = Monitoring.port_map[daemon_type][0]
            # deployment metadata (from the orchestrator) may override both
            if 'meta_json' in ctx and ctx.meta_json:
                meta = json.loads(ctx.meta_json) or {}
                if 'ip' in meta and meta['ip']:
                    ip = meta['ip']
                if 'ports' in meta and meta['ports']:
                    port = meta['ports'][0]
            r += [f'--web.listen-address={ip}:{port}']
        if daemon_type == 'alertmanager':
            config = get_parm(ctx.config_json)
            peers = config.get('peers', list())  # type: ignore
            for peer in peers:
                r += ['--cluster.peer={}'.format(peer)]
            # some alertmanager, by default, look elsewhere for a config
            r += ['--config.file=/etc/alertmanager/alertmanager.yml']
    elif daemon_type == NFSGanesha.daemon_type:
        nfs_ganesha = NFSGanesha.init(ctx, fsid, daemon_id)
        r += nfs_ganesha.get_daemon_args()
    elif daemon_type == HAproxy.daemon_type:
        haproxy = HAproxy.init(ctx, fsid, daemon_id)
        r += haproxy.get_daemon_args()
    elif daemon_type == CustomContainer.daemon_type:
        cc = CustomContainer.init(ctx, fsid, daemon_id)
        r.extend(cc.get_daemon_args())

    return r
2153
2154
def create_daemon_dirs(ctx, fsid, daemon_type, daemon_id, uid, gid,
                       config=None, keyring=None):
    # type: (CephadmContext, str, str, Union[int, str], int, int, Optional[str], Optional[str]) -> None
    """Create the data/log directories and config/keyring files for a daemon.

    Files are written 0600 and chowned to uid/gid. Monitoring daemons
    additionally get their component-specific directory layout populated
    from the config-json 'files' map; gateway-style daemons delegate to
    their own create_daemon_dirs() implementation.
    """
    data_dir = make_data_dir(ctx, fsid, daemon_type, daemon_id, uid=uid, gid=gid)
    make_log_dir(ctx, fsid, uid=uid, gid=gid)

    if config:
        config_path = os.path.join(data_dir, 'config')
        with open(config_path, 'w') as f:
            # chown/chmod via the open fd so perms are set before content lands
            os.fchown(f.fileno(), uid, gid)
            os.fchmod(f.fileno(), 0o600)
            f.write(config)

    if keyring:
        keyring_path = os.path.join(data_dir, 'keyring')
        with open(keyring_path, 'w') as f:
            os.fchmod(f.fileno(), 0o600)
            os.fchown(f.fileno(), uid, gid)
            f.write(keyring)

    if daemon_type in Monitoring.components.keys():
        config_json: Dict[str, Any] = get_parm(ctx.config_json)

        # Set up directories specific to the monitoring component
        config_dir = ''
        data_dir_root = ''
        if daemon_type == 'prometheus':
            data_dir_root = get_data_dir(fsid, ctx.data_dir,
                                         daemon_type, daemon_id)
            config_dir = 'etc/prometheus'
            makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755)
            makedirs(os.path.join(data_dir_root, config_dir, 'alerting'), uid, gid, 0o755)
            makedirs(os.path.join(data_dir_root, 'data'), uid, gid, 0o755)
        elif daemon_type == 'grafana':
            data_dir_root = get_data_dir(fsid, ctx.data_dir,
                                         daemon_type, daemon_id)
            config_dir = 'etc/grafana'
            makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755)
            makedirs(os.path.join(data_dir_root, config_dir, 'certs'), uid, gid, 0o755)
            makedirs(os.path.join(data_dir_root, config_dir, 'provisioning/datasources'), uid, gid, 0o755)
            makedirs(os.path.join(data_dir_root, 'data'), uid, gid, 0o755)
            # pre-create the sqlite db so it is owned by uid/gid, not root
            touch(os.path.join(data_dir_root, 'data', 'grafana.db'), uid, gid)
        elif daemon_type == 'alertmanager':
            data_dir_root = get_data_dir(fsid, ctx.data_dir,
                                         daemon_type, daemon_id)
            config_dir = 'etc/alertmanager'
            makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755)
            makedirs(os.path.join(data_dir_root, config_dir, 'data'), uid, gid, 0o755)

        # populate the config directory for the component from the config-json
        if 'files' in config_json:
            for fname in config_json['files']:
                content = dict_get_join(config_json['files'], fname)
                if os.path.isabs(fname):
                    # absolute names are re-rooted under the daemon data dir
                    fpath = os.path.join(data_dir_root, fname.lstrip(os.path.sep))
                else:
                    fpath = os.path.join(data_dir_root, config_dir, fname)
                with open(fpath, 'w', encoding='utf-8') as f:
                    os.fchown(f.fileno(), uid, gid)
                    os.fchmod(f.fileno(), 0o600)
                    f.write(content)

    elif daemon_type == NFSGanesha.daemon_type:
        nfs_ganesha = NFSGanesha.init(ctx, fsid, daemon_id)
        nfs_ganesha.create_daemon_dirs(data_dir, uid, gid)

    elif daemon_type == CephIscsi.daemon_type:
        ceph_iscsi = CephIscsi.init(ctx, fsid, daemon_id)
        ceph_iscsi.create_daemon_dirs(data_dir, uid, gid)

    elif daemon_type == HAproxy.daemon_type:
        haproxy = HAproxy.init(ctx, fsid, daemon_id)
        haproxy.create_daemon_dirs(data_dir, uid, gid)

    elif daemon_type == Keepalived.daemon_type:
        keepalived = Keepalived.init(ctx, fsid, daemon_id)
        keepalived.create_daemon_dirs(data_dir, uid, gid)

    elif daemon_type == CustomContainer.daemon_type:
        cc = CustomContainer.init(ctx, fsid, daemon_id)
        cc.create_daemon_dirs(data_dir, uid, gid)
2236
2237
def get_parm(option):
    # type: (str) -> Dict[str, str]
    """Parse a JSON option value into a dict.

    The value may be '-' (read stdin once, then cache it), an inline JSON
    object, or a path to a JSON file. An empty/None option yields {}.
    Raises Error for a missing file or invalid JSON.
    """
    if not option:
        return dict()

    global cached_stdin
    if option == '-':
        # stdin can only be consumed once; cache for subsequent calls
        if cached_stdin is None:
            cached_stdin = sys.stdin.read()
        raw = cached_stdin
    elif option.startswith('{') and option.endswith('}'):
        # inline json string
        raw = option
    elif os.path.exists(option):
        # json file
        with open(option, 'r') as f:
            raw = f.read()
    else:
        raise Error('Config file {} not found'.format(option))

    try:
        return json.loads(raw)
    except ValueError as e:
        raise Error('Invalid JSON in {}: {}'.format(option, e))
2268
2269
def get_config_and_keyring(ctx):
    # type: (CephadmContext) -> Tuple[Optional[str], Optional[str]]
    """Resolve the ceph config and keyring text from the context.

    --config-json supplies the baseline; an explicit --config file then
    overrides the config, and --key / --keyring override the keyring.
    """
    def _read_file(path):
        # type: (str) -> str
        # surface a missing file as a cephadm Error, not a raw traceback
        try:
            with open(path, 'r') as fh:
                return fh.read()
        except FileNotFoundError as e:
            raise Error(e)

    config = None
    keyring = None

    if 'config_json' in ctx and ctx.config_json:
        parsed = get_parm(ctx.config_json)
        config = parsed.get('config')
        keyring = parsed.get('keyring')

    if 'config' in ctx and ctx.config:
        config = _read_file(ctx.config)

    if 'key' in ctx and ctx.key:
        keyring = '[%s]\n\tkey = %s\n' % (ctx.name, ctx.key)
    elif 'keyring' in ctx and ctx.keyring:
        keyring = _read_file(ctx.keyring)

    return config, keyring
2297
2298
def get_container_binds(ctx, fsid, daemon_type, daemon_id):
    # type: (CephadmContext, str, str, Union[int, str, None]) -> List[List[str]]
    """Return --mount style bind specs needed by the given daemon type."""
    binds = []  # type: List[List[str]]

    if daemon_type == CephIscsi.daemon_type:
        binds += CephIscsi.get_container_binds()
    elif daemon_type == CustomContainer.daemon_type:
        assert daemon_id
        cc = CustomContainer.init(ctx, fsid, daemon_id)
        data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
        binds += cc.get_container_binds(data_dir)

    return binds
2312
2313
def get_container_mounts(ctx, fsid, daemon_type, daemon_id,
                         no_config=False):
    # type: (CephadmContext, str, str, Union[int, str, None], Optional[bool]) -> Dict[str, str]
    """Return the host-path -> container-path volume mounts for a daemon.

    The ':z'/':Z' suffixes request an SELinux relabel (shared vs private)
    by the container engine; ':ro' mounts are read-only.
    """
    mounts = dict()

    if daemon_type in Ceph.daemons:
        if fsid:
            run_path = os.path.join('/var/run/ceph', fsid)
            if os.path.exists(run_path):
                mounts[run_path] = '/var/run/ceph:z'
            log_dir = get_log_dir(fsid, ctx.log_dir)
            mounts[log_dir] = '/var/log/ceph:z'
            crash_dir = '/var/lib/ceph/%s/crash' % fsid
            if os.path.exists(crash_dir):
                mounts[crash_dir] = '/var/lib/ceph/crash:z'

    if daemon_type in Ceph.daemons and daemon_id:
        data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
        if daemon_type == 'rgw':
            cdata_dir = '/var/lib/ceph/radosgw/ceph-rgw.%s' % (daemon_id)
        else:
            cdata_dir = '/var/lib/ceph/%s/ceph-%s' % (daemon_type, daemon_id)
        if daemon_type != 'crash':
            mounts[data_dir] = cdata_dir + ':z'
        if not no_config:
            mounts[data_dir + '/config'] = '/etc/ceph/ceph.conf:z'
        if daemon_type in ['rbd-mirror', 'cephfs-mirror', 'crash']:
            # these do not search for their keyrings in a data directory
            mounts[data_dir + '/keyring'] = '/etc/ceph/ceph.client.%s.%s.keyring' % (daemon_type, daemon_id)

    if daemon_type in ['mon', 'osd', 'clusterless-ceph-volume']:
        mounts['/dev'] = '/dev'  # FIXME: narrow this down?
        mounts['/run/udev'] = '/run/udev'
    if daemon_type in ['osd', 'clusterless-ceph-volume']:
        mounts['/sys'] = '/sys'  # for numa.cc, pick_address, cgroups, ...
        mounts['/run/lvm'] = '/run/lvm'
        mounts['/run/lock/lvm'] = '/run/lock/lvm'
    if daemon_type == 'osd':
        # selinux-policy in the container may not match the host.
        if HostFacts(ctx).selinux_enabled:
            selinux_folder = '/var/lib/ceph/%s/selinux' % fsid
            if not os.path.exists(selinux_folder):
                os.makedirs(selinux_folder, mode=0o755)
            mounts[selinux_folder] = '/sys/fs/selinux:ro'

    try:
        if ctx.shared_ceph_folder:  # make easy manager modules/ceph-volume development
            ceph_folder = pathify(ctx.shared_ceph_folder)
            if os.path.exists(ceph_folder):
                # overlay a source checkout onto the container's installed bits
                mounts[ceph_folder + '/src/ceph-volume/ceph_volume'] = '/usr/lib/python3.6/site-packages/ceph_volume'
                mounts[ceph_folder + '/src/pybind/mgr'] = '/usr/share/ceph/mgr'
                mounts[ceph_folder + '/src/python-common/ceph'] = '/usr/lib/python3.6/site-packages/ceph'
                mounts[ceph_folder + '/monitoring/grafana/dashboards'] = '/etc/grafana/dashboards/ceph-dashboard'
                mounts[ceph_folder + '/monitoring/prometheus/alerts'] = '/etc/prometheus/ceph'
            else:
                logger.error('{}{}{}'.format(termcolor.red,
                                             'Ceph shared source folder does not exist.',
                                             termcolor.end))
    except AttributeError:
        # ctx may not define shared_ceph_folder for this subcommand
        pass

    if daemon_type in Monitoring.components and daemon_id:
        data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
        if daemon_type == 'prometheus':
            mounts[os.path.join(data_dir, 'etc/prometheus')] = '/etc/prometheus:Z'
            mounts[os.path.join(data_dir, 'data')] = '/prometheus:Z'
        elif daemon_type == 'node-exporter':
            # node-exporter reads host metrics via these read-only mounts
            mounts['/proc'] = '/host/proc:ro'
            mounts['/sys'] = '/host/sys:ro'
            mounts['/'] = '/rootfs:ro'
        elif daemon_type == 'grafana':
            mounts[os.path.join(data_dir, 'etc/grafana/grafana.ini')] = '/etc/grafana/grafana.ini:Z'
            mounts[os.path.join(data_dir, 'etc/grafana/provisioning/datasources')] = '/etc/grafana/provisioning/datasources:Z'
            mounts[os.path.join(data_dir, 'etc/grafana/certs')] = '/etc/grafana/certs:Z'
            mounts[os.path.join(data_dir, 'data/grafana.db')] = '/var/lib/grafana/grafana.db:Z'
        elif daemon_type == 'alertmanager':
            mounts[os.path.join(data_dir, 'etc/alertmanager')] = '/etc/alertmanager:Z'

    if daemon_type == NFSGanesha.daemon_type:
        assert daemon_id
        data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
        nfs_ganesha = NFSGanesha.init(ctx, fsid, daemon_id)
        mounts.update(nfs_ganesha.get_container_mounts(data_dir))

    if daemon_type == HAproxy.daemon_type:
        assert daemon_id
        data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
        mounts.update(HAproxy.get_container_mounts(data_dir))

    if daemon_type == CephIscsi.daemon_type:
        assert daemon_id
        data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
        log_dir = get_log_dir(fsid, ctx.log_dir)
        mounts.update(CephIscsi.get_container_mounts(data_dir, log_dir))

    if daemon_type == Keepalived.daemon_type:
        assert daemon_id
        data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
        mounts.update(Keepalived.get_container_mounts(data_dir))

    if daemon_type == CustomContainer.daemon_type:
        assert daemon_id
        cc = CustomContainer.init(ctx, fsid, daemon_id)
        data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
        mounts.update(cc.get_container_mounts(data_dir))

    return mounts
2421
2422
def get_container(ctx: CephadmContext,
                  fsid: str, daemon_type: str, daemon_id: Union[int, str],
                  privileged: bool = False,
                  ptrace: bool = False,
                  container_args: Optional[List[str]] = None) -> 'CephContainer':
    """Assemble the CephContainer used to run a daemon.

    Selects the entrypoint, daemon name, environment, extra container
    arguments, and mounts/binds based on the daemon type. The returned
    object is not started here.
    """
    entrypoint: str = ''
    name: str = ''
    ceph_args: List[str] = []
    envs: List[str] = [
        'TCMALLOC_MAX_TOTAL_THREAD_CACHE_BYTES=134217728',
    ]
    host_network: bool = True

    if container_args is None:
        container_args = []
    if daemon_type in ['mon', 'osd']:
        # mon and osd need privileged in order for libudev to query devices
        privileged = True
    if daemon_type == 'rgw':
        entrypoint = '/usr/bin/radosgw'
        name = 'client.rgw.%s' % daemon_id
    elif daemon_type == 'rbd-mirror':
        entrypoint = '/usr/bin/rbd-mirror'
        name = 'client.rbd-mirror.%s' % daemon_id
    elif daemon_type == 'cephfs-mirror':
        entrypoint = '/usr/bin/cephfs-mirror'
        name = 'client.cephfs-mirror.%s' % daemon_id
    elif daemon_type == 'crash':
        entrypoint = '/usr/bin/ceph-crash'
        name = 'client.crash.%s' % daemon_id
    elif daemon_type in ['mon', 'mgr', 'mds', 'osd']:
        entrypoint = '/usr/bin/ceph-' + daemon_type
        name = '%s.%s' % (daemon_type, daemon_id)
    elif daemon_type in Monitoring.components:
        # monitoring images define their own entrypoint
        entrypoint = ''
    elif daemon_type == NFSGanesha.daemon_type:
        entrypoint = NFSGanesha.entrypoint
        name = '%s.%s' % (daemon_type, daemon_id)
        envs.extend(NFSGanesha.get_container_envs())
    elif daemon_type == HAproxy.daemon_type:
        name = '%s.%s' % (daemon_type, daemon_id)
    elif daemon_type == Keepalived.daemon_type:
        name = '%s.%s' % (daemon_type, daemon_id)
        envs.extend(Keepalived.get_container_envs())
        container_args.extend(['--cap-add=NET_ADMIN', '--cap-add=NET_RAW'])
    elif daemon_type == CephIscsi.daemon_type:
        entrypoint = CephIscsi.entrypoint
        name = '%s.%s' % (daemon_type, daemon_id)
        # So the container can modprobe iscsi_target_mod and have write perms
        # to configfs we need to make this a privileged container.
        privileged = True
    elif daemon_type == CustomContainer.daemon_type:
        cc = CustomContainer.init(ctx, fsid, daemon_id)
        entrypoint = cc.entrypoint
        host_network = False
        envs.extend(cc.get_container_envs())
        container_args.extend(cc.get_container_args())

    if daemon_type in Monitoring.components:
        # run monitoring containers as the uid/gid baked into their image
        uid, gid = extract_uid_gid_monitoring(ctx, daemon_type)
        monitoring_args = [
            '--user',
            str(uid),
            # FIXME: disable cpu/memory limits for the time being (not supported
            # by ubuntu 18.04 kernel!)
        ]
        container_args.extend(monitoring_args)
    elif daemon_type == 'crash':
        ceph_args = ['-n', name]
    elif daemon_type in Ceph.daemons:
        # -f keeps the daemon in the foreground for the container
        ceph_args = ['-n', name, '-f']

    # if using podman, set -d, --conmon-pidfile & --cidfile flags
    # so service can have Type=Forking
    if isinstance(ctx.container_engine, Podman):
        runtime_dir = '/run'
        container_args.extend([
            '-d', '--log-driver', 'journald',
            '--conmon-pidfile',
            runtime_dir + '/ceph-%s@%s.%s.service-pid' % (fsid, daemon_type, daemon_id),
            '--cidfile',
            runtime_dir + '/ceph-%s@%s.%s.service-cid' % (fsid, daemon_type, daemon_id),
        ])
        if ctx.container_engine.version >= CGROUPS_SPLIT_PODMAN_VERSION:
            container_args.append('--cgroups=split')

    return CephContainer(
        ctx,
        image=ctx.image,
        entrypoint=entrypoint,
        args=ceph_args + get_daemon_args(ctx, fsid, daemon_type, daemon_id),
        container_args=container_args,
        volume_mounts=get_container_mounts(ctx, fsid, daemon_type, daemon_id),
        bind_mounts=get_container_binds(ctx, fsid, daemon_type, daemon_id),
        cname='ceph-%s-%s.%s' % (fsid, daemon_type, daemon_id),
        envs=envs,
        privileged=privileged,
        ptrace=ptrace,
        host_network=host_network,
    )
2523
2524
def extract_uid_gid(ctx, img='', file_path='/var/lib/ceph'):
    # type: (CephadmContext, str, Union[str, List[str]]) -> Tuple[int, int]
    """Determine the uid/gid owning file_path inside a container image.

    Runs `stat` in a throwaway container for each candidate path and
    returns the first successful result; raises RuntimeError when none
    of the paths can be stat'ed.
    """
    img = img or ctx.image
    paths = [file_path] if isinstance(file_path, str) else file_path

    for candidate in paths:
        try:
            out = CephContainer(
                ctx,
                image=img,
                entrypoint='stat',
                args=['-c', '%u %g', candidate]
            ).run()
        except RuntimeError:
            # path missing in this image; try the next candidate
            continue
        uid, gid = out.split(' ')
        return int(uid), int(gid)

    raise RuntimeError('uid/gid not found')
2549
2550
def deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid,
                  config=None, keyring=None,
                  osd_fsid=None,
                  reconfig=False,
                  ports=None):
    # type: (CephadmContext, str, str, Union[int, str], Optional[CephContainer], int, int, Optional[str], Optional[str], Optional[str], Optional[bool], Optional[List[int]]) -> None
    """Deploy (or, with reconfig=True, reconfigure) a single daemon.

    Creates the data dirs and config/keyring, runs mon --mkfs for a new
    monitor, writes the systemd unit files, opens firewall ports, and
    restarts non-ceph daemons after a reconfig.
    """
    ports = ports or []
    if any([port_in_use(ctx, port) for port in ports]):
        if daemon_type == 'mgr':
            # non-fatal for mgr when we are in mgr_standby_modules=false, but we can't
            # tell whether that is the case here.
            logger.warning(
                f"ceph-mgr TCP port(s) {','.join(map(str, ports))} already in use"
            )
        else:
            raise Error("TCP Port(s) '{}' required for {} already in use".format(','.join(map(str, ports)), daemon_type))

    data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
    if reconfig and not os.path.exists(data_dir):
        raise Error('cannot reconfig, data path %s does not exist' % data_dir)
    if daemon_type == 'mon' and not os.path.exists(data_dir):
        # brand-new monitor: initialize its store with `ceph-mon --mkfs`
        assert config
        assert keyring
        # tmp keyring file
        tmp_keyring = write_tmp(keyring, uid, gid)

        # tmp config file
        tmp_config = write_tmp(config, uid, gid)

        # --mkfs
        create_daemon_dirs(ctx, fsid, daemon_type, daemon_id, uid, gid)
        mon_dir = get_data_dir(fsid, ctx.data_dir, 'mon', daemon_id)
        log_dir = get_log_dir(fsid, ctx.log_dir)
        CephContainer(
            ctx,
            image=ctx.image,
            entrypoint='/usr/bin/ceph-mon',
            args=[
                '--mkfs',
                '-i', str(daemon_id),
                '--fsid', fsid,
                '-c', '/tmp/config',
                '--keyring', '/tmp/keyring',
            ] + get_daemon_args(ctx, fsid, 'mon', daemon_id),
            volume_mounts={
                log_dir: '/var/log/ceph:z',
                mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % (daemon_id),
                tmp_keyring.name: '/tmp/keyring:z',
                tmp_config.name: '/tmp/config:z',
            },
        ).run()

        # write conf
        with open(mon_dir + '/config', 'w') as f:
            os.fchown(f.fileno(), uid, gid)
            os.fchmod(f.fileno(), 0o600)
            f.write(config)
    else:
        # dirs, conf, keyring
        create_daemon_dirs(
            ctx,
            fsid, daemon_type, daemon_id,
            uid, gid,
            config, keyring)

    if not reconfig:
        if daemon_type == CephadmDaemon.daemon_type:
            # the cephadm exporter has its own unit deployment path
            port = next(iter(ports), None)  # get first tcp port provided or None

            if ctx.config_json == '-':
                config_js = get_parm('-')
            else:
                config_js = get_parm(ctx.config_json)
            assert isinstance(config_js, dict)

            cephadm_exporter = CephadmDaemon(ctx, fsid, daemon_id, port)
            cephadm_exporter.deploy_daemon_unit(config_js)
        else:
            if c:
                deploy_daemon_units(ctx, fsid, uid, gid, daemon_type, daemon_id,
                                    c, osd_fsid=osd_fsid, ports=ports)
            else:
                raise RuntimeError('attempting to deploy a daemon without a container image')

    # marker files: unit.created keeps its original mtime; unit.configured
    # is rewritten (and re-timestamped) on every (re)configure
    if not os.path.exists(data_dir + '/unit.created'):
        with open(data_dir + '/unit.created', 'w') as f:
            os.fchmod(f.fileno(), 0o600)
            os.fchown(f.fileno(), uid, gid)
            f.write('mtime is time the daemon deployment was created\n')

    with open(data_dir + '/unit.configured', 'w') as f:
        f.write('mtime is time we were last configured\n')
        os.fchmod(f.fileno(), 0o600)
        os.fchown(f.fileno(), uid, gid)

    update_firewalld(ctx, daemon_type)

    # Open ports explicitly required for the daemon
    if ports:
        fw = Firewalld(ctx)
        fw.open_ports(ports)
        fw.apply_rules()

    if reconfig and daemon_type not in Ceph.daemons:
        # ceph daemons do not need a restart; others (presumably) do to pick
        # up the new config
        call_throws(ctx, ['systemctl', 'reset-failed',
                          get_unit_name(fsid, daemon_type, daemon_id)])
        call_throws(ctx, ['systemctl', 'restart',
                          get_unit_name(fsid, daemon_type, daemon_id)])
2662
2663
def _write_container_cmd_to_bash(ctx, file_obj, container, comment=None, background=False):
    # type: (CephadmContext, IO[str], CephContainer, Optional[str], Optional[bool]) -> None
    """Append the shell lines that clean up and (re)launch a container.

    The removal commands are prefixed with '!' so a failure (e.g. no
    stale container) does not abort the `set -e` unit script.
    """
    if comment:
        # A comment makes unit files with several containers easier to grok.
        file_obj.write('# {}\n'.format(comment))
    # `--rm` on the run command is not always honored, so remove any stale
    # container by hand first.
    file_obj.write('! ' + ' '.join(container.rm_cmd()) + ' 2> /dev/null\n')
    # `podman rm` can miss the container; retry with `--storage`.
    if isinstance(ctx.container_engine, Podman):
        storage_rm = [shlex.quote(a) for a in container.rm_cmd(storage=True)]
        file_obj.write('! ' + ' '.join(storage_rm) + ' 2> /dev/null\n')

    # the actual container run command
    run_line = ' '.join([shlex.quote(a) for a in container.run_cmd()])
    if background:
        run_line += ' &'
    file_obj.write(run_line + '\n')
2683
2684
def deploy_daemon_units(
    ctx: CephadmContext,
    fsid: str,
    uid: int,
    gid: int,
    daemon_type: str,
    daemon_id: Union[int, str],
    c: 'CephContainer',
    enable: bool = True,
    start: bool = True,
    osd_fsid: Optional[str] = None,
    ports: Optional[List[int]] = None,
) -> None:
    """Write the unit.run/unit.meta/unit.poststop/unit.image files and the
    systemd unit for a daemon, then enable/start it.

    Files are written as *.new and renamed into place so a concurrent
    reader never sees a partial file.
    """
    # cmd
    data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
    with open(data_dir + '/unit.run.new', 'w') as f, \
            open(data_dir + '/unit.meta.new', 'w') as metaf:
        f.write('set -e\n')

        if daemon_type in Ceph.daemons:
            # ensure the per-cluster /var/run/ceph dir exists before start
            install_path = find_program('install')
            f.write('{install_path} -d -m0770 -o {uid} -g {gid} /var/run/ceph/{fsid}\n'.format(install_path=install_path, fsid=fsid, uid=uid, gid=gid))

        # pre-start cmd(s)
        if daemon_type == 'osd':
            # osds have a pre-start step
            assert osd_fsid
            simple_fn = os.path.join('/etc/ceph/osd',
                                     '%s-%s.json.adopted-by-cephadm' % (daemon_id, osd_fsid))
            if os.path.exists(simple_fn):
                f.write('# Simple OSDs need chown on startup:\n')
                for n in ['block', 'block.db', 'block.wal']:
                    p = os.path.join(data_dir, n)
                    f.write('[ ! -L {p} ] || chown {uid}:{gid} {p}\n'.format(p=p, uid=uid, gid=gid))
            else:
                prestart = CephContainer(
                    ctx,
                    image=ctx.image,
                    entrypoint='/usr/sbin/ceph-volume',
                    args=[
                        'lvm', 'activate',
                        str(daemon_id), osd_fsid,
                        '--no-systemd'
                    ],
                    privileged=True,
                    volume_mounts=get_container_mounts(ctx, fsid, daemon_type, daemon_id),
                    bind_mounts=get_container_binds(ctx, fsid, daemon_type, daemon_id),
                    cname='ceph-%s-%s.%s-activate' % (fsid, daemon_type, daemon_id),
                    memory_request=ctx.memory_request,
                    memory_limit=ctx.memory_limit,
                )
                _write_container_cmd_to_bash(ctx, f, prestart, 'LVM OSDs use ceph-volume lvm activate')
        elif daemon_type == CephIscsi.daemon_type:
            # mount configfs, then run tcmu-runner alongside the gateway
            f.write(' '.join(CephIscsi.configfs_mount_umount(data_dir, mount=True)) + '\n')
            ceph_iscsi = CephIscsi.init(ctx, fsid, daemon_id)
            tcmu_container = ceph_iscsi.get_tcmu_runner_container()
            _write_container_cmd_to_bash(ctx, f, tcmu_container, 'iscsi tcmu-runnter container', background=True)

        _write_container_cmd_to_bash(ctx, f, c, '%s.%s' % (daemon_type, str(daemon_id)))

        # some metadata about the deploy
        meta: Dict[str, Any] = {}
        if 'meta_json' in ctx and ctx.meta_json:
            meta = json.loads(ctx.meta_json) or {}
        meta.update({
            'memory_request': int(ctx.memory_request) if ctx.memory_request else None,
            'memory_limit': int(ctx.memory_limit) if ctx.memory_limit else None,
        })
        if not meta.get('ports'):
            meta['ports'] = ports
        metaf.write(json.dumps(meta, indent=4) + '\n')

        os.fchmod(f.fileno(), 0o600)
        os.fchmod(metaf.fileno(), 0o600)
        os.rename(data_dir + '/unit.run.new',
                  data_dir + '/unit.run')
        os.rename(data_dir + '/unit.meta.new',
                  data_dir + '/unit.meta')

    # post-stop command(s)
    with open(data_dir + '/unit.poststop.new', 'w') as f:
        if daemon_type == 'osd':
            assert osd_fsid
            poststop = CephContainer(
                ctx,
                image=ctx.image,
                entrypoint='/usr/sbin/ceph-volume',
                args=[
                    'lvm', 'deactivate',
                    str(daemon_id), osd_fsid,
                ],
                privileged=True,
                volume_mounts=get_container_mounts(ctx, fsid, daemon_type, daemon_id),
                bind_mounts=get_container_binds(ctx, fsid, daemon_type, daemon_id),
                cname='ceph-%s-%s.%s-deactivate' % (fsid, daemon_type,
                                                    daemon_id),
            )
            _write_container_cmd_to_bash(ctx, f, poststop, 'deactivate osd')
        elif daemon_type == CephIscsi.daemon_type:
            # make sure we also stop the tcmu container
            ceph_iscsi = CephIscsi.init(ctx, fsid, daemon_id)
            tcmu_container = ceph_iscsi.get_tcmu_runner_container()
            f.write('! ' + ' '.join(tcmu_container.stop_cmd()) + '\n')
            f.write(' '.join(CephIscsi.configfs_mount_umount(data_dir, mount=False)) + '\n')
        os.fchmod(f.fileno(), 0o600)
        os.rename(data_dir + '/unit.poststop.new',
                  data_dir + '/unit.poststop')

    # record the image used, for later upgrade/inspection
    if c:
        with open(data_dir + '/unit.image.new', 'w') as f:
            f.write(c.image + '\n')
            os.fchmod(f.fileno(), 0o600)
            os.rename(data_dir + '/unit.image.new',
                      data_dir + '/unit.image')

    # sysctl
    install_sysctl(ctx, fsid, daemon_type)

    # systemd
    install_base_units(ctx, fsid)
    unit = get_unit_file(ctx, fsid)
    unit_file = 'ceph-%s@.service' % (fsid)
    with open(ctx.unit_dir + '/' + unit_file + '.new', 'w') as f:
        f.write(unit)
        os.rename(ctx.unit_dir + '/' + unit_file + '.new',
                  ctx.unit_dir + '/' + unit_file)
    call_throws(ctx, ['systemctl', 'daemon-reload'])

    # clear any stale state before (re)enabling/starting the unit
    unit_name = get_unit_name(fsid, daemon_type, daemon_id)
    call(ctx, ['systemctl', 'stop', unit_name],
         verbosity=CallVerbosity.DEBUG)
    call(ctx, ['systemctl', 'reset-failed', unit_name],
         verbosity=CallVerbosity.DEBUG)
    if enable:
        call_throws(ctx, ['systemctl', 'enable', unit_name])
    if start:
        call_throws(ctx, ['systemctl', 'start', unit_name])
2822
2823
class Firewalld(object):
    """Thin wrapper around the `firewall-cmd` tool.

    All operations are no-ops when firewalld is not installed, not
    enabled, or not running (self.available is False).
    """

    def __init__(self, ctx):
        # type: (CephadmContext) -> None
        self.ctx = ctx
        self.available = self.check()

    def check(self):
        # type: () -> bool
        """Return True if firewall-cmd exists and firewalld.service is
        enabled and running; also caches the firewall-cmd path in self.cmd."""
        self.cmd = find_executable('firewall-cmd')
        if not self.cmd:
            logger.debug('firewalld does not appear to be present')
            return False
        (enabled, state, _) = check_unit(self.ctx, 'firewalld.service')
        if not enabled:
            logger.debug('firewalld.service is not enabled')
            return False
        if state != 'running':
            logger.debug('firewalld.service is not running')
            return False

        logger.info('firewalld ready')
        return True

    def enable_service_for(self, daemon_type):
        # type: (str) -> None
        """Permanently enable the firewalld service matching daemon_type
        in the current zone (no-op for daemon types with no mapping)."""
        if not self.available:
            logger.debug('Not possible to enable service <%s>. firewalld.service is not available' % daemon_type)
            return

        if daemon_type == 'mon':
            svc = 'ceph-mon'
        elif daemon_type in ['mgr', 'mds', 'osd']:
            svc = 'ceph'
        elif daemon_type == NFSGanesha.daemon_type:
            svc = 'nfs'
        else:
            return

        if not self.cmd:
            raise RuntimeError('command not defined')

        # only add the service if it is not already enabled
        out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--query-service', svc], verbosity=CallVerbosity.DEBUG)
        if ret:
            logger.info('Enabling firewalld service %s in current zone...' % svc)
            out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--add-service', svc])
            if ret:
                raise RuntimeError(
                    'unable to add service %s to current zone: %s' % (svc, err))
        else:
            logger.debug('firewalld service %s is enabled in current zone' % svc)

    def open_ports(self, fw_ports):
        # type: (List[int]) -> None
        """Permanently open the given TCP ports in the current zone.

        Changes take effect only after apply_rules() reloads firewalld.
        """
        if not self.available:
            logger.debug('Not possible to open ports <%s>. firewalld.service is not available' % fw_ports)
            return

        if not self.cmd:
            raise RuntimeError('command not defined')

        for port in fw_ports:
            tcp_port = str(port) + '/tcp'
            # query first so already-open ports are left alone
            out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--query-port', tcp_port], verbosity=CallVerbosity.DEBUG)
            if ret:
                logger.info('Enabling firewalld port %s in current zone...' % tcp_port)
                out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--add-port', tcp_port])
                if ret:
                    raise RuntimeError('unable to add port %s to current zone: %s' %
                                       (tcp_port, err))
            else:
                logger.debug('firewalld port %s is enabled in current zone' % tcp_port)

    def close_ports(self, fw_ports):
        # type: (List[int]) -> None
        """Permanently close the given TCP ports in the current zone.

        Changes take effect only after apply_rules() reloads firewalld.
        """
        if not self.available:
            logger.debug('Not possible to close ports <%s>. firewalld.service is not available' % fw_ports)
            return

        if not self.cmd:
            raise RuntimeError('command not defined')

        for port in fw_ports:
            tcp_port = str(port) + '/tcp'
            # only remove ports that are currently open
            out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--query-port', tcp_port], verbosity=CallVerbosity.DEBUG)
            if not ret:
                logger.info('Disabling port %s in current zone...' % tcp_port)
                out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--remove-port', tcp_port])
                if ret:
                    raise RuntimeError('unable to remove port %s from current zone: %s' %
                                       (tcp_port, err))
                else:
                    logger.info(f'Port {tcp_port} disabled')
            else:
                logger.info(f'firewalld port {tcp_port} already closed')

    def apply_rules(self):
        # type: () -> None
        """Reload firewalld so that --permanent changes take effect."""
        if not self.available:
            return

        if not self.cmd:
            raise RuntimeError('command not defined')

        call_throws(self.ctx, [self.cmd, '--reload'])
2928
2929
def update_firewalld(ctx, daemon_type):
    # type: (CephadmContext, str) -> None
    """Enable the firewalld service for daemon_type and reload the rules."""
    fw = Firewalld(ctx)
    fw.enable_service_for(daemon_type)
    fw.apply_rules()
2935
2936
def install_sysctl(ctx: CephadmContext, fsid: str, daemon_type: str) -> None:
    """Install a per-daemon-type sysctl conf file and apply it.

    Only daemon types with known settings (osd, haproxy, keepalived) get
    a file; everything else is a no-op.
    """
    settings: Optional[List] = None
    if daemon_type == 'osd':
        settings = OSD.get_sysctl_settings()
    elif daemon_type == 'haproxy':
        settings = HAproxy.get_sysctl_settings()
    elif daemon_type == 'keepalived':
        settings = Keepalived.get_sysctl_settings()

    if not settings:
        return

    # write the conf file, then ask sysctl to apply everything
    conf = Path(ctx.sysctl_dir).joinpath(f'90-ceph-{fsid}-{daemon_type}.conf')
    content = '\n'.join([
        '# created by cephadm',
        '',
        *settings,
        '',
    ])
    with open(conf, 'w') as f:
        f.write(content)
    call_throws(ctx, ['sysctl', '--system'])
2965
2966
def install_base_units(ctx, fsid):
    # type: (CephadmContext, str) -> None
    """
    Set up ceph.target and ceph-$fsid.target units.
    """
    # global unit
    existed = os.path.exists(ctx.unit_dir + '/ceph.target')
    # write to a .new file, then rename into place (rename replaces atomically
    # on the same filesystem, so a partially-written unit is never visible)
    with open(ctx.unit_dir + '/ceph.target.new', 'w') as f:
        f.write('[Unit]\n'
                'Description=All Ceph clusters and services\n'
                '\n'
                '[Install]\n'
                'WantedBy=multi-user.target\n')
    os.rename(ctx.unit_dir + '/ceph.target.new',
              ctx.unit_dir + '/ceph.target')
    if not existed:
        # we disable before enable in case a different ceph.target
        # (from the traditional package) is present; while newer
        # systemd is smart enough to disable the old
        # (/lib/systemd/...) and enable the new (/etc/systemd/...),
        # some older versions of systemd error out with EEXIST.
        call_throws(ctx, ['systemctl', 'disable', 'ceph.target'])
        call_throws(ctx, ['systemctl', 'enable', 'ceph.target'])
        call_throws(ctx, ['systemctl', 'start', 'ceph.target'])

    # cluster unit
    existed = os.path.exists(ctx.unit_dir + '/ceph-%s.target' % fsid)
    with open(ctx.unit_dir + '/ceph-%s.target.new' % fsid, 'w') as f:
        f.write(
            '[Unit]\n'
            'Description=Ceph cluster {fsid}\n'
            'PartOf=ceph.target\n'
            'Before=ceph.target\n'
            '\n'
            '[Install]\n'
            'WantedBy=multi-user.target ceph.target\n'.format(
                fsid=fsid)
        )
    os.rename(ctx.unit_dir + '/ceph-%s.target.new' % fsid,
              ctx.unit_dir + '/ceph-%s.target' % fsid)
    if not existed:
        call_throws(ctx, ['systemctl', 'enable', 'ceph-%s.target' % fsid])
        call_throws(ctx, ['systemctl', 'start', 'ceph-%s.target' % fsid])

    # logrotate for the cluster
    with open(ctx.logrotate_dir + '/ceph-%s' % fsid, 'w') as f:
        """
        This is a bit sloppy in that the killall/pkill will touch all ceph daemons
        in all containers, but I don't see an elegant way to send SIGHUP *just* to
        the daemons for this cluster. (1) systemd kill -s will get the signal to
        podman, but podman will exit. (2) podman kill will get the signal to the
        first child (bash), but that isn't the ceph daemon. This is simpler and
        should be harmless.
        """
        f.write("""# created by cephadm
/var/log/ceph/%s/*.log {
    rotate 7
    daily
    compress
    sharedscripts
    postrotate
        killall -q -1 ceph-mon ceph-mgr ceph-mds ceph-osd ceph-fuse radosgw rbd-mirror cephfs-mirror || pkill -1 -x 'ceph-mon|ceph-mgr|ceph-mds|ceph-osd|ceph-fuse|radosgw|rbd-mirror|cephfs-mirror' || true
    endscript
    missingok
    notifempty
    su root root
}
""" % fsid)
3035
3036
def get_unit_file(ctx, fsid):
    # type: (CephadmContext, str) -> str
    """Return the content of the systemd unit template for this cluster.

    The unit runs each daemon via its per-daemon unit.run script; engine-
    specific behavior (podman pid/cid files, docker.service ordering) is
    spliced in via the format placeholders below.
    """
    extra_args = ''
    if isinstance(ctx.container_engine, Podman):
        # podman detaches, so run as a forking service with a pid file;
        # stale pid/cid files are removed around each start/stop
        extra_args = ('ExecStartPre=-/bin/rm -f %t/%n-pid %t/%n-cid\n'
                      'ExecStopPost=-/bin/rm -f %t/%n-pid %t/%n-cid\n'
                      'Type=forking\n'
                      'PIDFile=%t/%n-pid\n')
        if ctx.container_engine.version >= CGROUPS_SPLIT_PODMAN_VERSION:
            extra_args += 'Delegate=yes\n'

    docker = isinstance(ctx.container_engine, Docker)
    u = """# generated by cephadm
[Unit]
Description=Ceph %i for {fsid}

# According to:
# http://www.freedesktop.org/wiki/Software/systemd/NetworkTarget
# these can be removed once ceph-mon will dynamically change network
# configuration.
After=network-online.target local-fs.target time-sync.target{docker_after}
Wants=network-online.target local-fs.target time-sync.target
{docker_requires}

PartOf=ceph-{fsid}.target
Before=ceph-{fsid}.target

[Service]
LimitNOFILE=1048576
LimitNPROC=1048576
EnvironmentFile=-/etc/environment
ExecStart=/bin/bash {data_dir}/{fsid}/%i/unit.run
ExecStop=-{container_path} stop ceph-{fsid}-%i
ExecStopPost=-/bin/bash {data_dir}/{fsid}/%i/unit.poststop
KillMode=none
Restart=on-failure
RestartSec=10s
TimeoutStartSec=120
TimeoutStopSec=120
StartLimitInterval=30min
StartLimitBurst=5
{extra_args}
[Install]
WantedBy=ceph-{fsid}.target
""".format(container_path=ctx.container_engine.path,
           fsid=fsid,
           data_dir=ctx.data_dir,
           extra_args=extra_args,
           # if docker, we depend on docker.service
           docker_after=' docker.service' if docker else '',
           docker_requires='Requires=docker.service\n' if docker else '')

    return u
3090
3091 ##################################
3092
3093
class CephContainer:
    """Render container-engine command lines (run/shell/exec/rm/stop) for a
    single Ceph container.

    Instances only build argv lists; nothing is executed until run() (or a
    caller passes one of the *_cmd() results to call()/call_throws()).
    """

    def __init__(self,
                 ctx: CephadmContext,
                 image: str,
                 entrypoint: str,
                 args: Optional[List[str]] = None,
                 volume_mounts: Optional[Dict[str, str]] = None,
                 cname: str = '',
                 container_args: Optional[List[str]] = None,
                 envs: Optional[List[str]] = None,
                 privileged: bool = False,
                 ptrace: bool = False,
                 bind_mounts: Optional[List[List[str]]] = None,
                 init: Optional[bool] = None,
                 host_network: bool = True,
                 memory_request: Optional[str] = None,
                 memory_limit: Optional[str] = None,
                 ) -> None:
        self.ctx = ctx
        self.image = image
        self.entrypoint = entrypoint
        # None defaults (normalized here) instead of mutable default
        # arguments, which would be shared across all instances
        self.args = args if args is not None else []
        self.volume_mounts = volume_mounts if volume_mounts is not None else {}
        self.cname = cname
        self.container_args = container_args if container_args is not None else []
        self.envs = envs
        self.privileged = privileged
        self.ptrace = ptrace
        self.bind_mounts = bind_mounts if bind_mounts else []
        # fall back to the context-wide default when init is unset (or False)
        self.init = init if init else ctx.container_init
        self.host_network = host_network
        self.memory_request = memory_request
        self.memory_limit = memory_limit

    def run_cmd(self) -> List[str]:
        """Return the full `<engine> run ...` argv for this container."""
        cmd_args: List[str] = [
            str(self.ctx.container_engine.path),
            'run',
            '--rm',
            '--ipc=host',
            # some containers (ahem, haproxy) override this, but we want a fast
            # shutdown always (and, more importantly, a successful exit even if we
            # fall back to SIGKILL).
            '--stop-signal=SIGTERM',
        ]

        if isinstance(self.ctx.container_engine, Podman):
            if os.path.exists('/etc/ceph/podman-auth.json'):
                cmd_args.append('--authfile=/etc/ceph/podman-auth.json')

        envs: List[str] = [
            '-e', 'CONTAINER_IMAGE=%s' % self.image,
            '-e', 'NODE_NAME=%s' % get_hostname(),
        ]
        vols: List[str] = []
        binds: List[str] = []

        if self.memory_request:
            # must be NAME=value: `-e NAME value` passes the (likely unset)
            # host variable through and leaves a stray positional argument
            # that the engine would misparse as the image name
            cmd_args.extend(['-e', 'POD_MEMORY_REQUEST=%s' % self.memory_request])
        if self.memory_limit:
            cmd_args.extend(['-e', 'POD_MEMORY_LIMIT=%s' % self.memory_limit])
            cmd_args.extend(['--memory', str(self.memory_limit)])

        if self.host_network:
            cmd_args.append('--net=host')
        if self.entrypoint:
            cmd_args.extend(['--entrypoint', self.entrypoint])
        if self.privileged:
            cmd_args.extend([
                '--privileged',
                # let OSD etc read block devs that haven't been chowned
                '--group-add=disk'])
        if self.ptrace and not self.privileged:
            # if privileged, the SYS_PTRACE cap is already added
            # in addition, --cap-add and --privileged are mutually
            # exclusive since podman >= 2.0
            cmd_args.append('--cap-add=SYS_PTRACE')
        if self.init:
            cmd_args.append('--init')
            envs += ['-e', 'CEPH_USE_RANDOM_NONCE=1']
        if self.cname:
            cmd_args.extend(['--name', self.cname])
        if self.envs:
            for env in self.envs:
                envs.extend(['-e', env])

        vols = sum(
            [['-v', '%s:%s' % (host_dir, container_dir)]
             for host_dir, container_dir in self.volume_mounts.items()], [])
        binds = sum([['--mount', '{}'.format(','.join(bind))]
                     for bind in self.bind_mounts], [])

        return \
            cmd_args + self.container_args + \
            envs + vols + binds + \
            [self.image] + self.args  # type: ignore

    def shell_cmd(self, cmd: List[str]) -> List[str]:
        """Return argv that runs *cmd* inside this image (cmd[0] becomes the
        entrypoint, the rest are its arguments)."""
        cmd_args: List[str] = [
            str(self.ctx.container_engine.path),
            'run',
            '--rm',
            '--ipc=host',
        ]
        envs: List[str] = [
            '-e', 'CONTAINER_IMAGE=%s' % self.image,
            '-e', 'NODE_NAME=%s' % get_hostname(),
        ]
        vols: List[str] = []
        binds: List[str] = []

        if self.host_network:
            cmd_args.append('--net=host')
            if self.ctx.no_hosts:
                cmd_args.append('--no-hosts')
        if self.privileged:
            cmd_args.extend([
                '--privileged',
                # let OSD etc read block devs that haven't been chowned
                '--group-add=disk',
            ])
        if self.init:
            cmd_args.append('--init')
            envs += ['-e', 'CEPH_USE_RANDOM_NONCE=1']
        if self.envs:
            for env in self.envs:
                envs.extend(['-e', env])

        vols = sum(
            [['-v', '%s:%s' % (host_dir, container_dir)]
             for host_dir, container_dir in self.volume_mounts.items()], [])
        binds = sum([['--mount', '{}'.format(','.join(bind))]
                     for bind in self.bind_mounts], [])

        return cmd_args + self.container_args + envs + vols + binds + [
            '--entrypoint', cmd[0],
            self.image,
        ] + cmd[1:]

    def exec_cmd(self, cmd):
        # type: (List[str]) -> List[str]
        """Return argv that execs *cmd* inside the already-running container."""
        return [
            str(self.ctx.container_engine.path),
            'exec',
        ] + self.container_args + [
            self.cname,
        ] + cmd

    def rm_cmd(self, storage=False):
        # type: (bool) -> List[str]
        """Return argv that force-removes the container (optionally from the
        engine's storage as well)."""
        ret = [
            str(self.ctx.container_engine.path),
            'rm', '-f',
        ]
        if storage:
            ret.append('--storage')
        ret.append(self.cname)
        return ret

    def stop_cmd(self):
        # type () -> List[str]
        """Return argv that stops the container."""
        ret = [
            str(self.ctx.container_engine.path),
            'stop', self.cname,
        ]
        return ret

    def run(self, timeout=DEFAULT_TIMEOUT):
        # type: (Optional[int]) -> str
        """Run the container to completion and return its stdout; raises on
        non-zero exit (via call_throws)."""
        out, _, _ = call_throws(self.ctx, self.run_cmd(),
                                desc=self.entrypoint, timeout=timeout)
        return out
3266
3267 ##################################
3268
3269
@infer_image
def command_version(ctx):
    # type: (CephadmContext) -> int
    """Print the Ceph version reported by the container image; return the
    container's exit code."""
    container = CephContainer(ctx, ctx.image, 'ceph', ['--version'])
    out, _, code = call(ctx, container.run_cmd(), desc=container.entrypoint)
    if code == 0:
        print(out.strip())
    return code
3278
3279 ##################################
3280
3281
@infer_image
def command_pull(ctx):
    # type: (CephadmContext) -> int
    """Pull ctx.image from its registry, then print its inspect metadata."""

    _pull_image(ctx, ctx.image)
    return command_inspect_image(ctx)
3288
3289
def _pull_image(ctx, image):
    # type: (CephadmContext, str) -> None
    """Pull *image* with the configured container engine.

    Known-transient failures are retried up to three times with increasing
    backoff; anything else raises immediately.
    """
    logger.info('Pulling container image %s...' % image)

    transient = [
        'error creating read-write layer with ID',
        'net/http: TLS handshake timeout',
        'Digest did not match, expected',
    ]

    cmd = [ctx.container_engine.path, 'pull', image]
    if isinstance(ctx.container_engine, Podman) and os.path.exists('/etc/ceph/podman-auth.json'):
        cmd.append('--authfile=/etc/ceph/podman-auth.json')
    cmd_str = ' '.join(cmd)

    for sleep_secs in [1, 4, 25]:
        out, err, ret = call(ctx, cmd)
        if ret == 0:
            return

        # non-transient failure: give up right away
        if all(pattern not in err for pattern in transient):
            raise RuntimeError('Failed command: %s' % cmd_str)

        logger.info('`%s` failed transiently. Retrying. waiting %s seconds...' % (cmd_str, sleep_secs))
        time.sleep(sleep_secs)

    raise RuntimeError('Failed command: %s: maximum retries reached' % cmd_str)
3317
3318 ##################################
3319
3320
@infer_image
def command_inspect_image(ctx):
    # type: (CephadmContext) -> int
    """Print JSON metadata (image id, repo digests, ceph version) for
    ctx.image; return 0 on success, ENOENT when inspect fails."""
    inspect_cmd = [
        ctx.container_engine.path, 'inspect',
        '--format', '{{.ID}},{{.RepoDigests}}',
        ctx.image]
    out, _, ret = call_throws(ctx, inspect_cmd)
    if ret:
        return errno.ENOENT

    info_from = get_image_info_from_inspect(out.strip(), ctx.image)
    # ask the image itself which ceph version it carries
    info_from['ceph_version'] = CephContainer(
        ctx, ctx.image, 'ceph', ['--version']).run().strip()

    print(json.dumps(info_from, indent=4, sort_keys=True))
    return 0
3337
3338
def normalize_image_digest(digest):
    # type: (str) -> str
    """Qualify an unqualified image digest with the default registry.

    normal case:
      ceph/ceph -> docker.io/ceph/ceph
    edge cases that shouldn't ever come up:
      ubuntu -> docker.io/ubuntu (ubuntu alias for library/ubuntu)
    no change:
      quay.ceph.io/ceph/ceph, docker.io/ubuntu (already qualified)
    """
    parts = digest.split('/')
    # a dotted first component looks like a registry host; three or more
    # components imply one is already present
    unqualified = '.' not in parts[0] and len(parts) < 3
    return DEFAULT_REGISTRY + '/' + digest if unqualified else digest
3351
3352
def get_image_info_from_inspect(out, image):
    # type: (str, str) -> Dict[str, Union[str, List[str]]]
    """Parse '<id>,[<digest> <digest> ...]' output from `<engine> inspect`.

    :param out: raw inspect output formatted as '{{.ID}},{{.RepoDigests}}'
    :param image: image name, used only for the error message
    :raises Error: if the inspect output is empty
    """
    # validate BEFORE splitting: ''.split(',', 1) yields a single element,
    # so the unpack below would raise ValueError instead of a clear Error
    if not out:
        raise Error('inspect {}: empty result'.format(image))
    image_id, digests = out.split(',', 1)
    r = {
        'image_id': normalize_container_id(image_id)
    }  # type: Dict[str, Union[str,List[str]]]
    if digests:
        # digests is a bracketed, space-separated list: strip '[' and ']'
        r['repo_digests'] = list(map(normalize_image_digest, digests[1:-1].split(' ')))
    return r
3364
3365 ##################################
3366
3367
def check_subnet(subnets: str) -> Tuple[int, List[int], str]:
    """Determine whether the given string is a valid subnet

    :param subnets: subnet string, a single definition or comma separated list of CIDR subnets
    :returns: return code, IP version list of the subnets and msg describing any errors validation errors
    """
    errors: List[str] = []
    versions = set()

    for candidate in subnets.split(','):
        # require the address/netmask form before handing off to ipaddress
        if not re.search(r'\/\d+$', candidate):
            errors.append(f'{candidate} is not in CIDR format (address/netmask)')
            continue
        try:
            versions.add(ipaddress.ip_network(candidate).version)
        except ValueError as e:
            errors.append(f'{candidate} invalid: {str(e)}')

    rc = 1 if errors else 0
    return rc, list(versions), ', '.join(errors)
3393
3394
def unwrap_ipv6(address):
    # type: (str) -> str
    """Strip the surrounding square brackets from an IPv6 literal, if any."""
    bracketed = address.startswith('[') and address.endswith(']')
    return address[1:-1] if bracketed else address
3400
3401
def wrap_ipv6(address):
    # type: (str) -> str
    """Wrap an IPv6 literal in square brackets; return anything else as-is.

    We cannot assume the input is a bare IPv6 address: hostnames and
    already-bracketed addresses fail to parse and are returned unchanged.
    """
    try:
        version = ipaddress.ip_address(address).version
    except ValueError:
        return address
    return f'[{address}]' if version == 6 else address
3415
3416
def is_ipv6(address):
    # type: (str) -> bool
    """Return True if *address* parses as IPv6 (brackets are tolerated)."""
    # accept the bracketed form by stripping the brackets first
    if address.startswith('[') and address.endswith(']'):
        address = address[1:-1]
    try:
        return ipaddress.ip_address(address).version == 6
    except ValueError:
        logger.warning('Address: {} is not a valid IP address'.format(address))
        return False
3425
3426
def prepare_mon_addresses(
    ctx: CephadmContext
) -> Tuple[str, bool, Optional[str]]:
    """Validate --mon-ip / --mon-addrv and build the mon address vector.

    :returns: (addrv argument string, True if IPv6 is in use, CIDR of the
              local network containing the mon IP — or None when
              --skip-mon-network was given)
    :raises Error: when neither option is given, --mon-addrv is malformed,
                   or the mon network cannot be inferred locally
    """
    r = re.compile(r':(\d+)$')
    base_ip = ''
    ipv6 = False

    if ctx.mon_ip:
        ipv6 = is_ipv6(ctx.mon_ip)
        if ipv6:
            ctx.mon_ip = wrap_ipv6(ctx.mon_ip)
        hasport = r.findall(ctx.mon_ip)
        if hasport:
            port = int(hasport[0])
            # 6789 is the legacy v1 port, 3300 the msgr2 port; anything
            # else is assumed to speak msgr2
            if port == 6789:
                addr_arg = '[v1:%s]' % ctx.mon_ip
            elif port == 3300:
                addr_arg = '[v2:%s]' % ctx.mon_ip
            else:
                logger.warning('Using msgr2 protocol for unrecognized port %d' %
                               port)
                addr_arg = '[v2:%s]' % ctx.mon_ip
            # strip the trailing ':<port>' to recover the bare IP
            base_ip = ctx.mon_ip[0:-(len(str(port))) - 1]
            check_ip_port(ctx, base_ip, port)
        else:
            base_ip = ctx.mon_ip
            addr_arg = '[v2:%s:3300,v1:%s:6789]' % (ctx.mon_ip, ctx.mon_ip)
            check_ip_port(ctx, ctx.mon_ip, 3300)
            check_ip_port(ctx, ctx.mon_ip, 6789)
    elif ctx.mon_addrv:
        addr_arg = ctx.mon_addrv
        if addr_arg[0] != '[' or addr_arg[-1] != ']':
            raise Error('--mon-addrv value %s must use square brackets' %
                        addr_arg)
        ipv6 = addr_arg.count('[') > 1
        for addr in addr_arg[1:-1].split(','):
            hasport = r.findall(addr)
            if not hasport:
                raise Error('--mon-addrv value %s must include port number' %
                            addr_arg)
            port = int(hasport[0])
            # strip off v1: or v2: prefix
            addr = re.sub(r'^\w+:', '', addr)
            base_ip = addr[0:-(len(str(port))) - 1]
            check_ip_port(ctx, base_ip, port)
    else:
        raise Error('must specify --mon-ip or --mon-addrv')
    logger.debug('Base mon IP is %s, final addrv is %s' % (base_ip, addr_arg))

    mon_network = None
    if not ctx.skip_mon_network:
        # make sure IP is configured locally, and then figure out the
        # CIDR network
        errmsg = f'Cannot infer CIDR network for mon IP `{base_ip}`'
        for net, ifaces in list_networks(ctx).items():
            ips: List[str] = []
            for iface, ls in ifaces.items():
                ips.extend(ls)
            try:
                if ipaddress.ip_address(unwrap_ipv6(base_ip)) in \
                   [ipaddress.ip_address(ip) for ip in ips]:
                    mon_network = net
                    logger.info(f'Mon IP `{base_ip}` is in CIDR network `{mon_network}`')
                    break
            except ValueError as e:
                logger.warning(f'{errmsg}: {e}')
        if not mon_network:
            raise Error(f'{errmsg}: pass --skip-mon-network to configure it later')

    return (addr_arg, ipv6, mon_network)
3497
3498
def prepare_cluster_network(ctx: CephadmContext) -> Tuple[str, bool]:
    """Validate --cluster-network and report whether it includes IPv6.

    :returns: (cluster network CIDR list as given, True if any subnet is IPv6)
    :raises Error: if the provided subnet list fails validation
    """
    cluster_network = ''
    ipv6_cluster_network = False
    # the cluster network may not exist on this node, so all we can do is
    # validate that the address given is valid ipv4 or ipv6 subnet
    if ctx.cluster_network:
        rc, versions, err_msg = check_subnet(ctx.cluster_network)
        if rc:
            raise Error(f'Invalid --cluster-network parameter: {err_msg}')
        cluster_network = ctx.cluster_network
        ipv6_cluster_network = 6 in versions
    else:
        logger.info('- internal network (--cluster-network) has not '
                    'been provided, OSD replication will default to '
                    'the public_network')

    return cluster_network, ipv6_cluster_network
3516
3517
def create_initial_keys(
    ctx: CephadmContext,
    uid: int, gid: int,
    mgr_id: str
) -> Tuple[str, str, str, Any, Any]:  # type: ignore
    """Generate the mon., client.admin and mgr keys and stage them in
    temporary keyring files owned by uid:gid.

    :returns: (mon_key, mgr_key, admin_key, bootstrap_keyring, admin_keyring)
              where the last two are the temp file objects from write_tmp
    """
    _image = ctx.image

    logger.info('Creating initial keys...')

    def gen_key() -> str:
        # each invocation asks ceph-authtool (in the container) for a new key
        return CephContainer(
            ctx,
            image=_image,
            entrypoint='/usr/bin/ceph-authtool',
            args=['--gen-print-key'],
        ).run().strip()

    mon_key = gen_key()
    admin_key = gen_key()
    mgr_key = gen_key()

    keyring = ('[mon.]\n'
               '\tkey = %s\n'
               '\tcaps mon = allow *\n'
               '[client.admin]\n'
               '\tkey = %s\n'
               '\tcaps mon = allow *\n'
               '\tcaps mds = allow *\n'
               '\tcaps mgr = allow *\n'
               '\tcaps osd = allow *\n'
               '[mgr.%s]\n'
               '\tkey = %s\n'
               '\tcaps mon = profile mgr\n'
               '\tcaps mds = allow *\n'
               '\tcaps osd = allow *\n'
               % (mon_key, admin_key, mgr_id, mgr_key))

    admin_keyring = write_tmp('[client.admin]\n'
                              '\tkey = ' + admin_key + '\n',
                              uid, gid)

    # tmp keyring file
    bootstrap_keyring = write_tmp(keyring, uid, gid)
    return (mon_key, mgr_key, admin_key,
            bootstrap_keyring, admin_keyring)
3571
3572
def create_initial_monmap(
    ctx: CephadmContext,
    uid: int, gid: int,
    fsid: str,
    mon_id: str, mon_addr: str
) -> Any:
    """Create the initial monmap with monmaptool and return its temp file."""
    logger.info('Creating initial monmap...')
    monmap = write_tmp('', 0, 0)
    monmaptool_args = [
        '--create',
        '--clobber',
        '--fsid', fsid,
        '--addv', mon_id, mon_addr,
        '/tmp/monmap'
    ]
    out = CephContainer(
        ctx,
        image=ctx.image,
        entrypoint='/usr/bin/monmaptool',
        args=monmaptool_args,
        volume_mounts={monmap.name: '/tmp/monmap:z'},
    ).run()
    logger.debug(f'monmaptool for {mon_id} {mon_addr} on {out}')

    # pass monmap file to ceph user for use by ceph-mon --mkfs below
    os.fchown(monmap.fileno(), uid, gid)
    return monmap
3601
3602
def prepare_create_mon(
    ctx: CephadmContext,
    uid: int, gid: int,
    fsid: str, mon_id: str,
    bootstrap_keyring_path: str,
    monmap_path: str
):
    """Run `ceph-mon --mkfs` for the bootstrap monitor.

    :returns: (mon data dir, cluster log dir) on the host
    """
    logger.info('Creating mon...')
    create_daemon_dirs(ctx, fsid, 'mon', mon_id, uid, gid)
    mon_dir = get_data_dir(fsid, ctx.data_dir, 'mon', mon_id)
    log_dir = get_log_dir(fsid, ctx.log_dir)

    mkfs_args = [
        '--mkfs',
        '-i', mon_id,
        '--fsid', fsid,
        '-c', '/dev/null',
        '--monmap', '/tmp/monmap',
        '--keyring', '/tmp/keyring',
    ] + get_daemon_args(ctx, fsid, 'mon', mon_id)
    mounts = {
        log_dir: '/var/log/ceph:z',
        mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % (mon_id),
        bootstrap_keyring_path: '/tmp/keyring:z',
        monmap_path: '/tmp/monmap:z',
    }
    out = CephContainer(
        ctx,
        image=ctx.image,
        entrypoint='/usr/bin/ceph-mon',
        args=mkfs_args,
        volume_mounts=mounts,
    ).run()
    logger.debug(f'create mon.{mon_id} on {out}')
    return (mon_dir, log_dir)
3635
3636
def create_mon(
    ctx: CephadmContext,
    uid: int, gid: int,
    fsid: str, mon_id: str
) -> None:
    """Deploy the bootstrap monitor as a systemd-managed container."""
    ctx.meta_json = json.dumps({'service_name': 'mon'})
    mon_c = get_container(ctx, fsid, 'mon', mon_id)
    deploy_daemon(ctx, fsid, 'mon', mon_id, mon_c, uid, gid,
                  config=None, keyring=None)
3646
3647
def wait_for_mon(
    ctx: CephadmContext,
    mon_id: str, mon_dir: str,
    admin_keyring_path: str, config_path: str
):
    """Block until `ceph status` succeeds against the newly started mon."""
    logger.info('Waiting for mon to start...')
    status_container = CephContainer(
        ctx,
        image=ctx.image,
        entrypoint='/usr/bin/ceph',
        args=['status'],
        volume_mounts={
            mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % (mon_id),
            admin_keyring_path: '/etc/ceph/ceph.client.admin.keyring:z',
            config_path: '/etc/ceph/ceph.conf:z',
        },
    )

    def is_mon_available():
        # type: () -> bool
        # one probe: `ceph status` exiting 0 means the mon is answering
        timeout = ctx.timeout if ctx.timeout else 60  # seconds
        out, err, ret = call(ctx, status_container.run_cmd(),
                             desc=status_container.entrypoint,
                             timeout=timeout)
        return ret == 0

    is_available(ctx, 'mon', is_mon_available)
3677
3678
def create_mgr(
    ctx: CephadmContext,
    uid: int, gid: int,
    fsid: str, mgr_id: str, mgr_key: str,
    config: str, clifunc: Callable
) -> None:
    """Deploy the bootstrap mgr and wait until the mgrmap reports it
    available."""
    logger.info('Creating mgr...')
    mgr_keyring = '[mgr.%s]\n\tkey = %s\n' % (mgr_id, mgr_key)
    mgr_c = get_container(ctx, fsid, 'mgr', mgr_id)
    # Note:the default port used by the Prometheus node exporter is opened in fw
    ctx.meta_json = json.dumps({'service_name': 'mgr'})
    deploy_daemon(ctx, fsid, 'mgr', mgr_id, mgr_c, uid, gid,
                  config=config, keyring=mgr_keyring, ports=[9283])

    # wait for the service to become available
    logger.info('Waiting for mgr to start...')

    def is_mgr_available():
        # type: () -> bool
        timeout = ctx.timeout if ctx.timeout else 60  # seconds
        try:
            # any failure (CLI error, bad JSON, unexpected shape) just means
            # "not available yet"
            out = clifunc(['status', '-f', 'json-pretty'], timeout=timeout)
            return json.loads(out).get('mgrmap', {}).get('available', False)
        except Exception as e:
            logger.debug('status failed: %s' % e)
            return False

    is_available(ctx, 'mgr', is_mgr_available)
3707
3708
def prepare_ssh(
    ctx: CephadmContext,
    cli: Callable, wait_for_mgr_restart: Callable
) -> None:
    """Configure cephadm's SSH access and deploy the initial services.

    Sets the ssh user, installs or generates the cluster SSH keys,
    authorizes the public key for ssh_user@localhost, adds this host to the
    orchestrator, and applies the initial mon/mgr/crash/monitoring specs.
    NOTE(review): wait_for_mgr_restart is accepted but not used in this body.
    """
    cli(['cephadm', 'set-user', ctx.ssh_user])

    if ctx.ssh_config:
        logger.info('Using provided ssh config...')
        mounts = {
            pathify(ctx.ssh_config.name): '/tmp/cephadm-ssh-config:z',
        }
        cli(['cephadm', 'set-ssh-config', '-i', '/tmp/cephadm-ssh-config'], extra_mounts=mounts)

    if ctx.ssh_private_key and ctx.ssh_public_key:
        logger.info('Using provided ssh keys...')
        mounts = {
            pathify(ctx.ssh_private_key.name): '/tmp/cephadm-ssh-key:z',
            pathify(ctx.ssh_public_key.name): '/tmp/cephadm-ssh-key.pub:z'
        }
        cli(['cephadm', 'set-priv-key', '-i', '/tmp/cephadm-ssh-key'], extra_mounts=mounts)
        cli(['cephadm', 'set-pub-key', '-i', '/tmp/cephadm-ssh-key.pub'], extra_mounts=mounts)
    else:
        logger.info('Generating ssh key...')
        cli(['cephadm', 'generate-key'])
    ssh_pub = cli(['cephadm', 'get-pub-key'])

    with open(ctx.output_pub_ssh_key, 'w') as f:
        f.write(ssh_pub)
    logger.info('Wrote public SSH key to %s' % ctx.output_pub_ssh_key)

    logger.info('Adding key to %s@localhost authorized_keys...' % ctx.ssh_user)
    try:
        s_pwd = pwd.getpwnam(ctx.ssh_user)
    except KeyError:
        raise Error('Cannot find uid/gid for ssh-user: %s' % (ctx.ssh_user))
    ssh_uid = s_pwd.pw_uid
    ssh_gid = s_pwd.pw_gid
    ssh_dir = os.path.join(s_pwd.pw_dir, '.ssh')

    if not os.path.exists(ssh_dir):
        makedirs(ssh_dir, ssh_uid, ssh_gid, 0o700)

    auth_keys_file = '%s/authorized_keys' % ssh_dir
    add_newline = False

    # if the file exists and its last character is not a newline, remember
    # to add one before appending our key
    if os.path.exists(auth_keys_file):
        with open(auth_keys_file, 'r') as f:
            f.seek(0, os.SEEK_END)
            if f.tell() > 0:
                f.seek(f.tell() - 1, os.SEEK_SET)  # go to last char
                if f.read() != '\n':
                    add_newline = True

    with open(auth_keys_file, 'a') as f:
        os.fchown(f.fileno(), ssh_uid, ssh_gid)  # just in case we created it
        os.fchmod(f.fileno(), 0o600)  # just in case we created it
        if add_newline:
            f.write('\n')
        f.write(ssh_pub.strip() + '\n')

    host = get_hostname()
    logger.info('Adding host %s...' % host)
    try:
        args = ['orch', 'host', 'add', host]
        if ctx.mon_ip:
            args.append(ctx.mon_ip)
        cli(args)
    except RuntimeError as e:
        raise Error('Failed to add host <%s>: %s' % (host, e))

    for t in ['mon', 'mgr']:
        if not ctx.orphan_initial_daemons:
            logger.info('Deploying %s service with default placement...' % t)
            cli(['orch', 'apply', t])
        else:
            logger.info('Deploying unmanaged %s service...' % t)
            cli(['orch', 'apply', t, '--unmanaged'])

    if not ctx.orphan_initial_daemons:
        logger.info('Deploying crash service with default placement...')
        cli(['orch', 'apply', 'crash'])

    if not ctx.skip_monitoring_stack:
        logger.info('Enabling mgr prometheus module...')
        cli(['mgr', 'module', 'enable', 'prometheus'])
        for t in ['prometheus', 'grafana', 'node-exporter', 'alertmanager']:
            logger.info('Deploying %s service with default placement...' % t)
            cli(['orch', 'apply', t])
3798
3799
def enable_cephadm_mgr_module(
    cli: Callable, wait_for_mgr_restart: Callable
) -> None:
    """Enable the cephadm mgr module and select it as orchestrator backend.

    Enabling the module restarts the mgr, so we wait for it to come back
    before setting the backend.
    """
    logger.info('Enabling cephadm module...')
    cli(['mgr', 'module', 'enable', 'cephadm'])
    wait_for_mgr_restart()
    logger.info('Setting orchestrator backend to cephadm...')
    cli(['orch', 'set', 'backend', 'cephadm'])
3809
3810
def prepare_dashboard(
    ctx: CephadmContext,
    uid: int, gid: int,
    cli: Callable, wait_for_mgr_restart: Callable
) -> None:
    """Enable and configure the dashboard module, install SSL certs, create
    the initial admin user and open the dashboard port in the firewall."""

    # Configure SSL port (cephadm only allows to configure dashboard SSL port)
    # if the user does not want to use SSL he can change this setting once the cluster is up
    cli(['config', 'set', 'mgr', 'mgr/dashboard/ssl_server_port', str(ctx.ssl_dashboard_port)])

    # configuring dashboard parameters
    logger.info('Enabling the dashboard module...')
    cli(['mgr', 'module', 'enable', 'dashboard'])
    wait_for_mgr_restart()

    # dashboard crt and key
    if ctx.dashboard_key and ctx.dashboard_crt:
        logger.info('Using provided dashboard certificate...')
        mounts = {
            pathify(ctx.dashboard_crt.name): '/tmp/dashboard.crt:z',
            pathify(ctx.dashboard_key.name): '/tmp/dashboard.key:z'
        }
        cli(['dashboard', 'set-ssl-certificate', '-i', '/tmp/dashboard.crt'], extra_mounts=mounts)
        cli(['dashboard', 'set-ssl-certificate-key', '-i', '/tmp/dashboard.key'], extra_mounts=mounts)
    else:
        logger.info('Generating a dashboard self-signed certificate...')
        cli(['dashboard', 'create-self-signed-cert'])

    logger.info('Creating initial admin user...')
    password = ctx.initial_dashboard_password or generate_password()
    tmp_password_file = write_tmp(password, uid, gid)
    cmd = ['dashboard', 'ac-user-create', ctx.initial_dashboard_user, '-i', '/tmp/dashboard.pw', 'administrator', '--force-password']
    if not ctx.dashboard_password_noupdate:
        # require the user to replace the bootstrap password on first login
        cmd.append('--pwd-update-required')
    cli(cmd, extra_mounts={pathify(tmp_password_file.name): '/tmp/dashboard.pw:z'})
    logger.info('Fetching dashboard port number...')
    out = cli(['config', 'get', 'mgr', 'mgr/dashboard/ssl_server_port'])
    port = int(out)

    # Open dashboard port
    fw = Firewalld(ctx)
    fw.open_ports([port])
    fw.apply_rules()

    logger.info('Ceph Dashboard is now available at:\n\n'
                '\t     URL: https://%s:%s/\n'
                '\t    User: %s\n'
                '\tPassword: %s\n' % (
                    get_fqdn(), port,
                    ctx.initial_dashboard_user,
                    password))
3862
3863
def prepare_bootstrap_config(
    ctx: CephadmContext,
    fsid: str, mon_addr: str, image: str

) -> str:
    """Assemble the initial ceph.conf used for bootstrap and return it as a
    string; also performs a registry login when registry options are given."""

    cp = read_config(ctx.config)

    def ensure_section(section: str) -> None:
        if not cp.has_section(section):
            cp.add_section(section)

    def set_if_absent(section: str, key: str, value: str) -> None:
        # honor either the underscore or the space spelling of an option
        if (
            not cp.has_option(section, key)
            and not cp.has_option(section, key.replace('_', ' '))
        ):
            cp.set(section, key, value)

    ensure_section('global')
    cp.set('global', 'fsid', fsid)
    cp.set('global', 'mon_host', mon_addr)
    cp.set('global', 'container_image', image)

    ensure_section('mon')
    set_if_absent('mon', 'auth_allow_insecure_global_id_reclaim', 'false')

    if ctx.single_host_defaults:
        logger.info('Adjusting default settings to suit single-host cluster...')
        # replicate across osds, not hosts
        set_if_absent('global', 'osd_crush_choose_leaf_type', '0')
        # replica 2x
        set_if_absent('global', 'osd_pool_default_size', '2')
        # disable mgr standby modules (so we can colocate multiple mgrs on one host)
        ensure_section('mgr')
        set_if_absent('mgr', 'mgr_standby_modules', 'false')

    buf = StringIO()
    cp.write(buf)
    config = buf.getvalue()

    if ctx.registry_json or ctx.registry_url:
        command_registry_login(ctx)

    return config
3916
3917
def finish_bootstrap_config(
    ctx: CephadmContext,
    fsid: str,
    config: str,
    mon_id: str, mon_dir: str,
    mon_network: Optional[str], ipv6: bool,
    cli: Callable,
    cluster_network: Optional[str], ipv6_cluster_network: bool

) -> None:
    """Minimize the bootstrap config, apply network settings via the CLI,
    and write the final ceph.conf to ctx.output_config."""
    if not ctx.no_minimize_config:
        logger.info('Assimilating anything we can from ceph.conf...')
        cli([
            'config', 'assimilate-conf',
            '-i', '/var/lib/ceph/mon/ceph-%s/config' % mon_id
        ], {
            mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % mon_id
        })
        logger.info('Generating new minimal ceph.conf...')
        cli([
            'config', 'generate-minimal-conf',
            '-o', '/var/lib/ceph/mon/ceph-%s/config' % mon_id
        ], {
            mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % mon_id
        })
        # re-read our minimized config
        with open(mon_dir + '/config', 'r') as f:
            config = f.read()
        logger.info('Restarting the monitor...')
        call_throws(ctx, [
            'systemctl',
            'restart',
            get_unit_name(fsid, 'mon', mon_id)
        ])

    if mon_network:
        logger.info(f'Setting mon public_network to {mon_network}')
        cli(['config', 'set', 'mon', 'public_network', mon_network])

    if cluster_network:
        logger.info(f'Setting cluster_network to {cluster_network}')
        cli(['config', 'set', 'global', 'cluster_network', cluster_network])

    if ipv6 or ipv6_cluster_network:
        logger.info('Enabling IPv6 (ms_bind_ipv6) binding')
        cli(['config', 'set', 'global', 'ms_bind_ipv6', 'true'])

    with open(ctx.output_config, 'w') as f:
        f.write(config)
    logger.info('Wrote config to %s' % ctx.output_config)
3969
3970
@default_image
def command_bootstrap(ctx):
    # type: (CephadmContext) -> int
    """Bootstrap a new Ceph cluster on this host.

    Creates the first mon and mgr, writes out the admin keyring, minimal
    ceph.conf and ssh public key, then (unless skipped) sets up ssh for the
    orchestrator, registry credentials, the cephadm exporter, the dashboard,
    the '_admin' host label, and applies an optional initial service spec.

    Returns 0 on success; raises Error for any fatal precondition failure
    (existing output files, bad fsid, FQDN hostname, release mismatch, ...).
    """

    # default the output file locations under ctx.output_dir if not given
    if not ctx.output_config:
        ctx.output_config = os.path.join(ctx.output_dir, 'ceph.conf')
    if not ctx.output_keyring:
        ctx.output_keyring = os.path.join(ctx.output_dir,
                                          'ceph.client.admin.keyring')
    if not ctx.output_pub_ssh_key:
        ctx.output_pub_ssh_key = os.path.join(ctx.output_dir, 'ceph.pub')

    # verify output files
    for f in [ctx.output_config, ctx.output_keyring,
              ctx.output_pub_ssh_key]:
        if not ctx.allow_overwrite:
            if os.path.exists(f):
                raise Error('%s already exists; delete or pass '
                            '--allow-overwrite to overwrite' % f)
        dirname = os.path.dirname(f)
        if dirname and not os.path.exists(dirname):
            fname = os.path.basename(f)
            logger.info(f'Creating directory {dirname} for {fname}')
            try:
                # use makedirs to create intermediate missing dirs
                os.makedirs(dirname, 0o755)
            except PermissionError:
                raise Error(f'Unable to create {dirname} due to permissions failure. Retry with root, or sudo or preallocate the directory.')

    (user_conf, _) = get_config_and_keyring(ctx)

    if not ctx.skip_prepare_host:
        command_prepare_host(ctx)
    else:
        logger.info('Skip prepare_host')

    # initial vars
    fsid = ctx.fsid or make_fsid()
    if not is_fsid(fsid):
        raise Error('not an fsid: %s' % fsid)
    logger.info('Cluster fsid: %s' % fsid)

    hostname = get_hostname()
    if '.' in hostname and not ctx.allow_fqdn_hostname:
        raise Error('hostname is a fully qualified domain name (%s); either fix (e.g., "sudo hostname %s" or similar) or pass --allow-fqdn-hostname' % (hostname, hostname.split('.')[0]))
    mon_id = ctx.mon_id or hostname
    mgr_id = ctx.mgr_id or generate_service_id()

    lock = FileLock(ctx, fsid)
    lock.acquire()

    (addr_arg, ipv6, mon_network) = prepare_mon_addresses(ctx)
    cluster_network, ipv6_cluster_network = prepare_cluster_network(ctx)

    config = prepare_bootstrap_config(ctx, fsid, addr_arg, ctx.image)

    if not ctx.skip_pull:
        _pull_image(ctx, ctx.image)

    image_ver = CephContainer(ctx, ctx.image, 'ceph', ['--version']).run().strip()
    logger.info(f'Ceph version: {image_ver}')

    if not ctx.allow_mismatched_release:
        # NOTE(review): assumes `ceph --version` output keeps the release
        # name as the fifth whitespace-separated token
        image_release = image_ver.split()[4]
        if image_release not in \
                [DEFAULT_IMAGE_RELEASE, LATEST_STABLE_RELEASE]:
            raise Error(
                f'Container release {image_release} != cephadm release {DEFAULT_IMAGE_RELEASE};'
                ' please use matching version of cephadm (pass --allow-mismatched-release to continue anyway)'
            )

    logger.info('Extracting ceph user uid/gid from container image...')
    (uid, gid) = extract_uid_gid(ctx)

    # create some initial keys
    (mon_key, mgr_key, admin_key, bootstrap_keyring, admin_keyring) = \
        create_initial_keys(ctx, uid, gid, mgr_id)

    monmap = create_initial_monmap(ctx, uid, gid, fsid, mon_id, addr_arg)
    (mon_dir, log_dir) = \
        prepare_create_mon(ctx, uid, gid, fsid, mon_id,
                           bootstrap_keyring.name, monmap.name)

    # mon config is owned by the ceph uid/gid and readable only by owner
    with open(mon_dir + '/config', 'w') as f:
        os.fchown(f.fileno(), uid, gid)
        os.fchmod(f.fileno(), 0o600)
        f.write(config)

    make_var_run(ctx, fsid, uid, gid)
    create_mon(ctx, uid, gid, fsid, mon_id)

    # config to issue various CLI commands
    tmp_config = write_tmp(config, uid, gid)

    # a CLI helper to reduce our typing
    # NOTE(review): the mutable default for extra_mounts is only iterated,
    # never mutated, so the shared-default pitfall does not bite here
    def cli(cmd, extra_mounts={}, timeout=DEFAULT_TIMEOUT):
        # type: (List[str], Dict[str, str], Optional[int]) -> str
        mounts = {
            log_dir: '/var/log/ceph:z',
            admin_keyring.name: '/etc/ceph/ceph.client.admin.keyring:z',
            tmp_config.name: '/etc/ceph/ceph.conf:z',
        }
        for k, v in extra_mounts.items():
            mounts[k] = v
        timeout = timeout or ctx.timeout
        return CephContainer(
            ctx,
            image=ctx.image,
            entrypoint='/usr/bin/ceph',
            args=cmd,
            volume_mounts=mounts,
        ).run(timeout=timeout)

    wait_for_mon(ctx, mon_id, mon_dir, admin_keyring.name, tmp_config.name)

    finish_bootstrap_config(ctx, fsid, config, mon_id, mon_dir,
                            mon_network, ipv6, cli,
                            cluster_network, ipv6_cluster_network)

    # output files
    with open(ctx.output_keyring, 'w') as f:
        os.fchmod(f.fileno(), 0o600)
        f.write('[client.admin]\n'
                '\tkey = ' + admin_key + '\n')
    logger.info('Wrote keyring to %s' % ctx.output_keyring)

    # create mgr
    create_mgr(ctx, uid, gid, fsid, mgr_id, mgr_key, config, cli)

    if user_conf:
        # user given config settings were already assimilated earlier
        # but if the given settings contained any attributes in
        # the mgr (e.g. mgr/cephadm/container_image_prometheus)
        # they don't seem to be stored if there isn't a mgr yet.
        # Since re-assimilating the same conf settings should be
        # idempotent we can just do it again here.
        with tempfile.NamedTemporaryFile(buffering=0) as tmp:
            tmp.write(user_conf.encode('utf-8'))
            cli(['config', 'assimilate-conf',
                 '-i', '/var/lib/ceph/user.conf'],
                {tmp.name: '/var/lib/ceph/user.conf:z'})

    # wait for mgr to restart (after enabling a module)
    def wait_for_mgr_restart():
        # first get latest mgrmap epoch from the mon. try newer 'mgr
        # stat' command first, then fall back to 'mgr dump' if
        # necessary
        try:
            j = json_loads_retry(lambda: cli(['mgr', 'stat']))
        except Exception:
            j = json_loads_retry(lambda: cli(['mgr', 'dump']))
        epoch = j['epoch']

        # wait for mgr to have it
        logger.info('Waiting for the mgr to restart...')

        def mgr_has_latest_epoch():
            # type: () -> bool
            try:
                out = cli(['tell', 'mgr', 'mgr_status'])
                j = json.loads(out)
                return j['mgrmap_epoch'] >= epoch
            except Exception as e:
                logger.debug('tell mgr mgr_status failed: %s' % e)
                return False
        is_available(ctx, 'mgr epoch %d' % epoch, mgr_has_latest_epoch)

    enable_cephadm_mgr_module(cli, wait_for_mgr_restart)

    # ssh
    if not ctx.skip_ssh:
        prepare_ssh(ctx, cli, wait_for_mgr_restart)

    # store registry credentials in the mgr so other hosts can pull images
    if ctx.registry_url and ctx.registry_username and ctx.registry_password:
        cli(['config', 'set', 'mgr', 'mgr/cephadm/registry_url', ctx.registry_url, '--force'])
        cli(['config', 'set', 'mgr', 'mgr/cephadm/registry_username', ctx.registry_username, '--force'])
        cli(['config', 'set', 'mgr', 'mgr/cephadm/registry_password', ctx.registry_password, '--force'])

    cli(['config', 'set', 'mgr', 'mgr/cephadm/container_init', str(ctx.container_init), '--force'])

    if ctx.with_exporter:
        cli(['config-key', 'set', 'mgr/cephadm/exporter_enabled', 'true'])
        if ctx.exporter_config:
            logger.info('Applying custom cephadm exporter settings')
            # validated within the parser, so we can just apply to the store
            with tempfile.NamedTemporaryFile(buffering=0) as tmp:
                tmp.write(json.dumps(ctx.exporter_config).encode('utf-8'))
                mounts = {
                    tmp.name: '/tmp/exporter-config.json:z'
                }
                cli(['cephadm', 'set-exporter-config', '-i', '/tmp/exporter-config.json'], extra_mounts=mounts)
                logger.info('-> Use ceph orch apply cephadm-exporter to deploy')
        else:
            # generate a default SSL configuration for the exporter(s)
            logger.info('Generating a default cephadm exporter configuration (self-signed)')
            cli(['cephadm', 'generate-exporter-config'])
        #
        # deploy the service (commented out until the cephadm changes are in the ceph container build)
        logger.info('Deploying cephadm exporter service with default placement...')
        cli(['orch', 'apply', 'cephadm-exporter'])

    if not ctx.skip_dashboard:
        prepare_dashboard(ctx, uid, gid, cli, wait_for_mgr_restart)

    if ctx.output_config == '/etc/ceph/ceph.conf' and not ctx.skip_admin_label:
        logger.info('Enabling client.admin keyring and conf on hosts with "admin" label')
        try:
            cli(['orch', 'client-keyring', 'set', 'client.admin', 'label:_admin'])
            cli(['orch', 'host', 'label', 'add', get_hostname(), '_admin'])
        except Exception:
            logger.info('Unable to set up "admin" label; assuming older version of Ceph')

    if ctx.apply_spec:
        logger.info('Applying %s to cluster' % ctx.apply_spec)

        # copy our ssh key to every other host named in the spec so the
        # orchestrator can reach it
        with open(ctx.apply_spec) as f:
            for line in f:
                if 'hostname:' in line:
                    line = line.replace('\n', '')
                    split = line.split(': ')
                    if split[1] != hostname:
                        logger.info('Adding ssh key to %s' % split[1])

                        ssh_key = '/etc/ceph/ceph.pub'
                        if ctx.ssh_public_key:
                            ssh_key = ctx.ssh_public_key.name
                        out, err, code = call_throws(ctx, ['sudo', '-u', ctx.ssh_user, 'ssh-copy-id', '-f', '-i', ssh_key, '-o StrictHostKeyChecking=no', '%s@%s' % (ctx.ssh_user, split[1])])

        mounts = {}
        mounts[pathify(ctx.apply_spec)] = '/tmp/spec.yml:z'

        out = cli(['orch', 'apply', '-i', '/tmp/spec.yml'], extra_mounts=mounts)
        logger.info(out)

    logger.info('You can access the Ceph CLI with:\n\n'
                '\tsudo %s shell --fsid %s -c %s -k %s\n' % (
                    sys.argv[0],
                    fsid,
                    ctx.output_config,
                    ctx.output_keyring))
    logger.info('Please consider enabling telemetry to help improve Ceph:\n\n'
                '\tceph telemetry on\n\n'
                'For more information see:\n\n'
                '\thttps://docs.ceph.com/docs/pacific/mgr/telemetry/\n')
    logger.info('Bootstrap complete.')
    return 0
4217
4218 ##################################
4219
4220
def command_registry_login(ctx: CephadmContext):
    """Log into a custom container registry.

    Credentials come either from a --registry-json file or from the
    --registry-url/--registry-username/--registry-password options;
    raises Error when neither source is complete.
    """
    if ctx.registry_json:
        logger.info('Pulling custom registry login info from %s.' % ctx.registry_json)
        d = get_parm(ctx.registry_json)
        # all three fields must be present in the json
        if not (d.get('url') and d.get('username') and d.get('password')):
            raise Error('json provided for custom registry login did not include all necessary fields. '
                        'Please setup json file as\n'
                        '{\n'
                        ' "url": "REGISTRY_URL",\n'
                        ' "username": "REGISTRY_USERNAME",\n'
                        ' "password": "REGISTRY_PASSWORD"\n'
                        '}\n')
        ctx.registry_url = d.get('url')
        ctx.registry_username = d.get('username')
        ctx.registry_password = d.get('password')
        registry_login(ctx, ctx.registry_url, ctx.registry_username, ctx.registry_password)
    elif ctx.registry_url and ctx.registry_username and ctx.registry_password:
        registry_login(ctx, ctx.registry_url, ctx.registry_username, ctx.registry_password)
    else:
        raise Error('Invalid custom registry arguments received. To login to a custom registry include '
                    '--registry-url, --registry-username and --registry-password '
                    'options or --registry-json option')
    return 0
4245
4246
def registry_login(ctx: CephadmContext, url, username, password):
    """Log the local container engine into the registry at `url`.

    For podman, credentials are persisted to /etc/ceph/podman-auth.json,
    which is then restricted to owner access since it holds secrets.

    Raises Error if the login command fails.
    """
    logger.info('Logging into custom registry.')
    try:
        engine = ctx.container_engine
        cmd = [engine.path, 'login',
               '-u', username, '-p', password,
               url]
        if isinstance(engine, Podman):
            cmd.append('--authfile=/etc/ceph/podman-auth.json')
        out, _, _ = call_throws(ctx, cmd)
        if isinstance(engine, Podman):
            # the auth file contains credentials; make it owner-only
            os.chmod('/etc/ceph/podman-auth.json', 0o600)
    except Exception as e:
        # report the values used for *this* attempt (the function args),
        # not whatever happens to be set on ctx, and keep the cause chained
        raise Error('Failed to login to custom registry @ %s as %s with given password' % (url, username)) from e
4261
4262 ##################################
4263
4264
def extract_uid_gid_monitoring(ctx, daemon_type):
    # type: (CephadmContext, str) -> Tuple[int, int]
    """Return the (uid, gid) a monitoring-stack daemon should run as.

    node-exporter always uses the fixed 'nobody' ids; the other daemon
    types delegate to extract_uid_gid with a daemon-specific probe path.
    Raises Error for an unsupported daemon type.
    """
    if daemon_type == 'node-exporter':
        return 65534, 65534
    probe_path = {
        'prometheus': '/etc/prometheus',
        'grafana': '/var/lib/grafana',
        'alertmanager': ['/etc/alertmanager', '/etc/prometheus'],
    }
    if daemon_type not in probe_path:
        raise Error('{} not implemented yet'.format(daemon_type))
    uid, gid = extract_uid_gid(ctx, file_path=probe_path[daemon_type])
    return uid, gid
4279
4280
@default_image
def command_deploy(ctx):
    # type: (CephadmContext) -> None
    """Deploy a single daemon (ctx.name is '<type>.<id>') on this host.

    Distinguishes a fresh deploy from a redeploy (unit/container already
    running) and a reconfig (ctx.reconfig), then dispatches per daemon type:
    ceph daemons, monitoring stack, NFS ganesha, iSCSI, HAproxy, keepalived,
    custom containers and the cephadm exporter each use their own uid/gid
    and config/keyring handling.  Raises Error for unknown daemon types.
    """
    daemon_type, daemon_id = ctx.name.split('.', 1)

    lock = FileLock(ctx, ctx.fsid)
    lock.acquire()

    if daemon_type not in get_supported_daemons():
        raise Error('daemon type %s not recognized' % daemon_type)

    # a daemon already running (via systemd or as a container) is a redeploy
    redeploy = False
    unit_name = get_unit_name(ctx.fsid, daemon_type, daemon_id)
    container_name = 'ceph-%s-%s.%s' % (ctx.fsid, daemon_type, daemon_id)
    (_, state, _) = check_unit(ctx, unit_name)
    if state == 'running' or is_container_running(ctx, container_name):
        redeploy = True

    if ctx.reconfig:
        logger.info('%s daemon %s ...' % ('Reconfig', ctx.name))
    elif redeploy:
        logger.info('%s daemon %s ...' % ('Redeploy', ctx.name))
    else:
        logger.info('%s daemon %s ...' % ('Deploy', ctx.name))

    # Get and check ports explicitly required to be opened
    daemon_ports = []  # type: List[int]

    # only check port in use if not reconfig or redeploy since service
    # we are redeploying/reconfiguring will already be using the port
    if not ctx.reconfig and not redeploy:
        if ctx.tcp_ports:
            daemon_ports = list(map(int, ctx.tcp_ports.split()))

    if daemon_type in Ceph.daemons:
        config, keyring = get_config_and_keyring(ctx)
        uid, gid = extract_uid_gid(ctx)
        make_var_run(ctx, ctx.fsid, uid, gid)

        c = get_container(ctx, ctx.fsid, daemon_type, daemon_id,
                          ptrace=ctx.allow_ptrace)
        deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
                      config=config, keyring=keyring,
                      osd_fsid=ctx.osd_fsid,
                      reconfig=ctx.reconfig,
                      ports=daemon_ports)

    elif daemon_type in Monitoring.components:
        # monitoring daemon - prometheus, grafana, alertmanager, node-exporter
        # Default Checks
        # make sure provided config-json is sufficient
        config = get_parm(ctx.config_json)  # type: ignore
        required_files = Monitoring.components[daemon_type].get('config-json-files', list())
        required_args = Monitoring.components[daemon_type].get('config-json-args', list())
        if required_files:
            if not config or not all(c in config.get('files', {}).keys() for c in required_files):  # type: ignore
                raise Error('{} deployment requires config-json which must '
                            'contain file content for {}'.format(daemon_type.capitalize(), ', '.join(required_files)))
        if required_args:
            if not config or not all(c in config.keys() for c in required_args):  # type: ignore
                raise Error('{} deployment requires config-json which must '
                            'contain arg for {}'.format(daemon_type.capitalize(), ', '.join(required_args)))

        uid, gid = extract_uid_gid_monitoring(ctx, daemon_type)
        c = get_container(ctx, ctx.fsid, daemon_type, daemon_id)
        deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
                      reconfig=ctx.reconfig,
                      ports=daemon_ports)

    elif daemon_type == NFSGanesha.daemon_type:
        # fall back to ganesha's default ports when none were requested
        if not ctx.reconfig and not redeploy and not daemon_ports:
            daemon_ports = list(NFSGanesha.port_map.values())

        config, keyring = get_config_and_keyring(ctx)
        # TODO: extract ganesha uid/gid (997, 994) ?
        uid, gid = extract_uid_gid(ctx)
        c = get_container(ctx, ctx.fsid, daemon_type, daemon_id)
        deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
                      config=config, keyring=keyring,
                      reconfig=ctx.reconfig,
                      ports=daemon_ports)

    elif daemon_type == CephIscsi.daemon_type:
        config, keyring = get_config_and_keyring(ctx)
        uid, gid = extract_uid_gid(ctx)
        c = get_container(ctx, ctx.fsid, daemon_type, daemon_id)
        deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
                      config=config, keyring=keyring,
                      reconfig=ctx.reconfig,
                      ports=daemon_ports)

    elif daemon_type == HAproxy.daemon_type:
        haproxy = HAproxy.init(ctx, ctx.fsid, daemon_id)
        uid, gid = haproxy.extract_uid_gid_haproxy()
        c = get_container(ctx, ctx.fsid, daemon_type, daemon_id)
        deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
                      reconfig=ctx.reconfig,
                      ports=daemon_ports)

    elif daemon_type == Keepalived.daemon_type:
        keepalived = Keepalived.init(ctx, ctx.fsid, daemon_id)
        uid, gid = keepalived.extract_uid_gid_keepalived()
        c = get_container(ctx, ctx.fsid, daemon_type, daemon_id)
        deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
                      reconfig=ctx.reconfig,
                      ports=daemon_ports)

    elif daemon_type == CustomContainer.daemon_type:
        cc = CustomContainer.init(ctx, ctx.fsid, daemon_id)
        if not ctx.reconfig and not redeploy:
            daemon_ports.extend(cc.ports)
        c = get_container(ctx, ctx.fsid, daemon_type, daemon_id,
                          privileged=cc.privileged,
                          ptrace=ctx.allow_ptrace)
        deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c,
                      uid=cc.uid, gid=cc.gid, config=None,
                      keyring=None, reconfig=ctx.reconfig,
                      ports=daemon_ports)

    elif daemon_type == CephadmDaemon.daemon_type:
        # get current user gid and uid
        uid = os.getuid()
        gid = os.getgid()
        config_js = get_parm(ctx.config_json)  # type: Dict[str, str]
        if not daemon_ports:
            logger.info('cephadm-exporter will use default port ({})'.format(CephadmDaemon.default_port))
            daemon_ports = [CephadmDaemon.default_port]

        CephadmDaemon.validate_config(config_js)

        deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, None,
                      uid, gid, ports=daemon_ports)

    else:
        raise Error('daemon type {} not implemented in command_deploy function'
                    .format(daemon_type))
4417
4418 ##################################
4419
4420
@infer_image
def command_run(ctx):
    # type: (CephadmContext) -> int
    """Run a daemon's container in the foreground; return its exit code."""
    daemon_type, daemon_id = ctx.name.split('.', 1)
    container = get_container(ctx, ctx.fsid, daemon_type, daemon_id)
    return call_timeout(ctx, container.run_cmd(), ctx.timeout)
4428
4429 ##################################
4430
4431
def fsid_conf_mismatch(ctx):
    # type: (CephadmContext) -> bool
    """Return True if the loaded config has an fsid line that disagrees
    with ctx.fsid."""
    config, _ = get_config_and_keyring(ctx)
    if not config:
        return False
    expected = 'fsid = ' + ctx.fsid
    for raw_line in config.split('\n'):
        line = raw_line.strip()
        if 'fsid = ' in line and line != expected:
            return True
    return False
4441
4442
@infer_fsid
@infer_config
@infer_image
def command_shell(ctx):
    # type: (CephadmContext) -> int
    """Open an interactive shell (or run ctx.command) in a ceph container
    with the cluster's config, keyring and daemon mounts available.

    Returns the exit code of the containerized command.
    """
    if fsid_conf_mismatch(ctx):
        raise Error('fsid does not match ceph conf')

    if ctx.fsid:
        make_log_dir(ctx, ctx.fsid)
    if ctx.name:
        if '.' in ctx.name:
            (daemon_type, daemon_id) = ctx.name.split('.', 1)
        else:
            daemon_type = ctx.name
            daemon_id = None
    else:
        daemon_type = 'osd'  # get the most mounts
        daemon_id = None

    if daemon_id and not ctx.fsid:
        raise Error('must pass --fsid to specify cluster')

    # use /etc/ceph files by default, if present. we do this instead of
    # making these defaults in the arg parser because we don't want an error
    # if they don't exist.
    if not ctx.keyring and os.path.exists(SHELL_DEFAULT_KEYRING):
        ctx.keyring = SHELL_DEFAULT_KEYRING

    container_args: List[str] = ['-i']
    mounts = get_container_mounts(ctx, ctx.fsid, daemon_type, daemon_id,
                                  no_config=True if ctx.config else False)
    binds = get_container_binds(ctx, ctx.fsid, daemon_type, daemon_id)
    if ctx.config:
        mounts[pathify(ctx.config)] = '/etc/ceph/ceph.conf:z'
    if ctx.keyring:
        mounts[pathify(ctx.keyring)] = '/etc/ceph/ceph.keyring:z'
    if ctx.mount:
        # each --mount is 'src[:dst[:z]]'; default dst is /mnt/<basename>
        for _mount in ctx.mount:
            split_src_dst = _mount.split(':')
            mount = pathify(split_src_dst[0])
            filename = os.path.basename(split_src_dst[0])
            if len(split_src_dst) > 1:
                dst = split_src_dst[1] + ':z' if len(split_src_dst) == 3 else split_src_dst[1]
                mounts[mount] = dst
            else:
                mounts[mount] = '/mnt/{}:z'.format(filename)
    if ctx.command:
        command = ctx.command
    else:
        # interactive shell: allocate a tty, set a recognizable prompt, and
        # give root a persistent home dir under the cluster data dir
        command = ['bash']
        container_args += [
            '-t',
            '-e', 'LANG=C',
            '-e', 'PS1=%s' % CUSTOM_PS1,
        ]
        if ctx.fsid:
            home = os.path.join(ctx.data_dir, ctx.fsid, 'home')
            if not os.path.exists(home):
                logger.debug('Creating root home at %s' % home)
                makedirs(home, 0, 0, 0o660)
            if os.path.exists('/etc/skel'):
                for f in os.listdir('/etc/skel'):
                    if f.startswith('.bash'):
                        shutil.copyfile(os.path.join('/etc/skel', f),
                                        os.path.join(home, f))
            mounts[home] = '/root'

    for i in ctx.volume:
        a, b = i.split(':', 1)
        mounts[a] = b

    c = CephContainer(
        ctx,
        image=ctx.image,
        entrypoint='doesnotmatter',
        args=[],
        container_args=container_args,
        volume_mounts=mounts,
        bind_mounts=binds,
        envs=ctx.env,
        privileged=True)
    command = c.shell_cmd(command)

    return call_timeout(ctx, command, ctx.timeout)
4528
4529 ##################################
4530
4531
@infer_fsid
def command_enter(ctx):
    # type: (CephadmContext) -> int
    """Exec ctx.command (or an interactive 'sh') inside a running daemon's
    container; return the exit code."""
    if not ctx.fsid:
        raise Error('must pass --fsid to specify cluster')
    daemon_type, daemon_id = ctx.name.split('.', 1)
    container_args = ['-i']  # type: List[str]
    if ctx.command:
        command = ctx.command
    else:
        # no command given: interactive shell with a tty and friendly prompt
        command = ['sh']
        container_args += [
            '-t',
            '-e', 'LANG=C',
            '-e', 'PS1=%s' % CUSTOM_PS1,
        ]
    target = CephContainer(
        ctx,
        image=ctx.image,
        entrypoint='doesnotmatter',
        container_args=container_args,
        cname='ceph-%s-%s.%s' % (ctx.fsid, daemon_type, daemon_id),
    )
    return call_timeout(ctx, target.exec_cmd(command), ctx.timeout)
4557
4558 ##################################
4559
4560
@infer_fsid
@infer_image
def command_ceph_volume(ctx):
    # type: (CephadmContext) -> None
    """Run a ceph-volume subcommand in a privileged container, mounting the
    cluster config and bootstrap-osd keyring when available, and print its
    output on success."""
    if ctx.fsid:
        make_log_dir(ctx, ctx.fsid)

        lock = FileLock(ctx, ctx.fsid)
        lock.acquire()

    # ceph-volume runs as root
    uid, gid = 0, 0

    mounts = get_container_mounts(ctx, ctx.fsid, 'osd', None)

    config, keyring = get_config_and_keyring(ctx)

    # NOTE: keep references to the temp files so they stay alive until the
    # container invocation below — presumably they are removed on collection
    tmp_config = None
    tmp_keyring = None
    if config:
        tmp_config = write_tmp(config, uid, gid)
        mounts[tmp_config.name] = '/etc/ceph/ceph.conf:z'
    if keyring:
        tmp_keyring = write_tmp(keyring, uid, gid)
        mounts[tmp_keyring.name] = '/var/lib/ceph/bootstrap-osd/ceph.keyring:z'

    container = CephContainer(
        ctx,
        image=ctx.image,
        entrypoint='/usr/sbin/ceph-volume',
        envs=ctx.env,
        args=ctx.command,
        privileged=True,
        volume_mounts=mounts,
    )

    out, err, code = call_throws(ctx, container.run_cmd())
    if not code:
        print(out)
4602
4603 ##################################
4604
4605
@infer_fsid
def command_unit(ctx):
    # type: (CephadmContext) -> None
    """Pass a systemctl subcommand (start/stop/restart/...) through to the
    named daemon's systemd unit."""
    if not ctx.fsid:
        raise Error('must pass --fsid to specify cluster')

    unit_name = get_unit_name_by_daemon_name(ctx, ctx.fsid, ctx.name)

    call_throws(
        ctx,
        ['systemctl', ctx.command, unit_name],
        verbosity=CallVerbosity.VERBOSE,
        desc='')
4621
4622 ##################################
4623
4624
@infer_fsid
def command_logs(ctx):
    # type: (CephadmContext) -> None
    """Show journalctl output for the named daemon's systemd unit, passing
    any extra ctx.command args straight through to journalctl."""
    if not ctx.fsid:
        raise Error('must pass --fsid to specify cluster')

    unit_name = get_unit_name_by_daemon_name(ctx, ctx.fsid, ctx.name)

    cmd = [find_program('journalctl'), '-u', unit_name]
    if ctx.command:
        cmd += ctx.command

    # call this directly, without our wrapper, so that we get an unmolested
    # stdout with logger prefixing.
    logger.debug('Running command: %s' % ' '.join(cmd))
    subprocess.call(cmd)  # type: ignore
4642
4643 ##################################
4644
4645
def list_networks(ctx):
    # type: (CephadmContext) -> Dict[str,Dict[str,List[str]]]
    """Return {network: {interface: [ips]}} for local IPv4 and IPv6 routes.

    sadly, 18.04's iproute2 4.15.0-2ubun doesn't support the -j flag,
    so we'll need to use a regex to parse 'ip' command output.
    """
    networks = _list_ipv4_networks(ctx)
    networks.update(_list_ipv6_networks(ctx))
    return networks
4659
4660
def _list_ipv4_networks(ctx: CephadmContext):
    """Return {network: {interface: [source ips]}} parsed from `ip route ls`.

    Raises FileNotFoundError if the 'ip' utility cannot be located.
    """
    # shutil.which is the supported stdlib replacement for the
    # deprecated distutils.spawn.find_executable
    execstr: Optional[str] = shutil.which('ip')
    if not execstr:
        raise FileNotFoundError("unable to find 'ip' command")
    out, _, _ = call_throws(ctx, [execstr, 'route', 'ls'])
    return _parse_ipv4_route(out)
4667
4668
4669 def _parse_ipv4_route(out):
4670 r = {} # type: Dict[str,Dict[str,List[str]]]
4671 p = re.compile(r'^(\S+) dev (\S+) (.*)scope link (.*)src (\S+)')
4672 for line in out.splitlines():
4673 m = p.findall(line)
4674 if not m:
4675 continue
4676 net = m[0][0]
4677 iface = m[0][1]
4678 ip = m[0][4]
4679 if net not in r:
4680 r[net] = {}
4681 if iface not in r[net]:
4682 r[net][iface] = []
4683 r[net][iface].append(ip)
4684 return r
4685
4686
def _list_ipv6_networks(ctx: CephadmContext):
    """Return {network: {interface: [ips]}} parsed from `ip -6 route ls`
    and `ip -6 addr ls` output.

    Raises FileNotFoundError if the 'ip' utility cannot be located.
    """
    # shutil.which is the supported stdlib replacement for the
    # deprecated distutils.spawn.find_executable
    execstr: Optional[str] = shutil.which('ip')
    if not execstr:
        raise FileNotFoundError("unable to find 'ip' command")
    routes, _, _ = call_throws(ctx, [execstr, '-6', 'route', 'ls'])
    ips, _, _ = call_throws(ctx, [execstr, '-6', 'addr', 'ls'])
    return _parse_ipv6_route(routes, ips)
4694
4695
4696 def _parse_ipv6_route(routes, ips):
4697 r = {} # type: Dict[str,Dict[str,List[str]]]
4698 route_p = re.compile(r'^(\S+) dev (\S+) proto (\S+) metric (\S+) .*pref (\S+)$')
4699 ip_p = re.compile(r'^\s+inet6 (\S+)/(.*)scope (.*)$')
4700 iface_p = re.compile(r'^(\d+): (\S+): (.*)$')
4701 for line in routes.splitlines():
4702 m = route_p.findall(line)
4703 if not m or m[0][0].lower() == 'default':
4704 continue
4705 net = m[0][0]
4706 if '/' not in net: # only consider networks with a mask
4707 continue
4708 iface = m[0][1]
4709 if net not in r:
4710 r[net] = {}
4711 if iface not in r[net]:
4712 r[net][iface] = []
4713
4714 iface = None
4715 for line in ips.splitlines():
4716 m = ip_p.findall(line)
4717 if not m:
4718 m = iface_p.findall(line)
4719 if m:
4720 # drop @... suffix, if present
4721 iface = m[0][1].split('@')[0]
4722 continue
4723 ip = m[0][0]
4724 # find the network it belongs to
4725 net = [n for n in r.keys()
4726 if ipaddress.ip_address(ip) in ipaddress.ip_network(n)]
4727 if net:
4728 assert(iface)
4729 r[net[0]][iface].append(ip)
4730
4731 return r
4732
4733
def command_list_networks(ctx):
    # type: (CephadmContext) -> None
    """Print the host's networks/interfaces/IPs as indented JSON."""
    print(json.dumps(list_networks(ctx), indent=4))
4738
4739 ##################################
4740
4741
def command_ls(ctx):
    # type: (CephadmContext) -> None
    """Print a JSON listing of all daemons on this host (legacy and
    cephadm-managed)."""
    daemons = list_daemons(ctx, detail=not ctx.no_detail,
                           legacy_dir=ctx.legacy_dir)
    print(json.dumps(daemons, indent=4))
4747
4748
def with_units_to_int(v: str) -> int:
    """Convert a size string with an optional binary-unit suffix
    (e.g. '100', '10B', '1KiB', '1.5M') to an integer byte count."""
    # strip an 'iB'/'B' byte marker first so only the scale letter remains
    if v.endswith('iB'):
        v = v[:-2]
    elif v.endswith('B'):
        v = v[:-1]
    scales = {
        'K': 1024,
        'M': 1024 ** 2,
        'G': 1024 ** 3,
        'T': 1024 ** 4,
    }
    mult = scales.get(v[-1].upper(), 1)
    if mult != 1:
        v = v[:-1]
    return int(float(v) * mult)
4768
4769
4770 def list_daemons(ctx, detail=True, legacy_dir=None):
4771 # type: (CephadmContext, bool, Optional[str]) -> List[Dict[str, str]]
4772 host_version: Optional[str] = None
4773 ls = []
4774 container_path = ctx.container_engine.path
4775
4776 data_dir = ctx.data_dir
4777 if legacy_dir is not None:
4778 data_dir = os.path.abspath(legacy_dir + data_dir)
4779
4780 # keep track of ceph versions we see
4781 seen_versions = {} # type: Dict[str, Optional[str]]
4782
4783 # keep track of image digests
4784 seen_digests = {} # type: Dict[str, List[str]]
4785
4786 # keep track of memory usage we've seen
4787 seen_memusage = {} # type: Dict[str, int]
4788 out, err, code = call(
4789 ctx,
4790 [container_path, 'stats', '--format', '{{.ID}},{{.MemUsage}}', '--no-stream'],
4791 verbosity=CallVerbosity.DEBUG
4792 )
4793 seen_memusage_cid_len = 0
4794 if not code:
4795 for line in out.splitlines():
4796 (cid, usage) = line.split(',')
4797 (used, limit) = usage.split(' / ')
4798 seen_memusage[cid] = with_units_to_int(used)
4799 if not seen_memusage_cid_len:
4800 seen_memusage_cid_len = len(cid)
4801
4802 # /var/lib/ceph
4803 if os.path.exists(data_dir):
4804 for i in os.listdir(data_dir):
4805 if i in ['mon', 'osd', 'mds', 'mgr']:
4806 daemon_type = i
4807 for j in os.listdir(os.path.join(data_dir, i)):
4808 if '-' not in j:
4809 continue
4810 (cluster, daemon_id) = j.split('-', 1)
4811 fsid = get_legacy_daemon_fsid(ctx,
4812 cluster, daemon_type, daemon_id,
4813 legacy_dir=legacy_dir)
4814 legacy_unit_name = 'ceph-%s@%s' % (daemon_type, daemon_id)
4815 val: Dict[str, Any] = {
4816 'style': 'legacy',
4817 'name': '%s.%s' % (daemon_type, daemon_id),
4818 'fsid': fsid if fsid is not None else 'unknown',
4819 'systemd_unit': legacy_unit_name,
4820 }
4821 if detail:
4822 (val['enabled'], val['state'], _) = \
4823 check_unit(ctx, legacy_unit_name)
4824 if not host_version:
4825 try:
4826 out, err, code = call(ctx,
4827 ['ceph', '-v'],
4828 verbosity=CallVerbosity.DEBUG)
4829 if not code and out.startswith('ceph version '):
4830 host_version = out.split(' ')[2]
4831 except Exception:
4832 pass
4833 val['host_version'] = host_version
4834 ls.append(val)
4835 elif is_fsid(i):
4836 fsid = str(i) # convince mypy that fsid is a str here
4837 for j in os.listdir(os.path.join(data_dir, i)):
4838 if '.' in j and os.path.isdir(os.path.join(data_dir, fsid, j)):
4839 name = j
4840 (daemon_type, daemon_id) = j.split('.', 1)
4841 unit_name = get_unit_name(fsid,
4842 daemon_type,
4843 daemon_id)
4844 else:
4845 continue
4846 val = {
4847 'style': 'cephadm:v1',
4848 'name': name,
4849 'fsid': fsid,
4850 'systemd_unit': unit_name,
4851 }
4852 if detail:
4853 # get container id
4854 (val['enabled'], val['state'], _) = \
4855 check_unit(ctx, unit_name)
4856 container_id = None
4857 image_name = None
4858 image_id = None
4859 image_digests = None
4860 version = None
4861 start_stamp = None
4862
4863 cmd = [
4864 container_path, 'inspect',
4865 '--format', '{{.Id}},{{.Config.Image}},{{.Image}},{{.Created}},{{index .Config.Labels "io.ceph.version"}}',
4866 'ceph-%s-%s' % (fsid, j)
4867 ]
4868 out, err, code = call(ctx, cmd, verbosity=CallVerbosity.DEBUG)
4869 if not code:
4870 (container_id, image_name, image_id, start,
4871 version) = out.strip().split(',')
4872 image_id = normalize_container_id(image_id)
4873 daemon_type = name.split('.', 1)[0]
4874 start_stamp = try_convert_datetime(start)
4875
4876 # collect digests for this image id
4877 image_digests = seen_digests.get(image_id)
4878 if not image_digests:
4879 out, err, code = call(
4880 ctx,
4881 [
4882 container_path, 'image', 'inspect', image_id,
4883 '--format', '{{.RepoDigests}}',
4884 ],
4885 verbosity=CallVerbosity.DEBUG)
4886 if not code:
4887 image_digests = list(set(map(
4888 normalize_image_digest,
4889 out.strip()[1:-1].split(' '))))
4890 seen_digests[image_id] = image_digests
4891
4892 # identify software version inside the container (if we can)
4893 if not version or '.' not in version:
4894 version = seen_versions.get(image_id, None)
4895 if daemon_type == NFSGanesha.daemon_type:
4896 version = NFSGanesha.get_version(ctx, container_id)
4897 if daemon_type == CephIscsi.daemon_type:
4898 version = CephIscsi.get_version(ctx, container_id)
4899 elif not version:
4900 if daemon_type in Ceph.daemons:
4901 out, err, code = call(ctx,
4902 [container_path, 'exec', container_id,
4903 'ceph', '-v'],
4904 verbosity=CallVerbosity.DEBUG)
4905 if not code and \
4906 out.startswith('ceph version '):
4907 version = out.split(' ')[2]
4908 seen_versions[image_id] = version
4909 elif daemon_type == 'grafana':
4910 out, err, code = call(ctx,
4911 [container_path, 'exec', container_id,
4912 'grafana-server', '-v'],
4913 verbosity=CallVerbosity.DEBUG)
4914 if not code and \
4915 out.startswith('Version '):
4916 version = out.split(' ')[1]
4917 seen_versions[image_id] = version
4918 elif daemon_type in ['prometheus',
4919 'alertmanager',
4920 'node-exporter']:
4921 version = Monitoring.get_version(ctx, container_id, daemon_type)
4922 seen_versions[image_id] = version
4923 elif daemon_type == 'haproxy':
4924 out, err, code = call(ctx,
4925 [container_path, 'exec', container_id,
4926 'haproxy', '-v'],
4927 verbosity=CallVerbosity.DEBUG)
4928 if not code and \
4929 out.startswith('HA-Proxy version '):
4930 version = out.split(' ')[2]
4931 seen_versions[image_id] = version
4932 elif daemon_type == 'keepalived':
4933 out, err, code = call(ctx,
4934 [container_path, 'exec', container_id,
4935 'keepalived', '--version'],
4936 verbosity=CallVerbosity.DEBUG)
4937 if not code and \
4938 err.startswith('Keepalived '):
4939 version = err.split(' ')[1]
4940 if version[0] == 'v':
4941 version = version[1:]
4942 seen_versions[image_id] = version
4943 elif daemon_type == CustomContainer.daemon_type:
4944 # Because a custom container can contain
4945 # everything, we do not know which command
4946 # to execute to get the version.
4947 pass
4948 else:
4949 logger.warning('version for unknown daemon type %s' % daemon_type)
4950 else:
4951 vfile = os.path.join(data_dir, fsid, j, 'unit.image') # type: ignore
4952 try:
4953 with open(vfile, 'r') as f:
4954 image_name = f.read().strip() or None
4955 except IOError:
4956 pass
4957
4958 # unit.meta?
4959 mfile = os.path.join(data_dir, fsid, j, 'unit.meta') # type: ignore
4960 try:
4961 with open(mfile, 'r') as f:
4962 meta = json.loads(f.read())
4963 val.update(meta)
4964 except IOError:
4965 pass
4966
4967 val['container_id'] = container_id
4968 val['container_image_name'] = image_name
4969 val['container_image_id'] = image_id
4970 val['container_image_digests'] = image_digests
4971 if container_id:
4972 val['memory_usage'] = seen_memusage.get(container_id[0:seen_memusage_cid_len])
4973 val['version'] = version
4974 val['started'] = start_stamp
4975 val['created'] = get_file_timestamp(
4976 os.path.join(data_dir, fsid, j, 'unit.created')
4977 )
4978 val['deployed'] = get_file_timestamp(
4979 os.path.join(data_dir, fsid, j, 'unit.image'))
4980 val['configured'] = get_file_timestamp(
4981 os.path.join(data_dir, fsid, j, 'unit.configured'))
4982
4983 ls.append(val)
4984
4985 return ls
4986
4987
def get_daemon_description(ctx, fsid, name, detail=False, legacy_dir=None):
    # type: (CephadmContext, str, str, bool, Optional[str]) -> Dict[str, str]
    """Return the `cephadm ls` entry for the daemon `name` in cluster `fsid`.

    Raises Error if no matching daemon exists on this host.
    """
    entries = list_daemons(ctx, detail=detail, legacy_dir=legacy_dir)
    found = next((d for d in entries
                  if d['fsid'] == fsid and d['name'] == name), None)
    if found is None:
        raise Error('Daemon not found: {}. See `cephadm ls`'.format(name))
    return found
4998
4999 ##################################
5000
5001
@default_image
def command_adopt(ctx):
    # type: (CephadmContext) -> None
    """Adopt a single legacy (pre-cephadm) daemon into cephadm management."""

    if not ctx.skip_pull:
        _pull_image(ctx, ctx.image)

    daemon_type, daemon_id = ctx.name.split('.', 1)

    # only 'legacy' style daemons can be adopted
    if ctx.style != 'legacy':
        raise Error('adoption of style %s not implemented' % ctx.style)

    # derive the cluster fsid from the legacy config before locking on it
    fsid = get_legacy_daemon_fsid(ctx,
                                  ctx.cluster,
                                  daemon_type,
                                  daemon_id,
                                  legacy_dir=ctx.legacy_dir)
    if not fsid:
        raise Error('could not detect legacy fsid; set fsid in ceph.conf')
    lock = FileLock(ctx, fsid)
    lock.acquire()

    # dispatch to the daemon-type specific adoption routine
    monitoring_adopters = {
        'prometheus': command_adopt_prometheus,
        'grafana': command_adopt_grafana,
        'alertmanager': command_adopt_alertmanager,
    }
    if daemon_type in Ceph.daemons:
        command_adopt_ceph(ctx, daemon_type, daemon_id, fsid)
    elif daemon_type == 'node-exporter':
        raise Error('adoption of node-exporter not implemented')
    elif daemon_type in monitoring_adopters:
        monitoring_adopters[daemon_type](ctx, daemon_id, fsid)
    else:
        raise Error('daemon type %s not recognized' % daemon_type)
5039
5040
class AdoptOsd(object):
    """Probe helpers used during adoption to find a legacy OSD's fsid and
    objectstore type (online data dir, offline LVM, or offline 'simple')."""

    def __init__(self, ctx, osd_data_dir, osd_id):
        # type: (CephadmContext, str, str) -> None
        self.ctx = ctx
        # legacy OSD data directory, e.g. /var/lib/ceph/osd/<cluster>-<id>
        self.osd_data_dir = osd_data_dir
        self.osd_id = osd_id

    def check_online_osd(self):
        # type: () -> Tuple[Optional[str], Optional[str]]
        """Read fsid and type from a mounted (online) OSD data directory.

        Returns (osd_fsid, osd_type); either may be None if not found.
        """
        osd_fsid, osd_type = None, None

        path = os.path.join(self.osd_data_dir, 'fsid')
        try:
            with open(path, 'r') as f:
                osd_fsid = f.read().strip()
            logger.info('Found online OSD at %s' % path)
        except IOError:
            logger.info('Unable to read OSD fsid from %s' % path)
        if os.path.exists(os.path.join(self.osd_data_dir, 'type')):
            with open(os.path.join(self.osd_data_dir, 'type')) as f:
                osd_type = f.read().strip()
        else:
            logger.info('"type" file missing for OSD data dir')

        return osd_fsid, osd_type

    def check_offline_lvm_osd(self):
        # type: () -> Tuple[Optional[str], Optional[str]]
        """Look the OSD up via containerized `ceph-volume lvm list`.

        Returns (osd_fsid, osd_type); either may be None if not found.
        """
        osd_fsid, osd_type = None, None

        c = CephContainer(
            self.ctx,
            image=self.ctx.image,
            entrypoint='/usr/sbin/ceph-volume',
            args=['lvm', 'list', '--format=json'],
            privileged=True
        )
        out, err, code = call_throws(self.ctx, c.run_cmd())
        if not code:
            try:
                js = json.loads(out)
                if self.osd_id in js:
                    logger.info('Found offline LVM OSD {}'.format(self.osd_id))
                    osd_fsid = js[self.osd_id][0]['tags']['ceph.osd_fsid']
                    # a 'block' device means bluestore, 'data' means filestore
                    for device in js[self.osd_id]:
                        if device['tags']['ceph.type'] == 'block':
                            osd_type = 'bluestore'
                            break
                        if device['tags']['ceph.type'] == 'data':
                            osd_type = 'filestore'
                            break
            except ValueError as e:
                logger.info('Invalid JSON in ceph-volume lvm list: {}'.format(e))

        return osd_fsid, osd_type

    def check_offline_simple_osd(self):
        # type: () -> Tuple[Optional[str], Optional[str]]
        """Look the OSD up via its ceph-volume 'simple' scan file under
        /etc/ceph/osd/; mounts the data dir for bluestore so the adopt can
        move files out of it.

        Returns (osd_fsid, osd_type); either may be None if not found.
        """
        osd_fsid, osd_type = None, None

        osd_file = glob('/etc/ceph/osd/{}-[a-f0-9-]*.json'.format(self.osd_id))
        if len(osd_file) == 1:
            with open(osd_file[0], 'r') as f:
                try:
                    js = json.loads(f.read())
                    logger.info('Found offline simple OSD {}'.format(self.osd_id))
                    osd_fsid = js['fsid']
                    osd_type = js['type']
                    if osd_type != 'filestore':
                        # need this to be mounted for the adopt to work, as it
                        # needs to move files from this directory
                        call_throws(self.ctx, ['mount', js['data']['path'], self.osd_data_dir])
                except ValueError as e:
                    logger.info('Invalid JSON in {}: {}'.format(osd_file, e))

        return osd_fsid, osd_type
5118
5119
def command_adopt_ceph(ctx, daemon_type, daemon_id, fsid):
    # type: (CephadmContext, str, str, str) -> None
    """Adopt a legacy ceph daemon: stop/disable its legacy systemd unit,
    move its data and logs into the cephadm layout, and deploy a
    containerized replacement unit."""

    (uid, gid) = extract_uid_gid(ctx)

    data_dir_src = ('/var/lib/ceph/%s/%s-%s' %
                    (daemon_type, ctx.cluster, daemon_id))
    data_dir_src = os.path.abspath(ctx.legacy_dir + data_dir_src)

    if not os.path.exists(data_dir_src):
        raise Error("{}.{} data directory '{}' does not exist. "
                    'Incorrect ID specified, or daemon already adopted?'.format(
                        daemon_type, daemon_id, data_dir_src))

    osd_fsid = None
    if daemon_type == 'osd':
        # probe online first, then offline lvm, then offline 'simple'
        adopt_osd = AdoptOsd(ctx, data_dir_src, daemon_id)
        osd_fsid, osd_type = adopt_osd.check_online_osd()
        if not osd_fsid:
            osd_fsid, osd_type = adopt_osd.check_offline_lvm_osd()
        if not osd_fsid:
            osd_fsid, osd_type = adopt_osd.check_offline_simple_osd()
        if not osd_fsid:
            raise Error('Unable to find OSD {}'.format(daemon_id))
        logger.info('objectstore_type is %s' % osd_type)
        assert osd_type
        if osd_type == 'filestore':
            raise Error('FileStore is not supported by cephadm')

    # NOTE: implicit assumption here that the units correspond to the
    # cluster we are adopting based on the /etc/{defaults,sysconfig}/ceph
    # CLUSTER field.
    unit_name = 'ceph-%s@%s' % (daemon_type, daemon_id)
    (enabled, state, _) = check_unit(ctx, unit_name)
    if state == 'running':
        logger.info('Stopping old systemd unit %s...' % unit_name)
        call_throws(ctx, ['systemctl', 'stop', unit_name])
    if enabled:
        logger.info('Disabling old systemd unit %s...' % unit_name)
        call_throws(ctx, ['systemctl', 'disable', unit_name])

    # data
    logger.info('Moving data...')
    data_dir_dst = make_data_dir(ctx, fsid, daemon_type, daemon_id,
                                 uid=uid, gid=gid)
    move_files(ctx, glob(os.path.join(data_dir_src, '*')),
               data_dir_dst,
               uid=uid, gid=gid)
    logger.debug('Remove dir `%s`' % (data_dir_src))
    if os.path.ismount(data_dir_src):
        call_throws(ctx, ['umount', data_dir_src])
    os.rmdir(data_dir_src)

    logger.info('Chowning content...')
    call_throws(ctx, ['chown', '-c', '-R', '%d.%d' % (uid, gid), data_dir_dst])

    if daemon_type == 'mon':
        # rename *.ldb -> *.sst, in case they are coming from ubuntu
        store = os.path.join(data_dir_dst, 'store.db')
        num_renamed = 0
        if os.path.exists(store):
            for oldf in os.listdir(store):
                if oldf.endswith('.ldb'):
                    newf = oldf.replace('.ldb', '.sst')
                    oldp = os.path.join(store, oldf)
                    newp = os.path.join(store, newf)
                    logger.debug('Renaming %s -> %s' % (oldp, newp))
                    os.rename(oldp, newp)
                    # BUGFIX: the counter was never incremented, so the
                    # summary log below could never fire
                    num_renamed += 1
        if num_renamed:
            logger.info('Renamed %d leveldb *.ldb files to *.sst',
                        num_renamed)
    if daemon_type == 'osd':
        for n in ['block', 'block.db', 'block.wal']:
            p = os.path.join(data_dir_dst, n)
            if os.path.exists(p):
                logger.info('Chowning %s...' % p)
                os.chown(p, uid, gid)
        # disable the ceph-volume 'simple' mode files on the host
        simple_fn = os.path.join('/etc/ceph/osd',
                                 '%s-%s.json' % (daemon_id, osd_fsid))
        if os.path.exists(simple_fn):
            new_fn = simple_fn + '.adopted-by-cephadm'
            logger.info('Renaming %s -> %s', simple_fn, new_fn)
            os.rename(simple_fn, new_fn)
            logger.info('Disabling host unit ceph-volume@ simple unit...')
            call(ctx, ['systemctl', 'disable',
                       'ceph-volume@simple-%s-%s.service' % (daemon_id, osd_fsid)])
        else:
            # assume this is an 'lvm' c-v for now, but don't error
            # out if it's not.
            logger.info('Disabling host unit ceph-volume@ lvm unit...')
            call(ctx, ['systemctl', 'disable',
                       'ceph-volume@lvm-%s-%s.service' % (daemon_id, osd_fsid)])

    # config
    config_src = '/etc/ceph/%s.conf' % (ctx.cluster)
    config_src = os.path.abspath(ctx.legacy_dir + config_src)
    config_dst = os.path.join(data_dir_dst, 'config')
    copy_files(ctx, [config_src], config_dst, uid=uid, gid=gid)

    # logs
    logger.info('Moving logs...')
    log_dir_src = ('/var/log/ceph/%s-%s.%s.log*' %
                   (ctx.cluster, daemon_type, daemon_id))
    log_dir_src = os.path.abspath(ctx.legacy_dir + log_dir_src)
    log_dir_dst = make_log_dir(ctx, fsid, uid=uid, gid=gid)
    move_files(ctx, glob(log_dir_src),
               log_dir_dst,
               uid=uid, gid=gid)

    logger.info('Creating new units...')
    make_var_run(ctx, fsid, uid, gid)
    c = get_container(ctx, fsid, daemon_type, daemon_id)
    deploy_daemon_units(ctx, fsid, uid, gid, daemon_type, daemon_id, c,
                        enable=True,  # unconditionally enable the new unit
                        start=(state == 'running' or ctx.force_start),
                        osd_fsid=osd_fsid)
    update_firewalld(ctx, daemon_type)
5238
5239
def command_adopt_prometheus(ctx, daemon_id, fsid):
    # type: (CephadmContext, str, str) -> None
    """Adopt a legacy prometheus: stop it, copy config and metrics into the
    cephadm layout, and deploy a containerized replacement."""
    daemon_type = 'prometheus'
    (uid, gid) = extract_uid_gid_monitoring(ctx, daemon_type)

    _stop_and_disable(ctx, 'prometheus')

    data_dir_dst = make_data_dir(ctx, fsid, daemon_type, daemon_id,
                                 uid=uid, gid=gid)

    # config
    config_src = os.path.abspath(ctx.legacy_dir + '/etc/prometheus/prometheus.yml')
    config_dst = os.path.join(data_dir_dst, 'etc/prometheus')
    makedirs(config_dst, uid, gid, 0o755)
    copy_files(ctx, [config_src], config_dst, uid=uid, gid=gid)

    # data
    data_src = os.path.abspath(ctx.legacy_dir + '/var/lib/prometheus/metrics/')
    data_dst = os.path.join(data_dir_dst, 'data')
    copy_tree(ctx, [data_src], data_dst, uid=uid, gid=gid)

    make_var_run(ctx, fsid, uid, gid)
    c = get_container(ctx, fsid, daemon_type, daemon_id)
    deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid)
    update_firewalld(ctx, daemon_type)
5267
5268
def command_adopt_grafana(ctx, daemon_id, fsid):
    # type: (CephadmContext, str, str) -> None
    """Adopt a legacy grafana: stop it, copy config/provisioning/certs/data
    into the cephadm layout, and deploy a containerized replacement."""

    daemon_type = 'grafana'
    (uid, gid) = extract_uid_gid_monitoring(ctx, daemon_type)

    _stop_and_disable(ctx, 'grafana-server')

    data_dir_dst = make_data_dir(ctx, fsid, daemon_type, daemon_id,
                                 uid=uid, gid=gid)

    # config
    config_dst = os.path.join(data_dir_dst, 'etc/grafana')
    makedirs(config_dst, uid, gid, 0o755)
    copy_files(ctx,
               [os.path.abspath(ctx.legacy_dir + '/etc/grafana/grafana.ini')],
               config_dst, uid=uid, gid=gid)

    # provisioning tree lands next to the config
    copy_tree(ctx,
              [os.path.abspath(ctx.legacy_dir + '/etc/grafana/provisioning/')],
              os.path.join(data_dir_dst, 'etc/grafana'),
              uid=uid, gid=gid)

    # cert
    cert = '/etc/grafana/grafana.crt'
    key = '/etc/grafana/grafana.key'
    if os.path.exists(cert) and os.path.exists(key):
        makedirs(os.path.join(data_dir_dst, 'etc/grafana/certs'), uid, gid, 0o755)
        copy_files(ctx,
                   [os.path.abspath(ctx.legacy_dir + cert)],
                   os.path.join(data_dir_dst, 'etc/grafana/certs/cert_file'),
                   uid=uid, gid=gid)
        copy_files(ctx,
                   [os.path.abspath(ctx.legacy_dir + key)],
                   os.path.join(data_dir_dst, 'etc/grafana/certs/cert_key'),
                   uid=uid, gid=gid)
        # rewrite cert_file/cert_key paths inside the copied grafana.ini
        _adjust_grafana_ini(os.path.join(config_dst, 'grafana.ini'))
    else:
        logger.debug('Skipping ssl, missing cert {} or key {}'.format(cert, key))

    # data - possible custom dashboards/plugins
    copy_tree(ctx,
              [os.path.abspath(ctx.legacy_dir + '/var/lib/grafana/')],
              os.path.join(data_dir_dst, 'data'),
              uid=uid, gid=gid)

    make_var_run(ctx, fsid, uid, gid)
    c = get_container(ctx, fsid, daemon_type, daemon_id)
    deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid)
    update_firewalld(ctx, daemon_type)
5321
5322
def command_adopt_alertmanager(ctx, daemon_id, fsid):
    # type: (CephadmContext, str, str) -> None
    """Adopt a legacy alertmanager: stop it, copy config and data into the
    cephadm layout, and deploy a containerized replacement."""

    daemon_type = 'alertmanager'
    (uid, gid) = extract_uid_gid_monitoring(ctx, daemon_type)

    _stop_and_disable(ctx, 'prometheus-alertmanager')

    data_dir_dst = make_data_dir(ctx, fsid, daemon_type, daemon_id,
                                 uid=uid, gid=gid)

    # config
    config_dst = os.path.join(data_dir_dst, 'etc/alertmanager')
    makedirs(config_dst, uid, gid, 0o755)
    copy_files(ctx,
               [os.path.abspath(ctx.legacy_dir + '/etc/prometheus/alertmanager.yml')],
               config_dst, uid=uid, gid=gid)

    # data
    copy_tree(ctx,
              [os.path.abspath(ctx.legacy_dir + '/var/lib/prometheus/alertmanager/')],
              os.path.join(data_dir_dst, 'etc/alertmanager/data'),
              uid=uid, gid=gid)

    make_var_run(ctx, fsid, uid, gid)
    c = get_container(ctx, fsid, daemon_type, daemon_id)
    deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid)
    update_firewalld(ctx, daemon_type)
5351
5352
5353 def _adjust_grafana_ini(filename):
5354 # type: (str) -> None
5355
5356 # Update cert_file, cert_key pathnames in server section
5357 # ConfigParser does not preserve comments
5358 try:
5359 with open(filename, 'r') as grafana_ini:
5360 lines = grafana_ini.readlines()
5361 with open('{}.new'.format(filename), 'w') as grafana_ini:
5362 server_section = False
5363 for line in lines:
5364 if line.startswith('['):
5365 server_section = False
5366 if line.startswith('[server]'):
5367 server_section = True
5368 if server_section:
5369 line = re.sub(r'^cert_file.*',
5370 'cert_file = /etc/grafana/certs/cert_file', line)
5371 line = re.sub(r'^cert_key.*',
5372 'cert_key = /etc/grafana/certs/cert_key', line)
5373 grafana_ini.write(line)
5374 os.rename('{}.new'.format(filename), filename)
5375 except OSError as err:
5376 raise Error('Cannot update {}: {}'.format(filename, err))
5377
5378
def _stop_and_disable(ctx, unit_name):
    # type: (CephadmContext, str) -> None
    """Stop the systemd unit if it is running, and disable it if enabled."""

    enabled, state, _ = check_unit(ctx, unit_name)
    actions = []
    if state == 'running':
        actions.append(('Stopping', 'stop'))
    if enabled:
        actions.append(('Disabling', 'disable'))
    for verb, op in actions:
        logger.info('%s old systemd unit %s...' % (verb, unit_name))
        call_throws(ctx, ['systemctl', op, unit_name])
5389
5390 ##################################
5391
5392
def command_rm_daemon(ctx):
    # type: (CephadmContext) -> None
    """Stop, disable, and remove one daemon from this host.

    mon/osd data (and prometheus data) is renamed aside rather than deleted
    unless --force-delete-data is given.
    """
    lock = FileLock(ctx, ctx.fsid)
    lock.acquire()

    daemon_type, daemon_id = ctx.name.split('.', 1)
    unit_name = get_unit_name_by_daemon_name(ctx, ctx.fsid, ctx.name)

    if daemon_type in ['mon', 'osd'] and not ctx.force:
        raise Error('must pass --force to proceed: '
                    'this command may destroy precious data!')

    for op in ['stop', 'reset-failed', 'disable']:
        call(ctx, ['systemctl', op, unit_name],
             verbosity=CallVerbosity.DEBUG)

    data_dir = get_data_dir(ctx.fsid, ctx.data_dir, daemon_type, daemon_id)
    if daemon_type in ['mon', 'osd', 'prometheus'] and \
       not ctx.force_delete_data:
        # rename it out of the way -- do not delete
        backup_dir = os.path.join(ctx.data_dir, ctx.fsid, 'removed')
        if not os.path.exists(backup_dir):
            makedirs(backup_dir, 0, 0, DATA_DIR_MODE)
        dirname = '%s.%s_%s' % (daemon_type, daemon_id,
                                datetime.datetime.utcnow().strftime(DATEFMT))
        os.rename(data_dir, os.path.join(backup_dir, dirname))
    else:
        if daemon_type == CephadmDaemon.daemon_type:
            CephadmDaemon.uninstall(ctx, ctx.fsid, daemon_type, daemon_id)
        call_throws(ctx, ['rm', '-rf', data_dir])
5426
5427 ##################################
5428
5429
def _zap(ctx, what):
    """Destroy all data/metadata on one device or LV via `ceph-volume lvm zap`."""
    mounts = get_container_mounts(ctx, ctx.fsid, 'clusterless-ceph-volume', None)
    zap_container = CephContainer(
        ctx,
        image=ctx.image,
        entrypoint='/usr/sbin/ceph-volume',
        envs=ctx.env,
        args=['lvm', 'zap', '--destroy', what],
        privileged=True,
        volume_mounts=mounts,
    )
    logger.info(f'Zapping {what}...')
    call_throws(ctx, zap_container.run_cmd())
5443
5444
@infer_image
def _zap_osds(ctx):
    """Zap every device whose LVs all belong to ctx.fsid.

    Assumes the fsid lock is already held by the caller.
    """
    # enumerate candidate devices via ceph-volume inventory
    mounts = get_container_mounts(ctx, ctx.fsid, 'clusterless-ceph-volume', None)
    inventory = CephContainer(
        ctx,
        image=ctx.image,
        entrypoint='/usr/sbin/ceph-volume',
        envs=ctx.env,
        args=['inventory', '--format', 'json'],
        privileged=True,
        volume_mounts=mounts,
    )
    out, err, code = call_throws(ctx, inventory.run_cmd())
    if code:
        raise Error('failed to list osd inventory')
    try:
        devices = json.loads(out)
    except ValueError as e:
        raise Error(f'Invalid JSON in ceph-volume inventory: {e}')

    for dev in devices:
        matches = [lv.get('cluster_fsid') == ctx.fsid for lv in dev.get('lvs', [])]
        if not any(matches):
            continue
        if all(matches):
            # every LV on the device belongs to us: zap the whole device
            _zap(ctx, dev.get('path'))
        else:
            lv_names = [lv['name'] for lv in dev.get('lvs', [])]
            # TODO: we need to map the lv_names back to device paths (the vg
            # id isn't part of the output here!)
            logger.warning(f'Not zapping LVs (not implemented): {lv_names}')
5477
5478
def command_zap_osds(ctx):
    """Zap (destroy) all OSD devices associated with ctx.fsid; needs --force."""
    if not ctx.force:
        raise Error('must pass --force to proceed: '
                    'this command may destroy precious data!')

    fsid_lock = FileLock(ctx, ctx.fsid)
    fsid_lock.acquire()

    _zap_osds(ctx)
5488
5489 ##################################
5490
5491
def command_rm_cluster(ctx):
    # type: (CephadmContext) -> None
    """Remove all daemons, systemd units, data, and (optionally) logs for
    the cluster ctx.fsid on this host.

    Requires --force.  OSD devices are zapped only with --zap-osds; logs
    are kept with --keep-logs.
    """
    if not ctx.force:
        raise Error('must pass --force to proceed: '
                    'this command may destroy precious data!')

    lock = FileLock(ctx, ctx.fsid)
    lock.acquire()

    # stop + disable individual daemon units
    for d in list_daemons(ctx, detail=False):
        if d['fsid'] != ctx.fsid:
            continue
        if d['style'] != 'cephadm:v1':
            continue
        unit_name = get_unit_name(ctx.fsid, d['name'])
        for op in ['stop', 'reset-failed', 'disable']:
            call(ctx, ['systemctl', op, unit_name],
                 verbosity=CallVerbosity.DEBUG)

    # cluster units
    for unit_name in ['ceph-%s.target' % ctx.fsid]:
        for op in ['stop', 'reset-failed', 'disable']:
            call(ctx, ['systemctl', op, unit_name],
                 verbosity=CallVerbosity.DEBUG)

    # systemd escapes '-' as '\x2d' in slice names
    slice_name = 'system-%s.slice' % (('ceph-%s' % ctx.fsid).replace('-', '\\x2d'))
    call(ctx, ['systemctl', 'stop', slice_name],
         verbosity=CallVerbosity.DEBUG)

    # osds?
    if ctx.zap_osds:
        _zap_osds(ctx)

    # rm units
    call_throws(ctx, ['rm', '-f', ctx.unit_dir
                      + '/ceph-%s@.service' % ctx.fsid])
    call_throws(ctx, ['rm', '-f', ctx.unit_dir
                      + '/ceph-%s.target' % ctx.fsid])
    call_throws(ctx, ['rm', '-rf',
                      ctx.unit_dir + '/ceph-%s.target.wants' % ctx.fsid])
    # rm data
    call_throws(ctx, ['rm', '-rf', ctx.data_dir + '/' + ctx.fsid])

    if not ctx.keep_logs:
        # rm logs
        call_throws(ctx, ['rm', '-rf', ctx.log_dir + '/' + ctx.fsid])
        call_throws(ctx, ['rm', '-rf', ctx.log_dir
                          + '/*.wants/ceph-%s@*' % ctx.fsid])

    # rm logrotate config
    call_throws(ctx, ['rm', '-f', ctx.logrotate_dir + '/ceph-%s' % ctx.fsid])

    # rm cephadm logrotate config if last cluster on host
    if not os.listdir(ctx.data_dir):
        call_throws(ctx, ['rm', '-f', ctx.logrotate_dir + '/cephadm'])

    # rm sysctl settings
    sysctl_dir = Path(ctx.sysctl_dir)
    for p in sysctl_dir.glob(f'90-ceph-{ctx.fsid}-*.conf'):
        p.unlink()

    # clean up config, keyring, and pub key files
    files = ['/etc/ceph/ceph.conf', '/etc/ceph/ceph.pub', '/etc/ceph/ceph.client.admin.keyring']

    if os.path.exists(files[0]):
        valid_fsid = False
        with open(files[0]) as f:
            if ctx.fsid in f.read():
                valid_fsid = True
        if valid_fsid:
            # only remove the set when ceph.conf references this cluster
            for fn in files:
                if os.path.exists(fn):
                    os.remove(fn)
5572
5573
5574 ##################################
5575
5576
def check_time_sync(ctx, enabler=None):
    # type: (CephadmContext, Optional[Packager]) -> bool
    """Return True if any known time-sync daemon unit is active; optionally
    let `enabler` try to enable one when none is found."""
    candidates = [
        'chrony.service',  # 18.04 (at least)
        'chronyd.service',  # el / opensuse
        'systemd-timesyncd.service',
        'ntpd.service',  # el7 (at least)
        'ntp.service',  # 18.04 (at least)
        'ntpsec.service',  # 20.04 (at least) / buster
    ]
    if check_units(ctx, candidates, enabler):
        return True
    logger.warning('No time sync service is running; checked for %s' % candidates)
    return False
5591
5592
def command_check_host(ctx: CephadmContext) -> None:
    """Verify this host satisfies cephadm's prerequisites.

    Collects all problems and raises a single Error listing them;
    logs 'Host looks OK' when everything passes.
    """
    container_path = ctx.container_engine.path

    errors = []
    commands = ['systemctl', 'lvcreate']

    try:
        check_container_engine(ctx)
        logger.info('podman|docker (%s) is present' % container_path)
    except Error as e:
        errors.append(str(e))

    for command in commands:
        try:
            find_program(command)
            logger.info('%s is present' % command)
        except ValueError:
            errors.append('%s binary does not appear to be installed' % command)

    # check for configured+running chronyd or ntp
    if not check_time_sync(ctx):
        errors.append('No time synchronization is active')

    if 'expect_hostname' in ctx and ctx.expect_hostname:
        if get_hostname().lower() != ctx.expect_hostname.lower():
            errors.append('hostname "%s" does not match expected hostname "%s"' % (
                get_hostname(), ctx.expect_hostname))
        else:
            # BUGFIX: previously this success message was logged even when
            # the hostnames did not match (right after appending the error)
            logger.info('Hostname "%s" matches what is expected.',
                        ctx.expect_hostname)

    if errors:
        raise Error('\nERROR: '.join(errors))

    logger.info('Host looks OK')
5627
5628 ##################################
5629
5630
def command_prepare_host(ctx: CephadmContext) -> None:
    """Install any missing cephadm prerequisites (container engine, lvm2,
    time sync), fix the hostname if requested, then re-run the host check."""
    pkg = None

    def _packager():
        # build the platform packager lazily, at most once
        nonlocal pkg
        if pkg is None:
            pkg = create_packager(ctx)
        return pkg

    logger.info('Verifying podman|docker is present...')
    try:
        check_container_engine(ctx)
    except Error as e:
        logger.warning(str(e))
        _packager().install_podman()

    logger.info('Verifying lvm2 is present...')
    if not find_executable('lvcreate'):
        _packager().install(['lvm2'])

    logger.info('Verifying time synchronization is in place...')
    if not check_time_sync(ctx):
        _packager().install(['chrony'])
        # check again, and this time try to enable
        # the service
        check_time_sync(ctx, enabler=_packager())

    if 'expect_hostname' in ctx and ctx.expect_hostname and ctx.expect_hostname != get_hostname():
        logger.warning('Adjusting hostname from %s -> %s...' % (get_hostname(), ctx.expect_hostname))
        call_throws(ctx, ['hostname', ctx.expect_hostname])
        with open('/etc/hostname', 'w') as f:
            f.write(ctx.expect_hostname + '\n')

    logger.info('Repeating the final host check...')
    command_check_host(ctx)
5665
5666 ##################################
5667
5668
class CustomValidation(argparse.Action):
    """argparse action validating the --name and --exporter-config options."""

    def _check_name(self, values):
        # a name must look like "<type>.<id>" with a supported daemon type
        try:
            daemon_type, _daemon_id = values.split('.', 1)
        except ValueError:
            raise argparse.ArgumentError(
                self,
                'must be of the format <type>.<id>. For example, osd.1 or prometheus.myhost.com')

        supported = get_supported_daemons()
        if daemon_type not in supported:
            raise argparse.ArgumentError(
                self,
                'name must declare the type of daemon e.g. '
                '{}'.format(', '.join(supported)))

    def __call__(self, parser, namespace, values, option_string=None):
        if self.dest == 'name':
            self._check_name(values)
            setattr(namespace, self.dest, values)
        elif self.dest == 'exporter_config':
            cfg = get_parm(values)
            # run the class' validate method, and convert to an argparse error
            # if problems are found
            try:
                CephadmDaemon.validate_config(cfg)
            except Error as e:
                raise argparse.ArgumentError(self, str(e))
            setattr(namespace, self.dest, cfg)
5698
5699 ##################################
5700
5701
def get_distro():
    # type: () -> Tuple[Optional[str], Optional[str], Optional[str]]
    """Parse /etc/os-release and return (id, version_id, version_codename).

    Values are lower-cased; any field absent from the file is None.
    """
    distro = None
    distro_version = None
    distro_codename = None
    with open('/etc/os-release', 'r') as f:
        for line in f:
            line = line.strip()
            if '=' not in line or line.startswith('#'):
                continue
            (var, val) = line.split('=', 1)
            # strip surrounding double quotes if present.
            # BUGFIX: guard the length -- indexing val[0]/val[-1] raised
            # IndexError on an empty value (a line like 'KEY=')
            if len(val) >= 2 and val[0] == '"' and val[-1] == '"':
                val = val[1:-1]
            if var == 'ID':
                distro = val.lower()
            elif var == 'VERSION_ID':
                distro_version = val.lower()
            elif var == 'VERSION_CODENAME':
                distro_codename = val.lower()
    return distro, distro_version, distro_codename
5722
5723
class Packager(object):
    """Base class for distro package managers used to set up ceph repos and
    install packages.

    At most one of (stable, version, branch) selects what to install; all
    unset means defaults.
    """

    def __init__(self, ctx: CephadmContext,
                 stable=None, version=None, branch=None, commit=None):
        # enforce that the selectors are mutually exclusive
        assert \
            (stable and not version and not branch and not commit) or \
            (not stable and version and not branch and not commit) or \
            (not stable and not version and branch) or \
            (not stable and not version and not branch and not commit)
        self.ctx = ctx
        self.stable = stable
        self.version = version
        self.branch = branch
        self.commit = commit

    def add_repo(self):
        raise NotImplementedError

    def rm_repo(self):
        raise NotImplementedError

    def query_shaman(self, distro, distro_version, branch, commit):
        """Resolve a dev branch/commit to a chacra repo file via shaman.

        Returns the repo file contents; raises Error when either service
        has no matching repository.
        """
        logger.info('Fetching repo metadata from shaman and chacra...')
        shaman_url = 'https://shaman.ceph.com/api/repos/ceph/{branch}/{sha1}/{distro}/{distro_version}/repo/?arch={arch}'.format(
            distro=distro,
            distro_version=distro_version,
            branch=branch,
            sha1=commit or 'latest',
            arch=get_arch()
        )
        try:
            shaman_response = urlopen(shaman_url)
        except HTTPError as err:
            logger.error('repository not found in shaman (might not be available yet)')
            raise Error('%s, failed to fetch %s' % (err, shaman_url))
        chacra_url = ''
        try:
            chacra_url = shaman_response.geturl()
            chacra_response = urlopen(chacra_url)
        except HTTPError as err:
            logger.error('repository not found in chacra (might not be available yet)')
            raise Error('%s, failed to fetch %s' % (err, chacra_url))
        return chacra_response.read().decode('utf-8')

    def repo_gpgkey(self):
        """Return a (gpg key url, key name) pair for the selected repo."""
        if self.ctx.gpg_url:
            # BUGFIX: return a (url, name) tuple like the branches below;
            # callers unpack `url, name = self.repo_gpgkey()`, which failed
            # when a bare string was returned here
            return self.ctx.gpg_url, 'manual'
        if self.stable or self.version:
            return 'https://download.ceph.com/keys/release.gpg', 'release'
        else:
            return 'https://download.ceph.com/keys/autobuild.gpg', 'autobuild'

    def enable_service(self, service):
        """
        Start and enable the service (typically using systemd).
        """
        call_throws(self.ctx, ['systemctl', 'enable', '--now', service])
5781
5782
class Apt(Packager):
    """Packager implementation for apt/dpkg based distros (Debian, Ubuntu)."""

    # map distro id (from /etc/os-release) -> path component used by
    # download.ceph.com repo URLs
    DISTRO_NAMES = {
        'ubuntu': 'ubuntu',
        'debian': 'debian',
    }

    def __init__(self, ctx: CephadmContext,
                 stable, version, branch, commit,
                 distro, distro_version, distro_codename):
        super(Apt, self).__init__(ctx, stable=stable, version=version,
                                  branch=branch, commit=commit)
        self.ctx = ctx
        self.distro = self.DISTRO_NAMES[distro]
        self.distro_codename = distro_codename
        self.distro_version = distro_version

    def repo_path(self):
        # type: () -> str
        """Return the apt source file path used for the ceph repo."""
        return '/etc/apt/sources.list.d/ceph.list'

    def add_repo(self):
        # type: () -> None
        """Install the repo GPG key and write the ceph apt source file.

        Raises Error when the GPG key cannot be fetched.  For dev
        builds (no stable/version), the complete repo file content is
        obtained from shaman instead of being templated locally.
        """
        url, name = self.repo_gpgkey()
        logger.info('Installing repo GPG key from %s...' % url)
        try:
            response = urlopen(url)
        except HTTPError as err:
            logger.error('failed to fetch GPG repo key from %s: %s' % (
                url, err))
            raise Error('failed to fetch GPG key')
        key = response.read()
        with open('/etc/apt/trusted.gpg.d/ceph.%s.gpg' % name, 'wb') as f:
            f.write(key)

        if self.version:
            content = 'deb %s/debian-%s/ %s main\n' % (
                self.ctx.repo_url, self.version, self.distro_codename)
        elif self.stable:
            content = 'deb %s/debian-%s/ %s main\n' % (
                self.ctx.repo_url, self.stable, self.distro_codename)
        else:
            # dev build: shaman returns a ready-made repo file body
            content = self.query_shaman(self.distro, self.distro_codename, self.branch,
                                        self.commit)

        logger.info('Installing repo file at %s...' % self.repo_path())
        with open(self.repo_path(), 'w') as f:
            f.write(content)

        self.update()

    def rm_repo(self):
        # type: () -> None
        """Remove the ceph repo file and both possible trusted GPG keys."""
        for name in ['autobuild', 'release']:
            p = '/etc/apt/trusted.gpg.d/ceph.%s.gpg' % name
            if os.path.exists(p):
                logger.info('Removing repo GPG key %s...' % p)
                os.unlink(p)
        if os.path.exists(self.repo_path()):
            logger.info('Removing repo at %s...' % self.repo_path())
            os.unlink(self.repo_path())

        # the kubic repo is only ever added on ubuntu (see install_podman)
        if self.distro == 'ubuntu':
            self.rm_kubic_repo()

    def install(self, ls):
        # type: (List[str]) -> None
        """Install the given list of packages non-interactively."""
        logger.info('Installing packages %s...' % ls)
        call_throws(self.ctx, ['apt-get', 'install', '-y'] + ls)

    def update(self):
        # type: () -> None
        """Refresh the apt package index."""
        logger.info('Updating package list...')
        call_throws(self.ctx, ['apt-get', 'update'])

    def install_podman(self):
        # type: () -> None
        """Install podman (adding the kubic repo on ubuntu), falling back to docker."""
        if self.distro == 'ubuntu':
            logger.info('Setting up repo for podman...')
            self.add_kubic_repo()
            self.update()

        logger.info('Attempting podman install...')
        try:
            self.install(['podman'])
        except Error:
            # deliberate best-effort fallback: ubuntu/debian ship docker.io
            logger.info('Podman did not work. Falling back to docker...')
            self.install(['docker.io'])

    def kubic_repo_url(self):
        # type: () -> str
        """Return the openSUSE kubic libcontainers repo URL for this ubuntu version."""
        return 'https://download.opensuse.org/repositories/devel:/kubic:/' \
               'libcontainers:/stable/xUbuntu_%s/' % self.distro_version

    def kubic_repo_path(self):
        # type: () -> str
        """Return the apt source file path for the kubic repo."""
        return '/etc/apt/sources.list.d/devel:kubic:libcontainers:stable.list'

    # NOTE(review): 'kubric' below is a misspelling of 'kubic'; the names are
    # only used internally but are kept as-is to avoid breaking any callers.
    def kubric_repo_gpgkey_url(self):
        # type: () -> str
        """Return the URL of the kubic repo signing key."""
        return '%s/Release.key' % self.kubic_repo_url()

    def kubric_repo_gpgkey_path(self):
        # type: () -> str
        """Return the trusted keyring path for the kubic repo key."""
        return '/etc/apt/trusted.gpg.d/kubic.release.gpg'

    def add_kubic_repo(self):
        # type: () -> None
        """Fetch the kubic signing key, register it, and write the repo file."""
        url = self.kubric_repo_gpgkey_url()
        logger.info('Installing repo GPG key from %s...' % url)
        try:
            response = urlopen(url)
        except HTTPError as err:
            logger.error('failed to fetch GPG repo key from %s: %s' % (
                url, err))
            raise Error('failed to fetch GPG key')
        key = response.read().decode('utf-8')
        # apt-key needs the key in a file; write it to a root-owned temp file
        tmp_key = write_tmp(key, 0, 0)
        keyring = self.kubric_repo_gpgkey_path()
        call_throws(self.ctx, ['apt-key', '--keyring', keyring, 'add', tmp_key.name])

        logger.info('Installing repo file at %s...' % self.kubic_repo_path())
        content = 'deb %s /\n' % self.kubic_repo_url()
        with open(self.kubic_repo_path(), 'w') as f:
            f.write(content)

    def rm_kubic_repo(self):
        # type: () -> None
        """Remove the kubic repo keyring and source file, if present."""
        keyring = self.kubric_repo_gpgkey_path()
        if os.path.exists(keyring):
            logger.info('Removing repo GPG key %s...' % keyring)
            os.unlink(keyring)

        p = self.kubic_repo_path()
        if os.path.exists(p):
            logger.info('Removing repo at %s...' % p)
            os.unlink(p)
5908
5909
class YumDnf(Packager):
    """Packager implementation for yum/dnf based distros (RHEL family, Fedora)."""

    # map distro id -> (shaman distro name, repo code prefix)
    DISTRO_NAMES = {
        'centos': ('centos', 'el'),
        'rhel': ('centos', 'el'),
        'scientific': ('centos', 'el'),
        'rocky': ('centos', 'el'),
        'fedora': ('fedora', 'fc'),
    }

    def __init__(self, ctx: CephadmContext,
                 stable, version, branch, commit,
                 distro, distro_version):
        super(YumDnf, self).__init__(ctx, stable=stable, version=version,
                                     branch=branch, commit=commit)
        self.ctx = ctx
        self.major = int(distro_version.split('.')[0])
        self.distro_normalized = self.DISTRO_NAMES[distro][0]
        distro_family = self.DISTRO_NAMES[distro][1]
        self.distro_code = distro_family + str(self.major)
        # dnf replaced yum on fedora >= 30 and el >= 8.
        # BUGFIX: previously compared self.distro_code (e.g. 'el8') against
        # the bare family code ('el'), which never matched, so 'yum' was
        # always selected.
        if (distro_family == 'fc' and self.major >= 30) or \
           (distro_family == 'el' and self.major >= 8):
            self.tool = 'dnf'
        else:
            self.tool = 'yum'

    def custom_repo(self, **kw):
        """Build a yum repo file body from keyword arguments.

        A line is emitted only when its value is present (not None and
        not ''), so optional settings such as ``proxy`` or ``gpgcheck``
        never appear as dangling ``key=`` lines, which would break yum.
        The tuple of templates (rather than a dict) preserves the output
        order, starting with the ``[repo name]`` section header.
        """
        lines = []

        tmpl = (
            ('reponame', '[%s]'),
            ('name', 'name=%s'),
            ('baseurl', 'baseurl=%s'),
            ('enabled', 'enabled=%s'),
            ('gpgcheck', 'gpgcheck=%s'),
            ('_type', 'type=%s'),
            ('gpgkey', 'gpgkey=%s'),
            ('proxy', 'proxy=%s'),
            ('priority', 'priority=%s'),
        )

        for tmpl_key, tmpl_value in tmpl:
            # ensure that there is an actual value (not None nor empty string)
            if tmpl_key in kw and kw.get(tmpl_key) not in (None, ''):
                lines.append(tmpl_value % kw.get(tmpl_key))

        return '\n'.join(lines)

    def repo_path(self):
        # type: () -> str
        """Return the yum repo file path used for the ceph repo."""
        return '/etc/yum.repos.d/ceph.repo'

    def repo_baseurl(self):
        # type: () -> str
        """Return the download.ceph.com base URL for the stable release or version."""
        assert self.stable or self.version
        if self.version:
            return '%s/rpm-%s/%s' % (self.ctx.repo_url, self.version,
                                     self.distro_code)
        else:
            return '%s/rpm-%s/%s' % (self.ctx.repo_url, self.stable,
                                     self.distro_code)

    def add_repo(self):
        # type: () -> None
        """Write the ceph repo file (and enable EPEL on el distros).

        Raises Error for unsupported distro/release combinations.
        """
        if self.distro_code.startswith('fc'):
            raise Error('Ceph team does not build Fedora specific packages and therefore cannot add repos for this distro')
        if self.distro_code == 'el7':
            if self.stable and self.stable >= 'pacific':
                raise Error('Ceph does not support pacific or later for this version of this linux distro and therefore cannot add a repo for it')
            if self.version:
                # BUGFIX: compare the major version numerically; the old
                # string comparison made '9' >= '16' evaluate True.
                try:
                    major = int(self.version.split('.')[0])
                except ValueError:
                    raise Error('version must be in the form x.y.z (e.g., 15.2.0)')
                if major >= 16:
                    raise Error('Ceph does not support 16.y.z or later for this version of this linux distro and therefore cannot add a repo for it')
        if self.stable or self.version:
            content = ''
            for n, t in {
                    'Ceph': '$basearch',
                    'Ceph-noarch': 'noarch',
                    'Ceph-source': 'SRPMS'}.items():
                content += '[%s]\n' % (n)
                content += self.custom_repo(
                    name='Ceph %s' % t,
                    baseurl=self.repo_baseurl() + '/' + t,
                    enabled=1,
                    gpgcheck=1,
                    gpgkey=self.repo_gpgkey()[0],
                )
                content += '\n\n'
        else:
            # dev build: shaman returns a ready-made repo file body
            content = self.query_shaman(self.distro_normalized, self.major,
                                        self.branch,
                                        self.commit)

        logger.info('Writing repo to %s...' % self.repo_path())
        with open(self.repo_path(), 'w') as f:
            f.write(content)

        if self.distro_code.startswith('el'):
            # ceph packages pull dependencies from EPEL on el distros
            logger.info('Enabling EPEL...')
            call_throws(self.ctx, [self.tool, 'install', '-y', 'epel-release'])

    def rm_repo(self):
        # type: () -> None
        """Remove the ceph repo file, if present."""
        if os.path.exists(self.repo_path()):
            os.unlink(self.repo_path())

    def install(self, ls):
        # type: (List[str]) -> None
        """Install the given list of packages non-interactively."""
        logger.info('Installing packages %s...' % ls)
        call_throws(self.ctx, [self.tool, 'install', '-y'] + ls)

    def install_podman(self):
        # type: () -> None
        """Install podman from the distro repos."""
        self.install(['podman'])
6048
6049
class Zypper(Packager):
    """Packager implementation for zypper based distros (SUSE family)."""

    DISTRO_NAMES = [
        'sles',
        'opensuse-tumbleweed',
        'opensuse-leap'
    ]

    def __init__(self, ctx: CephadmContext,
                 stable, version, branch, commit,
                 distro, distro_version):
        super(Zypper, self).__init__(ctx, stable=stable, version=version,
                                     branch=branch, commit=commit)
        self.ctx = ctx
        self.tool = 'zypper'
        self.distro = 'opensuse'
        # tumbleweed is rolling; everything else keeps its reported version
        self.distro_version = '15.1'
        if 'tumbleweed' not in distro and distro_version is not None:
            self.distro_version = distro_version

    def custom_repo(self, **kw):
        """Build a zypper repo file body from keyword arguments.

        See YumDnf.custom_repo for the format explanation: lines are
        emitted only for keys with a real value (not None / ''), in a
        fixed order starting with the ``[repo name]`` header.
        """
        lines = []

        tmpl = (
            ('reponame', '[%s]'),
            ('name', 'name=%s'),
            ('baseurl', 'baseurl=%s'),
            ('enabled', 'enabled=%s'),
            ('gpgcheck', 'gpgcheck=%s'),
            ('_type', 'type=%s'),
            ('gpgkey', 'gpgkey=%s'),
            ('proxy', 'proxy=%s'),
            ('priority', 'priority=%s'),
        )

        for tmpl_key, tmpl_value in tmpl:
            # ensure that there is an actual value (not None nor empty string)
            if tmpl_key in kw and kw.get(tmpl_key) not in (None, ''):
                lines.append(tmpl_value % kw.get(tmpl_key))

        return '\n'.join(lines)

    def repo_path(self):
        # type: () -> str
        """Return the zypper repo file path used for the ceph repo."""
        return '/etc/zypp/repos.d/ceph.repo'

    def repo_baseurl(self):
        # type: () -> str
        """Return the download.ceph.com base URL for the stable release or version.

        BUGFIX: the version branch previously used self.stable in the URL
        (copy-paste error), so ``--version`` produced a wrong baseurl.
        """
        assert self.stable or self.version
        if self.version:
            return '%s/rpm-%s/%s' % (self.ctx.repo_url,
                                     self.version, self.distro)
        else:
            return '%s/rpm-%s/%s' % (self.ctx.repo_url,
                                     self.stable, self.distro)

    def add_repo(self):
        # type: () -> None
        """Write the ceph repo file for a stable release, version, or dev build."""
        if self.stable or self.version:
            content = ''
            for n, t in {
                    'Ceph': '$basearch',
                    'Ceph-noarch': 'noarch',
                    'Ceph-source': 'SRPMS'}.items():
                content += '[%s]\n' % (n)
                content += self.custom_repo(
                    name='Ceph %s' % t,
                    baseurl=self.repo_baseurl() + '/' + t,
                    enabled=1,
                    gpgcheck=1,
                    gpgkey=self.repo_gpgkey()[0],
                )
                content += '\n\n'
        else:
            # dev build: shaman returns a ready-made repo file body
            content = self.query_shaman(self.distro, self.distro_version,
                                        self.branch,
                                        self.commit)

        logger.info('Writing repo to %s...' % self.repo_path())
        with open(self.repo_path(), 'w') as f:
            f.write(content)

    def rm_repo(self):
        # type: () -> None
        """Remove the ceph repo file, if present."""
        if os.path.exists(self.repo_path()):
            os.unlink(self.repo_path())

    def install(self, ls):
        # type: (List[str]) -> None
        """Install the given list of packages non-interactively."""
        logger.info('Installing packages %s...' % ls)
        call_throws(self.ctx, [self.tool, 'in', '-y'] + ls)

    def install_podman(self):
        # type: () -> None
        """Install podman from the distro repos."""
        self.install(['podman'])
6145
6146
def create_packager(ctx: CephadmContext,
                    stable=None, version=None, branch=None, commit=None):
    """Instantiate the Packager subclass matching this host's distro.

    Raises Error when the detected distro is not supported.
    """
    distro, distro_version, distro_codename = get_distro()
    build_args = dict(stable=stable, version=version, branch=branch, commit=commit)
    if distro in YumDnf.DISTRO_NAMES:
        return YumDnf(ctx, distro=distro, distro_version=distro_version,
                      **build_args)
    if distro in Apt.DISTRO_NAMES:
        return Apt(ctx, distro=distro, distro_version=distro_version,
                   distro_codename=distro_codename, **build_args)
    if distro in Zypper.DISTRO_NAMES:
        return Zypper(ctx, distro=distro, distro_version=distro_version,
                      **build_args)
    raise Error('Distro %s version %s not supported' % (distro, distro_version))
6164
6165
def command_add_repo(ctx: CephadmContext):
    """Add a ceph package repo for a release, version, or dev build.

    Exactly one of --release / --version / --dev / --dev-commit must be
    supplied; --release and --version are mutually exclusive.  Raises
    Error on invalid argument combinations or a malformed version.
    """
    if ctx.version and ctx.release:
        raise Error('you can specify either --release or --version but not both')
    if not ctx.version and not ctx.release and not ctx.dev and not ctx.dev_commit:
        raise Error('please supply a --release, --version, --dev or --dev-commit argument')
    if ctx.version:
        try:
            (x, y, z) = ctx.version.split('.')
        except Exception:
            raise Error('version must be in the form x.y.z (e.g., 15.2.0)')
    if ctx.release:
        # Pacific =/= pacific in this case; normalize to lowercase to
        # avoid confusion
        ctx.release = ctx.release.lower()

    pkg = create_packager(ctx, stable=ctx.release,
                          version=ctx.version,
                          branch=ctx.dev,
                          commit=ctx.dev_commit)
    pkg.add_repo()
    logger.info('Completed adding repo.')
6186
6187
def command_rm_repo(ctx: CephadmContext):
    """Remove any ceph package repo previously configured by add-repo."""
    create_packager(ctx).rm_repo()
6191
6192
def command_install(ctx: CephadmContext):
    """Install the requested packages using the distro's packager."""
    create_packager(ctx).install(ctx.packages)
6196
6197 ##################################
6198
6199
def get_ipv4_address(ifname):
    # type: (str) -> str
    """Return the IPv4 address of *ifname* in CIDR form, or '' if none.

    Uses the SIOCGIFADDR / SIOCGIFNETMASK ioctls (Linux only); interface
    names are truncated to 15 characters as the kernel requires.

    BUGFIX: the probe socket is now closed (context manager) instead of
    being leaked on every call.
    """
    SIOCGIFADDR = 0x8915
    SIOCGIFNETMASK = 0x891b

    def _extract(sock, request):
        packed_ifname = struct.pack('256s', bytes(ifname[:15], 'utf-8'))
        raw = fcntl.ioctl(sock.fileno(), request, packed_ifname)
        # bytes 20..24 of the returned ifreq hold the in_addr payload
        return socket.inet_ntop(socket.AF_INET, raw[20:24])

    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
        try:
            addr = _extract(s, SIOCGIFADDR)
            dq_mask = _extract(s, SIOCGIFNETMASK)
        except OSError:
            # interface does not exist or has no ipv4 address
            return ''

    # convert the dotted-quad netmask into a prefix length by counting bits
    dec_mask = sum(bin(int(octet)).count('1') for octet in dq_mask.split('.'))
    return '{}/{}'.format(addr, dec_mask)
6222
6223
def get_ipv6_address(ifname):
    # type: (str) -> str
    """Return the first IPv6 address of *ifname* as 'addr/scope', or ''.

    Parses /proc/net/if_inet6: field 0 is the packed address, field 2
    the scope, and the final field the interface name (see
    https://www.tldp.org/HOWTO/Linux+IPv6-HOWTO/ch11s04.html).
    """
    if not os.path.exists('/proc/net/if_inet6'):
        return ''

    for entry in read_file(['/proc/net/if_inet6']).splitlines():
        fields = entry.split()
        if fields[-1] != ifname:
            continue
        raw_addr = fields[0]
        # regroup the 32 hex chars into colon-separated quads
        grouped = ':'.join(raw_addr[pos:pos + 4]
                           for pos in range(0, len(raw_addr), 4))
        # let the ipaddress module apply canonical (compressed) formatting
        canonical = ipaddress.ip_address(grouped)
        return '{}/{}'.format(str(canonical), int('0x{}'.format(fields[2]), 16))
    return ''
6242
6243
def bytes_to_human(num, mode='decimal'):
    # type: (float, str) -> str
    """Convert a bytes value into its human-readable form.

    :param num: number, in bytes, to convert
    :param mode: 'decimal' (default, powers of 1000) or 'binary'
        (powers of 1024) to determine the divisor and unit suffixes
    :returns: string representing the bytes value in a more readable format
    """
    if mode == 'binary':
        unit_list = ['', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB']
        divisor = 1024.0
        yotta = 'YiB'
    else:
        unit_list = ['', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB']
        divisor = 1000.0
        yotta = 'YB'

    for unit in unit_list:
        if abs(num) < divisor:
            return '%3.1f%s' % (num, unit)
        num /= divisor
    # value exceeded every listed unit; report in yottabytes
    return '%.1f%s' % (num, yotta)
6266
6267
def read_file(path_list, file_name=''):
    # type: (List[str], str) -> str
    """Returns the content of the first file found within the `path_list`

    :param path_list: list of file paths to search
    :param file_name: optional file_name to be applied to a file path
    :returns: stripped content of the file, or 'Unknown'
    """
    for base in path_list:
        file_path = os.path.join(base, file_name) if file_name else base
        if not os.path.exists(file_path):
            continue
        with open(file_path, 'r') as f:
            try:
                return f.read().strip()
            except OSError:
                # sysfs may expose the file, but for devices like virtio
                # the read itself can still fail
                return 'Unknown'
    return 'Unknown'
6292
6293 ##################################
6294
6295
class HostFacts():
    """Gather host-level metadata (cpu, memory, disks, NICs, security).

    Static facts are collected in __init__; everything else is computed
    lazily through properties so dump() can serialize a fresh snapshot.
    """

    _dmi_path_list = ['/sys/class/dmi/id']
    _nic_path_list = ['/sys/class/net']
    _selinux_path_list = ['/etc/selinux/config']
    _apparmor_path_list = ['/etc/apparmor']
    # some virtual devices expose a PCI vendor id instead of a vendor name
    _disk_vendor_workarounds = {
        '0x1af4': 'Virtio Block Device'
    }

    def __init__(self, ctx: CephadmContext):
        self.ctx: CephadmContext = ctx
        self.cpu_model: str = 'Unknown'
        self.cpu_count: int = 0
        self.cpu_cores: int = 0
        self.cpu_threads: int = 0
        self.interfaces: Dict[str, Any] = {}

        self._meminfo: List[str] = read_file(['/proc/meminfo']).splitlines()
        self._get_cpuinfo()
        self._process_nics()
        self.arch: str = platform.processor()
        self.kernel: str = platform.release()

    def _get_cpuinfo(self):
        # type: () -> None
        """Determine cpu information via /proc/cpuinfo"""
        raw = read_file(['/proc/cpuinfo'])
        output = raw.splitlines()
        cpu_set = set()

        for line in output:
            field = [f.strip() for f in line.split(':')]
            if 'model name' in line:
                self.cpu_model = field[1]
            if 'physical id' in line:
                # one entry per physical socket
                cpu_set.add(field[1])
            if 'siblings' in line:
                self.cpu_threads = int(field[1].strip())
            if 'cpu cores' in line:
                self.cpu_cores = int(field[1].strip())
        self.cpu_count = len(cpu_set)

    def _get_block_devs(self):
        # type: () -> List[str]
        """Determine the list of block devices by looking at /sys/block,
        excluding device-mapper ('dm*') entries."""
        return [dev for dev in os.listdir('/sys/block')
                if not dev.startswith('dm')]

    def _get_devs_by_type(self, rota='0'):
        # type: (str) -> List[str]
        """Filter block devices by a given rotational attribute (0=flash, 1=spinner)"""
        devs = list()
        for blk_dev in self._get_block_devs():
            rot_path = '/sys/block/{}/queue/rotational'.format(blk_dev)
            rot_value = read_file([rot_path])
            if rot_value == rota:
                devs.append(blk_dev)
        return devs

    @property
    def operating_system(self):
        # type: () -> str
        """Determine OS version from /etc/os-release"""
        raw_info = read_file(['/etc/os-release'])
        os_release = raw_info.splitlines()
        rel_str = 'Unknown'
        rel_dict = dict()

        for line in os_release:
            if '=' in line:
                # split only on the first '=' so values containing '='
                # are preserved intact
                var_name, var_value = line.split('=', 1)
                rel_dict[var_name] = var_value.strip('"')

        # Would normally use PRETTY_NAME, but NAME and VERSION are more
        # consistent
        if all(_v in rel_dict for _v in ['NAME', 'VERSION']):
            rel_str = '{} {}'.format(rel_dict['NAME'], rel_dict['VERSION'])
        return rel_str

    @property
    def hostname(self):
        # type: () -> str
        """Return the hostname"""
        return platform.node()

    @property
    def subscribed(self):
        # type: () -> str
        """Highlevel check to see if the host is subscribed to receive updates/support"""
        def _red_hat():
            # type: () -> str
            # RHEL 7 and RHEL 8: a subscribed host holds at least an
            # entitlement cert + key pair under /etc/pki/entitlement
            entitlements_dir = '/etc/pki/entitlement'
            if os.path.exists(entitlements_dir):
                pems = glob('{}/*.pem'.format(entitlements_dir))
                if len(pems) >= 2:
                    return 'Yes'

            return 'No'

        os_name = self.operating_system
        if os_name.upper().startswith('RED HAT'):
            return _red_hat()

        return 'Unknown'

    @property
    def hdd_count(self):
        # type: () -> int
        """Return a count of HDDs (spinners)"""
        return len(self._get_devs_by_type(rota='1'))

    def _get_capacity(self, dev):
        # type: (str) -> int
        """Determine the size of a given device in bytes (sectors * block size)"""
        size_path = os.path.join('/sys/block', dev, 'size')
        size_blocks = int(read_file([size_path]))
        blk_path = os.path.join('/sys/block', dev, 'queue', 'logical_block_size')
        blk_count = int(read_file([blk_path]))
        return size_blocks * blk_count

    def _get_capacity_by_type(self, rota='0'):
        # type: (str) -> int
        """Return the total capacity of a category of device (flash or hdd)"""
        devs = self._get_devs_by_type(rota=rota)
        capacity = 0
        for dev in devs:
            capacity += self._get_capacity(dev)
        return capacity

    def _dev_list(self, dev_list):
        # type: (List[str]) -> List[Dict[str, object]]
        """Return a 'pretty' name list for each device in the `dev_list`"""
        disk_list = list()

        for dev in dev_list:
            disk_model = read_file(['/sys/block/{}/device/model'.format(dev)]).strip()
            disk_rev = read_file(['/sys/block/{}/device/rev'.format(dev)]).strip()
            disk_wwid = read_file(['/sys/block/{}/device/wwid'.format(dev)]).strip()
            vendor = read_file(['/sys/block/{}/device/vendor'.format(dev)]).strip()
            disk_vendor = HostFacts._disk_vendor_workarounds.get(vendor, vendor)
            disk_size_bytes = self._get_capacity(dev)
            disk_list.append({
                'description': '{} {} ({})'.format(disk_vendor, disk_model, bytes_to_human(disk_size_bytes)),
                'vendor': disk_vendor,
                'model': disk_model,
                'rev': disk_rev,
                'wwid': disk_wwid,
                'dev_name': dev,
                'disk_size_bytes': disk_size_bytes,
            })
        return disk_list

    @property
    def hdd_list(self):
        # type: () -> List[Dict[str, object]]
        """Return a list of devices that are HDDs (spinners)"""
        devs = self._get_devs_by_type(rota='1')
        return self._dev_list(devs)

    @property
    def flash_list(self):
        # type: () -> List[Dict[str, object]]
        """Return a list of devices that are flash based (SSD, NVMe)"""
        devs = self._get_devs_by_type(rota='0')
        return self._dev_list(devs)

    @property
    def hdd_capacity_bytes(self):
        # type: () -> int
        """Return the total capacity for all HDD devices (bytes)"""
        return self._get_capacity_by_type(rota='1')

    @property
    def hdd_capacity(self):
        # type: () -> str
        """Return the total capacity for all HDD devices (human readable format)"""
        return bytes_to_human(self.hdd_capacity_bytes)

    @property
    def cpu_load(self):
        # type: () -> Dict[str, float]
        """Return the cpu load average data for the host"""
        raw = read_file(['/proc/loadavg']).strip()
        data = raw.split()
        return {
            '1min': float(data[0]),
            '5min': float(data[1]),
            '15min': float(data[2]),
        }

    @property
    def flash_count(self):
        # type: () -> int
        """Return the number of flash devices in the system (SSD, NVMe)"""
        return len(self._get_devs_by_type(rota='0'))

    @property
    def flash_capacity_bytes(self):
        # type: () -> int
        """Return the total capacity for all flash devices (bytes)"""
        return self._get_capacity_by_type(rota='0')

    @property
    def flash_capacity(self):
        # type: () -> str
        """Return the total capacity for all Flash devices (human readable format)"""
        return bytes_to_human(self.flash_capacity_bytes)

    def _process_nics(self):
        # type: () -> None
        """Look at the NIC devices and extract network related metadata"""
        # from https://github.com/torvalds/linux/blob/master/include/uapi/linux/if_arp.h
        hw_lookup = {
            '1': 'ethernet',
            '32': 'infiniband',
            '772': 'loopback',
        }

        for nic_path in HostFacts._nic_path_list:
            if not os.path.exists(nic_path):
                continue
            for iface in os.listdir(nic_path):

                # lower_*/upper_* symlinks expose bond/bridge membership
                lower_devs_list = [os.path.basename(link.replace('lower_', '')) for link in glob(os.path.join(nic_path, iface, 'lower_*'))]
                upper_devs_list = [os.path.basename(link.replace('upper_', '')) for link in glob(os.path.join(nic_path, iface, 'upper_*'))]

                try:
                    mtu = int(read_file([os.path.join(nic_path, iface, 'mtu')]))
                except ValueError:
                    mtu = 0

                operstate = read_file([os.path.join(nic_path, iface, 'operstate')])
                try:
                    speed = int(read_file([os.path.join(nic_path, iface, 'speed')]))
                except (OSError, ValueError):
                    # OSError : device doesn't support the ethtool get_link_ksettings
                    # ValueError : raised when the read fails, and returns Unknown
                    #
                    # Either way, we show a -1 when speed isn't available
                    speed = -1

                if os.path.exists(os.path.join(nic_path, iface, 'bridge')):
                    nic_type = 'bridge'
                elif os.path.exists(os.path.join(nic_path, iface, 'bonding')):
                    nic_type = 'bonding'
                else:
                    nic_type = hw_lookup.get(read_file([os.path.join(nic_path, iface, 'type')]), 'Unknown')

                # a 'device' symlink means this is a physical NIC
                dev_link = os.path.join(nic_path, iface, 'device')
                if os.path.exists(dev_link):
                    iftype = 'physical'
                    driver_path = os.path.join(dev_link, 'driver')
                    if os.path.exists(driver_path):
                        driver = os.path.basename(os.path.realpath(driver_path))
                    else:
                        driver = 'Unknown'

                else:
                    iftype = 'logical'
                    driver = ''

                self.interfaces[iface] = {
                    'mtu': mtu,
                    'upper_devs_list': upper_devs_list,
                    'lower_devs_list': lower_devs_list,
                    'operstate': operstate,
                    'iftype': iftype,
                    'nic_type': nic_type,
                    'driver': driver,
                    'speed': speed,
                    'ipv4_address': get_ipv4_address(iface),
                    'ipv6_address': get_ipv6_address(iface),
                }

    @property
    def nic_count(self):
        # type: () -> int
        """Return a total count of all physical NICs detected in the host"""
        phys_devs = []
        for iface in self.interfaces:
            if self.interfaces[iface]['iftype'] == 'physical':
                phys_devs.append(iface)
        return len(phys_devs)

    def _get_mem_data(self, field_name):
        # type: (str) -> int
        """Return the named /proc/meminfo field value (kB), or 0 if absent."""
        for line in self._meminfo:
            if line.startswith(field_name):
                _d = line.split()
                return int(_d[1])
        return 0

    @property
    def memory_total_kb(self):
        # type: () -> int
        """Determine the memory installed (kb)"""
        return self._get_mem_data('MemTotal')

    @property
    def memory_free_kb(self):
        # type: () -> int
        """Determine the memory free (not cache, immediately usable)"""
        return self._get_mem_data('MemFree')

    @property
    def memory_available_kb(self):
        # type: () -> int
        """Determine the memory available to new applications without swapping"""
        return self._get_mem_data('MemAvailable')

    @property
    def vendor(self):
        # type: () -> str
        """Determine server vendor from DMI data in sysfs"""
        return read_file(HostFacts._dmi_path_list, 'sys_vendor')

    @property
    def model(self):
        # type: () -> str
        """Determine server model information from DMI data in sysfs"""
        family = read_file(HostFacts._dmi_path_list, 'product_family')
        product = read_file(HostFacts._dmi_path_list, 'product_name')
        if family == 'Unknown' and product:
            return '{}'.format(product)

        return '{} ({})'.format(family, product)

    @property
    def bios_version(self):
        # type: () -> str
        """Determine server BIOS version from DMI data in sysfs"""
        return read_file(HostFacts._dmi_path_list, 'bios_version')

    @property
    def bios_date(self):
        # type: () -> str
        """Determine server BIOS date from DMI data in sysfs"""
        return read_file(HostFacts._dmi_path_list, 'bios_date')

    @property
    def timestamp(self):
        # type: () -> float
        """Return the current time as Epoch seconds"""
        return time.time()

    @property
    def system_uptime(self):
        # type: () -> float
        """Return the system uptime (in secs)"""
        raw_time = read_file(['/proc/uptime'])
        up_secs, _ = raw_time.split()
        return float(up_secs)

    @property
    def kernel_security(self):
        # type: () -> Dict[str, str]
        """Determine the security features enabled in the kernel - SELinux, AppArmor"""
        def _fetch_selinux() -> Dict[str, str]:
            """Read the selinux config file to determine state"""
            security = {}
            for selinux_path in HostFacts._selinux_path_list:
                if os.path.exists(selinux_path):
                    selinux_config = read_file([selinux_path]).splitlines()
                    security['type'] = 'SELinux'
                    for line in selinux_config:
                        if line.strip().startswith('#'):
                            continue
                        # BUGFIX: skip blank/malformed lines (previously a
                        # ValueError), and split only on the first '=' so
                        # values containing '=' stay intact
                        if '=' not in line:
                            continue
                        k, v = line.split('=', 1)
                        security[k] = v
                    if security['SELINUX'].lower() == 'disabled':
                        security['description'] = 'SELinux: Disabled'
                    else:
                        security['description'] = 'SELinux: Enabled({}, {})'.format(security['SELINUX'], security['SELINUXTYPE'])
                    return security
            return {}

        def _fetch_apparmor() -> Dict[str, str]:
            """Read the apparmor profiles directly, returning an overview of AppArmor status"""
            security = {}
            for apparmor_path in HostFacts._apparmor_path_list:
                if os.path.exists(apparmor_path):
                    security['type'] = 'AppArmor'
                    security['description'] = 'AppArmor: Enabled'
                    try:
                        profiles = read_file(['/sys/kernel/security/apparmor/profiles'])
                        if len(profiles) == 0:
                            return {}
                    except OSError:
                        pass
                    else:
                        summary = {}  # type: Dict[str, int]
                        for line in profiles.split('\n'):
                            # lines look like "<profile name> (<mode>)";
                            # rsplit keeps profile names with spaces intact
                            item, mode = line.rsplit(' ', 1)
                            mode = mode.strip('()')
                            if mode in summary:
                                summary[mode] += 1
                            else:
                                # BUGFIX: first profile seen for a mode counts
                                # as 1, not 0 (totals were off by one)
                                summary[mode] = 1
                        summary_str = ','.join(['{} {}'.format(v, k) for k, v in summary.items()])
                        security = {**security, **summary}  # type: ignore
                        security['description'] += '({})'.format(summary_str)

                    return security
            return {}

        ret = {}
        if os.path.exists('/sys/kernel/security/lsm'):
            lsm = read_file(['/sys/kernel/security/lsm']).strip()
            if 'selinux' in lsm:
                ret = _fetch_selinux()
            elif 'apparmor' in lsm:
                ret = _fetch_apparmor()
            else:
                return {
                    'type': 'Unknown',
                    'description': 'Linux Security Module framework is active, but is not using SELinux or AppArmor'
                }

        if ret:
            return ret

        return {
            'type': 'None',
            'description': 'Linux Security Module framework is not available'
        }

    @property
    def selinux_enabled(self):
        # type: () -> bool
        """Return True when SELinux is the active LSM and not disabled."""
        return (self.kernel_security['type'] == 'SELinux') and \
               (self.kernel_security['description'] != 'SELinux: Disabled')

    @property
    def kernel_parameters(self):
        # type: () -> Dict[str, str]
        """Get kernel parameters required/used in Ceph clusters"""

        k_param = {}
        out, _, _ = call_throws(self.ctx, ['sysctl', '-a'], verbosity=CallVerbosity.SILENT)
        if out:
            param_list = out.split('\n')
            param_dict = {param.split(' = ')[0]: param.split(' = ')[-1] for param in param_list}

            # return only desired parameters
            if 'net.ipv4.ip_nonlocal_bind' in param_dict:
                k_param['net.ipv4.ip_nonlocal_bind'] = param_dict['net.ipv4.ip_nonlocal_bind']

        return k_param

    def dump(self):
        # type: () -> str
        """Return the attributes of this HostFacts object as json"""
        data = {
            k: getattr(self, k) for k in dir(self)
            if not k.startswith('_')
            and isinstance(getattr(self, k), (float, int, str, list, dict, tuple))
        }
        return json.dumps(data, indent=2, sort_keys=True)
6755
6756 ##################################
6757
6758
def command_gather_facts(ctx: CephadmContext):
    """gather_facts provides host related metadata (JSON) to the caller"""
    print(HostFacts(ctx).dump())
6763
6764
6765 ##################################
6766
6767
class CephadmCache:
    """Thread-safe store of the metadata collected by exporter scraper tasks."""

    task_types = ['disks', 'daemons', 'host', 'http_server']

    def __init__(self):
        self.started_epoch_secs = time.time()
        # every scraper task starts out inactive until its thread reports in
        self.tasks = {task: 'inactive'
                      for task in ('daemons', 'disks', 'host', 'http_server')}
        self.errors = []
        self.disks = {}
        self.daemons = {}
        self.host = {}
        self.lock = RLock()

    @property
    def health(self):
        """Summarize exporter health: uptime, per-task state, and errors."""
        return {
            'started_epoch_secs': self.started_epoch_secs,
            'tasks': self.tasks,
            'errors': self.errors,
        }

    def to_json(self):
        """Return the whole cache as a JSON-serializable dict."""
        return {
            'health': self.health,
            'host': self.host,
            'daemons': self.daemons,
            'disks': self.disks,
        }

    def update_health(self, task_type, task_status, error_msg=None):
        """Record the status (and optionally an error) of a scraper task."""
        assert task_type in CephadmCache.task_types
        with self.lock:
            self.tasks[task_type] = task_status
            if error_msg:
                self.errors.append(error_msg)

    def update_task(self, task_type, content):
        """Merge `content` into the cached data held for `task_type`."""
        assert task_type in CephadmCache.task_types
        assert isinstance(content, dict)
        with self.lock:
            target = getattr(self, task_type)
            target.update(content)
            setattr(self, task_type, target)
6817
6818
class CephadmHTTPServer(ThreadingMixIn, HTTPServer):
    """HTTPServer that handles each request on its own daemon thread.

    The exporter attaches the shared metadata cache and the auth token to
    the server instance so request handlers can reach them via self.server.
    """

    # allow fast restarts on the same port
    allow_reuse_address = True
    # worker threads must not block process exit
    daemon_threads = True

    cephadm_cache: CephadmCache
    token: str
6824
6825
class CephadmDaemonHandler(BaseHTTPRequestHandler):
    """Request handler serving the exporter's cached metadata as JSON over GET."""
    server: CephadmHTTPServer
    api_version = 'v1'
    # the only endpoints served; any other path yields a 404
    valid_routes = [
        f'/{api_version}/metadata',
        f'/{api_version}/metadata/health',
        f'/{api_version}/metadata/disks',
        f'/{api_version}/metadata/daemons',
        f'/{api_version}/metadata/host',
    ]

    class Decorators:
        @classmethod
        def authorize(cls, f):
            """Implement a basic token check.

            The token is installed at deployment time and must be provided to
            ensure we only respond to callers who know our token i.e. mgr
            """

            def wrapper(self, *args, **kwargs):
                # expects an 'Authorization: Bearer <token>' header matching
                # the token the server was deployed with
                auth = self.headers.get('Authorization', None)
                if auth != 'Bearer ' + self.server.token:
                    self.send_error(401)
                    return
                f(self, *args, **kwargs)

            return wrapper

    def _help_page(self):
        """Return a static HTML page documenting the available API endpoints."""
        return """<!DOCTYPE html>
<html>
<head><title>cephadm metadata exporter</title></head>
<style>
body {{
    font-family: sans-serif;
    font-size: 0.8em;
}}
table {{
    border-width: 0px;
    border-spacing: 0px;
    margin-left:20px;
}}
tr:hover {{
    background: PowderBlue;
}}
td,th {{
    padding: 5px;
}}
</style>
<body>
    <h1>cephadm metadata exporter {api_version}</h1>
    <table>
      <thead>
        <tr><th>Endpoint</th><th>Methods</th><th>Response</th><th>Description</th></tr>
      </thead>
        <tr><td><a href='{api_version}/metadata'>{api_version}/metadata</a></td><td>GET</td><td>JSON</td><td>Return <b>all</b> metadata for the host</td></tr>
        <tr><td><a href='{api_version}/metadata/daemons'>{api_version}/metadata/daemons</a></td><td>GET</td><td>JSON</td><td>Return daemon and systemd states for ceph daemons (ls)</td></tr>
        <tr><td><a href='{api_version}/metadata/disks'>{api_version}/metadata/disks</a></td><td>GET</td><td>JSON</td><td>show disk inventory (ceph-volume)</td></tr>
        <tr><td><a href='{api_version}/metadata/health'>{api_version}/metadata/health</a></td><td>GET</td><td>JSON</td><td>Show current health of the exporter sub-tasks</td></tr>
        <tr><td><a href='{api_version}/metadata/host'>{api_version}/metadata/host</a></td><td>GET</td><td>JSON</td><td>Show host metadata (gather-facts)</td></tr>
    </table>
</body>
</html>""".format(api_version=CephadmDaemonHandler.api_version)

    def _fetch_root(self):
        # serve the human-readable help page for the root url
        self.send_response(200)
        self.send_header('Content-type', 'text/html; charset=utf-8')
        self.end_headers()
        self.wfile.write(self._help_page().encode('utf-8'))

    @Decorators.authorize
    def do_GET(self):
        """Handle *all* GET requests"""

        if self.path == '/':
            # provide a html response if someone hits the root url, to document the
            # available api endpoints
            return self._fetch_root()
        elif self.path in CephadmDaemonHandler.valid_routes:
            u = self.path.split('/')[-1]
            data = json.dumps({})
            status_code = 200

            tasks = self.server.cephadm_cache.health.get('tasks', {})
            assert tasks

            # We're using the http status code to help indicate thread health
            # - 200 (OK): request successful
            # - 204 (No Content): access to a cache relating to a dead thread
            # - 206 (Partial content): one or more theads are inactive
            # - 500 (Server Error): all threads inactive
            if u == 'metadata':
                data = json.dumps(self.server.cephadm_cache.to_json())
                if all([tasks[task_name] == 'inactive' for task_name in tasks if task_name != 'http_server']):
                    # All the subtasks are dead!
                    status_code = 500
                elif any([tasks[task_name] == 'inactive' for task_name in tasks if task_name != 'http_server']):
                    status_code = 206

            # Individual GETs against the a tasks endpoint will also return a 204 if the corresponding thread is inactive
            elif u == 'daemons':
                data = json.dumps(self.server.cephadm_cache.daemons)
                if tasks['daemons'] == 'inactive':
                    status_code = 204
            elif u == 'disks':
                data = json.dumps(self.server.cephadm_cache.disks)
                if tasks['disks'] == 'inactive':
                    status_code = 204
            elif u == 'host':
                data = json.dumps(self.server.cephadm_cache.host)
                if tasks['host'] == 'inactive':
                    status_code = 204

            # a GET against health will always return a 200, since the op is always successful
            elif u == 'health':
                data = json.dumps(self.server.cephadm_cache.health)

            self.send_response(status_code)
            self.send_header('Content-type', 'application/json')
            self.end_headers()
            self.wfile.write(data.encode('utf-8'))
        else:
            # Invalid GET URL
            bad_request_msg = 'Valid URLs are: {}'.format(', '.join(CephadmDaemonHandler.valid_routes))
            self.send_response(404, message=bad_request_msg)  # reason
            self.send_header('Content-type', 'application/json')
            self.end_headers()
            self.wfile.write(json.dumps({'message': bad_request_msg}).encode('utf-8'))

    def log_message(self, format, *args):
        """Route request logging through the cephadm logger instead of stderr."""
        rqst = ' '.join(str(a) for a in args)
        logger.info(f'client:{self.address_string()} [{self.log_date_time_string()}] {rqst}')
6959
6960
class CephadmDaemon():
    """The cephadm-exporter daemon.

    Runs background threads that periodically scrape host facts, daemon
    state (ls) and disk inventory (ceph-volume) into a CephadmCache, and
    serves the cached data over an authenticated HTTPS endpoint.
    """

    daemon_type = 'cephadm-exporter'
    default_port = 9443
    key_name = 'key'
    crt_name = 'crt'
    token_name = 'token'
    # files that must be present in the deployment config
    config_requirements = [
        key_name,
        crt_name,
        token_name,
    ]
    # main-loop tick (seconds); scrape intervals are counted in these ticks
    loop_delay = 1
    # how often (in ticks) the run loop checks worker thread liveness
    thread_check_interval = 5

    def __init__(self, ctx: CephadmContext, fsid, daemon_id=None, port=None):
        self.ctx = ctx
        self.fsid = fsid
        self.daemon_id = daemon_id
        if not port:
            self.port = CephadmDaemon.default_port
        else:
            self.port = port
        self.workers: List[Thread] = []
        self.http_server: CephadmHTTPServer
        self.stop = False
        self.cephadm_cache = CephadmCache()
        self.errors: List[str] = []
        # the bearer token callers must present; can_run treats the value
        # 'Unknown' as "token file missing/unreadable"
        self.token = read_file([os.path.join(self.daemon_path, CephadmDaemon.token_name)])

    @classmethod
    def validate_config(cls, config):
        """Validate the exporter's deployment config (key/crt/token, optional port).

        Raises Error describing every problem found; returns None when valid.
        """
        reqs = ', '.join(CephadmDaemon.config_requirements)
        errors = []

        if not config or not all([k_name in config for k_name in CephadmDaemon.config_requirements]):
            raise Error(f'config must contain the following fields : {reqs}')

        if not all([isinstance(config[k_name], str) for k_name in CephadmDaemon.config_requirements]):
            errors.append(f'the following fields must be strings: {reqs}')

        crt = config[CephadmDaemon.crt_name]
        key = config[CephadmDaemon.key_name]
        token = config[CephadmDaemon.token_name]

        # shallow PEM sanity checks only - not a full certificate validation
        if not crt.startswith('-----BEGIN CERTIFICATE-----') or not crt.endswith('-----END CERTIFICATE-----\n'):
            errors.append('crt field is not a valid SSL certificate')
        if not key.startswith('-----BEGIN PRIVATE KEY-----') or not key.endswith('-----END PRIVATE KEY-----\n'):
            errors.append('key is not a valid SSL private key')
        if len(token) < 8:
            errors.append("'token' must be more than 8 characters long")

        if 'port' in config:
            try:
                p = int(config['port'])
                if p <= 1024:
                    # reject privileged ports
                    raise ValueError
            except (TypeError, ValueError):
                errors.append('port must be an integer > 1024')

        if errors:
            raise Error('Parameter errors : {}'.format(', '.join(errors)))

    @property
    def port_active(self):
        # True when something is already listening on our port
        return port_in_use(self.ctx, self.port)

    @property
    def can_run(self):
        """Check the preconditions for starting; populates self.errors."""
        # if port is in use
        if self.port_active:
            self.errors.append(f'TCP port {self.port} already in use, unable to bind')
        if not os.path.exists(os.path.join(self.daemon_path, CephadmDaemon.key_name)):
            self.errors.append(f"Key file '{CephadmDaemon.key_name}' is missing from {self.daemon_path}")
        if not os.path.exists(os.path.join(self.daemon_path, CephadmDaemon.crt_name)):
            self.errors.append(f"Certificate file '{CephadmDaemon.crt_name}' is missing from {self.daemon_path}")
        if self.token == 'Unknown':
            self.errors.append(f"Authentication token '{CephadmDaemon.token_name}' is missing from {self.daemon_path}")
        return len(self.errors) == 0

    @staticmethod
    def _unit_name(fsid, daemon_id):
        """Return the systemd unit file name for this exporter instance."""
        return '{}.service'.format(get_unit_name(fsid, CephadmDaemon.daemon_type, daemon_id))

    @property
    def unit_name(self):
        return CephadmDaemon._unit_name(self.fsid, self.daemon_id)

    @property
    def daemon_path(self):
        # e.g. <data_dir>/<fsid>/cephadm-exporter.<id>
        return os.path.join(
            self.ctx.data_dir,
            self.fsid,
            f'{self.daemon_type}.{self.daemon_id}'
        )

    @property
    def binary_path(self):
        # the cephadm binary itself - this file
        path = os.path.realpath(__file__)
        assert os.path.isfile(path)
        return path

    def _handle_thread_exception(self, exc, thread_type):
        """Record a scraper thread's fatal exception in the cache and log it."""
        e_msg = f'{exc.__class__.__name__} exception: {str(exc)}'
        thread_info = getattr(self.cephadm_cache, thread_type)
        errors = thread_info.get('scrape_errors', [])
        errors.append(e_msg)
        logger.error(e_msg)
        logger.exception(exc)
        self.cephadm_cache.update_task(
            thread_type,
            {
                'scrape_errors': errors,
                'data': None,
            }
        )

    def _scrape_host_facts(self, refresh_interval=10):
        """Thread target: refresh HostFacts into the cache every refresh_interval seconds."""
        ctr = 0
        exception_encountered = False

        while True:

            # stop on daemon shutdown or after an unhandled scrape exception
            if self.stop or exception_encountered:
                break

            if ctr >= refresh_interval:
                ctr = 0
                logger.debug('executing host-facts scrape')
                errors = []
                s_time = time.time()

                try:
                    facts = HostFacts(self.ctx)
                except Exception as e:
                    self._handle_thread_exception(e, 'host')
                    exception_encountered = True
                else:
                    elapsed = time.time() - s_time
                    try:
                        data = json.loads(facts.dump())
                    except json.decoder.JSONDecodeError:
                        errors.append('host-facts provided invalid JSON')
                        logger.warning(errors[-1])
                        data = {}
                    self.cephadm_cache.update_task(
                        'host',
                        {
                            'scrape_timestamp': s_time,
                            'scrape_duration_secs': elapsed,
                            'scrape_errors': errors,
                            'data': data,
                        }
                    )
                    logger.debug(f'completed host-facts scrape - {elapsed}s')

            time.sleep(CephadmDaemon.loop_delay)
            ctr += CephadmDaemon.loop_delay
        logger.info('host-facts thread stopped')

    def _scrape_ceph_volume(self, refresh_interval=15):
        """Thread target: refresh ceph-volume inventory into the cache."""
        # we're invoking the ceph_volume command, so we need to set the args that it
        # expects to use
        self.ctx.command = 'inventory --format=json'.split()
        self.ctx.fsid = self.fsid

        ctr = 0
        exception_encountered = False

        while True:
            if self.stop or exception_encountered:
                break

            if ctr >= refresh_interval:
                ctr = 0
                logger.debug('executing ceph-volume scrape')
                errors = []
                s_time = time.time()
                # command_ceph_volume prints to stdout, so capture it
                stream = io.StringIO()
                try:
                    with redirect_stdout(stream):
                        command_ceph_volume(self.ctx)
                except Exception as e:
                    self._handle_thread_exception(e, 'disks')
                    exception_encountered = True
                else:
                    elapsed = time.time() - s_time

                    # if the call to ceph-volume returns junk with the
                    # json, it won't parse
                    stdout = stream.getvalue()

                    data = []
                    if stdout:
                        try:
                            data = json.loads(stdout)
                        except json.decoder.JSONDecodeError:
                            errors.append('ceph-volume thread provided bad json data')
                            logger.warning(errors[-1])
                    else:
                        errors.append('ceph-volume did not return any data')
                        logger.warning(errors[-1])

                    self.cephadm_cache.update_task(
                        'disks',
                        {
                            'scrape_timestamp': s_time,
                            'scrape_duration_secs': elapsed,
                            'scrape_errors': errors,
                            'data': data,
                        }
                    )

                    logger.debug(f'completed ceph-volume scrape - {elapsed}s')
            time.sleep(CephadmDaemon.loop_delay)
            ctr += CephadmDaemon.loop_delay

        logger.info('ceph-volume thread stopped')

    def _scrape_list_daemons(self, refresh_interval=20):
        """Thread target: refresh the daemon list (ls) into the cache."""
        ctr = 0
        exception_encountered = False
        while True:
            if self.stop or exception_encountered:
                break

            if ctr >= refresh_interval:
                ctr = 0
                logger.debug('executing list-daemons scrape')
                errors = []
                s_time = time.time()

                try:
                    # list daemons should ideally be invoked with a fsid
                    data = list_daemons(self.ctx)
                except Exception as e:
                    self._handle_thread_exception(e, 'daemons')
                    exception_encountered = True
                else:
                    if not isinstance(data, list):
                        errors.append('list-daemons did not supply a list?')
                        logger.warning(errors[-1])
                        data = []
                    elapsed = time.time() - s_time
                    self.cephadm_cache.update_task(
                        'daemons',
                        {
                            'scrape_timestamp': s_time,
                            'scrape_duration_secs': elapsed,
                            'scrape_errors': errors,
                            'data': data,
                        }
                    )
                    logger.debug(f'completed list-daemons scrape - {elapsed}s')

            time.sleep(CephadmDaemon.loop_delay)
            ctr += CephadmDaemon.loop_delay
        logger.info('list-daemons thread stopped')

    def _create_thread(self, target, name, refresh_interval=None):
        """Start a daemonised worker thread, marking it active in the cache."""
        if refresh_interval:
            t = Thread(target=target, args=(refresh_interval,))
        else:
            t = Thread(target=target)
        t.daemon = True
        t.name = name
        self.cephadm_cache.update_health(name, 'active')
        t.start()

        start_msg = f'Started {name} thread'
        if refresh_interval:
            logger.info(f'{start_msg}, with a refresh interval of {refresh_interval}s')
        else:
            logger.info(f'{start_msg}')
        return t

    def reload(self, *args):
        """reload -HUP received

        This is a placeholder function only, and serves to provide the hook that could
        be exploited later if the exporter evolves to incorporate a config file
        """
        logger.info('Reload request received - ignoring, no action needed')

    def shutdown(self, *args):
        """Signal handler: stop the worker loops and the http server."""
        logger.info('Shutdown request received')
        self.stop = True
        self.http_server.shutdown()

    def run(self):
        """Main entry point: start the scraper threads and serve HTTPS until stopped."""
        logger.info(f"cephadm exporter starting for FSID '{self.fsid}'")
        if not self.can_run:
            logger.error('Unable to start the exporter daemon')
            for e in self.errors:
                logger.error(e)
            return

        # register signal handlers for running under systemd control
        signal.signal(signal.SIGTERM, self.shutdown)
        signal.signal(signal.SIGINT, self.shutdown)
        signal.signal(signal.SIGHUP, self.reload)
        logger.debug('Signal handlers attached')

        host_facts = self._create_thread(self._scrape_host_facts, 'host', 5)
        self.workers.append(host_facts)

        daemons = self._create_thread(self._scrape_list_daemons, 'daemons', 20)
        self.workers.append(daemons)

        disks = self._create_thread(self._scrape_ceph_volume, 'disks', 20)
        self.workers.append(disks)

        self.http_server = CephadmHTTPServer(('0.0.0.0', self.port), CephadmDaemonHandler)  # IPv4 only
        self.http_server.socket = ssl.wrap_socket(self.http_server.socket,
                                                  keyfile=os.path.join(self.daemon_path, CephadmDaemon.key_name),
                                                  certfile=os.path.join(self.daemon_path, CephadmDaemon.crt_name),
                                                  server_side=True)

        self.http_server.cephadm_cache = self.cephadm_cache
        self.http_server.token = self.token
        server_thread = self._create_thread(self.http_server.serve_forever, 'http_server')
        logger.info(f'https server listening on {self.http_server.server_address[0]}:{self.http_server.server_port}')

        # watchdog loop: mark any dead worker thread inactive in the cache
        ctr = 0
        while server_thread.is_alive():
            if self.stop:
                break

            if ctr >= CephadmDaemon.thread_check_interval:
                ctr = 0
                for worker in self.workers:
                    if self.cephadm_cache.tasks[worker.name] == 'inactive':
                        continue
                    if not worker.is_alive():
                        logger.warning(f'{worker.name} thread not running')
                        stop_time = datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S')
                        self.cephadm_cache.update_health(worker.name, 'inactive', f'{worker.name} stopped at {stop_time}')

            time.sleep(CephadmDaemon.loop_delay)
            ctr += CephadmDaemon.loop_delay

        logger.info('Main http server thread stopped')

    @property
    def unit_run(self):
        """Shell snippet written to unit.run; launches this script as the exporter."""

        return """set -e
{py3} {bin_path} exporter --fsid {fsid} --id {daemon_id} --port {port} &""".format(
            py3=shutil.which('python3'),
            bin_path=self.binary_path,
            fsid=self.fsid,
            daemon_id=self.daemon_id,
            port=self.port
        )

    @property
    def unit_file(self):
        """Systemd unit definition for the exporter service."""
        docker = isinstance(self.ctx.container_engine, Docker)
        return """#generated by cephadm
[Unit]
Description=cephadm exporter service for cluster {fsid}
After=network-online.target{docker_after}
Wants=network-online.target
{docker_requires}

PartOf=ceph-{fsid}.target
Before=ceph-{fsid}.target

[Service]
Type=forking
ExecStart=/bin/bash {daemon_path}/unit.run
ExecReload=/bin/kill -HUP $MAINPID
Restart=on-failure
RestartSec=10s

[Install]
WantedBy=ceph-{fsid}.target
""".format(fsid=self.fsid,
           daemon_path=self.daemon_path,
           # if docker, we depend on docker.service
           docker_after=' docker.service' if docker else '',
           docker_requires='Requires=docker.service\n' if docker else '')

    def deploy_daemon_unit(self, config=None):
        """deploy a specific unit file for cephadm

        The normal deploy_daemon_units doesn't apply for this
        daemon since it's not a container, so we just create a
        simple service definition and add it to the fsid's target
        """
        if not config:
            raise Error('Attempting to deploy cephadm daemon without a config')
        assert isinstance(config, dict)

        # Create the required config files in the daemons dir, with restricted permissions
        for filename in config:
            with open(os.open(os.path.join(self.daemon_path, filename), os.O_CREAT | os.O_WRONLY, mode=0o600), 'w') as f:
                f.write(config[filename])

        # When __file__ is <stdin> we're being invoked over remoto via the orchestrator, so
        # we pick up the file from where the orchestrator placed it - otherwise we'll
        # copy it to the binary location for this cluster
        if not __file__ == '<stdin>':
            shutil.copy(__file__,
                        self.binary_path)

        with open(os.path.join(self.daemon_path, 'unit.run'), 'w') as f:
            f.write(self.unit_run)

        # write the unit file atomically: new file first, then rename into place
        with open(
            os.path.join(self.ctx.unit_dir,
                         f'{self.unit_name}.new'),
            'w'
        ) as f:
            f.write(self.unit_file)
            os.rename(
                os.path.join(self.ctx.unit_dir, f'{self.unit_name}.new'),
                os.path.join(self.ctx.unit_dir, self.unit_name))

        call_throws(self.ctx, ['systemctl', 'daemon-reload'])
        call(self.ctx, ['systemctl', 'stop', self.unit_name],
             verbosity=CallVerbosity.DEBUG)
        call(self.ctx, ['systemctl', 'reset-failed', self.unit_name],
             verbosity=CallVerbosity.DEBUG)
        call_throws(self.ctx, ['systemctl', 'enable', '--now', self.unit_name])

    @classmethod
    def uninstall(cls, ctx: CephadmContext, fsid, daemon_type, daemon_id):
        """Remove the exporter's systemd unit and close its firewall port."""
        unit_name = CephadmDaemon._unit_name(fsid, daemon_id)
        unit_path = os.path.join(ctx.unit_dir, unit_name)
        unit_run = os.path.join(ctx.data_dir, fsid, f'{daemon_type}.{daemon_id}', 'unit.run')
        port = None
        try:
            with open(unit_run, 'r') as u:
                contents = u.read().strip(' &')
        except OSError:
            logger.warning(f'Unable to access the unit.run file @ {unit_run}')
            return

        # recover the port from the unit.run file so the firewall rule can be removed
        port = None
        for line in contents.split('\n'):
            if '--port ' in line:
                try:
                    port = int(line.split('--port ')[-1])
                except ValueError:
                    logger.warning('Unexpected format in unit.run file: port is not numeric')
                    logger.warning('Unable to remove the systemd file and close the port')
                    return
                break

        if port:
            fw = Firewalld(ctx)
            try:
                fw.close_ports([port])
            except RuntimeError:
                logger.error(f'Unable to close port {port}')

        stdout, stderr, rc = call(ctx, ['rm', '-f', unit_path])
        if rc:
            logger.error(f'Unable to remove the systemd file @ {unit_path}')
        else:
            logger.info(f'removed systemd unit file @ {unit_path}')
            stdout, stderr, rc = call(ctx, ['systemctl', 'daemon-reload'])
7424
7425
def command_exporter(ctx: CephadmContext):
    """Run the cephadm-exporter daemon for the requested cluster."""
    daemon = CephadmDaemon(ctx, ctx.fsid, daemon_id=ctx.id, port=ctx.port)

    # the fsid must correspond to an existing cluster directory on this host
    known_clusters = os.listdir(ctx.data_dir)
    if ctx.fsid not in known_clusters:
        raise Error(f"cluster fsid '{ctx.fsid}' not found in '{ctx.data_dir}'")

    daemon.run()
7433
7434 ##################################
7435
7436
def systemd_target_state(target_name: str, subsystem: str = 'ceph') -> bool:
    # TODO: UNITTEST
    # A target counts as enabled when its wants-symlink exists under the
    # systemd unit directory for the given subsystem.
    wants_dir = os.path.join(UNIT_DIR, f'{subsystem}.target.wants')
    return os.path.exists(os.path.join(wants_dir, target_name))
7446
7447
@infer_fsid
def command_maintenance(ctx: CephadmContext):
    """Enter or exit host maintenance by disabling/enabling the cluster target.

    Entering maintenance disables and stops ceph-<fsid>.target; exiting
    enables and starts it. Returns a short status string describing the
    outcome, which the caller relays to the user.
    """
    if not ctx.fsid:
        raise Error('must pass --fsid to specify cluster')

    target = f'ceph-{ctx.fsid}.target'

    if ctx.maintenance_action.lower() == 'enter':
        logger.info('Requested to place host into maintenance')
        if systemd_target_state(target):
            _out, _err, code = call(ctx,
                                    ['systemctl', 'disable', target],
                                    verbosity=CallVerbosity.DEBUG)
            if code:
                logger.error(f'Failed to disable the {target} target')
                return 'failed - to disable the target'
            else:
                # stopping a target waits by default
                _out, _err, code = call(ctx,
                                        ['systemctl', 'stop', target],
                                        verbosity=CallVerbosity.DEBUG)
                if code:
                    logger.error(f'Failed to stop the {target} target')
                    # bugfix: this failure is in the *stop* step (the disable
                    # already succeeded), so report the stop - not the disable
                    return 'failed - to stop the target'
                else:
                    return f'success - systemd target {target} disabled'

        else:
            return 'skipped - target already disabled'

    else:
        logger.info('Requested to exit maintenance state')
        # exit maintenance request
        if not systemd_target_state(target):
            _out, _err, code = call(ctx,
                                    ['systemctl', 'enable', target],
                                    verbosity=CallVerbosity.DEBUG)
            if code:
                logger.error(f'Failed to enable the {target} target')
                return 'failed - unable to enable the target'
            else:
                # starting a target waits by default
                _out, _err, code = call(ctx,
                                        ['systemctl', 'start', target],
                                        verbosity=CallVerbosity.DEBUG)
                if code:
                    logger.error(f'Failed to start the {target} target')
                    return 'failed - unable to start the target'
                else:
                    return f'success - systemd target {target} enabled and started'
7498
7499 ##################################
7500
7501
7502 def _get_parser():
7503 # type: () -> argparse.ArgumentParser
7504 parser = argparse.ArgumentParser(
7505 description='Bootstrap Ceph daemons with systemd and containers.',
7506 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
7507 parser.add_argument(
7508 '--image',
7509 help='container image. Can also be set via the "CEPHADM_IMAGE" '
7510 'env var')
7511 parser.add_argument(
7512 '--docker',
7513 action='store_true',
7514 help='use docker instead of podman')
7515 parser.add_argument(
7516 '--data-dir',
7517 default=DATA_DIR,
7518 help='base directory for daemon data')
7519 parser.add_argument(
7520 '--log-dir',
7521 default=LOG_DIR,
7522 help='base directory for daemon logs')
7523 parser.add_argument(
7524 '--logrotate-dir',
7525 default=LOGROTATE_DIR,
7526 help='location of logrotate configuration files')
7527 parser.add_argument(
7528 '--sysctl-dir',
7529 default=SYSCTL_DIR,
7530 help='location of sysctl configuration files')
7531 parser.add_argument(
7532 '--unit-dir',
7533 default=UNIT_DIR,
7534 help='base directory for systemd units')
7535 parser.add_argument(
7536 '--verbose', '-v',
7537 action='store_true',
7538 help='Show debug-level log messages')
7539 parser.add_argument(
7540 '--timeout',
7541 type=int,
7542 default=DEFAULT_TIMEOUT,
7543 help='timeout in seconds')
7544 parser.add_argument(
7545 '--retry',
7546 type=int,
7547 default=DEFAULT_RETRY,
7548 help='max number of retries')
7549 parser.add_argument(
7550 '--env', '-e',
7551 action='append',
7552 default=[],
7553 help='set environment variable')
7554 parser.add_argument(
7555 '--no-container-init',
7556 action='store_true',
7557 default=not CONTAINER_INIT,
7558 help='Do not run podman/docker with `--init`')
7559
7560 subparsers = parser.add_subparsers(help='sub-command')
7561
7562 parser_version = subparsers.add_parser(
7563 'version', help='get ceph version from container')
7564 parser_version.set_defaults(func=command_version)
7565
7566 parser_pull = subparsers.add_parser(
7567 'pull', help='pull latest image version')
7568 parser_pull.set_defaults(func=command_pull)
7569
7570 parser_inspect_image = subparsers.add_parser(
7571 'inspect-image', help='inspect local container image')
7572 parser_inspect_image.set_defaults(func=command_inspect_image)
7573
7574 parser_ls = subparsers.add_parser(
7575 'ls', help='list daemon instances on this host')
7576 parser_ls.set_defaults(func=command_ls)
7577 parser_ls.add_argument(
7578 '--no-detail',
7579 action='store_true',
7580 help='Do not include daemon status')
7581 parser_ls.add_argument(
7582 '--legacy-dir',
7583 default='/',
7584 help='base directory for legacy daemon data')
7585
7586 parser_list_networks = subparsers.add_parser(
7587 'list-networks', help='list IP networks')
7588 parser_list_networks.set_defaults(func=command_list_networks)
7589
7590 parser_adopt = subparsers.add_parser(
7591 'adopt', help='adopt daemon deployed with a different tool')
7592 parser_adopt.set_defaults(func=command_adopt)
7593 parser_adopt.add_argument(
7594 '--name', '-n',
7595 required=True,
7596 help='daemon name (type.id)')
7597 parser_adopt.add_argument(
7598 '--style',
7599 required=True,
7600 help='deployment style (legacy, ...)')
7601 parser_adopt.add_argument(
7602 '--cluster',
7603 default='ceph',
7604 help='cluster name')
7605 parser_adopt.add_argument(
7606 '--legacy-dir',
7607 default='/',
7608 help='base directory for legacy daemon data')
7609 parser_adopt.add_argument(
7610 '--config-json',
7611 help='Additional configuration information in JSON format')
7612 parser_adopt.add_argument(
7613 '--skip-firewalld',
7614 action='store_true',
7615 help='Do not configure firewalld')
7616 parser_adopt.add_argument(
7617 '--skip-pull',
7618 action='store_true',
7619 help='do not pull the latest image before adopting')
7620 parser_adopt.add_argument(
7621 '--force-start',
7622 action='store_true',
7623 help='start newly adoped daemon, even if it was not running previously')
7624 parser_adopt.add_argument(
7625 '--container-init',
7626 action='store_true',
7627 default=CONTAINER_INIT,
7628 help=argparse.SUPPRESS)
7629
7630 parser_rm_daemon = subparsers.add_parser(
7631 'rm-daemon', help='remove daemon instance')
7632 parser_rm_daemon.set_defaults(func=command_rm_daemon)
7633 parser_rm_daemon.add_argument(
7634 '--name', '-n',
7635 required=True,
7636 action=CustomValidation,
7637 help='daemon name (type.id)')
7638 parser_rm_daemon.add_argument(
7639 '--fsid',
7640 required=True,
7641 help='cluster FSID')
7642 parser_rm_daemon.add_argument(
7643 '--force',
7644 action='store_true',
7645 help='proceed, even though this may destroy valuable data')
7646 parser_rm_daemon.add_argument(
7647 '--force-delete-data',
7648 action='store_true',
7649 help='delete valuable daemon data instead of making a backup')
7650
7651 parser_rm_cluster = subparsers.add_parser(
7652 'rm-cluster', help='remove all daemons for a cluster')
7653 parser_rm_cluster.set_defaults(func=command_rm_cluster)
7654 parser_rm_cluster.add_argument(
7655 '--fsid',
7656 required=True,
7657 help='cluster FSID')
7658 parser_rm_cluster.add_argument(
7659 '--force',
7660 action='store_true',
7661 help='proceed, even though this may destroy valuable data')
7662 parser_rm_cluster.add_argument(
7663 '--keep-logs',
7664 action='store_true',
7665 help='do not remove log files')
7666 parser_rm_cluster.add_argument(
7667 '--zap-osds',
7668 action='store_true',
7669 help='zap OSD devices for this cluster')
7670
7671 parser_run = subparsers.add_parser(
7672 'run', help='run a ceph daemon, in a container, in the foreground')
7673 parser_run.set_defaults(func=command_run)
7674 parser_run.add_argument(
7675 '--name', '-n',
7676 required=True,
7677 help='daemon name (type.id)')
7678 parser_run.add_argument(
7679 '--fsid',
7680 required=True,
7681 help='cluster FSID')
7682
7683 parser_shell = subparsers.add_parser(
7684 'shell', help='run an interactive shell inside a daemon container')
7685 parser_shell.set_defaults(func=command_shell)
7686 parser_shell.add_argument(
7687 '--fsid',
7688 help='cluster FSID')
7689 parser_shell.add_argument(
7690 '--name', '-n',
7691 help='daemon name (type.id)')
7692 parser_shell.add_argument(
7693 '--config', '-c',
7694 help='ceph.conf to pass through to the container')
7695 parser_shell.add_argument(
7696 '--keyring', '-k',
7697 help='ceph.keyring to pass through to the container')
7698 parser_shell.add_argument(
7699 '--mount', '-m',
7700 help=('mount a file or directory in the container. '
7701 'Support multiple mounts. '
7702 'ie: `--mount /foo /bar:/bar`. '
7703 'When no destination is passed, default is /mnt'),
7704 nargs='+')
7705 parser_shell.add_argument(
7706 '--env', '-e',
7707 action='append',
7708 default=[],
7709 help='set environment variable')
7710 parser_shell.add_argument(
7711 '--volume', '-v',
7712 action='append',
7713 default=[],
7714 help='set environment variable')
7715 parser_shell.add_argument(
7716 'command', nargs=argparse.REMAINDER,
7717 help='command (optional)')
7718 parser_shell.add_argument(
7719 '--no-hosts',
7720 action='store_true',
7721 help='dont pass /etc/hosts through to the container')
7722
7723 parser_enter = subparsers.add_parser(
7724 'enter', help='run an interactive shell inside a running daemon container')
7725 parser_enter.set_defaults(func=command_enter)
7726 parser_enter.add_argument(
7727 '--fsid',
7728 help='cluster FSID')
7729 parser_enter.add_argument(
7730 '--name', '-n',
7731 required=True,
7732 help='daemon name (type.id)')
7733 parser_enter.add_argument(
7734 'command', nargs=argparse.REMAINDER,
7735 help='command')
7736
7737 parser_ceph_volume = subparsers.add_parser(
7738 'ceph-volume', help='run ceph-volume inside a container')
7739 parser_ceph_volume.set_defaults(func=command_ceph_volume)
7740 parser_ceph_volume.add_argument(
7741 '--fsid',
7742 help='cluster FSID')
7743 parser_ceph_volume.add_argument(
7744 '--config-json',
7745 help='JSON file with config and (client.bootrap-osd) key')
7746 parser_ceph_volume.add_argument(
7747 '--config', '-c',
7748 help='ceph conf file')
7749 parser_ceph_volume.add_argument(
7750 '--keyring', '-k',
7751 help='ceph.keyring to pass through to the container')
7752 parser_ceph_volume.add_argument(
7753 'command', nargs=argparse.REMAINDER,
7754 help='command')
7755
7756 parser_zap_osds = subparsers.add_parser(
7757 'zap-osds', help='zap all OSDs associated with a particular fsid')
7758 parser_zap_osds.set_defaults(func=command_zap_osds)
7759 parser_zap_osds.add_argument(
7760 '--fsid',
7761 required=True,
7762 help='cluster FSID')
7763 parser_zap_osds.add_argument(
7764 '--force',
7765 action='store_true',
7766 help='proceed, even though this may destroy valuable data')
7767
7768 parser_unit = subparsers.add_parser(
7769 'unit', help="operate on the daemon's systemd unit")
7770 parser_unit.set_defaults(func=command_unit)
7771 parser_unit.add_argument(
7772 'command',
7773 help='systemd command (start, stop, restart, enable, disable, ...)')
7774 parser_unit.add_argument(
7775 '--fsid',
7776 help='cluster FSID')
7777 parser_unit.add_argument(
7778 '--name', '-n',
7779 required=True,
7780 help='daemon name (type.id)')
7781
7782 parser_logs = subparsers.add_parser(
7783 'logs', help='print journald logs for a daemon container')
7784 parser_logs.set_defaults(func=command_logs)
7785 parser_logs.add_argument(
7786 '--fsid',
7787 help='cluster FSID')
7788 parser_logs.add_argument(
7789 '--name', '-n',
7790 required=True,
7791 help='daemon name (type.id)')
7792 parser_logs.add_argument(
7793 'command', nargs='*',
7794 help='additional journalctl args')
7795
7796 parser_bootstrap = subparsers.add_parser(
7797 'bootstrap', help='bootstrap a cluster (mon + mgr daemons)')
7798 parser_bootstrap.set_defaults(func=command_bootstrap)
7799 parser_bootstrap.add_argument(
7800 '--config', '-c',
7801 help='ceph conf file to incorporate')
7802 parser_bootstrap.add_argument(
7803 '--mon-id',
7804 required=False,
7805 help='mon id (default: local hostname)')
7806 parser_bootstrap.add_argument(
7807 '--mon-addrv',
7808 help='mon IPs (e.g., [v2:localipaddr:3300,v1:localipaddr:6789])')
7809 parser_bootstrap.add_argument(
7810 '--mon-ip',
7811 help='mon IP')
7812 parser_bootstrap.add_argument(
7813 '--mgr-id',
7814 required=False,
7815 help='mgr id (default: randomly generated)')
7816 parser_bootstrap.add_argument(
7817 '--fsid',
7818 help='cluster FSID')
7819 parser_bootstrap.add_argument(
7820 '--output-dir',
7821 default='/etc/ceph',
7822 help='directory to write config, keyring, and pub key files')
7823 parser_bootstrap.add_argument(
7824 '--output-keyring',
7825 help='location to write keyring file with new cluster admin and mon keys')
7826 parser_bootstrap.add_argument(
7827 '--output-config',
7828 help='location to write conf file to connect to new cluster')
7829 parser_bootstrap.add_argument(
7830 '--output-pub-ssh-key',
7831 help="location to write the cluster's public SSH key")
7832 parser_bootstrap.add_argument(
7833 '--skip-admin-label',
7834 action='store_true',
7835 help='do not create admin label for ceph.conf and client.admin keyring distribution')
7836 parser_bootstrap.add_argument(
7837 '--skip-ssh',
7838 action='store_true',
7839 help='skip setup of ssh key on local host')
7840 parser_bootstrap.add_argument(
7841 '--initial-dashboard-user',
7842 default='admin',
7843 help='Initial user for the dashboard')
7844 parser_bootstrap.add_argument(
7845 '--initial-dashboard-password',
7846 help='Initial password for the initial dashboard user')
7847 parser_bootstrap.add_argument(
7848 '--ssl-dashboard-port',
7849 type=int,
7850 default=8443,
7851 help='Port number used to connect with dashboard using SSL')
7852 parser_bootstrap.add_argument(
7853 '--dashboard-key',
7854 type=argparse.FileType('r'),
7855 help='Dashboard key')
7856 parser_bootstrap.add_argument(
7857 '--dashboard-crt',
7858 type=argparse.FileType('r'),
7859 help='Dashboard certificate')
7860
7861 parser_bootstrap.add_argument(
7862 '--ssh-config',
7863 type=argparse.FileType('r'),
7864 help='SSH config')
7865 parser_bootstrap.add_argument(
7866 '--ssh-private-key',
7867 type=argparse.FileType('r'),
7868 help='SSH private key')
7869 parser_bootstrap.add_argument(
7870 '--ssh-public-key',
7871 type=argparse.FileType('r'),
7872 help='SSH public key')
7873 parser_bootstrap.add_argument(
7874 '--ssh-user',
7875 default='root',
7876 help='set user for SSHing to cluster hosts, passwordless sudo will be needed for non-root users')
7877
7878 parser_bootstrap.add_argument(
7879 '--skip-mon-network',
7880 action='store_true',
7881 help='set mon public_network based on bootstrap mon ip')
7882 parser_bootstrap.add_argument(
7883 '--skip-dashboard',
7884 action='store_true',
7885 help='do not enable the Ceph Dashboard')
7886 parser_bootstrap.add_argument(
7887 '--dashboard-password-noupdate',
7888 action='store_true',
7889 help='stop forced dashboard password change')
7890 parser_bootstrap.add_argument(
7891 '--no-minimize-config',
7892 action='store_true',
7893 help='do not assimilate and minimize the config file')
7894 parser_bootstrap.add_argument(
7895 '--skip-ping-check',
7896 action='store_true',
7897 help='do not verify that mon IP is pingable')
7898 parser_bootstrap.add_argument(
7899 '--skip-pull',
7900 action='store_true',
7901 help='do not pull the latest image before bootstrapping')
7902 parser_bootstrap.add_argument(
7903 '--skip-firewalld',
7904 action='store_true',
7905 help='Do not configure firewalld')
7906 parser_bootstrap.add_argument(
7907 '--allow-overwrite',
7908 action='store_true',
7909 help='allow overwrite of existing --output-* config/keyring/ssh files')
7910 parser_bootstrap.add_argument(
7911 '--allow-fqdn-hostname',
7912 action='store_true',
7913 help='allow hostname that is fully-qualified (contains ".")')
7914 parser_bootstrap.add_argument(
7915 '--allow-mismatched-release',
7916 action='store_true',
7917 help="allow bootstrap of ceph that doesn't match this version of cephadm")
7918 parser_bootstrap.add_argument(
7919 '--skip-prepare-host',
7920 action='store_true',
7921 help='Do not prepare host')
7922 parser_bootstrap.add_argument(
7923 '--orphan-initial-daemons',
7924 action='store_true',
7925 help='Set mon and mgr service to `unmanaged`, Do not create the crash service')
7926 parser_bootstrap.add_argument(
7927 '--skip-monitoring-stack',
7928 action='store_true',
7929 help='Do not automatically provision monitoring stack (prometheus, grafana, alertmanager, node-exporter)')
7930 parser_bootstrap.add_argument(
7931 '--apply-spec',
7932 help='Apply cluster spec after bootstrap (copy ssh key, add hosts and apply services)')
7933
7934 parser_bootstrap.add_argument(
7935 '--shared_ceph_folder',
7936 metavar='CEPH_SOURCE_FOLDER',
7937 help='Development mode. Several folders in containers are volumes mapped to different sub-folders in the ceph source folder')
7938
7939 parser_bootstrap.add_argument(
7940 '--registry-url',
7941 help='url for custom registry')
7942 parser_bootstrap.add_argument(
7943 '--registry-username',
7944 help='username for custom registry')
7945 parser_bootstrap.add_argument(
7946 '--registry-password',
7947 help='password for custom registry')
7948 parser_bootstrap.add_argument(
7949 '--registry-json',
7950 help='json file with custom registry login info (URL, Username, Password)')
7951 parser_bootstrap.add_argument(
7952 '--container-init',
7953 action='store_true',
7954 default=CONTAINER_INIT,
7955 help=argparse.SUPPRESS)
7956 parser_bootstrap.add_argument(
7957 '--with-exporter',
7958 action='store_true',
7959 help='Automatically deploy cephadm metadata exporter to each node')
7960 parser_bootstrap.add_argument(
7961 '--exporter-config',
7962 action=CustomValidation,
7963 help=f'Exporter configuration information in JSON format (providing: {", ".join(CephadmDaemon.config_requirements)}, port information)')
7964 parser_bootstrap.add_argument(
7965 '--cluster-network',
7966 help='subnet to use for cluster replication, recovery and heartbeats (in CIDR notation network/mask)')
7967 parser_bootstrap.add_argument(
7968 '--single-host-defaults',
7969 action='store_true',
7970 help='adjust configuration defaults to suit a single-host cluster')
7971
7972 parser_deploy = subparsers.add_parser(
7973 'deploy', help='deploy a daemon')
7974 parser_deploy.set_defaults(func=command_deploy)
7975 parser_deploy.add_argument(
7976 '--name',
7977 required=True,
7978 action=CustomValidation,
7979 help='daemon name (type.id)')
7980 parser_deploy.add_argument(
7981 '--fsid',
7982 required=True,
7983 help='cluster FSID')
7984 parser_deploy.add_argument(
7985 '--config', '-c',
7986 help='config file for new daemon')
7987 parser_deploy.add_argument(
7988 '--config-json',
7989 help='Additional configuration information in JSON format')
7990 parser_deploy.add_argument(
7991 '--keyring',
7992 help='keyring for new daemon')
7993 parser_deploy.add_argument(
7994 '--key',
7995 help='key for new daemon')
7996 parser_deploy.add_argument(
7997 '--osd-fsid',
7998 help='OSD uuid, if creating an OSD container')
7999 parser_deploy.add_argument(
8000 '--skip-firewalld',
8001 action='store_true',
8002 help='Do not configure firewalld')
8003 parser_deploy.add_argument(
8004 '--tcp-ports',
8005 help='List of tcp ports to open in the host firewall')
8006 parser_deploy.add_argument(
8007 '--reconfig',
8008 action='store_true',
8009 help='Reconfigure a previously deployed daemon')
8010 parser_deploy.add_argument(
8011 '--allow-ptrace',
8012 action='store_true',
8013 help='Allow SYS_PTRACE on daemon container')
8014 parser_deploy.add_argument(
8015 '--container-init',
8016 action='store_true',
8017 default=CONTAINER_INIT,
8018 help=argparse.SUPPRESS)
8019 parser_deploy.add_argument(
8020 '--memory-request',
8021 help='Container memory request/target'
8022 )
8023 parser_deploy.add_argument(
8024 '--memory-limit',
8025 help='Container memory hard limit'
8026 )
8027 parser_deploy.add_argument(
8028 '--meta-json',
8029 help='JSON dict of additional metadata'
8030 )
8031
8032 parser_check_host = subparsers.add_parser(
8033 'check-host', help='check host configuration')
8034 parser_check_host.set_defaults(func=command_check_host)
8035 parser_check_host.add_argument(
8036 '--expect-hostname',
8037 help='Check that hostname matches an expected value')
8038
8039 parser_prepare_host = subparsers.add_parser(
8040 'prepare-host', help='prepare a host for cephadm use')
8041 parser_prepare_host.set_defaults(func=command_prepare_host)
8042 parser_prepare_host.add_argument(
8043 '--expect-hostname',
8044 help='Set hostname')
8045
8046 parser_add_repo = subparsers.add_parser(
8047 'add-repo', help='configure package repository')
8048 parser_add_repo.set_defaults(func=command_add_repo)
8049 parser_add_repo.add_argument(
8050 '--release',
8051 help='use latest version of a named release (e.g., {})'.format(LATEST_STABLE_RELEASE))
8052 parser_add_repo.add_argument(
8053 '--version',
8054 help='use specific upstream version (x.y.z)')
8055 parser_add_repo.add_argument(
8056 '--dev',
8057 help='use specified bleeding edge build from git branch or tag')
8058 parser_add_repo.add_argument(
8059 '--dev-commit',
8060 help='use specified bleeding edge build from git commit')
8061 parser_add_repo.add_argument(
8062 '--gpg-url',
8063 help='specify alternative GPG key location')
8064 parser_add_repo.add_argument(
8065 '--repo-url',
8066 default='https://download.ceph.com',
8067 help='specify alternative repo location')
8068 # TODO: proxy?
8069
8070 parser_rm_repo = subparsers.add_parser(
8071 'rm-repo', help='remove package repository configuration')
8072 parser_rm_repo.set_defaults(func=command_rm_repo)
8073
8074 parser_install = subparsers.add_parser(
8075 'install', help='install ceph package(s)')
8076 parser_install.set_defaults(func=command_install)
8077 parser_install.add_argument(
8078 'packages', nargs='*',
8079 default=['cephadm'],
8080 help='packages')
8081
8082 parser_registry_login = subparsers.add_parser(
8083 'registry-login', help='log host into authenticated registry')
8084 parser_registry_login.set_defaults(func=command_registry_login)
8085 parser_registry_login.add_argument(
8086 '--registry-url',
8087 help='url for custom registry')
8088 parser_registry_login.add_argument(
8089 '--registry-username',
8090 help='username for custom registry')
8091 parser_registry_login.add_argument(
8092 '--registry-password',
8093 help='password for custom registry')
8094 parser_registry_login.add_argument(
8095 '--registry-json',
8096 help='json file with custom registry login info (URL, Username, Password)')
8097 parser_registry_login.add_argument(
8098 '--fsid',
8099 help='cluster FSID')
8100
8101 parser_gather_facts = subparsers.add_parser(
8102 'gather-facts', help='gather and return host related information (JSON format)')
8103 parser_gather_facts.set_defaults(func=command_gather_facts)
8104
8105 parser_exporter = subparsers.add_parser(
8106 'exporter', help='Start cephadm in exporter mode (web service), providing host/daemon/disk metadata')
8107 parser_exporter.add_argument(
8108 '--fsid',
8109 required=True,
8110 type=str,
8111 help='fsid of the cephadm exporter to run against')
8112 parser_exporter.add_argument(
8113 '--port',
8114 type=int,
8115 default=int(CephadmDaemon.default_port),
8116 help='port number for the cephadm exporter service')
8117 parser_exporter.add_argument(
8118 '--id',
8119 type=str,
8120 default=get_hostname().split('.')[0],
8121 help='daemon identifer for the exporter')
8122 parser_exporter.set_defaults(func=command_exporter)
8123
8124 parser_maintenance = subparsers.add_parser(
8125 'host-maintenance', help='Manage the maintenance state of a host')
8126 parser_maintenance.add_argument(
8127 '--fsid',
8128 help='cluster FSID')
8129 parser_maintenance.add_argument(
8130 'maintenance_action',
8131 type=str,
8132 choices=['enter', 'exit'],
8133 help='Maintenance action - enter maintenance, or exit maintenance')
8134 parser_maintenance.set_defaults(func=command_maintenance)
8135
8136 return parser
8137
8138
def _parse_args(av):
    """Parse the cephadm argument vector *av* into a Namespace.

    Besides plain argparse parsing this strips a leading '--' separator
    from REMAINDER-style subcommands and reconciles the deprecated
    ``--container-init`` flag with ``--no-container-init`` so exactly one
    of the pair is True on the returned namespace.
    """
    parser = _get_parser()
    args = parser.parse_args(av)

    # argparse.REMAINDER keeps an explicit '--' separator; drop it so the
    # wrapped command starts at args.command[0].
    if 'command' in args and args.command and args.command[0] == '--':
        del args.command[0]

    # workaround argparse to deprecate the subparser `--container-init` flag:
    # container_init and no_container_init must always be mutually exclusive
    init_flag, no_init_flag = '--container-init', '--no-container-init'
    if init_flag in av and no_init_flag in av:
        # parser.error() prints the message and exits with status 2
        parser.error('argument %s: not allowed with argument %s'
                     % (init_flag, no_init_flag))
    if init_flag in av:
        args.no_container_init = not args.container_init
    else:
        args.container_init = not args.no_container_init
    assert args.container_init is not args.no_container_init

    return args
8158
8159
def cephadm_init_ctx(args: List[str]) -> CephadmContext:
    """Build a CephadmContext populated from the parsed argument list."""
    context = CephadmContext()
    context.set_args(_parse_args(args))
    return context
8164
8165
def cephadm_init(args: List[str]) -> CephadmContext:
    """Create the context for this invocation and configure logging.

    Parses *args* into a CephadmContext, sets up the root logger from
    ``logging_config`` (creating LOG_DIR on first use and installing a
    logrotate policy for cephadm.log), and raises the console handler to
    DEBUG when ``--verbose`` was given.

    :param args: command-line argument vector (sys.argv[1:])
    :return: the initialized CephadmContext
    """
    global logger
    ctx = cephadm_init_ctx(args)

    # Logger configuration.  exist_ok=True avoids the TOCTOU race between
    # an existence check and the directory creation (and is a no-op when
    # the directory is already there).
    os.makedirs(LOG_DIR, exist_ok=True)
    dictConfig(logging_config)
    logger = logging.getLogger()

    # Install a logrotate policy for cephadm.log once; an existing file is
    # left untouched so local edits survive.
    if not os.path.exists(ctx.logrotate_dir + '/cephadm'):
        with open(ctx.logrotate_dir + '/cephadm', 'w') as f:
            f.write("""# created by cephadm
/var/log/ceph/cephadm.log {
    rotate 7
    daily
    compress
    missingok
    notifempty
}
""")

    if ctx.verbose:
        # bump only the console handler to DEBUG; file logging is untouched
        for handler in logger.handlers:
            if handler.name == 'console':
                handler.setLevel(logging.DEBUG)

    return ctx
8194
8195
def main():
    """Entry point: initialize the context and dispatch the subcommand.

    Exits non-zero when not running as root, when no subcommand was
    given, or when the handler raises Error (re-raised under --verbose);
    otherwise exits with the handler's return code (0 if falsy).
    """
    # cephadm needs full host access (containers, /var/lib/ceph, systemd)
    if os.geteuid() != 0:
        sys.stderr.write('ERROR: cephadm should be run as root\n')
        sys.exit(1)

    av: List[str] = sys.argv[1:]

    ctx = cephadm_init(av)
    if not ctx.has_function():
        sys.stderr.write('No command specified; pass -h or --help for usage\n')
        sys.exit(1)

    try:
        # figure out the container engine on this host (podman or docker);
        # a few host-prep subcommands are allowed to run without one
        ctx.container_engine = find_container_engine(ctx)
        no_engine_needed = (command_check_host, command_prepare_host,
                            command_add_repo, command_install)
        if ctx.func not in no_engine_needed:
            check_container_engine(ctx)
        # dispatch to the selected subcommand handler
        r = ctx.func(ctx)
    except Error as e:
        if ctx.verbose:
            raise
        logger.error('ERROR: %s' % e)
        sys.exit(1)
    # handlers may return None for success; normalize to exit code 0
    sys.exit(r if r else 0)
8227
8228
# script entry point: run only when executed directly, not when imported
if __name__ == '__main__':
    main()