1 #!/usr/bin/python3
2
3 import asyncio
4 import asyncio.subprocess
5 import argparse
6 import datetime
7 import fcntl
8 import ipaddress
9 import json
10 import logging
11 from logging.config import dictConfig
12 import os
13 import platform
14 import pwd
15 import random
16 import shlex
17 import shutil
18 import socket
19 import string
20 import subprocess
21 import sys
22 import tempfile
23 import time
24 import errno
25 import struct
26 from socketserver import ThreadingMixIn
27 from http.server import BaseHTTPRequestHandler, HTTPServer
28 import signal
29 import io
30 from contextlib import redirect_stdout
31 import ssl
32 from enum import Enum
33
34 from typing import Dict, List, Tuple, Optional, Union, Any, NoReturn, Callable, IO
35
36 import re
37 import uuid
38
39 from configparser import ConfigParser
40 from functools import wraps
41 from glob import glob
42 from io import StringIO
43 from threading import Thread, RLock
44 from urllib.error import HTTPError
45 from urllib.request import urlopen
46 from pathlib import Path
47
48 # Default container images -----------------------------------------------------
49 DEFAULT_IMAGE = 'docker.io/ceph/ceph:v16'
50 DEFAULT_IMAGE_IS_MASTER = False
51 DEFAULT_IMAGE_RELEASE = 'pacific'
52 DEFAULT_PROMETHEUS_IMAGE = 'docker.io/prom/prometheus:v2.18.1'
53 DEFAULT_NODE_EXPORTER_IMAGE = 'docker.io/prom/node-exporter:v0.18.1'
54 DEFAULT_GRAFANA_IMAGE = 'docker.io/ceph/ceph-grafana:6.7.4'
55 DEFAULT_ALERT_MANAGER_IMAGE = 'docker.io/prom/alertmanager:v0.20.0'
56 DEFAULT_REGISTRY = 'docker.io' # normalize unqualified digests to this
57 # ------------------------------------------------------------------------------
58
59 LATEST_STABLE_RELEASE = 'pacific'
60 DATA_DIR = '/var/lib/ceph'
61 LOG_DIR = '/var/log/ceph'
62 LOCK_DIR = '/run/cephadm'
63 LOGROTATE_DIR = '/etc/logrotate.d'
64 UNIT_DIR = '/etc/systemd/system'
65 LOG_DIR_MODE = 0o770
66 DATA_DIR_MODE = 0o700
67 CONTAINER_INIT = True
68 MIN_PODMAN_VERSION = (2, 0, 2)
69 CGROUPS_SPLIT_PODMAN_VERSION = (2, 1, 0)
70 CUSTOM_PS1 = r'[ceph: \u@\h \W]\$ '
71 DEFAULT_TIMEOUT = None # in seconds
72 DEFAULT_RETRY = 15
73 SHELL_DEFAULT_CONF = '/etc/ceph/ceph.conf'
74 SHELL_DEFAULT_KEYRING = '/etc/ceph/ceph.client.admin.keyring'
75 DATEFMT = '%Y-%m-%dT%H:%M:%S.%fZ'
76
77 logger: logging.Logger = None # type: ignore
78
79 """
80 You can invoke cephadm in two ways:
81
82 1. The normal way, at the command line.
83
84 2. By piping the script to the python3 binary. In this latter case, you should
85 prepend one or more lines to the beginning of the script.
86
87 For arguments,
88
89 injected_argv = [...]
90
91 e.g.,
92
93 injected_argv = ['ls']
94
95 For reading stdin from the '--config-json -' argument,
96
97 injected_stdin = '...'
98 """
99 cached_stdin = None
100
101 ##################################
102
103
104 class BaseConfig:
105
106 def __init__(self):
107 self.image: str = ''
108 self.docker: bool = False
109 self.data_dir: str = DATA_DIR
110 self.log_dir: str = LOG_DIR
111 self.logrotate_dir: str = LOGROTATE_DIR
112 self.unit_dir: str = UNIT_DIR
113 self.verbose: bool = False
114 self.timeout: Optional[int] = DEFAULT_TIMEOUT
115 self.retry: int = DEFAULT_RETRY
116 self.env: List[str] = []
117 self.memory_request: Optional[int] = None
118 self.memory_limit: Optional[int] = None
119
120 self.container_init: bool = CONTAINER_INIT
121 self.container_engine: Optional[ContainerEngine] = None
122
123 def set_from_args(self, args: argparse.Namespace):
124 argdict: Dict[str, Any] = vars(args)
125 for k, v in argdict.items():
126 if hasattr(self, k):
127 setattr(self, k, v)
128
129
130 class CephadmContext:
131
132 def __init__(self):
133 self.__dict__['_args'] = None
134 self.__dict__['_conf'] = BaseConfig()
135
136 def set_args(self, args: argparse.Namespace) -> None:
137 self._conf.set_from_args(args)
138 self._args = args
139
140 def has_function(self) -> bool:
141 return 'func' in self._args
142
143 def __contains__(self, name: str) -> bool:
144 return hasattr(self, name)
145
146 def __getattr__(self, name: str) -> Any:
147 if '_conf' in self.__dict__ and hasattr(self._conf, name):
148 return getattr(self._conf, name)
149 elif '_args' in self.__dict__ and hasattr(self._args, name):
150 return getattr(self._args, name)
151 else:
152 return super().__getattribute__(name)
153
154 def __setattr__(self, name: str, value: Any) -> None:
155 if hasattr(self._conf, name):
156 setattr(self._conf, name, value)
157 elif hasattr(self._args, name):
158 setattr(self._args, name, value)
159 else:
160 super().__setattr__(name, value)
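# A minimal sketch of the attribute delegation above (the parser and image
# name are illustrative):
#
#   ctx = CephadmContext()
#   ctx.set_args(parser.parse_args(['--image', 'quay.io/ceph/ceph:v16']))
#   ctx.image  # found on BaseConfig first, then on the argparse namespace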
161
162
163 class ContainerEngine:
164 def __init__(self):
165 self.path = find_program(self.EXE)
166
167 @property
168 def EXE(self) -> str:
169 raise NotImplementedError()
170
171
172 class Podman(ContainerEngine):
173 EXE = 'podman'
174
175 def __init__(self):
176 super().__init__()
177 self._version = None
178
179 @property
180 def version(self):
181 if self._version is None:
182 raise RuntimeError('Please call `get_version` first')
183 return self._version
184
185 def get_version(self, ctx: CephadmContext):
186 out, _, _ = call_throws(ctx, [self.path, 'version', '--format', '{{.Client.Version}}'])
187 self._version = _parse_podman_version(out)
188
189
190 class Docker(ContainerEngine):
191 EXE = 'docker'
192
193
194 CONTAINER_PREFERENCE = (Podman, Docker) # prefer podman to docker
195
196
197 # Log and console output config
198 logging_config = {
199 'version': 1,
200 'disable_existing_loggers': True,
201 'formatters': {
202 'cephadm': {
203 'format': '%(asctime)s %(levelname)s %(message)s'
204 },
205 },
206 'handlers': {
207 'console': {
208 'level': 'INFO',
209 'class': 'logging.StreamHandler',
210 },
211 'log_file': {
212 'level': 'DEBUG',
213 'class': 'logging.handlers.RotatingFileHandler',
214 'formatter': 'cephadm',
215 'filename': '%s/cephadm.log' % LOG_DIR,
216 'maxBytes': 1024000,
217 'backupCount': 1,
218 }
219 },
220 'loggers': {
221 '': {
222 'level': 'DEBUG',
223 'handlers': ['console', 'log_file'],
224 }
225 }
226 }
227
228
229 class termcolor:
230 yellow = '\033[93m'
231 red = '\033[31m'
232 end = '\033[0m'
233
234
235 class Error(Exception):
236 pass
237
238
239 class TimeoutExpired(Error):
240 pass
241
242 ##################################
243
244
245 class Ceph(object):
246 daemons = ('mon', 'mgr', 'mds', 'osd', 'rgw', 'rbd-mirror',
247 'crash', 'cephfs-mirror')
248
249 ##################################
250
251
252 class Monitoring(object):
253 """Define the configs for the monitoring containers"""
254
255 port_map = {
256 'prometheus': [9095], # Avoid the default 9090, which conflicts with the Cockpit UI
257 'node-exporter': [9100],
258 'grafana': [3000],
259 'alertmanager': [9093, 9094],
260 }
261
262 components = {
263 'prometheus': {
264 'image': DEFAULT_PROMETHEUS_IMAGE,
265 'cpus': '2',
266 'memory': '4GB',
267 'args': [
268 '--config.file=/etc/prometheus/prometheus.yml',
269 '--storage.tsdb.path=/prometheus',
270 '--web.listen-address=:{}'.format(port_map['prometheus'][0]),
271 ],
272 'config-json-files': [
273 'prometheus.yml',
274 ],
275 },
276 'node-exporter': {
277 'image': DEFAULT_NODE_EXPORTER_IMAGE,
278 'cpus': '1',
279 'memory': '1GB',
280 'args': [
281 '--no-collector.timex',
282 ],
283 },
284 'grafana': {
285 'image': DEFAULT_GRAFANA_IMAGE,
286 'cpus': '2',
287 'memory': '4GB',
288 'args': [],
289 'config-json-files': [
290 'grafana.ini',
291 'provisioning/datasources/ceph-dashboard.yml',
292 'certs/cert_file',
293 'certs/cert_key',
294 ],
295 },
296 'alertmanager': {
297 'image': DEFAULT_ALERT_MANAGER_IMAGE,
298 'cpus': '2',
299 'memory': '2GB',
300 'args': [
301 '--web.listen-address=:{}'.format(port_map['alertmanager'][0]),
302 '--cluster.listen-address=:{}'.format(port_map['alertmanager'][1]),
303 ],
304 'config-json-files': [
305 'alertmanager.yml',
306 ],
307 'config-json-args': [
308 'peers',
309 ],
310 },
311 } # type: ignore
312
313 @staticmethod
314 def get_version(ctx, container_id, daemon_type):
315 # type: (CephadmContext, str, str) -> str
316 """
317 :param daemon_type: Either "prometheus", "alertmanager" or "node-exporter"
318 """
319 assert daemon_type in ('prometheus', 'alertmanager', 'node-exporter')
320 cmd = daemon_type.replace('-', '_')
321 code = -1
322 err = ''
323 version = ''
324 if daemon_type == 'alertmanager':
325 for cmd in ['alertmanager', 'prometheus-alertmanager']:
326 _, err, code = call(ctx, [
327 ctx.container_engine.path, 'exec', container_id, cmd,
328 '--version'
329 ], verbosity=CallVerbosity.DEBUG)
330 if code == 0:
331 break
332 cmd = 'alertmanager' # reset cmd for version extraction
333 else:
334 _, err, code = call(ctx, [
335 ctx.container_engine.path, 'exec', container_id, cmd, '--version'
336 ], verbosity=CallVerbosity.DEBUG)
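# These daemons print their version banner to stderr, in a form like
# (illustrative): 'prometheus, version 2.18.1 (branch: HEAD, ...)',
# which is why `err` rather than stdout is parsed below.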
337 if code == 0 and \
338 err.startswith('%s, version ' % cmd):
339 version = err.split(' ')[2]
340 return version
341
342 ##################################
343
344
345 def populate_files(config_dir, config_files, uid, gid):
346 # type: (str, Dict, int, int) -> None
347 """create config files for different services"""
348 for fname in config_files:
349 config_file = os.path.join(config_dir, fname)
350 config_content = dict_get_join(config_files, fname)
351 logger.info('Write file: %s' % (config_file))
352 with open(config_file, 'w') as f:
353 os.fchown(f.fileno(), uid, gid)
354 os.fchmod(f.fileno(), 0o600)
355 f.write(config_content)
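# A minimal usage sketch (all values illustrative):
#
#   populate_files('/var/lib/ceph/<fsid>/nfs.a/etc/ganesha',
#                  {'ganesha.conf': 'EXPORT { ... }'}, 167, 167)
#
# writes .../etc/ganesha/ganesha.conf with mode 0600, owned by 167:167.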
356
357
358 class NFSGanesha(object):
359 """Defines a NFS-Ganesha container"""
360
361 daemon_type = 'nfs'
362 entrypoint = '/usr/bin/ganesha.nfsd'
363 daemon_args = ['-F', '-L', 'STDERR']
364
365 required_files = ['ganesha.conf']
366
367 port_map = {
368 'nfs': 2049,
369 }
370
371 def __init__(self,
372 ctx,
373 fsid,
374 daemon_id,
375 config_json,
376 image=DEFAULT_IMAGE):
377 # type: (CephadmContext, str, Union[int, str], Dict, str) -> None
378 self.ctx = ctx
379 self.fsid = fsid
380 self.daemon_id = daemon_id
381 self.image = image
382
383 # config-json options
384 self.pool = dict_get(config_json, 'pool', require=True)
385 self.namespace = dict_get(config_json, 'namespace')
386 self.userid = dict_get(config_json, 'userid')
387 self.extra_args = dict_get(config_json, 'extra_args', [])
388 self.files = dict_get(config_json, 'files', {})
389 self.rgw = dict_get(config_json, 'rgw', {})
390
391 # validate the supplied args
392 self.validate()
393
394 @classmethod
395 def init(cls, ctx, fsid, daemon_id):
396 # type: (CephadmContext, str, Union[int, str]) -> NFSGanesha
397 return cls(ctx, fsid, daemon_id, get_parm(ctx.config_json), ctx.image)
398
399 def get_container_mounts(self, data_dir):
400 # type: (str) -> Dict[str, str]
401 mounts = dict()
402 mounts[os.path.join(data_dir, 'config')] = '/etc/ceph/ceph.conf:z'
403 mounts[os.path.join(data_dir, 'keyring')] = '/etc/ceph/keyring:z'
404 mounts[os.path.join(data_dir, 'etc/ganesha')] = '/etc/ganesha:z'
405 if self.rgw:
406 cluster = self.rgw.get('cluster', 'ceph')
407 rgw_user = self.rgw.get('user', 'admin')
408 mounts[os.path.join(data_dir, 'keyring.rgw')] = \
409 '/var/lib/ceph/radosgw/%s-%s/keyring:z' % (cluster, rgw_user)
410 return mounts
411
412 @staticmethod
413 def get_container_envs():
414 # type: () -> List[str]
415 envs = [
416 'CEPH_CONF=%s' % ('/etc/ceph/ceph.conf')
417 ]
418 return envs
419
420 @staticmethod
421 def get_version(ctx, container_id):
422 # type: (CephadmContext, str) -> Optional[str]
423 version = None
424 out, err, code = call(ctx,
425 [ctx.container_engine.path, 'exec', container_id,
426 NFSGanesha.entrypoint, '-v'],
427 verbosity=CallVerbosity.DEBUG)
428 if code == 0:
429 match = re.search(r'NFS-Ganesha Release\s*=\s*[V]*([\d.]+)', out)
430 if match:
431 version = match.group(1)
432 return version
433
434 def validate(self):
435 # type: () -> None
436 if not is_fsid(self.fsid):
437 raise Error('not an fsid: %s' % self.fsid)
438 if not self.daemon_id:
439 raise Error('invalid daemon_id: %s' % self.daemon_id)
440 if not self.image:
441 raise Error('invalid image: %s' % self.image)
442
443 # check for the required files
444 if self.required_files:
445 for fname in self.required_files:
446 if fname not in self.files:
447 raise Error('required file missing from config-json: %s' % fname)
448
449 # check for an RGW config
450 if self.rgw:
451 if not self.rgw.get('keyring'):
452 raise Error('RGW keyring is missing')
453 if not self.rgw.get('user'):
454 raise Error('RGW user is missing')
455
456 def get_daemon_name(self):
457 # type: () -> str
458 return '%s.%s' % (self.daemon_type, self.daemon_id)
459
460 def get_container_name(self, desc=None):
461 # type: (Optional[str]) -> str
462 cname = 'ceph-%s-%s' % (self.fsid, self.get_daemon_name())
463 if desc:
464 cname = '%s-%s' % (cname, desc)
465 return cname
466
467 def get_daemon_args(self):
468 # type: () -> List[str]
469 return self.daemon_args + self.extra_args
470
471 def create_daemon_dirs(self, data_dir, uid, gid):
472 # type: (str, int, int) -> None
473 """Create files under the container data dir"""
474 if not os.path.isdir(data_dir):
475 raise OSError('data_dir is not a directory: %s' % (data_dir))
476
477 logger.info('Creating ganesha config...')
478
479 # create the ganesha conf dir
480 config_dir = os.path.join(data_dir, 'etc/ganesha')
481 makedirs(config_dir, uid, gid, 0o755)
482
483 # populate files from the config-json
484 populate_files(config_dir, self.files, uid, gid)
485
486 # write the RGW keyring
487 if self.rgw:
488 keyring_path = os.path.join(data_dir, 'keyring.rgw')
489 with open(keyring_path, 'w') as f:
490 os.fchmod(f.fileno(), 0o600)
491 os.fchown(f.fileno(), uid, gid)
492 f.write(self.rgw.get('keyring', ''))
493
494 def get_rados_grace_container(self, action):
495 # type: (str) -> CephContainer
496 """Container for a ganesha action on the grace db"""
497 entrypoint = '/usr/bin/ganesha-rados-grace'
498
499 assert self.pool
500 args = ['--pool', self.pool]
501 if self.namespace:
502 args += ['--ns', self.namespace]
503 if self.userid:
504 args += ['--userid', self.userid]
505 args += [action, self.get_daemon_name()]
506
507 data_dir = get_data_dir(self.fsid, self.ctx.data_dir,
508 self.daemon_type, self.daemon_id)
509 volume_mounts = self.get_container_mounts(data_dir)
510 envs = self.get_container_envs()
511
512 logger.info('Creating RADOS grace for action: %s' % action)
513 c = CephContainer(
514 self.ctx,
515 image=self.image,
516 entrypoint=entrypoint,
517 args=args,
518 volume_mounts=volume_mounts,
519 cname=self.get_container_name(desc='grace-%s' % action),
520 envs=envs
521 )
522 return c
523
524 ##################################
525
526
527 class CephIscsi(object):
528 """Defines a Ceph-Iscsi container"""
529
530 daemon_type = 'iscsi'
531 entrypoint = '/usr/bin/rbd-target-api'
532
533 required_files = ['iscsi-gateway.cfg']
534
535 def __init__(self,
536 ctx,
537 fsid,
538 daemon_id,
539 config_json,
540 image=DEFAULT_IMAGE):
541 # type: (CephadmContext, str, Union[int, str], Dict, str) -> None
542 self.ctx = ctx
543 self.fsid = fsid
544 self.daemon_id = daemon_id
545 self.image = image
546
547 # config-json options
548 self.files = dict_get(config_json, 'files', {})
549
550 # validate the supplied args
551 self.validate()
552
553 @classmethod
554 def init(cls, ctx, fsid, daemon_id):
555 # type: (CephadmContext, str, Union[int, str]) -> CephIscsi
556 return cls(ctx, fsid, daemon_id,
557 get_parm(ctx.config_json), ctx.image)
558
559 @staticmethod
560 def get_container_mounts(data_dir, log_dir):
561 # type: (str, str) -> Dict[str, str]
562 mounts = dict()
563 mounts[os.path.join(data_dir, 'config')] = '/etc/ceph/ceph.conf:z'
564 mounts[os.path.join(data_dir, 'keyring')] = '/etc/ceph/keyring:z'
565 mounts[os.path.join(data_dir, 'iscsi-gateway.cfg')] = '/etc/ceph/iscsi-gateway.cfg:z'
566 mounts[os.path.join(data_dir, 'configfs')] = '/sys/kernel/config'
567 mounts[log_dir] = '/var/log/rbd-target-api:z'
568 mounts['/dev'] = '/dev'
569 return mounts
570
571 @staticmethod
572 def get_container_binds():
573 # type: () -> List[List[str]]
574 binds = []
575 lib_modules = ['type=bind',
576 'source=/lib/modules',
577 'destination=/lib/modules',
578 'ro=true']
579 binds.append(lib_modules)
580 return binds
581
582 @staticmethod
583 def get_version(ctx, container_id):
584 # type: (CephadmContext, str) -> Optional[str]
585 version = None
586 out, err, code = call(ctx,
587 [ctx.container_engine.path, 'exec', container_id,
588 '/usr/bin/python3', '-c', "import pkg_resources; print(pkg_resources.require('ceph_iscsi')[0].version)"],
589 verbosity=CallVerbosity.DEBUG)
590 if code == 0:
591 version = out.strip()
592 return version
593
594 def validate(self):
595 # type: () -> None
596 if not is_fsid(self.fsid):
597 raise Error('not an fsid: %s' % self.fsid)
598 if not self.daemon_id:
599 raise Error('invalid daemon_id: %s' % self.daemon_id)
600 if not self.image:
601 raise Error('invalid image: %s' % self.image)
602
603 # check for the required files
604 if self.required_files:
605 for fname in self.required_files:
606 if fname not in self.files:
607 raise Error('required file missing from config-json: %s' % fname)
608
609 def get_daemon_name(self):
610 # type: () -> str
611 return '%s.%s' % (self.daemon_type, self.daemon_id)
612
613 def get_container_name(self, desc=None):
614 # type: (Optional[str]) -> str
615 cname = 'ceph-%s-%s' % (self.fsid, self.get_daemon_name())
616 if desc:
617 cname = '%s-%s' % (cname, desc)
618 return cname
619
620 def create_daemon_dirs(self, data_dir, uid, gid):
621 # type: (str, int, int) -> None
622 """Create files under the container data dir"""
623 if not os.path.isdir(data_dir):
624 raise OSError('data_dir is not a directory: %s' % (data_dir))
625
626 logger.info('Creating ceph-iscsi config...')
627 configfs_dir = os.path.join(data_dir, 'configfs')
628 makedirs(configfs_dir, uid, gid, 0o755)
629
630 # populate files from the config-json
631 populate_files(data_dir, self.files, uid, gid)
632
633 @staticmethod
634 def configfs_mount_umount(data_dir, mount=True):
635 # type: (str, bool) -> List[str]
636 mount_path = os.path.join(data_dir, 'configfs')
637 if mount:
638 cmd = 'if ! grep -qs {0} /proc/mounts; then ' \
639 'mount -t configfs none {0}; fi'.format(mount_path)
640 else:
641 cmd = 'if grep -qs {0} /proc/mounts; then ' \
642 'umount {0}; fi'.format(mount_path)
643 return cmd.split()
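# e.g. (path illustrative) configfs_mount_umount('/var/lib/ceph/<fsid>/iscsi.a')
# returns the tokenized shell snippet
#   if ! grep -qs <dir>/configfs /proc/mounts; then mount -t configfs none <dir>/configfs; fi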
644
645 def get_tcmu_runner_container(self):
646 # type: () -> CephContainer
647 tcmu_container = get_container(self.ctx, self.fsid, self.daemon_type, self.daemon_id)
648 tcmu_container.entrypoint = '/usr/bin/tcmu-runner'
649 tcmu_container.cname = self.get_container_name(desc='tcmu')
650 # remove extra container args for tcmu container.
651 # extra args could cause issue with forking service type
652 tcmu_container.container_args = []
653 return tcmu_container
654
655 ##################################
656
657
658 class HAproxy(object):
659 """Defines an HAproxy container"""
660 daemon_type = 'haproxy'
661 required_files = ['haproxy.cfg']
662 default_image = 'haproxy'
663
664 def __init__(self,
665 ctx: CephadmContext,
666 fsid: str, daemon_id: Union[int, str],
667 config_json: Dict, image: str) -> None:
668 self.ctx = ctx
669 self.fsid = fsid
670 self.daemon_id = daemon_id
671 self.image = image
672
673 # config-json options
674 self.files = dict_get(config_json, 'files', {})
675
676 self.validate()
677
678 @classmethod
679 def init(cls, ctx: CephadmContext,
680 fsid: str, daemon_id: Union[int, str]) -> 'HAproxy':
681 return cls(ctx, fsid, daemon_id, get_parm(ctx.config_json),
682 ctx.image)
683
684 def create_daemon_dirs(self, data_dir: str, uid: int, gid: int) -> None:
685 """Create files under the container data dir"""
686 if not os.path.isdir(data_dir):
687 raise OSError('data_dir is not a directory: %s' % (data_dir))
688
689 # create additional directories in data dir for HAproxy to use
690 if not os.path.isdir(os.path.join(data_dir, 'haproxy')):
691 makedirs(os.path.join(data_dir, 'haproxy'), uid, gid, DATA_DIR_MODE)
692
693 data_dir = os.path.join(data_dir, 'haproxy')
694 populate_files(data_dir, self.files, uid, gid)
695
696 def get_daemon_args(self) -> List[str]:
697 return ['haproxy', '-f', '/var/lib/haproxy/haproxy.cfg']
698
699 def validate(self):
700 # type: () -> None
701 if not is_fsid(self.fsid):
702 raise Error('not an fsid: %s' % self.fsid)
703 if not self.daemon_id:
704 raise Error('invalid daemon_id: %s' % self.daemon_id)
705 if not self.image:
706 raise Error('invalid image: %s' % self.image)
707
708 # check for the required files
709 if self.required_files:
710 for fname in self.required_files:
711 if fname not in self.files:
712 raise Error('required file missing from config-json: %s' % fname)
713
714 def get_daemon_name(self):
715 # type: () -> str
716 return '%s.%s' % (self.daemon_type, self.daemon_id)
717
718 def get_container_name(self, desc=None):
719 # type: (Optional[str]) -> str
720 cname = 'ceph-%s-%s' % (self.fsid, self.get_daemon_name())
721 if desc:
722 cname = '%s-%s' % (cname, desc)
723 return cname
724
725 def extract_uid_gid_haproxy(self):
726 # better directory for this?
727 return extract_uid_gid(self.ctx, file_path='/var/lib')
728
729 @staticmethod
730 def get_container_mounts(data_dir: str) -> Dict[str, str]:
731 mounts = dict()
732 mounts[os.path.join(data_dir, 'haproxy')] = '/var/lib/haproxy'
733 return mounts
734
735 ##################################
736
737
738 class Keepalived(object):
739 """Defines an Keepalived container"""
740 daemon_type = 'keepalived'
741 required_files = ['keepalived.conf']
742 default_image = 'arcts/keepalived'
743
744 def __init__(self,
745 ctx: CephadmContext,
746 fsid: str, daemon_id: Union[int, str],
747 config_json: Dict, image: str) -> None:
748 self.ctx = ctx
749 self.fsid = fsid
750 self.daemon_id = daemon_id
751 self.image = image
752
753 # config-json options
754 self.files = dict_get(config_json, 'files', {})
755
756 self.validate()
757
758 @classmethod
759 def init(cls, ctx: CephadmContext, fsid: str,
760 daemon_id: Union[int, str]) -> 'Keepalived':
761 return cls(ctx, fsid, daemon_id,
762 get_parm(ctx.config_json), ctx.image)
763
764 def create_daemon_dirs(self, data_dir: str, uid: int, gid: int) -> None:
765 """Create files under the container data dir"""
766 if not os.path.isdir(data_dir):
767 raise OSError('data_dir is not a directory: %s' % (data_dir))
768
769 # create additional directories in data dir for keepalived to use
770 if not os.path.isdir(os.path.join(data_dir, 'keepalived')):
771 makedirs(os.path.join(data_dir, 'keepalived'), uid, gid, DATA_DIR_MODE)
772
773 # populate files from the config-json
774 populate_files(data_dir, self.files, uid, gid)
775
776 def validate(self):
777 # type: () -> None
778 if not is_fsid(self.fsid):
779 raise Error('not an fsid: %s' % self.fsid)
780 if not self.daemon_id:
781 raise Error('invalid daemon_id: %s' % self.daemon_id)
782 if not self.image:
783 raise Error('invalid image: %s' % self.image)
784
785 # check for the required files
786 if self.required_files:
787 for fname in self.required_files:
788 if fname not in self.files:
789 raise Error('required file missing from config-json: %s' % fname)
790
791 def get_daemon_name(self):
792 # type: () -> str
793 return '%s.%s' % (self.daemon_type, self.daemon_id)
794
795 def get_container_name(self, desc=None):
796 # type: (Optional[str]) -> str
797 cname = 'ceph-%s-%s' % (self.fsid, self.get_daemon_name())
798 if desc:
799 cname = '%s-%s' % (cname, desc)
800 return cname
801
802 @staticmethod
803 def get_container_envs():
804 # type: () -> List[str]
805 envs = [
806 'KEEPALIVED_AUTOCONF=false',
807 'KEEPALIVED_CONF=/etc/keepalived/keepalived.conf',
808 'KEEPALIVED_CMD=/usr/sbin/keepalived -n -l -f /etc/keepalived/keepalived.conf',
809 'KEEPALIVED_DEBUG=false'
810 ]
811 return envs
812
813 @staticmethod
814 def get_prestart():
815 return (
816 '# keepalived needs IP forwarding and non-local bind\n'
817 'sysctl net.ipv4.ip_forward=1\n'
818 'sysctl net.ipv4.ip_nonlocal_bind=1\n'
819 )
820
821 def extract_uid_gid_keepalived(self):
822 # better directory for this?
823 return extract_uid_gid(self.ctx, file_path='/var/lib')
824
825 @staticmethod
826 def get_container_mounts(data_dir: str) -> Dict[str, str]:
827 mounts = dict()
828 mounts[os.path.join(data_dir, 'keepalived.conf')] = '/etc/keepalived/keepalived.conf'
829 return mounts
830
831 ##################################
832
833
834 class CustomContainer(object):
835 """Defines a custom container"""
836 daemon_type = 'container'
837
838 def __init__(self,
839 fsid: str, daemon_id: Union[int, str],
840 config_json: Dict, image: str) -> None:
841 self.fsid = fsid
842 self.daemon_id = daemon_id
843 self.image = image
844
845 # config-json options
846 self.entrypoint = dict_get(config_json, 'entrypoint')
847 self.uid = dict_get(config_json, 'uid', 65534) # nobody
848 self.gid = dict_get(config_json, 'gid', 65534) # nobody
849 self.volume_mounts = dict_get(config_json, 'volume_mounts', {})
850 self.args = dict_get(config_json, 'args', [])
851 self.envs = dict_get(config_json, 'envs', [])
852 self.privileged = dict_get(config_json, 'privileged', False)
853 self.bind_mounts = dict_get(config_json, 'bind_mounts', [])
854 self.ports = dict_get(config_json, 'ports', [])
855 self.dirs = dict_get(config_json, 'dirs', [])
856 self.files = dict_get(config_json, 'files', {})
857
858 @classmethod
859 def init(cls, ctx: CephadmContext,
860 fsid: str, daemon_id: Union[int, str]) -> 'CustomContainer':
861 return cls(fsid, daemon_id,
862 get_parm(ctx.config_json), ctx.image)
863
864 def create_daemon_dirs(self, data_dir: str, uid: int, gid: int) -> None:
865 """
866 Create dirs/files below the container data directory.
867 """
868 logger.info('Creating custom container configuration '
869 'dirs/files in {} ...'.format(data_dir))
870
871 if not os.path.isdir(data_dir):
872 raise OSError('data_dir is not a directory: %s' % data_dir)
873
874 for dir_path in self.dirs:
875 logger.info('Creating directory: {}'.format(dir_path))
876 dir_path = os.path.join(data_dir, dir_path.strip('/'))
877 makedirs(dir_path, uid, gid, 0o755)
878
879 for file_path in self.files:
880 logger.info('Creating file: {}'.format(file_path))
881 content = dict_get_join(self.files, file_path)
882 file_path = os.path.join(data_dir, file_path.strip('/'))
883 with open(file_path, 'w', encoding='utf-8') as f:
884 os.fchown(f.fileno(), uid, gid)
885 os.fchmod(f.fileno(), 0o600)
886 f.write(content)
887
888 def get_daemon_args(self) -> List[str]:
889 return []
890
891 def get_container_args(self) -> List[str]:
892 return self.args
893
894 def get_container_envs(self) -> List[str]:
895 return self.envs
896
897 def get_container_mounts(self, data_dir: str) -> Dict[str, str]:
898 """
899 Get the volume mounts. Relative source paths will be located below
900 `/var/lib/ceph/<cluster-fsid>/<daemon-name>`.
901
902 Example:
903 {
904 /foo/conf: /conf
905 foo/conf: /conf
906 }
907 becomes
908 {
909 /foo/conf: /conf
910 /var/lib/ceph/<cluster-fsid>/<daemon-name>/foo/conf: /conf
911 }
912 """
913 mounts = {}
914 for source, destination in self.volume_mounts.items():
915 source = os.path.join(data_dir, source)
916 mounts[source] = destination
917 return mounts
918
919 def get_container_binds(self, data_dir: str) -> List[List[str]]:
920 """
921 Get the bind mounts. Relative `source=...` paths will be located below
922 `/var/lib/ceph/<cluster-fsid>/<daemon-name>`.
923
924 Example:
925 [
926 'type=bind',
927 'source=lib/modules',
928 'destination=/lib/modules',
929 'ro=true'
930 ]
931 becomes
932 [
933 ...
934 'source=/var/lib/ceph/<cluster-fsid>/<daemon-name>/lib/modules',
935 ...
936 ]
937 """
938 binds = self.bind_mounts.copy()
939 for bind in binds:
940 for index, value in enumerate(bind):
941 match = re.match(r'^source=(.+)$', value)
942 if match:
943 bind[index] = 'source={}'.format(os.path.join(
944 data_dir, match.group(1)))
945 return binds
946
947 ##################################
948
949
950 def touch(file_path: str, uid: Optional[int] = None, gid: Optional[int] = None) -> None:
951 Path(file_path).touch()
952 if uid is not None and gid is not None: # 0 is a valid uid/gid
953 os.chown(file_path, uid, gid)
954
955
956 ##################################
957
958
959 def dict_get(d: Dict, key: str, default: Any = None, require: bool = False) -> Any:
960 """
961 Helper function to get a key from a dictionary.
962 :param d: The dictionary to process.
963 :param key: The name of the key to get.
964 :param default: The default value in case the key does not
965 exist. Default is `None`.
966 :param require: Set to `True` if the key is required. An
967 exception will be raised if the key does not exist in
968 the given dictionary.
969 :return: Returns the value of the given key.
970 :raises: :exc:`Error` if the given key does not exist
971 and `require` is set to `True`.
972 """
973 if require and key not in d.keys():
974 raise Error('{} missing from dict'.format(key))
975 return d.get(key, default) # type: ignore
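# Usage sketch (values illustrative):
#
#   dict_get({'pool': 'nfs-ganesha'}, 'pool', require=True)  # -> 'nfs-ganesha'
#   dict_get({'pool': 'nfs-ganesha'}, 'namespace')           # -> None
#   dict_get({}, 'pool', require=True)                       # raises Error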
976
977 ##################################
978
979
980 def dict_get_join(d: Dict, key: str) -> Any:
981 """
982 Helper function to get the value of a given key from a dictionary.
983 `List` values will be converted to a string by joining them with a
984 line break.
985 :param d: The dictionary to process.
986 :param key: The name of the key to get.
987 :return: Returns the value of the given key. If it is a `list`, its
988 items will be joined with a line break.
989 """
990 value = d.get(key)
991 if isinstance(value, list):
992 value = '\n'.join(map(str, value))
993 return value
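# e.g. dict_get_join({'peers': ['10.0.0.1', '10.0.0.2']}, 'peers') returns
# '10.0.0.1\n10.0.0.2'; scalar values pass through unchanged.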
994
995 ##################################
996
997
998 def get_supported_daemons():
999 # type: () -> List[str]
1000 supported_daemons = list(Ceph.daemons)
1001 supported_daemons.extend(Monitoring.components)
1002 supported_daemons.append(NFSGanesha.daemon_type)
1003 supported_daemons.append(CephIscsi.daemon_type)
1004 supported_daemons.append(CustomContainer.daemon_type)
1005 supported_daemons.append(CephadmDaemon.daemon_type)
1006 supported_daemons.append(HAproxy.daemon_type)
1007 supported_daemons.append(Keepalived.daemon_type)
1008 assert len(supported_daemons) == len(set(supported_daemons))
1009 return supported_daemons
1010
1011 ##################################
1012
1013
1014 class PortOccupiedError(Error):
1015 pass
1016
1017
1018 def attempt_bind(ctx, s, address, port):
1019 # type: (CephadmContext, socket.socket, str, int) -> None
1020 try:
1021 s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
1022 s.bind((address, port))
1023 except (socket.error, OSError) as e: # py2 and py3
1024 if e.errno == errno.EADDRINUSE:
1025 msg = 'Cannot bind to IP %s port %d: %s' % (address, port, e)
1026 logger.warning(msg)
1027 raise PortOccupiedError(msg)
1028 else:
1029 raise e
1030 finally:
1031 s.close()
1032
1033
1034 def port_in_use(ctx, port_num):
1035 # type: (CephadmContext, int) -> bool
1036 """Detect whether a port is in use on the local machine - IPv4 and IPv6"""
1037 logger.info('Verifying port %d ...' % port_num)
1038
1039 def _port_in_use(af: socket.AddressFamily, address: str) -> bool:
1040 try:
1041 s = socket.socket(af, socket.SOCK_STREAM)
1042 attempt_bind(ctx, s, address, port_num)
1043 except PortOccupiedError:
1044 return True
1045 except OSError as e:
1046 if e.errno in (errno.EAFNOSUPPORT, errno.EADDRNOTAVAIL):
1047 # Ignore EAFNOSUPPORT and EADDRNOTAVAIL as two interfaces are
1048 # being tested here and one might intentionally be disabled.
1049 # In that case no error should be raised.
1050 return False
1051 else:
1052 raise e
1053 return False
1054 return any(_port_in_use(af, address) for af, address in (
1055 (socket.AF_INET, '0.0.0.0'),
1056 (socket.AF_INET6, '::')
1057 ))
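# e.g. port_in_use(ctx, 9100) is True if something is already bound to
# port 9100 on either 0.0.0.0 (IPv4) or :: (IPv6).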
1058
1059
1060 def check_ip_port(ctx, ip, port):
1061 # type: (CephadmContext, str, int) -> None
1062 if not ctx.skip_ping_check:
1063 logger.info('Verifying IP %s port %d ...' % (ip, port))
1064 if is_ipv6(ip):
1065 s = socket.socket(socket.AF_INET6, socket.SOCK_STREAM)
1066 ip = unwrap_ipv6(ip)
1067 else:
1068 s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
1069 attempt_bind(ctx, s, ip, port)
1070
1071 ##################################
1072
1073
1074 # this is an abbreviated version of
1075 # https://github.com/benediktschmitt/py-filelock/blob/master/filelock.py
1076 # that drops all of the compatibility (this is Unix/Linux only).
1077
1078 class Timeout(TimeoutError):
1079 """
1080 Raised when the lock could not be acquired in *timeout*
1081 seconds.
1082 """
1083
1084 def __init__(self, lock_file):
1085 """
1086 """
1087 #: The path of the file lock.
1088 self.lock_file = lock_file
1089 return None
1090
1091 def __str__(self):
1092 temp = "The file lock '{}' could not be acquired."\
1093 .format(self.lock_file)
1094 return temp
1095
1096
1097 class _Acquire_ReturnProxy(object):
1098 def __init__(self, lock):
1099 self.lock = lock
1100 return None
1101
1102 def __enter__(self):
1103 return self.lock
1104
1105 def __exit__(self, exc_type, exc_value, traceback):
1106 self.lock.release()
1107 return None
1108
1109
1110 class FileLock(object):
1111 def __init__(self, ctx: CephadmContext, name, timeout=-1):
1112 if not os.path.exists(LOCK_DIR):
1113 os.mkdir(LOCK_DIR, 0o700)
1114 self._lock_file = os.path.join(LOCK_DIR, name + '.lock')
1115 self.ctx = ctx
1116
1117 # The file descriptor for the *_lock_file* as it is returned by the
1118 # os.open() function.
1119 # This file descriptor is not None only while the object currently
1120 # holds the lock.
1121 self._lock_file_fd: Optional[int] = None
1122 self.timeout = timeout
1123 # The lock counter is used for implementing the nested locking
1124 # mechanism. Whenever the lock is acquired, the counter is increased and
1125 # the lock is only released when this value reaches 0 again.
1126 self._lock_counter = 0
1127 return None
1128
1129 @property
1130 def is_locked(self):
1131 return self._lock_file_fd is not None
1132
1133 def acquire(self, timeout=None, poll_intervall=0.05):
1134 """
1135 Acquires the file lock or fails with a :exc:`Timeout` error.
1136 .. code-block:: python
1137 # You can use this method in the context manager (recommended)
1138 with lock.acquire():
1139 pass
1140 # Or use an equivalent try-finally construct:
1141 lock.acquire()
1142 try:
1143 pass
1144 finally:
1145 lock.release()
1146 :arg float timeout:
1147 The maximum time waited for the file lock.
1148 If ``timeout < 0``, there is no timeout and this method will
1149 block until the lock could be acquired.
1150 If ``timeout`` is None, the default :attr:`~timeout` is used.
1151 :arg float poll_intervall:
1152 We check once in *poll_intervall* seconds if we can acquire the
1153 file lock.
1154 :raises Timeout:
1155 if the lock could not be acquired in *timeout* seconds.
1156 .. versionchanged:: 2.0.0
1157 This method now returns a *proxy* object instead of *self*,
1158 so that it can be used in a with statement without side effects.
1159 """
1160
1161 # Use the default timeout, if no timeout is provided.
1162 if timeout is None:
1163 timeout = self.timeout
1164
1165 # Increment the number right at the beginning.
1166 # We can still undo it, if something fails.
1167 self._lock_counter += 1
1168
1169 lock_id = id(self)
1170 lock_filename = self._lock_file
1171 start_time = time.time()
1172 try:
1173 while True:
1174 if not self.is_locked:
1175 logger.debug('Acquiring lock %s on %s', lock_id,
1176 lock_filename)
1177 self._acquire()
1178
1179 if self.is_locked:
1180 logger.debug('Lock %s acquired on %s', lock_id,
1181 lock_filename)
1182 break
1183 elif timeout >= 0 and time.time() - start_time > timeout:
1184 logger.warning('Timeout acquiring lock %s on %s', lock_id,
1185 lock_filename)
1186 raise Timeout(self._lock_file)
1187 else:
1188 logger.debug(
1189 'Lock %s not acquired on %s, waiting %s seconds ...',
1190 lock_id, lock_filename, poll_intervall
1191 )
1192 time.sleep(poll_intervall)
1193 except: # noqa
1194 # Something did go wrong, so decrement the counter.
1195 self._lock_counter = max(0, self._lock_counter - 1)
1196
1197 raise
1198 return _Acquire_ReturnProxy(lock=self)
1199
1200 def release(self, force=False):
1201 """
1202 Releases the file lock.
1203 Note that the lock is only completely released when the lock
1204 counter reaches 0.
1205 Also note that the lock file itself is not automatically deleted.
1206 :arg bool force:
1207 If true, the lock counter is ignored and the lock is released in
1208 every case.
1209 """
1210 if self.is_locked:
1211 self._lock_counter -= 1
1212
1213 if self._lock_counter == 0 or force:
1214 lock_id = id(self)
1215 lock_filename = self._lock_file
1216
1217 logger.debug('Releasing lock %s on %s', lock_id, lock_filename)
1218 self._release()
1219 self._lock_counter = 0
1220 logger.debug('Lock %s released on %s', lock_id, lock_filename)
1221
1222 return None
1223
1224 def __enter__(self):
1225 self.acquire()
1226 return self
1227
1228 def __exit__(self, exc_type, exc_value, traceback):
1229 self.release()
1230 return None
1231
1232 def __del__(self):
1233 self.release(force=True)
1234 return None
1235
1236 def _acquire(self):
1237 open_mode = os.O_RDWR | os.O_CREAT | os.O_TRUNC
1238 fd = os.open(self._lock_file, open_mode)
1239
1240 try:
1241 fcntl.flock(fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
1242 except (IOError, OSError):
1243 os.close(fd)
1244 else:
1245 self._lock_file_fd = fd
1246 return None
1247
1248 def _release(self):
1249 # Do not remove the lockfile:
1250 #
1251 # https://github.com/benediktschmitt/py-filelock/issues/31
1252 # https://stackoverflow.com/questions/17708885/flock-removing-locked-file-without-race-condition
1253 fd = self._lock_file_fd
1254 self._lock_file_fd = None
1255 fcntl.flock(fd, fcntl.LOCK_UN) # type: ignore
1256 os.close(fd) # type: ignore
1257 return None
1258
1259
1260 ##################################
1261 # Popen wrappers, lifted from ceph-volume
1262
1263 class CallVerbosity(Enum):
1264 SILENT = 0
1265 # log stdout/stderr to logger.debug
1266 DEBUG = 1
1267 # On a non-zero exit status, it will forcefully set
1268 # logging ON for the terminal
1269 VERBOSE_ON_FAILURE = 2
1270 # log at info (instead of debug) level.
1271 VERBOSE = 3
1272
1273
1274 if sys.version_info < (3, 8):
1275 import itertools
1276 import threading
1277 import warnings
1278 from asyncio import events
1279
1280 class ThreadedChildWatcher(asyncio.AbstractChildWatcher):
1281 """Threaded child watcher implementation.
1282 The watcher uses a thread per process
1283 to wait for the process to finish.
1284 It doesn't require a subscription to POSIX signals,
1285 but thread creation is not free.
1286 The watcher has O(1) complexity; its performance doesn't depend
1287 on the number of spawned processes.
1288 """
1289
1290 def __init__(self):
1291 self._pid_counter = itertools.count(0)
1292 self._threads = {}
1293
1294 def is_active(self):
1295 return True
1296
1297 def close(self):
1298 self._join_threads()
1299
1300 def _join_threads(self):
1301 """Internal: Join all non-daemon threads"""
1302 threads = [thread for thread in list(self._threads.values())
1303 if thread.is_alive() and not thread.daemon]
1304 for thread in threads:
1305 thread.join()
1306
1307 def __enter__(self):
1308 return self
1309
1310 def __exit__(self, exc_type, exc_val, exc_tb):
1311 pass
1312
1313 def __del__(self, _warn=warnings.warn):
1314 threads = [thread for thread in list(self._threads.values())
1315 if thread.is_alive()]
1316 if threads:
1317 _warn(f'{self.__class__} has registered but not finished child processes',
1318 ResourceWarning,
1319 source=self)
1320
1321 def add_child_handler(self, pid, callback, *args):
1322 loop = events.get_event_loop()
1323 thread = threading.Thread(target=self._do_waitpid,
1324 name=f'waitpid-{next(self._pid_counter)}',
1325 args=(loop, pid, callback, args),
1326 daemon=True)
1327 self._threads[pid] = thread
1328 thread.start()
1329
1330 def remove_child_handler(self, pid):
1331 # asyncio never calls remove_child_handler() !!!
1332 # The method is a no-op but is implemented because the
1333 # abstract base class requires it.
1334 return True
1335
1336 def attach_loop(self, loop):
1337 pass
1338
1339 def _do_waitpid(self, loop, expected_pid, callback, args):
1340 assert expected_pid > 0
1341
1342 try:
1343 pid, status = os.waitpid(expected_pid, 0)
1344 except ChildProcessError:
1345 # The child process is already reaped
1346 # (may happen if waitpid() is called elsewhere).
1347 pid = expected_pid
1348 returncode = 255
1349 logger.warning(
1350 'Unknown child process pid %d, will report returncode 255',
1351 pid)
1352 else:
1353 if os.WIFEXITED(status):
1354 returncode = os.WEXITSTATUS(status)
1355 elif os.WIFSIGNALED(status):
1356 returncode = -os.WTERMSIG(status)
1357 else:
1358 raise ValueError(f'unknown wait status {status}')
1359 if loop.get_debug():
1360 logger.debug('process %s exited with returncode %s',
1361 expected_pid, returncode)
1362
1363 if loop.is_closed():
1364 logger.warning('Loop %r that handles pid %r is closed', loop, pid)
1365 else:
1366 loop.call_soon_threadsafe(callback, pid, returncode, *args)
1367
1368 self._threads.pop(expected_pid)
1369
1370 # unlike SafeChildWatcher which handles SIGCHLD in the main thread,
1371 # ThreadedChildWatcher runs in a separated thread, hence allows us to
1372 # run create_subprocess_exec() in a non-main thread, see
1373 # https://bugs.python.org/issue35621
1374 asyncio.set_child_watcher(ThreadedChildWatcher())
1375
1376
1377 try:
1378 from asyncio import run as async_run # type: ignore[attr-defined]
1379 except ImportError:
1380 def async_run(coro): # type: ignore
1381 loop = asyncio.new_event_loop()
1382 try:
1383 asyncio.set_event_loop(loop)
1384 return loop.run_until_complete(coro)
1385 finally:
1386 try:
1387 loop.run_until_complete(loop.shutdown_asyncgens())
1388 finally:
1389 asyncio.set_event_loop(None)
1390 loop.close()
1391
1392
1393 def call(ctx: CephadmContext,
1394 command: List[str],
1395 desc: Optional[str] = None,
1396 verbosity: CallVerbosity = CallVerbosity.VERBOSE_ON_FAILURE,
1397 timeout: Optional[int] = DEFAULT_TIMEOUT,
1398 **kwargs) -> Tuple[str, str, int]:
1399 """
1400 Wrap subprocess.Popen to
1401
1402 - log stdout/stderr to a logger,
1403 - decode utf-8
1404 - cleanly return out, err, returncode
1405
1406 :param timeout: timeout in seconds
1407 """
1408
1409 prefix = command[0] if desc is None else desc
1410 if prefix:
1411 prefix += ': '
1412 timeout = timeout or ctx.timeout
1413
1414 logger.debug('Running command: %s' % ' '.join(command))
1415
1416 async def tee(reader: asyncio.StreamReader) -> str:
1417 collected = StringIO()
1418 async for line in reader:
1419 message = line.decode('utf-8')
1420 collected.write(message)
1421 if verbosity == CallVerbosity.VERBOSE:
1422 logger.info(prefix + message.rstrip())
1423 elif verbosity != CallVerbosity.SILENT:
1424 logger.debug(prefix + message.rstrip())
1425 return collected.getvalue()
1426
1427 async def run_with_timeout() -> Tuple[str, str, int]:
1428 process = await asyncio.create_subprocess_exec(
1429 *command,
1430 stdout=asyncio.subprocess.PIPE,
1431 stderr=asyncio.subprocess.PIPE)
1432 assert process.stdout
1433 assert process.stderr
1434 try:
1435 stdout, stderr = await asyncio.gather(tee(process.stdout),
1436 tee(process.stderr))
1437 returncode = await asyncio.wait_for(process.wait(), timeout)
1438 except asyncio.TimeoutError:
1439 logger.info(prefix + f'timeout after {timeout} seconds')
1440 return '', '', 124
1441 else:
1442 return stdout, stderr, returncode
1443
1444 stdout, stderr, returncode = async_run(run_with_timeout())
1445 if returncode != 0 and verbosity == CallVerbosity.VERBOSE_ON_FAILURE:
1446 logger.info('Non-zero exit code %d from %s',
1447 returncode, ' '.join(command))
1448 for line in stdout.splitlines():
1449 logger.info(prefix + 'stdout ' + line)
1450 for line in stderr.splitlines():
1451 logger.info(prefix + 'stderr ' + line)
1452 return stdout, stderr, returncode
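# A typical call site looks like this (the command is illustrative):
#
#   out, err, code = call(ctx, ['systemctl', 'is-enabled', 'chronyd'],
#                         verbosity=CallVerbosity.DEBUG)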
1453
1454
1455 def call_throws(
1456 ctx: CephadmContext,
1457 command: List[str],
1458 desc: Optional[str] = None,
1459 verbosity: CallVerbosity = CallVerbosity.VERBOSE_ON_FAILURE,
1460 timeout: Optional[int] = DEFAULT_TIMEOUT,
1461 **kwargs) -> Tuple[str, str, int]:
1462 out, err, ret = call(ctx, command, desc, verbosity, timeout, **kwargs)
1463 if ret:
1464 raise RuntimeError('Failed command: %s' % ' '.join(command))
1465 return out, err, ret
1466
1467
1468 def call_timeout(ctx, command, timeout):
1469 # type: (CephadmContext, List[str], int) -> int
1470 logger.debug('Running command (timeout=%s): %s'
1471 % (timeout, ' '.join(command)))
1472
1473 def raise_timeout(command, timeout):
1474 # type: (List[str], int) -> NoReturn
1475 msg = 'Command `%s` timed out after %s seconds' % (command, timeout)
1476 logger.debug(msg)
1477 raise TimeoutExpired(msg)
1478
1479 try:
1480 return subprocess.call(command, timeout=timeout)
1481 except subprocess.TimeoutExpired:
1482 raise_timeout(command, timeout)
1483
1484 ##################################
1485
1486
1487 def is_available(ctx, what, func):
1488 # type: (CephadmContext, str, Callable[[], bool]) -> None
1489 """
1490 Wait for a service to become available
1491
1492 :param what: the name of the service
1493 :param func: the callable object that determines availability
1494 """
1495 retry = ctx.retry
1496 logger.info('Waiting for %s...' % what)
1497 num = 1
1498 while True:
1499 if func():
1500 logger.info('%s is available'
1501 % what)
1502 break
1503 elif num > retry:
1504 raise Error('%s not available after %s tries'
1505 % (what, retry))
1506
1507 logger.info('%s not available, waiting (%s/%s)...'
1508 % (what, num, retry))
1509
1510 num += 1
1511 time.sleep(2)
1512
1513
1514 def read_config(fn):
1515 # type: (Optional[str]) -> ConfigParser
1516 cp = ConfigParser()
1517 if fn:
1518 cp.read(fn)
1519 return cp
1520
1521
1522 def pathify(p):
1523 # type: (str) -> str
1524 p = os.path.expanduser(p)
1525 return os.path.abspath(p)
1526
1527
1528 def get_file_timestamp(fn):
1529 # type: (str) -> Optional[str]
1530 try:
1531 mt = os.path.getmtime(fn)
1532 return datetime.datetime.fromtimestamp(
1533 mt, tz=datetime.timezone.utc
1534 ).strftime(DATEFMT)
1535 except Exception:
1536 return None
1537
1538
1539 def try_convert_datetime(s):
1540 # type: (str) -> Optional[str]
1541 # This is super irritating because
1542 # 1) podman and docker use different formats
1543 # 2) python's strptime can't parse either one
1544 #
1545 # I've seen:
1546 # docker 18.09.7: 2020-03-03T09:21:43.636153304Z
1547 # podman 1.7.0: 2020-03-03T15:52:30.136257504-06:00
1548 # 2020-03-03 15:52:30.136257504 -0600 CST
1549 # (In the podman case, there is a different string format for
1550 # 'inspect' and 'inspect --format {{.Created}}'!!)
1551
1552 # In *all* cases, the 9 digit second precision is too much for
1553 # python's strptime. Shorten it to 6 digits.
1554 p = re.compile(r'(\.[\d]{6})[\d]*')
1555 s = p.sub(r'\1', s)
1556
1557 # replace trailing Z with -0000, since (on python 3.6.8) it won't parse
1558 if s and s[-1] == 'Z':
1559 s = s[:-1] + '-0000'
1560
1561 # cut off the redundant 'CST' part that strptime can't parse, if
1562 # present.
1563 v = s.split(' ')
1564 s = ' '.join(v[0:3])
1565
1566 # try parsing with several format strings
1567 fmts = [
1568 '%Y-%m-%dT%H:%M:%S.%f%z',
1569 '%Y-%m-%d %H:%M:%S.%f %z',
1570 ]
1571 for f in fmts:
1572 try:
1573 # return timestamp normalized to UTC, rendered as DATEFMT.
1574 return datetime.datetime.strptime(s, f).astimezone(tz=datetime.timezone.utc).strftime(DATEFMT)
1575 except ValueError:
1576 pass
1577 return None
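# e.g. try_convert_datetime('2020-03-03T09:21:43.636153304Z')
# -> '2020-03-03T09:21:43.636153Z' (precision trimmed, normalized to UTC)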
1578
1579
1580 def _parse_podman_version(version_str):
1581 # type: (str) -> Tuple[int, ...]
1582 def to_int(val, org_e=None):
1583 if not val and org_e:
1584 raise org_e
1585 try:
1586 return int(val)
1587 except ValueError as e:
1588 return to_int(val[0:-1], org_e or e)
1589
1590 return tuple(map(to_int, version_str.split('.')))
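# e.g. _parse_podman_version('2.0.5') == (2, 0, 5); trailing non-numeric
# characters are stripped, so '2.2.1-rc2' also yields (2, 2, 1).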
1591
1592
1593 def get_hostname():
1594 # type: () -> str
1595 return socket.gethostname()
1596
1597
1598 def get_fqdn():
1599 # type: () -> str
1600 return socket.getfqdn() or socket.gethostname()
1601
1602
1603 def get_arch():
1604 # type: () -> str
1605 return platform.uname().machine
1606
1607
1608 def generate_service_id():
1609 # type: () -> str
1610 return get_hostname() + '.' + ''.join(random.choice(string.ascii_lowercase)
1611 for _ in range(6))
1612
1613
1614 def generate_password():
1615 # type: () -> str
1616 return ''.join(random.choice(string.ascii_lowercase + string.digits)
1617 for i in range(10))
1618
1619
1620 def normalize_container_id(i):
1621 # type: (str) -> str
1622 # docker adds the sha256: prefix, but AFAICS
1623 # docker (18.09.7 in bionic at least) and podman
1624 # always use sha256, so leave off the prefix
1625 # for consistency.
1626 prefix = 'sha256:'
1627 if i.startswith(prefix):
1628 i = i[len(prefix):]
1629 return i
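# e.g. normalize_container_id('sha256:1a2b3c...') -> '1a2b3c...' (digest illustrative)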
1630
1631
1632 def make_fsid():
1633 # type: () -> str
1634 return str(uuid.uuid1())
1635
1636
1637 def is_fsid(s):
1638 # type: (str) -> bool
1639 try:
1640 uuid.UUID(s)
1641 except ValueError:
1642 return False
1643 return True
1644
1645
1646 def infer_fsid(func):
1647 """
1648 If we only find a single fsid in /var/lib/ceph/*, use that
1649 """
1650 @wraps(func)
1651 def _infer_fsid(ctx: CephadmContext):
1652 if ctx.fsid:
1653 logger.debug('Using specified fsid: %s' % ctx.fsid)
1654 return func(ctx)
1655
1656 fsids_set = set()
1657 daemon_list = list_daemons(ctx, detail=False)
1658 for daemon in daemon_list:
1659 if not is_fsid(daemon['fsid']):
1660 # 'unknown' fsid
1661 continue
1662 elif 'name' not in ctx or not ctx.name:
1663 # ctx.name not specified
1664 fsids_set.add(daemon['fsid'])
1665 elif daemon['name'] == ctx.name:
1666 # ctx.name is a match
1667 fsids_set.add(daemon['fsid'])
1668 fsids = sorted(fsids_set)
1669
1670 if not fsids:
1671 # some commands do not always require an fsid
1672 pass
1673 elif len(fsids) == 1:
1674 logger.info('Inferring fsid %s' % fsids[0])
1675 ctx.fsid = fsids[0]
1676 else:
1677 raise Error('Cannot infer an fsid, one must be specified: %s' % fsids)
1678 return func(ctx)
1679
1680 return _infer_fsid
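# Applied as a decorator on command entry points, e.g. (sketch):
#
#   @infer_fsid
#   def command_unit(ctx: CephadmContext) -> None:
#       ...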
1681
1682
1683 def infer_config(func):
1684 """
1685 If we find a MON daemon, use the config from that container
1686 """
1687 @wraps(func)
1688 def _infer_config(ctx: CephadmContext):
1689 if ctx.config:
1690 logger.debug('Using specified config: %s' % ctx.config)
1691 return func(ctx)
1692 config = None
1693 if ctx.fsid:
1694 name = ctx.name
1695 if not name:
1696 daemon_list = list_daemons(ctx, detail=False)
1697 for daemon in daemon_list:
1698 if daemon['name'].startswith('mon.'):
1699 name = daemon['name']
1700 break
1701 if name:
1702 config = '/var/lib/ceph/{}/{}/config'.format(ctx.fsid,
1703 name)
1704 if config:
1705 logger.info('Inferring config %s' % config)
1706 ctx.config = config
1707 elif os.path.exists(SHELL_DEFAULT_CONF):
1708 logger.debug('Using default config: %s' % SHELL_DEFAULT_CONF)
1709 ctx.config = SHELL_DEFAULT_CONF
1710 return func(ctx)
1711
1712 return _infer_config
1713
1714
1715 def _get_default_image(ctx: CephadmContext):
1716 if DEFAULT_IMAGE_IS_MASTER:
1717 warn = """This is a development version of cephadm.
1718 For information regarding the latest stable release:
1719 https://docs.ceph.com/docs/{}/cephadm/install
1720 """.format(LATEST_STABLE_RELEASE)
1721 for line in warn.splitlines():
1722 logger.warning('{}{}{}'.format(termcolor.yellow, line, termcolor.end))
1723 return DEFAULT_IMAGE
1724
1725
1726 def infer_image(func):
1727 """
1728 Use the most recent ceph image
1729 """
1730 @wraps(func)
1731 def _infer_image(ctx: CephadmContext):
1732 if not ctx.image:
1733 ctx.image = os.environ.get('CEPHADM_IMAGE')
1734 if not ctx.image:
1735 ctx.image = get_last_local_ceph_image(ctx, ctx.container_engine.path)
1736 if not ctx.image:
1737 ctx.image = _get_default_image(ctx)
1738 return func(ctx)
1739
1740 return _infer_image
1741
1742
1743 def default_image(func):
1744 @wraps(func)
1745 def _default_image(ctx: CephadmContext):
1746 if not ctx.image:
1747 if 'name' in ctx and ctx.name:
1748 type_ = ctx.name.split('.', 1)[0]
1749 if type_ in Monitoring.components:
1750 ctx.image = Monitoring.components[type_]['image']
1751 if type_ == 'haproxy':
1752 ctx.image = HAproxy.default_image
1753 if type_ == 'keepalived':
1754 ctx.image = Keepalived.default_image
1755 if not ctx.image:
1756 ctx.image = os.environ.get('CEPHADM_IMAGE')
1757 if not ctx.image:
1758 ctx.image = _get_default_image(ctx)
1759
1760 return func(ctx)
1761
1762 return _default_image
1763
1764
1765 def get_last_local_ceph_image(ctx: CephadmContext, container_path: str):
1766 """
1767 :return: The most recent local ceph image (already pulled)
1768 """
1769 out, _, _ = call_throws(ctx,
1770 [container_path, 'images',
1771 '--filter', 'label=ceph=True',
1772 '--filter', 'dangling=false',
1773 '--format', '{{.Repository}}@{{.Digest}}'])
1774 return _filter_last_local_ceph_image(out)
1775
1776
1777 def _filter_last_local_ceph_image(out):
1778 # type: (str) -> Optional[str]
1779 for image in out.splitlines():
1780 if image and not image.endswith('@'):
1781 logger.info('Using recent ceph image %s' % image)
1782 return image
1783 return None
1784
1785
1786 def write_tmp(s, uid, gid):
1787 # type: (str, int, int) -> IO[str]
1788 tmp_f = tempfile.NamedTemporaryFile(mode='w',
1789 prefix='ceph-tmp')
1790 os.fchown(tmp_f.fileno(), uid, gid)
1791 tmp_f.write(s)
1792 tmp_f.flush()
1793
1794 return tmp_f
1795
1796
1797 def makedirs(dir, uid, gid, mode):
1798 # type: (str, int, int, int) -> None
1799 if not os.path.exists(dir):
1800 os.makedirs(dir, mode=mode)
1801 else:
1802 os.chmod(dir, mode)
1803 os.chown(dir, uid, gid)
1804 os.chmod(dir, mode) # the above is masked by umask...
1805
1806
1807 def get_data_dir(fsid, data_dir, t, n):
1808 # type: (str, str, str, Union[int, str]) -> str
1809 return os.path.join(data_dir, fsid, '%s.%s' % (t, n))
1810
1811
1812 def get_log_dir(fsid, log_dir):
1813 # type: (str, str) -> str
1814 return os.path.join(log_dir, fsid)
1815
1816
1817 def make_data_dir_base(fsid, data_dir, uid, gid):
1818 # type: (str, str, int, int) -> str
1819 data_dir_base = os.path.join(data_dir, fsid)
1820 makedirs(data_dir_base, uid, gid, DATA_DIR_MODE)
1821 makedirs(os.path.join(data_dir_base, 'crash'), uid, gid, DATA_DIR_MODE)
1822 makedirs(os.path.join(data_dir_base, 'crash', 'posted'), uid, gid,
1823 DATA_DIR_MODE)
1824 return data_dir_base
1825
1826
1827 def make_data_dir(ctx, fsid, daemon_type, daemon_id, uid=None, gid=None):
1828 # type: (CephadmContext, str, str, Union[int, str], Optional[int], Optional[int]) -> str
1829 if uid is None or gid is None:
1830 uid, gid = extract_uid_gid(ctx)
1831 make_data_dir_base(fsid, ctx.data_dir, uid, gid)
1832 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
1833 makedirs(data_dir, uid, gid, DATA_DIR_MODE)
1834 return data_dir
1835
1836
1837 def make_log_dir(ctx, fsid, uid=None, gid=None):
1838 # type: (CephadmContext, str, Optional[int], Optional[int]) -> str
1839 if uid is None or gid is None:
1840 uid, gid = extract_uid_gid(ctx)
1841 log_dir = get_log_dir(fsid, ctx.log_dir)
1842 makedirs(log_dir, uid, gid, LOG_DIR_MODE)
1843 return log_dir
1844
1845
1846 def make_var_run(ctx, fsid, uid, gid):
1847 # type: (CephadmContext, str, int, int) -> None
1848 call_throws(ctx, ['install', '-d', '-m0770', '-o', str(uid), '-g', str(gid),
1849 '/var/run/ceph/%s' % fsid])
1850
1851
1852 def copy_tree(ctx, src, dst, uid=None, gid=None):
1853 # type: (CephadmContext, List[str], str, Optional[int], Optional[int]) -> None
1854 """
1855 Copy a directory tree from src to dst
1856 """
1857 if uid is None or gid is None:
1858 (uid, gid) = extract_uid_gid(ctx)
1859
1860 for src_dir in src:
1861 dst_dir = dst
1862 if os.path.isdir(dst):
1863 dst_dir = os.path.join(dst, os.path.basename(src_dir))
1864
1865 logger.debug('copy directory `%s` -> `%s`' % (src_dir, dst_dir))
1866 shutil.rmtree(dst_dir, ignore_errors=True)
1867 shutil.copytree(src_dir, dst_dir) # dirs_exist_ok needs python 3.8
1868
1869 for dirpath, dirnames, filenames in os.walk(dst_dir):
1870 logger.debug('chown %s:%s `%s`' % (uid, gid, dirpath))
1871 os.chown(dirpath, uid, gid)
1872 for filename in filenames:
1873 logger.debug('chown %s:%s `%s`' % (uid, gid, filename))
1874 os.chown(os.path.join(dirpath, filename), uid, gid)
1875
1876
1877 def copy_files(ctx, src, dst, uid=None, gid=None):
1878 # type: (CephadmContext, List[str], str, Optional[int], Optional[int]) -> None
1879 """
1880 Copy files from src to dst
1881 """
1882 if uid is None or gid is None:
1883 (uid, gid) = extract_uid_gid(ctx)
1884
1885 for src_file in src:
1886 dst_file = dst
1887 if os.path.isdir(dst):
1888 dst_file = os.path.join(dst, os.path.basename(src_file))
1889
1890 logger.debug('copy file `%s` -> `%s`' % (src_file, dst_file))
1891 shutil.copyfile(src_file, dst_file)
1892
1893 logger.debug('chown %s:%s `%s`' % (uid, gid, dst_file))
1894 os.chown(dst_file, uid, gid)
1895
1896
1897 def move_files(ctx, src, dst, uid=None, gid=None):
1898 # type: (CephadmContext, List[str], str, Optional[int], Optional[int]) -> None
1899 """
1900 Move files from src to dst
1901 """
1902 if uid is None or gid is None:
1903 (uid, gid) = extract_uid_gid(ctx)
1904
1905 for src_file in src:
1906 dst_file = dst
1907 if os.path.isdir(dst):
1908 dst_file = os.path.join(dst, os.path.basename(src_file))
1909
1910 if os.path.islink(src_file):
1911 # shutil.move() in py2 does not handle symlinks correctly
1912 src_rl = os.readlink(src_file)
1913 logger.debug("symlink '%s' -> '%s'" % (dst_file, src_rl))
1914 os.symlink(src_rl, dst_file)
1915 os.unlink(src_file)
1916 else:
1917 logger.debug("move file '%s' -> '%s'" % (src_file, dst_file))
1918 shutil.move(src_file, dst_file)
1919 logger.debug('chown %s:%s `%s`' % (uid, gid, dst_file))
1920 os.chown(dst_file, uid, gid)
1921
1922
1923 # copied from distutils
1924 def find_executable(executable, path=None):
1925 """Tries to find 'executable' in the directories listed in 'path'.
1926 'path' is a string of directories separated by 'os.pathsep' and defaults to
1927 os.environ['PATH']. Returns the complete filename, or None if not found.
1928 """
1929 _, ext = os.path.splitext(executable)
1930 if (sys.platform == 'win32') and (ext != '.exe'):
1931 executable = executable + '.exe'
1932
1933 if os.path.isfile(executable):
1934 return executable
1935
1936 if path is None:
1937 path = os.environ.get('PATH', None)
1938 if path is None:
1939 try:
1940 path = os.confstr('CS_PATH')
1941 except (AttributeError, ValueError):
1942 # os.confstr() or CS_PATH is not available
1943 path = os.defpath
1944 # bpo-35755: Don't use os.defpath if the PATH environment variable is
1945 # set to an empty string
1946
1947 # PATH='' doesn't match, whereas PATH=':' looks in the current directory
1948 if not path:
1949 return None
1950
1951 paths = path.split(os.pathsep)
1952 for p in paths:
1953 f = os.path.join(p, executable)
1954 if os.path.isfile(f):
1955 # the file exists, we have a shot at spawn working
1956 return f
1957 return None
1958
1959
1960 def find_program(filename):
1961 # type: (str) -> str
1962 name = find_executable(filename)
1963 if name is None:
1964 raise ValueError('%s not found' % filename)
1965 return name
1966
1967
1968 def find_container_engine(ctx: CephadmContext):
1969 if ctx.docker:
1970 return Docker()
1971 else:
1972 for i in CONTAINER_PREFERENCE:
1973 try:
1974 return i()
1975 except Exception as e:
1976 logger.debug('Could not locate %s: %s' % (i.EXE, e))
1977 return None
1978
1979
1980 def check_container_engine(ctx):
1981 # type: (CephadmContext) -> None
1982 engine = ctx.container_engine
1983 if not isinstance(engine, CONTAINER_PREFERENCE):
1984 raise Error('Unable to locate any of %s' % [i.EXE for i in CONTAINER_PREFERENCE])
1985 elif isinstance(engine, Podman):
1986 engine.get_version(ctx)
1987 if engine.version < MIN_PODMAN_VERSION:
1988 raise Error('podman version %d.%d.%d or later is required' % MIN_PODMAN_VERSION)
1989
1990
1991 def get_unit_name(fsid, daemon_type, daemon_id=None):
1992 # type: (str, str, Optional[Union[int, str]]) -> str
1993 # accept either name or type + id
1994 if daemon_type == CephadmDaemon.daemon_type and daemon_id is not None:
1995 return 'ceph-%s-%s.%s' % (fsid, daemon_type, daemon_id)
1996 elif daemon_id is not None:
1997 return 'ceph-%s@%s.%s' % (fsid, daemon_type, daemon_id)
1998 else:
1999 return 'ceph-%s@%s' % (fsid, daemon_type)
2000
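# Illustration (added; not part of the upstream script): the unit naming
# scheme that the systemd machinery below relies on. Regular daemons are
# instances of the templated ceph-<fsid>@.service, while the cephadm
# exporter gets a dedicated, non-templated unit:
def _example_unit_names():
    # type: () -> None
    fsid = '11111111-2222-3333-4444-555555555555'  # hypothetical
    assert get_unit_name(fsid, 'osd', 3) == 'ceph-%s@osd.3' % fsid
    assert get_unit_name(fsid, 'crash') == 'ceph-%s@crash' % fsid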
2001
2002 def get_unit_name_by_daemon_name(ctx: CephadmContext, fsid, name):
2003 daemon = get_daemon_description(ctx, fsid, name)
2004 try:
2005 return daemon['systemd_unit']
2006 except KeyError:
2007 raise Error('Failed to get unit name for {}'.format(daemon))
2008
2009
2010 def check_unit(ctx, unit_name):
2011 # type: (CephadmContext, str) -> Tuple[bool, str, bool]
2012 # NOTE: for the is-active check we ignore the exit code because
2013 # systemctl returns a variety of nonzero codes depending on the
2014 # service's state; the string output is more explicit (and sufficient).
2015 enabled = False
2016 installed = False
2017 try:
2018 out, err, code = call(ctx, ['systemctl', 'is-enabled', unit_name],
2019 verbosity=CallVerbosity.DEBUG)
2020 if code == 0:
2021 enabled = True
2022 installed = True
2023 elif 'disabled' in out:
2024 installed = True
2025 except Exception as e:
2026 logger.warning('unable to run systemctl: %s' % e)
2027 enabled = False
2028 installed = False
2029
2030 state = 'unknown'
2031 try:
2032 out, err, code = call(ctx, ['systemctl', 'is-active', unit_name],
2033 verbosity=CallVerbosity.DEBUG)
2034 out = out.strip()
2035 if out in ['active']:
2036 state = 'running'
2037 elif out in ['inactive']:
2038 state = 'stopped'
2039 elif out in ['failed', 'auto-restart']:
2040 state = 'error'
2041 else:
2042 state = 'unknown'
2043 except Exception as e:
2044 logger.warning('unable to run systemctl: %s' % e)
2045 state = 'unknown'
2046 return (enabled, state, installed)
2047
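# Illustration (added; not part of the upstream script): how the tuple
# from check_unit() is typically consumed; 'state' is one of 'running',
# 'stopped', 'error' or 'unknown':
def _example_is_unit_healthy(ctx, fsid):
    # type: (CephadmContext, str) -> bool
    unit_name = get_unit_name(fsid, 'mon', 'a')  # hypothetical mon.a
    enabled, state, installed = check_unit(ctx, unit_name)
    return installed and enabled and state == 'running'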
2048
2049 def check_units(ctx, units, enabler=None):
2050 # type: (CephadmContext, List[str], Optional[Packager]) -> bool
2051 for u in units:
2052 (enabled, state, installed) = check_unit(ctx, u)
2053 if enabled and state == 'running':
2054 logger.info('Unit %s is enabled and running' % u)
2055 return True
2056 if enabler is not None:
2057 if installed:
2058 logger.info('Enabling unit %s' % u)
2059 enabler.enable_service(u)
2060 return False
2061
2062
2063 def is_container_running(ctx: CephadmContext, name: str) -> bool:
2064 out, err, ret = call_throws(ctx, [
2065 ctx.container_engine.path, 'ps',
2066 '--format', '{{.Names}}'])
2067 return name in out.splitlines()  # match whole names, not substrings
2068
2069
2070 def get_legacy_config_fsid(cluster, legacy_dir=None):
2071 # type: (str, Optional[str]) -> Optional[str]
2072 config_file = '/etc/ceph/%s.conf' % cluster
2073 if legacy_dir is not None:
2074 config_file = os.path.abspath(legacy_dir + config_file)
2075
2076 if os.path.exists(config_file):
2077 config = read_config(config_file)
2078 if config.has_section('global') and config.has_option('global', 'fsid'):
2079 return config.get('global', 'fsid')
2080 return None
2081
2082
2083 def get_legacy_daemon_fsid(ctx, cluster,
2084 daemon_type, daemon_id, legacy_dir=None):
2085 # type: (CephadmContext, str, str, Union[int, str], Optional[str]) -> Optional[str]
2086 fsid = None
2087 if daemon_type == 'osd':
2088 try:
2089 fsid_file = os.path.join(ctx.data_dir,
2090 daemon_type,
2091 'ceph-%s' % daemon_id,
2092 'ceph_fsid')
2093 if legacy_dir is not None:
2094 fsid_file = os.path.abspath(legacy_dir + fsid_file)
2095 with open(fsid_file, 'r') as f:
2096 fsid = f.read().strip()
2097 except IOError:
2098 pass
2099 if not fsid:
2100 fsid = get_legacy_config_fsid(cluster, legacy_dir=legacy_dir)
2101 return fsid
2102
2103
2104 def get_daemon_args(ctx, fsid, daemon_type, daemon_id):
2105 # type: (CephadmContext, str, str, Union[int, str]) -> List[str]
2106 r = list() # type: List[str]
2107
2108 if daemon_type in Ceph.daemons and daemon_type != 'crash':
2109 r += [
2110 '--setuser', 'ceph',
2111 '--setgroup', 'ceph',
2112 '--default-log-to-file=false',
2113 '--default-log-to-stderr=true',
2114 '--default-log-stderr-prefix=debug ',
2115 ]
2116 if daemon_type == 'mon':
2117 r += [
2118 '--default-mon-cluster-log-to-file=false',
2119 '--default-mon-cluster-log-to-stderr=true',
2120 ]
2121 elif daemon_type in Monitoring.components:
2122 metadata = Monitoring.components[daemon_type]
2123 r += metadata.get('args', list())
2124 if daemon_type == 'alertmanager':
2125 config = get_parm(ctx.config_json)
2126 peers = config.get('peers', list()) # type: ignore
2127 for peer in peers:
2128 r += ['--cluster.peer={}'.format(peer)]
2129 # some alertmanager images, by default, look elsewhere for their config
2130 r += ['--config.file=/etc/alertmanager/alertmanager.yml']
2131 elif daemon_type == NFSGanesha.daemon_type:
2132 nfs_ganesha = NFSGanesha.init(ctx, fsid, daemon_id)
2133 r += nfs_ganesha.get_daemon_args()
2134 elif daemon_type == HAproxy.daemon_type:
2135 haproxy = HAproxy.init(ctx, fsid, daemon_id)
2136 r += haproxy.get_daemon_args()
2137 elif daemon_type == CustomContainer.daemon_type:
2138 cc = CustomContainer.init(ctx, fsid, daemon_id)
2139 r.extend(cc.get_daemon_args())
2140
2141 return r
2142
2143
2144 def create_daemon_dirs(ctx, fsid, daemon_type, daemon_id, uid, gid,
2145 config=None, keyring=None):
2146 # type: (CephadmContext, str, str, Union[int, str], int, int, Optional[str], Optional[str]) -> None
2147 data_dir = make_data_dir(ctx, fsid, daemon_type, daemon_id, uid=uid, gid=gid)
2148 make_log_dir(ctx, fsid, uid=uid, gid=gid)
2149
2150 if config:
2151 config_path = os.path.join(data_dir, 'config')
2152 with open(config_path, 'w') as f:
2153 os.fchown(f.fileno(), uid, gid)
2154 os.fchmod(f.fileno(), 0o600)
2155 f.write(config)
2156
2157 if keyring:
2158 keyring_path = os.path.join(data_dir, 'keyring')
2159 with open(keyring_path, 'w') as f:
2160 os.fchmod(f.fileno(), 0o600)
2161 os.fchown(f.fileno(), uid, gid)
2162 f.write(keyring)
2163
2164 if daemon_type in Monitoring.components.keys():
2165 config_json: Dict[str, Any] = get_parm(ctx.config_json)
2166 required_files = Monitoring.components[daemon_type].get('config-json-files', list())
2167
2168 # Set up directories specific to the monitoring component
2169 config_dir = ''
2170 data_dir_root = ''
2171 if daemon_type == 'prometheus':
2172 data_dir_root = get_data_dir(fsid, ctx.data_dir,
2173 daemon_type, daemon_id)
2174 config_dir = 'etc/prometheus'
2175 makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755)
2176 makedirs(os.path.join(data_dir_root, config_dir, 'alerting'), uid, gid, 0o755)
2177 makedirs(os.path.join(data_dir_root, 'data'), uid, gid, 0o755)
2178 elif daemon_type == 'grafana':
2179 data_dir_root = get_data_dir(fsid, ctx.data_dir,
2180 daemon_type, daemon_id)
2181 config_dir = 'etc/grafana'
2182 makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755)
2183 makedirs(os.path.join(data_dir_root, config_dir, 'certs'), uid, gid, 0o755)
2184 makedirs(os.path.join(data_dir_root, config_dir, 'provisioning/datasources'), uid, gid, 0o755)
2185 makedirs(os.path.join(data_dir_root, 'data'), uid, gid, 0o755)
2186 touch(os.path.join(data_dir_root, 'data', 'grafana.db'), uid, gid)
2187 elif daemon_type == 'alertmanager':
2188 data_dir_root = get_data_dir(fsid, ctx.data_dir,
2189 daemon_type, daemon_id)
2190 config_dir = 'etc/alertmanager'
2191 makedirs(os.path.join(data_dir_root, config_dir), uid, gid, 0o755)
2192 makedirs(os.path.join(data_dir_root, config_dir, 'data'), uid, gid, 0o755)
2193
2194 # populate the config directory for the component from the config-json
2195 for fname in required_files:
2196 if 'files' in config_json: # type: ignore
2197 content = dict_get_join(config_json['files'], fname)
2198 with open(os.path.join(data_dir_root, config_dir, fname), 'w') as f:
2199 os.fchown(f.fileno(), uid, gid)
2200 os.fchmod(f.fileno(), 0o600)
2201 f.write(content)
2202
2203 elif daemon_type == NFSGanesha.daemon_type:
2204 nfs_ganesha = NFSGanesha.init(ctx, fsid, daemon_id)
2205 nfs_ganesha.create_daemon_dirs(data_dir, uid, gid)
2206
2207 elif daemon_type == CephIscsi.daemon_type:
2208 ceph_iscsi = CephIscsi.init(ctx, fsid, daemon_id)
2209 ceph_iscsi.create_daemon_dirs(data_dir, uid, gid)
2210
2211 elif daemon_type == HAproxy.daemon_type:
2212 haproxy = HAproxy.init(ctx, fsid, daemon_id)
2213 haproxy.create_daemon_dirs(data_dir, uid, gid)
2214
2215 elif daemon_type == Keepalived.daemon_type:
2216 keepalived = Keepalived.init(ctx, fsid, daemon_id)
2217 keepalived.create_daemon_dirs(data_dir, uid, gid)
2218
2219 elif daemon_type == CustomContainer.daemon_type:
2220 cc = CustomContainer.init(ctx, fsid, daemon_id)
2221 cc.create_daemon_dirs(data_dir, uid, gid)
2222
2223
2224 def get_parm(option):
2225 # type: (str) -> Dict[str, str]
2226
2227 if not option:
2228 return dict()
2229
2230 global cached_stdin
2231 if option == '-':
2232 if cached_stdin is not None:
2233 j = cached_stdin
2234 else:
2235 j = sys.stdin.read()
2236 cached_stdin = j
2237 else:
2238 # inline json string
2239 if option[0] == '{' and option[-1] == '}':
2240 j = option
2241 # json file
2242 elif os.path.exists(option):
2243 with open(option, 'r') as f:
2244 j = f.read()
2245 else:
2246 raise Error('Config file {} not found'.format(option))
2247
2248 try:
2249 js = json.loads(j)
2250 except ValueError as e:
2251 raise Error('Invalid JSON in {}: {}'.format(option, e))
2252 else:
2253 return js
2254
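# Illustration (added; not part of the upstream script): the three
# spellings that get_parm() (and thus --config-json) accepts:
#
#     get_parm('-')                    # read JSON from stdin (cached)
#     get_parm('{"config": "...", "keyring": "..."}')   # inline JSON
#     get_parm('/path/to/config.json')                  # JSON file
#
# Anything else raises Error, as does invalid JSON in any of the forms.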
2255
2256 def get_config_and_keyring(ctx):
2257 # type: (CephadmContext) -> Tuple[Optional[str], Optional[str]]
2258 config = None
2259 keyring = None
2260
2261 if 'config_json' in ctx and ctx.config_json:
2262 d = get_parm(ctx.config_json)
2263 config = d.get('config')
2264 keyring = d.get('keyring')
2265
2266 if 'config' in ctx and ctx.config:
2267 try:
2268 with open(ctx.config, 'r') as f:
2269 config = f.read()
2270 except FileNotFoundError:
2271 raise Error('config file: %s does not exist' % ctx.config)
2272
2273 if 'key' in ctx and ctx.key:
2274 keyring = '[%s]\n\tkey = %s\n' % (ctx.name, ctx.key)
2275 elif 'keyring' in ctx and ctx.keyring:
2276 try:
2277 with open(ctx.keyring, 'r') as f:
2278 keyring = f.read()
2279 except FileNotFoundError:
2280 raise Error('keyring file: %s does not exist' % ctx.keyring)
2281
2282 return config, keyring
2283
2284
2285 def get_container_binds(ctx, fsid, daemon_type, daemon_id):
2286 # type: (CephadmContext, str, str, Union[int, str, None]) -> List[List[str]]
2287 binds = list()
2288
2289 if daemon_type == CephIscsi.daemon_type:
2290 binds.extend(CephIscsi.get_container_binds())
2291 elif daemon_type == CustomContainer.daemon_type:
2292 assert daemon_id
2293 cc = CustomContainer.init(ctx, fsid, daemon_id)
2294 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2295 binds.extend(cc.get_container_binds(data_dir))
2296
2297 return binds
2298
2299
2300 def get_container_mounts(ctx, fsid, daemon_type, daemon_id,
2301 no_config=False):
2302 # type: (CephadmContext, str, str, Union[int, str, None], Optional[bool]) -> Dict[str, str]
2303 mounts = dict()
2304
2305 if daemon_type in Ceph.daemons:
2306 if fsid:
2307 run_path = os.path.join('/var/run/ceph', fsid)
2308 if os.path.exists(run_path):
2309 mounts[run_path] = '/var/run/ceph:z'
2310 log_dir = get_log_dir(fsid, ctx.log_dir)
2311 mounts[log_dir] = '/var/log/ceph:z'
2312 crash_dir = '/var/lib/ceph/%s/crash' % fsid
2313 if os.path.exists(crash_dir):
2314 mounts[crash_dir] = '/var/lib/ceph/crash:z'
2315
2316 if daemon_type in Ceph.daemons and daemon_id:
2317 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2318 if daemon_type == 'rgw':
2319 cdata_dir = '/var/lib/ceph/radosgw/ceph-rgw.%s' % (daemon_id)
2320 else:
2321 cdata_dir = '/var/lib/ceph/%s/ceph-%s' % (daemon_type, daemon_id)
2322 if daemon_type != 'crash':
2323 mounts[data_dir] = cdata_dir + ':z'
2324 if not no_config:
2325 mounts[data_dir + '/config'] = '/etc/ceph/ceph.conf:z'
2326 if daemon_type in ['rbd-mirror', 'cephfs-mirror', 'crash']:
2327 # these do not search for their keyrings in a data directory
2328 mounts[data_dir + '/keyring'] = '/etc/ceph/ceph.client.%s.%s.keyring' % (daemon_type, daemon_id)
2329
2330 if daemon_type in ['mon', 'osd']:
2331 mounts['/dev'] = '/dev' # FIXME: narrow this down?
2332 mounts['/run/udev'] = '/run/udev'
2333 if daemon_type == 'osd':
2334 mounts['/sys'] = '/sys' # for numa.cc, pick_address, cgroups, ...
2335 # selinux-policy in the container may not match the host.
2336 if HostFacts(ctx).selinux_enabled:
2337 selinux_folder = '/var/lib/ceph/%s/selinux' % fsid
2338 if not os.path.exists(selinux_folder):
2339 os.makedirs(selinux_folder, mode=0o755)
2340 mounts[selinux_folder] = '/sys/fs/selinux:ro'
2341 mounts['/run/lvm'] = '/run/lvm'
2342 mounts['/run/lock/lvm'] = '/run/lock/lvm'
2343
2344 try:
2345 if ctx.shared_ceph_folder: # eases manager module/ceph-volume development
2346 ceph_folder = pathify(ctx.shared_ceph_folder)
2347 if os.path.exists(ceph_folder):
2348 mounts[ceph_folder + '/src/ceph-volume/ceph_volume'] = '/usr/lib/python3.6/site-packages/ceph_volume'
2349 mounts[ceph_folder + '/src/pybind/mgr'] = '/usr/share/ceph/mgr'
2350 mounts[ceph_folder + '/src/python-common/ceph'] = '/usr/lib/python3.6/site-packages/ceph'
2351 mounts[ceph_folder + '/monitoring/grafana/dashboards'] = '/etc/grafana/dashboards/ceph-dashboard'
2352 mounts[ceph_folder + '/monitoring/prometheus/alerts'] = '/etc/prometheus/ceph'
2353 else:
2354 logger.error('{}{}{}'.format(termcolor.red,
2355 'Ceph shared source folder does not exist.',
2356 termcolor.end))
2357 except AttributeError:
2358 pass
2359
2360 if daemon_type in Monitoring.components and daemon_id:
2361 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2362 if daemon_type == 'prometheus':
2363 mounts[os.path.join(data_dir, 'etc/prometheus')] = '/etc/prometheus:Z'
2364 mounts[os.path.join(data_dir, 'data')] = '/prometheus:Z'
2365 elif daemon_type == 'node-exporter':
2366 mounts['/proc'] = '/host/proc:ro'
2367 mounts['/sys'] = '/host/sys:ro'
2368 mounts['/'] = '/rootfs:ro'
2369 elif daemon_type == 'grafana':
2370 mounts[os.path.join(data_dir, 'etc/grafana/grafana.ini')] = '/etc/grafana/grafana.ini:Z'
2371 mounts[os.path.join(data_dir, 'etc/grafana/provisioning/datasources')] = '/etc/grafana/provisioning/datasources:Z'
2372 mounts[os.path.join(data_dir, 'etc/grafana/certs')] = '/etc/grafana/certs:Z'
2373 mounts[os.path.join(data_dir, 'data/grafana.db')] = '/var/lib/grafana/grafana.db:Z'
2374 elif daemon_type == 'alertmanager':
2375 mounts[os.path.join(data_dir, 'etc/alertmanager')] = '/etc/alertmanager:Z'
2376
2377 if daemon_type == NFSGanesha.daemon_type:
2378 assert daemon_id
2379 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2380 nfs_ganesha = NFSGanesha.init(ctx, fsid, daemon_id)
2381 mounts.update(nfs_ganesha.get_container_mounts(data_dir))
2382
2383 if daemon_type == HAproxy.daemon_type:
2384 assert daemon_id
2385 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2386 mounts.update(HAproxy.get_container_mounts(data_dir))
2387
2388 if daemon_type == CephIscsi.daemon_type:
2389 assert daemon_id
2390 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2391 log_dir = get_log_dir(fsid, ctx.log_dir)
2392 mounts.update(CephIscsi.get_container_mounts(data_dir, log_dir))
2393
2394 if daemon_type == Keepalived.daemon_type:
2395 assert daemon_id
2396 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2397 mounts.update(Keepalived.get_container_mounts(data_dir))
2398
2399 if daemon_type == CustomContainer.daemon_type:
2400 assert daemon_id
2401 cc = CustomContainer.init(ctx, fsid, daemon_id)
2402 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2403 mounts.update(cc.get_container_mounts(data_dir))
2404
2405 return mounts
2406
2407
2408 def get_container(ctx: CephadmContext,
2409 fsid: str, daemon_type: str, daemon_id: Union[int, str],
2410 privileged: bool = False,
2411 ptrace: bool = False,
2412 container_args: Optional[List[str]] = None) -> 'CephContainer':
2413 entrypoint: str = ''
2414 name: str = ''
2415 ceph_args: List[str] = []
2416 envs: List[str] = []
2417 host_network: bool = True
2418
2419 if container_args is None:
2420 container_args = []
2421 if daemon_type in ['mon', 'osd']:
2422 # mon and osd need privileged in order for libudev to query devices
2423 privileged = True
2424 if daemon_type == 'rgw':
2425 entrypoint = '/usr/bin/radosgw'
2426 name = 'client.rgw.%s' % daemon_id
2427 elif daemon_type == 'rbd-mirror':
2428 entrypoint = '/usr/bin/rbd-mirror'
2429 name = 'client.rbd-mirror.%s' % daemon_id
2430 elif daemon_type == 'cephfs-mirror':
2431 entrypoint = '/usr/bin/cephfs-mirror'
2432 name = 'client.cephfs-mirror.%s' % daemon_id
2433 elif daemon_type == 'crash':
2434 entrypoint = '/usr/bin/ceph-crash'
2435 name = 'client.crash.%s' % daemon_id
2436 elif daemon_type in ['mon', 'mgr', 'mds', 'osd']:
2437 entrypoint = '/usr/bin/ceph-' + daemon_type
2438 name = '%s.%s' % (daemon_type, daemon_id)
2439 elif daemon_type in Monitoring.components:
2440 entrypoint = ''
2441 elif daemon_type == NFSGanesha.daemon_type:
2442 entrypoint = NFSGanesha.entrypoint
2443 name = '%s.%s' % (daemon_type, daemon_id)
2444 envs.extend(NFSGanesha.get_container_envs())
2445 elif daemon_type == HAproxy.daemon_type:
2446 name = '%s.%s' % (daemon_type, daemon_id)
2447 elif daemon_type == Keepalived.daemon_type:
2448 name = '%s.%s' % (daemon_type, daemon_id)
2449 envs.extend(Keepalived.get_container_envs())
2450 container_args.extend(['--cap-add=NET_ADMIN', '--cap-add=NET_RAW'])
2451 elif daemon_type == CephIscsi.daemon_type:
2452 entrypoint = CephIscsi.entrypoint
2453 name = '%s.%s' % (daemon_type, daemon_id)
2454 # So that the container can modprobe iscsi_target_mod and have write
2455 # perms to configfs, we need to make this a privileged container.
2456 privileged = True
2457 elif daemon_type == CustomContainer.daemon_type:
2458 cc = CustomContainer.init(ctx, fsid, daemon_id)
2459 entrypoint = cc.entrypoint
2460 host_network = False
2461 envs.extend(cc.get_container_envs())
2462 container_args.extend(cc.get_container_args())
2463
2464 if daemon_type in Monitoring.components:
2465 uid, gid = extract_uid_gid_monitoring(ctx, daemon_type)
2466 monitoring_args = [
2467 '--user',
2468 str(uid),
2469 # FIXME: disable cpu/memory limits for the time being (not supported
2470 # by ubuntu 18.04 kernel!)
2471 ]
2472 container_args.extend(monitoring_args)
2473 elif daemon_type == 'crash':
2474 ceph_args = ['-n', name]
2475 elif daemon_type in Ceph.daemons:
2476 ceph_args = ['-n', name, '-f']
2477
2478 # if using podman, set -d, --conmon-pidfile & --cidfile flags
2479 # so the service can use Type=forking
2480 if isinstance(ctx.container_engine, Podman):
2481 runtime_dir = '/run'
2482 container_args.extend([
2483 '-d', '--log-driver', 'journald',
2484 '--conmon-pidfile',
2485 runtime_dir + '/ceph-%s@%s.%s.service-pid' % (fsid, daemon_type, daemon_id),
2486 '--cidfile',
2487 runtime_dir + '/ceph-%s@%s.%s.service-cid' % (fsid, daemon_type, daemon_id),
2488 ])
2489 if ctx.container_engine.version >= CGROUPS_SPLIT_PODMAN_VERSION:
2490 container_args.append('--cgroups=split')
2491
2492 return CephContainer(
2493 ctx,
2494 image=ctx.image,
2495 entrypoint=entrypoint,
2496 args=ceph_args + get_daemon_args(ctx, fsid, daemon_type, daemon_id),
2497 container_args=container_args,
2498 volume_mounts=get_container_mounts(ctx, fsid, daemon_type, daemon_id),
2499 bind_mounts=get_container_binds(ctx, fsid, daemon_type, daemon_id),
2500 cname='ceph-%s-%s.%s' % (fsid, daemon_type, daemon_id),
2501 envs=envs,
2502 privileged=privileged,
2503 ptrace=ptrace,
2504 host_network=host_network,
2505 )
2506
2507
2508 def extract_uid_gid(ctx, img='', file_path='/var/lib/ceph'):
2509 # type: (CephadmContext, str, Union[str, List[str]]) -> Tuple[int, int]
2510
2511 if not img:
2512 img = ctx.image
2513
2514 if isinstance(file_path, str):
2515 paths = [file_path]
2516 else:
2517 paths = file_path
2518
2519 for fp in paths:
2520 try:
2521 out = CephContainer(
2522 ctx,
2523 image=img,
2524 entrypoint='stat',
2525 args=['-c', '%u %g', fp]
2526 ).run()
2527 uid, gid = out.split(' ')
2528 return int(uid), int(gid)
2529 except RuntimeError:
2530 pass
2531 raise RuntimeError('uid/gid not found')
2532
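# Illustration (added; not part of the upstream script): extract_uid_gid()
# runs `stat -c '%u %g'` inside the image so that host-side files are
# created with the ownership the containerized daemon expects (167:167 in
# the official ceph images, but deliberately never hard-coded):
#
#     uid, gid = extract_uid_gid(ctx)                      # /var/lib/ceph
#     uid, gid = extract_uid_gid(ctx, file_path='/etc/grafana')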
2533
2534 def deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid,
2535 config=None, keyring=None,
2536 osd_fsid=None,
2537 reconfig=False,
2538 ports=None):
2539 # type: (CephadmContext, str, str, Union[int, str], Optional[CephContainer], int, int, Optional[str], Optional[str], Optional[str], Optional[bool], Optional[List[int]]) -> None
2540
2541 ports = ports or []
2542 if any([port_in_use(ctx, port) for port in ports]):
2543 raise Error("TCP Port(s) '{}' required for {} already in use".format(','.join(map(str, ports)), daemon_type))
2544
2545 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2546 if reconfig and not os.path.exists(data_dir):
2547 raise Error('cannot reconfig, data path %s does not exist' % data_dir)
2548 if daemon_type == 'mon' and not os.path.exists(data_dir):
2549 assert config
2550 assert keyring
2551 # tmp keyring file
2552 tmp_keyring = write_tmp(keyring, uid, gid)
2553
2554 # tmp config file
2555 tmp_config = write_tmp(config, uid, gid)
2556
2557 # --mkfs
2558 create_daemon_dirs(ctx, fsid, daemon_type, daemon_id, uid, gid)
2559 mon_dir = get_data_dir(fsid, ctx.data_dir, 'mon', daemon_id)
2560 log_dir = get_log_dir(fsid, ctx.log_dir)
2561 CephContainer(
2562 ctx,
2563 image=ctx.image,
2564 entrypoint='/usr/bin/ceph-mon',
2565 args=[
2566 '--mkfs',
2567 '-i', str(daemon_id),
2568 '--fsid', fsid,
2569 '-c', '/tmp/config',
2570 '--keyring', '/tmp/keyring',
2571 ] + get_daemon_args(ctx, fsid, 'mon', daemon_id),
2572 volume_mounts={
2573 log_dir: '/var/log/ceph:z',
2574 mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % (daemon_id),
2575 tmp_keyring.name: '/tmp/keyring:z',
2576 tmp_config.name: '/tmp/config:z',
2577 },
2578 ).run()
2579
2580 # write conf
2581 with open(mon_dir + '/config', 'w') as f:
2582 os.fchown(f.fileno(), uid, gid)
2583 os.fchmod(f.fileno(), 0o600)
2584 f.write(config)
2585 else:
2586 # dirs, conf, keyring
2587 create_daemon_dirs(
2588 ctx,
2589 fsid, daemon_type, daemon_id,
2590 uid, gid,
2591 config, keyring)
2592
2593 if not reconfig:
2594 if daemon_type == CephadmDaemon.daemon_type:
2595 port = next(iter(ports), None) # get first tcp port provided or None
2596
2597 if ctx.config_json == '-':
2598 config_js = get_parm('-')
2599 else:
2600 config_js = get_parm(ctx.config_json)
2601 assert isinstance(config_js, dict)
2602
2603 cephadm_exporter = CephadmDaemon(ctx, fsid, daemon_id, port)
2604 cephadm_exporter.deploy_daemon_unit(config_js)
2605 else:
2606 if c:
2607 deploy_daemon_units(ctx, fsid, uid, gid, daemon_type, daemon_id,
2608 c, osd_fsid=osd_fsid, ports=ports)
2609 else:
2610 raise RuntimeError('attempting to deploy a daemon without a container')
2611
2612 if not os.path.exists(data_dir + '/unit.created'):
2613 with open(data_dir + '/unit.created', 'w') as f:
2614 os.fchmod(f.fileno(), 0o600)
2615 os.fchown(f.fileno(), uid, gid)
2616 f.write('mtime is time the daemon deployment was created\n')
2617
2618 with open(data_dir + '/unit.configured', 'w') as f:
2619 f.write('mtime is time we were last configured\n')
2620 os.fchmod(f.fileno(), 0o600)
2621 os.fchown(f.fileno(), uid, gid)
2622
2623 update_firewalld(ctx, daemon_type)
2624
2625 # Open ports explicitly required for the daemon
2626 if ports:
2627 fw = Firewalld(ctx)
2628 fw.open_ports(ports)
2629 fw.apply_rules()
2630
2631 if reconfig and daemon_type not in Ceph.daemons:
2632 # ceph daemons do not need a restart; others (presumably) do to pick
2633 # up the new config
2634 call_throws(ctx, ['systemctl', 'reset-failed',
2635 get_unit_name(fsid, daemon_type, daemon_id)])
2636 call_throws(ctx, ['systemctl', 'restart',
2637 get_unit_name(fsid, daemon_type, daemon_id)])
2638
2639
2640 def _write_container_cmd_to_bash(ctx, file_obj, container, comment=None, background=False):
2641 # type: (CephadmContext, IO[str], CephContainer, Optional[str], Optional[bool]) -> None
2642 if comment:
2643 # Sometimes adding a comment, especially if there are multiple containers in one
2644 # unit file, makes it easier to read and grok.
2645 file_obj.write('# ' + comment + '\n')
2646 # `--rm` on the run command does not always clean up, so remove any stale container explicitly first
2647 file_obj.write('! ' + ' '.join(container.rm_cmd()) + ' 2> /dev/null\n')
2648 # Sometimes, `podman rm` doesn't find the container. Then you'll have to add `--storage`
2649 if isinstance(ctx.container_engine, Podman):
2650 file_obj.write(
2651 '! '
2652 + ' '.join([shlex.quote(a) for a in container.rm_cmd(storage=True)])
2653 + ' 2> /dev/null\n')
2654
2655 # container run command
2656 file_obj.write(
2657 ' '.join([shlex.quote(a) for a in container.run_cmd()])
2658 + (' &' if background else '') + '\n')
2659
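# Illustration (added; not part of the upstream script): on a podman host
# the function above emits roughly the following into unit.run for a
# hypothetical osd.3:
#
#     # osd.3
#     ! /usr/bin/podman rm -f ceph-<fsid>-osd.3 2> /dev/null
#     ! /usr/bin/podman rm -f --storage ceph-<fsid>-osd.3 2> /dev/null
#     /usr/bin/podman run --rm --ipc=host ... --name ceph-<fsid>-osd.3 ...
#
# The leading '!' keeps the script's `set -e` from aborting when there is
# nothing to remove.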
2660
2661 def deploy_daemon_units(
2662 ctx: CephadmContext,
2663 fsid: str,
2664 uid: int,
2665 gid: int,
2666 daemon_type: str,
2667 daemon_id: Union[int, str],
2668 c: 'CephContainer',
2669 enable: bool = True,
2670 start: bool = True,
2671 osd_fsid: Optional[str] = None,
2672 ports: Optional[List[int]] = None,
2673 ) -> None:
2674 # cmd
2675 data_dir = get_data_dir(fsid, ctx.data_dir, daemon_type, daemon_id)
2676 with open(data_dir + '/unit.run.new', 'w') as f, \
2677 open(data_dir + '/unit.meta.new', 'w') as metaf:
2678 f.write('set -e\n')
2679
2680 if daemon_type in Ceph.daemons:
2681 install_path = find_program('install')
2682 f.write('{install_path} -d -m0770 -o {uid} -g {gid} /var/run/ceph/{fsid}\n'.format(install_path=install_path, fsid=fsid, uid=uid, gid=gid))
2683
2684 # pre-start cmd(s)
2685 if daemon_type == 'osd':
2686 # osds have a pre-start step
2687 assert osd_fsid
2688 simple_fn = os.path.join('/etc/ceph/osd',
2689 '%s-%s.json.adopted-by-cephadm' % (daemon_id, osd_fsid))
2690 if os.path.exists(simple_fn):
2691 f.write('# Simple OSDs need chown on startup:\n')
2692 for n in ['block', 'block.db', 'block.wal']:
2693 p = os.path.join(data_dir, n)
2694 f.write('[ ! -L {p} ] || chown {uid}:{gid} {p}\n'.format(p=p, uid=uid, gid=gid))
2695 else:
2696 prestart = CephContainer(
2697 ctx,
2698 image=ctx.image,
2699 entrypoint='/usr/sbin/ceph-volume',
2700 args=[
2701 'lvm', 'activate',
2702 str(daemon_id), osd_fsid,
2703 '--no-systemd'
2704 ],
2705 privileged=True,
2706 volume_mounts=get_container_mounts(ctx, fsid, daemon_type, daemon_id),
2707 bind_mounts=get_container_binds(ctx, fsid, daemon_type, daemon_id),
2708 cname='ceph-%s-%s.%s-activate' % (fsid, daemon_type, daemon_id),
2709 memory_request=ctx.memory_request,
2710 memory_limit=ctx.memory_limit,
2711 )
2712 _write_container_cmd_to_bash(ctx, f, prestart, 'LVM OSDs use ceph-volume lvm activate')
2713 elif daemon_type == NFSGanesha.daemon_type:
2714 # add nfs to the rados grace db
2715 nfs_ganesha = NFSGanesha.init(ctx, fsid, daemon_id)
2716 prestart = nfs_ganesha.get_rados_grace_container('add')
2717 _write_container_cmd_to_bash(ctx, f, prestart, 'add daemon to rados grace')
2718 elif daemon_type == CephIscsi.daemon_type:
2719 f.write(' '.join(CephIscsi.configfs_mount_umount(data_dir, mount=True)) + '\n')
2720 ceph_iscsi = CephIscsi.init(ctx, fsid, daemon_id)
2721 tcmu_container = ceph_iscsi.get_tcmu_runner_container()
2722 _write_container_cmd_to_bash(ctx, f, tcmu_container, 'iscsi tcmu-runner container', background=True)
2723 elif daemon_type == Keepalived.daemon_type:
2724 f.write(Keepalived.get_prestart())
2725
2726 _write_container_cmd_to_bash(ctx, f, c, '%s.%s' % (daemon_type, str(daemon_id)))
2727
2728 # some metadata about the deploy
2729 meta: Dict[str, Any] = {}
2730 if 'meta_json' in ctx and ctx.meta_json:
2731 meta = json.loads(ctx.meta_json) or {}
2732 meta.update({
2733 'memory_request': int(ctx.memory_request) if ctx.memory_request else None,
2734 'memory_limit': int(ctx.memory_limit) if ctx.memory_limit else None,
2735 })
2736 if not meta.get('ports'):
2737 meta['ports'] = ports
2738 metaf.write(json.dumps(meta, indent=4) + '\n')
2739
2740 os.fchmod(f.fileno(), 0o600)
2741 os.fchmod(metaf.fileno(), 0o600)
2742 os.rename(data_dir + '/unit.run.new',
2743 data_dir + '/unit.run')
2744 os.rename(data_dir + '/unit.meta.new',
2745 data_dir + '/unit.meta')
2746
2747 # post-stop command(s)
2748 with open(data_dir + '/unit.poststop.new', 'w') as f:
2749 if daemon_type == 'osd':
2750 assert osd_fsid
2751 poststop = CephContainer(
2752 ctx,
2753 image=ctx.image,
2754 entrypoint='/usr/sbin/ceph-volume',
2755 args=[
2756 'lvm', 'deactivate',
2757 str(daemon_id), osd_fsid,
2758 ],
2759 privileged=True,
2760 volume_mounts=get_container_mounts(ctx, fsid, daemon_type, daemon_id),
2761 bind_mounts=get_container_binds(ctx, fsid, daemon_type, daemon_id),
2762 cname='ceph-%s-%s.%s-deactivate' % (fsid, daemon_type,
2763 daemon_id),
2764 )
2765 _write_container_cmd_to_bash(ctx, f, poststop, 'deactivate osd')
2766 elif daemon_type == NFSGanesha.daemon_type:
2767 # remove nfs from the rados grace db
2768 nfs_ganesha = NFSGanesha.init(ctx, fsid, daemon_id)
2769 poststop = nfs_ganesha.get_rados_grace_container('remove')
2770 _write_container_cmd_to_bash(ctx, f, poststop, 'remove daemon from rados grace')
2771 elif daemon_type == CephIscsi.daemon_type:
2772 # make sure we also stop the tcmu container
2773 ceph_iscsi = CephIscsi.init(ctx, fsid, daemon_id)
2774 tcmu_container = ceph_iscsi.get_tcmu_runner_container()
2775 f.write('! ' + ' '.join(tcmu_container.stop_cmd()) + '\n')
2776 f.write(' '.join(CephIscsi.configfs_mount_umount(data_dir, mount=False)) + '\n')
2777 os.fchmod(f.fileno(), 0o600)
2778 os.rename(data_dir + '/unit.poststop.new',
2779 data_dir + '/unit.poststop')
2780
2781 if c:
2782 with open(data_dir + '/unit.image.new', 'w') as f:
2783 f.write(c.image + '\n')
2784 os.fchmod(f.fileno(), 0o600)
2785 os.rename(data_dir + '/unit.image.new',
2786 data_dir + '/unit.image')
2787
2788 # systemd
2789 install_base_units(ctx, fsid)
2790 unit = get_unit_file(ctx, fsid)
2791 unit_file = 'ceph-%s@.service' % (fsid)
2792 with open(ctx.unit_dir + '/' + unit_file + '.new', 'w') as f:
2793 f.write(unit)
2794 os.rename(ctx.unit_dir + '/' + unit_file + '.new',
2795 ctx.unit_dir + '/' + unit_file)
2796 call_throws(ctx, ['systemctl', 'daemon-reload'])
2797
2798 unit_name = get_unit_name(fsid, daemon_type, daemon_id)
2799 call(ctx, ['systemctl', 'stop', unit_name],
2800 verbosity=CallVerbosity.DEBUG)
2801 call(ctx, ['systemctl', 'reset-failed', unit_name],
2802 verbosity=CallVerbosity.DEBUG)
2803 if enable:
2804 call_throws(ctx, ['systemctl', 'enable', unit_name])
2805 if start:
2806 call_throws(ctx, ['systemctl', 'start', unit_name])
2807
2808
2809 class Firewalld(object):
2810 def __init__(self, ctx):
2811 # type: (CephadmContext) -> None
2812 self.ctx = ctx
2813 self.available = self.check()
2814
2815 def check(self):
2816 # type: () -> bool
2817 self.cmd = find_executable('firewall-cmd')
2818 if not self.cmd:
2819 logger.debug('firewalld does not appear to be present')
2820 return False
2821 (enabled, state, _) = check_unit(self.ctx, 'firewalld.service')
2822 if not enabled:
2823 logger.debug('firewalld.service is not enabled')
2824 return False
2825 if state != 'running':
2826 logger.debug('firewalld.service is not running')
2827 return False
2828
2829 logger.info('firewalld ready')
2830 return True
2831
2832 def enable_service_for(self, daemon_type):
2833 # type: (str) -> None
2834 if not self.available:
2835 logger.debug('Not possible to enable service <%s>. firewalld.service is not available' % daemon_type)
2836 return
2837
2838 if daemon_type == 'mon':
2839 svc = 'ceph-mon'
2840 elif daemon_type in ['mgr', 'mds', 'osd']:
2841 svc = 'ceph'
2842 elif daemon_type == NFSGanesha.daemon_type:
2843 svc = 'nfs'
2844 else:
2845 return
2846
2847 if not self.cmd:
2848 raise RuntimeError('command not defined')
2849
2850 out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--query-service', svc], verbosity=CallVerbosity.DEBUG)
2851 if ret:
2852 logger.info('Enabling firewalld service %s in current zone...' % svc)
2853 out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--add-service', svc])
2854 if ret:
2855 raise RuntimeError(
2856 'unable to add service %s to current zone: %s' % (svc, err))
2857 else:
2858 logger.debug('firewalld service %s is enabled in current zone' % svc)
2859
2860 def open_ports(self, fw_ports):
2861 # type: (List[int]) -> None
2862 if not self.available:
2863 logger.debug('Not possible to open ports <%s>. firewalld.service is not available' % fw_ports)
2864 return
2865
2866 if not self.cmd:
2867 raise RuntimeError('command not defined')
2868
2869 for port in fw_ports:
2870 tcp_port = str(port) + '/tcp'
2871 out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--query-port', tcp_port], verbosity=CallVerbosity.DEBUG)
2872 if ret:
2873 logger.info('Enabling firewalld port %s in current zone...' % tcp_port)
2874 out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--add-port', tcp_port])
2875 if ret:
2876 raise RuntimeError('unable to add port %s to current zone: %s' %
2877 (tcp_port, err))
2878 else:
2879 logger.debug('firewalld port %s is enabled in current zone' % tcp_port)
2880
2881 def close_ports(self, fw_ports):
2882 # type: (List[int]) -> None
2883 if not self.available:
2884 logger.debug('Not possible to close ports <%s>. firewalld.service is not available' % fw_ports)
2885 return
2886
2887 if not self.cmd:
2888 raise RuntimeError('command not defined')
2889
2890 for port in fw_ports:
2891 tcp_port = str(port) + '/tcp'
2892 out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--query-port', tcp_port], verbosity=CallVerbosity.DEBUG)
2893 if not ret:
2894 logger.info('Disabling port %s in current zone...' % tcp_port)
2895 out, err, ret = call(self.ctx, [self.cmd, '--permanent', '--remove-port', tcp_port])
2896 if ret:
2897 raise RuntimeError('unable to remove port %s from current zone: %s' %
2898 (tcp_port, err))
2899 else:
2900 logger.info(f'Port {tcp_port} disabled')
2901 else:
2902 logger.info(f'firewalld port {tcp_port} already closed')
2903
2904 def apply_rules(self):
2905 # type: () -> None
2906 if not self.available:
2907 return
2908
2909 if not self.cmd:
2910 raise RuntimeError('command not defined')
2911
2912 call_throws(self.ctx, [self.cmd, '--reload'])
2913
2914
2915 def update_firewalld(ctx, daemon_type):
2916 # type: (CephadmContext, str) -> None
2917 firewall = Firewalld(ctx)
2918
2919 firewall.enable_service_for(daemon_type)
2920
2921 fw_ports = []
2922
2923 if daemon_type in Monitoring.port_map.keys():
2924 fw_ports.extend(Monitoring.port_map[daemon_type]) # prometheus etc
2925
2926 firewall.open_ports(fw_ports)
2927 firewall.apply_rules()
2928
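# Illustration (added; not part of the upstream script): deploy_daemon()
# combines the service rule above with any explicitly required ports:
def _example_open_daemon_ports(ctx, ports):
    # type: (CephadmContext, List[int]) -> None
    fw = Firewalld(ctx)   # no-op if firewalld is absent or stopped
    fw.open_ports(ports)  # firewall-cmd --permanent --add-port <p>/tcp
    fw.apply_rules()      # firewall-cmd --reload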
2929
2930 def install_base_units(ctx, fsid):
2931 # type: (CephadmContext, str) -> None
2932 """
2933 Set up ceph.target and ceph-$fsid.target units.
2934 """
2935 # global unit
2936 existed = os.path.exists(ctx.unit_dir + '/ceph.target')
2937 with open(ctx.unit_dir + '/ceph.target.new', 'w') as f:
2938 f.write('[Unit]\n'
2939 'Description=All Ceph clusters and services\n'
2940 '\n'
2941 '[Install]\n'
2942 'WantedBy=multi-user.target\n')
2943 os.rename(ctx.unit_dir + '/ceph.target.new',
2944 ctx.unit_dir + '/ceph.target')
2945 if not existed:
2946 # we disable before enable in case a different ceph.target
2947 # (from the traditional package) is present; while newer
2948 # systemd is smart enough to disable the old
2949 # (/lib/systemd/...) and enable the new (/etc/systemd/...),
2950 # some older versions of systemd error out with EEXIST.
2951 call_throws(ctx, ['systemctl', 'disable', 'ceph.target'])
2952 call_throws(ctx, ['systemctl', 'enable', 'ceph.target'])
2953 call_throws(ctx, ['systemctl', 'start', 'ceph.target'])
2954
2955 # cluster unit
2956 existed = os.path.exists(ctx.unit_dir + '/ceph-%s.target' % fsid)
2957 with open(ctx.unit_dir + '/ceph-%s.target.new' % fsid, 'w') as f:
2958 f.write(
2959 '[Unit]\n'
2960 'Description=Ceph cluster {fsid}\n'
2961 'PartOf=ceph.target\n'
2962 'Before=ceph.target\n'
2963 '\n'
2964 '[Install]\n'
2965 'WantedBy=multi-user.target ceph.target\n'.format(
2966 fsid=fsid)
2967 )
2968 os.rename(ctx.unit_dir + '/ceph-%s.target.new' % fsid,
2969 ctx.unit_dir + '/ceph-%s.target' % fsid)
2970 if not existed:
2971 call_throws(ctx, ['systemctl', 'enable', 'ceph-%s.target' % fsid])
2972 call_throws(ctx, ['systemctl', 'start', 'ceph-%s.target' % fsid])
2973
2974 # logrotate for the cluster
2975 with open(ctx.logrotate_dir + '/ceph-%s' % fsid, 'w') as f:
2976 """
2977 This is a bit sloppy in that the killall/pkill will touch all ceph daemons
2978 in all containers, but I don't see an elegant way to send SIGHUP *just* to
2979 the daemons for this cluster. (1) systemd kill -s will get the signal to
2980 podman, but podman will exit. (2) podman kill will get the signal to the
2981 first child (bash), but that isn't the ceph daemon. This is simpler and
2982 should be harmless.
2983 """
2984 f.write("""# created by cephadm
2985 /var/log/ceph/%s/*.log {
2986 rotate 7
2987 daily
2988 compress
2989 sharedscripts
2990 postrotate
2991 killall -q -1 ceph-mon ceph-mgr ceph-mds ceph-osd ceph-fuse radosgw rbd-mirror cephfs-mirror || pkill -1 -x 'ceph-mon|ceph-mgr|ceph-mds|ceph-osd|ceph-fuse|radosgw|rbd-mirror|cephfs-mirror' || true
2992 endscript
2993 missingok
2994 notifempty
2995 su root root
2996 }
2997 """ % fsid)
2998
2999
3000 def get_unit_file(ctx, fsid):
3001 # type: (CephadmContext, str) -> str
3002 extra_args = ''
3003 if isinstance(ctx.container_engine, Podman):
3004 extra_args = ('ExecStartPre=-/bin/rm -f %t/%n-pid %t/%n-cid\n'
3005 'ExecStopPost=-/bin/rm -f %t/%n-pid %t/%n-cid\n'
3006 'Type=forking\n'
3007 'PIDFile=%t/%n-pid\n')
3008 if ctx.container_engine.version >= CGROUPS_SPLIT_PODMAN_VERSION:
3009 extra_args += 'Delegate=yes\n'
3010
3011 docker = isinstance(ctx.container_engine, Docker)
3012 u = """# generated by cephadm
3013 [Unit]
3014 Description=Ceph %i for {fsid}
3015
3016 # According to:
3017 # http://www.freedesktop.org/wiki/Software/systemd/NetworkTarget
3018 # these can be removed once ceph-mon will dynamically change network
3019 # configuration.
3020 After=network-online.target local-fs.target time-sync.target{docker_after}
3021 Wants=network-online.target local-fs.target time-sync.target
3022 {docker_requires}
3023
3024 PartOf=ceph-{fsid}.target
3025 Before=ceph-{fsid}.target
3026
3027 [Service]
3028 LimitNOFILE=1048576
3029 LimitNPROC=1048576
3030 EnvironmentFile=-/etc/environment
3031 ExecStart=/bin/bash {data_dir}/{fsid}/%i/unit.run
3032 ExecStop=-{container_path} stop ceph-{fsid}-%i
3033 ExecStopPost=-/bin/bash {data_dir}/{fsid}/%i/unit.poststop
3034 KillMode=none
3035 Restart=on-failure
3036 RestartSec=10s
3037 TimeoutStartSec=120
3038 TimeoutStopSec=120
3039 StartLimitInterval=30min
3040 StartLimitBurst=5
3041 {extra_args}
3042 [Install]
3043 WantedBy=ceph-{fsid}.target
3044 """.format(container_path=ctx.container_engine.path,
3045 fsid=fsid,
3046 data_dir=ctx.data_dir,
3047 extra_args=extra_args,
3048 # if docker, we depend on docker.service
3049 docker_after=' docker.service' if docker else '',
3050 docker_requires='Requires=docker.service\n' if docker else '')
3051
3052 return u
3053
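# Illustration (added; not part of the upstream script): systemd expands
# %i to the template instance, so `systemctl start ceph-<fsid>@osd.3`
# executes /bin/bash <data_dir>/<fsid>/osd.3/unit.run, and unit.poststop
# runs when the unit stops.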
3054 ##################################
3055
3056
3057 class CephContainer:
3058 def __init__(self,
3059 ctx: CephadmContext,
3060 image: str,
3061 entrypoint: str,
3062 args: List[str] = [],
3063 volume_mounts: Dict[str, str] = {},
3064 cname: str = '',
3065 container_args: List[str] = [],
3066 envs: Optional[List[str]] = None,
3067 privileged: bool = False,
3068 ptrace: bool = False,
3069 bind_mounts: Optional[List[List[str]]] = None,
3070 init: Optional[bool] = None,
3071 host_network: bool = True,
3072 memory_request: Optional[str] = None,
3073 memory_limit: Optional[str] = None,
3074 ) -> None:
3075 self.ctx = ctx
3076 self.image = image
3077 self.entrypoint = entrypoint
3078 self.args = args
3079 self.volume_mounts = volume_mounts
3080 self.cname = cname
3081 self.container_args = container_args
3082 self.envs = envs
3083 self.privileged = privileged
3084 self.ptrace = ptrace
3085 self.bind_mounts = bind_mounts if bind_mounts else []
3086 self.init = init if init else ctx.container_init
3087 self.host_network = host_network
3088 self.memory_request = memory_request
3089 self.memory_limit = memory_limit
3090
3091 def run_cmd(self) -> List[str]:
3092 cmd_args: List[str] = [
3093 str(self.ctx.container_engine.path),
3094 'run',
3095 '--rm',
3096 '--ipc=host',
3097 ]
3098
3099 if isinstance(self.ctx.container_engine, Podman):
3100 # podman adds the container *name* to /etc/hosts (for 127.0.1.1)
3101 # by default, which makes python's socket.getfqdn() return that
3102 # instead of a valid hostname.
3103 cmd_args.append('--no-hosts')
3104 if os.path.exists('/etc/ceph/podman-auth.json'):
3105 cmd_args.append('--authfile=/etc/ceph/podman-auth.json')
3106
3107 envs: List[str] = [
3108 '-e', 'CONTAINER_IMAGE=%s' % self.image,
3109 '-e', 'NODE_NAME=%s' % get_hostname(),
3110 ]
3111 vols: List[str] = []
3112 binds: List[str] = []
3113
3114 if self.memory_request:
3115 cmd_args.extend(['-e', 'POD_MEMORY_REQUEST=%s' % self.memory_request])
3116 if self.memory_limit:
3117 cmd_args.extend(['-e', 'POD_MEMORY_LIMIT=%s' % self.memory_limit])
3118 cmd_args.extend(['--memory', str(self.memory_limit)])
3119
3120 if self.host_network:
3121 cmd_args.append('--net=host')
3122 if self.entrypoint:
3123 cmd_args.extend(['--entrypoint', self.entrypoint])
3124 if self.privileged:
3125 cmd_args.extend([
3126 '--privileged',
3127 # let OSD etc read block devs that haven't been chowned
3128 '--group-add=disk'])
3129 if self.ptrace and not self.privileged:
3130 # if privileged, the SYS_PTRACE cap is already added
3131 # in addition, --cap-add and --privileged are mutually
3132 # exclusive since podman >= 2.0
3133 cmd_args.append('--cap-add=SYS_PTRACE')
3134 if self.init:
3135 cmd_args.append('--init')
3136 envs += ['-e', 'CEPH_USE_RANDOM_NONCE=1']
3137 if self.cname:
3138 cmd_args.extend(['--name', self.cname])
3139 if self.envs:
3140 for env in self.envs:
3141 envs.extend(['-e', env])
3142
3143 vols = sum(
3144 [['-v', '%s:%s' % (host_dir, container_dir)]
3145 for host_dir, container_dir in self.volume_mounts.items()], [])
3146 binds = sum([['--mount', '{}'.format(','.join(bind))]
3147 for bind in self.bind_mounts], [])
3148
3149 return \
3150 cmd_args + self.container_args + \
3151 envs + vols + binds + \
3152 [self.image] + self.args # type: ignore
3153
3154 def shell_cmd(self, cmd: List[str]) -> List[str]:
3155 cmd_args: List[str] = [
3156 str(self.ctx.container_engine.path),
3157 'run',
3158 '--rm',
3159 '--ipc=host',
3160 ]
3161 envs: List[str] = [
3162 '-e', 'CONTAINER_IMAGE=%s' % self.image,
3163 '-e', 'NODE_NAME=%s' % get_hostname(),
3164 ]
3165 vols: List[str] = []
3166 binds: List[str] = []
3167
3168 if self.host_network:
3169 cmd_args.append('--net=host')
3170 if self.privileged:
3171 cmd_args.extend([
3172 '--privileged',
3173 # let OSD etc read block devs that haven't been chowned
3174 '--group-add=disk',
3175 ])
3176 if self.init:
3177 cmd_args.append('--init')
3178 envs += ['-e', 'CEPH_USE_RANDOM_NONCE=1']
3179 if self.envs:
3180 for env in self.envs:
3181 envs.extend(['-e', env])
3182
3183 vols = sum(
3184 [['-v', '%s:%s' % (host_dir, container_dir)]
3185 for host_dir, container_dir in self.volume_mounts.items()], [])
3186 binds = sum([['--mount', '{}'.format(','.join(bind))]
3187 for bind in self.bind_mounts], [])
3188
3189 return cmd_args + self.container_args + envs + vols + binds + [
3190 '--entrypoint', cmd[0],
3191 self.image,
3192 ] + cmd[1:]
3193
3194 def exec_cmd(self, cmd):
3195 # type: (List[str]) -> List[str]
3196 return [
3197 str(self.ctx.container_engine.path),
3198 'exec',
3199 ] + self.container_args + [
3200 self.cname,
3201 ] + cmd
3202
3203 def rm_cmd(self, storage=False):
3204 # type: (bool) -> List[str]
3205 ret = [
3206 str(self.ctx.container_engine.path),
3207 'rm', '-f',
3208 ]
3209 if storage:
3210 ret.append('--storage')
3211 ret.append(self.cname)
3212 return ret
3213
3214 def stop_cmd(self):
3215 # type: () -> List[str]
3216 ret = [
3217 str(self.ctx.container_engine.path),
3218 'stop', self.cname,
3219 ]
3220 return ret
3221
3222 def run(self, timeout=DEFAULT_TIMEOUT):
3223 # type: (Optional[int]) -> str
3224 out, _, _ = call_throws(self.ctx, self.run_cmd(),
3225 desc=self.entrypoint, timeout=timeout)
3226 return out
3227
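# Illustration (added; not part of the upstream script): a minimal
# one-shot use of CephContainer, mirroring what command_version() below
# does via call():
def _example_ceph_version(ctx):
    # type: (CephadmContext) -> str
    return CephContainer(ctx, ctx.image, 'ceph', ['--version']).run().strip()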
3228 ##################################
3229
3230
3231 @infer_image
3232 def command_version(ctx):
3233 # type: (CephadmContext) -> int
3234 c = CephContainer(ctx, ctx.image, 'ceph', ['--version'])
3235 out, err, ret = call(ctx, c.run_cmd(), desc=c.entrypoint)
3236 if not ret:
3237 print(out.strip())
3238 return ret
3239
3240 ##################################
3241
3242
3243 @infer_image
3244 def command_pull(ctx):
3245 # type: (CephadmContext) -> int
3246
3247 _pull_image(ctx, ctx.image)
3248 return command_inspect_image(ctx)
3249
3250
3251 def _pull_image(ctx, image):
3252 # type: (CephadmContext, str) -> None
3253 logger.info('Pulling container image %s...' % image)
3254
3255 ignorelist = [
3256 'error creating read-write layer with ID',
3257 'net/http: TLS handshake timeout',
3258 'Digest did not match, expected',
3259 ]
3260
3261 cmd = [ctx.container_engine.path, 'pull', image]
3262 if isinstance(ctx.container_engine, Podman) and os.path.exists('/etc/ceph/podman-auth.json'):
3263 cmd.append('--authfile=/etc/ceph/podman-auth.json')
3264 cmd_str = ' '.join(cmd)
3265
3266 for sleep_secs in [1, 4, 25]:
3267 out, err, ret = call(ctx, cmd)
3268 if not ret:
3269 return
3270
3271 if not any(pattern in err for pattern in ignorelist):
3272 raise RuntimeError('Failed command: %s' % cmd_str)
3273
3274 logger.info('`%s` failed transiently; retrying in %s seconds...' % (cmd_str, sleep_secs))
3275 time.sleep(sleep_secs)
3276
3277 raise RuntimeError('Failed command: %s: maximum retries reached' % cmd_str)
3278
3279 ##################################
3280
3281
3282 @infer_image
3283 def command_inspect_image(ctx):
3284 # type: (CephadmContext) -> int
3285 out, err, ret = call_throws(ctx, [
3286 ctx.container_engine.path, 'inspect',
3287 '--format', '{{.ID}},{{.RepoDigests}}',
3288 ctx.image])
3289 if ret:
3290 return errno.ENOENT
3291 info_from = get_image_info_from_inspect(out.strip(), ctx.image)
3292
3293 ver = CephContainer(ctx, ctx.image, 'ceph', ['--version']).run().strip()
3294 info_from['ceph_version'] = ver
3295
3296 print(json.dumps(info_from, indent=4, sort_keys=True))
3297 return 0
3298
3299
3300 def normalize_image_digest(digest):
3301 # normal case:
3302 # ceph/ceph -> docker.io/ceph/ceph
3303 # edge cases that shouldn't ever come up:
3304 # ubuntu -> docker.io/ubuntu (ubuntu alias for library/ubuntu)
3305 # no change:
3306 # quay.ceph.io/ceph/ceph -> no change
3307 # docker.io/ubuntu -> no change
3308 bits = digest.split('/')
3309 if '.' not in bits[0] and len(bits) < 3: # no registry host in the name
3310 digest = DEFAULT_REGISTRY + '/' + digest
3311 return digest
3312
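# Illustration (added; not part of the upstream script): the behaviour
# documented in the comments above (note the `and` in the length check,
# without which docker.io/ubuntu would be wrongly re-prefixed):
def _example_normalize_image_digest():
    # type: () -> None
    assert normalize_image_digest('ceph/ceph:v16') == 'docker.io/ceph/ceph:v16'
    assert normalize_image_digest('docker.io/ubuntu') == 'docker.io/ubuntu'
    assert normalize_image_digest('quay.ceph.io/ceph/ceph:v16') == \
        'quay.ceph.io/ceph/ceph:v16'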
3313
3314 def get_image_info_from_inspect(out, image):
3315 # type: (str, str) -> Dict[str, Union[str,List[str]]]
3316 if not out:
3317 raise Error('inspect {}: empty result'.format(image))
3318 image_id, digests = out.split(',', 1)
3319 r = {
3320 'image_id': normalize_container_id(image_id)
3321 } # type: Dict[str, Union[str,List[str]]]
3322 if digests:
3323 r['repo_digests'] = list(map(normalize_image_digest, digests[1:-1].split(' ')))
3324 return r
3325
3326 ##################################
3327
3328
3329 def check_subnet(subnets: str) -> Tuple[int, List[int], str]:
3330 """Determine whether the given string is a valid subnet
3331
3332 :param subnets: subnet string, a single definition or comma separated list of CIDR subnets
3333 :returns: return code, list of the IP versions present, and a message describing any validation errors
3334 """
3335
3336 rc = 0
3337 versions = set()
3338 errors = []
3339 subnet_list = subnets.split(',')
3340 for subnet in subnet_list:
3341 # ensure the format of the string is as expected address/netmask
3342 if not re.search(r'\/\d+$', subnet):
3343 rc = 1
3344 errors.append(f'{subnet} is not in CIDR format (address/netmask)')
3345 continue
3346 try:
3347 v = ipaddress.ip_network(subnet).version
3348 versions.add(v)
3349 except ValueError as e:
3350 rc = 1
3351 errors.append(f'{subnet} invalid: {str(e)}')
3352
3353 return rc, list(versions), ', '.join(errors)
3354
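# Illustration (added; not part of the upstream script):
def _example_check_subnet():
    # type: () -> None
    rc, versions, errs = check_subnet('10.90.90.0/24,fe80::/64')
    assert rc == 0 and sorted(versions) == [4, 6] and errs == ''
    rc, versions, errs = check_subnet('10.90.90.0')  # missing /netmask
    assert rc == 1 and 'not in CIDR format' in errs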
3355
3356 def unwrap_ipv6(address):
3357 # type: (str) -> str
3358 if address.startswith('[') and address.endswith(']'):
3359 return address[1:-1]
3360 return address
3361
3362
3363 def wrap_ipv6(address):
3364 # type: (str) -> str
3365
3366 # We cannot assume this is a bare IPv6 address: if it is already
3367 # wrapped, or is a hostname, ipaddress.ip_address() raises ValueError
3368 # and we return the input unchanged.
3369 try:
3370 if ipaddress.ip_address(address).version == 6:
3371 return f'[{address}]'
3372 except ValueError:
3373 pass
3374
3375 return address
3376
3377
3378 def is_ipv6(address):
3379 # type: (str) -> bool
3380 address = unwrap_ipv6(address)
3381 try:
3382 return ipaddress.ip_address(address).version == 6
3383 except ValueError:
3384 logger.warning('Address: {} is not a valid IP address'.format(address))
3385 return False
3386
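# Illustration (added; not part of the upstream script): how the three
# helpers above compose:
def _example_ipv6_helpers():
    # type: () -> None
    assert wrap_ipv6('::1') == '[::1]'
    assert wrap_ipv6('mon1.example.com') == 'mon1.example.com'  # untouched
    assert unwrap_ipv6('[::1]') == '::1'
    assert is_ipv6('[::1]') and not is_ipv6('10.0.0.1')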
3387
3388 def prepare_mon_addresses(
3389 ctx: CephadmContext
3390 ) -> Tuple[str, bool, Optional[str]]:
3391 r = re.compile(r':(\d+)$')
3392 base_ip = ''
3393 ipv6 = False
3394
3395 if ctx.mon_ip:
3396 ipv6 = is_ipv6(ctx.mon_ip)
3397 if ipv6:
3398 ctx.mon_ip = wrap_ipv6(ctx.mon_ip)
3399 hasport = r.findall(ctx.mon_ip)
3400 if hasport:
3401 port = int(hasport[0])
3402 if port == 6789:
3403 addr_arg = '[v1:%s]' % ctx.mon_ip
3404 elif port == 3300:
3405 addr_arg = '[v2:%s]' % ctx.mon_ip
3406 else:
3407 logger.warning('Using msgr2 protocol for unrecognized port %d' %
3408 port)
3409 addr_arg = '[v2:%s]' % ctx.mon_ip
3410 base_ip = ctx.mon_ip[0:-(len(str(port))) - 1]
3411 check_ip_port(ctx, base_ip, port)
3412 else:
3413 base_ip = ctx.mon_ip
3414 addr_arg = '[v2:%s:3300,v1:%s:6789]' % (ctx.mon_ip, ctx.mon_ip)
3415 check_ip_port(ctx, ctx.mon_ip, 3300)
3416 check_ip_port(ctx, ctx.mon_ip, 6789)
3417 elif ctx.mon_addrv:
3418 addr_arg = ctx.mon_addrv
3419 if addr_arg[0] != '[' or addr_arg[-1] != ']':
3420 raise Error('--mon-addrv value %s must use square brackets' %
3421 addr_arg)
3422 ipv6 = addr_arg.count('[') > 1
3423 for addr in addr_arg[1:-1].split(','):
3424 hasport = r.findall(addr)
3425 if not hasport:
3426 raise Error('--mon-addrv value %s must include port number' %
3427 addr_arg)
3428 port = int(hasport[0])
3429 # strip off v1: or v2: prefix
3430 addr = re.sub(r'^\w+:', '', addr)
3431 base_ip = addr[0:-(len(str(port))) - 1]
3432 check_ip_port(ctx, base_ip, port)
3433 else:
3434 raise Error('must specify --mon-ip or --mon-addrv')
3435 logger.debug('Base mon IP is %s, final addrv is %s' % (base_ip, addr_arg))
3436
3437 mon_network = None
3438 if not ctx.skip_mon_network:
3439 # make sure IP is configured locally, and then figure out the
3440 # CIDR network
3441 for net, ifaces in list_networks(ctx).items():
3442 ips: List[str] = []
3443 for iface, ls in ifaces.items():
3444 ips.extend(ls)
3445 if ipaddress.ip_address(unwrap_ipv6(base_ip)) in \
3446 [ipaddress.ip_address(ip) for ip in ips]:
3447 mon_network = net
3448 logger.info('Mon IP %s is in CIDR network %s' % (base_ip,
3449 mon_network))
3450 break
3451 if not mon_network:
3452 raise Error('Failed to infer CIDR network for mon ip %s; pass '
3453 '--skip-mon-network to configure it later' % base_ip)
3454
3455 return (addr_arg, ipv6, mon_network)
3456
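# Illustration (added; not part of the upstream script): for a plain
# `--mon-ip 10.1.2.3` the function above yields
# addr_arg = '[v2:10.1.2.3:3300,v1:10.1.2.3:6789]', probes both ports,
# and infers the enclosing CIDR (e.g. 10.1.2.0/24) from the local
# interfaces as the initial mon network.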
3457
3458 def prepare_cluster_network(ctx: CephadmContext) -> Tuple[str, bool]:
3459 cluster_network = ''
3460 ipv6_cluster_network = False
3461 # the cluster network may not exist on this node, so all we can do is
3462 # validate that the address given is valid ipv4 or ipv6 subnet
3463 if ctx.cluster_network:
3464 rc, versions, err_msg = check_subnet(ctx.cluster_network)
3465 if rc:
3466 raise Error(f'Invalid --cluster-network parameter: {err_msg}')
3467 cluster_network = ctx.cluster_network
3468 ipv6_cluster_network = 6 in versions
3469 else:
3470 logger.info('- internal network (--cluster-network) has not '
3471 'been provided, OSD replication will default to '
3472 'the public_network')
3473
3474 return cluster_network, ipv6_cluster_network
3475
3476
3477 def create_initial_keys(
3478 ctx: CephadmContext,
3479 uid: int, gid: int,
3480 mgr_id: str
3481 ) -> Tuple[str, str, str, Any, Any]: # type: ignore
3482
3483 _image = ctx.image
3484
3485 # create some initial keys
3486 logger.info('Creating initial keys...')
3487 mon_key = CephContainer(
3488 ctx,
3489 image=_image,
3490 entrypoint='/usr/bin/ceph-authtool',
3491 args=['--gen-print-key'],
3492 ).run().strip()
3493 admin_key = CephContainer(
3494 ctx,
3495 image=_image,
3496 entrypoint='/usr/bin/ceph-authtool',
3497 args=['--gen-print-key'],
3498 ).run().strip()
3499 mgr_key = CephContainer(
3500 ctx,
3501 image=_image,
3502 entrypoint='/usr/bin/ceph-authtool',
3503 args=['--gen-print-key'],
3504 ).run().strip()
3505
3506 keyring = ('[mon.]\n'
3507 '\tkey = %s\n'
3508 '\tcaps mon = allow *\n'
3509 '[client.admin]\n'
3510 '\tkey = %s\n'
3511 '\tcaps mon = allow *\n'
3512 '\tcaps mds = allow *\n'
3513 '\tcaps mgr = allow *\n'
3514 '\tcaps osd = allow *\n'
3515 '[mgr.%s]\n'
3516 '\tkey = %s\n'
3517 '\tcaps mon = profile mgr\n'
3518 '\tcaps mds = allow *\n'
3519 '\tcaps osd = allow *\n'
3520 % (mon_key, admin_key, mgr_id, mgr_key))
3521
3522 admin_keyring = write_tmp('[client.admin]\n'
3523 '\tkey = ' + admin_key + '\n',
3524 uid, gid)
3525
3526 # tmp keyring file
3527 bootstrap_keyring = write_tmp(keyring, uid, gid)
3528 return (mon_key, mgr_key, admin_key,
3529 bootstrap_keyring, admin_keyring)
3530
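# For reference, the bootstrap keyring assembled above renders like the
# following (keys replaced with placeholders; the real values are the base64
# secrets printed by `ceph-authtool --gen-print-key`):
#
#   [mon.]
#           key = <mon_key>
#           caps mon = allow *
#   [client.admin]
#           key = <admin_key>
#           caps mon = allow *
#           caps mds = allow *
#           caps mgr = allow *
#           caps osd = allow *
#   [mgr.<mgr_id>]
#           key = <mgr_key>
#           caps mon = profile mgr
#           caps mds = allow *
#           caps osd = allow *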
3531
3532 def create_initial_monmap(
3533 ctx: CephadmContext,
3534 uid: int, gid: int,
3535 fsid: str,
3536 mon_id: str, mon_addr: str
3537 ) -> Any:
3538 logger.info('Creating initial monmap...')
3539 monmap = write_tmp('', 0, 0)
3540 out = CephContainer(
3541 ctx,
3542 image=ctx.image,
3543 entrypoint='/usr/bin/monmaptool',
3544 args=[
3545 '--create',
3546 '--clobber',
3547 '--fsid', fsid,
3548 '--addv', mon_id, mon_addr,
3549 '/tmp/monmap'
3550 ],
3551 volume_mounts={
3552 monmap.name: '/tmp/monmap:z',
3553 },
3554 ).run()
3555 logger.debug(f'monmaptool for {mon_id} {mon_addr} returned: {out}')
3556
3557 # pass monmap file to ceph user for use by ceph-mon --mkfs below
3558 os.fchown(monmap.fileno(), uid, gid)
3559 return monmap
3560
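# The containerized call above is roughly equivalent to running the following
# on a host with ceph installed (illustrative addresses):
#
#   monmaptool --create --clobber --fsid <fsid> \
#       --addv <mon_id> '[v2:10.0.0.1:3300,v1:10.0.0.1:6789]' /tmp/monmap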
3561
3562 def prepare_create_mon(
3563 ctx: CephadmContext,
3564 uid: int, gid: int,
3565 fsid: str, mon_id: str,
3566 bootstrap_keyring_path: str,
3567 monmap_path: str
3568 ):
3569 logger.info('Creating mon...')
3570 create_daemon_dirs(ctx, fsid, 'mon', mon_id, uid, gid)
3571 mon_dir = get_data_dir(fsid, ctx.data_dir, 'mon', mon_id)
3572 log_dir = get_log_dir(fsid, ctx.log_dir)
3573 out = CephContainer(
3574 ctx,
3575 image=ctx.image,
3576 entrypoint='/usr/bin/ceph-mon',
3577 args=[
3578 '--mkfs',
3579 '-i', mon_id,
3580 '--fsid', fsid,
3581 '-c', '/dev/null',
3582 '--monmap', '/tmp/monmap',
3583 '--keyring', '/tmp/keyring',
3584 ] + get_daemon_args(ctx, fsid, 'mon', mon_id),
3585 volume_mounts={
3586 log_dir: '/var/log/ceph:z',
3587 mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % (mon_id),
3588 bootstrap_keyring_path: '/tmp/keyring:z',
3589 monmap_path: '/tmp/monmap:z',
3590 },
3591 ).run()
3592 logger.debug(f'create mon.{mon_id} output: {out}')
3593 return (mon_dir, log_dir)
3594
3595
3596 def create_mon(
3597 ctx: CephadmContext,
3598 uid: int, gid: int,
3599 fsid: str, mon_id: str
3600 ) -> None:
3601 mon_c = get_container(ctx, fsid, 'mon', mon_id)
3602 ctx.meta_json = json.dumps({'service_name': 'mon'})
3603 deploy_daemon(ctx, fsid, 'mon', mon_id, mon_c, uid, gid,
3604 config=None, keyring=None)
3605
3606
3607 def wait_for_mon(
3608 ctx: CephadmContext,
3609 mon_id: str, mon_dir: str,
3610 admin_keyring_path: str, config_path: str
3611 ):
3612 logger.info('Waiting for mon to start...')
3613 c = CephContainer(
3614 ctx,
3615 image=ctx.image,
3616 entrypoint='/usr/bin/ceph',
3617 args=[
3618 'status'],
3619 volume_mounts={
3620 mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % (mon_id),
3621 admin_keyring_path: '/etc/ceph/ceph.client.admin.keyring:z',
3622 config_path: '/etc/ceph/ceph.conf:z',
3623 },
3624 )
3625
3626 # wait for the service to become available
3627 def is_mon_available():
3628 # type: () -> bool
3629 timeout = ctx.timeout if ctx.timeout else 60 # seconds
3630 out, err, ret = call(ctx, c.run_cmd(),
3631 desc=c.entrypoint,
3632 timeout=timeout)
3633 return ret == 0
3634
3635 is_available(ctx, 'mon', is_mon_available)
3636
3637
3638 def create_mgr(
3639 ctx: CephadmContext,
3640 uid: int, gid: int,
3641 fsid: str, mgr_id: str, mgr_key: str,
3642 config: str, clifunc: Callable
3643 ) -> None:
3644 logger.info('Creating mgr...')
3645 mgr_keyring = '[mgr.%s]\n\tkey = %s\n' % (mgr_id, mgr_key)
3646 mgr_c = get_container(ctx, fsid, 'mgr', mgr_id)
3647 # Note: the default port (9283) of the mgr's prometheus module is opened in the firewall
3648 ctx.meta_json = json.dumps({'service_name': 'mgr'})
3649 deploy_daemon(ctx, fsid, 'mgr', mgr_id, mgr_c, uid, gid,
3650 config=config, keyring=mgr_keyring, ports=[9283])
3651
3652 # wait for the service to become available
3653 logger.info('Waiting for mgr to start...')
3654
3655 def is_mgr_available():
3656 # type: () -> bool
3657 timeout = ctx.timeout if ctx.timeout else 60 # seconds
3658 try:
3659 out = clifunc(['status', '-f', 'json-pretty'], timeout=timeout)
3660 j = json.loads(out)
3661 return j.get('mgrmap', {}).get('available', False)
3662 except Exception as e:
3663 logger.debug('status failed: %s' % e)
3664 return False
3665 is_available(ctx, 'mgr', is_mgr_available)
3666
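# is_mgr_available() above inspects only the mgrmap in `ceph status` output;
# an abridged, illustrative success case of the JSON it parses:
#
#   {
#       "mgrmap": {
#           "available": true
#       }
#   }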
3667
3668 def prepare_ssh(
3669 ctx: CephadmContext,
3670 cli: Callable, wait_for_mgr_restart: Callable
3671 ) -> None:
3672
3673 cli(['cephadm', 'set-user', ctx.ssh_user])
3674
3675 if ctx.ssh_config:
3676 logger.info('Using provided ssh config...')
3677 mounts = {
3678 pathify(ctx.ssh_config.name): '/tmp/cephadm-ssh-config:z',
3679 }
3680 cli(['cephadm', 'set-ssh-config', '-i', '/tmp/cephadm-ssh-config'], extra_mounts=mounts)
3681
3682 if ctx.ssh_private_key and ctx.ssh_public_key:
3683 logger.info('Using provided ssh keys...')
3684 mounts = {
3685 pathify(ctx.ssh_private_key.name): '/tmp/cephadm-ssh-key:z',
3686 pathify(ctx.ssh_public_key.name): '/tmp/cephadm-ssh-key.pub:z'
3687 }
3688 cli(['cephadm', 'set-priv-key', '-i', '/tmp/cephadm-ssh-key'], extra_mounts=mounts)
3689 cli(['cephadm', 'set-pub-key', '-i', '/tmp/cephadm-ssh-key.pub'], extra_mounts=mounts)
3690 else:
3691 logger.info('Generating ssh key...')
3692 cli(['cephadm', 'generate-key'])
3693 ssh_pub = cli(['cephadm', 'get-pub-key'])
3694
3695 with open(ctx.output_pub_ssh_key, 'w') as f:
3696 f.write(ssh_pub)
3697 logger.info('Wrote public SSH key to %s' % ctx.output_pub_ssh_key)
3698
3699 logger.info('Adding key to %s@localhost authorized_keys...' % ctx.ssh_user)
3700 try:
3701 s_pwd = pwd.getpwnam(ctx.ssh_user)
3702 except KeyError:
3703 raise Error('Cannot find uid/gid for ssh-user: %s' % (ctx.ssh_user))
3704 ssh_uid = s_pwd.pw_uid
3705 ssh_gid = s_pwd.pw_gid
3706 ssh_dir = os.path.join(s_pwd.pw_dir, '.ssh')
3707
3708 if not os.path.exists(ssh_dir):
3709 makedirs(ssh_dir, ssh_uid, ssh_gid, 0o700)
3710
3711 auth_keys_file = '%s/authorized_keys' % ssh_dir
3712 add_newline = False
3713
3714 if os.path.exists(auth_keys_file):
3715 with open(auth_keys_file, 'r') as f:
3716 f.seek(0, os.SEEK_END)
3717 if f.tell() > 0:
3718 f.seek(f.tell() - 1, os.SEEK_SET) # go to last char
3719 if f.read() != '\n':
3720 add_newline = True
3721
3722 with open(auth_keys_file, 'a') as f:
3723 os.fchown(f.fileno(), ssh_uid, ssh_gid) # just in case we created it
3724 os.fchmod(f.fileno(), 0o600) # just in case we created it
3725 if add_newline:
3726 f.write('\n')
3727 f.write(ssh_pub.strip() + '\n')
3728
3729 host = get_hostname()
3730 logger.info('Adding host %s...' % host)
3731 try:
3732 args = ['orch', 'host', 'add', host]
3733 if ctx.mon_ip:
3734 args.append(ctx.mon_ip)
3735 cli(args)
3736 except RuntimeError as e:
3737 raise Error('Failed to add host <%s>: %s' % (host, e))
3738
3739 for t in ['mon', 'mgr']:
3740 if not ctx.orphan_initial_daemons:
3741 logger.info('Deploying %s service with default placement...' % t)
3742 cli(['orch', 'apply', t])
3743 else:
3744 logger.info('Deploying unmanaged %s service...' % t)
3745 cli(['orch', 'apply', t, '--unmanaged'])
3746
3747 if not ctx.orphan_initial_daemons:
3748 logger.info('Deploying crash service with default placement...')
3749 cli(['orch', 'apply', 'crash'])
3750
3751 if not ctx.skip_monitoring_stack:
3752 logger.info('Enabling mgr prometheus module...')
3753 cli(['mgr', 'module', 'enable', 'prometheus'])
3754 for t in ['prometheus', 'grafana', 'node-exporter', 'alertmanager']:
3755 logger.info('Deploying %s service with default placement...' % t)
3756 cli(['orch', 'apply', t])
3757
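# Illustrative sketch (not called anywhere in this script): the authorized_keys
# handling above first checks whether the existing file ends in a newline, so
# the appended public key starts on its own line. The same check as a
# standalone helper, with a hypothetical name:
def _example_missing_trailing_newline(path: str) -> bool:
    # True if `path` exists, is non-empty, and its last byte is not '\n'
    if not os.path.exists(path):
        return False
    with open(path, 'r') as f:
        f.seek(0, os.SEEK_END)
        if f.tell() == 0:
            return False
        f.seek(f.tell() - 1, os.SEEK_SET)
        return f.read() != '\n'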
3758
3759 def enable_cephadm_mgr_module(
3760 cli: Callable, wait_for_mgr_restart: Callable
3761 ) -> None:
3762
3763 logger.info('Enabling cephadm module...')
3764 cli(['mgr', 'module', 'enable', 'cephadm'])
3765 wait_for_mgr_restart()
3766 logger.info('Setting orchestrator backend to cephadm...')
3767 cli(['orch', 'set', 'backend', 'cephadm'])
3768
3769
3770 def prepare_dashboard(
3771 ctx: CephadmContext,
3772 uid: int, gid: int,
3773 cli: Callable, wait_for_mgr_restart: Callable
3774 ) -> None:
3775
3776 # Configure SSL port (cephadm only allows configuring the dashboard SSL port);
3777 # if the user does not want to use SSL, they can change this setting once the cluster is up
3778 cli(['config', 'set', 'mgr', 'mgr/dashboard/ssl_server_port', str(ctx.ssl_dashboard_port)])
3779
3780 # configuring dashboard parameters
3781 logger.info('Enabling the dashboard module...')
3782 cli(['mgr', 'module', 'enable', 'dashboard'])
3783 wait_for_mgr_restart()
3784
3785 # dashboard crt and key
3786 if ctx.dashboard_key and ctx.dashboard_crt:
3787 logger.info('Using provided dashboard certificate...')
3788 mounts = {
3789 pathify(ctx.dashboard_crt.name): '/tmp/dashboard.crt:z',
3790 pathify(ctx.dashboard_key.name): '/tmp/dashboard.key:z'
3791 }
3792 cli(['dashboard', 'set-ssl-certificate', '-i', '/tmp/dashboard.crt'], extra_mounts=mounts)
3793 cli(['dashboard', 'set-ssl-certificate-key', '-i', '/tmp/dashboard.key'], extra_mounts=mounts)
3794 else:
3795 logger.info('Generating a dashboard self-signed certificate...')
3796 cli(['dashboard', 'create-self-signed-cert'])
3797
3798 logger.info('Creating initial admin user...')
3799 password = ctx.initial_dashboard_password or generate_password()
3800 tmp_password_file = write_tmp(password, uid, gid)
3801 cmd = ['dashboard', 'ac-user-create', ctx.initial_dashboard_user, '-i', '/tmp/dashboard.pw', 'administrator', '--force-password']
3802 if not ctx.dashboard_password_noupdate:
3803 cmd.append('--pwd-update-required')
3804 cli(cmd, extra_mounts={pathify(tmp_password_file.name): '/tmp/dashboard.pw:z'})
3805 logger.info('Fetching dashboard port number...')
3806 out = cli(['config', 'get', 'mgr', 'mgr/dashboard/ssl_server_port'])
3807 port = int(out)
3808
3809 # Open dashboard port
3810 fw = Firewalld(ctx)
3811 fw.open_ports([port])
3812 fw.apply_rules()
3813
3814 logger.info('Ceph Dashboard is now available at:\n\n'
3815 '\t URL: https://%s:%s/\n'
3816 '\t User: %s\n'
3817 '\tPassword: %s\n' % (
3818 get_fqdn(), port,
3819 ctx.initial_dashboard_user,
3820 password))
3821
3822
3823 def prepare_bootstrap_config(
3824 ctx: CephadmContext,
3825 fsid: str, mon_addr: str, image: str
3826
3827 ) -> str:
3828
3829 cp = read_config(ctx.config)
3830 if not cp.has_section('global'):
3831 cp.add_section('global')
3832 cp.set('global', 'fsid', fsid)
3833 cp.set('global', 'mon_host', mon_addr)
3834 cp.set('global', 'container_image', image)
3835 if not cp.has_section('mon'):
3836 cp.add_section('mon')
3837 if (
3838 not cp.has_option('mon', 'auth_allow_insecure_global_id_reclaim')
3839 and not cp.has_option('mon', 'auth allow insecure global id reclaim')
3840 ):
3841 cp.set('mon', 'auth_allow_insecure_global_id_reclaim', 'false')
3842 cpf = StringIO()
3843 cp.write(cpf)
3844 config = cpf.getvalue()
3845
3846 if ctx.registry_json or ctx.registry_url:
3847 command_registry_login(ctx)
3848
3849 return config
3850
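# An illustrative config as rendered by the above (placeholder values):
#
#   [global]
#   fsid = 00000000-0000-0000-0000-000000000000
#   mon_host = [v2:10.0.0.1:3300,v1:10.0.0.1:6789]
#   container_image = docker.io/ceph/ceph:v16
#
#   [mon]
#   auth_allow_insecure_global_id_reclaim = false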
3851
3852 def finish_bootstrap_config(
3853 ctx: CephadmContext,
3854 fsid: str,
3855 config: str,
3856 mon_id: str, mon_dir: str,
3857 mon_network: Optional[str], ipv6: bool,
3858 cli: Callable,
3859 cluster_network: Optional[str], ipv6_cluster_network: bool
3860
3861 ) -> None:
3862 if not ctx.no_minimize_config:
3863 logger.info('Assimilating anything we can from ceph.conf...')
3864 cli([
3865 'config', 'assimilate-conf',
3866 '-i', '/var/lib/ceph/mon/ceph-%s/config' % mon_id
3867 ], {
3868 mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % mon_id
3869 })
3870 logger.info('Generating new minimal ceph.conf...')
3871 cli([
3872 'config', 'generate-minimal-conf',
3873 '-o', '/var/lib/ceph/mon/ceph-%s/config' % mon_id
3874 ], {
3875 mon_dir: '/var/lib/ceph/mon/ceph-%s:z' % mon_id
3876 })
3877 # re-read our minimized config
3878 with open(mon_dir + '/config', 'r') as f:
3879 config = f.read()
3880 logger.info('Restarting the monitor...')
3881 call_throws(ctx, [
3882 'systemctl',
3883 'restart',
3884 get_unit_name(fsid, 'mon', mon_id)
3885 ])
3886
3887 if mon_network:
3888 logger.info(f'Setting mon public_network to {mon_network}')
3889 cli(['config', 'set', 'mon', 'public_network', mon_network])
3890
3891 if cluster_network:
3892 logger.info(f'Setting cluster_network to {cluster_network}')
3893 cli(['config', 'set', 'global', 'cluster_network', cluster_network])
3894
3895 if ipv6 or ipv6_cluster_network:
3896 logger.info('Enabling IPv6 (ms_bind_ipv6) binding')
3897 cli(['config', 'set', 'global', 'ms_bind_ipv6', 'true'])
3898
3899 with open(ctx.output_config, 'w') as f:
3900 f.write(config)
3901 logger.info('Wrote config to %s' % ctx.output_config)
3903
3904
3905 @default_image
3906 def command_bootstrap(ctx):
3907 # type: (CephadmContext) -> int
3908
3909 if not ctx.output_config:
3910 ctx.output_config = os.path.join(ctx.output_dir, 'ceph.conf')
3911 if not ctx.output_keyring:
3912 ctx.output_keyring = os.path.join(ctx.output_dir,
3913 'ceph.client.admin.keyring')
3914 if not ctx.output_pub_ssh_key:
3915 ctx.output_pub_ssh_key = os.path.join(ctx.output_dir, 'ceph.pub')
3916
3917 # verify output files
3918 for f in [ctx.output_config, ctx.output_keyring,
3919 ctx.output_pub_ssh_key]:
3920 if not ctx.allow_overwrite:
3921 if os.path.exists(f):
3922 raise Error('%s already exists; delete or pass '
3923 '--allow-overwrite to overwrite' % f)
3924 dirname = os.path.dirname(f)
3925 if dirname and not os.path.exists(dirname):
3926 fname = os.path.basename(f)
3927 logger.info(f'Creating directory {dirname} for {fname}')
3928 try:
3929 # use makedirs to create intermediate missing dirs
3930 os.makedirs(dirname, 0o755)
3931 except PermissionError:
3932 raise Error(f'Unable to create {dirname} due to a permissions failure. Retry as root or with sudo, or create the directory beforehand.')
3933
3934 if not ctx.skip_prepare_host:
3935 command_prepare_host(ctx)
3936 else:
3937 logger.info('Skip prepare_host')
3938
3939 # initial vars
3940 fsid = ctx.fsid or make_fsid()
3941 hostname = get_hostname()
3942 if '.' in hostname and not ctx.allow_fqdn_hostname:
3943 raise Error('hostname is a fully qualified domain name (%s); either fix (e.g., "sudo hostname %s" or similar) or pass --allow-fqdn-hostname' % (hostname, hostname.split('.')[0]))
3944 mon_id = ctx.mon_id or hostname
3945 mgr_id = ctx.mgr_id or generate_service_id()
3946 logger.info('Cluster fsid: %s' % fsid)
3947
3948 lock = FileLock(ctx, fsid)
3949 lock.acquire()
3950
3951 (addr_arg, ipv6, mon_network) = prepare_mon_addresses(ctx)
3952 cluster_network, ipv6_cluster_network = prepare_cluster_network(ctx)
3953
3954 config = prepare_bootstrap_config(ctx, fsid, addr_arg, ctx.image)
3955
3956 if not ctx.skip_pull:
3957 _pull_image(ctx, ctx.image)
3958
3959 image_ver = CephContainer(ctx, ctx.image, 'ceph', ['--version']).run().strip()
3960 logger.info(f'Ceph version: {image_ver}')
3961 image_release = image_ver.split()[4]
3962 if (
3963 not ctx.allow_mismatched_release
3964 and image_release not in [DEFAULT_IMAGE_RELEASE, LATEST_STABLE_RELEASE]
3965 ):
3966 raise Error(
3967 f'Container release {image_release} != cephadm release {DEFAULT_IMAGE_RELEASE}; please use matching version of cephadm (pass --allow-mismatched-release to continue anyway)'
3968 )
3969
3970 logger.info('Extracting ceph user uid/gid from container image...')
3971 (uid, gid) = extract_uid_gid(ctx)
3972
3973 # create some initial keys
3974 (mon_key, mgr_key, admin_key, bootstrap_keyring, admin_keyring) = \
3975 create_initial_keys(ctx, uid, gid, mgr_id)
3976
3977 monmap = create_initial_monmap(ctx, uid, gid, fsid, mon_id, addr_arg)
3978 (mon_dir, log_dir) = \
3979 prepare_create_mon(ctx, uid, gid, fsid, mon_id,
3980 bootstrap_keyring.name, monmap.name)
3981
3982 with open(mon_dir + '/config', 'w') as f:
3983 os.fchown(f.fileno(), uid, gid)
3984 os.fchmod(f.fileno(), 0o600)
3985 f.write(config)
3986
3987 make_var_run(ctx, fsid, uid, gid)
3988 create_mon(ctx, uid, gid, fsid, mon_id)
3989
3990 # config to issue various CLI commands
3991 tmp_config = write_tmp(config, uid, gid)
3992
3993 # a CLI helper to reduce our typing
3994 def cli(cmd, extra_mounts=None, timeout=DEFAULT_TIMEOUT):
3995 # type: (List[str], Optional[Dict[str, str]], Optional[int]) -> str
3996 mounts = {
3997 log_dir: '/var/log/ceph:z',
3998 admin_keyring.name: '/etc/ceph/ceph.client.admin.keyring:z',
3999 tmp_config.name: '/etc/ceph/ceph.conf:z',
4000 }
4001 for k, v in (extra_mounts or {}).items():
4002 mounts[k] = v
4003 timeout = timeout or ctx.timeout
4004 return CephContainer(
4005 ctx,
4006 image=ctx.image,
4007 entrypoint='/usr/bin/ceph',
4008 args=cmd,
4009 volume_mounts=mounts,
4010 ).run(timeout=timeout)
4011
4012 wait_for_mon(ctx, mon_id, mon_dir, admin_keyring.name, tmp_config.name)
4013
4014 finish_bootstrap_config(ctx, fsid, config, mon_id, mon_dir,
4015 mon_network, ipv6, cli,
4016 cluster_network, ipv6_cluster_network)
4017
4018 # output files
4019 with open(ctx.output_keyring, 'w') as f:
4020 os.fchmod(f.fileno(), 0o600)
4021 f.write('[client.admin]\n'
4022 '\tkey = ' + admin_key + '\n')
4023 logger.info('Wrote keyring to %s' % ctx.output_keyring)
4024
4025 # create mgr
4026 create_mgr(ctx, uid, gid, fsid, mgr_id, mgr_key, config, cli)
4027
4028 def json_loads_retry(cli_func):
4029 for sleep_secs in [1, 4, 4]:
4030 try:
4031 return json.loads(cli_func())
4032 except json.JSONDecodeError:
4033 logger.debug('Invalid JSON. Retrying in %s seconds...' % sleep_secs)
4034 time.sleep(sleep_secs)
4035 return json.loads(cli_func())
4036
4037 # wait for mgr to restart (after enabling a module)
4038 def wait_for_mgr_restart():
4039 # first get latest mgrmap epoch from the mon. try newer 'mgr
4040 # stat' command first, then fall back to 'mgr dump' if
4041 # necessary
4042 try:
4043 j = json_loads_retry(lambda: cli(['mgr', 'stat']))
4044 except Exception:
4045 j = json_loads_retry(lambda: cli(['mgr', 'dump']))
4046 epoch = j['epoch']
4047
4048 # wait for mgr to have it
4049 logger.info('Waiting for the mgr to restart...')
4050
4051 def mgr_has_latest_epoch():
4052 # type: () -> bool
4053 try:
4054 out = cli(['tell', 'mgr', 'mgr_status'])
4055 j = json.loads(out)
4056 return j['mgrmap_epoch'] >= epoch
4057 except Exception as e:
4058 logger.debug('tell mgr mgr_status failed: %s' % e)
4059 return False
4060 is_available(ctx, 'mgr epoch %d' % epoch, mgr_has_latest_epoch)
4061
4062 enable_cephadm_mgr_module(cli, wait_for_mgr_restart)
4063
4064 # ssh
4065 if not ctx.skip_ssh:
4066 prepare_ssh(ctx, cli, wait_for_mgr_restart)
4067
4068 if ctx.registry_url and ctx.registry_username and ctx.registry_password:
4069 cli(['config', 'set', 'mgr', 'mgr/cephadm/registry_url', ctx.registry_url, '--force'])
4070 cli(['config', 'set', 'mgr', 'mgr/cephadm/registry_username', ctx.registry_username, '--force'])
4071 cli(['config', 'set', 'mgr', 'mgr/cephadm/registry_password', ctx.registry_password, '--force'])
4072
4073 cli(['config', 'set', 'mgr', 'mgr/cephadm/container_init', str(ctx.container_init), '--force'])
4074
4075 if ctx.with_exporter:
4076 cli(['config-key', 'set', 'mgr/cephadm/exporter_enabled', 'true'])
4077 if ctx.exporter_config:
4078 logger.info('Applying custom cephadm exporter settings')
4079 # validated within the parser, so we can just apply to the store
4080 with tempfile.NamedTemporaryFile(buffering=0) as tmp:
4081 tmp.write(json.dumps(ctx.exporter_config).encode('utf-8'))
4082 mounts = {
4083 tmp.name: '/tmp/exporter-config.json:z'
4084 }
4085 cli(['cephadm', 'set-exporter-config', '-i', '/tmp/exporter-config.json'], extra_mounts=mounts)
4086 logger.info('-> Use ceph orch apply cephadm-exporter to deploy')
4087 else:
4088 # generate a default SSL configuration for the exporter(s)
4089 logger.info('Generating a default cephadm exporter configuration (self-signed)')
4090 cli(['cephadm', 'generate-exporter-config'])
4091 # deploy the exporter service
4093 logger.info('Deploying cephadm exporter service with default placement...')
4094 cli(['orch', 'apply', 'cephadm-exporter'])
4095
4096 if not ctx.skip_dashboard:
4097 prepare_dashboard(ctx, uid, gid, cli, wait_for_mgr_restart)
4098
4099 if ctx.apply_spec:
4100 logger.info('Applying %s to cluster' % ctx.apply_spec)
4101
4102 with open(ctx.apply_spec) as f:
4103 for line in f:
4104 if 'hostname:' in line:
4105 line = line.replace('\n', '')
4106 split = line.split(': ')
4107 if split[1] != hostname:
4108 logger.info('Adding ssh key to %s' % split[1])
4109
4110 ssh_key = '/etc/ceph/ceph.pub'
4111 if ctx.ssh_public_key:
4112 ssh_key = ctx.ssh_public_key.name
4113 out, err, code = call_throws(ctx, ['sudo', '-u', ctx.ssh_user, 'ssh-copy-id', '-f', '-i', ssh_key, '-o StrictHostKeyChecking=no', '%s@%s' % (ctx.ssh_user, split[1])])
4114
4115 mounts = {}
4116 mounts[pathify(ctx.apply_spec)] = '/tmp/spec.yml:z'
4117
4118 out = cli(['orch', 'apply', '-i', '/tmp/spec.yml'], extra_mounts=mounts)
4119 logger.info(out)
4120
4121 logger.info('You can access the Ceph CLI with:\n\n'
4122 '\tsudo %s shell --fsid %s -c %s -k %s\n' % (
4123 sys.argv[0],
4124 fsid,
4125 ctx.output_config,
4126 ctx.output_keyring))
4127 logger.info('Please consider enabling telemetry to help improve Ceph:\n\n'
4128 '\tceph telemetry on\n\n'
4129 'For more information see:\n\n'
4130 '\thttps://docs.ceph.com/docs/pacific/mgr/telemetry/\n')
4131 logger.info('Bootstrap complete.')
4132 return 0
4133
4134 ##################################
4135
4136
4137 def command_registry_login(ctx: CephadmContext):
4138 if ctx.registry_json:
4139 logger.info('Pulling custom registry login info from %s.' % ctx.registry_json)
4140 d = get_parm(ctx.registry_json)
4141 if d.get('url') and d.get('username') and d.get('password'):
4142 ctx.registry_url = d.get('url')
4143 ctx.registry_username = d.get('username')
4144 ctx.registry_password = d.get('password')
4145 registry_login(ctx, ctx.registry_url, ctx.registry_username, ctx.registry_password)
4146 else:
4147 raise Error('json provided for custom registry login did not include all necessary fields. '
4148 'Please setup json file as\n'
4149 '{\n'
4150 ' "url": "REGISTRY_URL",\n'
4151 ' "username": "REGISTRY_USERNAME",\n'
4152 ' "password": "REGISTRY_PASSWORD"\n'
4153 '}\n')
4154 elif ctx.registry_url and ctx.registry_username and ctx.registry_password:
4155 registry_login(ctx, ctx.registry_url, ctx.registry_username, ctx.registry_password)
4156 else:
4157 raise Error('Invalid custom registry arguments received. To log in to a custom registry, include '
4158 'the --registry-url, --registry-username and --registry-password '
4159 'options, or the --registry-json option')
4160 return 0
4161
4162
4163 def registry_login(ctx: CephadmContext, url, username, password):
4164 logger.info('Logging into custom registry.')
4165 try:
4166 engine = ctx.container_engine
4167 cmd = [engine.path, 'login',
4168 '-u', username, '-p', password,
4169 url]
4170 if isinstance(engine, Podman):
4171 cmd.append('--authfile=/etc/ceph/podman-auth.json')
4172 out, _, _ = call_throws(ctx, cmd)
4173 if isinstance(engine, Podman):
4174 os.chmod('/etc/ceph/podman-auth.json', 0o600)
4175 except Exception:
4176 raise Error('Failed to login to custom registry @ %s as %s with given password' % (ctx.registry_url, ctx.registry_username))
4177
4178 ##################################
4179
4180
4181 def extract_uid_gid_monitoring(ctx, daemon_type):
4182 # type: (CephadmContext, str) -> Tuple[int, int]
4183
4184 if daemon_type == 'prometheus':
4185 uid, gid = extract_uid_gid(ctx, file_path='/etc/prometheus')
4186 elif daemon_type == 'node-exporter':
4187 uid, gid = 65534, 65534
4188 elif daemon_type == 'grafana':
4189 uid, gid = extract_uid_gid(ctx, file_path='/var/lib/grafana')
4190 elif daemon_type == 'alertmanager':
4191 uid, gid = extract_uid_gid(ctx, file_path=['/etc/alertmanager', '/etc/prometheus'])
4192 else:
4193 raise Error('{} not implemented yet'.format(daemon_type))
4194 return uid, gid
4195
4196
4197 @default_image
4198 def command_deploy(ctx):
4199 # type: (CephadmContext) -> None
4200 daemon_type, daemon_id = ctx.name.split('.', 1)
4201
4202 lock = FileLock(ctx, ctx.fsid)
4203 lock.acquire()
4204
4205 if daemon_type not in get_supported_daemons():
4206 raise Error('daemon type %s not recognized' % daemon_type)
4207
4208 redeploy = False
4209 unit_name = get_unit_name(ctx.fsid, daemon_type, daemon_id)
4210 container_name = 'ceph-%s-%s.%s' % (ctx.fsid, daemon_type, daemon_id)
4211 (_, state, _) = check_unit(ctx, unit_name)
4212 if state == 'running' or is_container_running(ctx, container_name):
4213 redeploy = True
4214
4215 if ctx.reconfig:
4216 logger.info('%s daemon %s ...' % ('Reconfig', ctx.name))
4217 elif redeploy:
4218 logger.info('%s daemon %s ...' % ('Redeploy', ctx.name))
4219 else:
4220 logger.info('%s daemon %s ...' % ('Deploy', ctx.name))
4221
4222 # Get and check ports explicitly required to be opened
4223 daemon_ports = [] # type: List[int]
4224
4225 # only check port in use if not reconfig or redeploy, since the service
4226 # we are redeploying/reconfiguring will already be using the port
4227 if not ctx.reconfig and not redeploy:
4228 if ctx.tcp_ports:
4229 daemon_ports = list(map(int, ctx.tcp_ports.split()))
4230
4231 if daemon_type in Ceph.daemons:
4232 config, keyring = get_config_and_keyring(ctx)
4233 uid, gid = extract_uid_gid(ctx)
4234 make_var_run(ctx, ctx.fsid, uid, gid)
4235
4236 c = get_container(ctx, ctx.fsid, daemon_type, daemon_id,
4237 ptrace=ctx.allow_ptrace)
4238 deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
4239 config=config, keyring=keyring,
4240 osd_fsid=ctx.osd_fsid,
4241 reconfig=ctx.reconfig,
4242 ports=daemon_ports)
4243
4244 elif daemon_type in Monitoring.components:
4245 # monitoring daemon - prometheus, grafana, alertmanager, node-exporter
4246 # Default Checks
4247 if not ctx.reconfig and not redeploy:
4248 daemon_ports.extend(Monitoring.port_map[daemon_type])
4249
4250 # make sure provided config-json is sufficient
4251 config = get_parm(ctx.config_json) # type: ignore
4252 required_files = Monitoring.components[daemon_type].get('config-json-files', list())
4253 required_args = Monitoring.components[daemon_type].get('config-json-args', list())
4254 if required_files:
4255 if not config or not all(c in config.get('files', {}).keys() for c in required_files): # type: ignore
4256 raise Error('{} deployment requires config-json which must '
4257 'contain file content for {}'.format(daemon_type.capitalize(), ', '.join(required_files)))
4258 if required_args:
4259 if not config or not all(c in config.keys() for c in required_args): # type: ignore
4260 raise Error('{} deployment requires config-json which must '
4261 'contain arg for {}'.format(daemon_type.capitalize(), ', '.join(required_args)))
4262
4263 uid, gid = extract_uid_gid_monitoring(ctx, daemon_type)
4264 c = get_container(ctx, ctx.fsid, daemon_type, daemon_id)
4265 deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
4266 reconfig=ctx.reconfig,
4267 ports=daemon_ports)
4268
4269 elif daemon_type == NFSGanesha.daemon_type:
4270 if not ctx.reconfig and not redeploy:
4271 daemon_ports.extend(NFSGanesha.port_map.values())
4272
4273 config, keyring = get_config_and_keyring(ctx)
4274 # TODO: extract ganesha uid/gid (997, 994) ?
4275 uid, gid = extract_uid_gid(ctx)
4276 c = get_container(ctx, ctx.fsid, daemon_type, daemon_id)
4277 deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
4278 config=config, keyring=keyring,
4279 reconfig=ctx.reconfig,
4280 ports=daemon_ports)
4281
4282 elif daemon_type == CephIscsi.daemon_type:
4283 config, keyring = get_config_and_keyring(ctx)
4284 uid, gid = extract_uid_gid(ctx)
4285 c = get_container(ctx, ctx.fsid, daemon_type, daemon_id)
4286 deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
4287 config=config, keyring=keyring,
4288 reconfig=ctx.reconfig,
4289 ports=daemon_ports)
4290
4291 elif daemon_type == HAproxy.daemon_type:
4292 haproxy = HAproxy.init(ctx, ctx.fsid, daemon_id)
4293 uid, gid = haproxy.extract_uid_gid_haproxy()
4294 c = get_container(ctx, ctx.fsid, daemon_type, daemon_id)
4295 deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
4296 reconfig=ctx.reconfig,
4297 ports=daemon_ports)
4298
4299 elif daemon_type == Keepalived.daemon_type:
4300 keepalived = Keepalived.init(ctx, ctx.fsid, daemon_id)
4301 uid, gid = keepalived.extract_uid_gid_keepalived()
4302 c = get_container(ctx, ctx.fsid, daemon_type, daemon_id)
4303 deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c, uid, gid,
4304 reconfig=ctx.reconfig,
4305 ports=daemon_ports)
4306
4307 elif daemon_type == CustomContainer.daemon_type:
4308 cc = CustomContainer.init(ctx, ctx.fsid, daemon_id)
4309 if not ctx.reconfig and not redeploy:
4310 daemon_ports.extend(cc.ports)
4311 c = get_container(ctx, ctx.fsid, daemon_type, daemon_id,
4312 privileged=cc.privileged,
4313 ptrace=ctx.allow_ptrace)
4314 deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, c,
4315 uid=cc.uid, gid=cc.gid, config=None,
4316 keyring=None, reconfig=ctx.reconfig,
4317 ports=daemon_ports)
4318
4319 elif daemon_type == CephadmDaemon.daemon_type:
4320 # get current user gid and uid
4321 uid = os.getuid()
4322 gid = os.getgid()
4323 config_js = get_parm(ctx.config_json) # type: Dict[str, str]
4324 if not daemon_ports:
4325 logger.info('cephadm-exporter will use default port ({})'.format(CephadmDaemon.default_port))
4326 daemon_ports = [CephadmDaemon.default_port]
4327
4328 CephadmDaemon.validate_config(config_js)
4329
4330 deploy_daemon(ctx, ctx.fsid, daemon_type, daemon_id, None,
4331 uid, gid, ports=daemon_ports)
4332
4333 else:
4334 raise Error('daemon type {} not implemented in command_deploy function'
4335 .format(daemon_type))
4336
4337 ##################################
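# Illustratively, `--name osd.3` splits above into daemon_type='osd' and
# daemon_id='3', and a tcp_ports value of '9100 9101' yields daemon_ports
# [9100, 9101] for the port-in-use checks and firewall handling.
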
4338
4339
4340 @infer_image
4341 def command_run(ctx):
4342 # type: (CephadmContext) -> int
4343 (daemon_type, daemon_id) = ctx.name.split('.', 1)
4344 c = get_container(ctx, ctx.fsid, daemon_type, daemon_id)
4345 command = c.run_cmd()
4346 return call_timeout(ctx, command, ctx.timeout)
4347
4348 ##################################
4349
4350
4351 def fsid_conf_mismatch(ctx):
4352 # type: (CephadmContext) -> bool
4353 (config, _) = get_config_and_keyring(ctx)
4354 if config:
4355 for c in config.split('\n'):
4356 if 'fsid = ' in c.strip():
4357 if 'fsid = ' + ctx.fsid != c.strip():
4358 return True
4359 return False
4360
4361
4362 @infer_fsid
4363 @infer_config
4364 @infer_image
4365 def command_shell(ctx):
4366 # type: (CephadmContext) -> int
4367 if fsid_conf_mismatch(ctx):
4368 raise Error('fsid does not match ceph conf')
4369
4370 if ctx.fsid:
4371 make_log_dir(ctx, ctx.fsid)
4372 if ctx.name:
4373 if '.' in ctx.name:
4374 (daemon_type, daemon_id) = ctx.name.split('.', 1)
4375 else:
4376 daemon_type = ctx.name
4377 daemon_id = None
4378 else:
4379 daemon_type = 'osd' # get the most mounts
4380 daemon_id = None
4381
4382 if daemon_id and not ctx.fsid:
4383 raise Error('must pass --fsid to specify cluster')
4384
4385 # use /etc/ceph files by default, if present. we do this instead of
4386 # making these defaults in the arg parser because we don't want an error
4387 # if they don't exist.
4388 if not ctx.keyring and os.path.exists(SHELL_DEFAULT_KEYRING):
4389 ctx.keyring = SHELL_DEFAULT_KEYRING
4390
4391 container_args: List[str] = ['-i']
4392 mounts = get_container_mounts(ctx, ctx.fsid, daemon_type, daemon_id,
4393 no_config=True if ctx.config else False)
4394 binds = get_container_binds(ctx, ctx.fsid, daemon_type, daemon_id)
4395 if ctx.config:
4396 mounts[pathify(ctx.config)] = '/etc/ceph/ceph.conf:z'
4397 if ctx.keyring:
4398 mounts[pathify(ctx.keyring)] = '/etc/ceph/ceph.keyring:z'
4399 if ctx.mount:
4400 for _mount in ctx.mount:
4401 split_src_dst = _mount.split(':')
4402 mount = pathify(split_src_dst[0])
4403 filename = os.path.basename(split_src_dst[0])
4404 if len(split_src_dst) > 1:
4405 dst = split_src_dst[1] + ':z' if len(split_src_dst) == 3 else split_src_dst[1]
4406 mounts[mount] = dst
4407 else:
4408 mounts[mount] = '/mnt/{}:z'.format(filename)
4409 if ctx.command:
4410 command = ctx.command
4411 else:
4412 command = ['bash']
4413 container_args += [
4414 '-t',
4415 '-e', 'LANG=C',
4416 '-e', 'PS1=%s' % CUSTOM_PS1,
4417 ]
4418 if ctx.fsid:
4419 home = os.path.join(ctx.data_dir, ctx.fsid, 'home')
4420 if not os.path.exists(home):
4421 logger.debug('Creating root home at %s' % home)
4422 makedirs(home, 0, 0, 0o660)
4423 if os.path.exists('/etc/skel'):
4424 for f in os.listdir('/etc/skel'):
4425 if f.startswith('.bash'):
4426 shutil.copyfile(os.path.join('/etc/skel', f),
4427 os.path.join(home, f))
4428 mounts[home] = '/root'
4429
4430 c = CephContainer(
4431 ctx,
4432 image=ctx.image,
4433 entrypoint='doesnotmatter',
4434 args=[],
4435 container_args=container_args,
4436 volume_mounts=mounts,
4437 bind_mounts=binds,
4438 envs=ctx.env,
4439 privileged=True)
4440 command = c.shell_cmd(command)
4441
4442 return call_timeout(ctx, command, ctx.timeout)
4443
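# The --mount parsing above accepts SRC[:DST[:z]]; illustratively:
#
#   --mount /root/specs              -> mounted at /mnt/specs:z
#   --mount /root/specs:/tmp/specs   -> mounted at /tmp/specs
#   --mount /root/specs:/tmp/specs:z -> mounted at /tmp/specs:z
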
4444 ##################################
4445
4446
4447 @infer_fsid
4448 def command_enter(ctx):
4449 # type: (CephadmContext) -> int
4450 if not ctx.fsid:
4451 raise Error('must pass --fsid to specify cluster')
4452 (daemon_type, daemon_id) = ctx.name.split('.', 1)
4453 container_args = ['-i'] # type: List[str]
4454 if ctx.command:
4455 command = ctx.command
4456 else:
4457 command = ['sh']
4458 container_args += [
4459 '-t',
4460 '-e', 'LANG=C',
4461 '-e', 'PS1=%s' % CUSTOM_PS1,
4462 ]
4463 c = CephContainer(
4464 ctx,
4465 image=ctx.image,
4466 entrypoint='doesnotmatter',
4467 container_args=container_args,
4468 cname='ceph-%s-%s.%s' % (ctx.fsid, daemon_type, daemon_id),
4469 )
4470 command = c.exec_cmd(command)
4471 return call_timeout(ctx, command, ctx.timeout)
4472
4473 ##################################
4474
4475
4476 @infer_fsid
4477 @infer_image
4478 def command_ceph_volume(ctx):
4479 # type: (CephadmContext) -> None
4480 if ctx.fsid:
4481 make_log_dir(ctx, ctx.fsid)
4482
4483 lock = FileLock(ctx, ctx.fsid)
4484 lock.acquire()
4485
4486 (uid, gid) = (0, 0) # ceph-volume runs as root
4487 mounts = get_container_mounts(ctx, ctx.fsid, 'osd', None)
4488
4489 tmp_config = None
4490 tmp_keyring = None
4491
4492 (config, keyring) = get_config_and_keyring(ctx)
4493
4494 if config:
4495 # tmp config file
4496 tmp_config = write_tmp(config, uid, gid)
4497 mounts[tmp_config.name] = '/etc/ceph/ceph.conf:z'
4498
4499 if keyring:
4500 # tmp keyring file
4501 tmp_keyring = write_tmp(keyring, uid, gid)
4502 mounts[tmp_keyring.name] = '/var/lib/ceph/bootstrap-osd/ceph.keyring:z'
4503
4504 c = CephContainer(
4505 ctx,
4506 image=ctx.image,
4507 entrypoint='/usr/sbin/ceph-volume',
4508 envs=ctx.env,
4509 args=ctx.command,
4510 privileged=True,
4511 volume_mounts=mounts,
4512 )
4513 verbosity = CallVerbosity.VERBOSE if ctx.log_output else CallVerbosity.VERBOSE_ON_FAILURE
4514 out, err, code = call_throws(ctx, c.run_cmd(), verbosity=verbosity)
4515 if not code:
4516 print(out)
4517
4518 ##################################
4519
4520
4521 @infer_fsid
4522 def command_unit(ctx):
4523 # type: (CephadmContext) -> None
4524 if not ctx.fsid:
4525 raise Error('must pass --fsid to specify cluster')
4526
4527 unit_name = get_unit_name_by_daemon_name(ctx, ctx.fsid, ctx.name)
4528
4529 call_throws(ctx, [
4530 'systemctl',
4531 ctx.command,
4532 unit_name],
4533 verbosity=CallVerbosity.VERBOSE,
4534 desc=''
4535 )
4536
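# Illustratively, `cephadm unit --fsid <fsid> --name mon.host1 start`
# resolves the daemon name to its systemd unit and runs something like:
#
#   systemctl start ceph-<fsid>@mon.host1
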
4537 ##################################
4538
4539
4540 @infer_fsid
4541 def command_logs(ctx):
4542 # type: (CephadmContext) -> None
4543 if not ctx.fsid:
4544 raise Error('must pass --fsid to specify cluster')
4545
4546 unit_name = get_unit_name_by_daemon_name(ctx, ctx.fsid, ctx.name)
4547
4548 cmd = [find_program('journalctl')]
4549 cmd.extend(['-u', unit_name])
4550 if ctx.command:
4551 cmd.extend(ctx.command)
4552
4553 # call this directly, without our wrapper, so that we get an unmolested
4554 # stdout without logger prefixing.
4555 logger.debug('Running command: %s' % ' '.join(cmd))
4556 subprocess.call(cmd) # type: ignore
4557
4558 ##################################
4559
4560
4561 def list_networks(ctx):
4562 # type: (CephadmContext) -> Dict[str,Dict[str,List[str]]]
4563
4564 # sadly, 18.04's iproute2 4.15.0-2ubun doesn't support the -j flag,
4565 # so we'll need to use a regex to parse 'ip' command output.
4566 #
4567 # out, _, _ = call_throws(['ip', '-j', 'route', 'ls'])
4568 # j = json.loads(out)
4569 # for x in j:
4570
4571 res = _list_ipv4_networks(ctx)
4572 res.update(_list_ipv6_networks(ctx))
4573 return res
4574
4575
4576 def _list_ipv4_networks(ctx: CephadmContext):
4577 execstr: Optional[str] = find_executable('ip')
4578 if not execstr:
4579 raise FileNotFoundError("unable to find 'ip' command")
4580 out, _, _ = call_throws(ctx, [execstr, 'route', 'ls'])
4581 return _parse_ipv4_route(out)
4582
4583
4584 def _parse_ipv4_route(out):
4585 r = {} # type: Dict[str,Dict[str,List[str]]]
4586 p = re.compile(r'^(\S+) dev (\S+) (.*)scope link (.*)src (\S+)')
4587 for line in out.splitlines():
4588 m = p.findall(line)
4589 if not m:
4590 continue
4591 net = m[0][0]
4592 iface = m[0][1]
4593 ip = m[0][4]
4594 if net not in r:
4595 r[net] = {}
4596 if iface not in r[net]:
4597 r[net][iface] = []
4598 r[net][iface].append(ip)
4599 return r
4600
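# Illustrative input/output for the route parser above:
#
#   >>> _parse_ipv4_route('10.1.2.0/24 dev eth0 proto kernel scope link src 10.1.2.3')
#   {'10.1.2.0/24': {'eth0': ['10.1.2.3']}}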
4601
4602 def _list_ipv6_networks(ctx: CephadmContext):
4603 execstr: Optional[str] = find_executable('ip')
4604 if not execstr:
4605 raise FileNotFoundError("unable to find 'ip' command")
4606 routes, _, _ = call_throws(ctx, [execstr, '-6', 'route', 'ls'])
4607 ips, _, _ = call_throws(ctx, [execstr, '-6', 'addr', 'ls'])
4608 return _parse_ipv6_route(routes, ips)
4609
4610
4611 def _parse_ipv6_route(routes, ips):
4612 r = {} # type: Dict[str,Dict[str,List[str]]]
4613 route_p = re.compile(r'^(\S+) dev (\S+) proto (\S+) metric (\S+) .*pref (\S+)$')
4614 ip_p = re.compile(r'^\s+inet6 (\S+)/(.*)scope (.*)$')
4615 iface_p = re.compile(r'^(\d+): (\S+): (.*)$')
4616 for line in routes.splitlines():
4617 m = route_p.findall(line)
4618 if not m or m[0][0].lower() == 'default':
4619 continue
4620 net = m[0][0]
4621 if '/' not in net: # only consider networks with a mask
4622 continue
4623 iface = m[0][1]
4624 if net not in r:
4625 r[net] = {}
4626 if iface not in r[net]:
4627 r[net][iface] = []
4628
4629 iface = None
4630 for line in ips.splitlines():
4631 m = ip_p.findall(line)
4632 if not m:
4633 m = iface_p.findall(line)
4634 if m:
4635 # drop @... suffix, if present
4636 iface = m[0][1].split('@')[0]
4637 continue
4638 ip = m[0][0]
4639 # find the network it belongs to
4640 net = [n for n in r.keys()
4641 if ipaddress.ip_address(ip) in ipaddress.ip_network(n)]
4642 if net:
4643 assert iface
4644 r[net[0]][iface].append(ip)
4645
4646 return r
4647
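# Illustrative input/output for the parser above:
#
#   routes: 'fe80::/64 dev eth0 proto kernel metric 256 pref medium'
#   ips:    '2: eth0: <BROADCAST,MULTICAST,UP,LOWER_UP>'
#           '    inet6 fe80::1234/64 scope link'
#
#   _parse_ipv6_route(routes, ips) -> {'fe80::/64': {'eth0': ['fe80::1234']}}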
4648
4649 def command_list_networks(ctx):
4650 # type: (CephadmContext) -> None
4651 r = list_networks(ctx)
4652 print(json.dumps(r, indent=4))
4653
4654 ##################################
4655
4656
4657 def command_ls(ctx):
4658 # type: (CephadmContext) -> None
4659 ls = list_daemons(ctx, detail=not ctx.no_detail,
4660 legacy_dir=ctx.legacy_dir)
4661 print(json.dumps(ls, indent=4))
4662
4663
4664 def with_units_to_int(v: str) -> int:
4665 if v.endswith('iB'):
4666 v = v[:-2]
4667 elif v.endswith('B'):
4668 v = v[:-1]
4669 mult = 1
4670 if v[-1].upper() == 'K':
4671 mult = 1024
4672 v = v[:-1]
4673 elif v[-1].upper() == 'M':
4674 mult = 1024 * 1024
4675 v = v[:-1]
4676 elif v[-1].upper() == 'G':
4677 mult = 1024 * 1024 * 1024
4678 v = v[:-1]
4679 elif v[-1].upper() == 'T':
4680 mult = 1024 * 1024 * 1024 * 1024
4681 v = v[:-1]
4682 return int(float(v) * mult)
4683
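# Examples of the unit handling above:
#
#   with_units_to_int('123MiB') == 123 * 1024 * 1024 == 128974848
#   with_units_to_int('1.5GiB') == 1610612736
#   with_units_to_int('512kB')  == 524288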
4684
4685 def list_daemons(ctx, detail=True, legacy_dir=None):
4686 # type: (CephadmContext, bool, Optional[str]) -> List[Dict[str, str]]
4687 host_version: Optional[str] = None
4688 ls = []
4689 container_path = ctx.container_engine.path
4690
4691 data_dir = ctx.data_dir
4692 if legacy_dir is not None:
4693 data_dir = os.path.abspath(legacy_dir + data_dir)
4694
4695 # keep track of ceph versions we see
4696 seen_versions = {} # type: Dict[str, Optional[str]]
4697
4698 # keep track of image digests
4699 seen_digests = {} # type: Dict[str, List[str]]
4700
4701 # keep track of memory usage we've seen
4702 seen_memusage = {} # type: Dict[str, int]
4703 out, err, code = call(
4704 ctx,
4705 [container_path, 'stats', '--format', '{{.ID}},{{.MemUsage}}', '--no-stream'],
4706 verbosity=CallVerbosity.DEBUG
4707 )
4708 seen_memusage_cid_len = 0
4709 if not code:
4710 for line in out.splitlines():
4711 (cid, usage) = line.split(',')
4712 (used, limit) = usage.split(' / ')
4713 seen_memusage[cid] = with_units_to_int(used)
4714 if not seen_memusage_cid_len:
4715 seen_memusage_cid_len = len(cid)
4716
4717 # /var/lib/ceph
4718 if os.path.exists(data_dir):
4719 for i in os.listdir(data_dir):
4720 if i in ['mon', 'osd', 'mds', 'mgr']:
4721 daemon_type = i
4722 for j in os.listdir(os.path.join(data_dir, i)):
4723 if '-' not in j:
4724 continue
4725 (cluster, daemon_id) = j.split('-', 1)
4726 fsid = get_legacy_daemon_fsid(ctx,
4727 cluster, daemon_type, daemon_id,
4728 legacy_dir=legacy_dir)
4729 legacy_unit_name = 'ceph-%s@%s' % (daemon_type, daemon_id)
4730 val: Dict[str, Any] = {
4731 'style': 'legacy',
4732 'name': '%s.%s' % (daemon_type, daemon_id),
4733 'fsid': fsid if fsid is not None else 'unknown',
4734 'systemd_unit': legacy_unit_name,
4735 }
4736 if detail:
4737 (val['enabled'], val['state'], _) = \
4738 check_unit(ctx, legacy_unit_name)
4739 if not host_version:
4740 try:
4741 out, err, code = call(ctx,
4742 ['ceph', '-v'],
4743 verbosity=CallVerbosity.DEBUG)
4744 if not code and out.startswith('ceph version '):
4745 host_version = out.split(' ')[2]
4746 except Exception:
4747 pass
4748 val['host_version'] = host_version
4749 ls.append(val)
4750 elif is_fsid(i):
4751 fsid = str(i) # convince mypy that fsid is a str here
4752 for j in os.listdir(os.path.join(data_dir, i)):
4753 if '.' in j and os.path.isdir(os.path.join(data_dir, fsid, j)):
4754 name = j
4755 (daemon_type, daemon_id) = j.split('.', 1)
4756 unit_name = get_unit_name(fsid,
4757 daemon_type,
4758 daemon_id)
4759 else:
4760 continue
4761 val = {
4762 'style': 'cephadm:v1',
4763 'name': name,
4764 'fsid': fsid,
4765 'systemd_unit': unit_name,
4766 }
4767 if detail:
4768 # get container id
4769 (val['enabled'], val['state'], _) = \
4770 check_unit(ctx, unit_name)
4771 container_id = None
4772 image_name = None
4773 image_id = None
4774 image_digests = None
4775 version = None
4776 start_stamp = None
4777
4778 cmd = [
4779 container_path, 'inspect',
4780 '--format', '{{.Id}},{{.Config.Image}},{{.Image}},{{.Created}},{{index .Config.Labels "io.ceph.version"}}',
4781 'ceph-%s-%s' % (fsid, j)
4782 ]
4783 out, err, code = call(ctx, cmd, verbosity=CallVerbosity.DEBUG)
4784 if not code:
4785 (container_id, image_name, image_id, start,
4786 version) = out.strip().split(',')
4787 image_id = normalize_container_id(image_id)
4788 daemon_type = name.split('.', 1)[0]
4789 start_stamp = try_convert_datetime(start)
4790
4791 # collect digests for this image id
4792 image_digests = seen_digests.get(image_id)
4793 if not image_digests:
4794 out, err, code = call(
4795 ctx,
4796 [
4797 container_path, 'image', 'inspect', image_id,
4798 '--format', '{{.RepoDigests}}',
4799 ],
4800 verbosity=CallVerbosity.DEBUG)
4801 if not code:
4802 image_digests = out.strip()[1:-1].split(' ')
4803 seen_digests[image_id] = image_digests
4804
4805 # identify software version inside the container (if we can)
4806 if not version or '.' not in version:
4807 version = seen_versions.get(image_id, None)
4808 if daemon_type == NFSGanesha.daemon_type:
4809 version = NFSGanesha.get_version(ctx, container_id)
4810 if daemon_type == CephIscsi.daemon_type:
4811 version = CephIscsi.get_version(ctx, container_id)
4812 elif not version:
4813 if daemon_type in Ceph.daemons:
4814 out, err, code = call(ctx,
4815 [container_path, 'exec', container_id,
4816 'ceph', '-v'],
4817 verbosity=CallVerbosity.DEBUG)
4818 if not code and \
4819 out.startswith('ceph version '):
4820 version = out.split(' ')[2]
4821 seen_versions[image_id] = version
4822 elif daemon_type == 'grafana':
4823 out, err, code = call(ctx,
4824 [container_path, 'exec', container_id,
4825 'grafana-server', '-v'],
4826 verbosity=CallVerbosity.DEBUG)
4827 if not code and \
4828 out.startswith('Version '):
4829 version = out.split(' ')[1]
4830 seen_versions[image_id] = version
4831 elif daemon_type in ['prometheus',
4832 'alertmanager',
4833 'node-exporter']:
4834 version = Monitoring.get_version(ctx, container_id, daemon_type)
4835 seen_versions[image_id] = version
4836 elif daemon_type == 'haproxy':
4837 out, err, code = call(ctx,
4838 [container_path, 'exec', container_id,
4839 'haproxy', '-v'],
4840 verbosity=CallVerbosity.DEBUG)
4841 if not code and \
4842 out.startswith('HA-Proxy version '):
4843 version = out.split(' ')[2]
4844 seen_versions[image_id] = version
4845 elif daemon_type == 'keepalived':
4846 out, err, code = call(ctx,
4847 [container_path, 'exec', container_id,
4848 'keepalived', '--version'],
4849 verbosity=CallVerbosity.DEBUG)
4850 if not code and \
4851 err.startswith('Keepalived '):
4852 version = err.split(' ')[1]
4853 if version[0] == 'v':
4854 version = version[1:]
4855 seen_versions[image_id] = version
4856 elif daemon_type == CustomContainer.daemon_type:
4857 # Because a custom container can contain
4858 # everything, we do not know which command
4859 # to execute to get the version.
4860 pass
4861 else:
4862 logger.warning('could not determine version for unknown daemon type %s' % daemon_type)
4863 else:
4864 vfile = os.path.join(data_dir, fsid, j, 'unit.image') # type: ignore
4865 try:
4866 with open(vfile, 'r') as f:
4867 image_name = f.read().strip() or None
4868 except IOError:
4869 pass
4870
4871 # unit.meta?
4872 mfile = os.path.join(data_dir, fsid, j, 'unit.meta') # type: ignore
4873 try:
4874 with open(mfile, 'r') as f:
4875 meta = json.loads(f.read())
4876 val.update(meta)
4877 except IOError:
4878 pass
4879
4880 val['container_id'] = container_id
4881 val['container_image_name'] = image_name
4882 val['container_image_id'] = image_id
4883 val['container_image_digests'] = image_digests
4884 if container_id:
4885 val['memory_usage'] = seen_memusage.get(container_id[0:seen_memusage_cid_len])
4886 val['version'] = version
4887 val['started'] = start_stamp
4888 val['created'] = get_file_timestamp(
4889 os.path.join(data_dir, fsid, j, 'unit.created')
4890 )
4891 val['deployed'] = get_file_timestamp(
4892 os.path.join(data_dir, fsid, j, 'unit.image'))
4893 val['configured'] = get_file_timestamp(
4894 os.path.join(data_dir, fsid, j, 'unit.configured'))
4895
4896 ls.append(val)
4897
4898 return ls
4899
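# An abridged, illustrative `cephadm ls` entry as assembled above
# (values are examples; keys match the fields set on `val`):
#
#   {
#       "style": "cephadm:v1",
#       "name": "mon.host1",
#       "fsid": "<fsid>",
#       "systemd_unit": "ceph-<fsid>@mon.host1",
#       "enabled": true,
#       "state": "running",
#       "container_id": "<container id>",
#       "container_image_name": "docker.io/ceph/ceph:v16",
#       "version": "<ceph version>",
#       "started": "<timestamp>"
#   }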
4900
4901 def get_daemon_description(ctx, fsid, name, detail=False, legacy_dir=None):
4902 # type: (CephadmContext, str, str, bool, Optional[str]) -> Dict[str, str]
4903
4904 for d in list_daemons(ctx, detail=detail, legacy_dir=legacy_dir):
4905 if d['fsid'] != fsid:
4906 continue
4907 if d['name'] != name:
4908 continue
4909 return d
4910 raise Error('Daemon not found: {}. See `cephadm ls`'.format(name))
4911
4912 ##################################
4913
4914
4915 @default_image
4916 def command_adopt(ctx):
4917 # type: (CephadmContext) -> None
4918
4919 if not ctx.skip_pull:
4920 _pull_image(ctx, ctx.image)
4921
4922 (daemon_type, daemon_id) = ctx.name.split('.', 1)
4923
4924 # legacy check
4925 if ctx.style != 'legacy':
4926 raise Error('adoption of style %s not implemented' % ctx.style)
4927
4928 # lock
4929 fsid = get_legacy_daemon_fsid(ctx,
4930 ctx.cluster,
4931 daemon_type,
4932 daemon_id,
4933 legacy_dir=ctx.legacy_dir)
4934 if not fsid:
4935 raise Error('could not detect legacy fsid; set fsid in ceph.conf')
4936 lock = FileLock(ctx, fsid)
4937 lock.acquire()
4938
4939 # call correct adoption
4940 if daemon_type in Ceph.daemons:
4941 command_adopt_ceph(ctx, daemon_type, daemon_id, fsid)
4942 elif daemon_type == 'prometheus':
4943 command_adopt_prometheus(ctx, daemon_id, fsid)
4944 elif daemon_type == 'grafana':
4945 command_adopt_grafana(ctx, daemon_id, fsid)
4946 elif daemon_type == 'node-exporter':
4947 raise Error('adoption of node-exporter not implemented')
4948 elif daemon_type == 'alertmanager':
4949 command_adopt_alertmanager(ctx, daemon_id, fsid)
4950 else:
4951 raise Error('daemon type %s not recognized' % daemon_type)
4952
4953
4954 class AdoptOsd(object):
4955 def __init__(self, ctx, osd_data_dir, osd_id):
4956 # type: (CephadmContext, str, str) -> None
4957 self.ctx = ctx
4958 self.osd_data_dir = osd_data_dir
4959 self.osd_id = osd_id
4960
4961 def check_online_osd(self):
4962 # type: () -> Tuple[Optional[str], Optional[str]]
4963
4964 osd_fsid, osd_type = None, None
4965
4966 path = os.path.join(self.osd_data_dir, 'fsid')
4967 try:
4968 with open(path, 'r') as f:
4969 osd_fsid = f.read().strip()
4970 logger.info('Found online OSD at %s' % path)
4971 except IOError:
4972 logger.info('Unable to read OSD fsid from %s' % path)
4973 if os.path.exists(os.path.join(self.osd_data_dir, 'type')):
4974 with open(os.path.join(self.osd_data_dir, 'type')) as f:
4975 osd_type = f.read().strip()
4976 else:
4977 logger.info('"type" file missing for OSD data dir')
4978
4979 return osd_fsid, osd_type
4980
4981 def check_offline_lvm_osd(self):
4982 # type: () -> Tuple[Optional[str], Optional[str]]
4983 osd_fsid, osd_type = None, None
4984
4985 c = CephContainer(
4986 self.ctx,
4987 image=self.ctx.image,
4988 entrypoint='/usr/sbin/ceph-volume',
4989 args=['lvm', 'list', '--format=json'],
4990 privileged=True
4991 )
4992 out, err, code = call_throws(self.ctx, c.run_cmd())
4993 if not code:
4994 try:
4995 js = json.loads(out)
4996 if self.osd_id in js:
4997 logger.info('Found offline LVM OSD {}'.format(self.osd_id))
4998 osd_fsid = js[self.osd_id][0]['tags']['ceph.osd_fsid']
4999 for device in js[self.osd_id]:
5000 if device['tags']['ceph.type'] == 'block':
5001 osd_type = 'bluestore'
5002 break
5003 if device['tags']['ceph.type'] == 'data':
5004 osd_type = 'filestore'
5005 break
5006 except ValueError as e:
5007 logger.info('Invalid JSON in ceph-volume lvm list: {}'.format(e))
5008
5009 return osd_fsid, osd_type
5010
5011 def check_offline_simple_osd(self):
5012 # type: () -> Tuple[Optional[str], Optional[str]]
5013 osd_fsid, osd_type = None, None
5014
5015 osd_file = glob('/etc/ceph/osd/{}-[a-f0-9-]*.json'.format(self.osd_id))
5016 if len(osd_file) == 1:
5017 with open(osd_file[0], 'r') as f:
5018 try:
5019 js = json.loads(f.read())
5020 logger.info('Found offline simple OSD {}'.format(self.osd_id))
5021 osd_fsid = js['fsid']
5022 osd_type = js['type']
5023 if osd_type != 'filestore':
5024 # need this to be mounted for the adopt to work, as it
5025 # needs to move files from this directory
5026 call_throws(self.ctx, ['mount', js['data']['path'], self.osd_data_dir])
5027 except ValueError as e:
5028 logger.info('Invalid JSON in {}: {}'.format(osd_file, e))
5029
5030 return osd_fsid, osd_type
5031
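# check_offline_simple_osd() above relies only on these fields of the
# ceph-volume 'simple' scan JSON (illustrative, heavily abridged):
#
#   {
#       "fsid": "<osd fsid>",
#       "type": "bluestore",
#       "data": {"path": "/dev/sdb1"}
#   }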
5032
5033 def command_adopt_ceph(ctx, daemon_type, daemon_id, fsid):
5034 # type: (CephadmContext, str, str, str) -> None
5035
5036 (uid, gid) = extract_uid_gid(ctx)
5037
5038 data_dir_src = ('/var/lib/ceph/%s/%s-%s' %
5039 (daemon_type, ctx.cluster, daemon_id))
5040 data_dir_src = os.path.abspath(ctx.legacy_dir + data_dir_src)
5041
5042 if not os.path.exists(data_dir_src):
5043 raise Error("{}.{} data directory '{}' does not exist. "
5044 'Incorrect ID specified, or daemon already adopted?'.format(
5045 daemon_type, daemon_id, data_dir_src))
5046
5047 osd_fsid = None
5048 if daemon_type == 'osd':
5049 adopt_osd = AdoptOsd(ctx, data_dir_src, daemon_id)
5050 osd_fsid, osd_type = adopt_osd.check_online_osd()
5051 if not osd_fsid:
5052 osd_fsid, osd_type = adopt_osd.check_offline_lvm_osd()
5053 if not osd_fsid:
5054 osd_fsid, osd_type = adopt_osd.check_offline_simple_osd()
5055 if not osd_fsid:
5056 raise Error('Unable to find OSD {}'.format(daemon_id))
5057 logger.info('objectstore_type is %s' % osd_type)
5058 assert osd_type
5059 if osd_type == 'filestore':
5060 raise Error('FileStore is not supported by cephadm')
5061
5062 # NOTE: implicit assumption here that the units correspond to the
5063 # cluster we are adopting based on the /etc/{defaults,sysconfig}/ceph
5064 # CLUSTER field.
5065 unit_name = 'ceph-%s@%s' % (daemon_type, daemon_id)
5066 (enabled, state, _) = check_unit(ctx, unit_name)
5067 if state == 'running':
5068 logger.info('Stopping old systemd unit %s...' % unit_name)
5069 call_throws(ctx, ['systemctl', 'stop', unit_name])
5070 if enabled:
5071 logger.info('Disabling old systemd unit %s...' % unit_name)
5072 call_throws(ctx, ['systemctl', 'disable', unit_name])
5073
5074 # data
5075 logger.info('Moving data...')
5076 data_dir_dst = make_data_dir(ctx, fsid, daemon_type, daemon_id,
5077 uid=uid, gid=gid)
5078 move_files(ctx, glob(os.path.join(data_dir_src, '*')),
5079 data_dir_dst,
5080 uid=uid, gid=gid)
5081 logger.debug('Remove dir `%s`' % (data_dir_src))
5082 if os.path.ismount(data_dir_src):
5083 call_throws(ctx, ['umount', data_dir_src])
5084 os.rmdir(data_dir_src)
5085
5086 logger.info('Chowning content...')
5087 call_throws(ctx, ['chown', '-c', '-R', '%d:%d' % (uid, gid), data_dir_dst])
5088
5089 if daemon_type == 'mon':
5090 # rename *.ldb -> *.sst, in case they are coming from ubuntu
5091 store = os.path.join(data_dir_dst, 'store.db')
5092 num_renamed = 0
5093 if os.path.exists(store):
5094 for oldf in os.listdir(store):
5095 if oldf.endswith('.ldb'):
5096 newf = oldf.replace('.ldb', '.sst')
5097 oldp = os.path.join(store, oldf)
5098 newp = os.path.join(store, newf)
5099 logger.debug('Renaming %s -> %s' % (oldp, newp))
5100 os.rename(oldp, newp)
num_renamed += 1
5101 if num_renamed:
5102 logger.info('Renamed %d leveldb *.ldb files to *.sst',
5103 num_renamed)
5104 if daemon_type == 'osd':
5105 for n in ['block', 'block.db', 'block.wal']:
5106 p = os.path.join(data_dir_dst, n)
5107 if os.path.exists(p):
5108 logger.info('Chowning %s...' % p)
5109 os.chown(p, uid, gid)
5110 # disable the ceph-volume 'simple' mode files on the host
5111 simple_fn = os.path.join('/etc/ceph/osd',
5112 '%s-%s.json' % (daemon_id, osd_fsid))
5113 if os.path.exists(simple_fn):
5114 new_fn = simple_fn + '.adopted-by-cephadm'
5115 logger.info('Renaming %s -> %s', simple_fn, new_fn)
5116 os.rename(simple_fn, new_fn)
5117 logger.info('Disabling host ceph-volume@simple unit...')
5118 call(ctx, ['systemctl', 'disable',
5119 'ceph-volume@simple-%s-%s.service' % (daemon_id, osd_fsid)])
5120 else:
5121 # assume this is an 'lvm' c-v for now, but don't error
5122 # out if it's not.
5123 logger.info('Disabling host ceph-volume@lvm unit...')
5124 call(ctx, ['systemctl', 'disable',
5125 'ceph-volume@lvm-%s-%s.service' % (daemon_id, osd_fsid)])
5126
5127 # config
5128 config_src = '/etc/ceph/%s.conf' % (ctx.cluster)
5129 config_src = os.path.abspath(ctx.legacy_dir + config_src)
5130 config_dst = os.path.join(data_dir_dst, 'config')
5131 copy_files(ctx, [config_src], config_dst, uid=uid, gid=gid)
5132
5133 # logs
5134 logger.info('Moving logs...')
5135 log_dir_src = ('/var/log/ceph/%s-%s.%s.log*' %
5136 (ctx.cluster, daemon_type, daemon_id))
5137 log_dir_src = os.path.abspath(ctx.legacy_dir + log_dir_src)
5138 log_dir_dst = make_log_dir(ctx, fsid, uid=uid, gid=gid)
5139 move_files(ctx, glob(log_dir_src),
5140 log_dir_dst,
5141 uid=uid, gid=gid)
5142
5143 logger.info('Creating new units...')
5144 make_var_run(ctx, fsid, uid, gid)
5145 c = get_container(ctx, fsid, daemon_type, daemon_id)
5146 deploy_daemon_units(ctx, fsid, uid, gid, daemon_type, daemon_id, c,
5147 enable=True, # unconditionally enable the new unit
5148 start=(state == 'running' or ctx.force_start),
5149 osd_fsid=osd_fsid)
5150 update_firewalld(ctx, daemon_type)
5151
5152
5153 def command_adopt_prometheus(ctx, daemon_id, fsid):
5154 # type: (CephadmContext, str, str) -> None
5155 daemon_type = 'prometheus'
5156 (uid, gid) = extract_uid_gid_monitoring(ctx, daemon_type)
5157
5158 _stop_and_disable(ctx, 'prometheus')
5159
5160 data_dir_dst = make_data_dir(ctx, fsid, daemon_type, daemon_id,
5161 uid=uid, gid=gid)
5162
5163 # config
5164 config_src = '/etc/prometheus/prometheus.yml'
5165 config_src = os.path.abspath(ctx.legacy_dir + config_src)
5166 config_dst = os.path.join(data_dir_dst, 'etc/prometheus')
5167 makedirs(config_dst, uid, gid, 0o755)
5168 copy_files(ctx, [config_src], config_dst, uid=uid, gid=gid)
5169
5170 # data
5171 data_src = '/var/lib/prometheus/metrics/'
5172 data_src = os.path.abspath(ctx.legacy_dir + data_src)
5173 data_dst = os.path.join(data_dir_dst, 'data')
5174 copy_tree(ctx, [data_src], data_dst, uid=uid, gid=gid)
5175
5176 make_var_run(ctx, fsid, uid, gid)
5177 c = get_container(ctx, fsid, daemon_type, daemon_id)
5178 deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid)
5179 update_firewalld(ctx, daemon_type)
5180
5181
5182 def command_adopt_grafana(ctx, daemon_id, fsid):
5183 # type: (CephadmContext, str, str) -> None
5184
5185 daemon_type = 'grafana'
5186 (uid, gid) = extract_uid_gid_monitoring(ctx, daemon_type)
5187
5188 _stop_and_disable(ctx, 'grafana-server')
5189
5190 data_dir_dst = make_data_dir(ctx, fsid, daemon_type, daemon_id,
5191 uid=uid, gid=gid)
5192
5193 # config
5194 config_src = '/etc/grafana/grafana.ini'
5195 config_src = os.path.abspath(ctx.legacy_dir + config_src)
5196 config_dst = os.path.join(data_dir_dst, 'etc/grafana')
5197 makedirs(config_dst, uid, gid, 0o755)
5198 copy_files(ctx, [config_src], config_dst, uid=uid, gid=gid)
5199
5200 prov_src = '/etc/grafana/provisioning/'
5201 prov_src = os.path.abspath(ctx.legacy_dir + prov_src)
5202 prov_dst = os.path.join(data_dir_dst, 'etc/grafana')
5203 copy_tree(ctx, [prov_src], prov_dst, uid=uid, gid=gid)
5204
5205 # cert
5206 cert = '/etc/grafana/grafana.crt'
5207 key = '/etc/grafana/grafana.key'
5208 if os.path.exists(cert) and os.path.exists(key):
5209 cert_src = os.path.abspath(ctx.legacy_dir + cert)
5211 makedirs(os.path.join(data_dir_dst, 'etc/grafana/certs'), uid, gid, 0o755)
5212 cert_dst = os.path.join(data_dir_dst, 'etc/grafana/certs/cert_file')
5213 copy_files(ctx, [cert_src], cert_dst, uid=uid, gid=gid)
5214
5215 key_src = os.path.abspath(ctx.legacy_dir + key)
5217 key_dst = os.path.join(data_dir_dst, 'etc/grafana/certs/cert_key')
5218 copy_files(ctx, [key_src], key_dst, uid=uid, gid=gid)
5219
5220 _adjust_grafana_ini(os.path.join(config_dst, 'grafana.ini'))
5221 else:
5222 logger.debug('Skipping ssl, missing cert {} or key {}'.format(cert, key))
5223
5224 # data - possible custom dashboards/plugins
5225 data_src = '/var/lib/grafana/'
5226 data_src = os.path.abspath(ctx.legacy_dir + data_src)
5227 data_dst = os.path.join(data_dir_dst, 'data')
5228 copy_tree(ctx, [data_src], data_dst, uid=uid, gid=gid)
5229
5230 make_var_run(ctx, fsid, uid, gid)
5231 c = get_container(ctx, fsid, daemon_type, daemon_id)
5232 deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid)
5233 update_firewalld(ctx, daemon_type)
5234
5235
5236 def command_adopt_alertmanager(ctx, daemon_id, fsid):
5237 # type: (CephadmContext, str, str) -> None
5238
5239 daemon_type = 'alertmanager'
5240 (uid, gid) = extract_uid_gid_monitoring(ctx, daemon_type)
5241
5242 _stop_and_disable(ctx, 'prometheus-alertmanager')
5243
5244 data_dir_dst = make_data_dir(ctx, fsid, daemon_type, daemon_id,
5245 uid=uid, gid=gid)
5246
5247 # config
5248 config_src = '/etc/prometheus/alertmanager.yml'
5249 config_src = os.path.abspath(ctx.legacy_dir + config_src)
5250 config_dst = os.path.join(data_dir_dst, 'etc/alertmanager')
5251 makedirs(config_dst, uid, gid, 0o755)
5252 copy_files(ctx, [config_src], config_dst, uid=uid, gid=gid)
5253
5254 # data
5255 data_src = '/var/lib/prometheus/alertmanager/'
5256 data_src = os.path.abspath(ctx.legacy_dir + data_src)
5257 data_dst = os.path.join(data_dir_dst, 'etc/alertmanager/data')
5258 copy_tree(ctx, [data_src], data_dst, uid=uid, gid=gid)
5259
5260 make_var_run(ctx, fsid, uid, gid)
5261 c = get_container(ctx, fsid, daemon_type, daemon_id)
5262 deploy_daemon(ctx, fsid, daemon_type, daemon_id, c, uid, gid)
5263 update_firewalld(ctx, daemon_type)
5264
5265
5266 def _adjust_grafana_ini(filename):
5267 # type: (str) -> None
5268
5269 # Update cert_file, cert_key pathnames in server section
5270 # ConfigParser does not preserve comments
5271 try:
5272 with open(filename, 'r') as grafana_ini:
5273 lines = grafana_ini.readlines()
5274 with open('{}.new'.format(filename), 'w') as grafana_ini:
5275 server_section = False
5276 for line in lines:
5277 if line.startswith('['):
5278 server_section = False
5279 if line.startswith('[server]'):
5280 server_section = True
5281 if server_section:
5282 line = re.sub(r'^cert_file.*',
5283 'cert_file = /etc/grafana/certs/cert_file', line)
5284 line = re.sub(r'^cert_key.*',
5285 'cert_key = /etc/grafana/certs/cert_key', line)
5286 grafana_ini.write(line)
5287 os.rename('{}.new'.format(filename), filename)
5288 except OSError as err:
5289 raise Error('Cannot update {}: {}'.format(filename, err))
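# Illustrative effect of _adjust_grafana_ini (hypothetical input): a legacy
# [server] section containing
#   cert_file = /etc/grafana/grafana.crt
#   cert_key = /etc/grafana/grafana.key
# is rewritten in place to the container-side paths used above:
#   cert_file = /etc/grafana/certs/cert_file
#   cert_key = /etc/grafana/certs/cert_key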
5290
5291
5292 def _stop_and_disable(ctx, unit_name):
5293 # type: (CephadmContext, str) -> None
5294
5295 (enabled, state, _) = check_unit(ctx, unit_name)
5296 if state == 'running':
5297 logger.info('Stopping old systemd unit %s...' % unit_name)
5298 call_throws(ctx, ['systemctl', 'stop', unit_name])
5299 if enabled:
5300 logger.info('Disabling old systemd unit %s...' % unit_name)
5301 call_throws(ctx, ['systemctl', 'disable', unit_name])
5302
5303 ##################################
5304
5305
5306 def command_rm_daemon(ctx):
5307 # type: (CephadmContext) -> None
5308 lock = FileLock(ctx, ctx.fsid)
5309 lock.acquire()
5310
5311 (daemon_type, daemon_id) = ctx.name.split('.', 1)
5312 unit_name = get_unit_name_by_daemon_name(ctx, ctx.fsid, ctx.name)
5313
5314 if daemon_type in ['mon', 'osd'] and not ctx.force:
5315 raise Error('must pass --force to proceed: '
5316 'this command may destroy precious data!')
5317
5318 call(ctx, ['systemctl', 'stop', unit_name],
5319 verbosity=CallVerbosity.DEBUG)
5320 call(ctx, ['systemctl', 'reset-failed', unit_name],
5321 verbosity=CallVerbosity.DEBUG)
5322 call(ctx, ['systemctl', 'disable', unit_name],
5323 verbosity=CallVerbosity.DEBUG)
5324 data_dir = get_data_dir(ctx.fsid, ctx.data_dir, daemon_type, daemon_id)
5325 if daemon_type in ['mon', 'osd', 'prometheus'] and \
5326 not ctx.force_delete_data:
5327 # rename it out of the way -- do not delete
5328 backup_dir = os.path.join(ctx.data_dir, ctx.fsid, 'removed')
5329 if not os.path.exists(backup_dir):
5330 makedirs(backup_dir, 0, 0, DATA_DIR_MODE)
5331 dirname = '%s.%s_%s' % (daemon_type, daemon_id,
5332 datetime.datetime.utcnow().strftime(DATEFMT))
5333 os.rename(data_dir,
5334 os.path.join(backup_dir, dirname))
5335 else:
5336 if daemon_type == CephadmDaemon.daemon_type:
5337 CephadmDaemon.uninstall(ctx, ctx.fsid, daemon_type, daemon_id)
5338 call_throws(ctx, ['rm', '-rf', data_dir])
5339
5340 ##################################
5341
5342
5343 def command_rm_cluster(ctx):
5344 # type: (CephadmContext) -> None
5345 if not ctx.force:
5346 raise Error('must pass --force to proceed: '
5347 'this command may destroy precious data!')
5348
5349 lock = FileLock(ctx, ctx.fsid)
5350 lock.acquire()
5351
5352 # stop + disable individual daemon units
5353 for d in list_daemons(ctx, detail=False):
5354 if d['fsid'] != ctx.fsid:
5355 continue
5356 if d['style'] != 'cephadm:v1':
5357 continue
5358 unit_name = get_unit_name(ctx.fsid, d['name'])
5359 call(ctx, ['systemctl', 'stop', unit_name],
5360 verbosity=CallVerbosity.DEBUG)
5361 call(ctx, ['systemctl', 'reset-failed', unit_name],
5362 verbosity=CallVerbosity.DEBUG)
5363 call(ctx, ['systemctl', 'disable', unit_name],
5364 verbosity=CallVerbosity.DEBUG)
5365
5366 # cluster units
5367 for unit_name in ['ceph-%s.target' % ctx.fsid]:
5368 call(ctx, ['systemctl', 'stop', unit_name],
5369 verbosity=CallVerbosity.DEBUG)
5370 call(ctx, ['systemctl', 'reset-failed', unit_name],
5371 verbosity=CallVerbosity.DEBUG)
5372 call(ctx, ['systemctl', 'disable', unit_name],
5373 verbosity=CallVerbosity.DEBUG)
5374
5375 slice_name = 'system-%s.slice' % (('ceph-%s' % ctx.fsid).replace('-', '\\x2d'))
5376 call(ctx, ['systemctl', 'stop', slice_name],
5377 verbosity=CallVerbosity.DEBUG)
5378
5379 # rm units
5380 call_throws(ctx, ['rm', '-f', ctx.unit_dir + # noqa: W504
5381 '/ceph-%s@.service' % ctx.fsid])
5382 call_throws(ctx, ['rm', '-f', ctx.unit_dir + # noqa: W504
5383 '/ceph-%s.target' % ctx.fsid])
5384 call_throws(ctx, ['rm', '-rf',
5385 ctx.unit_dir + '/ceph-%s.target.wants' % ctx.fsid])
5386 # rm data
5387 call_throws(ctx, ['rm', '-rf', ctx.data_dir + '/' + ctx.fsid])
5388
5389 if not ctx.keep_logs:
5390 # rm logs
5391 call_throws(ctx, ['rm', '-rf', ctx.log_dir + '/' + ctx.fsid])
5392 call_throws(ctx, ['rm', '-rf', ctx.log_dir + # noqa: W504
5393 '/*.wants/ceph-%s@*' % ctx.fsid])
5394
5395 # rm logrotate config
5396 call_throws(ctx, ['rm', '-f', ctx.logrotate_dir + '/ceph-%s' % ctx.fsid])
5397
5398 # clean up config, keyring, and pub key files
5399 files = ['/etc/ceph/ceph.conf', '/etc/ceph/ceph.pub', '/etc/ceph/ceph.client.admin.keyring']
5400
5401 if os.path.exists(files[0]):
5402 valid_fsid = False
5403 with open(files[0]) as f:
5404 if ctx.fsid in f.read():
5405 valid_fsid = True
5406 if valid_fsid:
5407 for fname in files:
5408 if os.path.exists(fname):
5409 os.remove(fname)
5410
5411 ##################################
5412
5413
5414 def check_time_sync(ctx, enabler=None):
5415 # type: (CephadmContext, Optional[Packager]) -> bool
5416 units = [
5417 'chrony.service', # 18.04 (at least)
5418 'chronyd.service', # el / opensuse
5419 'systemd-timesyncd.service',
5420 'ntpd.service', # el7 (at least)
5421 'ntp.service', # 18.04 (at least)
5422 'ntpsec.service', # 20.04 (at least) / buster
5423 ]
5424 if not check_units(ctx, units, enabler):
5425 logger.warning('No time sync service is running; checked for %s' % units)
5426 return False
5427 return True
5428
5429
5430 def command_check_host(ctx: CephadmContext) -> None:
5431 container_path = ctx.container_engine.path
5432
5433 errors = []
5434 commands = ['systemctl', 'lvcreate']
5435
5436 try:
5437 check_container_engine(ctx)
5438 logger.info('podman|docker (%s) is present' % container_path)
5439 except Error as e:
5440 errors.append(str(e))
5441
5442 for command in commands:
5443 try:
5444 find_program(command)
5445 logger.info('%s is present' % command)
5446 except ValueError:
5447 errors.append('%s binary does not appear to be installed' % command)
5448
5449 # check for configured+running chronyd or ntp
5450 if not check_time_sync(ctx):
5451 errors.append('No time synchronization is active')
5452
5453 if 'expect_hostname' in ctx and ctx.expect_hostname:
5454 if get_hostname().lower() != ctx.expect_hostname.lower():
5455 errors.append('hostname "%s" does not match expected hostname "%s"' % (
5456 get_hostname(), ctx.expect_hostname))
else:
5457 logger.info('Hostname "%s" matches what is expected.',
5458 ctx.expect_hostname)
5459
5460 if errors:
5461 raise Error('\nERROR: '.join(errors))
5462
5463 logger.info('Host looks OK')
5464
5465 ##################################
5466
5467
5468 def command_prepare_host(ctx: CephadmContext) -> None:
5469 logger.info('Verifying podman|docker is present...')
5470 pkg = None
5471 try:
5472 check_container_engine(ctx)
5473 except Error as e:
5474 logger.warning(str(e))
5475 if not pkg:
5476 pkg = create_packager(ctx)
5477 pkg.install_podman()
5478
5479 logger.info('Verifying lvm2 is present...')
5480 if not find_executable('lvcreate'):
5481 if not pkg:
5482 pkg = create_packager(ctx)
5483 pkg.install(['lvm2'])
5484
5485 logger.info('Verifying time synchronization is in place...')
5486 if not check_time_sync(ctx):
5487 if not pkg:
5488 pkg = create_packager(ctx)
5489 pkg.install(['chrony'])
5490 # check again, and this time try to enable
5491 # the service
5492 check_time_sync(ctx, enabler=pkg)
5493
5494 if 'expect_hostname' in ctx and ctx.expect_hostname and ctx.expect_hostname != get_hostname():
5495 logger.warning('Adjusting hostname from %s -> %s...' % (get_hostname(), ctx.expect_hostname))
5496 call_throws(ctx, ['hostname', ctx.expect_hostname])
5497 with open('/etc/hostname', 'w') as f:
5498 f.write(ctx.expect_hostname + '\n')
5499
5500 logger.info('Repeating the final host check...')
5501 command_check_host(ctx)
5502
5503 ##################################
5504
5505
5506 class CustomValidation(argparse.Action):
5507
5508 def _check_name(self, values):
5509 try:
5510 (daemon_type, daemon_id) = values.split('.', 1)
5511 except ValueError:
5512 raise argparse.ArgumentError(self,
5513 'must be of the format <type>.<id>. For example, osd.1 or prometheus.myhost.com')
5514
5515 daemons = get_supported_daemons()
5516 if daemon_type not in daemons:
5517 raise argparse.ArgumentError(self,
5518 'name must declare the type of daemon e.g. '
5519 '{}'.format(', '.join(daemons)))
5520
5521 def __call__(self, parser, namespace, values, option_string=None):
5522 if self.dest == 'name':
5523 self._check_name(values)
5524 setattr(namespace, self.dest, values)
5525 elif self.dest == 'exporter_config':
5526 cfg = get_parm(values)
5527 # run the class' validate method, and convert to an argparse error
5528 # if problems are found
5529 try:
5530 CephadmDaemon.validate_config(cfg)
5531 except Error as e:
5532 raise argparse.ArgumentError(self,
5533 str(e))
5534 setattr(namespace, self.dest, cfg)
5535
5536 ##################################
5537
5538
5539 def get_distro():
5540 # type: () -> Tuple[Optional[str], Optional[str], Optional[str]]
5541 distro = None
5542 distro_version = None
5543 distro_codename = None
5544 with open('/etc/os-release', 'r') as f:
5545 for line in f.readlines():
5546 line = line.strip()
5547 if '=' not in line or line.startswith('#'):
5548 continue
5549 (var, val) = line.split('=', 1)
5550 if val[0] == '"' and val[-1] == '"':
5551 val = val[1:-1]
5552 if var == 'ID':
5553 distro = val.lower()
5554 elif var == 'VERSION_ID':
5555 distro_version = val.lower()
5556 elif var == 'VERSION_CODENAME':
5557 distro_codename = val.lower()
5558 return distro, distro_version, distro_codename
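# Illustrative parse (hypothetical /etc/os-release):
#   ID=ubuntu
#   VERSION_ID="20.04"
#   VERSION_CODENAME=focal
# yields ('ubuntu', '20.04', 'focal'); quotes are stripped and values lowercased.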
5559
5560
5561 class Packager(object):
5562 def __init__(self, ctx: CephadmContext,
5563 stable=None, version=None, branch=None, commit=None):
5564 assert \
5565 (stable and not version and not branch and not commit) or \
5566 (not stable and version and not branch and not commit) or \
5567 (not stable and not version and branch) or \
5568 (not stable and not version and not branch and not commit)
5569 self.ctx = ctx
5570 self.stable = stable
5571 self.version = version
5572 self.branch = branch
5573 self.commit = commit
5574
5575 def add_repo(self):
5576 raise NotImplementedError
5577
5578 def rm_repo(self):
5579 raise NotImplementedError
5580
5581 def query_shaman(self, distro, distro_version, branch, commit):
5582 # query shaman
5583 logger.info('Fetching repo metadata from shaman and chacra...')
5584 shaman_url = 'https://shaman.ceph.com/api/repos/ceph/{branch}/{sha1}/{distro}/{distro_version}/repo/?arch={arch}'.format(
5585 distro=distro,
5586 distro_version=distro_version,
5587 branch=branch,
5588 sha1=commit or 'latest',
5589 arch=get_arch()
5590 )
5591 try:
5592 shaman_response = urlopen(shaman_url)
5593 except HTTPError as err:
5594 logger.error('repository not found in shaman (might not be available yet)')
5595 raise Error('%s, failed to fetch %s' % (err, shaman_url))
5596 chacra_url = ''
5597 try:
5598 chacra_url = shaman_response.geturl()
5599 chacra_response = urlopen(chacra_url)
5600 except HTTPError as err:
5601 logger.error('repository not found in chacra (might not be available yet)')
5602 raise Error('%s, failed to fetch %s' % (err, chacra_url))
5603 return chacra_response.read().decode('utf-8')
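# Illustrative query (hypothetical arguments): branch='master', commit=None on
# CentOS 8/x86_64 builds the URL
#   https://shaman.ceph.com/api/repos/ceph/master/latest/centos/8/repo/?arch=x86_64
# and returns the repo file served by the chacra instance shaman redirects to.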
5604
5605 def repo_gpgkey(self):
5606 if self.ctx.gpg_url:
5607 return self.ctx.gpg_url
5608 if self.stable or self.version:
5609 return 'https://download.ceph.com/keys/release.asc', 'release'
5610 else:
5611 return 'https://download.ceph.com/keys/autobuild.asc', 'autobuild'
5612
5613 def enable_service(self, service):
5614 """
5615 Start and enable the service (typically using systemd).
5616 """
5617 call_throws(self.ctx, ['systemctl', 'enable', '--now', service])
5618
5619
5620 class Apt(Packager):
5621 DISTRO_NAMES = {
5622 'ubuntu': 'ubuntu',
5623 'debian': 'debian',
5624 }
5625
5626 def __init__(self, ctx: CephadmContext,
5627 stable, version, branch, commit,
5628 distro, distro_version, distro_codename):
5629 super(Apt, self).__init__(ctx, stable=stable, version=version,
5630 branch=branch, commit=commit)
5631 self.ctx = ctx
5632 self.distro = self.DISTRO_NAMES[distro]
5633 self.distro_codename = distro_codename
5634 self.distro_version = distro_version
5635
5636 def repo_path(self):
5637 return '/etc/apt/sources.list.d/ceph.list'
5638
5639 def add_repo(self):
5640
5641 url, name = self.repo_gpgkey()
5642 logger.info('Installing repo GPG key from %s...' % url)
5643 try:
5644 response = urlopen(url)
5645 except HTTPError as err:
5646 logger.error('failed to fetch GPG repo key from %s: %s' % (
5647 url, err))
5648 raise Error('failed to fetch GPG key')
5649 key = response.read().decode('utf-8')
5650 with open('/etc/apt/trusted.gpg.d/ceph.%s.gpg' % name, 'w') as f:
5651 f.write(key)
5652
5653 if self.version:
5654 content = 'deb %s/debian-%s/ %s main\n' % (
5655 self.ctx.repo_url, self.version, self.distro_codename)
5656 elif self.stable:
5657 content = 'deb %s/debian-%s/ %s main\n' % (
5658 self.ctx.repo_url, self.stable, self.distro_codename)
5659 else:
5660 content = self.query_shaman(self.distro, self.distro_codename, self.branch,
5661 self.commit)
5662
5663 logger.info('Installing repo file at %s...' % self.repo_path())
5664 with open(self.repo_path(), 'w') as f:
5665 f.write(content)
5666
5667 def rm_repo(self):
5668 for name in ['autobuild', 'release']:
5669 p = '/etc/apt/trusted.gpg.d/ceph.%s.gpg' % name
5670 if os.path.exists(p):
5671 logger.info('Removing repo GPG key %s...' % p)
5672 os.unlink(p)
5673 if os.path.exists(self.repo_path()):
5674 logger.info('Removing repo at %s...' % self.repo_path())
5675 os.unlink(self.repo_path())
5676
5677 if self.distro == 'ubuntu':
5678 self.rm_kubic_repo()
5679
5680 def install(self, ls):
5681 logger.info('Installing packages %s...' % ls)
5682 call_throws(self.ctx, ['apt-get', 'install', '-y'] + ls)
5683
5684 def install_podman(self):
5685 if self.distro == 'ubuntu':
5686 logger.info('Setting up repo for podman...')
5687 self.add_kubic_repo()
5688 call_throws(self.ctx, ['apt-get', 'update'])
5689
5690 logger.info('Attempting podman install...')
5691 try:
5692 self.install(['podman'])
5693 except Error:
5694 logger.info('Podman did not work. Falling back to docker...')
5695 self.install(['docker.io'])
5696
5697 def kubic_repo_url(self):
5698 return 'https://download.opensuse.org/repositories/devel:/kubic:/' \
5699 'libcontainers:/stable/xUbuntu_%s/' % self.distro_version
5700
5701 def kubic_repo_path(self):
5702 return '/etc/apt/sources.list.d/devel:kubic:libcontainers:stable.list'
5703
5704 def kubic_repo_gpgkey_url(self):
5705 return '%s/Release.key' % self.kubic_repo_url()
5706
5707 def kubic_repo_gpgkey_path(self):
5708 return '/etc/apt/trusted.gpg.d/kubic.release.gpg'
5709
5710 def add_kubic_repo(self):
5711 url = self.kubic_repo_gpgkey_url()
5712 logger.info('Installing repo GPG key from %s...' % url)
5713 try:
5714 response = urlopen(url)
5715 except HTTPError as err:
5716 logger.error('failed to fetch GPG repo key from %s: %s' % (
5717 url, err))
5718 raise Error('failed to fetch GPG key')
5719 key = response.read().decode('utf-8')
5720 tmp_key = write_tmp(key, 0, 0)
5721 keyring = self.kubic_repo_gpgkey_path()
5722 call_throws(self.ctx, ['apt-key', '--keyring', keyring, 'add', tmp_key.name])
5723
5724 logger.info('Installing repo file at %s...' % self.kubic_repo_path())
5725 content = 'deb %s /\n' % self.kubic_repo_url()
5726 with open(self.kubic_repo_path(), 'w') as f:
5727 f.write(content)
5728
5729 def rm_kubic_repo(self):
5730 keyring = self.kubic_repo_gpgkey_path()
5731 if os.path.exists(keyring):
5732 logger.info('Removing repo GPG key %s...' % keyring)
5733 os.unlink(keyring)
5734
5735 p = self.kubic_repo_path()
5736 if os.path.exists(p):
5737 logger.info('Removing repo at %s...' % p)
5738 os.unlink(p)
5739
5740
5741 class YumDnf(Packager):
5742 DISTRO_NAMES = {
5743 'centos': ('centos', 'el'),
5744 'rhel': ('centos', 'el'),
5745 'scientific': ('centos', 'el'),
5746 'fedora': ('fedora', 'fc'),
5747 }
5748
5749 def __init__(self, ctx: CephadmContext,
5750 stable, version, branch, commit,
5751 distro, distro_version):
5752 super(YumDnf, self).__init__(ctx, stable=stable, version=version,
5753 branch=branch, commit=commit)
5754 self.ctx = ctx
5755 self.major = int(distro_version.split('.')[0])
5756 self.distro_normalized = self.DISTRO_NAMES[distro][0]
5757 self.distro_code = self.DISTRO_NAMES[distro][1] + str(self.major)
5758 if (self.distro_code == 'fc' and self.major >= 30) or \
5759 (self.distro_code == 'el' and self.major >= 8):
5760 self.tool = 'dnf'
5761 else:
5762 self.tool = 'yum'
5763
5764 def custom_repo(self, **kw):
5765 """
5766 Repo files need special care in that a whole line should not be present
5767 if there is no value for it. Because we were using `format()` we could
5768 not conditionally add a line for a repo file. So the end result would
5769 contain a key with a missing value (say if we were passing `None`).
5770
5771 For example, it could look like::
5772
5773 [ceph repo]
5774 name= ceph repo
5775 proxy=
5776 gpgcheck=
5777
5778 Which breaks. This function allows us to conditionally add lines,
5779 preserving an order and be more careful.
5780
5781 Previously, and for historical purposes, this is how the template used
5782 to look::
5783
5784 custom_repo =
5785 [{repo_name}]
5786 name={name}
5787 baseurl={baseurl}
5788 enabled={enabled}
5789 gpgcheck={gpgcheck}
5790 type={_type}
5791 gpgkey={gpgkey}
5792 proxy={proxy}
5793
5794 """
5795 lines = []
5796
5797 # by using tuples (vs a dict) we preserve the order of what we want to
5798 # return, like starting with a [repo name]
5799 tmpl = (
5800 ('reponame', '[%s]'),
5801 ('name', 'name=%s'),
5802 ('baseurl', 'baseurl=%s'),
5803 ('enabled', 'enabled=%s'),
5804 ('gpgcheck', 'gpgcheck=%s'),
5805 ('_type', 'type=%s'),
5806 ('gpgkey', 'gpgkey=%s'),
5807 ('proxy', 'proxy=%s'),
5808 ('priority', 'priority=%s'),
5809 )
5810
5811 for line in tmpl:
5812 tmpl_key, tmpl_value = line # key values from tmpl
5813
5814 # ensure that there is an actual value (not None nor empty string)
5815 if tmpl_key in kw and kw.get(tmpl_key) not in (None, ''):
5816 lines.append(tmpl_value % kw.get(tmpl_key))
5817
5818 return '\n'.join(lines)
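# Illustrative output (hypothetical values):
#   custom_repo(reponame='ceph', name='Ceph $basearch',
#               baseurl='https://download.ceph.com/rpm-pacific/el8/$basearch',
#               enabled=1, gpgcheck=1)
# emits only the lines that were given a value:
#   [ceph]
#   name=Ceph $basearch
#   baseurl=https://download.ceph.com/rpm-pacific/el8/$basearch
#   enabled=1
#   gpgcheck=1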
5819
5820 def repo_path(self):
5821 return '/etc/yum.repos.d/ceph.repo'
5822
5823 def repo_baseurl(self):
5824 assert self.stable or self.version
5825 if self.version:
5826 return '%s/rpm-%s/%s' % (self.ctx.repo_url, self.version,
5827 self.distro_code)
5828 else:
5829 return '%s/rpm-%s/%s' % (self.ctx.repo_url, self.stable,
5830 self.distro_code)
5831
5832 def add_repo(self):
5833 if self.stable or self.version:
5834 content = ''
5835 for n, t in {
5836 'Ceph': '$basearch',
5837 'Ceph-noarch': 'noarch',
5838 'Ceph-source': 'SRPMS'}.items():
5839 content += '[%s]\n' % (n)
5840 content += self.custom_repo(
5841 name='Ceph %s' % t,
5842 baseurl=self.repo_baseurl() + '/' + t,
5843 enabled=1,
5844 gpgcheck=1,
5845 gpgkey=self.repo_gpgkey()[0],
5846 )
5847 content += '\n\n'
5848 else:
5849 content = self.query_shaman(self.distro_normalized, self.major,
5850 self.branch,
5851 self.commit)
5852
5853 logger.info('Writing repo to %s...' % self.repo_path())
5854 with open(self.repo_path(), 'w') as f:
5855 f.write(content)
5856
5857 if self.distro_code.startswith('el'):
5858 logger.info('Enabling EPEL...')
5859 call_throws(self.ctx, [self.tool, 'install', '-y', 'epel-release'])
5860
5861 def rm_repo(self):
5862 if os.path.exists(self.repo_path()):
5863 os.unlink(self.repo_path())
5864
5865 def install(self, ls):
5866 logger.info('Installing packages %s...' % ls)
5867 call_throws(self.ctx, [self.tool, 'install', '-y'] + ls)
5868
5869 def install_podman(self):
5870 self.install(['podman'])
5871
5872
5873 class Zypper(Packager):
5874 DISTRO_NAMES = [
5875 'sles',
5876 'opensuse-tumbleweed',
5877 'opensuse-leap'
5878 ]
5879
5880 def __init__(self, ctx: CephadmContext,
5881 stable, version, branch, commit,
5882 distro, distro_version):
5883 super(Zypper, self).__init__(ctx, stable=stable, version=version,
5884 branch=branch, commit=commit)
5885 self.ctx = ctx
5886 self.tool = 'zypper'
5887 self.distro = 'opensuse'
5888 self.distro_version = '15.1'
5889 if 'tumbleweed' not in distro and distro_version is not None:
5890 self.distro_version = distro_version
5891
5892 def custom_repo(self, **kw):
5893 """
5894 See YumDnf for format explanation.
5895 """
5896 lines = []
5897
5898 # by using tuples (vs a dict) we preserve the order of what we want to
5899 # return, like starting with a [repo name]
5900 tmpl = (
5901 ('reponame', '[%s]'),
5902 ('name', 'name=%s'),
5903 ('baseurl', 'baseurl=%s'),
5904 ('enabled', 'enabled=%s'),
5905 ('gpgcheck', 'gpgcheck=%s'),
5906 ('_type', 'type=%s'),
5907 ('gpgkey', 'gpgkey=%s'),
5908 ('proxy', 'proxy=%s'),
5909 ('priority', 'priority=%s'),
5910 )
5911
5912 for line in tmpl:
5913 tmpl_key, tmpl_value = line # key values from tmpl
5914
5915 # ensure that there is an actual value (not None nor empty string)
5916 if tmpl_key in kw and kw.get(tmpl_key) not in (None, ''):
5917 lines.append(tmpl_value % kw.get(tmpl_key))
5918
5919 return '\n'.join(lines)
5920
5921 def repo_path(self):
5922 return '/etc/zypp/repos.d/ceph.repo'
5923
5924 def repo_baseurl(self):
5925 assert self.stable or self.version
5926 if self.version:
5927 return '%s/rpm-%s/%s' % (self.ctx.repo_url,
5928 self.version, self.distro)
5929 else:
5930 return '%s/rpm-%s/%s' % (self.ctx.repo_url,
5931 self.stable, self.distro)
5932
5933 def add_repo(self):
5934 if self.stable or self.version:
5935 content = ''
5936 for n, t in {
5937 'Ceph': '$basearch',
5938 'Ceph-noarch': 'noarch',
5939 'Ceph-source': 'SRPMS'}.items():
5940 content += '[%s]\n' % (n)
5941 content += self.custom_repo(
5942 name='Ceph %s' % t,
5943 baseurl=self.repo_baseurl() + '/' + t,
5944 enabled=1,
5945 gpgcheck=1,
5946 gpgkey=self.repo_gpgkey()[0],
5947 )
5948 content += '\n\n'
5949 else:
5950 content = self.query_shaman(self.distro, self.distro_version,
5951 self.branch,
5952 self.commit)
5953
5954 logger.info('Writing repo to %s...' % self.repo_path())
5955 with open(self.repo_path(), 'w') as f:
5956 f.write(content)
5957
5958 def rm_repo(self):
5959 if os.path.exists(self.repo_path()):
5960 os.unlink(self.repo_path())
5961
5962 def install(self, ls):
5963 logger.info('Installing packages %s...' % ls)
5964 call_throws(self.ctx, [self.tool, 'in', '-y'] + ls)
5965
5966 def install_podman(self):
5967 self.install(['podman'])
5968
5969
5970 def create_packager(ctx: CephadmContext,
5971 stable=None, version=None, branch=None, commit=None):
5972 distro, distro_version, distro_codename = get_distro()
5973 if distro in YumDnf.DISTRO_NAMES:
5974 return YumDnf(ctx, stable=stable, version=version,
5975 branch=branch, commit=commit,
5976 distro=distro, distro_version=distro_version)
5977 elif distro in Apt.DISTRO_NAMES:
5978 return Apt(ctx, stable=stable, version=version,
5979 branch=branch, commit=commit,
5980 distro=distro, distro_version=distro_version,
5981 distro_codename=distro_codename)
5982 elif distro in Zypper.DISTRO_NAMES:
5983 return Zypper(ctx, stable=stable, version=version,
5984 branch=branch, commit=commit,
5985 distro=distro, distro_version=distro_version)
5986 raise Error('Distro %s version %s not supported' % (distro, distro_version))
5987
5988
5989 def command_add_repo(ctx: CephadmContext):
5990 if ctx.version and ctx.release:
5991 raise Error('you can specify either --release or --version but not both')
5992 if not ctx.version and not ctx.release and not ctx.dev and not ctx.dev_commit:
5993 raise Error('please supply a --release, --version, --dev or --dev-commit argument')
5994 if ctx.version:
5995 try:
5996 (x, y, z) = ctx.version.split('.')
5997 except Exception:
5998 raise Error('version must be in the form x.y.z (e.g., 15.2.0)')
5999
6000 pkg = create_packager(ctx, stable=ctx.release,
6001 version=ctx.version,
6002 branch=ctx.dev,
6003 commit=ctx.dev_commit)
6004 pkg.add_repo()
6005
6006
6007 def command_rm_repo(ctx: CephadmContext):
6008 pkg = create_packager(ctx)
6009 pkg.rm_repo()
6010
6011
6012 def command_install(ctx: CephadmContext):
6013 pkg = create_packager(ctx)
6014 pkg.install(ctx.packages)
6015
6016 ##################################
6017
6018
6019 def get_ipv4_address(ifname):
6020 # type: (str) -> str
6021 def _extract(sock, offset):
6022 return socket.inet_ntop(
6023 socket.AF_INET,
6024 fcntl.ioctl(
6025 sock.fileno(),
6026 offset,
6027 struct.pack('256s', bytes(ifname[:15], 'utf-8'))
6028 )[20:24])
6029
6030 s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
6031 try:
6032 addr = _extract(s, 35093) # 0x8915 = SIOCGIFADDR
6033 dq_mask = _extract(s, 35099) # 0x891b = SIOCGIFNETMASK
6034 except OSError:
6035 # interface does not have an ipv4 address
6036 return ''
6037
6038 dec_mask = sum([bin(int(i)).count('1')
6039 for i in dq_mask.split('.')])
6040 return '{}/{}'.format(addr, dec_mask)
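# Illustrative result (hypothetical interface): for an eth0 configured as
# 192.168.1.10 with netmask 255.255.255.0, the two ioctls above yield
#   get_ipv4_address('eth0') == '192.168.1.10/24'
# (24 is the count of set bits across the dotted-quad mask).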
6041
6042
6043 def get_ipv6_address(ifname):
6044 # type: (str) -> str
6045 if not os.path.exists('/proc/net/if_inet6'):
6046 return ''
6047
6048 raw = read_file(['/proc/net/if_inet6'])
6049 data = raw.splitlines()
6050 # based on docs @ https://www.tldp.org/HOWTO/Linux+IPv6-HOWTO/ch11s04.html
6051 # field 0 is the ipv6 address, field 2 is the prefix length
6052 for iface_setting in data:
6053 field = iface_setting.split()
6054 if field[-1] == ifname:
6055 ipv6_raw = field[0]
6056 ipv6_fmtd = ':'.join([ipv6_raw[_p:_p + 4] for _p in range(0, len(field[0]), 4)])
6057 # apply naming rules using ipaddress module
6058 ipv6 = ipaddress.ip_address(ipv6_fmtd)
6059 return '{}/{}'.format(str(ipv6), int('0x{}'.format(field[2]), 16))
6060 return ''
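# Illustrative /proc/net/if_inet6 row (hypothetical):
#   fe80000000000000020c29fffe9c0001 02 40 20 80 eth0
# field 0 is the raw address and field 2 the prefix length (0x40 == 64), so
# eth0 would be reported as 'fe80::20c:29ff:fe9c:1/64'.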
6061
6062
6063 def bytes_to_human(num, mode='decimal'):
6064 # type: (float, str) -> str
6065 """Convert a bytes value into it's human-readable form.
6066
6067 :param num: number, in bytes, to convert
6068 :param mode: Either decimal (default) or binary to determine divisor
6069 :returns: string representing the bytes value in a more readable format
6070 """
6071 unit_list = ['', 'KB', 'MB', 'GB', 'TB', 'PB', 'EB', 'ZB']
6072 divisor = 1000.0
6073 yotta = 'YB'
6074
6075 if mode == 'binary':
6076 unit_list = ['', 'KiB', 'MiB', 'GiB', 'TiB', 'PiB', 'EiB', 'ZiB']
6077 divisor = 1024.0
6078 yotta = 'YiB'
6079
6080 for unit in unit_list:
6081 if abs(num) < divisor:
6082 return '%3.1f%s' % (num, unit)
6083 num /= divisor
6084 return '%.1f%s' % (num, yotta)
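# Illustrative conversions (hypothetical inputs):
#   >>> bytes_to_human(1000000)
#   '1.0MB'
#   >>> bytes_to_human(1048576, mode='binary')
#   '1.0MiB'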
6085
6086
6087 def read_file(path_list, file_name=''):
6088 # type: (List[str], str) -> str
6089 """Returns the content of the first file found within the `path_list`
6090
6091 :param path_list: list of file paths to search
6092 :param file_name: optional file_name to be applied to a file path
6093 :returns: content of the file or 'Unknown'
6094 """
6095 for path in path_list:
6096 if file_name:
6097 file_path = os.path.join(path, file_name)
6098 else:
6099 file_path = path
6100 if os.path.exists(file_path):
6101 with open(file_path, 'r') as f:
6102 try:
6103 content = f.read().strip()
6104 except OSError:
6105 # sysfs may populate the file, but for devices like
6106 # virtio reads can fail
6107 return 'Unknown'
6108 else:
6109 return content
6110 return 'Unknown'
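# Illustrative use (hypothetical host): read_file(['/sys/class/dmi/id'],
# 'sys_vendor') returns the stripped contents of the first path that exists
# (e.g. 'Dell Inc.'), and 'Unknown' when no candidate exists or the read fails.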
6111
6112 ##################################
6113
6114
6115 class HostFacts():
6116 _dmi_path_list = ['/sys/class/dmi/id']
6117 _nic_path_list = ['/sys/class/net']
6118 _selinux_path_list = ['/etc/selinux/config']
6119 _apparmor_path_list = ['/etc/apparmor']
6120 _disk_vendor_workarounds = {
6121 '0x1af4': 'Virtio Block Device'
6122 }
6123
6124 def __init__(self, ctx: CephadmContext):
6125 self.ctx: CephadmContext = ctx
6126 self.cpu_model: str = 'Unknown'
6127 self.cpu_count: int = 0
6128 self.cpu_cores: int = 0
6129 self.cpu_threads: int = 0
6130 self.interfaces: Dict[str, Any] = {}
6131
6132 self._meminfo: List[str] = read_file(['/proc/meminfo']).splitlines()
6133 self._get_cpuinfo()
6134 self._process_nics()
6135 self.arch: str = platform.processor()
6136 self.kernel: str = platform.release()
6137
6138 def _get_cpuinfo(self):
6139 # type: () -> None
6140 """Determine cpu information via /proc/cpuinfo"""
6141 raw = read_file(['/proc/cpuinfo'])
6142 output = raw.splitlines()
6143 cpu_set = set()
6144
6145 for line in output:
6146 field = [f.strip() for f in line.split(':')]
6147 if 'model name' in line:
6148 self.cpu_model = field[1]
6149 if 'physical id' in line:
6150 cpu_set.add(field[1])
6151 if 'siblings' in line:
6152 self.cpu_threads = int(field[1].strip())
6153 if 'cpu cores' in line:
6154 self.cpu_cores = int(field[1].strip())
6156 self.cpu_count = len(cpu_set)
6157
6158 def _get_block_devs(self):
6159 # type: () -> List[str]
6160 """Determine the list of block devices by looking at /sys/block"""
6161 return [dev for dev in os.listdir('/sys/block')
6162 if not dev.startswith('dm')]
6163
6164 def _get_devs_by_type(self, rota='0'):
6165 # type: (str) -> List[str]
6166 """Filter block devices by a given rotational attribute (0=flash, 1=spinner)"""
6167 devs = list()
6168 for blk_dev in self._get_block_devs():
6169 rot_path = '/sys/block/{}/queue/rotational'.format(blk_dev)
6170 rot_value = read_file([rot_path])
6171 if rot_value == rota:
6172 devs.append(blk_dev)
6173 return devs
6174
6175 @property
6176 def operating_system(self):
6177 # type: () -> str
6178 """Determine OS version"""
6179 raw_info = read_file(['/etc/os-release'])
6180 os_release = raw_info.splitlines()
6181 rel_str = 'Unknown'
6182 rel_dict = dict()
6183
6184 for line in os_release:
6185 if '=' in line:
6186 var_name, var_value = line.split('=', 1)
6187 rel_dict[var_name] = var_value.strip('"')
6188
6189 # Would normally use PRETTY_NAME, but NAME and VERSION are more
6190 # consistent
6191 if all(_v in rel_dict for _v in ['NAME', 'VERSION']):
6192 rel_str = '{} {}'.format(rel_dict['NAME'], rel_dict['VERSION'])
6193 return rel_str
6194
6195 @property
6196 def hostname(self):
6197 # type: () -> str
6198 """Return the hostname"""
6199 return platform.node()
6200
6201 @property
6202 def subscribed(self):
6203 # type: () -> str
6204 """Highlevel check to see if the host is subscribed to receive updates/support"""
6205 def _red_hat():
6206 # type: () -> str
6207 # RHEL 7 and RHEL 8
6208 entitlements_dir = '/etc/pki/entitlement'
6209 if os.path.exists(entitlements_dir):
6210 pems = glob('{}/*.pem'.format(entitlements_dir))
6211 if len(pems) >= 2:
6212 return 'Yes'
6213
6214 return 'No'
6215
6216 os_name = self.operating_system
6217 if os_name.upper().startswith('RED HAT'):
6218 return _red_hat()
6219
6220 return 'Unknown'
6221
6222 @property
6223 def hdd_count(self):
6224 # type: () -> int
6225 """Return a count of HDDs (spinners)"""
6226 return len(self._get_devs_by_type(rota='1'))
6227
6228 def _get_capacity(self, dev):
6229 # type: (str) -> int
6230 """Determine the size of a given device"""
6231 size_path = os.path.join('/sys/block', dev, 'size')
6232 size_blocks = int(read_file([size_path]))
6233 blk_path = os.path.join('/sys/block', dev, 'queue', 'logical_block_size')
6234 blk_count = int(read_file([blk_path]))
6235 return size_blocks * blk_count
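# Illustrative arithmetic (hypothetical device): a disk exposing
# size == 976773168 sectors with a 512-byte logical block size is reported as
# 976773168 * 512 = 500107862016 bytes (~500GB).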
6236
6237 def _get_capacity_by_type(self, rota='0'):
6238 # type: (str) -> int
6239 """Return the total capacity of a category of device (flash or hdd)"""
6240 devs = self._get_devs_by_type(rota=rota)
6241 capacity = 0
6242 for dev in devs:
6243 capacity += self._get_capacity(dev)
6244 return capacity
6245
6246 def _dev_list(self, dev_list):
6247 # type: (List[str]) -> List[Dict[str, object]]
6248 """Return a 'pretty' name list for each device in the `dev_list`"""
6249 disk_list = list()
6250
6251 for dev in dev_list:
6252 disk_model = read_file(['/sys/block/{}/device/model'.format(dev)]).strip()
6253 disk_rev = read_file(['/sys/block/{}/device/rev'.format(dev)]).strip()
6254 disk_wwid = read_file(['/sys/block/{}/device/wwid'.format(dev)]).strip()
6255 vendor = read_file(['/sys/block/{}/device/vendor'.format(dev)]).strip()
6256 disk_vendor = HostFacts._disk_vendor_workarounds.get(vendor, vendor)
6257 disk_size_bytes = self._get_capacity(dev)
6258 disk_list.append({
6259 'description': '{} {} ({})'.format(disk_vendor, disk_model, bytes_to_human(disk_size_bytes)),
6260 'vendor': disk_vendor,
6261 'model': disk_model,
6262 'rev': disk_rev,
6263 'wwid': disk_wwid,
6264 'dev_name': dev,
6265 'disk_size_bytes': disk_size_bytes,
6266 })
6267 return disk_list
6268
6269 @property
6270 def hdd_list(self):
6271 # type: () -> List[Dict[str, object]]
6272 """Return a list of devices that are HDDs (spinners)"""
6273 devs = self._get_devs_by_type(rota='1')
6274 return self._dev_list(devs)
6275
6276 @property
6277 def flash_list(self):
6278 # type: () -> List[Dict[str, object]]
6279 """Return a list of devices that are flash based (SSD, NVMe)"""
6280 devs = self._get_devs_by_type(rota='0')
6281 return self._dev_list(devs)
6282
6283 @property
6284 def hdd_capacity_bytes(self):
6285 # type: () -> int
6286 """Return the total capacity for all HDD devices (bytes)"""
6287 return self._get_capacity_by_type(rota='1')
6288
6289 @property
6290 def hdd_capacity(self):
6291 # type: () -> str
6292 """Return the total capacity for all HDD devices (human readable format)"""
6293 return bytes_to_human(self.hdd_capacity_bytes)
6294
6295 @property
6296 def cpu_load(self):
6297 # type: () -> Dict[str, float]
6298 """Return the cpu load average data for the host"""
6299 raw = read_file(['/proc/loadavg']).strip()
6300 data = raw.split()
6301 return {
6302 '1min': float(data[0]),
6303 '5min': float(data[1]),
6304 '15min': float(data[2]),
6305 }
6306
6307 @property
6308 def flash_count(self):
6309 # type: () -> int
6310 """Return the number of flash devices in the system (SSD, NVMe)"""
6311 return len(self._get_devs_by_type(rota='0'))
6312
6313 @property
6314 def flash_capacity_bytes(self):
6315 # type: () -> int
6316 """Return the total capacity for all flash devices (bytes)"""
6317 return self._get_capacity_by_type(rota='0')
6318
6319 @property
6320 def flash_capacity(self):
6321 # type: () -> str
6322 """Return the total capacity for all Flash devices (human readable format)"""
6323 return bytes_to_human(self.flash_capacity_bytes)
6324
6325 def _process_nics(self):
6326 # type: () -> None
6327 """Look at the NIC devices and extract network related metadata"""
6328 # from https://github.com/torvalds/linux/blob/master/include/uapi/linux/if_arp.h
6329 hw_lookup = {
6330 '1': 'ethernet',
6331 '32': 'infiniband',
6332 '772': 'loopback',
6333 }
6334
6335 for nic_path in HostFacts._nic_path_list:
6336 if not os.path.exists(nic_path):
6337 continue
6338 for iface in os.listdir(nic_path):
6339
6340 lower_devs_list = [os.path.basename(link.replace('lower_', '')) for link in glob(os.path.join(nic_path, iface, 'lower_*'))]
6341 upper_devs_list = [os.path.basename(link.replace('upper_', '')) for link in glob(os.path.join(nic_path, iface, 'upper_*'))]
6342
6343 try:
6344 mtu = int(read_file([os.path.join(nic_path, iface, 'mtu')]))
6345 except ValueError:
6346 mtu = 0
6347
6348 operstate = read_file([os.path.join(nic_path, iface, 'operstate')])
6349 try:
6350 speed = int(read_file([os.path.join(nic_path, iface, 'speed')]))
6351 except (OSError, ValueError):
6352 # OSError : device doesn't support the ethtool get_link_ksettings
6353 # ValueError : raised when the read fails, and returns Unknown
6354 #
6355 # Either way, we show a -1 when speed isn't available
6356 speed = -1
6357
6358 if os.path.exists(os.path.join(nic_path, iface, 'bridge')):
6359 nic_type = 'bridge'
6360 elif os.path.exists(os.path.join(nic_path, iface, 'bonding')):
6361 nic_type = 'bonding'
6362 else:
6363 nic_type = hw_lookup.get(read_file([os.path.join(nic_path, iface, 'type')]), 'Unknown')
6364
6365 dev_link = os.path.join(nic_path, iface, 'device')
6366 if os.path.exists(dev_link):
6367 iftype = 'physical'
6368 driver_path = os.path.join(dev_link, 'driver')
6369 if os.path.exists(driver_path):
6370 driver = os.path.basename(os.path.realpath(driver_path))
6371 else:
6372 driver = 'Unknown'
6373
6374 else:
6375 iftype = 'logical'
6376 driver = ''
6377
6378 self.interfaces[iface] = {
6379 'mtu': mtu,
6380 'upper_devs_list': upper_devs_list,
6381 'lower_devs_list': lower_devs_list,
6382 'operstate': operstate,
6383 'iftype': iftype,
6384 'nic_type': nic_type,
6385 'driver': driver,
6386 'speed': speed,
6387 'ipv4_address': get_ipv4_address(iface),
6388 'ipv6_address': get_ipv6_address(iface),
6389 }
6390
6391 @property
6392 def nic_count(self):
6393 # type: () -> int
6394 """Return a total count of all physical NICs detected in the host"""
6395 phys_devs = []
6396 for iface in self.interfaces:
6397 if self.interfaces[iface]['iftype'] == 'physical':
6398 phys_devs.append(iface)
6399 return len(phys_devs)
6400
6401 def _get_mem_data(self, field_name):
6402 # type: (str) -> int
6403 for line in self._meminfo:
6404 if line.startswith(field_name):
6405 _d = line.split()
6406 return int(_d[1])
6407 return 0
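# Illustrative /proc/meminfo row (hypothetical):
#   MemTotal:       16348356 kB
# makes _get_mem_data('MemTotal') return 16348356 (the value is already in kB).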
6408
6409 @property
6410 def memory_total_kb(self):
6411 # type: () -> int
6412 """Determine the memory installed (kb)"""
6413 return self._get_mem_data('MemTotal')
6414
6415 @property
6416 def memory_free_kb(self):
6417 # type: () -> int
6418 """Determine the memory free (not cache, immediately usable)"""
6419 return self._get_mem_data('MemFree')
6420
6421 @property
6422 def memory_available_kb(self):
6423 # type: () -> int
6424 """Determine the memory available to new applications without swapping"""
6425 return self._get_mem_data('MemAvailable')
6426
6427 @property
6428 def vendor(self):
6429 # type: () -> str
6430 """Determine server vendor from DMI data in sysfs"""
6431 return read_file(HostFacts._dmi_path_list, 'sys_vendor')
6432
6433 @property
6434 def model(self):
6435 # type: () -> str
6436 """Determine server model information from DMI data in sysfs"""
6437 family = read_file(HostFacts._dmi_path_list, 'product_family')
6438 product = read_file(HostFacts._dmi_path_list, 'product_name')
6439 if family == 'Unknown' and product:
6440 return '{}'.format(product)
6441
6442 return '{} ({})'.format(family, product)
6443
6444 @property
6445 def bios_version(self):
6446 # type: () -> str
6447 """Determine server BIOS version from DMI data in sysfs"""
6448 return read_file(HostFacts._dmi_path_list, 'bios_version')
6449
6450 @property
6451 def bios_date(self):
6452 # type: () -> str
6453 """Determine server BIOS date from DMI data in sysfs"""
6454 return read_file(HostFacts._dmi_path_list, 'bios_date')
6455
6456 @property
6457 def timestamp(self):
6458 # type: () -> float
6459 """Return the current time as Epoch seconds"""
6460 return time.time()
6461
6462 @property
6463 def system_uptime(self):
6464 # type: () -> float
6465 """Return the system uptime (in secs)"""
6466 raw_time = read_file(['/proc/uptime'])
6467 up_secs, _ = raw_time.split()
6468 return float(up_secs)
6469
6470 @property
6471 def kernel_security(self):
6472 # type: () -> Dict[str, str]
6473 """Determine the security features enabled in the kernel - SELinux, AppArmor"""
6474 def _fetch_selinux() -> Dict[str, str]:
6475 """Read the selinux config file to determine state"""
6476 security = {}
6477 for selinux_path in HostFacts._selinux_path_list:
6478 if os.path.exists(selinux_path):
6479 selinux_config = read_file([selinux_path]).splitlines()
6480 security['type'] = 'SELinux'
6481 for line in selinux_config:
6482 if line.strip().startswith('#'):
6483 continue
6484 k, v = line.split('=')
6485 security[k] = v
6486 if security['SELINUX'].lower() == 'disabled':
6487 security['description'] = 'SELinux: Disabled'
6488 else:
6489 security['description'] = 'SELinux: Enabled({}, {})'.format(security['SELINUX'], security['SELINUXTYPE'])
6490 return security
6491 return {}
6492
6493 def _fetch_apparmor() -> Dict[str, str]:
6494 """Read the apparmor profiles directly, returning an overview of AppArmor status"""
6495 security = {}
6496 for apparmor_path in HostFacts._apparmor_path_list:
6497 if os.path.exists(apparmor_path):
6498 security['type'] = 'AppArmor'
6499 security['description'] = 'AppArmor: Enabled'
6500 try:
6501 profiles = read_file(['/sys/kernel/security/apparmor/profiles'])
6502 except OSError:
6503 pass
6504 else:
6505 summary = {} # type: Dict[str, int]
6506 for line in profiles.split('\n'):
6507 item, mode = line.rsplit(' ', 1)
6508 mode = mode.strip('()')
6509 if mode in summary:
6510 summary[mode] += 1
6511 else:
6512 summary[mode] = 1
6513 summary_str = ','.join(['{} {}'.format(v, k) for k, v in summary.items()])
6514 security = {**security, **summary} # type: ignore
6515 security['description'] += '({})'.format(summary_str)
6516
6517 return security
6518 return {}
6519
6520 ret = {}
6521 if os.path.exists('/sys/kernel/security/lsm'):
6522 lsm = read_file(['/sys/kernel/security/lsm']).strip()
6523 if 'selinux' in lsm:
6524 ret = _fetch_selinux()
6525 elif 'apparmor' in lsm:
6526 ret = _fetch_apparmor()
6527 else:
6528 return {
6529 'type': 'Unknown',
6530 'description': 'Linux Security Module framework is active, but is not using SELinux or AppArmor'
6531 }
6532
6533 if ret:
6534 return ret
6535
6536 return {
6537 'type': 'None',
6538 'description': 'Linux Security Module framework is not available'
6539 }
6540
6541 @property
6542 def selinux_enabled(self):
6543 return (self.kernel_security['type'] == 'SELinux') and \
6544 (self.kernel_security['description'] != 'SELinux: Disabled')
6545
6546 @property
6547 def kernel_parameters(self):
6548 # type: () -> Dict[str, str]
6549 """Get kernel parameters required/used in Ceph clusters"""
6550
6551 k_param = {}
6552 out, _, _ = call_throws(self.ctx, ['sysctl', '-a'], verbosity=CallVerbosity.SILENT)
6553 if out:
6554 param_list = out.split('\n')
6555 param_dict = {param.split(' = ')[0]: param.split(' = ')[-1] for param in param_list}
6556
6557 # return only desired parameters
6558 if 'net.ipv4.ip_nonlocal_bind' in param_dict:
6559 k_param['net.ipv4.ip_nonlocal_bind'] = param_dict['net.ipv4.ip_nonlocal_bind']
6560
6561 return k_param
6562
6563 def dump(self):
6564 # type: () -> str
6565 """Return the attributes of this HostFacts object as json"""
6566 data = {
6567 k: getattr(self, k) for k in dir(self)
6568 if not k.startswith('_')
6569 and isinstance(getattr(self, k), (float, int, str, list, dict, tuple))
6570 }
6571 return json.dumps(data, indent=2, sort_keys=True)
6572
6573 ##################################
6574
6575
6576 def command_gather_facts(ctx: CephadmContext):
6577 """gather_facts is intended to provide host releated metadata to the caller"""
6578 host = HostFacts(ctx)
6579 print(host.dump())
6580
6581 ##################################
6582
6583
6584 def command_verify_prereqs(ctx: CephadmContext):
6585 if ctx.service_type == 'haproxy' or ctx.service_type == 'keepalived':
6586 out, err, code = call(
6587 ctx, ['sysctl', '-n', 'net.ipv4.ip_nonlocal_bind']
6588 )
6589 if out.strip() != '1':
6590 raise Error('net.ipv4.ip_nonlocal_bind not set to 1')
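# Illustrative remediation (run on the host; hypothetical, not issued by this
# script): haproxy/keepalived hosts typically need
#   sysctl -w net.ipv4.ip_nonlocal_bind=1
# persisted via a drop-in under /etc/sysctl.d/ for this check to pass.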
6591
6592 ##################################
6593
6594
6595 class CephadmCache:
6596 task_types = ['disks', 'daemons', 'host', 'http_server']
6597
6598 def __init__(self):
6599 self.started_epoch_secs = time.time()
6600 self.tasks = {
6601 'daemons': 'inactive',
6602 'disks': 'inactive',
6603 'host': 'inactive',
6604 'http_server': 'inactive',
6605 }
6606 self.errors = []
6607 self.disks = {}
6608 self.daemons = {}
6609 self.host = {}
6610 self.lock = RLock()
6611
6612 @property
6613 def health(self):
6614 return {
6615 'started_epoch_secs': self.started_epoch_secs,
6616 'tasks': self.tasks,
6617 'errors': self.errors,
6618 }
6619
6620 def to_json(self):
6621 return {
6622 'health': self.health,
6623 'host': self.host,
6624 'daemons': self.daemons,
6625 'disks': self.disks,
6626 }
6627
6628 def update_health(self, task_type, task_status, error_msg=None):
6629 assert task_type in CephadmCache.task_types
6630 with self.lock:
6631 self.tasks[task_type] = task_status
6632 if error_msg:
6633 self.errors.append(error_msg)
6634
6635 def update_task(self, task_type, content):
6636 assert task_type in CephadmCache.task_types
6637 assert isinstance(content, dict)
6638 with self.lock:
6639 current = getattr(self, task_type)
6640 for k in content:
6641 current[k] = content[k]
6642
6643 setattr(self, task_type, current)
6644
6645
6646 class CephadmHTTPServer(ThreadingMixIn, HTTPServer):
6647 allow_reuse_address = True
6648 daemon_threads = True
6649 cephadm_cache: CephadmCache
6650 token: str
6651
6652
6653 class CephadmDaemonHandler(BaseHTTPRequestHandler):
6654 server: CephadmHTTPServer
6655 api_version = 'v1'
6656 valid_routes = [
6657 f'/{api_version}/metadata',
6658 f'/{api_version}/metadata/health',
6659 f'/{api_version}/metadata/disks',
6660 f'/{api_version}/metadata/daemons',
6661 f'/{api_version}/metadata/host',
6662 ]
6663
6664 class Decorators:
6665 @classmethod
6666 def authorize(cls, f):
6667 """Implement a basic token check.
6668
6669 The token is installed at deployment time and must be provided to
6670 ensure we only respond to callers who know our token i.e. mgr
6671 """
6672 def wrapper(self, *args, **kwargs):
6673 auth = self.headers.get('Authorization', None)
6674 if auth != 'Bearer ' + self.server.token:
6675 self.send_error(401)
6676 return
6677 f(self, *args, **kwargs)
6678 return wrapper
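# Illustrative authorized request (hypothetical host and token), assuming the
# default exporter port 9443:
#   curl -k -H "Authorization: Bearer <token>" \
#       https://myhost:9443/v1/metadata/health
# Requests without the matching bearer token receive a 401.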
6679
6680 def _help_page(self):
6681 return """<!DOCTYPE html>
6682 <html>
6683 <head><title>cephadm metadata exporter</title></head>
6684 <style>
6685 body {{
6686 font-family: sans-serif;
6687 font-size: 0.8em;
6688 }}
6689 table {{
6690 border-width: 0px;
6691 border-spacing: 0px;
6692 margin-left:20px;
6693 }}
6694 tr:hover {{
6695 background: PowderBlue;
6696 }}
6697 td,th {{
6698 padding: 5px;
6699 }}
6700 </style>
6701 <body>
6702 <h1>cephadm metadata exporter {api_version}</h1>
6703 <table>
6704 <thead>
6705 <tr><th>Endpoint</th><th>Methods</th><th>Response</th><th>Description</th></tr>
6706 </thead>
6707 <tr><td><a href='{api_version}/metadata'>{api_version}/metadata</a></td><td>GET</td><td>JSON</td><td>Return <b>all</b> metadata for the host</td></tr>
6708 <tr><td><a href='{api_version}/metadata/daemons'>{api_version}/metadata/daemons</a></td><td>GET</td><td>JSON</td><td>Return daemon and systemd states for ceph daemons (ls)</td></tr>
6709 <tr><td><a href='{api_version}/metadata/disks'>{api_version}/metadata/disks</a></td><td>GET</td><td>JSON</td><td>show disk inventory (ceph-volume)</td></tr>
6710 <tr><td><a href='{api_version}/metadata/health'>{api_version}/metadata/health</a></td><td>GET</td><td>JSON</td><td>Show current health of the exporter sub-tasks</td></tr>
6711 <tr><td><a href='{api_version}/metadata/host'>{api_version}/metadata/host</a></td><td>GET</td><td>JSON</td><td>Show host metadata (gather-facts)</td></tr>
6712 </table>
6713 </body>
6714 </html>""".format(api_version=CephadmDaemonHandler.api_version)
6715
6716 def _fetch_root(self):
6717 self.send_response(200)
6718 self.send_header('Content-type', 'text/html; charset=utf-8')
6719 self.end_headers()
6720 self.wfile.write(self._help_page().encode('utf-8'))
6721
6722 @Decorators.authorize
6723 def do_GET(self):
6724 """Handle *all* GET requests"""
6725
6726 if self.path == '/':
6727 # provide a html response if someone hits the root url, to document the
6728 # available api endpoints
6729 return self._fetch_root()
6730 elif self.path in CephadmDaemonHandler.valid_routes:
6731 u = self.path.split('/')[-1]
6732 data = json.dumps({})
6733 status_code = 200
6734
6735 tasks = self.server.cephadm_cache.health.get('tasks', {})
6736 assert tasks
6737
6738 # We're using the http status code to help indicate thread health
6739 # - 200 (OK): request successful
6740 # - 204 (No Content): access to a cache relating to a dead thread
6741 # - 206 (Partial content): one or more threads are inactive
6742 # - 500 (Server Error): all threads inactive
6743 if u == 'metadata':
6744 data = json.dumps(self.server.cephadm_cache.to_json())
6745 if all([tasks[task_name] == 'inactive' for task_name in tasks if task_name != 'http_server']):
6746 # All the subtasks are dead!
6747 status_code = 500
6748 elif any([tasks[task_name] == 'inactive' for task_name in tasks if task_name != 'http_server']):
6749 status_code = 206
6750
6751 # Individual GETs against a task's endpoint will also return a 204 if the corresponding thread is inactive
6752 elif u == 'daemons':
6753 data = json.dumps(self.server.cephadm_cache.daemons)
6754 if tasks['daemons'] == 'inactive':
6755 status_code = 204
6756 elif u == 'disks':
6757 data = json.dumps(self.server.cephadm_cache.disks)
6758 if tasks['disks'] == 'inactive':
6759 status_code = 204
6760 elif u == 'host':
6761 data = json.dumps(self.server.cephadm_cache.host)
6762 if tasks['host'] == 'inactive':
6763 status_code = 204
6764
6765 # a GET against health will always return a 200, since the op is always successful
6766 elif u == 'health':
6767 data = json.dumps(self.server.cephadm_cache.health)
6768
6769 self.send_response(status_code)
6770 self.send_header('Content-type', 'application/json')
6771 self.end_headers()
6772 self.wfile.write(data.encode('utf-8'))
6773 else:
6774 # Invalid GET URL
6775 bad_request_msg = 'Valid URLs are: {}'.format(', '.join(CephadmDaemonHandler.valid_routes))
6776 self.send_response(404, message=bad_request_msg) # reason
6777 self.send_header('Content-type', 'application/json')
6778 self.end_headers()
6779 self.wfile.write(json.dumps({'message': bad_request_msg}).encode('utf-8'))
6780
6781 def log_message(self, format, *args):
6782 rqst = ' '.join(str(a) for a in args)
6783 logger.info(f'client:{self.address_string()} [{self.log_date_time_string()}] {rqst}')
6784
6785
6786 class CephadmDaemon():
6787
6788 daemon_type = 'cephadm-exporter'
6789 default_port = 9443
6790 key_name = 'key'
6791 crt_name = 'crt'
6792 token_name = 'token'
6793 config_requirements = [
6794 key_name,
6795 crt_name,
6796 token_name,
6797 ]
6798 loop_delay = 1
6799 thread_check_interval = 5
6800
6801 def __init__(self, ctx: CephadmContext, fsid, daemon_id=None, port=None):
6802 self.ctx = ctx
6803 self.fsid = fsid
6804 self.daemon_id = daemon_id
6805 if not port:
6806 self.port = CephadmDaemon.default_port
6807 else:
6808 self.port = port
6809 self.workers: List[Thread] = []
6810 self.http_server: CephadmHTTPServer
6811 self.stop = False
6812 self.cephadm_cache = CephadmCache()
6813 self.errors: List[str] = []
6814 self.token = read_file([os.path.join(self.daemon_path, CephadmDaemon.token_name)])
6815
6816 @classmethod
6817 def validate_config(cls, config):
6818 reqs = ', '.join(CephadmDaemon.config_requirements)
6819 errors = []
6820
6821 if not config or not all([k_name in config for k_name in CephadmDaemon.config_requirements]):
6822 raise Error(f'config must contain the following fields : {reqs}')
6823
6824 if not all([isinstance(config[k_name], str) for k_name in CephadmDaemon.config_requirements]):
6825 errors.append(f'the following fields must be strings: {reqs}')
6826
6827 crt = config[CephadmDaemon.crt_name]
6828 key = config[CephadmDaemon.key_name]
6829 token = config[CephadmDaemon.token_name]
6830
6831 if not crt.startswith('-----BEGIN CERTIFICATE-----') or not crt.endswith('-----END CERTIFICATE-----\n'):
6832 errors.append('crt field is not a valid SSL certificate')
6833 if not key.startswith('-----BEGIN PRIVATE KEY-----') or not key.endswith('-----END PRIVATE KEY-----\n'):
6834 errors.append('key is not a valid SSL private key')
6835 if len(token) < 8:
6836 errors.append("'token' must be at least 8 characters long")
6837
6838 if 'port' in config:
6839 try:
6840 p = int(config['port'])
6841 if p <= 1024:
6842 raise ValueError
6843 except (TypeError, ValueError):
6844 errors.append('port must be an integer > 1024')
6845
6846 if errors:
6847 raise Error('Parameter errors : {}'.format(', '.join(errors)))
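# Illustrative exporter config (hypothetical values) that would pass
# validate_config:
#   {
#     "crt": "-----BEGIN CERTIFICATE-----\n...\n-----END CERTIFICATE-----\n",
#     "key": "-----BEGIN PRIVATE KEY-----\n...\n-----END PRIVATE KEY-----\n",
#     "token": "s3cr3t-t0ken",
#     "port": 9443
#   }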
6848
6849 @property
6850 def port_active(self):
6851 return port_in_use(self.ctx, self.port)
6852
6853 @property
6854 def can_run(self):
6855 # if port is in use
6856 if self.port_active:
6857 self.errors.append(f'TCP port {self.port} already in use, unable to bind')
6858 if not os.path.exists(os.path.join(self.daemon_path, CephadmDaemon.key_name)):
6859 self.errors.append(f"Key file '{CephadmDaemon.key_name}' is missing from {self.daemon_path}")
6860 if not os.path.exists(os.path.join(self.daemon_path, CephadmDaemon.crt_name)):
6861 self.errors.append(f"Certificate file '{CephadmDaemon.crt_name}' is missing from {self.daemon_path}")
6862 if self.token == 'Unknown':
6863 self.errors.append(f"Authentication token '{CephadmDaemon.token_name}' is missing from {self.daemon_path}")
6864 return len(self.errors) == 0
6865
6866 @staticmethod
6867 def _unit_name(fsid, daemon_id):
6868 return '{}.service'.format(get_unit_name(fsid, CephadmDaemon.daemon_type, daemon_id))
6869
6870 @property
6871 def unit_name(self):
6872 return CephadmDaemon._unit_name(self.fsid, self.daemon_id)
6873
6874 @property
6875 def daemon_path(self):
6876 return os.path.join(
6877 self.ctx.data_dir,
6878 self.fsid,
6879 f'{self.daemon_type}.{self.daemon_id}'
6880 )
6881
6882 @property
6883 def binary_path(self):
6884 path = os.path.realpath(__file__)
6885 assert os.path.isfile(path)
6886 return path
6887
6888 def _handle_thread_exception(self, exc, thread_type):
6889 e_msg = f'{exc.__class__.__name__} exception: {str(exc)}'
6890 thread_info = getattr(self.cephadm_cache, thread_type)
6891 errors = thread_info.get('scrape_errors', [])
6892 errors.append(e_msg)
6893 logger.error(e_msg)
6894 logger.exception(exc)
6895 self.cephadm_cache.update_task(
6896 thread_type,
6897 {
6898 'scrape_errors': errors,
6899 'data': None,
6900 }
6901 )
6902
6903 def _scrape_host_facts(self, refresh_interval=10):
6904 ctr = 0
6905 exception_encountered = False
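# poll pattern: sleep loop_delay (1s) per pass and only scrape once ctr
# reaches refresh_interval, so a stop request is noticed within ~1s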
6906
6907 while True:
6908
6909 if self.stop or exception_encountered:
6910 break
6911
6912 if ctr >= refresh_interval:
6913 ctr = 0
6914 logger.debug('executing host-facts scrape')
6915 errors = []
6916 s_time = time.time()
6917
6918 try:
6919 facts = HostFacts(self.ctx)
6920 except Exception as e:
6921 self._handle_thread_exception(e, 'host')
6922 exception_encountered = True
6923 else:
6924 elapsed = time.time() - s_time
6925 try:
6926 data = json.loads(facts.dump())
6927 except json.decoder.JSONDecodeError:
6928 errors.append('host-facts provided invalid JSON')
6929 logger.warning(errors[-1])
6930 data = {}
6931 self.cephadm_cache.update_task(
6932 'host',
6933 {
6934 'scrape_timestamp': s_time,
6935 'scrape_duration_secs': elapsed,
6936 'scrape_errors': errors,
6937 'data': data,
6938 }
6939 )
6940 logger.debug(f'completed host-facts scrape - {elapsed}s')
6941
6942 time.sleep(CephadmDaemon.loop_delay)
6943 ctr += CephadmDaemon.loop_delay
6944 logger.info('host-facts thread stopped')
6945
6946 def _scrape_ceph_volume(self, refresh_interval=15):
6947 # we're invoking the ceph_volume command, so we need to set the args that it
6948 # expects to use
6949 self.ctx.command = 'inventory --format=json'.split()
6950 self.ctx.fsid = self.fsid
6951 self.ctx.log_output = False
6952
6953 ctr = 0
6954 exception_encountered = False
6955
6956 while True:
6957 if self.stop or exception_encountered:
6958 break
6959
6960 if ctr >= refresh_interval:
6961 ctr = 0
6962 logger.debug('executing ceph-volume scrape')
6963 errors = []
6964 s_time = time.time()
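# command_ceph_volume prints its inventory to stdout; capture it
# with redirect_stdout so the JSON can be parsed below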
6965 stream = io.StringIO()
6966 try:
6967 with redirect_stdout(stream):
6968 command_ceph_volume(self.ctx)
6969 except Exception as e:
6970 self._handle_thread_exception(e, 'disks')
6971 exception_encountered = True
6972 else:
6973 elapsed = time.time() - s_time
6974
6975 # if ceph-volume returns junk mixed in with the JSON,
6976 # it won't parse
6977 stdout = stream.getvalue()
6978
6979 data = []
6980 if stdout:
6981 try:
6982 data = json.loads(stdout)
6983 except json.decoder.JSONDecodeError:
6984 errors.append('ceph-volume provided invalid JSON data')
6985 logger.warning(errors[-1])
6986 else:
6987 errors.append('ceph-volume did not return any data')
6988 logger.warning(errors[-1])
6989
6990 self.cephadm_cache.update_task(
6991 'disks',
6992 {
6993 'scrape_timestamp': s_time,
6994 'scrape_duration_secs': elapsed,
6995 'scrape_errors': errors,
6996 'data': data,
6997 }
6998 )
6999
7000 logger.debug(f'completed ceph-volume scrape - {elapsed}s')
7001 time.sleep(CephadmDaemon.loop_delay)
7002 ctr += CephadmDaemon.loop_delay
7003
7004 logger.info('ceph-volume thread stopped')
7005
7006 def _scrape_list_daemons(self, refresh_interval=20):
7007 ctr = 0
7008 exception_encountered = False
7009 while True:
7010 if self.stop or exception_encountered:
7011 break
7012
7013 if ctr >= refresh_interval:
7014 ctr = 0
7015 logger.debug('executing list-daemons scrape')
7016 errors = []
7017 s_time = time.time()
7018
7019 try:
7020 # list_daemons should ideally be invoked with an fsid
7021 data = list_daemons(self.ctx)
7022 except Exception as e:
7023 self._handle_thread_exception(e, 'daemons')
7024 exception_encountered = True
7025 else:
7026 if not isinstance(data, list):
7027 errors.append('list-daemons did not return a list')
7028 logger.warning(errors[-1])
7029 data = []
7030 elapsed = time.time() - s_time
7031 self.cephadm_cache.update_task(
7032 'daemons',
7033 {
7034 'scrape_timestamp': s_time,
7035 'scrape_duration_secs': elapsed,
7036 'scrape_errors': errors,
7037 'data': data,
7038 }
7039 )
7040 logger.debug(f'completed list-daemons scrape - {elapsed}s')
7041
7042 time.sleep(CephadmDaemon.loop_delay)
7043 ctr += CephadmDaemon.loop_delay
7044 logger.info('list-daemons thread stopped')
7045
7046 def _create_thread(self, target, name, refresh_interval=None):
7047 if refresh_interval:
7048 t = Thread(target=target, args=(refresh_interval,))
7049 else:
7050 t = Thread(target=target)
7051 t.daemon = True
7052 t.name = name
7053 self.cephadm_cache.update_health(name, 'active')
7054 t.start()
7055
7056 start_msg = f'Started {name} thread'
7057 if refresh_interval:
7058 logger.info(f'{start_msg}, with a refresh interval of {refresh_interval}s')
7059 else:
7060 logger.info(f'{start_msg}')
7061 return t
7062
7063 def reload(self, *args):
7064 """reload -HUP received
7065
7066 This is a placeholder function only, and serves to provide the hook that could
7067 be exploited later if the exporter evolves to incorporate a config file
7068 """
7069 logger.info('Reload request received - ignoring, no action needed')
7070
7071 def shutdown(self, *args):
7072 logger.info('Shutdown request received')
7073 self.stop = True
7074 self.http_server.shutdown()
7075
7076 def run(self):
7077 logger.info(f"cephadm exporter starting for FSID '{self.fsid}'")
7078 if not self.can_run:
7079 logger.error('Unable to start the exporter daemon')
7080 for e in self.errors:
7081 logger.error(e)
7082 return
7083
7084 # register signal handlers for running under systemd control
7085 signal.signal(signal.SIGTERM, self.shutdown)
7086 signal.signal(signal.SIGINT, self.shutdown)
7087 signal.signal(signal.SIGHUP, self.reload)
7088 logger.debug('Signal handlers attached')
7089
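# worker thread names must match the cephadm_cache task keys used by
# the scrapers: 'host', 'daemons' and 'disks'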
7090 host_facts = self._create_thread(self._scrape_host_facts, 'host', 5)
7091 self.workers.append(host_facts)
7092
7093 daemons = self._create_thread(self._scrape_list_daemons, 'daemons', 20)
7094 self.workers.append(daemons)
7095
7096 disks = self._create_thread(self._scrape_ceph_volume, 'disks', 20)
7097 self.workers.append(disks)
7098
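# NOTE: ssl.wrap_socket has been deprecated since Python 3.7 in favour
# of ssl.SSLContext.wrap_socket; it still works on the interpreters
# this script targets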
7099 self.http_server = CephadmHTTPServer(('0.0.0.0', self.port), CephadmDaemonHandler) # IPv4 only
7100 self.http_server.socket = ssl.wrap_socket(self.http_server.socket,
7101 keyfile=os.path.join(self.daemon_path, CephadmDaemon.key_name),
7102 certfile=os.path.join(self.daemon_path, CephadmDaemon.crt_name),
7103 server_side=True)
7104
7105 self.http_server.cephadm_cache = self.cephadm_cache
7106 self.http_server.token = self.token
7107 server_thread = self._create_thread(self.http_server.serve_forever, 'http_server')
7108 logger.info(f'https server listening on {self.http_server.server_address[0]}:{self.http_server.server_port}')
7109
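# supervisory loop: every thread_check_interval seconds, mark any worker
# thread that has died as 'inactive' so the health endpoint reflects it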
7110 ctr = 0
7111 while server_thread.is_alive():
7112 if self.stop:
7113 break
7114
7115 if ctr >= CephadmDaemon.thread_check_interval:
7116 ctr = 0
7117 for worker in self.workers:
7118 if self.cephadm_cache.tasks[worker.name] == 'inactive':
7119 continue
7120 if not worker.is_alive():
7121 logger.warning(f'{worker.name} thread not running')
7122 stop_time = datetime.datetime.now().strftime('%Y/%m/%d %H:%M:%S')
7123 self.cephadm_cache.update_health(worker.name, 'inactive', f'{worker.name} stopped at {stop_time}')
7124
7125 time.sleep(CephadmDaemon.loop_delay)
7126 ctr += CephadmDaemon.loop_delay
7127
7128 logger.info('Main http server thread stopped')
7129
7130 @property
7131 def unit_run(self):
7132
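# the trailing '&' backgrounds the exporter, matching Type=forking in
# the generated unit file below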
7133 return """set -e
7134 {py3} {bin_path} exporter --fsid {fsid} --id {daemon_id} --port {port} &""".format(
7135 py3=shutil.which('python3'),
7136 bin_path=self.binary_path,
7137 fsid=self.fsid,
7138 daemon_id=self.daemon_id,
7139 port=self.port
7140 )
7141
7142 @property
7143 def unit_file(self):
7144 docker = isinstance(self.ctx.container_engine, Docker)
7145 return """#generated by cephadm
7146 [Unit]
7147 Description=cephadm exporter service for cluster {fsid}
7148 After=network-online.target{docker_after}
7149 Wants=network-online.target
7150 {docker_requires}
7151
7152 PartOf=ceph-{fsid}.target
7153 Before=ceph-{fsid}.target
7154
7155 [Service]
7156 Type=forking
7157 ExecStart=/bin/bash {daemon_path}/unit.run
7158 ExecReload=/bin/kill -HUP $MAINPID
7159 Restart=on-failure
7160 RestartSec=10s
7161
7162 [Install]
7163 WantedBy=ceph-{fsid}.target
7164 """.format(fsid=self.fsid,
7165 daemon_path=self.daemon_path,
7166 # if docker, we depend on docker.service
7167 docker_after=' docker.service' if docker else '',
7168 docker_requires='Requires=docker.service\n' if docker else '')
7169
7170 def deploy_daemon_unit(self, config=None):
7171 """deploy a specific unit file for cephadm
7172
7173 The normal deploy_daemon_units doesn't apply for this
7174 daemon since it's not a container, so we just create a
7175 simple service definition and add it to the fsid's target
7176 """
7177 if not config:
7178 raise Error('Attempting to deploy cephadm daemon without a config')
7179 assert isinstance(config, dict)
7180
7181 # Create the required config files in the daemons dir, with restricted permissions
7182 for filename in config:
7183 with open(os.open(os.path.join(self.daemon_path, filename), os.O_CREAT | os.O_WRONLY, mode=0o600), 'w') as f:
7184 f.write(config[filename])
7185
7186 # When __file__ is <stdin> we're being invoked over remoto via the orchestrator, so
7187 # we pick up the file from where the orchestrator placed it - otherwise we'll
7188 # copy it to the binary location for this cluster
7189 if __file__ != '<stdin>':
7190 shutil.copy(__file__, self.binary_path)
7192
7193 with open(os.path.join(self.daemon_path, 'unit.run'), 'w') as f:
7194 f.write(self.unit_run)
7195
7196 with open(
7197 os.path.join(self.ctx.unit_dir,
7198 f'{self.unit_name}.new'),
7199 'w'
7200 ) as f:
7201 f.write(self.unit_file)
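# write to '<unit>.new' first, then rename, so the unit file is replaced atomically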
7202 os.rename(
7203 os.path.join(self.ctx.unit_dir, f'{self.unit_name}.new'),
7204 os.path.join(self.ctx.unit_dir, self.unit_name))
7205
7206 call_throws(self.ctx, ['systemctl', 'daemon-reload'])
7207 call(self.ctx, ['systemctl', 'stop', self.unit_name],
7208 verbosity=CallVerbosity.DEBUG)
7209 call(self.ctx, ['systemctl', 'reset-failed', self.unit_name],
7210 verbosity=CallVerbosity.DEBUG)
7211 call_throws(self.ctx, ['systemctl', 'enable', '--now', self.unit_name])
7212
7213 @classmethod
7214 def uninstall(cls, ctx: CephadmContext, fsid, daemon_type, daemon_id):
7215 unit_name = CephadmDaemon._unit_name(fsid, daemon_id)
7216 unit_path = os.path.join(ctx.unit_dir, unit_name)
7217 unit_run = os.path.join(ctx.data_dir, fsid, f'{daemon_type}.{daemon_id}', 'unit.run')
7218 port = None
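# unit.run ends with '... --port <n> &'; strip the trailing ' &' so the
# port value can be parsed back out of the command line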
7219 try:
7220 with open(unit_run, 'r') as u:
7221 contents = u.read().strip(' &')
7222 except OSError:
7223 logger.warning(f'Unable to access the unit.run file @ {unit_run}')
7224 return
7225
7227 for line in contents.split('\n'):
7228 if '--port ' in line:
7229 try:
7230 port = int(line.split('--port ')[-1])
7231 except ValueError:
7232 logger.warning('Unexpected format in unit.run file: port is not numeric')
7233 logger.warning('Unable to remove the systemd file and close the port')
7234 return
7235 break
7236
7237 if port:
7238 fw = Firewalld(ctx)
7239 try:
7240 fw.close_ports([port])
7241 except RuntimeError:
7242 logger.error(f'Unable to close port {port}')
7243
7244 stdout, stderr, rc = call(ctx, ['rm', '-f', unit_path])
7245 if rc:
7246 logger.error(f'Unable to remove the systemd file @ {unit_path}')
7247 else:
7248 logger.info(f'removed systemd unit file @ {unit_path}')
7249 stdout, stderr, rc = call(ctx, ['systemctl', 'daemon-reload'])
7250
7251
7252 def command_exporter(ctx: CephadmContext):
7253 exporter = CephadmDaemon(ctx, ctx.fsid, daemon_id=ctx.id, port=ctx.port)
7254
7255 if ctx.fsid not in os.listdir(ctx.data_dir):
7256 raise Error(f"cluster fsid '{ctx.fsid}' not found in '{ctx.data_dir}'")
7257
7258 exporter.run()
7259
7260 ##################################
7261
7262
7263 def systemd_target_state(target_name: str, subsystem: str = 'ceph') -> bool:
7264 # TODO: UNITTEST
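# 'systemctl enable <target>' creates a symlink under <subsystem>.target.wants;
# the presence of that symlink is what we treat as 'enabled' here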
7265 return os.path.exists(
7266 os.path.join(
7267 UNIT_DIR,
7268 f'{subsystem}.target.wants',
7269 target_name
7270 )
7271 )
7272
7273
7274 @infer_fsid
7275 def command_maintenance(ctx: CephadmContext):
7276 if not ctx.fsid:
7277 raise Error('must pass --fsid to specify cluster')
7278
7279 target = f'ceph-{ctx.fsid}.target'
7280
7281 if ctx.maintenance_action.lower() == 'enter':
7282 logger.info('Requested to place host into maintenance')
7283 if systemd_target_state(target):
7284 _out, _err, code = call(ctx,
7285 ['systemctl', 'disable', target],
7286 verbosity=CallVerbosity.DEBUG)
7287 if code:
7288 logger.error(f'Failed to disable the {target} target')
7289 return 'failed - unable to disable the target'
7290 else:
7291 # stopping a target waits by default
7292 _out, _err, code = call(ctx,
7293 ['systemctl', 'stop', target],
7294 verbosity=CallVerbosity.DEBUG)
7295 if code:
7296 logger.error(f'Failed to stop the {target} target')
7297 return 'failed - unable to stop the target'
7298 else:
7299 return f'success - systemd target {target} disabled'
7300
7301 else:
7302 return 'skipped - target already disabled'
7303
7304 else:
7305 logger.info('Requested to exit maintenance state')
7306 # exit maintenance request
7307 if not systemd_target_state(target):
7308 _out, _err, code = call(ctx,
7309 ['systemctl', 'enable', target],
7310 verbosity=CallVerbosity.DEBUG)
7311 if code:
7312 logger.error(f'Failed to enable the {target} target')
7313 return 'failed - unable to enable the target'
7314 else:
7315 # starting a target waits by default
7316 _out, _err, code = call(ctx,
7317 ['systemctl', 'start', target],
7318 verbosity=CallVerbosity.DEBUG)
7319 if code:
7320 logger.error(f'Failed to start the {target} target')
7321 return 'failed - unable to start the target'
7322 else:
7323 return f'success - systemd target {target} enabled and started'
7324
7325 ##################################
7326
7327
7328 def _get_parser():
7329 # type: () -> argparse.ArgumentParser
7330 parser = argparse.ArgumentParser(
7331 description='Bootstrap Ceph daemons with systemd and containers.',
7332 formatter_class=argparse.ArgumentDefaultsHelpFormatter)
7333 parser.add_argument(
7334 '--image',
7335 help='container image. Can also be set via the "CEPHADM_IMAGE" '
7336 'env var')
7337 parser.add_argument(
7338 '--docker',
7339 action='store_true',
7340 help='use docker instead of podman')
7341 parser.add_argument(
7342 '--data-dir',
7343 default=DATA_DIR,
7344 help='base directory for daemon data')
7345 parser.add_argument(
7346 '--log-dir',
7347 default=LOG_DIR,
7348 help='base directory for daemon logs')
7349 parser.add_argument(
7350 '--logrotate-dir',
7351 default=LOGROTATE_DIR,
7352 help='location of logrotate configuration files')
7353 parser.add_argument(
7354 '--unit-dir',
7355 default=UNIT_DIR,
7356 help='base directory for systemd units')
7357 parser.add_argument(
7358 '--verbose', '-v',
7359 action='store_true',
7360 help='Show debug-level log messages')
7361 parser.add_argument(
7362 '--timeout',
7363 type=int,
7364 default=DEFAULT_TIMEOUT,
7365 help='timeout in seconds')
7366 parser.add_argument(
7367 '--retry',
7368 type=int,
7369 default=DEFAULT_RETRY,
7370 help='max number of retries')
7371 parser.add_argument(
7372 '--env', '-e',
7373 action='append',
7374 default=[],
7375 help='set environment variable')
7376 parser.add_argument(
7377 '--no-container-init',
7378 action='store_true',
7379 default=not CONTAINER_INIT,
7380 help='Do not run podman/docker with `--init`')
7381
7382 subparsers = parser.add_subparsers(help='sub-command')
7383
7384 parser_version = subparsers.add_parser(
7385 'version', help='get ceph version from container')
7386 parser_version.set_defaults(func=command_version)
7387
7388 parser_pull = subparsers.add_parser(
7389 'pull', help='pull latest image version')
7390 parser_pull.set_defaults(func=command_pull)
7391
7392 parser_inspect_image = subparsers.add_parser(
7393 'inspect-image', help='inspect local container image')
7394 parser_inspect_image.set_defaults(func=command_inspect_image)
7395
7396 parser_ls = subparsers.add_parser(
7397 'ls', help='list daemon instances on this host')
7398 parser_ls.set_defaults(func=command_ls)
7399 parser_ls.add_argument(
7400 '--no-detail',
7401 action='store_true',
7402 help='Do not include daemon status')
7403 parser_ls.add_argument(
7404 '--legacy-dir',
7405 default='/',
7406 help='base directory for legacy daemon data')
7407
7408 parser_list_networks = subparsers.add_parser(
7409 'list-networks', help='list IP networks')
7410 parser_list_networks.set_defaults(func=command_list_networks)
7411
7412 parser_adopt = subparsers.add_parser(
7413 'adopt', help='adopt daemon deployed with a different tool')
7414 parser_adopt.set_defaults(func=command_adopt)
7415 parser_adopt.add_argument(
7416 '--name', '-n',
7417 required=True,
7418 help='daemon name (type.id)')
7419 parser_adopt.add_argument(
7420 '--style',
7421 required=True,
7422 help='deployment style (legacy, ...)')
7423 parser_adopt.add_argument(
7424 '--cluster',
7425 default='ceph',
7426 help='cluster name')
7427 parser_adopt.add_argument(
7428 '--legacy-dir',
7429 default='/',
7430 help='base directory for legacy daemon data')
7431 parser_adopt.add_argument(
7432 '--config-json',
7433 help='Additional configuration information in JSON format')
7434 parser_adopt.add_argument(
7435 '--skip-firewalld',
7436 action='store_true',
7437 help='Do not configure firewalld')
7438 parser_adopt.add_argument(
7439 '--skip-pull',
7440 action='store_true',
7441 help='do not pull the latest image before adopting')
7442 parser_adopt.add_argument(
7443 '--force-start',
7444 action='store_true',
7445 help='start newly adopted daemon, even if it was not running previously')
7446 parser_adopt.add_argument(
7447 '--container-init',
7448 action='store_true',
7449 default=CONTAINER_INIT,
7450 help=argparse.SUPPRESS)
7451
7452 parser_rm_daemon = subparsers.add_parser(
7453 'rm-daemon', help='remove daemon instance')
7454 parser_rm_daemon.set_defaults(func=command_rm_daemon)
7455 parser_rm_daemon.add_argument(
7456 '--name', '-n',
7457 required=True,
7458 action=CustomValidation,
7459 help='daemon name (type.id)')
7460 parser_rm_daemon.add_argument(
7461 '--fsid',
7462 required=True,
7463 help='cluster FSID')
7464 parser_rm_daemon.add_argument(
7465 '--force',
7466 action='store_true',
7467 help='proceed, even though this may destroy valuable data')
7468 parser_rm_daemon.add_argument(
7469 '--force-delete-data',
7470 action='store_true',
7471 help='delete valuable daemon data instead of making a backup')
7472
7473 parser_rm_cluster = subparsers.add_parser(
7474 'rm-cluster', help='remove all daemons for a cluster')
7475 parser_rm_cluster.set_defaults(func=command_rm_cluster)
7476 parser_rm_cluster.add_argument(
7477 '--fsid',
7478 required=True,
7479 help='cluster FSID')
7480 parser_rm_cluster.add_argument(
7481 '--force',
7482 action='store_true',
7483 help='proceed, even though this may destroy valuable data')
7484 parser_rm_cluster.add_argument(
7485 '--keep-logs',
7486 action='store_true',
7487 help='do not remove log files')
7488
7489 parser_run = subparsers.add_parser(
7490 'run', help='run a ceph daemon, in a container, in the foreground')
7491 parser_run.set_defaults(func=command_run)
7492 parser_run.add_argument(
7493 '--name', '-n',
7494 required=True,
7495 help='daemon name (type.id)')
7496 parser_run.add_argument(
7497 '--fsid',
7498 required=True,
7499 help='cluster FSID')
7500
7501 parser_shell = subparsers.add_parser(
7502 'shell', help='run an interactive shell inside a daemon container')
7503 parser_shell.set_defaults(func=command_shell)
7504 parser_shell.add_argument(
7505 '--fsid',
7506 help='cluster FSID')
7507 parser_shell.add_argument(
7508 '--name', '-n',
7509 help='daemon name (type.id)')
7510 parser_shell.add_argument(
7511 '--config', '-c',
7512 help='ceph.conf to pass through to the container')
7513 parser_shell.add_argument(
7514 '--keyring', '-k',
7515 help='ceph.keyring to pass through to the container')
7516 parser_shell.add_argument(
7517 '--mount', '-m',
7518 help=('mount a file or directory in the container. '
7519 'Supports multiple mounts, '
7520 'e.g. `--mount /foo /bar:/bar`. '
7521 'When no destination is passed, the default is /mnt'),
7522 nargs='+')
7523 parser_shell.add_argument(
7524 '--env', '-e',
7525 action='append',
7526 default=[],
7527 help='set environment variable')
7528 parser_shell.add_argument(
7529 'command', nargs=argparse.REMAINDER,
7530 help='command (optional)')
7531
7532 parser_enter = subparsers.add_parser(
7533 'enter', help='run an interactive shell inside a running daemon container')
7534 parser_enter.set_defaults(func=command_enter)
7535 parser_enter.add_argument(
7536 '--fsid',
7537 help='cluster FSID')
7538 parser_enter.add_argument(
7539 '--name', '-n',
7540 required=True,
7541 help='daemon name (type.id)')
7542 parser_enter.add_argument(
7543 'command', nargs=argparse.REMAINDER,
7544 help='command')
7545
7546 parser_ceph_volume = subparsers.add_parser(
7547 'ceph-volume', help='run ceph-volume inside a container')
7548 parser_ceph_volume.set_defaults(func=command_ceph_volume)
7549 parser_ceph_volume.add_argument(
7550 '--fsid',
7551 help='cluster FSID')
7552 parser_ceph_volume.add_argument(
7553 '--config-json',
7554 help='JSON file with config and (client.bootstrap-osd) key')
7555 parser_ceph_volume.add_argument(
7556 '--config', '-c',
7557 help='ceph conf file')
7558 parser_ceph_volume.add_argument(
7559 '--keyring', '-k',
7560 help='ceph.keyring to pass through to the container')
7561 parser_ceph_volume.add_argument(
7562 '--log-output',
7563 action='store_true',
7564 default=True,
7565 help='log ceph-volume output (enabled by default)')
7566 parser_ceph_volume.add_argument(
7567 'command', nargs=argparse.REMAINDER,
7568 help='command')
7569
7570 parser_unit = subparsers.add_parser(
7571 'unit', help="operate on the daemon's systemd unit")
7572 parser_unit.set_defaults(func=command_unit)
7573 parser_unit.add_argument(
7574 'command',
7575 help='systemd command (start, stop, restart, enable, disable, ...)')
7576 parser_unit.add_argument(
7577 '--fsid',
7578 help='cluster FSID')
7579 parser_unit.add_argument(
7580 '--name', '-n',
7581 required=True,
7582 help='daemon name (type.id)')
7583
7584 parser_logs = subparsers.add_parser(
7585 'logs', help='print journald logs for a daemon container')
7586 parser_logs.set_defaults(func=command_logs)
7587 parser_logs.add_argument(
7588 '--fsid',
7589 help='cluster FSID')
7590 parser_logs.add_argument(
7591 '--name', '-n',
7592 required=True,
7593 help='daemon name (type.id)')
7594 parser_logs.add_argument(
7595 'command', nargs='*',
7596 help='additional journalctl args')
7597
7598 parser_bootstrap = subparsers.add_parser(
7599 'bootstrap', help='bootstrap a cluster (mon + mgr daemons)')
7600 parser_bootstrap.set_defaults(func=command_bootstrap)
7601 parser_bootstrap.add_argument(
7602 '--config', '-c',
7603 help='ceph conf file to incorporate')
7604 parser_bootstrap.add_argument(
7605 '--mon-id',
7606 required=False,
7607 help='mon id (default: local hostname)')
7608 parser_bootstrap.add_argument(
7609 '--mon-addrv',
7610 help='mon IPs (e.g., [v2:localipaddr:3300,v1:localipaddr:6789])')
7611 parser_bootstrap.add_argument(
7612 '--mon-ip',
7613 help='mon IP')
7614 parser_bootstrap.add_argument(
7615 '--mgr-id',
7616 required=False,
7617 help='mgr id (default: randomly generated)')
7618 parser_bootstrap.add_argument(
7619 '--fsid',
7620 help='cluster FSID')
7621 parser_bootstrap.add_argument(
7622 '--output-dir',
7623 default='/etc/ceph',
7624 help='directory to write config, keyring, and pub key files')
7625 parser_bootstrap.add_argument(
7626 '--output-keyring',
7627 help='location to write keyring file with new cluster admin and mon keys')
7628 parser_bootstrap.add_argument(
7629 '--output-config',
7630 help='location to write conf file to connect to new cluster')
7631 parser_bootstrap.add_argument(
7632 '--output-pub-ssh-key',
7633 help="location to write the cluster's public SSH key")
7634 parser_bootstrap.add_argument(
7635 '--skip-ssh',
7636 action='store_true',
7637 help='skip setup of ssh key on local host')
7638 parser_bootstrap.add_argument(
7639 '--initial-dashboard-user',
7640 default='admin',
7641 help='Initial user for the dashboard')
7642 parser_bootstrap.add_argument(
7643 '--initial-dashboard-password',
7644 help='Initial password for the initial dashboard user')
7645 parser_bootstrap.add_argument(
7646 '--ssl-dashboard-port',
7647 type=int,
7648 default=8443,
7649 help='Port number used to connect with dashboard using SSL')
7650 parser_bootstrap.add_argument(
7651 '--dashboard-key',
7652 type=argparse.FileType('r'),
7653 help='Dashboard key')
7654 parser_bootstrap.add_argument(
7655 '--dashboard-crt',
7656 type=argparse.FileType('r'),
7657 help='Dashboard certificate')
7658
7659 parser_bootstrap.add_argument(
7660 '--ssh-config',
7661 type=argparse.FileType('r'),
7662 help='SSH config')
7663 parser_bootstrap.add_argument(
7664 '--ssh-private-key',
7665 type=argparse.FileType('r'),
7666 help='SSH private key')
7667 parser_bootstrap.add_argument(
7668 '--ssh-public-key',
7669 type=argparse.FileType('r'),
7670 help='SSH public key')
7671 parser_bootstrap.add_argument(
7672 '--ssh-user',
7673 default='root',
7674 help='set user for SSHing to cluster hosts, passwordless sudo will be needed for non-root users')
7675
7676 parser_bootstrap.add_argument(
7677 '--skip-mon-network',
7678 action='store_true',
7679 help='do not set mon public_network based on the bootstrap mon ip')
7680 parser_bootstrap.add_argument(
7681 '--skip-dashboard',
7682 action='store_true',
7683 help='do not enable the Ceph Dashboard')
7684 parser_bootstrap.add_argument(
7685 '--dashboard-password-noupdate',
7686 action='store_true',
7687 help='stop forced dashboard password change')
7688 parser_bootstrap.add_argument(
7689 '--no-minimize-config',
7690 action='store_true',
7691 help='do not assimilate and minimize the config file')
7692 parser_bootstrap.add_argument(
7693 '--skip-ping-check',
7694 action='store_true',
7695 help='do not verify that mon IP is pingable')
7696 parser_bootstrap.add_argument(
7697 '--skip-pull',
7698 action='store_true',
7699 help='do not pull the latest image before bootstrapping')
7700 parser_bootstrap.add_argument(
7701 '--skip-firewalld',
7702 action='store_true',
7703 help='Do not configure firewalld')
7704 parser_bootstrap.add_argument(
7705 '--allow-overwrite',
7706 action='store_true',
7707 help='allow overwrite of existing --output-* config/keyring/ssh files')
7708 parser_bootstrap.add_argument(
7709 '--allow-fqdn-hostname',
7710 action='store_true',
7711 help='allow hostname that is fully-qualified (contains ".")')
7712 parser_bootstrap.add_argument(
7713 '--allow-mismatched-release',
7714 action='store_true',
7715 help="allow bootstrap of ceph that doesn't match this version of cephadm")
7716 parser_bootstrap.add_argument(
7717 '--skip-prepare-host',
7718 action='store_true',
7719 help='Do not prepare host')
7720 parser_bootstrap.add_argument(
7721 '--orphan-initial-daemons',
7722 action='store_true',
7723 help='Set mon and mgr services to `unmanaged`; do not create the crash service')
7724 parser_bootstrap.add_argument(
7725 '--skip-monitoring-stack',
7726 action='store_true',
7727 help='Do not automatically provision monitoring stack (prometheus, grafana, alertmanager, node-exporter)')
7728 parser_bootstrap.add_argument(
7729 '--apply-spec',
7730 help='Apply cluster spec after bootstrap (copy ssh key, add hosts and apply services)')
7731
7732 parser_bootstrap.add_argument(
7733 '--shared_ceph_folder',
7734 metavar='CEPH_SOURCE_FOLDER',
7735 help='Development mode: mount several container folders as volumes mapped to sub-folders of the ceph source folder')
7736
7737 parser_bootstrap.add_argument(
7738 '--registry-url',
7739 help='url for custom registry')
7740 parser_bootstrap.add_argument(
7741 '--registry-username',
7742 help='username for custom registry')
7743 parser_bootstrap.add_argument(
7744 '--registry-password',
7745 help='password for custom registry')
7746 parser_bootstrap.add_argument(
7747 '--registry-json',
7748 help='json file with custom registry login info (URL, Username, Password)')
7749 parser_bootstrap.add_argument(
7750 '--container-init',
7751 action='store_true',
7752 default=CONTAINER_INIT,
7753 help=argparse.SUPPRESS)
7754 parser_bootstrap.add_argument(
7755 '--with-exporter',
7756 action='store_true',
7757 help='Automatically deploy cephadm metadata exporter to each node')
7758 parser_bootstrap.add_argument(
7759 '--exporter-config',
7760 action=CustomValidation,
7761 help=f'Exporter configuration information in JSON format (providing: {", ".join(CephadmDaemon.config_requirements)}, port information)')
7762 parser_bootstrap.add_argument(
7763 '--cluster-network',
7764 help='subnet to use for cluster replication, recovery and heartbeats (in CIDR notation network/mask)')
7765
7766 parser_deploy = subparsers.add_parser(
7767 'deploy', help='deploy a daemon')
7768 parser_deploy.set_defaults(func=command_deploy)
7769 parser_deploy.add_argument(
7770 '--name',
7771 required=True,
7772 action=CustomValidation,
7773 help='daemon name (type.id)')
7774 parser_deploy.add_argument(
7775 '--fsid',
7776 required=True,
7777 help='cluster FSID')
7778 parser_deploy.add_argument(
7779 '--config', '-c',
7780 help='config file for new daemon')
7781 parser_deploy.add_argument(
7782 '--config-json',
7783 help='Additional configuration information in JSON format')
7784 parser_deploy.add_argument(
7785 '--keyring',
7786 help='keyring for new daemon')
7787 parser_deploy.add_argument(
7788 '--key',
7789 help='key for new daemon')
7790 parser_deploy.add_argument(
7791 '--osd-fsid',
7792 help='OSD uuid, if creating an OSD container')
7793 parser_deploy.add_argument(
7794 '--skip-firewalld',
7795 action='store_true',
7796 help='Do not configure firewalld')
7797 parser_deploy.add_argument(
7798 '--tcp-ports',
7799 help='List of tcp ports to open in the host firewall')
7800 parser_deploy.add_argument(
7801 '--reconfig',
7802 action='store_true',
7803 help='Reconfigure a previously deployed daemon')
7804 parser_deploy.add_argument(
7805 '--allow-ptrace',
7806 action='store_true',
7807 help='Allow SYS_PTRACE on daemon container')
7808 parser_deploy.add_argument(
7809 '--container-init',
7810 action='store_true',
7811 default=CONTAINER_INIT,
7812 help=argparse.SUPPRESS)
7813 parser_deploy.add_argument(
7814 '--memory-request',
7815 help='Container memory request/target'
7816 )
7817 parser_deploy.add_argument(
7818 '--memory-limit',
7819 help='Container memory hard limit'
7820 )
7821 parser_deploy.add_argument(
7822 '--meta-json',
7823 help='JSON dict of additional metadata'
7824 )
7825
7826 parser_check_host = subparsers.add_parser(
7827 'check-host', help='check host configuration')
7828 parser_check_host.set_defaults(func=command_check_host)
7829 parser_check_host.add_argument(
7830 '--expect-hostname',
7831 help='Check that hostname matches an expected value')
7832
7833 parser_prepare_host = subparsers.add_parser(
7834 'prepare-host', help='prepare a host for cephadm use')
7835 parser_prepare_host.set_defaults(func=command_prepare_host)
7836 parser_prepare_host.add_argument(
7837 '--expect-hostname',
7838 help='Set hostname')
7839
7840 parser_add_repo = subparsers.add_parser(
7841 'add-repo', help='configure package repository')
7842 parser_add_repo.set_defaults(func=command_add_repo)
7843 parser_add_repo.add_argument(
7844 '--release',
7845 help='use latest version of a named release (e.g., {})'.format(LATEST_STABLE_RELEASE))
7846 parser_add_repo.add_argument(
7847 '--version',
7848 help='use specific upstream version (x.y.z)')
7849 parser_add_repo.add_argument(
7850 '--dev',
7851 help='use specified bleeding edge build from git branch or tag')
7852 parser_add_repo.add_argument(
7853 '--dev-commit',
7854 help='use specified bleeding edge build from git commit')
7855 parser_add_repo.add_argument(
7856 '--gpg-url',
7857 help='specify alternative GPG key location')
7858 parser_add_repo.add_argument(
7859 '--repo-url',
7860 default='https://download.ceph.com',
7861 help='specify alternative repo location')
7862 # TODO: proxy?
7863
7864 parser_rm_repo = subparsers.add_parser(
7865 'rm-repo', help='remove package repository configuration')
7866 parser_rm_repo.set_defaults(func=command_rm_repo)
7867
7868 parser_install = subparsers.add_parser(
7869 'install', help='install ceph package(s)')
7870 parser_install.set_defaults(func=command_install)
7871 parser_install.add_argument(
7872 'packages', nargs='*',
7873 default=['cephadm'],
7874 help='packages')
7875
7876 parser_registry_login = subparsers.add_parser(
7877 'registry-login', help='log host into authenticated registry')
7878 parser_registry_login.set_defaults(func=command_registry_login)
7879 parser_registry_login.add_argument(
7880 '--registry-url',
7881 help='url for custom registry')
7882 parser_registry_login.add_argument(
7883 '--registry-username',
7884 help='username for custom registry')
7885 parser_registry_login.add_argument(
7886 '--registry-password',
7887 help='password for custom registry')
7888 parser_registry_login.add_argument(
7889 '--registry-json',
7890 help='json file with custom registry login info (URL, Username, Password)')
7891 parser_registry_login.add_argument(
7892 '--fsid',
7893 help='cluster FSID')
7894
7895 parser_gather_facts = subparsers.add_parser(
7896 'gather-facts', help='gather and return host related information (JSON format)')
7897 parser_gather_facts.set_defaults(func=command_gather_facts)
7898
7899 parser_exporter = subparsers.add_parser(
7900 'exporter', help='Start cephadm in exporter mode (web service), providing host/daemon/disk metadata')
7901 parser_exporter.add_argument(
7902 '--fsid',
7903 required=True,
7904 type=str,
7905 help='fsid of the cephadm exporter to run against')
7906 parser_exporter.add_argument(
7907 '--port',
7908 type=int,
7909 default=int(CephadmDaemon.default_port),
7910 help='port number for the cephadm exporter service')
7911 parser_exporter.add_argument(
7912 '--id',
7913 type=str,
7914 default=get_hostname().split('.')[0],
7915 help='daemon identifier for the exporter')
7916 parser_exporter.set_defaults(func=command_exporter)
7917
7918 parser_maintenance = subparsers.add_parser(
7919 'host-maintenance', help='Manage the maintenance state of a host')
7920 parser_maintenance.add_argument(
7921 '--fsid',
7922 help='cluster FSID')
7923 parser_maintenance.add_argument(
7924 'maintenance_action',
7925 type=str,
7926 choices=['enter', 'exit'],
7927 help='Maintenance action - enter maintenance, or exit maintenance')
7928 parser_maintenance.set_defaults(func=command_maintenance)
7929
7930 parser_verify_prereqs = subparsers.add_parser(
7931 'verify-prereqs',
7932 help='verify system prerequisites for a given service are met on this host')
7933 parser_verify_prereqs.set_defaults(func=command_verify_prereqs)
7934 parser_verify_prereqs.add_argument(
7935 '--daemon-type',
7936 required=True,
7937 help='service type whose prerequisites will be checked')
7938
7939 return parser
7940
7941
7942 def _parse_args(av):
7943 parser = _get_parser()
7944
7945 args = parser.parse_args(av)
7946 if 'command' in args and args.command and args.command[0] == '--':
7947 args.command.pop(0)
7948
7949 # workaround argparse to deprecate the subparser `--container-init` flag
7950 # container_init and no_container_init must always be mutually exclusive
7951 container_init_args = ('--container-init', '--no-container-init')
7952 if set(container_init_args).issubset(av):
7953 parser.error('argument %s: not allowed with argument %s' % (container_init_args))
7954 elif '--container-init' in av:
7955 args.no_container_init = not args.container_init
7956 else:
7957 args.container_init = not args.no_container_init
7958 assert args.container_init is not args.no_container_init
7959
7960 return args
7961
7962
7963 def cephadm_init_ctx(args: List[str]) -> Optional[CephadmContext]:
7964
7965 ctx = CephadmContext()
7966 ctx.set_args(_parse_args(args))
7967 return ctx
7968
7969
7970 def cephadm_init(args: List[str]) -> Optional[CephadmContext]:
7971
7972 global logger
7973 ctx = cephadm_init_ctx(args)
7974 assert ctx is not None
7975
7976 # Logger configuration
7977 if not os.path.exists(LOG_DIR):
7978 os.makedirs(LOG_DIR)
7979 dictConfig(logging_config)
7980 logger = logging.getLogger()
7981
7982 if ctx.verbose:
7983 for handler in logger.handlers:
7984 if handler.name == 'console':
7985 handler.setLevel(logging.DEBUG)
7986
7987 if not ctx.has_function():
7988 sys.stderr.write('No command specified; pass -h or --help for usage\n')
7989 return None
7990
7991 return ctx
7992
7993
7994 def main():
7995
7996 # root?
7997 if os.geteuid() != 0:
7998 sys.stderr.write('ERROR: cephadm should be run as root\n')
7999 sys.exit(1)
8000
8001 av: List[str] = sys.argv[1:]
8003
8004 ctx = cephadm_init(av)
8005 if not ctx: # error, exit
8006 sys.exit(1)
8007
8008 try:
8009 # podman or docker?
8010 ctx.container_engine = find_container_engine(ctx)
8011 if ctx.func not in \
8012 [command_check_host, command_prepare_host, command_add_repo]:
8013 check_container_engine(ctx)
8014 # command handler
8015 r = ctx.func(ctx)
8016 except Error as e:
8017 if ctx.verbose:
8018 raise
8019 logger.error('ERROR: %s' % e)
8020 sys.exit(1)
8021 if not r:
8022 r = 0
8023 sys.exit(r)
8024
8025
8026 if __name__ == '__main__':
8027 main()